Bladeren bron

AMBARI-18736. Perf: Simulate alerts for multiple Ambari Agents running on single Host.(vbrodetskyi)

Vitaly Brodetskyi 8 jaren geleden
bovenliggende
commit
3a34213883
39 gewijzigde bestanden met toevoegingen van 1464 en 2427 verwijderingen
  1. 108 0
      ambari-common/src/main/python/resource_management/libraries/functions/simulate_perf_cluster_alert_behaviour.py
  2. 20 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HAPPY/alerts.json
  3. 75 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HAPPY/configuration/happy-alert-config.xml
  4. 5 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HAPPY/metainfo.xml
  5. 13 22
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HAPPY/package/alerts/alert_happy_process.py
  6. 9 101
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HBASE/alerts.json
  7. 75 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HBASE/configuration/hbase-alert-config.xml
  8. 1 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HBASE/metainfo.xml
  9. 59 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HBASE/package/alerts/hbase_master_process.py
  10. 59 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HBASE/package/alerts/hbase_regionserver_process.py
  11. 31 1697
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/alerts.json
  12. 75 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/configuration/hdfs-alert-config.xml
  13. 1 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/metainfo.xml
  14. 14 24
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_checkpoint_time.py
  15. 16 31
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_datanode_unmounted_data_dir.py
  16. 0 75
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_ha_namenode_health.py
  17. 0 85
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_metrics_deviation.py
  18. 59 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_nfs_gateway_process.py
  19. 59 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_snamenode_process.py
  20. 17 32
      ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_upgrade_finalized.py
  21. 20 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/SLEEPY/alerts.json
  22. 75 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/SLEEPY/configuration/sleepy-alert-config.xml
  23. 5 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/SLEEPY/metainfo.xml
  24. 59 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/SLEEPY/package/alerts/alert_sleepy_process.py
  25. 20 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/SNOW/alerts.json
  26. 75 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/SNOW/configuration/snow-alert-config.xml
  27. 5 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/SNOW/metainfo.xml
  28. 59 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/SNOW/package/alerts/alert_snow_process.py
  29. 23 338
      ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/alerts.json
  30. 75 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/configuration/yarn-alert-config.xml
  31. 3 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/metainfo.xml
  32. 59 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/package/alerts/alert_history_process.py
  33. 14 22
      ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/package/alerts/alert_nodemanager_health.py
  34. 59 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/package/alerts/alert_resourcemanager_process.py
  35. 59 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/package/alerts/alert_timeline_process.py
  36. 20 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/ZOOKEEPER/alerts.json
  37. 75 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/ZOOKEEPER/configuration/zk-alert-config.xml
  38. 4 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/ZOOKEEPER/metainfo.xml
  39. 59 0
      ambari-server/src/main/resources/stacks/PERF/1.0/services/ZOOKEEPER/package/alerts/alert_zk_server_process.py

+ 108 - 0
ambari-common/src/main/python/resource_management/libraries/functions/simulate_perf_cluster_alert_behaviour.py

@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+__all__ = ["simulate_perf_cluster_alert_behaviour"]
+
+import logging
+import random
+import time
+
+from datetime import datetime
+from resource_management.core.exceptions import Fail
+
+RESULT_CODE_OK = 'OK'
+RESULT_CODE_CRITICAL = 'CRITICAL'
+RESULT_CODE_UNKNOWN = 'UNKNOWN'
+
+OK_MESSAGE = 'Ok'
+FAIL_MESSAGE = 'Expected Fail'
+UNKNOWN_MESSAGE = 'Expected Unknown'
+
+logger = logging.getLogger('ambari_alerts')
+
+return_values_map = {"true":[RESULT_CODE_OK, OK_MESSAGE], "false":[RESULT_CODE_CRITICAL, FAIL_MESSAGE],
+                     "none":[RESULT_CODE_UNKNOWN, UNKNOWN_MESSAGE]}
+
+def simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+  alert_behaviour_type=None
+  alert_behaviour_type_key=alert_behaviour_properties["alert_behaviour_type"]
+  if alert_behaviour_type_key in configurations:
+    alert_behaviour_type = configurations[alert_behaviour_type_key].lower()
+
+  if alert_behaviour_type == "percentage":
+    alert_success_percentage=None
+    alert_success_percentage_key=alert_behaviour_properties["alert_success_percentage"]
+
+    if alert_success_percentage_key in configurations:
+      alert_success_percentage = configurations[alert_success_percentage_key]
+
+    if alert_success_percentage:
+      random_number = random.uniform(0, 100)
+      if random_number <= int(alert_success_percentage):
+        return (RESULT_CODE_OK, [OK_MESSAGE])
+      else:
+        return (RESULT_CODE_CRITICAL, [FAIL_MESSAGE])
+    else:
+      raise Fail("Percentage behaviour was set but alert.success.percentage was not set!")
+  elif alert_behaviour_type == "timeout":
+    alert_timeout_return_value=None
+    alert_timeout_secs=None
+    alert_timeout_return_value_key=alert_behaviour_properties["alert_timeout_return_value"]
+    alert_timeout_secs_key=alert_behaviour_properties["alert_timeout_secs"]
+
+    if alert_timeout_return_value_key in configurations:
+      alert_timeout_return_value = configurations[alert_timeout_return_value_key].lower()
+
+    if alert_timeout_secs_key in configurations:
+      alert_timeout_secs = configurations[alert_timeout_secs_key]
+
+    if alert_timeout_return_value and alert_timeout_secs:
+      logger.info("Sleeping for {0} seconds".format(alert_timeout_secs))
+      print "Sleeping for {0} seconds".format(alert_timeout_secs)
+      time.sleep(int(alert_timeout_secs))
+      return (return_values_map[alert_timeout_return_value][0], [return_values_map[alert_timeout_return_value][1]])
+    else:
+      raise Fail("Timeout behaviour was set but alert.timeout.return.value/alert.timeout.secs were not set!")
+  elif alert_behaviour_type == "flip":
+    alert_flip_interval_mins=None
+    alert_flip_interval_mins_key=alert_behaviour_properties["alert_flip_interval_mins"]
+
+    if alert_flip_interval_mins_key in configurations:
+      alert_flip_interval_mins = configurations[alert_flip_interval_mins_key]
+
+    if alert_flip_interval_mins:
+      curr_time = datetime.utcnow()
+      return_value = ((curr_time.minute / int(alert_flip_interval_mins)) % 2) == 0
+      return (return_values_map[str(return_value).lower()][0], [return_values_map[str(return_value).lower()][1]])
+    else:
+      raise Fail("Flip behaviour was set but alert.flip.interval.mins was not set!")
+
+
+
+  result_code = RESULT_CODE_OK
+  label = OK_MESSAGE
+  return (result_code, [label])

+ 20 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/HAPPY/alerts.json

@@ -0,0 +1,20 @@
+{
+    "HAPPY": {
+
+        "HAPPY": [
+            {
+                "name": "happy_process",
+                "label": "Happy Process",
+                "description": "Alert for happy component process status",
+                "interval": 1,
+                "scope": "HOST",
+                "enabled": true,
+                "source": {
+                    "type": "SCRIPT",
+                    "path": "PERF/1.0/services/HAPPY/package/alerts/alert_happy_process.py",
+                    "parameters": []
+                }
+            }
+        ]
+    }
+}

+ 75 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/HAPPY/configuration/happy-alert-config.xml

@@ -0,0 +1,75 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<configuration xmlns:xi="http://www.w3.org/2001/XInclude" supports_final="true">
+
+    <property>
+        <name>alert.behavior.type</name>
+        <value>percentage</value>
+        <description>
+            This property describes type of alert behaviour.
+            There are three types percentage, timeout, flip.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.success.percentage</name>
+        <value>100</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "percentage". Here you should set percent of successful
+            alert checks.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.timeout.return.value</name>
+        <value>false</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set result which alert will
+            return after timeout, false|true|none.
+        </description>
+    </property>
+
+    <property>
+        <name>alert.timeout.secs</name>
+        <value>120</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set number of seconds for
+            alert to sleep.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.flip.interval.mins</name>
+        <value>3</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "flip". Here you should set number of minutes at which
+            the alert should flip from true|false.
+        </description>
+    </property>
+
+
+</configuration>

+ 5 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/HAPPY/metainfo.xml

@@ -52,6 +52,11 @@
           <default>true</default>
         </theme>
       </themes>
+
+      <configuration-dependencies>
+        <config-type>happy-alert-config</config-type>
+      </configuration-dependencies>
+
     </service>
   </services>
 </metainfo>

+ 13 - 22
ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/package/alerts/alert_nodemanagers_summary.py → ambari-server/src/main/resources/stacks/PERF/1.0/services/HAPPY/package/alerts/alert_happy_process.py

@@ -20,32 +20,30 @@ limitations under the License.
 
 import logging
 
-RESULT_CODE_OK = 'OK'
-RESULT_CODE_CRITICAL = 'CRITICAL'
-RESULT_CODE_UNKNOWN = 'UNKNOWN'
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
 
-OK_MESSAGE = 'All NodeManagers are healthy'
+ALERT_BEHAVIOUR_TYPE = "{{happy-alert-config/alert.behavior.type}}"
 
-NODEMANAGER_HTTP_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.address}}'
-NODEMANAGER_HTTPS_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.https.address}}'
-YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
+ALERT_SUCCESS_PERCENTAGE = "{{happy-alert-config/alert.success.percentage}}"
 
-KERBEROS_KEYTAB = '{{yarn-site/yarn.nodemanager.webapp.spnego-keytab-file}}'
-KERBEROS_PRINCIPAL = '{{yarn-site/yarn.nodemanager.webapp.spnego-principal}}'
-SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
-SMOKEUSER_KEY = '{{cluster-env/smokeuser}}'
-EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
+ALERT_TIMEOUT_RETURN_VALUE = "{{happy-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{happy-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{happy-alert-config/alert.flip.interval.mins}}"
 
 logger = logging.getLogger('ambari_alerts')
 
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
 
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
   to build the dictionary passed into execute
   """
-  return NODEMANAGER_HTTP_ADDRESS_KEY, NODEMANAGER_HTTPS_ADDRESS_KEY, EXECUTABLE_SEARCH_PATHS, \
-    YARN_HTTP_POLICY_KEY, SMOKEUSER_KEY, KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
 
 
 def execute(configurations={}, parameters={}, host_name=None):
@@ -57,12 +55,5 @@ def execute(configurations={}, parameters={}, host_name=None):
   parameters (dictionary): a mapping of script parameter key to value
   host_name (string): the name of this host where the alert is running
   """
-  result_code = RESULT_CODE_UNKNOWN
-
-  if configurations is None:
-    return (result_code, ['There were no configurations supplied to the script.'])
-
-  result_code = RESULT_CODE_OK
-  label = OK_MESSAGE
 
-  return (result_code, [label])
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 9 - 101
ambari-server/src/main/resources/stacks/PERF/1.0/services/HBASE/alerts.json

@@ -1,98 +1,18 @@
 {
   "HBASE": {
-    "service": [
-      {
-        "name": "hbase_regionserver_process_percent",
-        "label": "Percent RegionServers Available",
-        "description": "This service-level alert is triggered if the configured percentage of RegionServer processes cannot be determined to be up and listening on the network for the configured warning and critical thresholds. It aggregates the results of RegionServer process down checks.",
-        "interval": 1,
-        "scope": "SERVICE",
-        "enabled": true,
-        "source": {
-          "type": "AGGREGATE",
-          "alert_name": "hbase_regionserver_process",
-          "reporting": {
-            "ok": {
-              "text": "affected: [{1}], total: [{0}]"
-            },
-            "warning": {
-              "text": "affected: [{1}], total: [{0}]",
-              "value": 10
-            },
-            "critical": {
-              "text": "affected: [{1}], total: [{0}]",
-              "value": 30
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          }
-        }
-      }    
-    ],
+
     "HBASE_MASTER": [
       {
         "name": "hbase_master_process",
         "label": "HBase Master Process",
         "description": "This alert is triggered if the HBase master processes cannot be confirmed to be up and listening on the network for the configured critical threshold, given in seconds.",
         "interval": 1,
-        "scope": "ANY",
-        "source": {
-          "type": "PORT",
-          "uri": "{{hbase-site/hbase.master.port}}",
-          "default_port": 60000,
-          "reporting": {
-            "ok": {
-              "text": "TCP OK - {0:.3f}s response on port {1}"
-            },
-            "warning": {
-              "text": "TCP OK - {0:.3f}s response on port {1}",
-              "value": 1.5
-            },
-            "critical": {
-              "text": "Connection failed: {0} to {1}:{2}",
-              "value": 5.0
-            }
-          }
-        }
-      },
-      {
-        "name": "hbase_master_cpu",
-        "label": "HBase Master CPU Utilization",
-        "description": "This host-level alert is triggered if CPU utilization of the HBase Master exceeds certain warning and critical thresholds. It checks the HBase Master JMX Servlet for the SystemCPULoad property. The threshold values are in percent.",
-        "interval": 5,
-        "scope": "ANY",
+        "scope": "HOST",
         "enabled": true,
         "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{hbase-site/hbase.master.info.port}}",
-            "default_port": 60010,
-            "connection_timeout": 5.0,
-            "kerberos_keytab": "{{hbase-site/hbase.security.authentication.spnego.kerberos.principal}}",
-            "kerberos_principal": "{{hbase-site/hbase.security.authentication.spnego.kerberos.keytab}}"
-          },
-          "reporting": {
-            "ok": {
-              "text": "{1} CPU, load {0:.1%}"
-            },
-            "warning": {
-              "text": "{1} CPU, load {0:.1%}",
-              "value": 200
-            },
-            "critical": {
-              "text": "{1} CPU, load {0:.1%}",
-              "value": 250
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          },
-          "jmx": {
-            "property_list": [
-              "java.lang:type=OperatingSystem/SystemCpuLoad",
-              "java.lang:type=OperatingSystem/AvailableProcessors"
-            ],
-            "value": "{0} * 100"
-          }
+          "type": "SCRIPT",
+          "path": "PERF/1.0/services/HBASE/package/alerts/hbase_master_process.py",
+          "parameters": []
         }
       }
     ],
@@ -103,23 +23,11 @@
         "description": "This host-level alert is triggered if the RegionServer processes cannot be confirmed to be up and listening on the network for the configured critical threshold, given in seconds.",
         "interval": 1,
         "scope": "HOST",
+        "enabled": true,
         "source": {
-          "type": "PORT",
-          "uri": "{{hbase-site/hbase.regionserver.info.port}}",
-          "default_port": 60030,
-          "reporting": {
-            "ok": {
-              "text": "TCP OK - {0:.3f}s response on port {1}"
-            },
-            "warning": {
-              "text": "TCP OK - {0:.3f}s response on port {1}",
-              "value": 1.5
-            },
-            "critical": {
-              "text": "Connection failed: {0} to {1}:{2}",
-              "value": 5.0
-            }
-          }
+          "type": "SCRIPT",
+          "path": "PERF/1.0/services/HBASE/package/alerts/hbase_regionserver_process.py",
+          "parameters": []
         }
       }
     ]

+ 75 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/HBASE/configuration/hbase-alert-config.xml

@@ -0,0 +1,75 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<configuration xmlns:xi="http://www.w3.org/2001/XInclude" supports_final="true">
+
+    <property>
+        <name>alert.behavior.type</name>
+        <value>percentage</value>
+        <description>
+            This property describes type of alert behaviour.
+            There are three types percentage, timeout, flip.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.success.percentage</name>
+        <value>100</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "percentage". Here you should set percent of successful
+            alert checks.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.timeout.return.value</name>
+        <value>false</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set result which alert will
+            return after timeout, false|true|none.
+        </description>
+    </property>
+
+    <property>
+        <name>alert.timeout.secs</name>
+        <value>120</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set number of seconds for
+            alert to sleep.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.flip.interval.mins</name>
+        <value>3</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "flip". Here you should set number of minutes at which
+            the alert should flip from true|false.
+        </description>
+    </property>
+
+
+</configuration>

+ 1 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/HBASE/metainfo.xml

@@ -168,6 +168,7 @@
       <osSpecifics></osSpecifics>
 
       <configuration-dependencies>
+        <config-type>hbase-alert-config</config-type>
         <config-type>core-site</config-type> <!-- hbase puts core-site in it's folder -->
         <config-type>hbase-policy</config-type>
         <config-type>hbase-site</config-type>

+ 59 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/HBASE/package/alerts/hbase_master_process.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
+
+ALERT_BEHAVIOUR_TYPE = "{{hbase-alert-config/alert.behavior.type}}"
+
+ALERT_SUCCESS_PERCENTAGE = "{{hbase-alert-config/alert.success.percentage}}"
+
+ALERT_TIMEOUT_RETURN_VALUE = "{{hbase-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{hbase-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{hbase-alert-config/alert.flip.interval.mins}}"
+
+logger = logging.getLogger('ambari_alerts')
+
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 59 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/HBASE/package/alerts/hbase_regionserver_process.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
+
+ALERT_BEHAVIOUR_TYPE = "{{hbase-alert-config/alert.behavior.type}}"
+
+ALERT_SUCCESS_PERCENTAGE = "{{hbase-alert-config/alert.success.percentage}}"
+
+ALERT_TIMEOUT_RETURN_VALUE = "{{hbase-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{hbase-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{hbase-alert-config/alert.flip.interval.mins}}"
+
+logger = logging.getLogger('ambari_alerts')
+
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 31 - 1697
ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/alerts.json

@@ -1,469 +1,21 @@
 {
   "HDFS":{
-    "service": [
-      {
-        "name": "datanode_process_percent",
-        "label": "Percent DataNodes Available",
-        "description": "This alert is triggered if the number of down DataNodes in the cluster is greater than the configured critical threshold. It aggregates the results of DataNode process checks.",
-        "interval": 1,
-        "scope": "SERVICE",
-        "enabled": true,
-        "source": {
-          "type": "AGGREGATE",
-          "alert_name": "datanode_process",
-          "reporting": {
-            "ok": {
-              "text": "affected: [{1}], total: [{0}]"
-            },
-            "warning": {
-              "text": "affected: [{1}], total: [{0}]",
-              "value": 10
-            },
-            "critical": {
-              "text": "affected: [{1}], total: [{0}]",
-              "value": 30
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          }
-        }
-      },
-      {
-        "name": "datanode_storage_percent",
-        "label": "Percent DataNodes With Available Space",
-        "description": "This service-level alert is triggered if the storage on a certain percentage of DataNodes exceeds either the warning or critical threshold values.",
-        "interval": 1,
-        "scope": "SERVICE",
-        "enabled": true,
-        "source": {
-          "type": "AGGREGATE",
-          "alert_name": "datanode_storage",
-          "reporting": {
-            "ok": {
-              "text": "affected: [{1}], total: [{0}]"
-            },
-            "warning": {
-              "text": "affected: [{1}], total: [{0}]",
-              "value": 10
-            },
-            "critical": {
-              "text": "affected: [{1}], total: [{0}]",
-              "value": 30
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          }
-        }
-      },
-      {
-        "name": "journalnode_process_percent",
-        "label": "Percent JournalNodes Available",
-        "description": "This alert is triggered if the number of down JournalNodes in the cluster is greater than the configured critical threshold. It aggregates the results of JournalNode process checks.",
-        "interval": 1,
-        "scope": "SERVICE",
-        "enabled": true,
-        "source": {
-          "type": "AGGREGATE",
-          "alert_name": "journalnode_process",
-          "reporting": {
-            "ok": {
-              "text": "affected: [{1}], total: [{0}]"
-            },
-            "warning": {
-              "text": "affected: [{1}], total: [{0}]",
-              "value": 33
-            },
-            "critical": {
-              "text": "affected: [{1}], total: [{0}]",
-              "value": 50
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          }
-        }
-      }
-    ],
     "NAMENODE": [
-      {
-        "name": "namenode_webui",
-        "label": "NameNode Web UI",
-        "description": "This host-level alert is triggered if the NameNode Web UI is unreachable.",
-        "interval": 1,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "WEB",
-          "uri": {
-            "http": "{{hdfs-site/dfs.namenode.http-address}}",
-            "https": "{{hdfs-site/dfs.namenode.https-address}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "connection_timeout": 5.0,
-            "high_availability": {
-              "nameservice": "{{hdfs-site/dfs.internal.nameservices}}",
-              "alias_key" : "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}",
-              "http_pattern" : "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}",
-              "https_pattern" : "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}"
-            }
-          },
-          "reporting": {
-            "ok": {
-              "text": "HTTP {0} response in {2:.3f}s"
-            },
-            "warning":{
-              "text": "HTTP {0} response from {1} in {2:.3f}s ({3})"
-            },
-            "critical": {
-              "text": "Connection failed to {1} ({3})"
-            }
-          }
-        }
-      },
+
       {
         "name": "upgrade_finalized_state",
         "label": "HDFS Upgrade Finalized State",
         "description": "This service-level alert is triggered if HDFS is not in the finalized state",
         "interval": 1,
-        "scope": "SERVICE",
+        "scope": "HOST",
         "enabled": true,
         "source": {
           "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py",
+          "path": "PERF/1.0/services/HDFS/package/alerts/alert_upgrade_finalized.py",
           "parameters": []
         }
       },
-      {
-        "name": "namenode_cpu",
-        "label": "NameNode Host CPU Utilization",
-        "description": "This host-level alert is triggered if CPU utilization of the NameNode exceeds certain warning and critical thresholds. It checks the NameNode JMX Servlet for the SystemCPULoad property. The threshold values are in percent.",
-        "interval": 5,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{hdfs-site/dfs.namenode.http-address}}",
-            "https": "{{hdfs-site/dfs.namenode.https-address}}",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0,
-            "high_availability": {
-              "nameservice": "{{hdfs-site/dfs.internal.nameservices}}",
-              "alias_key" : "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}",
-              "http_pattern" : "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}",
-              "https_pattern" : "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}"
-            }
-          },
-          "reporting": {
-            "ok": {
-              "text": "{1} CPU, load {0:.1%}"
-            },
-            "warning": {
-              "text": "{1} CPU, load {0:.1%}",
-              "value": 200
-            },
-            "critical": {
-              "text": "{1} CPU, load {0:.1%}",
-              "value": 250
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          },
-          "jmx": {
-            "property_list": [
-              "java.lang:type=OperatingSystem/SystemCpuLoad",
-              "java.lang:type=OperatingSystem/AvailableProcessors"
-            ],
-            "value": "{0} * 100"
-          }
-        }
-      },
-      {
-        "name": "namenode_hdfs_blocks_health",
-        "label": "NameNode Blocks Health",
-        "description": "This service-level alert is triggered if the number of corrupt or missing blocks exceeds the configured critical threshold. The threshold values are in blocks.",
-        "interval": 2,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{hdfs-site/dfs.namenode.http-address}}",
-            "https": "{{hdfs-site/dfs.namenode.https-address}}",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0,
-            "high_availability": {
-              "nameservice": "{{hdfs-site/dfs.internal.nameservices}}",
-              "alias_key" : "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}",
-              "http_pattern" : "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}",
-              "https_pattern" : "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}"
-            }
-          },
-          "reporting": {
-            "ok": {
-              "text": "Total Blocks:[{1}], Missing Blocks:[{0}]"
-            },
-            "warning": {
-              "text": "Total Blocks:[{1}], Missing Blocks:[{0}]",
-              "value": 1
-            },          
-            "critical": {
-              "text": "Total Blocks:[{1}], Missing Blocks:[{0}]",
-              "value": 1
-            },
-            "units" : "Blocks"
-          },
-          "jmx": {
-            "property_list": [
-              "Hadoop:service=NameNode,name=FSNamesystem/MissingBlocks",
-              "Hadoop:service=NameNode,name=FSNamesystem/BlocksTotal"
-            ],
-            "value": "{0}"
-          }
-        }
-      },
-      {
-        "name": "namenode_hdfs_pending_deletion_blocks",
-        "label": "HDFS Pending Deletion Blocks",
-        "description": "This service-level alert is triggered if the number of blocks pending deletion in HDFS exceeds the configured warning and critical thresholds. It checks the NameNode JMX Servlet for the PendingDeletionBlock property.",
-        "interval": 2,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{hdfs-site/dfs.namenode.http-address}}",
-            "https": "{{hdfs-site/dfs.namenode.https-address}}",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0,
-            "high_availability": {
-              "nameservice": "{{hdfs-site/dfs.internal.nameservices}}",
-              "alias_key" : "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}",
-              "http_pattern" : "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}",
-              "https_pattern" : "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}"
-            }
-          },
-          "reporting": {
-            "ok": {
-              "text": "Pending Deletion Blocks:[{0}]"
-            },
-            "warning": {
-              "text": "Pending Deletion Blocks:[{0}]",
-              "value": 100000
-            },
-            "critical": {
-              "text": "Pending Deletion Blocks:[{0}]",
-              "value": 100000
-            },
-            "units" : "Blocks"
-          },
-          "jmx": {
-            "property_list": [
-              "Hadoop:service=NameNode,name=FSNamesystem/PendingDeletionBlocks"
-            ],
-            "value": "{0}"
-          }
-        }
-      },
-      {
-        "name": "namenode_hdfs_capacity_utilization",
-        "label": "HDFS Capacity Utilization",
-        "description": "This service-level alert is triggered if the HDFS capacity utilization exceeds the configured warning and critical thresholds. It checks the NameNode JMX Servlet for the CapacityUsed and CapacityRemaining properties. The threshold values are in percent.",
-        "interval": 2,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{hdfs-site/dfs.namenode.http-address}}",
-            "https": "{{hdfs-site/dfs.namenode.https-address}}",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0,
-            "high_availability": {
-              "nameservice": "{{hdfs-site/dfs.internal.nameservices}}",
-              "alias_key" : "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}",
-              "http_pattern" : "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}",
-              "https_pattern" : "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}"
-            }
-          },
-          "reporting": {
-            "ok": {
-              "text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]"
-            },
-            "warning": {
-              "text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]",
-              "value": 75
-            },          
-            "critical": {
-              "text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]",
-              "value": 80
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          },
-          "jmx": {
-            "property_list": [
-              "Hadoop:service=NameNode,name=FSNamesystemState/CapacityUsed",
-              "Hadoop:service=NameNode,name=FSNamesystemState/CapacityRemaining"
-            ],
-            "value": "{0}/({0} + {1}) * 100.0"
-          }
-        }
-      },
-      {
-        "name": "namenode_rpc_latency",
-        "label": "NameNode RPC Latency",
-        "description": "This host-level alert is triggered if the NameNode RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for NameNode operations. The threshold values are in milliseconds.",
-        "interval": 2,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{hdfs-site/dfs.namenode.http-address}}",
-            "https": "{{hdfs-site/dfs.namenode.https-address}}",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0,
-            "high_availability": {
-              "nameservice": "{{hdfs-site/dfs.internal.nameservices}}",
-              "alias_key" : "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}",
-              "http_pattern" : "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}",
-              "https_pattern" : "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}"
-            }
-          },
-          "reporting": {
-            "ok": {
-              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]"
-            },
-            "warning": {
-              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
-              "value": 3000
-            },          
-            "critical": {
-              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
-              "value": 5000
-            },
-            "units" : "ms"
-          },
-          "jmx": {
-            "property_list": [
-              "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcQueueTimeAvgTime",
-              "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcProcessingTimeAvgTime"
-            ],
-            "value": "{0}"
-          }
-        }
-      },
-      {
-        "name": "namenode_directory_status",
-        "label": "NameNode Directory Status",
-        "description": "This host-level alert is triggered if the NameNode NameDirStatuses metric (name=NameNodeInfo/NameDirStatuses) reports a failed directory. The threshold values are in the number of directories that are not healthy.",
-        "interval": 1,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{hdfs-site/dfs.namenode.http-address}}",
-            "https": "{{hdfs-site/dfs.namenode.https-address}}",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0,
-            "high_availability": {
-              "nameservice": "{{hdfs-site/dfs.internal.nameservices}}",
-              "alias_key" : "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}",
-              "http_pattern" : "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}",
-              "https_pattern" : "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}"
-            }
-          },
-          "reporting": {
-            "ok": {
-              "text": "Directories are healthy"
-            },
-            "warning": {
-              "text": "Failed directory count: {1}",
-              "value": 1
-            },          
-            "critical": {
-              "text": "Failed directory count: {1}",
-              "value": 1
-            },
-            "units" : "Dirs"
-          },
-          "jmx": {
-            "property_list": [
-              "Hadoop:service=NameNode,name=NameNodeInfo/NameDirStatuses"
-            ],
-            "value": "calculate(args)\ndef calculate(args):\n  import json\n  json_statuses = json.loads({0})\n  return len(json_statuses['failed']) if 'failed' in json_statuses else 0"
-          }
-        }
-      },
-      {
-        "name": "datanode_health_summary",
-        "label": "DataNode Health Summary",
-        "description": "This service-level alert is triggered if there are unhealthy DataNodes",
-        "interval": 1,
-        "scope": "SERVICE",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{hdfs-site/dfs.namenode.http-address}}",
-            "https": "{{hdfs-site/dfs.namenode.https-address}}",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0,
-            "high_availability": {
-              "nameservice": "{{hdfs-site/dfs.internal.nameservices}}",
-              "alias_key": "{{hdfs-site/dfs.ha.namenodes.{{ha-nameservice}}}}",
-              "http_pattern": "{{hdfs-site/dfs.namenode.http-address.{{ha-nameservice}}.{{alias}}}}",
-              "https_pattern": "{{hdfs-site/dfs.namenode.https-address.{{ha-nameservice}}.{{alias}}}}"
-            }
-          },
-          "reporting": {
-            "ok": {
-              "text": "All {2} DataNode(s) are healthy"
-            },
-            "warning": {
-              "text": "DataNode Health: [Live={2}, Stale={1}, Dead={0}]",
-              "value": 1
-            },
-            "critical": {
-              "text": "DataNode Health: [Live={2}, Stale={1}, Dead={0}]",
-              "value": 1
-            },
-            "units": "DNs"
-          },
-          "jmx": {
-            "property_list": [
-              "Hadoop:service=NameNode,name=FSNamesystemState/NumDeadDataNodes",
-              "Hadoop:service=NameNode,name=FSNamesystemState/NumStaleDataNodes",
-              "Hadoop:service=NameNode,name=FSNamesystemState/NumLiveDataNodes"
-            ],
-            "value": "{0} + {1}"
-          }
-        }
-      },
+
       {
         "name": "namenode_last_checkpoint",
         "label": "NameNode Last Checkpoint",
@@ -473,7 +25,7 @@
         "enabled": true,
         "source": {
           "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py",
+          "path": "PERF/1.0/services/HDFS/package/alerts/alert_checkpoint_time.py",
           "parameters": [
             {
               "name": "connection.timeout",
@@ -520,1265 +72,47 @@
             }
           ]
         }
-      },
+      }
+    ],
+    "SECONDARY_NAMENODE": [
       {
-        "name": "namenode_ha_health",
-        "label": "NameNode High Availability Health",
-        "description": "This service-level alert is triggered if either the Active NameNode or Standby NameNode are not running.",
+        "name": "secondary_namenode_process",
+        "label": "Secondary NameNode Process",
+        "description": "This host-level alert is triggered if the Secondary NameNode process cannot be confirmed to be up and listening on the network.",
         "interval": 1,
-        "scope": "ANY",
-        "enabled": true,
-        "ignore_host": true,
-        "source": {
-          "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py",
-          "parameters": [
-            {
-              "name": "connection.timeout",
-              "display_name": "Connection Timeout",
-              "value": 5.0,
-              "type": "NUMERIC",
-              "description": "The maximum time before this alert is considered to be CRITICAL",
-              "units": "seconds",
-              "threshold": "CRITICAL"
-            }
-          ]
-        }
-      },
-      {
-        "name": "namenode_service_rpc_queue_latency_hourly",
-        "label": "NameNode Service RPC Queue Latency (Hourly)",
-        "description": "This service-level alert is triggered if the deviation of RPC queue latency on datanode port has grown beyond the specified threshold within an hour period.",
-        "interval": 5,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 60,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "rpc.rpc.datanode.RpcQueueTimeAvgTime",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 100,
-              "description": "The percentage of RPC queue latency growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 200,
-              "description": "The percentage of RPC queue latency growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Latency",
-              "value": 30,
-              "type": "NUMERIC",
-              "units": "seconds",
-              "description": "The minimum latency to measure growth."
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "ms",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            }
-          ]
-        }
-      },
-      {
-        "name": "namenode_client_rpc_queue_latency_hourly",
-        "label": "NameNode Client RPC Queue Latency (Hourly)",
-        "description": "This service-level alert is triggered if the deviation of RPC queue latency on client port has grown beyond the specified threshold within an hour period.",
-        "interval": 5,
-        "scope": "ANY",
+        "scope": "HOST",
         "enabled": true,
         "source": {
           "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 60,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "rpc.rpc.client.RpcQueueTimeAvgTime",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 100,
-              "description": "The percentage of RPC queue latency growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 200,
-              "description": "The percentage of RPC queue latency growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Latency",
-              "value": 30,
-              "type": "NUMERIC",
-              "units": "seconds",
-              "description": "The minimum latency to measure growth."
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "ms",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            }
-          ]
+          "path": "PERF/1.0/services/HDFS/package/alerts/alert_snamenode_process.py"
         }
-      },
+      }
+    ],
+    "NFS_GATEWAY": [
       {
-        "name": "namenode_service_rpc_processing_latency_hourly",
-        "label": "NameNode Service RPC Processing Latency (Hourly)",
-        "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within an hour period.",
-        "interval": 5,
-        "scope": "ANY",
+        "name": "nfsgateway_process",
+        "label": "NFS Gateway Process",
+        "description": "This host-level alert is triggered if the NFS Gateway process cannot be confirmed to be up and listening on the network.",
+        "interval": 1,
+        "scope": "HOST",
         "enabled": true,
         "source": {
           "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 60,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "rpc.rpc.datanode.RpcProcessingTimeAvgTime",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 100,
-              "description": "The percentage of RPC processing latency growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 200,
-              "description": "The percentage of RPC processing latency growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Latency",
-              "value": 30,
-              "type": "NUMERIC",
-              "units": "seconds",
-              "description": "The minimum latency to measure growth."
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "ms",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            }
-          ]
+          "path": "PERF/1.0/services/HDFS/package/alerts/alert_nfs_gateway_process.py"
         }
-      },
+      }
+    ],
+    "DATANODE": [
       {
-        "name": "namenode_client_rpc_processing_latency_hourly",
-        "label": "NameNode Client RPC Processing Latency (Hourly)",
-        "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within an hour period.",
-        "interval": 5,
-        "scope": "ANY",
+        "name": "datanode_unmounted_data_dir",
+        "label": "DataNode Unmounted Data Dir",
+        "description": "This host-level alert is triggered if one of the data directories on a host was previously on a mount point and became unmounted. If the mount history file does not exist, then report an error if a host has one or more mounted data directories as well as one or more unmounted data directories on the root partition. This may indicate that a data directory is writing to the root partition, which is undesirable.",
+        "interval": 1,
+        "scope": "HOST",
         "enabled": true,
         "source": {
           "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 60,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "rpc.rpc.client.RpcProcessingTimeAvgTime",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 100,
-              "description": "The percentage of RPC processing latency growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 200,
-              "description": "The percentage of RPC processing latency growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Latency",
-              "value": 30,
-              "type": "NUMERIC",
-              "units": "seconds",
-              "description": "The minimum latency to measure growth."
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "ms",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            }
-          ]
-        }
-      },
-      {
-        "name": "increase_nn_heap_usage_daily",
-        "label": "NameNode Heap Usage (Daily)",
-        "description": "This service-level alert is triggered if the NameNode heap usage deviation has grown beyond the specified threshold within a day period.",
-        "interval": 480,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 1440,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "jvm.JvmMetrics.MemHeapUsedM",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 20,
-              "description": "The percentage of NameNode heap usage growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 50,
-              "description": "The percentage of NameNode heap usage growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "MB",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Heap",
-              "value": 100,
-              "type": "NUMERIC",
-              "units": "MB",
-              "description": "The minimum heap increase in a day."
-            }
-          ]
-        }
-      },
-      {
-        "name": "namenode_service_rpc_processing_latency_daily",
-        "label": "NameNode Service RPC Processing Latency (Daily)",
-        "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a day period.",
-        "interval": 480,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 1440,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "rpc.rpc.datanode.RpcProcessingTimeAvgTime",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 100,
-              "description": "The percentage of RPC processing latency growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 200,
-              "description": "The percentage of RPC processing latency growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Latency",
-              "value": 30,
-              "type": "NUMERIC",
-              "units": "seconds",
-              "description": "The minimum latency to measure growth."
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "ms",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            }
-          ]
-        }
-      },
-      {
-        "name": "namenode_client_rpc_processing_latency_daily",
-        "label": "NameNode Client RPC Processing Latency (Daily)",
-        "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a day period.",
-        "interval": 480,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 1440,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "rpc.rpc.client.RpcProcessingTimeAvgTime",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 100,
-              "description": "The percentage of RPC processing latency growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 200,
-              "description": "The percentage of RPC processing latency growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Latency",
-              "value": 30,
-              "type": "NUMERIC",
-              "units": "seconds",
-              "description": "The minimum latency to measure growth."
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "ms",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            }
-          ]
-        }
-      },
-      {
-        "name": "namenode_service_rpc_queue_latency_daily",
-        "label": "NameNode Service RPC Queue Latency (Daily)",
-        "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a day period.",
-        "interval": 480,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 1440,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "rpc.rpc.datanode.RpcQueueTimeAvgTime",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 100,
-              "description": "The percentage of RPC queue latency growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 200,
-              "description": "The percentage of RPC queue latency growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Latency",
-              "value": 30,
-              "type": "NUMERIC",
-              "units": "seconds",
-              "description": "The minimum latency to measure growth."
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "MB",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            }
-          ]
-        }
-      },
-      {
-        "name": "namenode_client_rpc_queue_latency_daily",
-        "label": "NameNode Client RPC Queue Latency (Daily)",
-        "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a day period.",
-        "interval": 480,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 1440,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "rpc.rpc.client.RpcQueueTimeAvgTime",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 100,
-              "description": "The percentage of RPC queue latency growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 200,
-              "description": "The percentage of RPC queue latency growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Latency",
-              "value": 30,
-              "type": "NUMERIC",
-              "units": "seconds",
-              "description": "The minimum latency to measure growth."
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "ms",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            }
-          ]
-        }
-      },
-      {
-        "name": "namenode_increase_in_storage_capacity_usage_daily",
-        "label": "HDFS Storage Capacity Usage (Daily)",
-        "description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a day period.",
-        "interval": 480,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 1440,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "dfs.FSNamesystem.CapacityUsed",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 30,
-              "description": "The percentage of storage capacity usage growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 50,
-              "description": "The percentage of storage capacity usage growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "B",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Capacity",
-              "value": 100,
-              "type": "NUMERIC",
-              "units": "MB",
-              "description": "The minimum capacity increase in a day."
-            }
-          ]
-        }
-      },
-      {
-        "name": "increase_nn_heap_usage_weekly",
-        "label": "NameNode Heap Usage (Weekly)",
-        "description": "This service-level alert is triggered if the NameNode heap usage deviation has grown beyond the specified threshold within a week period.",
-        "interval": 1440,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 10080,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "jvm.JvmMetrics.MemHeapUsedM",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 20,
-              "description": "The percentage of NameNode heap usage growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 50,
-              "description": "The percentage of NameNode heap usage growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "MB",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Heap",
-              "value": 1000,
-              "type": "NUMERIC",
-              "units": "MB",
-              "description": "The minimum heap increase in a week."
-            }
-          ]
-        }
-      },
-      {
-        "name": "namenode_increase_in_storage_capacity_usage_weekly",
-        "label": "HDFS Storage Capacity Usage (Weekly)",
-        "description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a week period.",
-        "interval": 1440,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
-          "parameters": [
-            {
-              "name": "mergeHaMetrics",
-              "display_name": "Whether active and stanby NameNodes metrics should be merged",
-              "value": "false",
-              "type": "STRING",
-              "description": "Whether active and stanby NameNodes metrics should be merged.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "interval",
-              "display_name": "Time interval in minutes",
-              "value": 10080,
-              "type": "NUMERIC",
-              "description": "Time interval in minutes.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "appId",
-              "display_name": "AMS application id",
-              "value": "NAMENODE",
-              "type": "STRING",
-              "description": "The application id used to retrieve the metric.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metricName",
-              "display_name": "Metric Name",
-              "value": "dfs.FSNamesystem.CapacityUsed",
-              "type": "STRING",
-              "description": "The metric to monitor.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "metric.deviation.warning.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 10,
-              "description": "The percentage of storage capacity usage growth.",
-              "threshold": "WARNING"
-            },
-            {
-              "name": "metric.deviation.critical.threshold",
-              "display_name": "Growth Rate",
-              "type": "PERCENT",
-              "units": "%",
-              "value": 20,
-              "description": "The percentage of storage capacity usage growth.",
-              "threshold": "CRITICAL"
-            },
-            {
-              "name": "metric.units",
-              "display_name": "Metric Units",
-              "type": "STRING",
-              "value": "B",
-              "description": "The units that the metric data points are reported in.",
-              "visibility": "HIDDEN"
-            },
-            {
-              "name": "minimumValue",
-              "display_name": "Minimum Capacity",
-              "value": 1000,
-              "type": "NUMERIC",
-              "units": "MB",
-              "description": "The minimum capacity increase in a week."
-            }
-          ]
-        }
-      }
-    ],
-    "SECONDARY_NAMENODE": [
-      {
-        "name": "secondary_namenode_process",
-        "label": "Secondary NameNode Process",
-        "description": "This host-level alert is triggered if the Secondary NameNode process cannot be confirmed to be up and listening on the network.",
-        "interval": 1,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "WEB",
-          "uri": {
-            "http": "{{hdfs-site/dfs.namenode.secondary.http-address}}",
-            "https": "{{hdfs-site/dfs.namenode.secondary.https-address}}",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY"
-          },
-          "reporting": {
-            "ok": {
-              "text": "HTTP {0} response in {2:.3f}s"
-            },
-            "warning":{
-              "text": "HTTP {0} response from {1} in {2:.3f}s ({3})"
-            },
-            "critical": {
-              "text": "Connection failed to {1} ({3})"
-            }
-          }
-        }
-      }
-    ],
-    "NFS_GATEWAY": [
-      {
-        "name": "nfsgateway_process",
-        "label": "NFS Gateway Process",
-        "description": "This host-level alert is triggered if the NFS Gateway process cannot be confirmed to be up and listening on the network.",
-        "interval": 1,
-        "scope": "HOST",
-        "enabled": true,
-        "source": {
-          "type": "PORT",
-          "uri": "{{hdfs-site/nfs.server.port}}",
-          "default_port": 2049,
-          "reporting": {
-            "ok": {
-              "text": "TCP OK - {0:.3f}s response on port {1}"
-            },
-            "warning": {
-              "text": "TCP OK - {0:.3f}s response on port {1}",
-              "value": 1.5
-            },
-            "critical": {
-              "text": "Connection failed: {0} to {1}:{2}",
-              "value": 5.0
-            }
-          }
-        }
-      }
-    ],
-    "JOURNALNODE": [
-      {
-        "name": "journalnode_process",
-        "label": "JournalNode Web UI",
-        "description": "This host-level alert is triggered if the JournalNode Web UI is unreachable.",
-        "interval": 1,
-        "scope": "HOST",
-        "enabled": true,
-        "source": {
-          "type": "WEB",
-          "uri": {
-            "http": "{{hdfs-site/dfs.journalnode.http-address}}",
-            "https": "{{hdfs-site/dfs.journalnode.https-address}}",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0
-          },
-          "reporting": {
-            "ok": {
-              "text": "HTTP {0} response in {2:.3f}s"
-            },
-            "warning": {
-              "text": "HTTP {0} response from {1} in {2:.3f}s ({3})"
-            },
-            "critical": {
-              "text": "Connection failed to {1} ({3})"
-            }
-          }
-        }
-      }
-    ],      
-    "DATANODE": [
-      {
-        "name": "datanode_process",
-        "label": "DataNode Process",
-        "description": "This host-level alert is triggered if the individual DataNode processes cannot be established to be up and listening on the network.",
-        "interval": 1,
-        "scope": "HOST",
-        "enabled": true,
-        "source": {
-          "type": "PORT",        
-          "uri": "{{hdfs-site/dfs.datanode.address}}",
-          "default_port": 50010,
-          "reporting": {
-            "ok": {
-              "text": "TCP OK - {0:.3f}s response on port {1}"
-            },
-            "warning": {
-              "text": "TCP OK - {0:.3f}s response on port {1}",
-              "value": 1.5
-            },
-            "critical": {
-              "text": "Connection failed: {0} to {1}:{2}",
-              "value": 5.0
-            }
-          }
-        }
-      },
-      {
-        "name": "datanode_webui",
-        "label": "DataNode Web UI",
-        "description": "This host-level alert is triggered if the DataNode Web UI is unreachable.",
-        "interval": 1,
-        "scope": "HOST",
-        "enabled": true,
-        "source": {
-          "type": "WEB",
-          "uri": {
-            "http": "{{hdfs-site/dfs.datanode.http.address}}",
-            "https": "{{hdfs-site/dfs.datanode.https.address}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "connection_timeout": 5.0
-          },
-          "reporting": {
-            "ok": {
-              "text": "HTTP {0} response in {2:.3f}s"
-            },
-            "warning":{
-              "text": "HTTP {0} response from {1} in {2:.3f}s ({3})"
-            },
-            "critical": {
-              "text": "Connection failed to {1} ({3})"
-            }
-          }
-        }
-      },    
-      {
-        "name": "datanode_storage",
-        "label": "DataNode Storage",
-        "description": "This host-level alert is triggered if storage capacity if full on the DataNode. It checks the DataNode JMX Servlet for the Capacity and Remaining properties. The threshold values are in percent.",
-        "interval": 2,
-        "scope": "HOST",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{hdfs-site/dfs.datanode.http.address}}",
-            "https": "{{hdfs-site/dfs.datanode.https.address}}",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0
-          },
-          "reporting": {
-            "ok": {
-              "text": "Remaining Capacity:[{0}], Total Capacity:[{2:.0f}% Used, {1}]"
-            },
-            "warning": {
-              "text": "Remaining Capacity:[{0}], Total Capacity:[{2:.0f}% Used, {1}]",
-              "value": 75
-            },
-            "critical": {
-              "text": "Remaining Capacity:[{0}], Total Capacity:[{2:.0f}% Used, {1}]",
-              "value": 80
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          },
-          "jmx": {
-            "property_list": [
-              "Hadoop:service=DataNode,name=FSDatasetState-*/Remaining",
-              "Hadoop:service=DataNode,name=FSDatasetState-*/Capacity"
-            ],
-            "value": "({1} - {0})/{1} * 100.0"
-          }
-        }
-      },
-      {
-        "name": "datanode_unmounted_data_dir",
-        "label": "DataNode Unmounted Data Dir",
-        "description": "This host-level alert is triggered if one of the data directories on a host was previously on a mount point and became unmounted. If the mount history file does not exist, then report an error if a host has one or more mounted data directories as well as one or more unmounted data directories on the root partition. This may indicate that a data directory is writing to the root partition, which is undesirable.",
-        "interval": 2,
-        "scope": "HOST",
-        "enabled": true,
-        "source": {
-          "type": "SCRIPT",
-          "path": "HDFS/2.1.0.2.0/package/alerts/alert_datanode_unmounted_data_dir.py"
-        }
-      },
-      {
-        "name": "datanode_heap_usage",
-        "label": "DataNode Heap Usage",
-        "description": "This host-level alert is triggered if heap usage goes past thresholds on the DataNode. It checks the DataNode JMXServlet for the MemHeapUsedM and MemHeapMaxM properties. The threshold values are in percent.",
-        "interval": 2,
-        "scope": "HOST",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{hdfs-site/dfs.datanode.http.address}}",
-            "https": "{{hdfs-site/dfs.datanode.https.address}}",
-            "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
-            "https_property": "{{hdfs-site/dfs.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0
-          },
-          "reporting": {
-            "ok": {
-              "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB"
-            },
-            "warning": {
-              "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB",
-              "value": 80
-            },
-            "critical": {
-              "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB",
-              "value": 90
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          },
-          "jmx": {
-            "property_list": [
-              "Hadoop:service=DataNode,name=JvmMetrics/MemHeapUsedM",
-              "Hadoop:service=DataNode,name=JvmMetrics/MemHeapMaxM"
-            ],
-            "value": "100.0 - (({1} - {0})/{1} * 100.0)"
-          }
-        }
-      }
-    ],
-    "ZKFC": [
-      {
-        "name": "hdfs_zookeeper_failover_controller_process",
-        "label": "ZooKeeper Failover Controller Process",
-        "description": "This host-level alert is triggered if the ZooKeeper Failover Controller process cannot be confirmed to be up and listening on the network.",
-        "interval": 1,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "PORT",
-          "uri": "{{hdfs-site/dfs.ha.zkfc.port}}",
-          "default_port": 8019,
-          "reporting": {
-            "ok": {
-              "text": "TCP OK - {0:.3f}s response on port {1}"
-            },
-            "warning": {
-              "text": "TCP OK - {0:.3f}s response on port {1}",
-              "value": 1.5
-            },
-            "critical": {
-              "text": "Connection failed: {0} to {1}:{2}",
-              "value": 5.0
-            }
-          }
+          "path": "PERF/1.0/services/HDFS/package/alerts/alert_datanode_unmounted_data_dir.py"
         }
       }
     ]

+ 75 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/configuration/hdfs-alert-config.xml

@@ -0,0 +1,75 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<configuration xmlns:xi="http://www.w3.org/2001/XInclude" supports_final="true">
+
+    <property>
+        <name>alert.behavior.type</name>
+        <value>percentage</value>
+        <description>
+            This property describes type of alert behaviour.
+            There are three types percentage, timeout, flip.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.success.percentage</name>
+        <value>100</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "percentage". Here you should set percent of successful
+            alert checks.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.timeout.return.value</name>
+        <value>false</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set result which alert will
+            return after timeout, false|true|none.
+        </description>
+    </property>
+
+    <property>
+        <name>alert.timeout.secs</name>
+        <value>120</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set number of seconds for
+            alert to sleep.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.flip.interval.mins</name>
+        <value>3</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "flip". Here you should set number of minutes at which
+            the alert should flip from true|false.
+        </description>
+    </property>
+
+
+</configuration>

+ 1 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/metainfo.xml

@@ -227,6 +227,7 @@
       </requiredServices>
 
       <configuration-dependencies>
+        <config-type>hdfs-alert-config</config-type>
         <config-type>core-site</config-type>
         <config-type>hdfs-site</config-type>
         <config-type>hadoop-env</config-type>

+ 14 - 24
ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_checkpoint_time.py

@@ -20,36 +20,31 @@ limitations under the License.
 
 import logging
 
-RESULT_CODE_OK = 'OK'
-RESULT_CODE_CRITICAL = 'CRITICAL'
-RESULT_CODE_UNKNOWN = 'UNKNOWN'
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
 
-OK_MESSAGE = 'Ok'
+ALERT_BEHAVIOUR_TYPE = "{{hdfs-alert-config/alert.behavior.type}}"
 
-HDFS_SITE_KEY = '{{hdfs-site}}'
+ALERT_SUCCESS_PERCENTAGE = "{{hdfs-alert-config/alert.success.percentage}}"
 
-NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
-NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
-NN_HTTP_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}'
-NN_CHECKPOINT_TX_KEY = '{{hdfs-site/dfs.namenode.checkpoint.txns}}'
-NN_CHECKPOINT_PERIOD_KEY = '{{hdfs-site/dfs.namenode.checkpoint.period}}'
+ALERT_TIMEOUT_RETURN_VALUE = "{{hdfs-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{hdfs-alert-config/alert.timeout.secs}}"
 
-KERBEROS_KEYTAB = '{{hdfs-site/dfs.web.authentication.kerberos.keytab}}'
-KERBEROS_PRINCIPAL = '{{hdfs-site/dfs.web.authentication.kerberos.principal}}'
-SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
-SMOKEUSER_KEY = "{{cluster-env/smokeuser}}"
-EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
+ALERT_FLIP_INTERVAL_MINS = "{{hdfs-alert-config/alert.flip.interval.mins}}"
 
 logger = logging.getLogger('ambari_alerts')
 
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
   to build the dictionary passed into execute
   """
-  return (HDFS_SITE_KEY, NN_HTTP_ADDRESS_KEY, NN_HTTPS_ADDRESS_KEY, NN_HTTP_POLICY_KEY, EXECUTABLE_SEARCH_PATHS,
-      NN_CHECKPOINT_TX_KEY, NN_CHECKPOINT_PERIOD_KEY, KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY, SMOKEUSER_KEY)
-  
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
 
 def execute(configurations={}, parameters={}, host_name=None):
   """
@@ -61,9 +56,4 @@ def execute(configurations={}, parameters={}, host_name=None):
   host_name (string): the name of this host where the alert is running
   """
 
-  if configurations is None:
-    return (('UNKNOWN', ['There were no configurations supplied to the script.']))
-
-  result_code = RESULT_CODE_OK
-  label = OK_MESSAGE
-  return (result_code, [label])
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 16 - 31
ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_datanode_unmounted_data_dir.py

@@ -20,24 +20,30 @@ limitations under the License.
 
 import logging
 
-RESULT_STATE_OK = 'OK'
-RESULT_STATE_WARNING = 'WARNING'
-RESULT_STATE_CRITICAL = 'CRITICAL'
-RESULT_STATE_UNKNOWN = 'UNKNOWN'
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
 
-OK_MESSAGE = 'Ok'
+ALERT_BEHAVIOUR_TYPE = "{{hdfs-alert-config/alert.behavior.type}}"
 
-DFS_DATA_DIR = '{{hdfs-site/dfs.datanode.data.dir}}'
-DATA_DIR_MOUNT_FILE = "/var/lib/ambari-agent/data/datanode/dfs_data_dir_mount.hist"
+ALERT_SUCCESS_PERCENTAGE = "{{hdfs-alert-config/alert.success.percentage}}"
 
-logger = logging.getLogger()
+ALERT_TIMEOUT_RETURN_VALUE = "{{hdfs-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{hdfs-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{hdfs-alert-config/alert.flip.interval.mins}}"
+
+logger = logging.getLogger('ambari_alerts')
+
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
 
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
   to build the dictionary passed into execute
   """
-  return (DFS_DATA_DIR, DATA_DIR_MOUNT_FILE)
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
 
 
 def execute(configurations={}, parameters={}, host_name=None):
@@ -48,27 +54,6 @@ def execute(configurations={}, parameters={}, host_name=None):
   configurations (dictionary): a mapping of configuration key to value
   parameters (dictionary): a mapping of script parameter key to value
   host_name (string): the name of this host where the alert is running
-
-  DataNode directories can be of the following formats and each needs to be supported:
-    /grid/dn/archive0
-    [SSD]/grid/dn/archive0
-    [ARCHIVE]file:///grid/dn/archive0
   """
-  warnings = []
-  errors = []
-
-  if configurations is None:
-    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])
-
-  # Check required properties
-  if DFS_DATA_DIR not in configurations:
-    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(DFS_DATA_DIR)])
-
-  dfs_data_dir = configurations[DFS_DATA_DIR]
-
-  if dfs_data_dir is None:
-    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script and the value is null'.format(DFS_DATA_DIR)])
 
-  result_code = RESULT_STATE_OK
-  label = OK_MESSAGE
-  return (result_code, [label])
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 0 - 75
ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_ha_namenode_health.py

@@ -1,75 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements.  See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership.  The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
-import logging
-
-RESULT_STATE_OK = 'OK'
-RESULT_STATE_CRITICAL = 'CRITICAL'
-RESULT_STATE_UNKNOWN = 'UNKNOWN'
-RESULT_STATE_SKIPPED = 'SKIPPED'
-
-OK_MESSAGE = 'Ok'
-
-HDFS_SITE_KEY = '{{hdfs-site}}'
-NAMESERVICE_KEY = '{{hdfs-site/dfs.internal.nameservices}}'
-NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
-NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
-DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}'
-
-KERBEROS_KEYTAB = '{{hdfs-site/dfs.web.authentication.kerberos.keytab}}'
-KERBEROS_PRINCIPAL = '{{hdfs-site/dfs.web.authentication.kerberos.principal}}'
-SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
-SMOKEUSER_KEY = '{{cluster-env/smokeuser}}'
-EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
-
-logger = logging.getLogger('ambari_alerts')
-
-def get_tokens():
-  """
-  Returns a tuple of tokens in the format {{site/property}} that will be used
-  to build the dictionary passed into execute
-  """
-  return (HDFS_SITE_KEY, NAMESERVICE_KEY, NN_HTTP_ADDRESS_KEY, EXECUTABLE_SEARCH_PATHS,
-  NN_HTTPS_ADDRESS_KEY, DFS_POLICY_KEY, SMOKEUSER_KEY, KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY)
-  
-
-def execute(configurations={}, parameters={}, host_name=None):
-  """
-  Returns a tuple containing the result code and a pre-formatted result label
-
-  Keyword arguments:
-  configurations (dictionary): a mapping of configuration key to value
-  parameters (dictionary): a mapping of script parameter key to value
-  host_name (string): the name of this host where the alert is running
-  """
-  if configurations is None:
-    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])
-
-  # if not in HA mode, then SKIP
-  if not NAMESERVICE_KEY in configurations:
-    return (RESULT_STATE_SKIPPED, ['NameNode HA is not enabled'])
-
-  # hdfs-site is required
-  if not HDFS_SITE_KEY in configurations:
-    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])
-
-  result_code = RESULT_STATE_OK
-  label = OK_MESSAGE
-  return (result_code, [label])

+ 0 - 85
ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_metrics_deviation.py

@@ -1,85 +0,0 @@
-#!/usr/bin/env python
-
-"""
-Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements.  See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership.  The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License.  You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
-import logging
-
-RESULT_STATE_OK = 'OK'
-RESULT_STATE_CRITICAL = 'CRITICAL'
-RESULT_STATE_WARNING = 'WARNING'
-RESULT_STATE_UNKNOWN = 'UNKNOWN'
-RESULT_STATE_SKIPPED = 'SKIPPED'
-
-OK_MESSAGE = 'Ok'
-
-HDFS_SITE_KEY = '{{hdfs-site}}'
-NAMESERVICE_KEY = '{{hdfs-site/dfs.internal.nameservices}}'
-NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
-NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
-DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}'
-
-KERBEROS_KEYTAB = '{{hdfs-site/dfs.web.authentication.kerberos.keytab}}'
-KERBEROS_PRINCIPAL = '{{hdfs-site/dfs.web.authentication.kerberos.principal}}'
-SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
-SMOKEUSER_KEY = '{{cluster-env/smokeuser}}'
-EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
-
-METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY = '{{ams-site/timeline.metrics.service.webapp.address}}'
-METRICS_COLLECTOR_VIP_HOST_KEY = '{{cluster-env/metrics_collector_vip_host}}'
-METRICS_COLLECTOR_VIP_PORT_KEY = '{{cluster-env/metrics_collector_vip_port}}'
-
-logger = logging.getLogger()
-
-
-def get_tokens():
-  """
-  Returns a tuple of tokens in the format {{site/property}} that will be used
-  to build the dictionary passed into execute
-  """
-  return (HDFS_SITE_KEY, NAMESERVICE_KEY, NN_HTTP_ADDRESS_KEY, DFS_POLICY_KEY,
-          EXECUTABLE_SEARCH_PATHS, NN_HTTPS_ADDRESS_KEY, SMOKEUSER_KEY,
-          KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY,
-          METRICS_COLLECTOR_VIP_HOST_KEY, METRICS_COLLECTOR_VIP_PORT_KEY,
-          METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)
-
-def execute(configurations={}, parameters={}, host_name=None):
-  """
-  Returns a tuple containing the result code and a pre-formatted result label
-
-  Keyword arguments:
-  configurations : a mapping of configuration key to value
-  parameters : a mapping of script parameter key to value
-  host_name : the name of this host where the alert is running
-
-  :type configurations dict
-  :type parameters dict
-  :type host_name str
-  """
-
-  #parse configuration
-  if configurations is None:
-    return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])
-
-  # hdfs-site is required
-  if not HDFS_SITE_KEY in configurations:
-    return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])
-
-  result_code = RESULT_STATE_OK
-  label = OK_MESSAGE
-  return (result_code, [label])

+ 59 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_nfs_gateway_process.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
+
+ALERT_BEHAVIOUR_TYPE = "{{hdfs-alert-config/alert.behavior.type}}"
+
+ALERT_SUCCESS_PERCENTAGE = "{{hdfs-alert-config/alert.success.percentage}}"
+
+ALERT_TIMEOUT_RETURN_VALUE = "{{hdfs-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{hdfs-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{hdfs-alert-config/alert.flip.interval.mins}}"
+
+logger = logging.getLogger('ambari_alerts')
+
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 59 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_snamenode_process.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
+
+ALERT_BEHAVIOUR_TYPE = "{{hdfs-alert-config/alert.behavior.type}}"
+
+ALERT_SUCCESS_PERCENTAGE = "{{hdfs-alert-config/alert.success.percentage}}"
+
+ALERT_TIMEOUT_RETURN_VALUE = "{{hdfs-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{hdfs-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{hdfs-alert-config/alert.flip.interval.mins}}"
+
+logger = logging.getLogger('ambari_alerts')
+
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 17 - 32
ambari-server/src/main/resources/stacks/PERF/1.0/services/HDFS/package/alerts/alert_upgrade_finalized.py

@@ -20,32 +20,30 @@ limitations under the License.
 
 import logging
 
-RESULT_STATE_OK = 'OK'
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
 
-OK_MESSAGE = 'Ok'
+ALERT_BEHAVIOUR_TYPE = "{{hdfs-alert-config/alert.behavior.type}}"
 
-NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
-NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
-NN_HTTP_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}'
+ALERT_SUCCESS_PERCENTAGE = "{{hdfs-alert-config/alert.success.percentage}}"
 
-HDFS_SITE_KEY = '{{hdfs-site}}'
-KERBEROS_KEYTAB = '{{hdfs-site/dfs.web.authentication.kerberos.keytab}}'
-KERBEROS_PRINCIPAL = '{{hdfs-site/dfs.web.authentication.kerberos.principal}}'
-SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
-SMOKEUSER_KEY = "{{cluster-env/smokeuser}}"
-EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
+ALERT_TIMEOUT_RETURN_VALUE = "{{hdfs-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{hdfs-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{hdfs-alert-config/alert.flip.interval.mins}}"
 
 logger = logging.getLogger('ambari_alerts')
 
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
   to build the dictionary passed into execute
-
-  :rtype tuple
   """
-  return (HDFS_SITE_KEY, NN_HTTP_ADDRESS_KEY, NN_HTTPS_ADDRESS_KEY, NN_HTTP_POLICY_KEY, EXECUTABLE_SEARCH_PATHS,
-          KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY, SMOKEUSER_KEY)
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
 
 
 def execute(configurations={}, parameters={}, host_name=None):
@@ -53,22 +51,9 @@ def execute(configurations={}, parameters={}, host_name=None):
   Returns a tuple containing the result code and a pre-formatted result label
 
   Keyword arguments:
-  configurations : a mapping of configuration key to value
-  parameters : a mapping of script parameter key to value
-  host_name : the name of this host where the alert is running
-
-  :type configurations dict
-  :type parameters dict
-  :type host_name str
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
   """
 
-  if configurations is None:
-    return (('UNKNOWN', ['There were no configurations supplied to the script.']))
-
-  # hdfs-site is required
-  if not HDFS_SITE_KEY in configurations:
-    return 'SKIPPED', ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)]
-
-  result_code = RESULT_STATE_OK
-  label = OK_MESSAGE
-  return (result_code, [label])
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 20 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/SLEEPY/alerts.json

@@ -0,0 +1,20 @@
+{
+    "SLEEPY": {
+
+        "SLEEPY": [
+            {
+                "name": "sleepy_process",
+                "label": "Sleepy Process",
+                "description": "Alert for sleepy component process status",
+                "interval": 1,
+                "scope": "HOST",
+                "enabled": true,
+                "source": {
+                    "type": "SCRIPT",
+                    "path": "PERF/1.0/services/SLEEPY/package/alerts/alert_sleepy_process.py",
+                    "parameters": []
+                }
+            }
+        ]
+    }
+}

+ 75 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/SLEEPY/configuration/sleepy-alert-config.xml

@@ -0,0 +1,75 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<configuration xmlns:xi="http://www.w3.org/2001/XInclude" supports_final="true">
+
+    <property>
+        <name>alert.behavior.type</name>
+        <value>percentage</value>
+        <description>
+            This property describes type of alert behaviour.
+            There are three types percentage, timeout, flip.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.success.percentage</name>
+        <value>100</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "percentage". Here you should set percent of successful
+            alert checks.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.timeout.return.value</name>
+        <value>false</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set result which alert will
+            return after timeout, false|true|none.
+        </description>
+    </property>
+
+    <property>
+        <name>alert.timeout.secs</name>
+        <value>120</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set number of seconds for
+            alert to sleep.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.flip.interval.mins</name>
+        <value>3</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "flip". Here you should set number of minutes at which
+            the alert should flip from true|false.
+        </description>
+    </property>
+
+
+</configuration>

+ 5 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/SLEEPY/metainfo.xml

@@ -52,6 +52,11 @@
           <default>true</default>
         </theme>
       </themes>
+
+      <configuration-dependencies>
+        <config-type>sleepy-alert-config</config-type>
+      </configuration-dependencies>
+
     </service>
   </services>
 </metainfo>

+ 59 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/SLEEPY/package/alerts/alert_sleepy_process.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
+
+ALERT_BEHAVIOUR_TYPE = "{{sleepy-alert-config/alert.behavior.type}}"
+
+ALERT_SUCCESS_PERCENTAGE = "{{sleepy-alert-config/alert.success.percentage}}"
+
+ALERT_TIMEOUT_RETURN_VALUE = "{{sleepy-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{sleepy-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{sleepy-alert-config/alert.flip.interval.mins}}"
+
+logger = logging.getLogger('ambari_alerts')
+
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 20 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/SNOW/alerts.json

@@ -0,0 +1,20 @@
+{
+    "SNOW": {
+
+        "SNOW_WHITE": [
+            {
+                "name": "snow_process",
+                "label": "Snow Process",
+                "description": "Alert for snow component process status",
+                "interval": 1,
+                "scope": "HOST",
+                "enabled": true,
+                "source": {
+                    "type": "SCRIPT",
+                    "path": "PERF/1.0/services/SNOW/package/alerts/alert_snow_process.py",
+                    "parameters": []
+                }
+            }
+        ]
+    }
+}

+ 75 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/SNOW/configuration/snow-alert-config.xml

@@ -0,0 +1,75 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<configuration xmlns:xi="http://www.w3.org/2001/XInclude" supports_final="true">
+
+    <property>
+        <name>alert.behavior.type</name>
+        <value>percentage</value>
+        <description>
+            This property describes type of alert behaviour.
+            There are three types percentage, timeout, flip.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.success.percentage</name>
+        <value>100</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "percentage". Here you should set percent of successful
+            alert checks.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.timeout.return.value</name>
+        <value>false</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set result which alert will
+            return after timeout, false|true|none.
+        </description>
+    </property>
+
+    <property>
+        <name>alert.timeout.secs</name>
+        <value>120</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set number of seconds for
+            alert to sleep.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.flip.interval.mins</name>
+        <value>3</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "flip". Here you should set number of minutes at which
+            the alert should flip from true|false.
+        </description>
+    </property>
+
+
+</configuration>

+ 5 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/SNOW/metainfo.xml

@@ -52,6 +52,11 @@
           <default>true</default>
         </theme>
       </themes>
+
+      <configuration-dependencies>
+        <config-type>snow-alert-config</config-type>
+      </configuration-dependencies>
+
     </service>
   </services>
 </metainfo>

+ 59 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/SNOW/package/alerts/alert_snow_process.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
+
+ALERT_BEHAVIOUR_TYPE = "{{snow-alert-config/alert.behavior.type}}"
+
+ALERT_SUCCESS_PERCENTAGE = "{{snow-alert-config/alert.success.percentage}}"
+
+ALERT_TIMEOUT_RETURN_VALUE = "{{snow-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{snow-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{snow-alert-config/alert.flip.interval.mins}}"
+
+logger = logging.getLogger('ambari_alerts')
+
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 23 - 338
ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/alerts.json

@@ -1,184 +1,24 @@
 {
   "MAPREDUCE2": {
-    "service": [],
     "HISTORYSERVER": [
       {
-        "name": "mapreduce_history_server_webui",
-        "label": "History Server Web UI",
-        "description": "This host-level alert is triggered if the History Server Web UI is unreachable.",
+        "name": "mapreduce_history_process",
+        "label": "History Server process",
+        "description": "Alert for history server process status",
         "interval": 1,
-        "scope": "ANY",
-        "source": {
-          "type": "WEB",
-          "uri": {
-            "http": "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
-            "https": "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}",
-            "https_property": "{{mapred-site/mapreduce.jobhistory.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "kerberos_keytab": "{{mapred-site/mapreduce.jobhistory.webapp.spnego-keytab-file}}",
-            "kerberos_principal": "{{mapred-site/mapreduce.jobhistory.webapp.spnego-principal}}",
-            "connection_timeout": 5.0
-          },
-          "reporting": {
-            "ok": {
-              "text": "HTTP {0} response in {2:.3f}s"
-            },
-            "warning":{
-              "text": "HTTP {0} response from {1} in {2:.3f}s ({3})"
-            },
-            "critical": {
-              "text": "Connection failed to {1} ({3})"
-            }
-          }
-        }
-      },
-      {
-        "name": "mapreduce_history_server_cpu",
-        "label": "History Server CPU Utilization",
-        "description": "This host-level alert is triggered if the percent of CPU utilization on the History Server exceeds the configured critical threshold. The threshold values are in percent.",
-        "interval": 5,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
-            "kerberos_keytab": "{{mapred-site/mapreduce.jobhistory.webapp.spnego-keytab-file}}",
-            "kerberos_principal": "{{mapred-site/mapreduce.jobhistory.webapp.spnego-principal}}",
-            "https": "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}",
-            "https_property": "{{mapred-site/mapreduce.jobhistory.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0
-          },
-          "reporting": {
-            "ok": {
-              "text": "{1} CPU, load {0:.1%}"
-            },
-            "warning": {
-              "text": "{1} CPU, load {0:.1%}",
-              "value": 200
-            },
-            "critical": {
-              "text": "{1} CPU, load {0:.1%}",
-              "value": 250
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          },
-          "jmx": {
-            "property_list": [
-              "java.lang:type=OperatingSystem/SystemCpuLoad",
-              "java.lang:type=OperatingSystem/AvailableProcessors"
-            ],
-            "value": "{0} * 100"
-          }
-        }
-      },
-      {
-        "name": "mapreduce_history_server_rpc_latency",
-        "label": "History Server RPC Latency",
-        "description": "This host-level alert is triggered if the History Server operations RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for operations. The threshold values are in milliseconds.",
-        "interval": 5,
-        "scope": "ANY",
+        "scope": "HOST",
         "enabled": true,
         "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
-            "https": "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}",
-            "kerberos_keytab": "{{mapred-site/mapreduce.jobhistory.webapp.spnego-keytab-file}}",
-            "kerberos_principal": "{{mapred-site/mapreduce.jobhistory.webapp.spnego-principal}}",
-            "https_property": "{{mapred-site/mapreduce.jobhistory.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0
-          },
-          "reporting": {
-            "ok": {
-              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]"
-            },
-            "warning": {
-              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
-              "value": 3000
-            },          
-            "critical": {
-              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
-              "value": 5000
-            },
-            "units" : "ms"
-          },
-          "jmx": {
-            "property_list": [
-              "Hadoop:service=JobHistoryServer,name=RpcActivityForPort*/RpcQueueTimeAvgTime",
-              "Hadoop:service=JobHistoryServer,name=RpcActivityForPort*/RpcProcessingTimeAvgTime"
-            ],
-            "value": "{0}"
-          }
+          "type": "SCRIPT",
+          "path": "PERF/1.0/services/YARN/package/alerts/alert_history_process.py",
+          "parameters": []
         }
       }
     ]
   },
   "YARN": {
-    "service": [
-      {
-        "name": "yarn_nodemanager_webui_percent",
-        "label": "Percent NodeManagers Available",
-        "description": "This alert is triggered if the number of down NodeManagers in the cluster is greater than the configured critical threshold. It aggregates the results of NodeManager process checks.",
-        "interval": 1,
-        "scope": "SERVICE",
-        "enabled": true,
-        "source": {
-          "type": "AGGREGATE",
-          "alert_name": "yarn_nodemanager_webui",
-          "reporting": {
-            "ok": {
-              "text": "affected: [{1}], total: [{0}]"
-            },
-            "warning": {
-              "text": "affected: [{1}], total: [{0}]",
-              "value": 10
-            },
-            "critical": {
-              "text": "affected: [{1}], total: [{0}]",
-              "value": 30
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          }
-        }
-      }
-    ],
+
     "NODEMANAGER": [
-      {
-        "name": "yarn_nodemanager_webui",
-        "label": "NodeManager Web UI",
-        "description": "This host-level alert is triggered if the NodeManager Web UI is unreachable.",
-        "interval": 1,
-        "scope": "HOST",
-        "source": {
-          "type": "WEB",
-          "uri": {
-            "http": "{{yarn-site/yarn.nodemanager.webapp.address}}",
-            "https": "{{yarn-site/yarn.nodemanager.webapp.https.address}}",
-            "https_property": "{{yarn-site/yarn.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "default_port": 8042,
-            "kerberos_keytab": "{{yarn-site/yarn.nodemanager.webapp.spnego-keytab-file}}",
-            "kerberos_principal": "{{yarn-site/yarn.nodemanager.webapp.spnego-principal}}",
-            "connection_timeout": 5.0
-          },
-          "reporting": {
-            "ok": {
-              "text": "HTTP {0} response in {2:.3f}s"
-            },
-            "warning":{
-              "text": "HTTP {0} response from {1} in {2:.3f}s ({3})"
-            },
-            "critical": {
-              "text": "Connection failed to {1} ({3})"
-            }
-          }
-        }
-      },
       {
         "name": "yarn_nodemanager_health",
         "label": "NodeManager Health",
@@ -188,7 +28,7 @@
         "enabled": true,
         "source": {
           "type": "SCRIPT",
-          "path": "YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py",
+          "path": "PERF/1.0/services/YARN/package/alerts/alert_nodemanager_health.py",
           "parameters": [
             {
               "name": "connection.timeout",
@@ -205,186 +45,31 @@
     ],
     "RESOURCEMANAGER": [
       {
-        "name": "yarn_resourcemanager_webui",
-        "label": "ResourceManager Web UI",
-        "description": "This host-level alert is triggered if the ResourceManager Web UI is unreachable.",
+        "name": "yarn_resourcemanager_process",
+        "label": "ResourceManager process",
+        "description": "Alert for resourcemanager process status",
         "interval": 1,
-        "scope": "ANY",
-        "source": {
-          "type": "WEB",
-          "uri": {
-            "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}",
-            "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}",
-            "https_property": "{{yarn-site/yarn.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "kerberos_keytab": "{{yarn-site/yarn.resourcemanager.webapp.spnego-keytab-file}}",
-            "kerberos_principal": "{{yarn-site/yarn.resourcemanager.webapp.spnego-principal}}",
-            "connection_timeout": 5.0,
-            "high_availability": {
-              "alias_key" : "{{yarn-site/yarn.resourcemanager.ha.rm-ids}}",
-              "http_pattern" : "{{yarn-site/yarn.resourcemanager.webapp.address.{{alias}}}}",
-              "https_pattern" : "{{yarn-site/yarn.resourcemanager.webapp.https.address.{{alias}}}}"
-            }
-          },
-          "reporting": {
-            "ok": {
-              "text": "HTTP {0} response in {2:.3f}s"
-            },
-            "warning":{
-              "text": "HTTP {0} response from {1} in {2:.3f}s ({3})"
-            },
-            "critical": {
-              "text": "Connection failed to {1} ({3})"
-            }
-          }
-        }
-      },
-      {
-        "name": "yarn_resourcemanager_cpu",
-        "label": "ResourceManager CPU Utilization",
-        "description": "This host-level alert is triggered if CPU utilization of the ResourceManager exceeds certain warning and critical thresholds. It checks the ResourceManager JMX Servlet for the SystemCPULoad property. The threshold values are in percent.",
-        "interval": 5,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}",
-            "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}",
-            "kerberos_keytab": "{{yarn-site/yarn.resourcemanager.webapp.spnego-keytab-file}}",
-            "kerberos_principal": "{{yarn-site/yarn.resourcemanager.webapp.spnego-principal}}",
-            "https_property": "{{yarn-site/yarn.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0,
-            "high_availability": {
-              "alias_key" : "{{yarn-site/yarn.resourcemanager.ha.rm-ids}}",
-              "http_pattern" : "{{yarn-site/yarn.resourcemanager.webapp.address.{{alias}}}}",
-              "https_pattern" : "{{yarn-site/yarn.resourcemanager.webapp.https.address.{{alias}}}}"
-            }
-          },
-          "reporting": {
-            "ok": {
-              "text": "{1} CPU, load {0:.1%}"
-            },
-            "warning": {
-              "text": "{1} CPU, load {0:.1%}",
-              "value": 200
-            },
-            "critical": {
-              "text": "{1} CPU, load {0:.1%}",
-              "value": 250
-            },
-            "units" : "%",
-            "type": "PERCENT"
-          },
-          "jmx": {
-            "property_list": [
-              "java.lang:type=OperatingSystem/SystemCpuLoad",
-              "java.lang:type=OperatingSystem/AvailableProcessors"
-            ],
-            "value": "{0} * 100"
-          }
-        }
-      },
-      {
-        "name": "yarn_resourcemanager_rpc_latency",
-        "label": "ResourceManager RPC Latency",
-        "description": "This host-level alert is triggered if the ResourceManager operations RPC latency exceeds the configured critical threshold. Typically an increase in the RPC processing time increases the RPC queue length, causing the average queue wait time to increase for ResourceManager operations. The threshold values are in milliseconds.",
-        "interval": 5,
-        "scope": "ANY",
-        "enabled": true,
-        "source": {
-          "type": "METRIC",
-          "uri": {
-            "http": "{{yarn-site/yarn.resourcemanager.webapp.address}}",
-            "https": "{{yarn-site/yarn.resourcemanager.webapp.https.address}}",
-            "kerberos_keytab": "{{yarn-site/yarn.resourcemanager.webapp.spnego-keytab-file}}",
-            "kerberos_principal": "{{yarn-site/yarn.resourcemanager.webapp.spnego-principal}}",
-            "https_property": "{{yarn-site/yarn.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "connection_timeout": 5.0,
-            "high_availability": {
-              "alias_key" : "{{yarn-site/yarn.resourcemanager.ha.rm-ids}}",
-              "http_pattern" : "{{yarn-site/yarn.resourcemanager.webapp.address.{{alias}}}}",
-              "https_pattern" : "{{yarn-site/yarn.resourcemanager.webapp.https.address.{{alias}}}}"
-            }
-          },
-          "reporting": {
-            "ok": {
-              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]"
-            },
-            "warning": {
-              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
-              "value": 3000
-            },          
-            "critical": {
-              "text": "Average Queue Time:[{0}], Average Processing Time:[{1}]",
-              "value": 5000
-            },
-            "units" : "ms"
-          },
-          "jmx": {
-            "property_list": [
-              "Hadoop:service=ResourceManager,name=RpcActivityForPort*/RpcQueueTimeAvgTime",
-              "Hadoop:service=ResourceManager,name=RpcActivityForPort*/RpcProcessingTimeAvgTime"
-            ],
-            "value": "{0}"
-          }
-        }
-      },
-      {
-        "name": "nodemanager_health_summary",
-        "label": "NodeManager Health Summary",
-        "description": "This service-level alert is triggered if there are unhealthy NodeManagers",
-        "interval": 1,
-        "scope": "SERVICE",
+        "scope": "HOST",
         "enabled": true,
         "source": {
           "type": "SCRIPT",
-          "path": "YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py",
-          "parameters": [
-            {
-              "name": "connection.timeout",
-              "display_name": "Connection Timeout",
-              "value": 5.0,
-              "type": "NUMERIC",
-              "description": "The maximum time before this alert is considered to be CRITICAL",
-              "units": "seconds",
-              "threshold": "CRITICAL"
-            }
-          ]
+          "path": "PERF/1.0/services/YARN/package/alerts/alert_resourcemanager_process.py",
+          "parameters": []
         }
       }
     ],
     "APP_TIMELINE_SERVER": [
       {
-        "name": "yarn_app_timeline_server_webui",
-        "label": "App Timeline Web UI",
-        "description": "This host-level alert is triggered if the App Timeline Server Web UI is unreachable.",
+        "name": "yarn_app_timeline_server_process",
+        "label": "App Timeline process",
+        "description": "Alert for app timeline server process status",
         "interval": 1,
-        "scope": "ANY",
+        "scope": "HOST",
+        "enabled": true,
         "source": {
-          "type": "WEB",
-          "uri": {
-            "http": "{{yarn-site/yarn.timeline-service.webapp.address}}/ws/v1/timeline",
-            "https": "{{yarn-site/yarn.timeline-service.webapp.https.address}}/ws/v1/timeline",
-            "https_property": "{{yarn-site/yarn.http.policy}}",
-            "https_property_value": "HTTPS_ONLY",
-            "kerberos_keytab": "{{yarn-site/yarn.timeline-service.http-authentication.kerberos.keytab}}",
-            "kerberos_principal": "{{yarn-site/yarn.timeline-service.http-authentication.kerberos.principal}}",
-            "connection_timeout": 5.0
-          },
-          "reporting": {
-            "ok": {
-              "text": "HTTP {0} response in {2:.3f}s"
-            },
-            "warning":{
-              "text": "HTTP {0} response from {1} in {2:.3f}s ({3})"
-            },
-            "critical": {
-              "text": "Connection failed to {1} ({3})"
-            }
-          }
+          "type": "SCRIPT",
+          "path": "PERF/1.0/services/YARN/package/alerts/alert_timeline_process.py",
+          "parameters": []
         }
       }
     ]

+ 75 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/configuration/yarn-alert-config.xml

@@ -0,0 +1,75 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<configuration xmlns:xi="http://www.w3.org/2001/XInclude" supports_final="true">
+
+    <property>
+        <name>alert.behavior.type</name>
+        <value>percentage</value>
+        <description>
+            This property describes type of alert behaviour.
+            There are three types percentage, timeout, flip.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.success.percentage</name>
+        <value>100</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "percentage". Here you should set percent of successful
+            alert checks.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.timeout.return.value</name>
+        <value>false</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set result which alert will
+            return after timeout, false|true|none.
+        </description>
+    </property>
+
+    <property>
+        <name>alert.timeout.secs</name>
+        <value>120</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set number of seconds for
+            alert to sleep.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.flip.interval.mins</name>
+        <value>3</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "flip". Here you should set number of minutes at which
+            the alert should flip from true|false.
+        </description>
+    </property>
+
+
+</configuration>

+ 3 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/metainfo.xml

@@ -122,6 +122,7 @@
           -->
 
           <configuration-dependencies>
+            <config-type>yarn-alert-config</config-type>
             <config-type>capacity-scheduler</config-type>
             <config-type>hdfs-site</config-type>
           </configuration-dependencies>
@@ -193,6 +194,7 @@
       </components>
 
       <configuration-dependencies>
+        <config-type>yarn-alert-config</config-type>
         <config-type>yarn-site</config-type>
         <config-type>yarn-env</config-type>
         <config-type>hdfs-site</config-type>
@@ -324,6 +326,7 @@
       </requiredServices>
 
       <configuration-dependencies>
+        <config-type>yarn-alert-config</config-type>
         <config-type>hdfs-site</config-type>
         <config-type>hadoop-env</config-type>
         <config-type>core-site</config-type>

+ 59 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/package/alerts/alert_history_process.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
+
+ALERT_BEHAVIOUR_TYPE = "{{yarn-alert-config/alert.behavior.type}}"
+
+ALERT_SUCCESS_PERCENTAGE = "{{yarn-alert-config/alert.success.percentage}}"
+
+ALERT_TIMEOUT_RETURN_VALUE = "{{yarn-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{yarn-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{yarn-alert-config/alert.flip.interval.mins}}"
+
+logger = logging.getLogger('ambari_alerts')
+
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 14 - 22
ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/package/alerts/alert_nodemanager_health.py

@@ -20,33 +20,31 @@ limitations under the License.
 
 import logging
 
-RESULT_CODE_OK = 'OK'
-RESULT_CODE_CRITICAL = 'CRITICAL'
-RESULT_CODE_UNKNOWN = 'UNKNOWN'
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
 
-OK_MESSAGE = 'NodeManager Healthy'
+ALERT_BEHAVIOUR_TYPE = "{{yarn-alert-config/alert.behavior.type}}"
 
-NODEMANAGER_HTTP_ADDRESS_KEY = '{{yarn-site/yarn.nodemanager.webapp.address}}'
-NODEMANAGER_HTTPS_ADDRESS_KEY = '{{yarn-site/yarn.nodemanager.webapp.https.address}}'
-YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
+ALERT_SUCCESS_PERCENTAGE = "{{yarn-alert-config/alert.success.percentage}}"
 
-KERBEROS_KEYTAB = '{{yarn-site/yarn.nodemanager.webapp.spnego-keytab-file}}'
-KERBEROS_PRINCIPAL = '{{yarn-site/yarn.nodemanager.webapp.spnego-principal}}'
-SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
-SMOKEUSER_KEY = '{{cluster-env/smokeuser}}'
-EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
+ALERT_TIMEOUT_RETURN_VALUE = "{{yarn-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{yarn-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{yarn-alert-config/alert.flip.interval.mins}}"
 
 logger = logging.getLogger('ambari_alerts')
 
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
 
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
   to build the dictionary passed into execute
   """
-  return (NODEMANAGER_HTTP_ADDRESS_KEY,NODEMANAGER_HTTPS_ADDRESS_KEY, EXECUTABLE_SEARCH_PATHS,
-  YARN_HTTP_POLICY_KEY, SMOKEUSER_KEY, KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY)
-  
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
 
 def execute(configurations={}, parameters={}, host_name=None):
   """
@@ -57,11 +55,5 @@ def execute(configurations={}, parameters={}, host_name=None):
   parameters (dictionary): a mapping of script parameter key to value
   host_name (string): the name of this host where the alert is running
   """
-  result_code = RESULT_CODE_UNKNOWN
-
-  if configurations is None:
-    return (result_code, ['There were no configurations supplied to the script.'])
 
-  result_code = RESULT_CODE_OK
-  label = OK_MESSAGE
-  return (result_code, [label])
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 59 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/package/alerts/alert_resourcemanager_process.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
+
+ALERT_BEHAVIOUR_TYPE = "{{yarn-alert-config/alert.behavior.type}}"
+
+ALERT_SUCCESS_PERCENTAGE = "{{yarn-alert-config/alert.success.percentage}}"
+
+ALERT_TIMEOUT_RETURN_VALUE = "{{yarn-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{yarn-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{yarn-alert-config/alert.flip.interval.mins}}"
+
+logger = logging.getLogger('ambari_alerts')
+
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 59 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/YARN/package/alerts/alert_timeline_process.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
+
+ALERT_BEHAVIOUR_TYPE = "{{yarn-alert-config/alert.behavior.type}}"
+
+ALERT_SUCCESS_PERCENTAGE = "{{yarn-alert-config/alert.success.percentage}}"
+
+ALERT_TIMEOUT_RETURN_VALUE = "{{yarn-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{yarn-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{yarn-alert-config/alert.flip.interval.mins}}"
+
+logger = logging.getLogger('ambari_alerts')
+
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)

+ 20 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/ZOOKEEPER/alerts.json

@@ -0,0 +1,20 @@
+{
+    "ZOOKEEPER": {
+
+        "ZOOKEEPER_SERVER": [
+            {
+                "name": "zookeeper_server_process",
+                "label": "Zookeeper server Process",
+                "description": "Alert for zk server component process status",
+                "interval": 1,
+                "scope": "HOST",
+                "enabled": true,
+                "source": {
+                    "type": "SCRIPT",
+                    "path": "PERF/1.0/services/ZOOKEEPER/package/alerts/alert_zk_server_process.py",
+                    "parameters": []
+                }
+            }
+        ]
+    }
+}

+ 75 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/ZOOKEEPER/configuration/zk-alert-config.xml

@@ -0,0 +1,75 @@
+<?xml version="1.0"?>
+<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+
+<configuration xmlns:xi="http://www.w3.org/2001/XInclude" supports_final="true">
+
+    <property>
+        <name>alert.behavior.type</name>
+        <value>percentage</value>
+        <description>
+            This property describes type of alert behaviour.
+            There are three types percentage, timeout, flip.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.success.percentage</name>
+        <value>100</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "percentage". Here you should set percent of successful
+            alert checks.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.timeout.return.value</name>
+        <value>false</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set result which alert will
+            return after timeout, false|true|none.
+        </description>
+    </property>
+
+    <property>
+        <name>alert.timeout.secs</name>
+        <value>120</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "timeout". Here you should set number of seconds for
+            alert to sleep.
+        </description>
+    </property>
+
+
+    <property>
+        <name>alert.flip.interval.mins</name>
+        <value>3</value>
+        <description>
+            This property will be actual only when alert.behaviour.type
+            set to "flip". Here you should set number of minutes at which
+            the alert should flip from true|false.
+        </description>
+    </property>
+
+
+</configuration>

+ 4 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/ZOOKEEPER/metainfo.xml

@@ -43,6 +43,10 @@
         </component>
       </components>
 
+      <configuration-dependencies>
+        <config-type>zk-alert-config</config-type>
+      </configuration-dependencies>
+
       <!-- No packages to install. -->
       <osSpecifics></osSpecifics>
     </service>

+ 59 - 0
ambari-server/src/main/resources/stacks/PERF/1.0/services/ZOOKEEPER/package/alerts/alert_zk_server_process.py

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import logging
+
+from resource_management.libraries.functions.simulate_perf_cluster_alert_behaviour import simulate_perf_cluster_alert_behaviour
+
+ALERT_BEHAVIOUR_TYPE = "{{zk-alert-config/alert.behavior.type}}"
+
+ALERT_SUCCESS_PERCENTAGE = "{{zk-alert-config/alert.success.percentage}}"
+
+ALERT_TIMEOUT_RETURN_VALUE = "{{zk-alert-config/alert.timeout.return.value}}"
+ALERT_TIMEOUT_SECS = "{{zk-alert-config/alert.timeout.secs}}"
+
+ALERT_FLIP_INTERVAL_MINS = "{{zk-alert-config/alert.flip.interval.mins}}"
+
+logger = logging.getLogger('ambari_alerts')
+
+alert_behaviour_properties = {"alert_behaviour_type" : ALERT_BEHAVIOUR_TYPE, "alert_success_percentage" : ALERT_SUCCESS_PERCENTAGE,
+                              "alert_timeout_return_value" : ALERT_TIMEOUT_RETURN_VALUE, "alert_timeout_secs" : ALERT_TIMEOUT_SECS,
+                              "alert_flip_interval_mins" : ALERT_FLIP_INTERVAL_MINS}
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (ALERT_BEHAVIOUR_TYPE, ALERT_SUCCESS_PERCENTAGE, ALERT_TIMEOUT_RETURN_VALUE, ALERT_TIMEOUT_SECS,
+          ALERT_FLIP_INTERVAL_MINS)
+
+
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  return simulate_perf_cluster_alert_behaviour(alert_behaviour_properties, configurations)