Prechádzať zdrojové kódy

AMBARI-9458 - HDFS, YARN, and HBase Slave Health Alert Definitions (Yurii Shylov via jonathanhurley)

Jonathan Hurley 10 rokov pred
rodič
commit
405b3762c5

+ 39 - 0
ambari-server/src/main/resources/common-services/HBASE/0.96.0.2.0/alerts.json

@@ -91,6 +91,45 @@
             "value": "{0} * 100"
             "value": "{0} * 100"
           }
           }
         }
         }
+      },
+      {
+        "name": "regionservers_health_summary",
+        "label": "RegionServers Health Summary",
+        "description": "This service-level alert is triggered if there are unhealthy RegionServers",
+        "interval": 1,
+        "scope": "SERVICE",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{hbase-site/hbase.master.info.port}}",
+            "https": "{{hbase-site/hbase.master.info.port}}",
+            "https_property": "{{cluster-env/security_enabled}}",
+            "https_property_value": "true",
+            "default_port": 60010
+          },
+          "reporting": {
+            "ok": {
+              "text": "All {1} RegionServer(s) are alive"
+            },
+            "warning": {
+              "text": "Dead RegionServer(s): {0} out of {1}",
+              "value": 1
+            },
+            "critical": {
+              "text": "Dead RegionServer(s): {0} out of {1}",
+              "value": 1
+            },
+            "units" : "RegionServer(s)"
+          },
+          "jmx": {
+            "property_list": [
+              "Hadoop:service=HBase,name=Master,sub=Server/numDeadRegionServers",
+              "Hadoop:service=HBase,name=Master,sub=Server/numRegionServers"
+            ],
+            "value": "{0}"
+          }
+        }
       }
       }
     ],
     ],
     "HBASE_REGIONSERVER": [
     "HBASE_REGIONSERVER": [

+ 39 - 0
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json

@@ -295,6 +295,45 @@
           }
           }
         }
         }
       },
       },
+      {
+        "name": "datanode_health_summary",
+        "label": "DataNode Health Summary",
+        "description": "This service-level alert is triggered if there are unhealthy DataNodes",
+        "interval": 1,
+        "scope": "SERVICE",
+        "enabled": true,
+        "source": {
+          "type": "METRIC",
+          "uri": {
+            "http": "{{hdfs-site/dfs.namenode.http-address}}",
+            "https": "{{hdfs-site/dfs.namenode.https-address}}",
+            "https_property": "{{hdfs-site/dfs.http.policy}}",
+            "https_property_value": "HTTPS_ONLY"
+          },
+          "reporting": {
+            "ok": {
+              "text": "All {2} DataNode(s) are healthy"
+            },
+            "warning": {
+              "text": "Dead/Stale Datanode(s): {0}/{1} out of {2}",
+              "value": 1
+            },
+            "critical": {
+              "text": "Dead/Stale Datanode(s): {0}/{1} out of {2}",
+              "value": 1
+            },
+            "units" : "DataNode(s)"
+          },
+          "jmx": {
+            "property_list": [
+              "Hadoop:service=NameNode,name=FSNamesystemState/NumDeadDataNodes",
+              "Hadoop:service=NameNode,name=FSNamesystemState/NumStaleDataNodes",
+              "Hadoop:service=NameNode,name=FSNamesystemState/NumLiveDataNodes"
+            ],
+            "value": "{0} + {1}"
+          }
+        }
+      },
       {
       {
         "name": "namenode_process",
         "name": "namenode_process",
         "label": "NameNode Process",
         "label": "NameNode Process",

+ 12 - 0
ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/alerts.json

@@ -306,6 +306,18 @@
             "value": "{0}"
             "value": "{0}"
           }
           }
         }
         }
+      },
+      {
+        "name": "nodemanager_health_summary",
+        "label": "NodeManager Health Summary",
+        "description": "This service-level alert is triggered if there are unhealthy NodeManagers",
+        "interval": 1,
+        "scope": "SERVICE",
+        "enabled": true,
+        "source": {
+          "type": "SCRIPT",
+          "path": "YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py"
+        }
       }
       }
     ],
     ],
     "APP_TIMELINE_SERVER": [
     "APP_TIMELINE_SERVER": [

+ 99 - 0
ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py

@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import urllib2
+import json
+
+# Label used when at least one NodeManager is unhealthy; execute() formats it
+# with (unhealthy count, plural suffix, verb), e.g. (2, 's', 'are').
+ERROR_LABEL = '{0} NodeManager{1} {2} unhealthy.'
+# Label used when no live NodeManager reports the UNHEALTHY state.
+OK_LABEL = 'All NodeManagers are healthy'
+
+# Keys looked up in the parameters dictionary handed to execute().
+# NOTE: despite the NODEMANAGER_ prefix, both address keys reference the
+# ResourceManager webapp address properties of yarn-site; the NodeManager
+# states are read from the ResourceManager's JMX endpoint.
+NODEMANAGER_HTTP_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.address}}'
+NODEMANAGER_HTTPS_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.https.address}}'
+# Key whose value selects the scheme; execute() switches to https only for 'HTTPS_ONLY'.
+YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
+  
+
+def execute(parameters=None, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  parameters (dictionary): a mapping of parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  if parameters is None:
+    return (('UNKNOWN', ['There were no parameters supplied to the script.']))
+
+  scheme = 'http'  
+  http_uri = None
+  https_uri = None
+  http_policy = 'HTTP_ONLY'
+  
+  if NODEMANAGER_HTTP_ADDRESS_KEY in parameters:
+    http_uri = parameters[NODEMANAGER_HTTP_ADDRESS_KEY]
+
+  if NODEMANAGER_HTTPS_ADDRESS_KEY in parameters:
+    https_uri = parameters[NODEMANAGER_HTTPS_ADDRESS_KEY]
+
+  if YARN_HTTP_POLICY_KEY in parameters:
+    http_policy = parameters[YARN_HTTP_POLICY_KEY]
+    
+  # determine the right URI and whether to use SSL
+  uri = http_uri
+  if http_policy == 'HTTPS_ONLY':
+    scheme = 'https'
+    
+    if https_uri is not None:
+      uri = https_uri
+
+  live_nodemanagers_qry = "{0}://{1}/jmx?qry=Hadoop:service=ResourceManager,name=RMNMInfo".format(scheme, uri)
+
+  try:
+    live_nodemanagers = json.loads(get_value_from_jmx(live_nodemanagers_qry, "LiveNodeManagers"))
+
+    unhealthy_count = 0
+
+    for nodemanager in live_nodemanagers:
+      health_report = nodemanager['State']
+      if health_report == 'UNHEALTHY':
+        unhealthy_count += 1
+
+    if unhealthy_count == 0:
+      result_code = 'OK'
+      label = OK_LABEL
+    else:
+      result_code = 'CRITICAL'
+      if unhealthy_count == 1:
+        label = ERROR_LABEL.format(unhealthy_count, '', 'is')
+      else:
+        label = ERROR_LABEL.format(unhealthy_count, 's', 'are')
+
+  except Exception, e:
+    label = str(e)
+    result_code = 'UNKNOWN'
+
+  return ((result_code, [label]))
+
+
+def get_value_from_jmx(qry, property):
+  response = urllib2.urlopen(qry)
+  data=response.read()
+  data_dict = json.loads(data)
+  return data_dict["beans"][0][property]