瀏覽代碼

AMBARI-7986 - Alerts: Convert Script-Style HDFS, Hive, and YARN Alerts From Nagios (jonathanhurley)

Jonathan Hurley 10 年之前
父節點
當前提交
bc25d5ae20

+ 2 - 2
ambari-agent/src/main/python/ambari_agent/alerts/base_alert.py

@@ -85,7 +85,7 @@ class BaseAlert(object):
     """ method used for collection.  defers to _collect() """
     
     res = (BaseAlert.RESULT_UNKNOWN, [])
-    res_base_text = "Unknown {0}"
+    res_base_text = "{0}"
     
     try:
       res = self._collect()
@@ -100,7 +100,7 @@ class BaseAlert(object):
         logger.warning(message)
 
       res = (BaseAlert.RESULT_UNKNOWN, [str(e)])
-      res_base_text = "Unknown {0}"
+      res_base_text = "{0}"
     
     
     if logger.isEnabledFor(logging.DEBUG):

+ 1 - 1
ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py

@@ -77,7 +77,7 @@ class MetricAlert(BaseAlert):
 
     logger.debug("Resolved value list is: {0}".format(str(value_list)))
     
-    return ((collect_result, value_list))
+    return (collect_result, value_list)
 
   
   def __get_result(self, value):

+ 2 - 2
ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py

@@ -78,9 +78,9 @@ class ScriptAlert(BaseAlert):
       for key in self.config_value_dict:
         parameters['{{' + key + '}}'] = self.config_value_dict[key]
       
-      return cmd_module.execute(parameters)
+      return cmd_module.execute(parameters, self.host_name)
     else:
-      return ((self.RESULT_UNKNOWN, ["Unable to execute script {0}".format(self.path)]))
+      return (self.RESULT_UNKNOWN, ["Unable to execute script {0}".format(self.path)])
     
 
   def _load_source(self):

+ 11 - 2
ambari-server/src/main/resources/host_scripts/alert_disk_space.py

@@ -23,13 +23,22 @@ import os
 import platform
 
 def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
   return None
   
 
-def execute(parameters=None):
+def execute(parameters=None, host_name=None):
   """
-  returns a tuple containing the result code and a pre-formatted result label
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  parameters (dictionary): a mapping of parameter key to value
+  host_name (string): the name of this host where the alert is running
   """
+
   disk_usage = None
   try:
     disk_usage = _get_disk_usage()

+ 11 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/alerts.json

@@ -300,6 +300,17 @@
             }
           }        
         }
+      },
+      {
+        "name": "namenode_last_checkpoint",
+        "label": "NameNode Last Checkpoint",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "SCRIPT",
+          "path": "HDP/2.0.6/services/HDFS/package/files/alert_checkpoint_time.py"
+        }
       }
     ],
     "SECONDARY_NAMENODE": [

+ 136 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HDFS/package/files/alert_checkpoint_time.py

@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import time
+import urllib2
+import json
+
+LABEL = 'Last Checkpoint: [{h} hours, {m} minutes, {tx} transactions]'
+
+NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
+NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
+NN_HTTP_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}'
+NN_CHECKPOINT_TX_KEY = '{{hdfs-site/dfs.namenode.checkpoint.txns}}'
+NN_CHECKPOINT_PERIOD_KEY = '{{hdfs-site/dfs.namenode.checkpoint.period}}'
+
+PERCENT_WARNING = 200
+PERCENT_CRITICAL = 200
+
+CHECKPOINT_TX_DEFAULT = 1000000
+CHECKPOINT_PERIOD_DEFAULT = 21600
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (NN_HTTP_ADDRESS_KEY, NN_HTTPS_ADDRESS_KEY, NN_HTTP_POLICY_KEY, 
+      NN_CHECKPOINT_TX_KEY, NN_CHECKPOINT_PERIOD_KEY)      
+  
+
+def execute(parameters=None, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  parameters (dictionary): a mapping of parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  if parameters is None:
+    return (('UNKNOWN', ['There were no parameters supplied to the script.']))
+  
+  uri = None
+  scheme = 'http'  
+  http_uri = None
+  https_uri = None
+  http_policy = 'HTTP_ONLY'
+  percent_warning = PERCENT_WARNING
+  percent_critical = PERCENT_CRITICAL
+  checkpoint_tx = CHECKPOINT_TX_DEFAULT
+  checkpoint_period = CHECKPOINT_PERIOD_DEFAULT
+  
+  if NN_HTTP_ADDRESS_KEY in parameters:
+    http_uri = parameters[NN_HTTP_ADDRESS_KEY]
+
+  if NN_HTTPS_ADDRESS_KEY in parameters:
+    https_uri = parameters[NN_HTTPS_ADDRESS_KEY]
+
+  if NN_HTTP_POLICY_KEY in parameters:
+    http_policy = parameters[NN_HTTP_POLICY_KEY]
+
+  if NN_CHECKPOINT_TX_KEY in parameters:
+    checkpoint_tx = parameters[NN_CHECKPOINT_TX_KEY]
+
+  if NN_CHECKPOINT_PERIOD_KEY in parameters:
+    checkpoint_period = parameters[NN_CHECKPOINT_PERIOD_KEY]
+    
+  # determine the right URI and whether to use SSL
+  uri = http_uri
+  if http_policy == 'HTTPS_ONLY':
+    scheme = 'https'
+    
+    if https_uri is not None:
+      uri = https_uri 
+  
+  current_time = int(round(time.time() * 1000))
+
+  last_checkpoint_time_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".format(scheme,uri)
+  journal_transaction_info_qry = "{0}://{1}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".format(scheme,uri)
+
+  # start out assuming an OK status
+  label = None
+  result_code = "OK"
+
+  try:
+    last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,"LastCheckpointTime"))
+    journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,"JournalTransactionInfo")
+    journal_transaction_info_dict = json.loads(journal_transaction_info)
+  
+    last_tx = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
+    most_recent_tx = int(journal_transaction_info_dict['MostRecentCheckpointTxId'])
+    transaction_difference = last_tx - most_recent_tx
+    
+    delta = (current_time - last_checkpoint_time)/1000
+
+    label = LABEL.format(h=get_time(delta)['h'], m=get_time(delta)['m'], tx=transaction_difference)
+    
+    if (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period)*100 >= int(percent_critical)):
+      result_code = 'CRITICAL'
+    elif (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period)*100 >= int(percent_warning)):
+      result_code = 'WARNING'
+
+  except Exception, e:
+    label = str(e)
+    result_code = 'UNKNOWN'
+        
+  return ((result_code, [label]))
+
+def get_time(delta):
+  h = int(delta/3600)
+  m = int((delta % 3600)/60)
+  return {'h':h, 'm':m}
+
+
+def get_value_from_jmx(qry, property):
+  response = urllib2.urlopen(qry)
+  data=response.read()
+  data_dict = json.loads(data)
+  return data_dict["beans"][0][property]

+ 26 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HIVE/alerts.json

@@ -21,6 +21,32 @@
           }        
         }
       }
+    ],
+    "HIVE_SERVER": [
+      {
+        "name": "hive_server_process",
+        "label": "HiveServer2 Process",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "SCRIPT",
+          "path": "HDP/2.0.6/services/HIVE/package/files/alert_hive_thrift_port.py"
+        }
+      }
+    ],
+    "WEBHCAT_SERVER": [
+      {
+        "name": "hive_webhcat_server_status",
+        "label": "WebHCat Server Status",
+        "interval": 1,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "SCRIPT",
+          "path": "HDP/2.0.6/services/HIVE/package/files/alert_webhcat_server.py"
+        }
+      }    
     ]
   }
 }

+ 89 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HIVE/package/files/alert_hive_thrift_port.py

@@ -0,0 +1,89 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import json
+import socket
+import time
+import traceback
+import urllib2
+from resource_management.libraries.functions import hive_check 
+
+OK_MESSAGE = "TCP OK - %.4f response on port %s"
+CRITICAL_MESSAGE = "Connection failed on host {0}:{1}"
+
+HIVE_SERVER_THRIFT_PORT_KEY = '{{hive-site/hive.server2.thrift.port}}'
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+
+PERCENT_WARNING = 200
+PERCENT_CRITICAL = 200
+
+THRIFT_PORT_DEFAULT = 10000
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (HIVE_SERVER_THRIFT_PORT_KEY,SECURITY_ENABLED_KEY)      
+  
+
+def execute(parameters=None, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  parameters (dictionary): a mapping of parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  if parameters is None:
+    return (('UNKNOWN', ['There were no parameters supplied to the script.']))
+
+  thrift_port = THRIFT_PORT_DEFAULT
+  if HIVE_SERVER_THRIFT_PORT_KEY in parameters:
+    thrift_port = int(parameters[HIVE_SERVER_THRIFT_PORT_KEY])  
+
+  security_enabled = False
+  if SECURITY_ENABLED_KEY in parameters:
+    security_enabled = bool(parameters[SECURITY_ENABLED_KEY])  
+
+  result_code = None
+
+  try:
+    if host_name is None:
+      host_name = socket.getfqdn()
+
+    start_time = time.time()
+    is_thrift_port_ok = hive_check.check_thrift_port_sasl(host_name,
+        thrift_port, security_enabled=security_enabled)
+     
+    if is_thrift_port_ok == True:
+      result_code = 'OK'
+      total_time = time.time() - start_time
+      label = OK_MESSAGE % (total_time, thrift_port)
+    else:
+      result_code = 'CRITICAL'
+      label = CRITICAL_MESSAGE.format(host_name,thrift_port)
+
+  except Exception, e:
+    label = str(e)
+    result_code = 'UNKNOWN'
+        
+  return ((result_code, [label]))

+ 111 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HIVE/package/files/alert_webhcat_server.py

@@ -0,0 +1,111 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import json
+import socket
+import time
+import urllib2
+
+RESULT_CODE_OK = 'OK'
+RESULT_CODE_CRITICAL = 'CRITICAL'
+RESULT_CODE_UNKNOWN = 'UNKNOWN'
+
+OK_MESSAGE = 'TCP OK - {0:.4f} response on port {1}'
+CRITICAL_CONNECTION_MESSAGE = 'Connection failed on host {0}:{1}'
+CRITICAL_TEMPLETON_STATUS_MESSAGE = 'WebHCat returned an unexpected status of "{0}"'
+CRITICAL_TEMPLETON_UNKNOWN_JSON_MESSAGE = 'Unable to determine WebHCat health from unexpected JSON response'
+
+TEMPLETON_PORT_KEY = '{{webhcat-site/templeton.port}}'
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+
+TEMPLETON_OK_RESPONSE = 'ok'
+TEMPLETON_PORT_DEFAULT = 50111
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (TEMPLETON_PORT_KEY,SECURITY_ENABLED_KEY)      
+  
+
+def execute(parameters=None, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  parameters (dictionary): a mapping of parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  result_code = RESULT_CODE_UNKNOWN
+
+  if parameters is None:
+    return (result_code, ['There were no parameters supplied to the script.'])
+
+  templeton_port = TEMPLETON_PORT_DEFAULT
+  if TEMPLETON_PORT_KEY in parameters:
+    templeton_port = int(parameters[TEMPLETON_PORT_KEY])  
+
+  security_enabled = False
+  if SECURITY_ENABLED_KEY in parameters:
+    security_enabled = parameters[SECURITY_ENABLED_KEY].lower() == 'true'
+
+  scheme = 'http'
+  if security_enabled is True:
+    scheme = 'https'
+
+  label = ''
+  url_response = None
+  templeton_status = ''
+  total_time = 0
+
+  try:
+    # the alert will always run on the webhcat host
+    if host_name is None:
+      host_name = socket.getfqdn()
+    
+    query = "{0}://{1}:{2}/templeton/v1/status".format(scheme, host_name,
+        templeton_port)
+    
+    # execute the query for the JSON that includes templeton status
+    start_time = time.time()
+    url_response = urllib2.urlopen(query)
+    total_time = time.time() - start_time
+  except:
+    label = CRITICAL_CONNECTION_MESSAGE.format(host_name,templeton_port)
+    return (RESULT_CODE_CRITICAL, [label])
+
+  # URL response received, parse it
+  try:
+    json_response = json.loads(url_response.read())
+    templeton_status = json_response['status']
+  except:
+    return (RESULT_CODE_CRITICAL, [CRITICAL_TEMPLETON_UNKNOWN_JSON_MESSAGE])
+
+  # proper JSON received, compare against known value
+  if templeton_status.lower() == TEMPLETON_OK_RESPONSE:
+    result_code = RESULT_CODE_OK
+    label = OK_MESSAGE.format(total_time, templeton_port)
+  else:
+    result_code = RESULT_CODE_CRITICAL
+    label = CRITICAL_TEMPLETON_STATUS_MESSAGE.format(templeton_status)
+
+  return (result_code, [label])

+ 14 - 3
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/alerts.json

@@ -117,9 +117,9 @@
             "critical": {
               "text": "Connection failed: {0} on host {1}:{2}"
             }
-          }        
+          }
         }
-      }    
+      }
     ]
   },
   "YARN": {
@@ -147,7 +147,7 @@
             }
           }
         }
-      }    
+      }
     ],
     "NODEMANAGER": [
       {
@@ -176,6 +176,17 @@
             }
           }
         }
+      },
+      {
+        "name": "yarn_nodemanager_health",
+        "label": "NodeManager Health",
+        "interval": 1,
+        "scope": "HOST",
+        "enabled": true,
+        "source": {
+          "type": "SCRIPT",
+          "path": "HDP/2.0.6/services/YARN/package/files/alert_nodemanager_health.py"
+        }
       }
     ],
     "RESOURCEMANAGER": [

+ 123 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/YARN/package/files/alert_nodemanager_health.py

@@ -0,0 +1,123 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import json
+import socket
+import urllib2
+
+RESULT_CODE_OK = 'OK'
+RESULT_CODE_CRITICAL = 'CRITICAL'
+RESULT_CODE_UNKNOWN = 'UNKNOWN'
+
+NODEMANAGER_HTTP_ADDRESS_KEY = '{{yarn-site/yarn.nodemanager.webapp.address}}'
+NODEMANAGER_HTTPS_ADDRESS_KEY = '{{yarn-site/yarn.nodemanager.webapp.https.address}}'
+YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
+
+OK_MESSAGE = 'NodeManager Healthy'
+CRITICAL_CONNECTION_MESSAGE = 'Connection failed on host {0}'
+CRITICAL_NODEMANAGER_STATUS_MESSAGE = 'NodeManager returned an unexpected status of "{0}"'
+CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager health from unexpected JSON response'
+
+NODEMANAGER_DEFAULT_PORT = 8042
+
+def get_tokens():
+  """
+  Returns a tuple of tokens in the format {{site/property}} that will be used
+  to build the dictionary passed into execute
+  """
+  return (NODEMANAGER_HTTP_ADDRESS_KEY,NODEMANAGER_HTTPS_ADDRESS_KEY,
+  YARN_HTTP_POLICY_KEY)
+  
+
+def execute(parameters=None, host_name=None):
+  """
+  Returns a tuple containing the result code and a pre-formatted result label
+
+  Keyword arguments:
+  parameters (dictionary): a mapping of parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+  result_code = RESULT_CODE_UNKNOWN
+
+  if parameters is None:
+    return (result_code, ['There were no parameters supplied to the script.'])
+
+  scheme = 'http'
+  http_uri = None
+  https_uri = None
+  http_policy = 'HTTP_ONLY'
+
+  if NODEMANAGER_HTTP_ADDRESS_KEY in parameters:
+    http_uri = parameters[NODEMANAGER_HTTP_ADDRESS_KEY]
+
+  if NODEMANAGER_HTTPS_ADDRESS_KEY in parameters:
+    https_uri = parameters[NODEMANAGER_HTTPS_ADDRESS_KEY]
+
+  if YARN_HTTP_POLICY_KEY in parameters:
+    http_policy = parameters[YARN_HTTP_POLICY_KEY]
+
+  # determine the right URI and whether to use SSL
+  uri = http_uri
+  if http_policy == 'HTTPS_ONLY':
+    scheme = 'https'
+
+    if https_uri is not None:
+      uri = https_uri
+
+  label = ''
+  url_response = None
+  node_healthy = 'false'
+  total_time = 0
+
+  # some yarn-site structures don't have the web ui address
+  if uri is None:
+    if host_name is None:
+      host_name = socket.getfqdn()
+
+    uri = '{0}:{1}'.format(host_name, NODEMANAGER_DEFAULT_PORT)
+
+  try:
+    query = "{0}://{1}/ws/v1/node/info".format(scheme,uri)
+    
+    # execute the query for the JSON that includes templeton status
+    url_response = urllib2.urlopen(query)
+  except:
+    label = CRITICAL_CONNECTION_MESSAGE.format(uri)
+    return (RESULT_CODE_CRITICAL, [label])
+
+  # URL response received, parse it
+  try:
+    json_response = json.loads(url_response.read())
+    node_healthy = json_response['nodeInfo']['nodeHealthy']
+
+    # convert boolean to string
+    node_healthy = str(node_healthy)
+  except:
+    return (RESULT_CODE_CRITICAL, [query])
+
+  # proper JSON received, compare against known value
+  if node_healthy.lower() == 'true':
+    result_code = RESULT_CODE_OK
+    label = OK_MESSAGE
+  else:
+    result_code = RESULT_CODE_CRITICAL
+    label = CRITICAL_NODEMANAGER_STATUS_MESSAGE.format(node_healthy)
+
+  return (result_code, [label])