소스 검색

AMBARI-5681. Add Nagios alert if HDFS last checkpoint time exceeds threshold (aonishuk)

Andrew Onishuk 11 년 전
부모
커밋
614db762ff

+ 112 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/check_checkpoint_time.py

@@ -0,0 +1,112 @@
+#!/usr/bin/env python
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+
+import os
+import optparse
+import time
+import urllib2
+import json
+
+CRIT_MESSAGE = "CRITICAL: Last checkpoint time is below acceptable. Checkpoint was done {h}h. {m}m. ago"
+WARNING_MESSAGE = "WARNING: Last checkpoint time is below acceptable. Checkpoint was done {h}h. {m}m. ago"
+OK_MESSAGE = "OK: Last checkpoint time"
+WARNING_JMX_MESSAGE = "WARNING: NameNode JMX not accessible"
+
+def main():
+  """Nagios check: alert when the NameNode's last HDFS checkpoint is stale.
+
+  Exit codes follow the Nagios plugin convention:
+  0 = OK, 1 = WARNING, 2 = CRITICAL.
+  """
+
+  # Current wall-clock time in epoch milliseconds, matching the unit of
+  # the JMX LastCheckpointTime attribute read below.
+  current_time = int(round(time.time() * 1000))
+
+  parser = optparse.OptionParser()
+
+  # -H may carry several space-separated hosts (NameNode HA pair);
+  # get_available_nn_host() picks the first one whose JMX answers.
+  parser.add_option("-H", "--host", dest="host",
+                    default="localhost", help="NameNode host")
+  parser.add_option("-p", "--port", dest="port",
+                    default="50070", help="NameNode jmx port")
+  parser.add_option("-w", "--warning", dest="warning",
+                    default="200", help="Percent for warning alert")
+  parser.add_option("-c", "--critical", dest="crit",
+                    default="200", help="Percent for critical alert")
+  # -t is dfs.namenode.checkpoint.period in seconds (default 6 hours).
+  parser.add_option("-t", "--period", dest="period",
+                    default="21600", help="Period time")
+  parser.add_option("-x", "--txns", dest="txns",
+                    default="1000000",
+                    help="CheckpointNode will create a checkpoint of the namespace every 'dfs.namenode.checkpoint.txns'")
+  (options, args) = parser.parse_args()
+
+  host = get_available_nn_host(options)
+
+  # Epoch millis of the last completed checkpoint, from the FSNamesystem bean.
+  last_checkpoint_time_qry = "http://{host}:{port}/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem".\
+    format(host=host, port=options.port)
+  last_checkpoint_time = int(get_value_from_jmx(last_checkpoint_time_qry,"LastCheckpointTime"))
+
+  # JournalTransactionInfo is itself a JSON string embedded inside the JMX
+  # response, hence the second json.loads() on the returned value.
+  journal_transaction_info_qry = "http://{host}:{port}/jmx?qry=Hadoop:service=NameNode,name=NameNodeInfo".\
+    format(host=host, port=options.port)
+  journal_transaction_info = get_value_from_jmx(journal_transaction_info_qry,"JournalTransactionInfo")
+  journal_transaction_info_dict = json.loads(journal_transaction_info)
+
+  last_txid = int(journal_transaction_info_dict['LastAppliedOrWrittenTxId'])
+  most_txid = int(journal_transaction_info_dict['MostRecentCheckpointTxId'])
+
+  # Seconds elapsed since the last checkpoint (integer division is fine:
+  # sub-second precision is irrelevant at these thresholds).
+  delta = (current_time - last_checkpoint_time)/1000
+
+  # Alert only when BOTH conditions hold: the number of uncheckpointed
+  # transactions exceeds -x, AND the elapsed time as a percentage of the
+  # checkpoint period (-t) reaches the -c / -w threshold.
+  # NOTE(review): with the defaults crit == warning == 200, the elif
+  # (WARNING) branch is unreachable -- confirm intended threshold values.
+  if ((last_txid - most_txid) > int(options.txns)) and (float(delta) / int(options.period)*100 >= int(options.crit)):
+    print CRIT_MESSAGE.format(h=get_time(delta)['h'], m=get_time(delta)['m'])
+    exit(2)
+  elif ((last_txid - most_txid) > int(options.txns)) and (float(delta) / int(options.period)*100 >= int(options.warning)):
+    print WARNING_MESSAGE.format(h=get_time(delta)['h'], m=get_time(delta)['m'])
+    exit(1)
+  else:
+    print OK_MESSAGE
+    exit(0)
+
+def get_time(delta):
+  """Split a duration given in seconds into whole hours and minutes.
+
+  Returns a dict {'h': hours, 'm': minutes}; leftover seconds are dropped.
+  """
+  h = int(delta/3600)
+  m = int((delta % 3600)/60)
+  return {'h':h, 'm':m}
+
+def get_value_from_jmx(qry, property):
+  """Fetch a JMX URL and return the named attribute of the first bean.
+
+  On any fetch error this prints WARNING_JMX_MESSAGE and exits the
+  process with Nagios WARNING status (1) instead of raising.
+  NOTE(review): the parameter name 'property' shadows the builtin.
+  """
+  try:
+    response = urllib2.urlopen(qry)
+    data=response.read()
+  except Exception:
+    # Any network/HTTP failure is reported as a Nagios WARNING.
+    print WARNING_JMX_MESSAGE
+    exit(1)
+
+  # JMX responses wrap the requested bean(s) in a top-level "beans" array.
+  data_dict = json.loads(data)
+  return (data_dict["beans"][0][property])
+
+def get_available_nn_host(options):
+  """Return the first host from the space-separated -H list whose JMX
+  endpoint responds (with NameNode HA, only the reachable/active node
+  serves metrics).
+
+  If no host answers, prints WARNING_JMX_MESSAGE and exits with Nagios
+  WARNING status (1).
+  """
+  nn_hosts = options.host.split(" ")
+  for nn_host in nn_hosts:
+    try:
+      urllib2.urlopen("http://{host}:{port}/jmx".format(host=nn_host, port=options.port))
+      return nn_host
+    except Exception:
+      # Host unreachable or JMX not up -- try the next candidate.
+      pass
+  print WARNING_JMX_MESSAGE
+  exit(1)
+
+# Standard script entry point so the module can also be imported for tests.
+if __name__ == "__main__":
+  main()
+
+
+

+ 1 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py

@@ -66,6 +66,7 @@ def nagios_server_config():
   nagios_server_check( 'check_namenodes_ha.sh')
   nagios_server_check( 'check_wrapper.sh')
   nagios_server_check( 'hdp_nagios_init.php')
+  nagios_server_check( 'check_checkpoint_time.py' )
 
 
 def nagios_server_configfile(

+ 3 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/params.py

@@ -74,6 +74,8 @@ supervisor_port = "56431"
 storm_rest_api_port = "8745"
 falcon_port = config['configurations']['global']['falcon_port']
 ahs_port = get_port_from_url(config['configurations']['yarn-site']['yarn.timeline-service.webapp.address'])
+dfs_namenode_checkpoint_period = config['configurations']['hdfs-site']['dfs.namenode.checkpoint.period']
+dfs_namenode_checkpoint_txns = config['configurations']['hdfs-site']['dfs.namenode.checkpoint.txns']
 
 # this is different for HDP1
 nn_metrics_property = "FSNamesystem"
@@ -162,6 +164,7 @@ _falcon_host = default("/clusterHostInfo/falcon_server_hosts", None)
 _hbase_rs_hosts = default("/clusterHostInfo/hbase_rs_hosts", _slave_hosts)
 _hue_server_host = default("/clusterHostInfo/hue_server_host", None)
 all_hosts = config['clusterHostInfo']['all_hosts']
+nn_hosts_string = " ".join(namenode_host)
 
 
 hostgroup_defs = {

+ 5 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2

@@ -136,4 +136,9 @@ define command{
   command_name check_tcp_wrapper
   command_line  $USER1$/check_wrapper.sh $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ $ARG2$
 }
+
+define command{
+  command_name check_checkpoint_time
+  command_line python $USER1$/check_checkpoint_time.py -H "$ARG1$" -p $ARG2$ -w $ARG3$ -c $ARG4$ -t $ARG5$ -x $ARG6$
+}
         

+ 11 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2

@@ -409,6 +409,17 @@ define service {
 
 {%  endfor  %}
 
+define service {
+        host_name               {{namenode_host[0]}}
+        use                     hadoop-service
+        service_description     NAMENODE::Last checkpoint time
+        servicegroups           HDFS
+        check_command           check_checkpoint_time!{{ nn_hosts_string }}!{{ namenode_port }}!200!200!{{ dfs_namenode_checkpoint_period }}!{{dfs_namenode_checkpoint_txns}}
+        normal_check_interval   0.5
+        retry_check_interval    0.25
+        max_check_attempts      3
+}
+
 define service {
         hostgroup_name          nagios-server
         use                     hadoop-service

+ 5 - 0
ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py

@@ -246,6 +246,11 @@ class TestNagiosServer(RMFTestCase):
                               content=StaticFile('hdp_nagios_init.php'),
                               mode=0755
     )
+    self.assertResourceCalled('File',
+                              '/usr/lib64/nagios/plugins/check_checkpoint_time.py',
+                              content=StaticFile('check_checkpoint_time.py'),
+                              mode=0755
+    )
     self.assertResourceCalled('Execute',
                               'htpasswd2 -c -b  /etc/nagios/htpasswd.users nagiosadmin \'!`"\'"\'"\' 1\'',
                               not_if="grep nagiosadmin /etc/nagios/htpasswd.users"