Browse Source

BUG-12536 - Hadoop cluster alerts are not being communicated through SysLogger

Artem Baranchuk 12 years ago
parent
commit
defe744fcc
1 changed file with 60 additions and 16 deletions
  1. 60 16
      contrib/addons/src/addOns/nagios/plugins/sys_logger.py

+ 60 - 16
contrib/addons/src/addOns/nagios/plugins/sys_logger.py

@@ -30,27 +30,71 @@ degraded_alert_services = ['HBASEMASTER::HBase Master CPU utilization',
 fatal_alert_services = ['NAMENODE::NameNode process down']
 
 # dictionary of service->msg_id mappings
-msg_ids = {'Host::Ping':'host_down', 'HBASEMASTER::HBase Master CPU utilization':'master_cpu_utilization',
-           'HDFS::HDFS capacity utilization':'hdfs_percent_capacity', 'HDFS::Corrupt/Missing blocks':'hdfs_block',
-           'NAMENODE::NameNode edit logs directory status':'namenode_edit_log_write', 'HDFS::Percent DataNodes down':'datanode_down',
-           'DATANODE::DataNode process down':'datanode_process_down', 'HDFS::Percent DataNodes storage full':'datanodes_percent_storage_full',
-           'NAMENODE::NameNode process down':'namenode_process_down', 'HDFS::NameNode RPC latency':'namenode_rpc_latency',
-           'DATANODE::DataNode storage full':'datanodes_storage_full', 'JOBTRACKER::JobTracker process down':'jobtracker_process_down',
-           'MAPREDUCE::JobTracker RPC latency':'jobtracker_rpc_latency', 'MAPREDUCE::Percent TaskTrackers down':'tasktrackers_down',
-           'TASKTRACKER::TaskTracker process down':'tasktracker_process_down', 'HBASEMASTER::HBase Master process down':'hbasemaster_process_down',
-           'REGIONSERVER::RegionServer process down':'regionserver_process_down', 'HBASE::Percent RegionServers down':'regionservers_down',
-           'HIVE-METASTORE::Hive Metastore status check':'hive_metastore_process_down', 'ZOOKEEPER::Percent ZooKeeper Servers down':'zookeepers_down',
-           'ZOOKEEPER::ZooKeeper Server process down':'zookeeper_process_down', 'OOZIE::Oozie Server status check':'oozie_down',
-           'WEBHCAT::WebHCat Server status check':'templeton_down', 'PUPPET::Puppet agent down':'puppet_down',
-           'NAGIOS::Nagios status log staleness':'nagios_status_log_stale', 'GANGLIA::Ganglia [gmetad] process down':'ganglia_process_down',
+msg_ids = {'Host::Ping':'host_down',
+           'HBASEMASTER::HBase Master CPU utilization':'master_cpu_utilization',
+           'HDFS::HDFS capacity utilization':'hdfs_percent_capacity',
+           'HDFS::Corrupt/Missing blocks':'hdfs_block',
+           'NAMENODE::NameNode edit logs directory status':'namenode_edit_log_write',
+           'HDFS::Percent DataNodes down':'datanode_down',
+           'DATANODE::DataNode process down':'datanode_process_down',
+           'HDFS::Percent DataNodes storage full':'datanodes_percent_storage_full',
+           'NAMENODE::NameNode process down':'namenode_process_down',
+           'HDFS::NameNode RPC latency':'namenode_rpc_latency',
+           'DATANODE::DataNode storage full':'datanodes_storage_full',
+           'JOBTRACKER::JobTracker process down':'jobtracker_process_down',
+           'MAPREDUCE::JobTracker RPC latency':'jobtracker_rpc_latency',
+           'MAPREDUCE::Percent TaskTrackers down':'tasktrackers_down',
+           'TASKTRACKER::TaskTracker process down':'tasktracker_process_down',
+           'HBASEMASTER::HBase Master process down':'hbasemaster_process_down',
+           'REGIONSERVER::RegionServer process down':'regionserver_process_down',
+           'HBASE::Percent RegionServers down':'regionservers_down',
+           'HIVE-METASTORE::Hive Metastore status check':'hive_metastore_process_down',
+           'ZOOKEEPER::Percent ZooKeeper Servers down':'zookeepers_down',
+           'ZOOKEEPER::ZooKeeper Server process down':'zookeeper_process_down',
+           'OOZIE::Oozie Server status check':'oozie_down',
+           'WEBHCAT::WebHCat Server status check':'templeton_down',
+           'PUPPET::Puppet agent down':'puppet_down',
+           'NAGIOS::Nagios status log staleness':'nagios_status_log_stale',
+           'GANGLIA::Ganglia [gmetad] process down':'ganglia_process_down',
            'GANGLIA::Ganglia Collector [gmond] process down alert for HBase Master':'ganglia_collector_process_down',
            'GANGLIA::Ganglia Collector [gmond] process down alert for JobTracker':'ganglia_collector_process_down',
            'GANGLIA::Ganglia Collector [gmond] process down alert for NameNode':'ganglia_collector_process_down',
            'GANGLIA::Ganglia Collector [gmond] process down alert for slaves':'ganglia_collector_process_down',
            'NAMENODE::Secondary NameNode process down':'secondary_namenode_process_down',
            'JOBTRACKER::JobTracker CPU utilization':'jobtracker_cpu_utilization',
-           'HBASEMASTER::HBase Master Web UI down':'hbase_ui_down', 'NAMENODE::NameNode Web UI down':'namenode_ui_down',
-           'JOBTRACKER::JobHistory Web UI down':'jobhistory_ui_down', 'JOBTRACKER::JobTracker Web UI down':'jobtracker_ui_down'}
+           'HBASEMASTER::HBase Master Web UI down':'hbase_ui_down',
+           'NAMENODE::NameNode Web UI down':'namenode_ui_down',
+           'JOBTRACKER::JobHistory Web UI down':'jobhistory_ui_down',
+           'JOBTRACKER::JobTracker Web UI down':'jobtracker_ui_down',
+
+
+           'HBASEMASTER::HBaseMaster CPU utilization':'master_cpu_utilization',
+           'HDFS::HDFS Capacity utilization':'hdfs_percent_capacity',
+           'NAMENODE::Namenode Edit logs directory status':'namenode_edit_log_write',
+           'DATANODE::Process down':'datanode_process_down',
+           'NAMENODE::Namenode Process down':'namenode_process_down',
+           'HDFS::Namenode RPC Latency':'namenode_rpc_latency',
+           'DATANODE::Storage full':'datanodes_storage_full',
+           'JOBTRACKER::Jobtracker Process down':'jobtracker_process_down',
+           'MAPREDUCE::JobTracker RPC Latency':'jobtracker_rpc_latency',
+           'TASKTRACKER::Process down':'tasktracker_process_down',
+           'HBASEMASTER::HBaseMaster Process down':'hbasemaster_process_down',
+           'REGIONSERVER::Process down':'regionserver_process_down',
+           'HBASE::Percent region servers down':'regionservers_down',
+           'HIVE-METASTORE::HIVE-METASTORE status check':'hive_metastore_process_down',
+           'ZOOKEEPER::Percent zookeeper servers down':'zookeepers_down',
+           'ZKSERVERS::ZKSERVERS Process down':'zookeeper_process_down',
+           'Oozie status check':'oozie_down',
+           'WEBHCAT::WebHcat status check':'templeton_down',
+           'GANGLIA::Ganglia [gmetad] Process down':'ganglia_process_down',
+           'GANGLIA::Ganglia collector [gmond] Process down alert for hbasemaster':'ganglia_collector_process_down',
+           'GANGLIA::Ganglia collector [gmond] Process down alert for jobtracker':'ganglia_collector_process_down',
+           'GANGLIA::Ganglia collector [gmond] Process down alert for namenode':'ganglia_collector_process_down',
+           'GANGLIA::Ganglia collector [gmond] Process down alert for slaves':'ganglia_collector_process_down',
+           'NAMENODE::Secondary Namenode Process down':'secondary_namenode_process_down',
+           'JOBTRACKER::Jobtracker CPU utilization':'jobtracker_cpu_utilization',
+           'HBASEMASTER::HBase Web UI down':'hbase_ui_down',
+           'NAMENODE::Namenode Web UI down':'namenode_ui_down'}
 
 
 # Determine the severity of the TVI alert based on the Nagios alert state.
@@ -87,7 +131,7 @@ def determine_domain():
 
 # log the TVI msg to the syslog
 def log_tvi_msg(msg):
-    syslog.openlog('Hadoop', syslog.LOG_PID)
+    syslog.openlog('nagios', syslog.LOG_PID)
     syslog.syslog(msg)