Browse Source

AMBARI-284. Define service groups in Nagios so that users can more easily enable/disable the related alerts. Contributed by vgogate

git-svn-id: https://svn.apache.org/repos/asf/incubator/ambari/branches/ambari-186@1339993 13f79535-47bb-0310-9956-ffa450edef68
Hitesh Shah 13 years ago
parent
commit
fc3059e9ca

+ 3 - 0
CHANGES.txt

@@ -2,6 +2,9 @@ Ambari Change log
 
 Release 0.x.x - unreleased
 
+  AMBARI-284. Define service groups in nagios such that users can more easily
+  enable/disable the related alerts. (vgogate via hitesh)
+
   AMBARI-283. Fixup review and deploy rendering. (Vinod via hitesh)
 
   AMBARI-282. Make fetchTxnProgress post processing easier to debug. (Vikram via hitesh)

+ 1 - 0
hmc/puppet/modules/hdp-nagios/manifests/params.pp

@@ -11,6 +11,7 @@ class hdp-nagios::params() inherits hdp::params
   $nagios_obj_dir = hdp_default("nagios_obj_dir","/etc/nagios/objects")
   $nagios_host_cfg = hdp_default("nagios_host_cfg","${nagios_obj_dir}/hadoop-hosts.cfg")
   $nagios_hostgroup_cfg = hdp_default("nagios_hostgroup_cfg","${nagios_obj_dir}/hadoop-hostgroups.cfg")
+  $nagios_servicegroup_cfg = hdp_default("nagios_servicegroup_cfg","${nagios_obj_dir}/hadoop-servicegroups.cfg")
   $nagios_service_cfg = hdp_default("nagios_service_cfg","${nagios_obj_dir}/hadoop-services.cfg")
   $nagios_command_cfg = hdp_default("nagios_command_cfg","${nagios_obj_dir}/hadoop-commands.cfg")
   

+ 1 - 0
hmc/puppet/modules/hdp-nagios/manifests/server/config.pp

@@ -6,6 +6,7 @@ class hdp-nagios::server::config()
   hdp-nagios::server::configfile { 'nagios.cfg': conf_dir => $hdp-nagios::params::conf_dir }
   hdp-nagios::server::configfile { 'hadoop-hosts.cfg': }
   hdp-nagios::server::configfile { 'hadoop-hostgroups.cfg': }
+  hdp-nagios::server::configfile { 'hadoop-servicegroups.cfg': }
   hdp-nagios::server::configfile { 'hadoop-services.cfg': }
   hdp-nagios::server::configfile { 'hadoop-commands.cfg': }
   hdp-nagios::server::configfile { 'contacts.cfg': }

+ 35 - 0
hmc/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb

@@ -12,6 +12,7 @@ define service {
         hostgroup_name          nagios-server        
         use                     hadoop-service
         service_description     NAGIOS::Nagios status log staleness
+        servicegroups           NAGIOS
         check_command           check_nagios!10!/var/nagios/status.dat!/usr/bin/nagios
         normal_check_interval   5
         retry_check_interval    0.5
@@ -23,6 +24,7 @@ define service {
         hostgroup_name          nagios-server
         use                     hadoop-service
         service_description     HDFS::Percent DataNodes storage full
+        servicegroups           HDFS
         check_command           check_aggregate!"DATANODE::Storage full"!10%!30%
         normal_check_interval   2
         retry_check_interval    1 
@@ -33,6 +35,7 @@ define service {
         hostgroup_name          nagios-server
         use                     hadoop-service
         service_description     HDFS::Percent DataNodes down
+        servicegroups           HDFS
         check_command           check_aggregate!"DATANODE::Process down"!10%!30%
         normal_check_interval   0.5
         retry_check_interval    0.25
@@ -44,6 +47,7 @@ define service {
         hostgroup_name          nagios-server
         use                     hadoop-service
         service_description     MAPREDUCE::Percent TaskTrackers down
+        servicegroups           MAPREDUCE
         check_command           check_aggregate!"TASKTRACKER::Process down"!10%!30%
         normal_check_interval   0.5
         retry_check_interval    0.25
@@ -56,6 +60,7 @@ define service {
         hostgroup_name          nagios-server
         use                     hadoop-service
         service_description     ZOOKEEPER::Percent zookeeper servers down
+        servicegroups           ZOOKEEPER
         check_command           check_aggregate!"ZKSERVERS::ZKSERVERS Process down"!35%!70%
         normal_check_interval   0.5
         retry_check_interval    0.25
@@ -69,6 +74,7 @@ define service {
         hostgroup_name          nagios-server
         use                     hadoop-service
         service_description     HBASE::Percent region servers down
+        servicegroups           HBASE
         check_command           check_aggregate!"REGIONSERVER::Process down"!10%!30%
         normal_check_interval   0.5
         retry_check_interval    0.25
@@ -83,6 +89,7 @@ define service {
         hostgroup_name          ganglia-server
         use                     hadoop-service
         service_description     GANGLIA::Ganglia [gmetad] Process down
+        servicegroups           GANGLIA
         check_command           check_tcp!8651!-w 1 -c 1
         normal_check_interval   0.25
         retry_check_interval    0.25
@@ -93,6 +100,7 @@ define service {
         hostgroup_name          ganglia-server
         use                     hadoop-service
         service_description     GANGLIA::Ganglia collector [gmond] Process down alert for slaves
+        servicegroups           GANGLIA
         check_command           check_tcp!8660!-w 1 -c 1
         normal_check_interval   0.25
         retry_check_interval    0.25
@@ -103,6 +111,7 @@ define service {
         hostgroup_name          ganglia-server
         use                     hadoop-service
         service_description     GANGLIA::Ganglia collector [gmond] Process down alert for namenode
+        servicegroups           GANGLIA
         check_command           check_tcp!8661!-w 1 -c 1
         normal_check_interval   0.25
         retry_check_interval    0.25
@@ -113,6 +122,7 @@ define service {
         hostgroup_name          ganglia-server
         use                     hadoop-service
         service_description     GANGLIA::Ganglia collector [gmond] Process down alert for jobtracker
+        servicegroups           GANGLIA
         check_command           check_tcp!8662!-w 1 -c 1
         normal_check_interval   0.25
         retry_check_interval    0.25
@@ -123,6 +133,7 @@ define service {
         hostgroup_name          ganglia-server
         use                     hadoop-service
         service_description     GANGLIA::Ganglia collector [gmond] Process down alert for hbasemaster
+        servicegroups           GANGLIA
         check_command           check_tcp!8663!-w 1 -c 1
         normal_check_interval   0.25
         retry_check_interval    0.25
@@ -136,6 +147,7 @@ define service {
         hostgroup_name          snamenode
         use                     hadoop-service
         service_description     NAMENODE::Secondary Namenode Process down
+        servicegroups           HDFS
         check_command           check_tcp!50090!-w 1 -c 1
         normal_check_interval   0.5
         retry_check_interval    0.25
@@ -148,6 +160,7 @@ define service {
         hostgroup_name          namenode
         use                     hadoop-service
         service_description     NAMENODE::Namenode Web UI down
+        servicegroups           HDFS
         check_command           check_webui!namenode
         normal_check_interval   1
         retry_check_interval    1
@@ -158,6 +171,7 @@ define service {
         hostgroup_name          namenode
         use                     hadoop-service
         service_description     NAMENODE::Namenode Edit logs directory status
+        servicegroups           HDFS
         check_command           check_name_dir_status!50070
         normal_check_interval   0.5
         retry_check_interval    0.5
@@ -168,6 +182,7 @@ define service {
         hostgroup_name          namenode        
         use                     hadoop-service
         service_description     NAMENODE::Namenode Host CPU utilization
+        servicegroups           HDFS
         check_command           check_cpu!200%!250%
         normal_check_interval   5
         retry_check_interval    2 
@@ -178,6 +193,7 @@ define service {
         hostgroup_name          namenode
         use                     hadoop-service
         service_description     NAMENODE::Namenode Process down
+        servicegroups           HDFS
         check_command           check_tcp!8020!-w 1 -c 1
         normal_check_interval   0.5
         retry_check_interval    0.25
@@ -188,6 +204,7 @@ define service {
         hostgroup_name          namenode
         use                     hadoop-service
         service_description     HDFS::Corrupt/Missing blocks
+        servicegroups           HDFS
         check_command           check_hdfs_blocks!50070!0%!0%
         normal_check_interval   2
         retry_check_interval    1 
@@ -198,6 +215,7 @@ define service {
         hostgroup_name          namenode
         use                     hadoop-service
         service_description     HDFS::HDFS Capacity utilization
+        servicegroups           HDFS
         check_command           check_hdfs_capacity!50070!80%!90%
         normal_check_interval   10
         retry_check_interval    1 
@@ -208,6 +226,7 @@ define service {
         hostgroup_name          namenode
         use                     hadoop-service
         service_description     HDFS::Namenode RPC Latency
+        servicegroups           HDFS
         check_command           check_rpcq_latency!NameNode!50070!3000!5000
         normal_check_interval   5
         retry_check_interval    1 
@@ -221,6 +240,7 @@ define service {
         hostgroup_name          jobtracker
         use                     hadoop-service
         service_description     JOBTRACKER::JobTracker Web UI down
+        servicegroups           MAPREDUCE
         check_command           check_webui!jobtracker
         normal_check_interval   1
         retry_check_interval    1
@@ -231,6 +251,7 @@ define service {
         hostgroup_name          jobtracker
         use                     hadoop-service
         service_description     JOBTRACKER::JobHistory Web UI down
+        servicegroups           MAPREDUCE
         check_command           check_webui!jobhistory
         normal_check_interval   1
         retry_check_interval    1
@@ -241,6 +262,7 @@ define service {
         hostgroup_name          jobtracker
         use                     hadoop-service
         service_description     JOBTRACKER::Jobtracker CPU utilization
+        servicegroups           MAPREDUCE
         check_command           check_cpu!200%!250%
         normal_check_interval   5
         retry_check_interval    2 
@@ -251,6 +273,7 @@ define service {
         hostgroup_name          jobtracker
         use                     hadoop-service
         service_description     JOBTRACKER::Jobtracker Process down
+        servicegroups           MAPREDUCE
         check_command           check_tcp!50030!-w 1 -c 1
         normal_check_interval   0.5
         retry_check_interval    0.25
@@ -261,6 +284,7 @@ define service {
         hostgroup_name          jobtracker
         use                     hadoop-service
         service_description     MAPREDUCE::JobTracker RPC Latency
+        servicegroups           MAPREDUCE
         check_command           check_rpcq_latency!JobTracker!50030!3000!5000
         normal_check_interval   5
         retry_check_interval    1 
@@ -274,6 +298,7 @@ define service {
         hostgroup_name          slaves
         use                     hadoop-service
         service_description     DATANODE::Process down
+        servicegroups           HDFS
         check_command           check_tcp!50010!-w 1 -c 1
         normal_check_interval   1
         retry_check_interval    0.5
@@ -285,6 +310,7 @@ define service {
         hostgroup_name          slaves
         use                     hadoop-service
         service_description     DATANODE::Storage full
+        servicegroups           HDFS
         check_command           check_datanode_storage!50075!90%!90%
         normal_check_interval   5
         retry_check_interval    1
@@ -297,6 +323,7 @@ define service {
         hostgroup_name          slaves
         use                     hadoop-service
         service_description     TASKTRACKER::Process down
+        servicegroups           MAPREDUCE
         check_command           check_tcp!50060!-w 1 -c 1
         normal_check_interval   1
         retry_check_interval    0.5
@@ -311,6 +338,7 @@ define service {
         hostgroup_name          region-servers
         use                     hadoop-service
         service_description     REGIONSERVER::Process down
+        servicegroups           HBASE
         check_command           check_tcp!60020!-w 1 -c 1
         normal_check_interval   1
         retry_check_interval    0.5
@@ -325,6 +353,7 @@ define service {
         hostgroup_name          zookeeper-servers
         use                     hadoop-service
         service_description     ZKSERVERS::ZKSERVERS Process down
+        servicegroups           ZOOKEEPER
         check_command           check_tcp!2181!-w 1 -c 1
         normal_check_interval   1
         retry_check_interval    0.5
@@ -338,6 +367,7 @@ define service {
         hostgroup_name          hbasemaster
         use                     hadoop-service
         service_description     HBASEMASTER::HBase Web UI down
+        servicegroups           HBASE
         check_command           check_webui!hbase
         normal_check_interval   1
         retry_check_interval    1
@@ -348,6 +378,7 @@ define service {
         hostgroup_name          hbasemaster
         use                     hadoop-service
         service_description     HBASEMASTER::HBaseMaster CPU utilization
+        servicegroups           HBASE
         check_command           check_cpu!200%!250%
         normal_check_interval   5
         retry_check_interval    2 
@@ -358,6 +389,7 @@ define service {
         hostgroup_name          hbasemaster
         use                     hadoop-service
         service_description     HBASEMASTER::HBaseMaster Process down
+        servicegroups           HBASE
         check_command           check_tcp!60000!-w 1 -c 1
         normal_check_interval   0.5
         retry_check_interval    0.25
@@ -371,6 +403,7 @@ define service {
         hostgroup_name          hiveserver
         use                     hadoop-service
         service_description     HIVE-METASTORE::HIVE-METASTORE status check
+        servicegroups           HIVE-METASTORE
         check_command           check_hive_metastore_status!9083
         normal_check_interval   0.5
         retry_check_interval    0.5
@@ -383,6 +416,7 @@ define service {
         hostgroup_name          oozie-server
         use                     hadoop-service
         service_description     OOZIE::Oozie status check
+        servicegroups           OOZIE
         check_command           check_oozie_status!11000!<%=scope.function_hdp_template_var("java32_home") %>
         normal_check_interval   1
         retry_check_interval    1
@@ -395,6 +429,7 @@ define service {
         hostgroup_name          templeton-server
         use                     hadoop-service
         service_description     TEMPLETON::Templeton status check
+        servicegroups           TEMPLETON
         check_command           check_templeton_status!50111!v1
         normal_check_interval   1
         retry_check_interval    0.5

+ 1 - 0
hmc/puppet/modules/hdp-nagios/templates/nagios.cfg.erb

@@ -47,6 +47,7 @@ cfg_file=/etc/nagios/objects/templates.cfg
 # Definitions for hadoop servers
 cfg_file=<%=scope.function_hdp_template_var("nagios_host_cfg")%>
 cfg_file=<%=scope.function_hdp_template_var("nagios_hostgroup_cfg")%>
+cfg_file=<%=scope.function_hdp_template_var("nagios_servicegroup_cfg")%>
 cfg_file=<%=scope.function_hdp_template_var("nagios_service_cfg")%>
 cfg_file=<%=scope.function_hdp_template_var("nagios_command_cfg")%>