|
@@ -12,6 +12,7 @@ define service {
|
|
hostgroup_name nagios-server
|
|
hostgroup_name nagios-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description NAGIOS::Nagios status log staleness
|
|
service_description NAGIOS::Nagios status log staleness
|
|
|
|
+ servicegroups NAGIOS
|
|
check_command check_nagios!10!/var/nagios/status.dat!/usr/bin/nagios
|
|
check_command check_nagios!10!/var/nagios/status.dat!/usr/bin/nagios
|
|
normal_check_interval 5
|
|
normal_check_interval 5
|
|
retry_check_interval 0.5
|
|
retry_check_interval 0.5
|
|
@@ -23,6 +24,7 @@ define service {
|
|
hostgroup_name nagios-server
|
|
hostgroup_name nagios-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description HDFS::Percent DataNodes storage full
|
|
service_description HDFS::Percent DataNodes storage full
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_aggregate!"DATANODE::Storage full"!10%!30%
|
|
check_command check_aggregate!"DATANODE::Storage full"!10%!30%
|
|
normal_check_interval 2
|
|
normal_check_interval 2
|
|
retry_check_interval 1
|
|
retry_check_interval 1
|
|
@@ -33,6 +35,7 @@ define service {
|
|
hostgroup_name nagios-server
|
|
hostgroup_name nagios-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description HDFS::Percent DataNodes down
|
|
service_description HDFS::Percent DataNodes down
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_aggregate!"DATANODE::Process down"!10%!30%
|
|
check_command check_aggregate!"DATANODE::Process down"!10%!30%
|
|
normal_check_interval 0.5
|
|
normal_check_interval 0.5
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -44,6 +47,7 @@ define service {
|
|
hostgroup_name nagios-server
|
|
hostgroup_name nagios-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description MAPREDUCE::Percent TaskTrackers down
|
|
service_description MAPREDUCE::Percent TaskTrackers down
|
|
|
|
+ servicegroups MAPREDUCE
|
|
check_command check_aggregate!"TASKTRACKER::Process down"!10%!30%
|
|
check_command check_aggregate!"TASKTRACKER::Process down"!10%!30%
|
|
normal_check_interval 0.5
|
|
normal_check_interval 0.5
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -56,6 +60,7 @@ define service {
|
|
hostgroup_name nagios-server
|
|
hostgroup_name nagios-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description ZOOKEEPER::Percent zookeeper servers down
|
|
service_description ZOOKEEPER::Percent zookeeper servers down
|
|
|
|
+ servicegroups ZOOKEEPER
|
|
check_command check_aggregate!"ZKSERVERS::ZKSERVERS Process down"!35%!70%
|
|
check_command check_aggregate!"ZKSERVERS::ZKSERVERS Process down"!35%!70%
|
|
normal_check_interval 0.5
|
|
normal_check_interval 0.5
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -69,6 +74,7 @@ define service {
|
|
hostgroup_name nagios-server
|
|
hostgroup_name nagios-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description HBASE::Percent region servers down
|
|
service_description HBASE::Percent region servers down
|
|
|
|
+ servicegroups HBASE
|
|
check_command check_aggregate!"REGIONSERVER::Process down"!10%!30%
|
|
check_command check_aggregate!"REGIONSERVER::Process down"!10%!30%
|
|
normal_check_interval 0.5
|
|
normal_check_interval 0.5
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -83,6 +89,7 @@ define service {
|
|
hostgroup_name ganglia-server
|
|
hostgroup_name ganglia-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description GANGLIA::Ganglia [gmetad] Process down
|
|
service_description GANGLIA::Ganglia [gmetad] Process down
|
|
|
|
+ servicegroups GANGLIA
|
|
check_command check_tcp!8651!-w 1 -c 1
|
|
check_command check_tcp!8651!-w 1 -c 1
|
|
normal_check_interval 0.25
|
|
normal_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -93,6 +100,7 @@ define service {
|
|
hostgroup_name ganglia-server
|
|
hostgroup_name ganglia-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description GANGLIA::Ganglia collector [gmond] Process down alert for slaves
|
|
service_description GANGLIA::Ganglia collector [gmond] Process down alert for slaves
|
|
|
|
+ servicegroups GANGLIA
|
|
check_command check_tcp!8660!-w 1 -c 1
|
|
check_command check_tcp!8660!-w 1 -c 1
|
|
normal_check_interval 0.25
|
|
normal_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -103,6 +111,7 @@ define service {
|
|
hostgroup_name ganglia-server
|
|
hostgroup_name ganglia-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description GANGLIA::Ganglia collector [gmond] Process down alert for namenode
|
|
service_description GANGLIA::Ganglia collector [gmond] Process down alert for namenode
|
|
|
|
+ servicegroups GANGLIA
|
|
check_command check_tcp!8661!-w 1 -c 1
|
|
check_command check_tcp!8661!-w 1 -c 1
|
|
normal_check_interval 0.25
|
|
normal_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -113,6 +122,7 @@ define service {
|
|
hostgroup_name ganglia-server
|
|
hostgroup_name ganglia-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description GANGLIA::Ganglia collector [gmond] Process down alert for jobtracker
|
|
service_description GANGLIA::Ganglia collector [gmond] Process down alert for jobtracker
|
|
|
|
+ servicegroups GANGLIA
|
|
check_command check_tcp!8662!-w 1 -c 1
|
|
check_command check_tcp!8662!-w 1 -c 1
|
|
normal_check_interval 0.25
|
|
normal_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -123,6 +133,7 @@ define service {
|
|
hostgroup_name ganglia-server
|
|
hostgroup_name ganglia-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description GANGLIA::Ganglia collector [gmond] Process down alert for hbasemaster
|
|
service_description GANGLIA::Ganglia collector [gmond] Process down alert for hbasemaster
|
|
|
|
+ servicegroups GANGLIA
|
|
check_command check_tcp!8663!-w 1 -c 1
|
|
check_command check_tcp!8663!-w 1 -c 1
|
|
normal_check_interval 0.25
|
|
normal_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -136,6 +147,7 @@ define service {
|
|
hostgroup_name snamenode
|
|
hostgroup_name snamenode
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description NAMENODE::Secondary Namenode Process down
|
|
service_description NAMENODE::Secondary Namenode Process down
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_tcp!50090!-w 1 -c 1
|
|
check_command check_tcp!50090!-w 1 -c 1
|
|
normal_check_interval 0.5
|
|
normal_check_interval 0.5
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -148,6 +160,7 @@ define service {
|
|
hostgroup_name namenode
|
|
hostgroup_name namenode
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description NAMENODE::Namenode Web UI down
|
|
service_description NAMENODE::Namenode Web UI down
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_webui!namenode
|
|
check_command check_webui!namenode
|
|
normal_check_interval 1
|
|
normal_check_interval 1
|
|
retry_check_interval 1
|
|
retry_check_interval 1
|
|
@@ -158,6 +171,7 @@ define service {
|
|
hostgroup_name namenode
|
|
hostgroup_name namenode
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description NAMENODE::Namenode Edit logs directory status
|
|
service_description NAMENODE::Namenode Edit logs directory status
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_name_dir_status!50070
|
|
check_command check_name_dir_status!50070
|
|
normal_check_interval 0.5
|
|
normal_check_interval 0.5
|
|
retry_check_interval 0.5
|
|
retry_check_interval 0.5
|
|
@@ -168,6 +182,7 @@ define service {
|
|
hostgroup_name namenode
|
|
hostgroup_name namenode
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description NAMENODE::Namenode Host CPU utilization
|
|
service_description NAMENODE::Namenode Host CPU utilization
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_cpu!200%!250%
|
|
check_command check_cpu!200%!250%
|
|
normal_check_interval 5
|
|
normal_check_interval 5
|
|
retry_check_interval 2
|
|
retry_check_interval 2
|
|
@@ -178,6 +193,7 @@ define service {
|
|
hostgroup_name namenode
|
|
hostgroup_name namenode
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description NAMENODE::Namenode Process down
|
|
service_description NAMENODE::Namenode Process down
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_tcp!8020!-w 1 -c 1
|
|
check_command check_tcp!8020!-w 1 -c 1
|
|
normal_check_interval 0.5
|
|
normal_check_interval 0.5
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -188,6 +204,7 @@ define service {
|
|
hostgroup_name namenode
|
|
hostgroup_name namenode
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description HDFS::Corrupt/Missing blocks
|
|
service_description HDFS::Corrupt/Missing blocks
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_hdfs_blocks!50070!0%!0%
|
|
check_command check_hdfs_blocks!50070!0%!0%
|
|
normal_check_interval 2
|
|
normal_check_interval 2
|
|
retry_check_interval 1
|
|
retry_check_interval 1
|
|
@@ -198,6 +215,7 @@ define service {
|
|
hostgroup_name namenode
|
|
hostgroup_name namenode
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description HDFS::HDFS Capacity utilization
|
|
service_description HDFS::HDFS Capacity utilization
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_hdfs_capacity!50070!80%!90%
|
|
check_command check_hdfs_capacity!50070!80%!90%
|
|
normal_check_interval 10
|
|
normal_check_interval 10
|
|
retry_check_interval 1
|
|
retry_check_interval 1
|
|
@@ -208,6 +226,7 @@ define service {
|
|
hostgroup_name namenode
|
|
hostgroup_name namenode
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description HDFS::Namenode RPC Latency
|
|
service_description HDFS::Namenode RPC Latency
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_rpcq_latency!NameNode!50070!3000!5000
|
|
check_command check_rpcq_latency!NameNode!50070!3000!5000
|
|
normal_check_interval 5
|
|
normal_check_interval 5
|
|
retry_check_interval 1
|
|
retry_check_interval 1
|
|
@@ -221,6 +240,7 @@ define service {
|
|
hostgroup_name jobtracker
|
|
hostgroup_name jobtracker
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description JOBTRACKER::JobTracker Web UI down
|
|
service_description JOBTRACKER::JobTracker Web UI down
|
|
|
|
+ servicegroups MAPREDUCE
|
|
check_command check_webui!jobtracker
|
|
check_command check_webui!jobtracker
|
|
normal_check_interval 1
|
|
normal_check_interval 1
|
|
retry_check_interval 1
|
|
retry_check_interval 1
|
|
@@ -231,6 +251,7 @@ define service {
|
|
hostgroup_name jobtracker
|
|
hostgroup_name jobtracker
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description JOBTRACKER::JobHistory Web UI down
|
|
service_description JOBTRACKER::JobHistory Web UI down
|
|
|
|
+ servicegroups MAPREDUCE
|
|
check_command check_webui!jobhistory
|
|
check_command check_webui!jobhistory
|
|
normal_check_interval 1
|
|
normal_check_interval 1
|
|
retry_check_interval 1
|
|
retry_check_interval 1
|
|
@@ -241,6 +262,7 @@ define service {
|
|
hostgroup_name jobtracker
|
|
hostgroup_name jobtracker
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description JOBTRACKER::Jobtracker CPU utilization
|
|
service_description JOBTRACKER::Jobtracker CPU utilization
|
|
|
|
+ servicegroups MAPREDUCE
|
|
check_command check_cpu!200%!250%
|
|
check_command check_cpu!200%!250%
|
|
normal_check_interval 5
|
|
normal_check_interval 5
|
|
retry_check_interval 2
|
|
retry_check_interval 2
|
|
@@ -251,6 +273,7 @@ define service {
|
|
hostgroup_name jobtracker
|
|
hostgroup_name jobtracker
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description JOBTRACKER::Jobtracker Process down
|
|
service_description JOBTRACKER::Jobtracker Process down
|
|
|
|
+ servicegroups MAPREDUCE
|
|
check_command check_tcp!50030!-w 1 -c 1
|
|
check_command check_tcp!50030!-w 1 -c 1
|
|
normal_check_interval 0.5
|
|
normal_check_interval 0.5
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -261,6 +284,7 @@ define service {
|
|
hostgroup_name jobtracker
|
|
hostgroup_name jobtracker
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description MAPREDUCE::JobTracker RPC Latency
|
|
service_description MAPREDUCE::JobTracker RPC Latency
|
|
|
|
+ servicegroups MAPREDUCE
|
|
check_command check_rpcq_latency!JobTracker!50030!3000!5000
|
|
check_command check_rpcq_latency!JobTracker!50030!3000!5000
|
|
normal_check_interval 5
|
|
normal_check_interval 5
|
|
retry_check_interval 1
|
|
retry_check_interval 1
|
|
@@ -274,6 +298,7 @@ define service {
|
|
hostgroup_name slaves
|
|
hostgroup_name slaves
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description DATANODE::Process down
|
|
service_description DATANODE::Process down
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_tcp!50010!-w 1 -c 1
|
|
check_command check_tcp!50010!-w 1 -c 1
|
|
normal_check_interval 1
|
|
normal_check_interval 1
|
|
retry_check_interval 0.5
|
|
retry_check_interval 0.5
|
|
@@ -285,6 +310,7 @@ define service {
|
|
hostgroup_name slaves
|
|
hostgroup_name slaves
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description DATANODE::Storage full
|
|
service_description DATANODE::Storage full
|
|
|
|
+ servicegroups HDFS
|
|
check_command check_datanode_storage!50075!90%!90%
|
|
check_command check_datanode_storage!50075!90%!90%
|
|
normal_check_interval 5
|
|
normal_check_interval 5
|
|
retry_check_interval 1
|
|
retry_check_interval 1
|
|
@@ -297,6 +323,7 @@ define service {
|
|
hostgroup_name slaves
|
|
hostgroup_name slaves
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description TASKTRACKER::Process down
|
|
service_description TASKTRACKER::Process down
|
|
|
|
+ servicegroups MAPREDUCE
|
|
check_command check_tcp!50060!-w 1 -c 1
|
|
check_command check_tcp!50060!-w 1 -c 1
|
|
normal_check_interval 1
|
|
normal_check_interval 1
|
|
retry_check_interval 0.5
|
|
retry_check_interval 0.5
|
|
@@ -311,6 +338,7 @@ define service {
|
|
hostgroup_name region-servers
|
|
hostgroup_name region-servers
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description REGIONSERVER::Process down
|
|
service_description REGIONSERVER::Process down
|
|
|
|
+ servicegroups HBASE
|
|
check_command check_tcp!60020!-w 1 -c 1
|
|
check_command check_tcp!60020!-w 1 -c 1
|
|
normal_check_interval 1
|
|
normal_check_interval 1
|
|
retry_check_interval 0.5
|
|
retry_check_interval 0.5
|
|
@@ -325,6 +353,7 @@ define service {
|
|
hostgroup_name zookeeper-servers
|
|
hostgroup_name zookeeper-servers
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description ZKSERVERS::ZKSERVERS Process down
|
|
service_description ZKSERVERS::ZKSERVERS Process down
|
|
|
|
+ servicegroups ZOOKEEPER
|
|
check_command check_tcp!2181!-w 1 -c 1
|
|
check_command check_tcp!2181!-w 1 -c 1
|
|
normal_check_interval 1
|
|
normal_check_interval 1
|
|
retry_check_interval 0.5
|
|
retry_check_interval 0.5
|
|
@@ -338,6 +367,7 @@ define service {
|
|
hostgroup_name hbasemaster
|
|
hostgroup_name hbasemaster
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description HBASEMASTER::HBase Web UI down
|
|
service_description HBASEMASTER::HBase Web UI down
|
|
|
|
+ servicegroups HBASE
|
|
check_command check_webui!hbase
|
|
check_command check_webui!hbase
|
|
normal_check_interval 1
|
|
normal_check_interval 1
|
|
retry_check_interval 1
|
|
retry_check_interval 1
|
|
@@ -348,6 +378,7 @@ define service {
|
|
hostgroup_name hbasemaster
|
|
hostgroup_name hbasemaster
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description HBASEMASTER::HBaseMaster CPU utilization
|
|
service_description HBASEMASTER::HBaseMaster CPU utilization
|
|
|
|
+ servicegroups HBASE
|
|
check_command check_cpu!200%!250%
|
|
check_command check_cpu!200%!250%
|
|
normal_check_interval 5
|
|
normal_check_interval 5
|
|
retry_check_interval 2
|
|
retry_check_interval 2
|
|
@@ -358,6 +389,7 @@ define service {
|
|
hostgroup_name hbasemaster
|
|
hostgroup_name hbasemaster
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description HBASEMASTER::HBaseMaster Process down
|
|
service_description HBASEMASTER::HBaseMaster Process down
|
|
|
|
+ servicegroups HBASE
|
|
check_command check_tcp!60000!-w 1 -c 1
|
|
check_command check_tcp!60000!-w 1 -c 1
|
|
normal_check_interval 0.5
|
|
normal_check_interval 0.5
|
|
retry_check_interval 0.25
|
|
retry_check_interval 0.25
|
|
@@ -371,6 +403,7 @@ define service {
|
|
hostgroup_name hiveserver
|
|
hostgroup_name hiveserver
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description HIVE-METASTORE::HIVE-METASTORE status check
|
|
service_description HIVE-METASTORE::HIVE-METASTORE status check
|
|
|
|
+ servicegroups HIVE-METASTORE
|
|
check_command check_hive_metastore_status!9083
|
|
check_command check_hive_metastore_status!9083
|
|
normal_check_interval 0.5
|
|
normal_check_interval 0.5
|
|
retry_check_interval 0.5
|
|
retry_check_interval 0.5
|
|
@@ -383,6 +416,7 @@ define service {
|
|
hostgroup_name oozie-server
|
|
hostgroup_name oozie-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description OOZIE::Oozie status check
|
|
service_description OOZIE::Oozie status check
|
|
|
|
+ servicegroups OOZIE
|
|
check_command check_oozie_status!11000!<%=scope.function_hdp_template_var("java32_home") %>
|
|
check_command check_oozie_status!11000!<%=scope.function_hdp_template_var("java32_home") %>
|
|
normal_check_interval 1
|
|
normal_check_interval 1
|
|
retry_check_interval 1
|
|
retry_check_interval 1
|
|
@@ -395,6 +429,7 @@ define service {
|
|
hostgroup_name templeton-server
|
|
hostgroup_name templeton-server
|
|
use hadoop-service
|
|
use hadoop-service
|
|
service_description TEMPLETON::Templeton status check
|
|
service_description TEMPLETON::Templeton status check
|
|
|
|
+ servicegroups TEMPLETON
|
|
check_command check_templeton_status!50111!v1
|
|
check_command check_templeton_status!50111!v1
|
|
normal_check_interval 1
|
|
normal_check_interval 1
|
|
retry_check_interval 0.5
|
|
retry_check_interval 0.5
|