|
@@ -3,7 +3,7 @@
|
|
|
define service {
|
|
|
hostgroup_name nagios-server
|
|
|
use generic-service
|
|
|
- service_description NAGIOS::Nagios status log staleness alert
|
|
|
+ service_description NAGIOS::Nagios status log staleness
|
|
|
check_command check_nagios!10!/var/nagios/status.dat!/usr/bin/nagios
|
|
|
normal_check_interval 5
|
|
|
retry_check_interval 0.5
|
|
@@ -14,8 +14,8 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name nagios-server
|
|
|
use generic-service
|
|
|
- service_description HDFS::Percent DataNodes storage full alert
|
|
|
- check_command check_aggregate!"DATANODE::Storage full alert"!10%!30%
|
|
|
+ service_description HDFS::Percent DataNodes storage full
|
|
|
+ check_command check_aggregate!"DATANODE::Storage full"!10%!30%
|
|
|
normal_check_interval 2
|
|
|
retry_check_interval 1
|
|
|
max_check_attempts 1
|
|
@@ -24,8 +24,8 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name nagios-server
|
|
|
use generic-service
|
|
|
- service_description HDFS::Percent DataNodes down alert
|
|
|
- check_command check_aggregate!"DATANODE::Process down alert"!10%!30%
|
|
|
+ service_description HDFS::Percent DataNodes down
|
|
|
+ check_command check_aggregate!"DATANODE::Process down"!10%!30%
|
|
|
normal_check_interval 0.5
|
|
|
retry_check_interval 0.25
|
|
|
max_check_attempts 3
|
|
@@ -35,8 +35,8 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name nagios-server
|
|
|
use generic-service
|
|
|
- service_description MAPREDUCE::Percent TaskTrackers down alert
|
|
|
- check_command check_aggregate!"TASKTRACKER::Process down alert"!10%!30%
|
|
|
+ service_description MAPREDUCE::Percent TaskTrackers down
|
|
|
+ check_command check_aggregate!"TASKTRACKER::Process down"!10%!30%
|
|
|
normal_check_interval 0.5
|
|
|
retry_check_interval 0.25
|
|
|
max_check_attempts 3
|
|
@@ -47,8 +47,8 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name nagios-server
|
|
|
use generic-service
|
|
|
- service_description ZOOKEEPER::Percent zookeeper servers down alert
|
|
|
- check_command check_aggregate!"ZKSERVERS::ZKSERVERS Process down alert"!35%!70%
|
|
|
+ service_description ZOOKEEPER::Percent zookeeper servers down
|
|
|
+ check_command check_aggregate!"ZKSERVERS::ZKSERVERS Process down"!35%!70%
|
|
|
normal_check_interval 0.5
|
|
|
retry_check_interval 0.25
|
|
|
max_check_attempts 3
|
|
@@ -60,8 +60,8 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name nagios-server
|
|
|
use generic-service
|
|
|
- service_description HBASE::Percent region servers down alert
|
|
|
- check_command check_aggregate!"REGIONSERVER::Process down alert"!10%!30%
|
|
|
+ service_description HBASE::Percent region servers down
|
|
|
+ check_command check_aggregate!"REGIONSERVER::Process down"!10%!30%
|
|
|
normal_check_interval 0.5
|
|
|
retry_check_interval 0.25
|
|
|
max_check_attempts 3
|
|
@@ -74,7 +74,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name ganglia-server
|
|
|
use generic-service
|
|
|
- service_description GANGLIA::Ganglia [gmetad] Process down alert
|
|
|
+ service_description GANGLIA::Ganglia [gmetad] Process down
|
|
|
check_command check_tcp!8651!-w 1 -c 1
|
|
|
normal_check_interval 0.25
|
|
|
retry_check_interval 0.25
|
|
@@ -127,7 +127,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name namenode
|
|
|
use generic-service
|
|
|
- service_description NAMENODE::Namenode Web UI down alert
|
|
|
+ service_description NAMENODE::Namenode Web UI down
|
|
|
check_command check_webui!namenode
|
|
|
normal_check_interval 1
|
|
|
retry_check_interval 1
|
|
@@ -137,7 +137,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name namenode
|
|
|
use generic-service
|
|
|
- service_description NAMENODE::Namenode Edit logs directory status alert
|
|
|
+ service_description NAMENODE::Namenode Edit logs directory status
|
|
|
check_command check_name_dir_status!50070
|
|
|
normal_check_interval 0.5
|
|
|
retry_check_interval 0.5
|
|
@@ -147,7 +147,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name namenode
|
|
|
use generic-service
|
|
|
- service_description NAMENODE::Namenode Host CPU utilization alert
|
|
|
+ service_description NAMENODE::Namenode Host CPU utilization
|
|
|
check_command check_cpu!200%!250%
|
|
|
normal_check_interval 5
|
|
|
retry_check_interval 2
|
|
@@ -157,7 +157,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name namenode
|
|
|
use generic-service
|
|
|
- service_description NAMENODE::Namenode Process down alert
|
|
|
+ service_description NAMENODE::Namenode Process down
|
|
|
check_command check_tcp!8020!-w 1 -c 1
|
|
|
normal_check_interval 0.5
|
|
|
retry_check_interval 0.25
|
|
@@ -167,7 +167,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name namenode
|
|
|
use generic-service
|
|
|
- service_description HDFS::Corrupt/Missing blocks alert
|
|
|
+ service_description HDFS::Corrupt/Missing blocks
|
|
|
check_command check_hdfs_blocks!50070!0%!0%
|
|
|
normal_check_interval 2
|
|
|
retry_check_interval 1
|
|
@@ -177,7 +177,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name namenode
|
|
|
use generic-service
|
|
|
- service_description HDFS::HDFS Capacity utilization alert
|
|
|
+ service_description HDFS::HDFS Capacity utilization
|
|
|
check_command check_hdfs_capacity!50070!80%!90%
|
|
|
normal_check_interval 10
|
|
|
retry_check_interval 1
|
|
@@ -187,7 +187,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name namenode
|
|
|
use generic-service
|
|
|
- service_description HDFS::Namenode RPC Latency alert
|
|
|
+ service_description HDFS::Namenode RPC Latency
|
|
|
check_command check_rpcq_latency!NameNode!50070!3000!5000
|
|
|
normal_check_interval 5
|
|
|
retry_check_interval 1
|
|
@@ -200,7 +200,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name jobtracker
|
|
|
use generic-service
|
|
|
- service_description JOBTRACKER::JobTracker Web UI down alert
|
|
|
+ service_description JOBTRACKER::JobTracker Web UI down
|
|
|
check_command check_webui!jobtracker
|
|
|
normal_check_interval 1
|
|
|
retry_check_interval 1
|
|
@@ -210,7 +210,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name jobtracker
|
|
|
use generic-service
|
|
|
- service_description JOBTRACKER::JobHistory Web UI down alert
|
|
|
+ service_description JOBTRACKER::JobHistory Web UI down
|
|
|
check_command check_webui!jobhistory
|
|
|
normal_check_interval 1
|
|
|
retry_check_interval 1
|
|
@@ -220,7 +220,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name jobtracker
|
|
|
use generic-service
|
|
|
- service_description JOBTRACKER::Jobtracker CPU utilization alert
|
|
|
+ service_description JOBTRACKER::Jobtracker CPU utilization
|
|
|
check_command check_cpu!200%!250%
|
|
|
normal_check_interval 5
|
|
|
retry_check_interval 2
|
|
@@ -230,7 +230,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name jobtracker
|
|
|
use generic-service
|
|
|
- service_description JOBTRACKER::Jobtracker Process down alert
|
|
|
+ service_description JOBTRACKER::Jobtracker Process down
|
|
|
check_command check_tcp!50030!-w 1 -c 1
|
|
|
normal_check_interval 0.5
|
|
|
retry_check_interval 0.25
|
|
@@ -240,7 +240,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name jobtracker
|
|
|
use generic-service
|
|
|
- service_description MAPREDUCE::JobTracker RPC Latency alert
|
|
|
+ service_description MAPREDUCE::JobTracker RPC Latency
|
|
|
check_command check_rpcq_latency!JobTracker!50030!3000!5000
|
|
|
normal_check_interval 5
|
|
|
retry_check_interval 1
|
|
@@ -253,7 +253,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name slaves
|
|
|
use generic-service
|
|
|
- service_description DATANODE::Process down alert
|
|
|
+ service_description DATANODE::Process down
|
|
|
check_command check_tcp!50010!-w 1 -c 1
|
|
|
normal_check_interval 1
|
|
|
retry_check_interval 0.5
|
|
@@ -263,7 +263,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name slaves
|
|
|
use generic-service
|
|
|
- service_description DATANODE::Storage full alert
|
|
|
+ service_description DATANODE::Storage full
|
|
|
check_command check_datanode_storage!50075!90%!90%
|
|
|
normal_check_interval 5
|
|
|
retry_check_interval 1
|
|
@@ -274,7 +274,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name slaves
|
|
|
use generic-service
|
|
|
- service_description TASKTRACKER::Process down alert
|
|
|
+ service_description TASKTRACKER::Process down
|
|
|
check_command check_tcp!50060!-w 1 -c 1
|
|
|
normal_check_interval 1
|
|
|
retry_check_interval 0.5
|
|
@@ -287,7 +287,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name region-servers
|
|
|
use generic-service
|
|
|
- service_description REGIONSERVER::Process down alert
|
|
|
+ service_description REGIONSERVER::Process down
|
|
|
check_command check_tcp!60020!-w 1 -c 1
|
|
|
normal_check_interval 1
|
|
|
retry_check_interval 0.5
|
|
@@ -300,7 +300,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name zookeeper-servers
|
|
|
use generic-service
|
|
|
- service_description ZKSERVERS::ZKSERVERS Process down alert
|
|
|
+ service_description ZKSERVERS::ZKSERVERS Process down
|
|
|
check_command check_tcp!2181!-w 1 -c 1
|
|
|
normal_check_interval 1
|
|
|
retry_check_interval 0.5
|
|
@@ -313,7 +313,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name hbasemaster
|
|
|
use generic-service
|
|
|
- service_description HBASEMASTER::HBase Web UI down alert
|
|
|
+ service_description HBASEMASTER::HBase Web UI down
|
|
|
check_command check_webui!hbase
|
|
|
normal_check_interval 1
|
|
|
retry_check_interval 1
|
|
@@ -323,7 +323,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name hbasemaster
|
|
|
use generic-service
|
|
|
- service_description HBASEMASTER::HBaseMaster CPU utilization alert
|
|
|
+ service_description HBASEMASTER::HBaseMaster CPU utilization
|
|
|
check_command check_cpu!200%!250%
|
|
|
normal_check_interval 5
|
|
|
retry_check_interval 2
|
|
@@ -333,7 +333,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name hbasemaster
|
|
|
use generic-service
|
|
|
- service_description HBASEMASTER::HBaseMaster Process down alert
|
|
|
+ service_description HBASEMASTER::HBaseMaster Process down
|
|
|
check_command check_tcp!60000!-w 1 -c 1
|
|
|
normal_check_interval 0.5
|
|
|
retry_check_interval 0.25
|
|
@@ -346,7 +346,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name hiveserver
|
|
|
use generic-service
|
|
|
- service_description HIVE-METASTORE::HIVE-METASTORE status check alert
|
|
|
+ service_description HIVE-METASTORE::HIVE-METASTORE status check
|
|
|
check_command check_hive_metastore_status!9083
|
|
|
normal_check_interval 0.5
|
|
|
retry_check_interval 0.5
|
|
@@ -358,7 +358,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name oozie-server
|
|
|
use generic-service
|
|
|
- service_description OOZIE::Oozie status check alert
|
|
|
+ service_description OOZIE::Oozie status check
|
|
|
check_command check_oozie_status!11000!<%=scope.function_hdp_template_var("java32_home") %>
|
|
|
normal_check_interval 1
|
|
|
retry_check_interval 1
|
|
@@ -370,7 +370,7 @@ define service {
|
|
|
define service {
|
|
|
hostgroup_name templeton-server
|
|
|
use generic-service
|
|
|
- service_description TEMPLETON::Templeton status check alert
|
|
|
+ service_description TEMPLETON::Templeton status check
|
|
|
check_command check_templeton_status!50111!v1
|
|
|
normal_check_interval 1
|
|
|
retry_check_interval 0.5
|