|
@@ -188,20 +188,6 @@ define service {
|
|
|
{% endfor %}
|
|
|
{% endif %}
|
|
|
|
|
|
-{% if hostgroup_defs['resourcemanager'] %}
|
|
|
-{% for hostname in hostgroup_defs['resourcemanager'] %}
|
|
|
-define service {
|
|
|
- host_name {{ hostname }}
|
|
|
- use hadoop-service
|
|
|
- service_description GANGLIA::Ganglia Monitor process for ResourceManager
|
|
|
- servicegroups GANGLIA
|
|
|
- check_command check_tcp_wrapper!{{ ganglia_collector_rm_port }}!-w 1 -c 1
|
|
|
- normal_check_interval 0.25
|
|
|
- retry_check_interval 0.25
|
|
|
- max_check_attempts 4
|
|
|
-}
|
|
|
-{% endfor %}
|
|
|
-{% endif %}
|
|
|
|
|
|
{% if hostgroup_defs['historyserver2'] %}
|
|
|
{% for hostname in hostgroup_defs['historyserver2'] %}
|
|
@@ -422,145 +408,6 @@ define service {
|
|
|
{% endif %}
|
|
|
{% endif %}
|
|
|
|
|
|
-{% if hostgroup_defs['resourcemanager'] %}
|
|
|
-# YARN::RESOURCEMANAGER Checks
|
|
|
-define service {
|
|
|
- hostgroup_name resourcemanager
|
|
|
- use hadoop-service
|
|
|
- service_description RESOURCEMANAGER::ResourceManager Web UI
|
|
|
- servicegroups YARN
|
|
|
- check_command check_webui!resourcemanager!{{ rm_port }}
|
|
|
- normal_check_interval 1
|
|
|
- retry_check_interval 1
|
|
|
- max_check_attempts 3
|
|
|
-}
|
|
|
-
|
|
|
-{% if check_cpu_on %}
|
|
|
-define service {
|
|
|
- hostgroup_name resourcemanager
|
|
|
- use hadoop-service
|
|
|
- service_description RESOURCEMANAGER::ResourceManager CPU utilization
|
|
|
- servicegroups YARN
|
|
|
-# check_command check_cpu!200%!250%
|
|
|
- check_command check_cpu!{{ rm_port }}!200%!250%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }}
|
|
|
- normal_check_interval 5
|
|
|
- retry_check_interval 2
|
|
|
- max_check_attempts 5
|
|
|
-}
|
|
|
-{% endif %}
|
|
|
-
|
|
|
-define service {
|
|
|
- hostgroup_name resourcemanager
|
|
|
- use hadoop-service
|
|
|
- service_description RESOURCEMANAGER::ResourceManager RPC latency
|
|
|
- servicegroups YARN
|
|
|
- check_command check_rpcq_latency!ResourceManager!{{ rm_port }}!3000!5000!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }}
|
|
|
- normal_check_interval 5
|
|
|
- retry_check_interval 1
|
|
|
- max_check_attempts 5
|
|
|
-}
|
|
|
-
|
|
|
-define service {
|
|
|
- hostgroup_name resourcemanager
|
|
|
- use hadoop-service
|
|
|
- service_description RESOURCEMANAGER::ResourceManager process
|
|
|
- servicegroups YARN
|
|
|
- check_command check_tcp_wrapper!{{ rm_port }}!-w 1 -c 1
|
|
|
- normal_check_interval 1
|
|
|
- retry_check_interval 0.5
|
|
|
- max_check_attempts 3
|
|
|
-}
|
|
|
-{% endif %}
|
|
|
-
|
|
|
-{% if hostgroup_defs['nodemanagers'] %}
|
|
|
-# YARN::NODEMANAGER Checks
|
|
|
-define service {
|
|
|
- hostgroup_name nodemanagers
|
|
|
- use hadoop-service
|
|
|
- service_description NODEMANAGER::NodeManager process
|
|
|
- servicegroups YARN
|
|
|
- check_command check_tcp_wrapper!{{ nm_port }}!-w 1 -c 1
|
|
|
- normal_check_interval 1
|
|
|
- retry_check_interval 0.5
|
|
|
- max_check_attempts 3
|
|
|
-}
|
|
|
-
|
|
|
-define service {
|
|
|
- hostgroup_name nodemanagers
|
|
|
- use hadoop-service
|
|
|
- service_description NODEMANAGER::NodeManager health
|
|
|
- servicegroups YARN
|
|
|
- check_command check_nodemanager_health!{{ nm_port }}!{{ str(security_enabled).lower() }}!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}
|
|
|
- normal_check_interval 1
|
|
|
- retry_check_interval 1
|
|
|
- max_check_attempts 3
|
|
|
-}
|
|
|
-define service {
|
|
|
- hostgroup_name nagios-server
|
|
|
- use hadoop-service
|
|
|
- service_description NODEMANAGER::Percent NodeManagers live
|
|
|
- servicegroups YARN
|
|
|
- check_command check_aggregate!"NODEMANAGER::NodeManager process"!10%!30%
|
|
|
- normal_check_interval 0.5
|
|
|
- retry_check_interval 0.25
|
|
|
- max_check_attempts 3
|
|
|
-}
|
|
|
-{% endif %}
|
|
|
-
|
|
|
-{% if hostgroup_defs['historyserver2'] %}
|
|
|
-# MAPREDUCE::JOBHISTORY Checks
|
|
|
-define service {
|
|
|
- hostgroup_name historyserver2
|
|
|
- use hadoop-service
|
|
|
- service_description JOBHISTORY::HistoryServer Web UI
|
|
|
- servicegroups MAPREDUCE
|
|
|
- check_command check_webui!historyserver2!{{ hs_port }}
|
|
|
- normal_check_interval 1
|
|
|
- retry_check_interval 1
|
|
|
- max_check_attempts 3
|
|
|
-}
|
|
|
-
|
|
|
-{% if check_cpu_on %}
|
|
|
-define service {
|
|
|
- hostgroup_name historyserver2
|
|
|
- use hadoop-service
|
|
|
- service_description JOBHISTORY::HistoryServer CPU utilization
|
|
|
- servicegroups MAPREDUCE
|
|
|
-# check_command check_cpu!200%!250%
|
|
|
- check_command check_cpu!{{ hs_port }}!200%!250%!{{ str(hadoop_ssl_enabled).lower() }}!{{ nagios_keytab_path }}!{{ nagios_principal_name }}!{{ kinit_path_local }}!{{ str(security_enabled).lower() }}
|
|
|
- normal_check_interval 5
|
|
|
- retry_check_interval 2
|
|
|
- max_check_attempts 5
|
|
|
-}
|
|
|
-{% endif %}
|
|
|
-
|
|
|
-define service {
|
|
|
- hostgroup_name historyserver2
|
|
|
- use hadoop-service
|
|
|
- service_description JOBHISTORY::HistoryServer process
|
|
|
- servicegroups MAPREDUCE
|
|
|
- check_command check_tcp_wrapper!{{ hs_port }}!-w 1 -c 1
|
|
|
- normal_check_interval 1
|
|
|
- retry_check_interval 0.5
|
|
|
- max_check_attempts 3
|
|
|
-}
|
|
|
-
|
|
|
-{% endif %}
|
|
|
-
|
|
|
-{% if hostgroup_defs['journalnodes'] %}
|
|
|
-# Journalnode checks
|
|
|
-define service {
|
|
|
- hostgroup_name journalnodes
|
|
|
- use hadoop-service
|
|
|
- service_description JOURNALNODE::JournalNode process
|
|
|
- servicegroups HDFS
|
|
|
- check_command check_tcp_wrapper!{{ journalnode_port }}!-w 1 -c 1
|
|
|
- normal_check_interval 1
|
|
|
- retry_check_interval 0.5
|
|
|
- max_check_attempts 3
|
|
|
-}
|
|
|
-
|
|
|
-{% endif %}
|
|
|
|
|
|
{% if hostgroup_defs['slaves'] %}
|
|
|
# HDFS::DATANODE Checks
|