Browse Source

AMBARI-3200. Nagios alert for ambari shows critical even after agent is up (Vitaly Brodetskyi via dlysnichenko)

Lisnichenko Dmitro 11 năm trước
mục cha
commit
5f9b4e834d

+ 1 - 1
ambari-agent/conf/unix/ambari-agent.ini

@@ -23,7 +23,7 @@ prefix=/var/lib/ambari-agent/data
 loglevel=INFO
 data_cleanup_interval=86400
 data_cleanup_max_age=2592000
-ping_port=0
+ping_port=8670
 
 [stack]
 installprefix=/var/ambari-agent/

+ 0 - 39
ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_ambari_agent_status.sh

@@ -1,39 +0,0 @@
-#!/usr/bin/env bash
-#
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#
-AMBARI_AGENT_PID_PATH="/var/run/ambari-agent/ambari-agent.pid";
-RES="3";
-if [ -f $AMBARI_AGENT_PID_PATH ]
-then
-  RES=`cat $AMBARI_AGENT_PID_PATH | xargs ps -f -p | wc -l`;
-  AMBARI_AGENT_PID=`cat $AMBARI_AGENT_PID_PATH`; 
-else 
-  RES=-1; 
-fi
-
-if [ $RES -eq "2" ]
-then
-  echo "OK: Ambari Agent is running [PID:$AMBARI_AGENT_PID]";
-  exit 0;
-else
-  echo "CRITICAL: Ambari Agent is not running [$AMBARI_AGENT_PID_PATH not found]";
-  exit 2;
-fi

+ 2 - 1
ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/params.pp

@@ -92,6 +92,7 @@ class hdp-nagios::params() inherits hdp::params
     hue-server => {host_member_info => 'hue_server_host'},
     resourcemanager => {host_member_info => 'rm_host'},
     nodemanagers => {host_member_info => 'nm_hosts'},
-    historyserver2 => {host_member_info => 'hs_host'}
+    historyserver2 => {host_member_info => 'hs_host'},
+    journalnodes => {host_member_info => 'journalnode_hosts'}
   }
 }

+ 0 - 1
ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp

@@ -47,7 +47,6 @@ class hdp-nagios::server::config()
   hdp-nagios::server::check { 'check_oozie_status.sh': }
   hdp-nagios::server::check { 'check_templeton_status.sh': }
   hdp-nagios::server::check { 'check_hive_metastore_status.sh': }
-  hdp-nagios::server::check { 'check_ambari_agent_status.sh': }
   hdp-nagios::server::check { 'check_hue_status.sh': }
   hdp-nagios::server::check { 'check_mapred_local_dir_used.sh': }
   hdp-nagios::server::check { 'check_nodemanager_health.sh': }

+ 0 - 9
ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb

@@ -83,10 +83,6 @@ define command{
         command_name    check_hive_metastore_status
         command_line    $USER1$/check_hive_metastore_status.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$ $ARG5$ $ARG6$ $ARG7$
        }
-define command{
-        command_name    check_ambari_agent_status
-        command_line    $USER1$/check_ambari_agent_status.sh
-       }
 define command{
         command_name    check_hue_status
         command_line    $USER1$/check_hue_status.sh
@@ -111,8 +107,3 @@ define command{
         command_name    check_resourcemanager_nodes_percentage
         command_line    $USER1$/check_resourcemanager_nodes_percentage.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$
        }
-
-define command{
-	command_name	check_tcp_on_host
-	command_line	$USER1$/check_tcp -H $ARG1$ -p $ARG2$ $ARG3$
-	}

+ 8 - 19
ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb

@@ -79,24 +79,13 @@ define service {
 <%end-%>
 
 # AMBARI AGENT Checks
-define service {
-        hostgroup_name          agent-servers
-        use                     hadoop-service
-        service_description     AMBARI::Ambari Agent process
-        servicegroups           AMBARI
-        check_command           check_ambari_agent_status
-        normal_check_interval   5
-        retry_check_interval    0.5
-        max_check_attempts      2
-}
-
 <%scope.function_hdp_template_var("all_hosts").each_with_index do |hostname, index|-%>
 define service {
-        hostgroup_name          agent-servers
+        host_name	        <%=hostname%>
         use                     hadoop-service
-        service_description     AMBARI::Ambari Agent process on <%=hostname%>
+        service_description     AMBARI::Ambari Agent process
         servicegroups           AMBARI
-        check_command           check_tcp_on_host!<%=hostname%>!<%=scope.function_hdp_template_var("all_ping_ports")[index]%>!-w 1 -c 1
+        check_command           check_tcp!<%=scope.function_hdp_template_var("all_ping_ports")[index]%>!-w 1 -c 1
         normal_check_interval   1
         retry_check_interval    0.25
         max_check_attempts      4
@@ -562,20 +551,20 @@ define service {
 
 <% end %>
 
+<%if scope.function_hdp_nagios_members_exist('journalnodes')-%>
 # Journalnode checks
-<%scope.function_hdp_template_var("::hdp::journalnode_hosts").each do |hostname|-%>
 define service {
-        hostgroup_name          slaves
+        hostgroup_name          journalnodes
         use                     hadoop-service
-        service_description     JOURNALNODE::JournalNode process on <%=hostname%>
+        service_description     JOURNALNODE::JournalNode process
         servicegroups           HDFS
-        check_command           check_tcp_on_host!<%=hostname%>!<%=scope.function_hdp_template_var("::hdp::journalnode_port")%>!-w 1 -c 1
+        check_command           check_tcp!<%=scope.function_hdp_template_var("::hdp::journalnode_port")%>!-w 1 -c 1
         normal_check_interval   1
         retry_check_interval    0.5
         max_check_attempts      3
 }
 
-<% end %>
+<%end-%>
 
 <%if scope.function_hdp_nagios_members_exist('slaves')-%>
 # HDFS::DATANODE Checks

+ 1 - 1
ambari-agent/src/main/python/ambari_agent/AmbariConfig.py

@@ -33,7 +33,7 @@ secured_url_port=8441
 prefix=/tmp/ambari-agent
 data_cleanup_interval=86400
 data_cleanup_max_age=2592000
-ping_port=0
+ping_port=8670
 
 [services]