Przeglądaj źródła

AMBARI-3256. 'Percent NodeManager Live' alert and 'Percent NodeManager healthy' alert for YARN service work with a considerable delay (Andrew Onischuk via dlysnichenko)

Lisnichenko Dmitro 11 lat temu
rodzic
commit
82d4a5438a

+ 0 - 59
ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_resourcemanager_nodes_percentage.sh

@@ -1,59 +0,0 @@
-#!/usr/bin/env bash
-#
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#
-HOST=$1
-PORT=$2
-#Resource manager nodes, with selected status, which number we want to know
-NODE_STATUS=$3
-WARN_PERCENT=$4
-CRIT_PERCENT=$5
-NODES="Nodes"
-
-RESOURCEMANAGER_URL="http://$HOST:$PORT/ws/v1/cluster/metrics"
-export PATH="/usr/bin:$PATH"
-RESPONSE=`curl -s $RESOURCEMANAGER_URL`
-
-if [ -z "$RESPONSE" ]; then 
-  echo "CRITICAL: Can't get data from http://$HOST:$PORT/ws/v1/cluster/metrics" 
-  exit 2;
-fi 
-
-#code below is parsing RESPONSE that we get from resourcemanager api, for number between "activeNodes": and ','
-ACTIVE_NODES=`echo "$RESPONSE" | sed -nre 's/^.*"activeNodes":([[:digit:]]+).*$/\1/gp'`
-LOST_NODES=`echo "$RESPONSE" | sed -nre 's/^.*"lostNodes":([[:digit:]]+).*$/\1/gp'`
-UNHEALTHY_NODES=`echo "$RESPONSE" | sed -nre 's/^.*"unhealthyNodes":([[:digit:]]+).*$/\1/gp'`
-DECOMMISSIONED_NODES=`echo "$RESPONSE" | sed -nre 's/^.*"decommissionedNodes":([[:digit:]]+).*$/\1/gp'`
-REBOOTED_NODES=`echo "$RESPONSE" | sed -nre 's/^.*"rebootedNodes":([[:digit:]]+).*$/\1/gp'`
-
-TOTAL_NODES_NUM=$(($ACTIVE_NODES+$LOST_NODES+$UNHEALTHY_NODES+$DECOMMISSIONED_NODES+$REBOOTED_NODES))
-NODES_NUM=`echo "$RESPONSE" | sed -nre "s/^.*\"$NODE_STATUS$NODES\":([[:digit:]]+).*$/\1/gp"`
-PERCENT=$(($NODES_NUM*100/$TOTAL_NODES_NUM))
-
-if [[ "$PERCENT" -lt "$WARN_PERCENT" ]]; then
-  echo "OK: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>"
-  exit 0;
-elif [[ "$PERCENT" -lt "$CRIT_PERCENT" ]]; then
-  echo "WARNING: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>"
-  exit 1;
-else 
-  echo "CRITICAL: total:<$TOTAL_NODES_NUM>, affected:<$NODES_NUM>"
-  exit 2;
-fi

+ 0 - 1
ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp

@@ -50,7 +50,6 @@ class hdp-nagios::server::config()
   hdp-nagios::server::check { 'check_hue_status.sh': }
   hdp-nagios::server::check { 'check_mapred_local_dir_used.sh': }
   hdp-nagios::server::check { 'check_nodemanager_health.sh': }
-  hdp-nagios::server::check { 'check_resourcemanager_nodes_percentage.sh': }
   hdp-nagios::server::check { 'check_namenodes_ha.sh': }
   hdp-nagios::server::check { 'hdp_nagios_init.php': }
 

+ 0 - 5
ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb

@@ -102,8 +102,3 @@ define command{
         command_name    check_nodemanager_health
         command_line    $USER1$/check_nodemanager_health.sh $HOSTADDRESS$ $ARG1$
        }
-
-define command{
-        command_name    check_resourcemanager_nodes_percentage
-        command_line    $USER1$/check_resourcemanager_nodes_percentage.sh $HOSTADDRESS$ $ARG1$ $ARG2$ $ARG3$ $ARG4$
-       }

+ 10 - 22
ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb

@@ -455,28 +455,6 @@ define service {
         max_check_attempts      5
 }
 
-define service {
-        hostgroup_name          resourcemanager
-        use                     hadoop-service
-        service_description     RESOURCEMANAGER::Percent NodeManager live
-        servicegroups           YARN
-        check_command           check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!lost!10!30
-        normal_check_interval   1
-        retry_check_interval    1
-        max_check_attempts      3
-}
-
-define service {
-        hostgroup_name          resourcemanager
-        use                     hadoop-service
-        service_description     RESOURCEMANAGER::Percent NodeManager healthy
-        servicegroups           YARN
-        check_command           check_resourcemanager_nodes_percentage!<%=scope.function_hdp_template_var("::hdp::rm_port")%>!unhealthy!10!30
-        normal_check_interval   1
-        retry_check_interval    1
-        max_check_attempts      3
-}
-
 define service {
         hostgroup_name          resourcemanager
         use                     hadoop-service
@@ -512,6 +490,16 @@ define service {
         retry_check_interval    1
         max_check_attempts      3
 }
+define service {
+        hostgroup_name          nagios-server
+        use                     hadoop-service
+        service_description     NODEMANAGER::Percent NodeManager process
+        servicegroups           YARN
+        check_command           check_aggregate!"NODEMANAGER::NodeManager process"!10%!30%
+        normal_check_interval   0.5
+        retry_check_interval    0.25
+        max_check_attempts      3
+}
 <% end %>
 
 <%if scope.function_hdp_nagios_members_exist('historyserver2')-%>