Browse Source

AMBARI-2928. Add a Nagios alert to check state of NN HA. (Dmitry Sen via odiachenko)

Oleksandr Diachenko 11 years ago
parent
commit
ddfbda054f

+ 82 - 0
ambari-agent/src/main/puppet/modules/hdp-nagios/files/check_namenodes_ha.sh

@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+
+IFS=',' read -a namenodes <<< "$1"
+port=$2
+totalNN=${#namenodes[@]}
+activeNN=()
+standbyNN=()
+unavailableNN=()
+
+for nn in "${namenodes[@]}"
+do
+  status=$(curl -m 5 -s http://$nn:$port/jmx?qry=Hadoop:service=NameNode,name=FSNamesystem | grep -i "tag.HAState" | grep -o -E "standby|active")
+  if [ "$status" == "active" ]; then
+    activeNN[${#activeNN[*]}]="$nn"
+  elif [ "$status" == "standby" ]; then
+    standbyNN[${#standbyNN[*]}]="$nn"
+  elif [ "$status" == "" ]; then
+    unavailableNN[${#unavailableNN[*]}]="$nn"
+  fi
+done
+
+message=""
+critical=false
+
+if [ ${#activeNN[@]} -gt 1 ]; then
+  critical=true
+  message=$message" Only one NN can have HAState=active;"
+elif [ ${#activeNN[@]} == 0 ]; then
+  critical=true
+  message=$message" No Active NN available;"
+elif [ ${#standbyNN[@]} == 0 ]; then
+  critical=true
+  message=$message" No failover NN available;"
+fi
+
+NNstats=" Active<"
+for nn in "${activeNN[@]}"
+do
+  NNstats="$NNstats$nn;"
+done
+NNstats=${NNstats%\;}
+NNstats=$NNstats">, Standby<"
+for nn in "${standbyNN[@]}"
+do
+  NNstats="$NNstats$nn;"
+done
+NNstats=${NNstats%\;}
+NNstats=$NNstats">, Unavailable<"
+for nn in "${unavailableNN[@]}"
+do
+  NNstats="$NNstats$nn;"
+done
+NNstats=${NNstats%\;}
+NNstats=$NNstats">"
+
+if [ $critical == false ]; then
+  echo "OK: NameNode HA healthy;"$NNstats
+  exit 0
+fi
+
+echo "CRITICAL:"$message$NNstats
+exit 2

+ 1 - 0
ambari-agent/src/main/puppet/modules/hdp-nagios/manifests/server/config.pp

@@ -52,6 +52,7 @@ class hdp-nagios::server::config()
   hdp-nagios::server::check { 'check_mapred_local_dir_used.sh': }
   hdp-nagios::server::check { 'check_nodemanager_health.sh': }
   hdp-nagios::server::check { 'check_resourcemanager_nodes_percentage.sh': }
+  hdp-nagios::server::check { 'check_namenodes_ha.sh': }
 
   anchor{'hdp-nagios::server::config::begin':} -> Hdp-nagios::Server::Configfile<||> -> anchor{'hdp-nagios::server::config::end':}
   Anchor['hdp-nagios::server::config::begin'] -> Hdp-nagios::Server::Check<||> -> Anchor['hdp-nagios::server::config::end']

+ 5 - 0
ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-commands.cfg.erb

@@ -97,6 +97,11 @@ define command{
        command_line    $USER1$/check_mapred_local_dir_used.sh $ARG1$ $ARG2$
        }
 
+define command{
+       command_name    check_namenodes_ha
+       command_line    $USER1$/check_namenodes_ha.sh $ARG1$ $ARG2$
+       }
+
 define command{
         command_name    check_nodemanager_health
         command_line    $USER1$/check_nodemanager_health.sh $HOSTADDRESS$ $ARG1$

+ 12 - 1
ambari-agent/src/main/puppet/modules/hdp-nagios/templates/hadoop-services.cfg.erb

@@ -63,7 +63,18 @@ define service {
         retry_check_interval    0.25
         max_check_attempts      3
 }
-
+<% if scope.function_hdp_nagios_members_exist('namenode') && (scope.function_hdp_get_major_stack_version([scope.function_hdp_template_var("stack_version")]) >= 2)%>
+define service {
+        hostgroup_name          nagios-server
+        use                     hadoop-service
+        service_description     HDFS::NameNode HA Healthy
+        servicegroups           HDFS
+        check_command           check_namenodes_ha!$HOSTGROUPMEMBERS:namenode$!<%=scope.function_hdp_template_var("::hdp::namenode_port")%>
+        normal_check_interval   0.5
+        retry_check_interval    0.25
+        max_check_attempts      5
+}
+<%end-%>
 
 # AMBARI AGENT Checks
 define service {