Browse Source

AMBARI-5784. Nagios read timeout on 1000 node cluster (ncole)

Nate Cole 11 years ago
parent
commit
ec74a4b14b

+ 9 - 3
ambari-server/src/main/java/org/apache/ambari/server/controller/internal/AbstractProviderModule.java

@@ -60,6 +60,8 @@ public abstract class AbstractProviderModule implements ProviderModule, Resource
 
 
   private static final int PROPERTY_REQUEST_CONNECT_TIMEOUT = 5000;
   private static final int PROPERTY_REQUEST_CONNECT_TIMEOUT = 5000;
   private static final int PROPERTY_REQUEST_READ_TIMEOUT    = 10000;
   private static final int PROPERTY_REQUEST_READ_TIMEOUT    = 10000;
+  // Nagios can take longer to respond on big clusters, so it gets a longer read timeout (ms)
+  private static final int NAGIOS_READ_TIMEOUT              = 30000;
 
 
   private static final String CLUSTER_NAME_PROPERTY_ID                  = PropertyHelper.getPropertyId("Clusters", "cluster_name");
   private static final String CLUSTER_NAME_PROPERTY_ID                  = PropertyHelper.getPropertyId("Clusters", "cluster_name");
   private static final String HOST_COMPONENT_CLUSTER_NAME_PROPERTY_ID   = PropertyHelper.getPropertyId("HostRoles", "cluster_name");
   private static final String HOST_COMPONENT_CLUSTER_NAME_PROPERTY_ID   = PropertyHelper.getPropertyId("HostRoles", "cluster_name");
@@ -389,7 +391,7 @@ public abstract class AbstractProviderModule implements ProviderModule, Resource
     ComponentSSLConfiguration configuration = ComponentSSLConfiguration.instance();
     ComponentSSLConfiguration configuration = ComponentSSLConfiguration.instance();
     URLStreamProvider streamProvider = new URLStreamProvider(
     URLStreamProvider streamProvider = new URLStreamProvider(
         PROPERTY_REQUEST_CONNECT_TIMEOUT, PROPERTY_REQUEST_READ_TIMEOUT,
         PROPERTY_REQUEST_CONNECT_TIMEOUT, PROPERTY_REQUEST_READ_TIMEOUT,
-        configuration.getTruststorePath(), configuration.getTruststorePassword(), configuration.getTruststoreType());
+        configuration);
 
 
     if (type.isInternalType()) {
     if (type.isInternalType()) {
       switch (type.getInternalType()) {
       switch (type.getInternalType()) {
@@ -403,7 +405,9 @@ public abstract class AbstractProviderModule implements ProviderModule, Resource
           break;
           break;
         case Service:
         case Service:
           providers.add(new NagiosPropertyProvider(type,
           providers.add(new NagiosPropertyProvider(type,
-              streamProvider,
+              new URLStreamProvider(
+                PROPERTY_REQUEST_CONNECT_TIMEOUT, NAGIOS_READ_TIMEOUT,
+                configuration),
               "ServiceInfo/cluster_name",
               "ServiceInfo/cluster_name",
               "ServiceInfo/service_name"));
               "ServiceInfo/service_name"));
           break;
           break;
@@ -417,7 +421,9 @@ public abstract class AbstractProviderModule implements ProviderModule, Resource
               PropertyHelper.getPropertyId("Hosts", "host_name")
               PropertyHelper.getPropertyId("Hosts", "host_name")
           ));
           ));
           providers.add(new NagiosPropertyProvider(type,
           providers.add(new NagiosPropertyProvider(type,
-              streamProvider,
+              new URLStreamProvider(
+                PROPERTY_REQUEST_CONNECT_TIMEOUT, NAGIOS_READ_TIMEOUT,
+                configuration),
               "Hosts/cluster_name",
               "Hosts/cluster_name",
               "Hosts/host_name"));
               "Hosts/host_name"));
           break;
           break;

+ 17 - 5
ambari-server/src/main/java/org/apache/ambari/server/controller/internal/URLStreamProvider.java

@@ -18,8 +18,6 @@
 
 
 package org.apache.ambari.server.controller.internal;
 package org.apache.ambari.server.controller.internal;
 
 
-import com.google.gson.Gson;
-
 import java.io.File;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.IOException;
@@ -30,14 +28,13 @@ import java.net.URLConnection;
 import java.security.KeyStore;
 import java.security.KeyStore;
 import java.util.List;
 import java.util.List;
 import java.util.Map;
 import java.util.Map;
-import static javax.ws.rs.core.MediaType.APPLICATION_JSON;
-import static javax.ws.rs.core.MediaType.APPLICATION_FORM_URLENCODED;
 
 
 import javax.net.ssl.HttpsURLConnection;
 import javax.net.ssl.HttpsURLConnection;
 import javax.net.ssl.SSLContext;
 import javax.net.ssl.SSLContext;
 import javax.net.ssl.SSLSocketFactory;
 import javax.net.ssl.SSLSocketFactory;
 import javax.net.ssl.TrustManagerFactory;
 import javax.net.ssl.TrustManagerFactory;
 
 
+import org.apache.ambari.server.configuration.ComponentSSLConfiguration;
 import org.apache.ambari.server.controller.utilities.StreamProvider;
 import org.apache.ambari.server.controller.utilities.StreamProvider;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.Log;
@@ -52,7 +49,6 @@ public class URLStreamProvider implements StreamProvider {
   private static final String COOKIE = "Cookie";
   private static final String COOKIE = "Cookie";
   private static final String WWW_AUTHENTICATE = "WWW-Authenticate";
   private static final String WWW_AUTHENTICATE = "WWW-Authenticate";
   private static final String NEGOTIATE = "Negotiate";
   private static final String NEGOTIATE = "Negotiate";
-  private static final String CONTENT_TYPE_HEADER_PARAMETER = "Content-Type";
   private static Log LOG = LogFactory.getLog(URLStreamProvider.class);
   private static Log LOG = LogFactory.getLog(URLStreamProvider.class);
 
 
   private final int connTimeout;
   private final int connTimeout;
@@ -63,6 +59,22 @@ public class URLStreamProvider implements StreamProvider {
   private volatile SSLSocketFactory sslSocketFactory = null;
   private volatile SSLSocketFactory sslSocketFactory = null;
   private AppCookieManager appCookieManager;
   private AppCookieManager appCookieManager;
 
 
+  /**
+   * Creates a stream provider with the given connection and read timeouts,
+   * taking the trust store settings from an SSL configuration object.
+   * <p>
+   * Convenience overload: it delegates to the constructor that accepts the
+   * trust store path, password and type individually, reading those three
+   * values from {@code configuration}.
+   *
+   * @param connectionTimeout
+   *          time, in milliseconds, to attempt a connection
+   * @param readTimeout
+   *          the read timeout in milliseconds
+   * @param configuration
+   *          configuration holding the trust store path, password and type;
+   *          it is dereferenced directly, so it must not be {@code null}
+   */
+  public URLStreamProvider(int connectionTimeout, int readTimeout, 
+      ComponentSSLConfiguration configuration) {  
+    this(connectionTimeout, readTimeout,
+        configuration.getTruststorePath(),
+        configuration.getTruststorePassword(),
+        configuration.getTruststoreType());
+  }
   /**
   /**
    * Provide the connection timeout for the underlying connection.
    * Provide the connection timeout for the underlying connection.
    * 
    * 

+ 14 - 6
ambari-server/src/main/java/org/apache/ambari/server/controller/nagios/NagiosPropertyProvider.java

@@ -109,6 +109,7 @@ public class NagiosPropertyProvider extends BaseProvider implements PropertyProv
   private String clusterNameProperty;
   private String clusterNameProperty;
   private String resourceTypeProperty;
   private String resourceTypeProperty;
   private StreamProvider urlStreamProvider;
   private StreamProvider urlStreamProvider;
+  private boolean waitOnFirstCall = false;
   
   
   @Inject
   @Inject
   public static void init(Injector injector) {
   public static void init(Injector injector) {
@@ -154,7 +155,7 @@ public class NagiosPropertyProvider extends BaseProvider implements PropertyProv
           }
           }
         }
         }
       }
       }
-    }, 0L, 20L, TimeUnit.SECONDS);    
+    }, 0L, 30L, TimeUnit.SECONDS);    
   }
   }
   
   
   /**
   /**
@@ -163,6 +164,7 @@ public class NagiosPropertyProvider extends BaseProvider implements PropertyProv
   public void forceReset() {
   public void forceReset() {
     CLUSTER_NAMES.clear();
     CLUSTER_NAMES.clear();
     CLUSTER_ALERTS.clear();
     CLUSTER_ALERTS.clear();
+    waitOnFirstCall = true; // after a reset, the lookup for an uncached cluster blocks on the submitted fetch (the f.get() guarded by this flag)
   }
   }
   
   
   @Override
   @Override
@@ -186,6 +188,9 @@ public class NagiosPropertyProvider extends BaseProvider implements PropertyProv
         continue;
         continue;
       
       
       if (!CLUSTER_ALERTS.containsKey(clusterName)) {
       if (!CLUSTER_ALERTS.containsKey(clusterName)) {
+        // prevent endless looping for the first-time collection
+        CLUSTER_ALERTS.put(clusterName, Collections.<NagiosAlert>emptyList());
+        
         Future<List<NagiosAlert>> f = scheduler.submit(new Callable<List<NagiosAlert>>() {
         Future<List<NagiosAlert>> f = scheduler.submit(new Callable<List<NagiosAlert>>() {
           @Override
           @Override
           public List<NagiosAlert> call() throws Exception {
           public List<NagiosAlert> call() throws Exception {
@@ -193,11 +198,13 @@ public class NagiosPropertyProvider extends BaseProvider implements PropertyProv
           }
           }
         });
         });
         
         
-        try {
-          CLUSTER_ALERTS.put(clusterName, f.get());
-        } catch (Exception e) {
-          LOG.error("Could not load metrics - Executor exception" +
-            " (" + e.getMessage() + ")");
+        if (waitOnFirstCall) {
+          try {
+            CLUSTER_ALERTS.put(clusterName, f.get());
+          } catch (Exception e) {
+            LOG.error("Could not load metrics - Executor exception" +
+             " (" + e.getMessage() + ")");
+          }
         }
         }
       }
       }
       
       
@@ -354,6 +361,7 @@ public class NagiosPropertyProvider extends BaseProvider implements PropertyProv
       String url = String.format(template, nagiosHost);  
       String url = String.format(template, nagiosHost);  
 
 
       InputStream in = null;
       InputStream in = null;
+
       try {
       try {
         in = urlStreamProvider.readFrom(url);
         in = urlStreamProvider.readFrom(url);
         
         

+ 48 - 15
contrib/addons/src/addOns/nagios/scripts/nagios_alerts.php

@@ -161,6 +161,7 @@ function hdp_mon_generate_response( $response_data )
     $services_object = array ();
     $services_object = array ();
     $services_object["PUPPET"] = 0;
     $services_object["PUPPET"] = 0;
     foreach ($matches[0] as $object) {
     foreach ($matches[0] as $object) {
+
       if (getParameter($object, "service_description") == HDFS_SERVICE_CHECK) {
       if (getParameter($object, "service_description") == HDFS_SERVICE_CHECK) {
         $services_object["HDFS"] = getParameter($object, "last_hard_state");
         $services_object["HDFS"] = getParameter($object, "last_hard_state");
         if ($services_object["HDFS"] >= 1) {
         if ($services_object["HDFS"] >= 1) {
@@ -234,6 +235,7 @@ function hdp_mon_generate_response( $response_data )
     $hostcounts_object = array ();
     $hostcounts_object = array ();
     $up_hosts = 0;
     $up_hosts = 0;
     $down_hosts = 0;
     $down_hosts = 0;
+
     foreach ($matches[0] as $object) {
     foreach ($matches[0] as $object) {
       if (getParameter($object, "last_hard_state") != ok) {
       if (getParameter($object, "last_hard_state") != ok) {
         $down_hosts++;
         $down_hosts++;
@@ -294,13 +296,14 @@ function hdp_mon_generate_response( $response_data )
     #echo $matches[1][0] . ", " . $matches[1][1] . "\n";
     #echo $matches[1][0] . ", " . $matches[1][1] . "\n";
     $services_objects = array ();
     $services_objects = array ();
     $i = 0;
     $i = 0;
-    foreach ($matches[0] as $object) {
+    foreach ($matches[1] as $object) {
+      $map = getParameterMap($object);
       $servicestatus = array ();
       $servicestatus = array ();
       switch ($alert_type) {
       switch ($alert_type) {
       case "all":
       case "all":
-        if (empty($host) || getParameter($object, "host_name") == $host) {
+        if (empty($host) || getParameterMapValue($map, "host_name") == $host) {
           foreach ($servicestatus_attributes as $attrib) {
           foreach ($servicestatus_attributes as $attrib) {
-            $servicestatus[$attrib] = htmlentities(getParameter($object, $attrib), ENT_COMPAT);
+            $servicestatus[$attrib] = htmlentities(getParameterMapValue($map, $attrib), ENT_COMPAT);
           }
           }
           $servicestatus['service_type'] = get_service_type($servicestatus['service_description']);
           $servicestatus['service_type'] = get_service_type($servicestatus['service_description']);
           $srv_desc = explode ("::",$servicestatus['service_description'],2);
           $srv_desc = explode ("::",$servicestatus['service_description'],2);
@@ -309,10 +312,10 @@ function hdp_mon_generate_response( $response_data )
         }
         }
         break;
         break;
       case "nok":
       case "nok":
-        if (getParameter($object, "last_hard_state") != ok &&
-           (empty($host) || getParameter($object, "host_name") == $host)) {
+        if (getParameterMapValue($map, "last_hard_state") != ok &&
+           (empty($host) || getParameterMapValue($map, "host_name") == $host)) {
           foreach ($servicestatus_attributes as $attrib) {
           foreach ($servicestatus_attributes as $attrib) {
-            $servicestatus[$attrib] = htmlentities(getParameter($object, $attrib), ENT_COMPAT);
+            $servicestatus[$attrib] = htmlentities(getParameterMapValue($map, $attrib), ENT_COMPAT);
           }
           }
           $servicestatus['service_type'] = get_service_type($servicestatus['service_description']);
           $servicestatus['service_type'] = get_service_type($servicestatus['service_description']);
           $srv_desc = explode ("::",$servicestatus['service_description'],2);
           $srv_desc = explode ("::",$servicestatus['service_description'],2);
@@ -320,10 +323,10 @@ function hdp_mon_generate_response( $response_data )
         }
         }
         break;
         break;
       case "ok":
       case "ok":
-        if (getParameter($object, "last_hard_state") == ok &&
-           (empty($host) || getParameter($object, "host_name") == $host)) {
+        if (getParameterMapValue($map, "last_hard_state") == ok &&
+           (empty($host) || getParameterMapValue($map, "host_name") == $host)) {
           foreach ($servicestatus_attributes as $attrib) {
           foreach ($servicestatus_attributes as $attrib) {
-            $servicestatus[$attrib] = htmlentities(getParameter($object, $attrib), ENT_COMPAT);
+            $servicestatus[$attrib] = htmlentities(getParameterMapValue($map, $attrib), ENT_COMPAT);
           }
           }
           $servicestatus['service_type'] = get_service_type($servicestatus['service_description']);
           $servicestatus['service_type'] = get_service_type($servicestatus['service_description']);
           $srv_desc = explode ("::",$servicestatus['service_description'],2);
           $srv_desc = explode ("::",$servicestatus['service_description'],2);
@@ -331,10 +334,10 @@ function hdp_mon_generate_response( $response_data )
         }
         }
         break;
         break;
       case "warn":
       case "warn":
-        if (getParameter($object, "last_hard_state") == warn &&
-           (empty($host) || getParameter($object, "host_name") == $host)) {
+        if (getParameterMapValue($map, "last_hard_state") == warn &&
+           (empty($host) || getParameterMapValue($map, "host_name") == $host)) {
           foreach ($servicestatus_attributes as $attrib) {
           foreach ($servicestatus_attributes as $attrib) {
-            $servicestatus[$attrib] = htmlentities(getParameter($object, $attrib), ENT_COMPAT);
+            $servicestatus[$attrib] = htmlentities(getParameterMapValue($map, $attrib), ENT_COMPAT);
           }
           }
           $servicestatus['service_type'] = get_service_type($servicestatus['service_description']);
           $servicestatus['service_type'] = get_service_type($servicestatus['service_description']);
           $srv_desc = explode ("::",$servicestatus['service_description'],2);
           $srv_desc = explode ("::",$servicestatus['service_description'],2);
@@ -342,10 +345,10 @@ function hdp_mon_generate_response( $response_data )
         }
         }
         break;
         break;
       case "critical":
       case "critical":
-        if (getParameter($object, "last_hard_state") == critical &&
-           (empty($host) || getParameter($object, "host_name") == $host)) {
+        if (getParameterMapValue($map, "last_hard_state") == critical &&
+           (empty($host) || getParameterMapValue($map, "host_name") == $host)) {
           foreach ($servicestatus_attributes as $attrib) {
           foreach ($servicestatus_attributes as $attrib) {
-            $servicestatus[$attrib] = htmlentities(getParameter($object, $attrib), ENT_COMPAT);
+            $servicestatus[$attrib] = htmlentities(getParameterMapValue($map, $attrib), ENT_COMPAT);
           }
           }
           $servicestatus['service_type'] = get_service_type($servicestatus['service_description']);
           $servicestatus['service_type'] = get_service_type($servicestatus['service_description']);
           $srv_desc = explode ("::",$servicestatus['service_description'],2);
           $srv_desc = explode ("::",$servicestatus['service_description'],2);
@@ -436,6 +439,36 @@ function hdp_mon_generate_response( $response_data )
     return $value;
     return $value;
   }
   }
 
 
+  function getParameterMapValue($map, $key) { // returns the value stored under $key in $map as a string, or "" when that value is null
+    $value = $map[$key]; // NOTE(review): a missing $key raises PHP's undefined-index notice here, before the null check; isset($map[$key]) would avoid it -- confirm intent
+
+    if (!is_null($value))
+      return "" . $value; // concatenating with "" forces a string result
+
+    return "";
+  }
+
+  function getParameterMap($object) { // parses a newline-separated block of "key=value" lines into an associative array
+    $map = array ();
+
+    $long_key = "long_plugin_output";
+    $found_long = false; // true when the previous line carried the long_plugin_output key
+    foreach (preg_split("/\n/", trim($object)) as $line) {
+      $arr = explode("=", $line, 2); // split on the first "=" only, so a value may itself contain "="
+      $key = trim($arr[0]);
+      if ($found_long) {
+        $map[$long_key] = trim($line); // the whole line following long_plugin_output replaces that key's value; it is not stored as its own key=value entry
+        $found_long = false;
+      } else {
+        $map[$key] = $arr[1]; // value is stored untrimmed; NOTE(review): a line without "=" leaves $arr[1] undefined -- confirm the input always contains "="
+        if ($key == $long_key)
+          $found_long = true;
+      }
+    }
+
+    return $map;
+  }
+
 function indent($json) {
 function indent($json) {
 
 
     $result      = '';
     $result      = '';