Browse Source

AMBARI-20565. Ambari Agent Alert to detect when 'hdp-select versions' reports an error (alejandro)

Alejandro Fernandez 8 years ago
parent
commit
02d7aafe9e

+ 15 - 1
ambari-common/src/main/python/resource_management/libraries/functions/stack_select.py

@@ -286,11 +286,25 @@ def _get_upgrade_stack():
 
   return None
 
def unsafe_get_stack_versions():
  """
  Gets the list of stack versions installed on the host by invoking
  "<stack-selector-tool> versions" (e.g., "hdp-select versions").
  DO NOT use a fall-back since this function is called by alerts in order to find potential errors.
  :return: Returns a tuple of (exit code, output, list of installed stack versions).
  """
  stack_selector_path = stack_tools.get_stack_tool_path(stack_tools.STACK_SELECTOR_NAME)
  code, out = call((STACK_SELECT_PREFIX, stack_selector_path, 'versions'))
  # splitlines() already strips the line terminators, so no per-line rstrip is
  # needed. Only trust the output when the tool exited cleanly.
  versions = out.splitlines() if code == 0 and out is not None else []
  return (code, out, versions)
 
 def get_stack_versions(stack_root):
   """
   Gets list of stack versions installed on the host.
-  Be default a call to <stack-selector-tool> versions is made to get the list of installed stack versions.
+  By default a call to <stack-selector-tool> versions is made to get the list of installed stack versions.
   As a fallback list of installed versions is collected from stack version directories in stack install root.
   :param stack_root: Stack install root
   :return: Returns list of installed stack versions.

+ 2 - 2
ambari-server/src/main/java/org/apache/ambari/server/checks/AtlasPresenceCheck.java

@@ -29,8 +29,8 @@ import org.slf4j.LoggerFactory;
 import com.google.inject.Singleton;
 
 /**
- * Checks if Atlas service is present. Upgrade to stack HDP 2.5 can't pursuit
- * with existed on the cluster Atlas service.
+ * Checks if Atlas service is present. Upgrade to stack HDP 2.5 from previous stack
+ * must first delete Atlas from the cluster.
  */
 @Singleton
 @UpgradeCheck(group = UpgradeCheckGroup.DEFAULT)

+ 12 - 0
ambari-server/src/main/resources/alerts.json

@@ -149,6 +149,18 @@
             }
           ]
         }
+      },
+      {
+        "name": "ambari_agent_version_select",
+        "label": "Ambari Agent Distro/Conf Select Versions",
+        "description": "This host-level alert is triggered if the distro selector such as hdp-select cannot calculate versions available on this host. This may indicate that /usr/$stack/ directory has links/dirs that do not belong inside of it.",
+        "interval": 5,
+        "scope": "HOST",
+        "enabled": true,
+        "source": {
+          "type": "SCRIPT",
+          "path": "alert_version_select.py"
+        }
       }
     ]
   }

+ 105 - 0
ambari-server/src/main/resources/host_scripts/alert_version_select.py

@@ -0,0 +1,105 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+import logging
+import socket
+import json
+
+from resource_management.libraries.script.script import Script
+from resource_management.libraries.functions.stack_select import unsafe_get_stack_versions
+
# Alert result states understood by the Ambari alert framework; execute()
# returns one of these as the first element of its result tuple.
RESULT_STATE_OK = 'OK'
RESULT_STATE_WARNING = 'WARNING'
RESULT_STATE_CRITICAL = 'CRITICAL'
RESULT_STATE_UNKNOWN = 'UNKNOWN'

# Configuration token resolved by the agent before execute() is invoked.
# Its resolved value is a JSON string describing the stack/conf selector tools.
STACK_TOOLS = '{{cluster-env/stack_tools}}'


logger = logging.getLogger()
+
+
def get_tokens():
  """
  Return the {{site/property}} tokens whose resolved values the agent must
  place in the configurations dictionary passed to execute().
  """
  return (STACK_TOOLS,)
+
+
def execute(configurations={}, parameters={}, host_name=None):
  """
  Checks if the stack selector such as hdp-select can enumerate the versions
  installed on this host, e.g., "hdp-select versions".
  Returns a tuple containing the result code and a pre-formatted result label.

  Keyword arguments:
  configurations (dictionary): a mapping of configuration key to value
  parameters (dictionary): a mapping of script parameter key to value
  host_name (string): the name of this host where the alert is running
  """
  msg = []
  try:
    if configurations is None:
      return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])

    # Check required properties
    if STACK_TOOLS not in configurations:
      return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(STACK_TOOLS)])

    # Of the form,
    # { "stack_selector": ["hdp-select", "/usr/bin/hdp-select", "hdp-select"], "conf_selector": ["conf-select", "/usr/bin/conf-select", "conf-select"] }
    stack_tools_str = configurations[STACK_TOOLS]

    if stack_tools_str is None:
      # NOTE: use explicit {0} field names throughout; auto-numbered '{}' fields
      # are a ValueError on the Python 2.6 interpreters older agents run.
      return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script and the value is null'.format(STACK_TOOLS)])

    # Best-effort: the selector name is only used to make the alert label
    # friendlier, so tolerate malformed/unexpected JSON and keep a placeholder.
    distro_select = "unknown-distro-select"
    try:
      stack_tools = json.loads(stack_tools_str)
      distro_select = stack_tools["stack_selector"][0]
    except (ValueError, KeyError, IndexError, TypeError):
      pass

    # This may not exist if the host does not contain any stack components,
    # or only contains components like Ambari Metrics and SmartSense
    stack_root_dir = Script.get_stack_root()

    if os.path.isdir(stack_root_dir):
      (code, out, versions) = unsafe_get_stack_versions()

      if code == 0:
        msg.append("Ok. {0}".format(distro_select))
        if versions is not None and isinstance(versions, list) and len(versions) > 0:
          msg.append("Versions: {0}".format(", ".join(versions)))
        return (RESULT_STATE_OK, ["\n".join(msg)])
      else:
        msg.append("Failed, check dir {0} for unexpected contents.".format(stack_root_dir))
        if out is not None:
          msg.append(out)

        return (RESULT_STATE_CRITICAL, ["\n".join(msg)])
    else:
      msg.append("Ok. No stack root {0} to check.".format(stack_root_dir))
      return (RESULT_STATE_OK, ["\n".join(msg)])
  except Exception as e:
    # e.message is deprecated/absent on many exception types; str(e) always
    # yields a printable description. 'as' syntax is valid on Python 2.6+ and 3.x.
    return (RESULT_STATE_CRITICAL, [str(e)])
+

+ 7 - 7
ambari-server/src/test/java/org/apache/ambari/server/api/services/AmbariMetaInfoTest.java

@@ -1875,7 +1875,7 @@ public class AmbariMetaInfoTest {
 
     AlertDefinitionDAO dao = injector.getInstance(AlertDefinitionDAO.class);
     List<AlertDefinitionEntity> definitions = dao.findAll(clusterId);
-    assertEquals(10, definitions.size());
+    assertEquals(11, definitions.size());
 
     // figure out how many of these alerts were merged into from the
     // non-stack alerts.json
@@ -1887,7 +1887,7 @@ public class AmbariMetaInfoTest {
       }
     }
 
-    assertEquals(1, hostAlertCount);
+    assertEquals(2, hostAlertCount);
     assertEquals(9, definitions.size() - hostAlertCount);
 
     for (AlertDefinitionEntity definition : definitions) {
@@ -1898,7 +1898,7 @@ public class AmbariMetaInfoTest {
     metaInfo.reconcileAlertDefinitions(clusters);
 
     definitions = dao.findAll();
-    assertEquals(10, definitions.size());
+    assertEquals(11, definitions.size());
 
     for (AlertDefinitionEntity definition : definitions) {
       assertEquals(28, definition.getScheduleInterval().intValue());
@@ -1907,7 +1907,7 @@ public class AmbariMetaInfoTest {
     // find all enabled for the cluster should find 6 (the ones from HDFS;
     // it will not find the agent alert since it's not bound to the cluster)
     definitions = dao.findAllEnabled(cluster.getClusterId());
-    assertEquals(9, definitions.size());
+    assertEquals(10, definitions.size());
 
     // create new definition
     AlertDefinitionEntity entity = new AlertDefinitionEntity();
@@ -1926,19 +1926,19 @@ public class AmbariMetaInfoTest {
 
     // verify the new definition is found (6 HDFS + 1 new one)
     definitions = dao.findAllEnabled(cluster.getClusterId());
-    assertEquals(10, definitions.size());
+    assertEquals(11, definitions.size());
 
     // reconcile, which should disable our bad definition
     metaInfo.reconcileAlertDefinitions(clusters);
 
     // find all enabled for the cluster should find 6
     definitions = dao.findAllEnabled(cluster.getClusterId());
-    assertEquals(9, definitions.size());
+    assertEquals(10, definitions.size());
 
     // find all should find 6 HDFS + 1 disabled + 1 agent alert + 2 server
     // alerts
     definitions = dao.findAll();
-    assertEquals(11, definitions.size());
+    assertEquals(12, definitions.size());
 
     entity = dao.findById(entity.getDefinitionId());
     assertFalse(entity.getEnabled());

+ 1 - 1
ambari-server/src/test/java/org/apache/ambari/server/metadata/AgentAlertDefinitionsTest.java

@@ -60,7 +60,7 @@ public class AgentAlertDefinitionsTest {
   public void testLoadingAgentHostAlerts() {
     AmbariServiceAlertDefinitions ambariServiceAlertDefinitions = m_injector.getInstance(AmbariServiceAlertDefinitions.class);
     List<AlertDefinition> definitions = ambariServiceAlertDefinitions.getAgentDefinitions();
-    Assert.assertEquals(1, definitions.size());
+    Assert.assertEquals(2, definitions.size());
 
     for( AlertDefinition definition : definitions){
       Assert.assertEquals(Components.AMBARI_AGENT.name(),