Przeglądaj źródła

AMBARI-10464 - Ambari Agent holding socket open on 50070 prevents NN from starting (jonathanhurley)

Jonathan Hurley 10 lat temu
rodzic
commit
d2bc7bd375

+ 3 - 1
ambari-agent/src/main/python/ambari_agent/alerts/metric_alert.py

@@ -31,6 +31,8 @@ from resource_management.libraries.functions.get_port_from_url import get_port_f
 
 logger = logging.getLogger()
 
+CONNECTION_TIMEOUT = 5.0
+
 class MetricAlert(BaseAlert):
   
   def __init__(self, alert_meta, alert_source_meta):
@@ -157,7 +159,7 @@ class MetricAlert(BaseAlert):
       response = None
       try:
         url_opener = urllib2.build_opener(RefreshHeaderProcessor())
-        response = url_opener.open(url)
+        response = url_opener.open(url, timeout=CONNECTION_TIMEOUT)
         content = response.read()
       finally:
         # explicitly close the connection as we've seen python hold onto these

+ 3 - 1
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py

@@ -36,6 +36,8 @@ PERCENT_CRITICAL = 200
 CHECKPOINT_TX_DEFAULT = 1000000
 CHECKPOINT_PERIOD_DEFAULT = 21600
 
+CONNECTION_TIMEOUT = 5.0
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
@@ -133,7 +135,7 @@ def get_value_from_jmx(query, jmx_property):
   response = None
   
   try:
-    response = urllib2.urlopen(query)
+    response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT)
     data = response.read()
 
     data_dict = json.loads(data)

+ 3 - 1
ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py

@@ -35,6 +35,8 @@ NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
 NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
 DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}'
 
+CONNECTION_TIMEOUT = 5.0
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
@@ -163,7 +165,7 @@ def get_value_from_jmx(query, jmx_property):
   response = None
   
   try:
-    response = urllib2.urlopen(query)
+    response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT)
     data = response.read()
 
     data_dict = json.loads(data)

+ 3 - 2
ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py

@@ -53,7 +53,8 @@ KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}}
 WEBHCAT_OK_RESPONSE = 'ok'
 WEBHCAT_PORT_DEFAULT = 50111
 
-CURL_CONNECTION_TIMEOUT = '10'
+CURL_CONNECTION_TIMEOUT = '5'
+CONNECTION_TIMEOUT = 5.0
 
 def get_tokens():
   """
@@ -177,7 +178,7 @@ def execute(parameters=None, host_name=None):
     try:
       # execute the query for the JSON that includes WebHCat status
       start_time = time.time()
-      url_response = urllib2.urlopen(query_url)
+      url_response = urllib2.urlopen(query_url, timeout=CONNECTION_TIMEOUT)
       total_time = time.time() - start_time
 
       json_response = json.loads(url_response.read())

+ 3 - 1
ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py

@@ -40,6 +40,8 @@ CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager hea
 
 NODEMANAGER_DEFAULT_PORT = 8042
 
+CONNECTION_TIMEOUT = 5.0
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
@@ -106,7 +108,7 @@ def execute(parameters=None, host_name=None):
 
   try:
     # execute the query for the JSON that includes NodeManager health status
-    url_response = urllib2.urlopen(query)
+    url_response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT)
   except urllib2.HTTPError, httpError:
     label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
       str(httpError))

+ 4 - 2
ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py

@@ -29,6 +29,8 @@ OK_LABEL = 'All NodeManagers are healthy'
 NODEMANAGER_HTTP_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.address}}'
 NODEMANAGER_HTTPS_ADDRESS_KEY = '{{yarn-site/yarn.resourcemanager.webapp.https.address}}'
 YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
+
+CONNECTION_TIMEOUT = 5.0
   
 def get_tokens():
   """
@@ -99,7 +101,7 @@ def execute(parameters=None, host_name=None):
     label = str(e)
     result_code = 'UNKNOWN'
 
-  return ((result_code, [label]))
+  return (result_code, [label])
 
 
 def get_value_from_jmx(query, jmx_property):
@@ -109,7 +111,7 @@ def get_value_from_jmx(query, jmx_property):
     # use a custom header processor that will look for the non-standard
     # "Refresh" header and attempt to follow the redirect
     url_opener = urllib2.build_opener(RefreshHeaderProcessor())
-    response = url_opener.open(query)
+    response = url_opener.open(query, timeout=CONNECTION_TIMEOUT)
 
     data = response.read()
     data_dict = json.loads(data)

+ 3 - 1
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_checkpoint_time.py

@@ -36,6 +36,8 @@ PERCENT_CRITICAL = 200
 CHECKPOINT_TX_DEFAULT = 1000000
 CHECKPOINT_PERIOD_DEFAULT = 21600
 
+CONNECTION_TIMEOUT = 5.0
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
@@ -133,7 +135,7 @@ def get_value_from_jmx(query, jmx_property):
   response = None
   
   try:
-    response = urllib2.urlopen(query)
+    response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT)
     data = response.read()
 
     data_dict = json.loads(data)

+ 3 - 1
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/HDFS/package/files/alert_ha_namenode_health.py

@@ -35,6 +35,8 @@ NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
 NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
 DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}'
 
+CONNECTION_TIMEOUT = 5.0
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
@@ -163,7 +165,7 @@ def get_value_from_jmx(query, jmx_property):
   response = None
   
   try:
-    response = urllib2.urlopen(query)
+    response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT)
     data = response.read()
 
     data_dict = json.loads(data)

+ 3 - 2
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/WEBHCAT/package/files/alert_webhcat_server.py

@@ -53,7 +53,8 @@ KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}}
 WEBHCAT_OK_RESPONSE = 'ok'
 WEBHCAT_PORT_DEFAULT = 50111
 
-CURL_CONNECTION_TIMEOUT = '10'
+CURL_CONNECTION_TIMEOUT = '5'
+CONNECTION_TIMEOUT = 5.0
 
 def get_tokens():
   """
@@ -177,7 +178,7 @@ def execute(parameters=None, host_name=None):
     try:
       # execute the query for the JSON that includes WebHCat status
       start_time = time.time()
-      url_response = urllib2.urlopen(query_url)
+      url_response = urllib2.urlopen(query_url, timeout=CONNECTION_TIMEOUT)
       total_time = time.time() - start_time
 
       json_response = json.loads(url_response.read())

+ 3 - 1
ambari-server/src/main/resources/stacks/BIGTOP/0.8/services/YARN/package/files/alert_nodemanager_health.py

@@ -40,6 +40,8 @@ CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager hea
 
 NODEMANAGER_DEFAULT_PORT = 8042
 
+CONNECTION_TIMEOUT = 5.0
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
@@ -106,7 +108,7 @@ def execute(parameters=None, host_name=None):
 
   try:
     # execute the query for the JSON that includes NodeManager health status
-    url_response = urllib2.urlopen(query)
+    url_response = urllib2.urlopen(query, timeout=CONNECTION_TIMEOUT)
   except urllib2.HTTPError, httpError:
     label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
       str(httpError))