Selaa lähdekoodia

AMBARI-7498. HiveServer2 still throwing Transport Errors (aonishuk)

Andrew Onishuk 10 vuotta sitten
vanhempi
commit
0b7fcf6f39
18 muutettua tiedostoa jossa 259 lisäystä ja 59 poistoa
  1. 1 0
      ambari-common/src/main/python/resource_management/libraries/functions/__init__.py
  2. 83 0
      ambari-common/src/main/python/resource_management/libraries/functions/hive_check.py
  3. 9 18
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/scripts/hive_service.py
  4. 3 8
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/scripts/service_check.py
  5. 66 0
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/files/check_hive_thrift_port.py
  6. 1 0
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/scripts/nagios_server_config.py
  7. 1 1
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/templates/hadoop-commands.cfg.j2
  8. 1 1
      ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/templates/hadoop-services.cfg.j2
  9. 8 17
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HIVE/package/scripts/hive_service.py
  10. 3 8
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HIVE/package/scripts/service_check.py
  11. 66 0
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/check_hive_thrift_port.py
  12. 1 0
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py
  13. 1 1
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2
  14. 1 1
      ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2
  15. 2 2
      ambari-server/src/test/python/stacks/1.3.2/HIVE/test_hive_server.py
  16. 5 0
      ambari-server/src/test/python/stacks/1.3.2/NAGIOS/test_nagios_server.py
  17. 2 2
      ambari-server/src/test/python/stacks/2.0.6/HIVE/test_hive_server.py
  18. 5 0
      ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py

+ 1 - 0
ambari-common/src/main/python/resource_management/libraries/functions/__init__.py

@@ -28,3 +28,4 @@ from resource_management.libraries.functions.check_process_status import *
 from resource_management.libraries.functions.is_empty import *
 from resource_management.libraries.functions.substitute_vars import *
 from resource_management.libraries.functions.get_port_from_url import *
+from resource_management.libraries.functions.hive_check import *

+ 83 - 0
ambari-common/src/main/python/resource_management/libraries/functions/hive_check.py

@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+
+'''
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+import socket
+from resource_management.core.exceptions import Fail
+
+def check_thrift_port_sasl(address, port, timeout = 5):
+  """
+  Hive thrift SASL port check.
+
+  Connects to address:port and writes a SASL PLAIN handshake for the
+  ANONYMOUS user (START, OK and COMPLETE frames) in a single send.
+  The server's reply is never read, so the check only proves that the
+  connection could be established and the handshake bytes written.
+
+  address -- host name or IP address of the Hive thrift server
+  port    -- TCP port of the Hive thrift server
+  timeout -- socket timeout in seconds, applied to connect and send
+
+  Returns True when connect and send both succeed, False when either
+  raises socket.error. Any other exception propagates to the caller.
+  """
+
+  #Authentication mechanism
+  mechanism = "PLAIN"
+  #Anonymous username
+  usr = "ANONYMOUS"
+  start_byte = 0x01 #START communication
+  ok_byte = 0x02 #OK
+  bad_byte = 0x03 #BAD
+  error_byte = 0x04 #ERROR
+  complete_byte = 0x05 #COMPLETE communication
+
+  #Each frame is a status byte, three zero bytes and one length byte,
+  #followed by the payload (presumably a 4-byte big-endian length as in
+  #the Thrift SASL transport -- TODO confirm)
+  msg = bytearray()
+
+  #START frame carrying the mechanism name
+  msg.extend([start_byte, 0, 0, 0, len(mechanism)])
+  msg.extend(ord(elem) for elem in mechanism)
+
+  #OK frame carrying "\0<user>\0<password>"
+  msg.extend([ok_byte, 0, 0, 0, len(usr)*2+2])
+
+  #Adding anonymous user name
+  msg.append(0)
+  msg.extend(ord(elem) for elem in usr)
+
+  #Adding anonymous user password
+  msg.append(0)
+  msg.extend(ord(elem) for elem in usr)
+
+  #COMPLETE frame with an empty payload
+  msg.extend([complete_byte, 0, 0, 0, 0])
+
+  is_service_socket_valid = False
+  s = socket.socket()
+  s.settimeout(timeout)
+
+  try:
+    s.connect((address, port))
+    s.send(msg)
+    is_service_socket_valid = True
+  except socket.error:
+    #Expected if service unreachable
+    pass
+  finally:
+    s.close()
+
+  #Return outside of "finally": a return inside it would silently discard
+  #any exception that is not a socket.error
+  return is_service_socket_valid

+ 9 - 18
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/scripts/hive_service.py

@@ -19,7 +19,6 @@ limitations under the License.
 """
 
 from resource_management import *
-import socket
 import sys
 import time
 
@@ -66,25 +65,17 @@ def hive_service(
       
       start_time = time.time()
       end_time = start_time + SOCKET_WAIT_SECONDS
-      
-      s = socket.socket()
-      s.settimeout(5)
-            
+
       is_service_socket_valid = False
       print "Waiting for the Hive server to start..."
-      try:
-        while time.time() < end_time:
-          try:
-            s.connect((address, port))
-            s.send("A001 AUTHENTICATE ANONYMOUS")
-            is_service_socket_valid = True
-            break
-          except socket.error, e:          
-            time.sleep(5)
-      finally:
-        s.close()
-      
-      elapsed_time = time.time() - start_time    
+      while time.time() < end_time:
+        if check_thrift_port_sasl(address, port, 2):
+          is_service_socket_valid = True
+          break
+        else:
+          time.sleep(2)
+
+      elapsed_time = time.time() - start_time
       
       if is_service_socket_valid == False: 
         raise Fail("Connection to Hive server %s on port %s failed after %d seconds" % (address, port, elapsed_time))

+ 3 - 8
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/HIVE/package/scripts/service_check.py

@@ -32,16 +32,11 @@ class HiveServiceCheck(Script):
 
     address=format("{hive_server_host}")
     port=int(format("{hive_server_port}"))
-    s = socket.socket()
     print "Test connectivity to hive server"
-    try:
-      s.connect((address, port))
-      s.send("A001 AUTHENTICATE ANONYMOUS")
+    if check_thrift_port_sasl(address, port):
       print "Successfully connected to %s on port %s" % (address, port)
-      s.close()
-    except socket.error, e:
-      print "Connection to %s on port %s failed: %s" % (address, port, e)
-      sys.exit(1)
+    else:
+      print "Connection to %s on port %s failed" % (address, port)
+      # Exit non-zero so an unreachable Hive server fails the service check
+      sys.exit(1)
 
     hcat_service_check()
     webhcat_service_check()

+ 66 - 0
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/files/check_hive_thrift_port.py

@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+
+import os
+import optparse
+import json
+import traceback
+from resource_management import *
+from time import time
+
+
+OK_MESSAGE = "TCP OK - %.3f second response time on port %s"
+CRITICAL_MESSAGE = "Connection to %s on port %s failed"
+
+def main():
+  """
+  Nagios plugin entry point for the Hive thrift port check.
+
+  Reads the host (-H/--host) and port (-p/--port) from the command line,
+  runs check_thrift_port_sasl against them and reports the outcome on
+  stdout. Exits with 0 when the check succeeds, 2 when it fails and -1
+  when a required option is missing.
+  """
+
+  parser = optparse.OptionParser()
+
+  parser.add_option("-H", "--host", dest="address", help="Hive thrift host")
+  parser.add_option("-p", "--port", type="int", dest="port", help="Hive thrift port")
+
+  (options, args) = parser.parse_args()
+
+  # The messages name the options actually registered above
+  if options.address is None:
+    print "Hive thrift host (--host or -H)"
+    exit(-1)
+
+  if options.port is None:
+    print "Hive thrift port (--port or -p)"
+    exit(-1)
+
+  address = options.address
+  port = options.port
+
+  # Time the whole check so the OK message can report the response time
+  starttime = time()
+  if check_thrift_port_sasl(address, port):
+    timetaken = time() - starttime
+    print OK_MESSAGE % (timetaken, port)
+    exit(0)
+  else:
+    print CRITICAL_MESSAGE % (address, port)
+    exit(2)
+
+
+if __name__ == "__main__":
+  main()
+

+ 1 - 0
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/scripts/nagios_server_config.py

@@ -67,6 +67,7 @@ def nagios_server_config():
   nagios_server_check( 'check_namenodes_ha.sh')
   nagios_server_check( 'check_wrapper.sh')
   nagios_server_check( 'hdp_nagios_init.php')
+  nagios_server_check( 'check_hive_thrift_port.py' )
 
 
 def nagios_server_configfile(

+ 1 - 1
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/templates/hadoop-commands.cfg.j2

@@ -143,5 +143,5 @@ define command{
 
 define command{
         command_name check_tcp_wrapper_sasl
-        command_line  $USER1$/check_wrapper.sh $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ $ARG2$ -s \"$ARG3$\"
+        command_line $USER1$/check_wrapper.sh /var/lib/ambari-agent/ambari-python-wrap $USER1$/check_hive_thrift_port.py -H $HOSTADDRESS$ -p $ARG1$
        }

+ 1 - 1
ambari-server/src/main/resources/stacks/HDP/1.3.2/services/NAGIOS/package/templates/hadoop-services.cfg.j2

@@ -539,7 +539,7 @@ define service {
         use                     hadoop-service
         service_description     HIVE-SERVER::HiveServer2 process
         servicegroups           HIVE
-        check_command           check_tcp_wrapper_sasl!{{ hive_server_port }}!-w 1 -c 1!A001 AUTHENTICATE ANONYMOUS
+        check_command           check_tcp_wrapper_sasl!{{ hive_server_port }}!-w 1 -c 1
         normal_check_interval   0.5
         retry_check_interval    0.5
         max_check_attempts      3

+ 8 - 17
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HIVE/package/scripts/hive_service.py

@@ -19,7 +19,6 @@ limitations under the License.
 """
 
 from resource_management import *
-import socket
 import sys
 import time
 from resource_management.core.shell import call
@@ -72,24 +71,16 @@ def hive_service(
       
       start_time = time.time()
       end_time = start_time + SOCKET_WAIT_SECONDS
-      
-      s = socket.socket()
-      s.settimeout(5)
-            
+
       is_service_socket_valid = False
       print "Waiting for the Hive server to start..."
-      try:
-        while time.time() < end_time:
-          try:
-            s.connect((address, port))
-            s.send("A001 AUTHENTICATE ANONYMOUS")
-            is_service_socket_valid = True
-            break
-          except socket.error, e:          
-            time.sleep(5)
-      finally:
-        s.close()
-      
+      while time.time() < end_time:
+        if check_thrift_port_sasl(address, port, 2):
+          is_service_socket_valid = True
+          break
+        else:
+          time.sleep(2)
+
       elapsed_time = time.time() - start_time    
       
       if is_service_socket_valid == False: 

+ 3 - 8
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/HIVE/package/scripts/service_check.py

@@ -32,16 +32,11 @@ class HiveServiceCheck(Script):
 
     address=format("{hive_server_host}")
     port=int(format("{hive_server_port}"))
-    s = socket.socket()
     print "Test connectivity to hive server"
-    try:
-      s.connect((address, port))
-      s.send("A001 AUTHENTICATE ANONYMOUS")
+    if check_thrift_port_sasl(address, port):
       print "Successfully connected to %s on port %s" % (address, port)
-      s.close()
-    except socket.error, e:
-      print "Connection to %s on port %s failed: %s" % (address, port, e)
-      sys.exit(1)
+    else:
+      print "Connection to %s on port %s failed" % (address, port)
+      # Exit non-zero so an unreachable Hive server fails the service check
+      sys.exit(1)
 
     hcat_service_check()
     webhcat_service_check()

+ 66 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/files/check_hive_thrift_port.py

@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+#
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#
+
+import os
+import optparse
+import json
+import traceback
+from resource_management import *
+from time import time
+
+
+OK_MESSAGE = "TCP OK - %.3f second response time on port %s"
+CRITICAL_MESSAGE = "Connection to %s on port %s failed"
+
+def main():
+  """
+  Nagios plugin entry point for the Hive thrift port check.
+
+  Reads the host (-H/--host) and port (-p/--port) from the command line,
+  runs check_thrift_port_sasl against them and reports the outcome on
+  stdout. Exits with 0 when the check succeeds, 2 when it fails and -1
+  when a required option is missing.
+  """
+
+  parser = optparse.OptionParser()
+
+  parser.add_option("-H", "--host", dest="address", help="Hive thrift host")
+  parser.add_option("-p", "--port", type="int", dest="port", help="Hive thrift port")
+
+  (options, args) = parser.parse_args()
+
+  # The messages name the options actually registered above
+  if options.address is None:
+    print "Hive thrift host (--host or -H)"
+    exit(-1)
+
+  if options.port is None:
+    print "Hive thrift port (--port or -p)"
+    exit(-1)
+
+  address = options.address
+  port = options.port
+
+  # Time the whole check so the OK message can report the response time
+  starttime = time()
+  if check_thrift_port_sasl(address, port):
+    timetaken = time() - starttime
+    print OK_MESSAGE % (timetaken, port)
+    exit(0)
+  else:
+    print CRITICAL_MESSAGE % (address, port)
+    exit(2)
+
+
+if __name__ == "__main__":
+  main()
+

+ 1 - 0
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/scripts/nagios_server_config.py

@@ -72,6 +72,7 @@ def nagios_server_config():
   nagios_server_check( 'check_checkpoint_time.py' )
   nagios_server_check( 'sys_logger.py' )
   nagios_server_check( 'check_ambari_alerts.py' )
+  nagios_server_check( 'check_hive_thrift_port.py' )
 
 def nagios_server_configfile(
   name,

+ 1 - 1
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-commands.cfg.j2

@@ -157,7 +157,7 @@ define command{
 
 define command{
         command_name check_tcp_wrapper_sasl
-        command_line  $USER1$/check_wrapper.sh $USER1$/check_tcp -H $HOSTADDRESS$ -p $ARG1$ $ARG2$ -s \"$ARG3$\"
+        command_line $USER1$/check_wrapper.sh /var/lib/ambari-agent/ambari-python-wrap $USER1$/check_hive_thrift_port.py -H $HOSTADDRESS$ -p $ARG1$
        }
 
 define command{

+ 1 - 1
ambari-server/src/main/resources/stacks/HDP/2.0.6/services/NAGIOS/package/templates/hadoop-services.cfg.j2

@@ -700,7 +700,7 @@ define service {
         use                     hadoop-service
         service_description     HIVE-SERVER::HiveServer2 process
         servicegroups           HIVE
-        check_command           check_tcp_wrapper_sasl!{{ hive_server_port }}!-w 1 -c 1!A001 AUTHENTICATE ANONYMOUS
+        check_command           check_tcp_wrapper_sasl!{{ hive_server_port }}!-w 1 -c 1
         normal_check_interval   0.5
         retry_check_interval    0.5
         max_check_attempts      3

+ 2 - 2
ambari-server/src/test/python/stacks/1.3.2/HIVE/test_hive_server.py

@@ -379,5 +379,5 @@ class TestHiveServer(RMFTestCase):
       self.fail("Script failure due to socket error was expected")
     except:
       self.assert_configure_default()
-      self.assertTrue(socket_mock.called)
-      self.assertTrue(s.close.called)    
+      self.assertFalse(socket_mock.called)
+      self.assertFalse(s.close.called)

+ 5 - 0
ambari-server/src/test/python/stacks/1.3.2/NAGIOS/test_nagios_server.py

@@ -254,6 +254,11 @@ class TestNagiosServer(RMFTestCase):
                               content=StaticFile('hdp_nagios_init.php'),
                               mode=0755
     )
+    self.assertResourceCalled('File',
+                              '/usr/lib64/nagios/plugins/check_hive_thrift_port.py',
+                              content=StaticFile('check_hive_thrift_port.py'),
+                              mode=0755
+    )
     self.assertResourceCalled('Execute',
                               'htpasswd2 -c -b  /etc/nagios/htpasswd.users nagiosadmin \'!`"\'"\'"\' 1\'',
                               not_if="grep nagiosadmin /etc/nagios/htpasswd.users"

+ 2 - 2
ambari-server/src/test/python/stacks/2.0.6/HIVE/test_hive_server.py

@@ -492,5 +492,5 @@ class TestHiveServer(RMFTestCase):
       self.fail("Script failure due to socket error was expected")
     except:
       self.assert_configure_default()
-      self.assertTrue(socket_mock.called)
-      self.assertTrue(s.close.called)    
+      self.assertFalse(socket_mock.called)
+      self.assertFalse(s.close.called)

+ 5 - 0
ambari-server/src/test/python/stacks/2.0.6/NAGIOS/test_nagios_server.py

@@ -283,6 +283,11 @@ class TestNagiosServer(RMFTestCase):
                               content=StaticFile('check_ambari_alerts.py'),
                               mode=0755
     )
+    self.assertResourceCalled('File',
+                              '/usr/lib64/nagios/plugins/check_hive_thrift_port.py',
+                              content=StaticFile('check_hive_thrift_port.py'),
+                              mode=0755
+    )
     self.assertResourceCalled('Execute',
                               'htpasswd2 -c -b  /etc/nagios/htpasswd.users nagiosadmin \'!`"\'"\'"\' 1\'',
                               not_if="grep nagiosadmin /etc/nagios/htpasswd.users"