Explorar o código

AMBARI-12522. Provide traceback patch to debug hanging agents (dlysnichenko)

Lisnichenko Dmitro hai 10 anos
pai
achega
7c12637cbb

+ 10 - 3
ambari-agent/src/main/python/ambari_agent/HeartbeatHandlers.py

@@ -26,6 +26,8 @@ import signal
 import threading
 import traceback
 from ambari_commons.os_family_impl import OsFamilyImpl
+import sys
+
 logger = logging.getLogger()
 
 _handler = None
@@ -138,10 +140,15 @@ def bind_signal_handlers(agentPid):
     if os.getpid() == agentPid:
       signal.signal(signal.SIGINT, signal_handler)
       signal.signal(signal.SIGTERM, signal_handler)
-      signal.signal(signal.SIGUSR1, debug)
+      try:
+        import faulthandler  # This is not default module, has to be installed separately
+        faulthandler.enable(file=sys.stderr, all_threads=True)
+        faulthandler.register(signal.SIGUSR1, file=sys.stderr, all_threads=True, chain=False)
+        sys.stderr.write("Registered faulthandler\n")
+      except ImportError:
+        pass  # Module is not included into python distribution
+
     _handler = HeartbeatStopHandlersLinux()
   else:
     _handler = HeartbeatStopHandlersWindows()
   return _handler
-
-

+ 0 - 2
ambari-agent/src/main/python/ambari_agent/main.py

@@ -240,8 +240,6 @@ def main(heartbeat_stop_callback=None):
   default_cfg = {'agent': {'prefix': '/home/ambari'}}
   config.load(default_cfg)
 
-  bind_signal_handlers(agentPid)
-
   if (len(sys.argv) > 1) and sys.argv[1] == 'stop':
     stop_agent()
 

+ 0 - 3
ambari-agent/src/test/python/ambari_agent/TestMain.py

@@ -131,8 +131,6 @@ class TestMain(unittest.TestCase):
     # Check if on SIGINT/SIGTERM agent is configured to terminate
     signal_mock.assert_any_call(signal.SIGINT, HeartbeatHandlers.signal_handler)
     signal_mock.assert_any_call(signal.SIGTERM, HeartbeatHandlers.signal_handler)
-    # Check if on SIGUSR1 agent is configured to fall into debug
-    signal_mock.assert_any_call(signal.SIGUSR1, HeartbeatHandlers.debug)
 
 
   @patch("platform.linux_distribution")
@@ -332,7 +330,6 @@ class TestMain(unittest.TestCase):
     main.main()
 
     self.assertTrue(setup_logging_mock.called)
-    self.assertTrue(bind_signal_handlers_mock.called)
     if OSCheck.get_os_family() != OSConst.WINSRV_FAMILY:
       self.assertTrue(stop_mock.called)
     #self.assertTrue(resolve_ambari_config_mock.called)