test_sys_logger.py 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644
  1. #!/usr/bin/python
  2. # Licensed to the Apache Software Foundation (ASF) under one or more
  3. # contributor license agreements. See the NOTICE file distributed with
  4. # this work for additional information regarding copyright ownership.
  5. # The ASF licenses this file to You under the Apache License, Version 2.0
  6. # (the "License"); you may not use this file except in compliance with
  7. # the License. You may obtain a copy of the License at
  8. #
  9. # http://www.apache.org/licenses/LICENSE-2.0
  10. #
  11. # Unless required by applicable law or agreed to in writing, software
  12. # distributed under the License is distributed on an "AS IS" BASIS,
  13. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. # See the License for the specific language governing permissions and
  15. # limitations under the License.
  16. import sys
  17. sys.path.append('../src')
  18. import sys_logger
# Running tallies for this script: test_log_tvi_msg() increments one of them
# per checked message and summary() reports both.
  19. tests_passed = 0
  20. tests_failed = 0
  21. def test_log_tvi_msg(msg):
  22. global tests_passed, tests_failed
  23. if msg == expected_log_msg:
  24. print 'Test Passed'
  25. tests_passed += 1
  26. else:
  27. print '*** TEST FAILED ***'
  28. print 'Expected MSG: {0}'.format(expected_log_msg)
  29. print 'Actual MSG : {0}'.format(msg)
  30. tests_failed += 1
# Replace sys_logger's log_tvi_msg with the checker defined above, so a
# message handed to log_tvi_msg is compared against the expectation.
# NOTE(review): presumably generate_tvi_log_msg() delivers its result through
# log_tvi_msg -- confirm in sys_logger.
  31. sys_logger.log_tvi_msg = test_log_tvi_msg
  32. def test(tvi_rule, expected_msg, arg1, arg2, arg3, arg4, arg5):
  33. sys.stdout.write(tvi_rule + ': ')
  34. global expected_log_msg
  35. expected_log_msg = expected_msg
  36. sys_logger.generate_tvi_log_msg(arg1, arg2, arg3, arg4, arg5)
  37. def summary():
  38. total_tests = tests_passed + tests_failed
  39. print '\nTests Run: {0}'.format(total_tests)
  40. print 'Passed: {0}, Failed: {1}'.format(tests_passed, tests_failed)
  41. if not tests_failed:
  42. print 'SUCCESS! All tests pass.'
# Host, master CPU, HDFS and DataNode checks.
#
# Each call is test(label, expected log line, arg1..arg5); test() forwards the
# last five arguments unchanged to sys_logger.generate_tvi_log_msg. Every
# expected line here has the form '<severity>: Hadoop: <event id># <arg5>',
# and the event id carries an '_ok' suffix when the third argument is 'OK'.
# NOTE(review): the five arguments look like Nagios state type, attempt
# number, state, service description and plugin output -- confirm against
# sys_logger.
  43. # Hadoop_Host_Down
  44. test('Hadoop_Host_Down',
  45. 'Critical: Hadoop: host_down# Event Host=MY_HOST(CRITICAL), PING FAILED - Packet loss = 100%, RTA = 0.00 ms',
  46. 'HARD', '1', 'CRITICAL', 'Host::Ping', 'Event Host=MY_HOST(CRITICAL), PING FAILED - Packet loss = 100%, RTA = 0.00 ms')
  47. test('Hadoop_Host_Down:OK',
  48. 'OK: Hadoop: host_down_ok# Event Host=MY_HOST(OK), PING SUCCESS - Packet loss = 0%, RTA = 1.00 ms',
  49. 'HARD', '1', 'OK', 'Host::Ping', 'Event Host=MY_HOST(OK), PING SUCCESS - Packet loss = 0%, RTA = 1.00 ms')
  50. # Hadoop_Master_Daemon_CPU_Utilization
  51. test('Hadoop_Master_Daemon_CPU_Utilization',
  52. 'Critical: Hadoop: master_cpu_utilization# Event Host=MY_HOST Service Description=HBASEMASTER::HBaseMaster CPU utilization(CRITICAL), 4 CPU, average load 2.5% 200%',
  53. 'HARD', '1', 'CRITICAL', 'HBASEMASTER::HBaseMaster CPU utilization',
  54. 'Event Host=MY_HOST Service Description=HBASEMASTER::HBaseMaster CPU utilization(CRITICAL), 4 CPU, average load 2.5% 200%')
# A 'WARNING' state is expected to be reported with the 'Degraded' severity.
# The payload (arg5) is expected verbatim, so the '(CRITICAL)' inside it stays.
  55. test('Hadoop_Master_Daemon_CPU_Utilization:Degraded',
  56. 'Degraded: Hadoop: master_cpu_utilization# Event Host=MY_HOST Service Description=HBASEMASTER::HBaseMaster CPU utilization(CRITICAL), 4 CPU, average load 2.5% 200%',
  57. 'HARD', '1', 'WARNING', 'HBASEMASTER::HBaseMaster CPU utilization',
  58. 'Event Host=MY_HOST Service Description=HBASEMASTER::HBaseMaster CPU utilization(CRITICAL), 4 CPU, average load 2.5% 200%')
  59. test('Hadoop_Master_Daemon_CPU_Utilization:OK',
  60. 'OK: Hadoop: master_cpu_utilization_ok# Event Host=MY_HOST Service Description=HBASEMASTER::HBaseMaster CPU utilization(OK), 4 CPU, average load 2.5% 200%',
  61. 'HARD', '1', 'OK', 'HBASEMASTER::HBaseMaster CPU utilization',
  62. 'Event Host=MY_HOST Service Description=HBASEMASTER::HBaseMaster CPU utilization(OK), 4 CPU, average load 2.5% 200%')
  63. # Hadoop_HDFS_Percent_Capacity
  64. test('Hadoop_HDFS_Percent_Capacity',
  65. 'Critical: Hadoop: hdfs_percent_capacity# Event Host=MY_HOST Service Description=HDFS::HDFS Capacity utilization(CRITICAL),DFSUsedGB:0.1, DFSTotalGB:1568.7',
  66. 'HARD', '1', 'CRITICAL', 'HDFS::HDFS Capacity utilization',
  67. 'Event Host=MY_HOST Service Description=HDFS::HDFS Capacity utilization(CRITICAL),DFSUsedGB:0.1, DFSTotalGB:1568.7')
  68. test('Hadoop_HDFS_Percent_Capacity:OK',
  69. 'OK: Hadoop: hdfs_percent_capacity_ok# Event Host=MY_HOST Service Description=HDFS::HDFS Capacity utilization(OK),DFSUsedGB:0.1, DFSTotalGB:1568.7',
  70. 'HARD', '1', 'OK', 'HDFS::HDFS Capacity utilization',
  71. 'Event Host=MY_HOST Service Description=HDFS::HDFS Capacity utilization(OK),DFSUsedGB:0.1, DFSTotalGB:1568.7')
  72. # Hadoop_HDFS_Corrupt_Missing_Blocks
  73. test('Hadoop_HDFS_Corrupt_Missing_Blocks',
  74. 'Critical: Hadoop: hdfs_block# Event Host=MY_HOST Service Description=HDFS::Corrupt/Missing blocks(CRITICAL), corrupt_blocks:0, missing_blocks:0, total_blocks:147',
  75. 'HARD', '1', 'CRITICAL', 'HDFS::Corrupt/Missing blocks',
  76. 'Event Host=MY_HOST Service Description=HDFS::Corrupt/Missing blocks(CRITICAL), corrupt_blocks:0, missing_blocks:0, total_blocks:147')
  77. test('Hadoop_HDFS_Corrupt_Missing_Blocks:OK',
  78. 'OK: Hadoop: hdfs_block_ok# Event Host=MY_HOST Service Description=HDFS::Corrupt/Missing blocks(OK), corrupt_blocks:0, missing_blocks:0, total_blocks:147',
  79. 'HARD', '1', 'OK', 'HDFS::Corrupt/Missing blocks',
  80. 'Event Host=MY_HOST Service Description=HDFS::Corrupt/Missing blocks(OK), corrupt_blocks:0, missing_blocks:0, total_blocks:147')
  81. # Hadoop_NameNode_Edit_Log_Dir_Write
  82. test('Hadoop_NameNode_Edit_Log_Dir_Write',
  83. 'Critical: Hadoop: namenode_edit_log_write# SERVICE MSG',
  84. 'HARD', '1', 'CRITICAL', 'NAMENODE::Namenode Edit logs directory status', 'SERVICE MSG')
  85. test('Hadoop_NameNode_Edit_Log_Dir_Write:OK',
  86. 'OK: Hadoop: namenode_edit_log_write_ok# SERVICE MSG',
  87. 'HARD', '1', 'OK', 'NAMENODE::Namenode Edit logs directory status', 'SERVICE MSG')
  88. # Hadoop_DataNode_Down
  89. test('Hadoop_DataNode_Down',
  90. 'Critical: Hadoop: datanode_down# SERVICE MSG',
  91. 'HARD', '1', 'CRITICAL', 'HDFS::Percent DataNodes down','SERVICE MSG')
  92. test('Hadoop_DataNode_Down:OK',
  93. 'OK: Hadoop: datanode_down_ok# SERVICE MSG',
  94. 'HARD', '1', 'OK', 'HDFS::Percent DataNodes down','SERVICE MSG')
  95. # Hadoop_DataNode_Process_Down
  96. test('Hadoop_DataNode_Process_Down',
  97. 'Critical: Hadoop: datanode_process_down# SERVICE MSG',
  98. 'HARD', '1', 'CRITICAL', 'DATANODE::Process down', 'SERVICE MSG')
  99. test('Hadoop_DataNode_Process_Down:OK',
  100. 'OK: Hadoop: datanode_process_down_ok# SERVICE MSG',
  101. 'HARD', '1', 'OK', 'DATANODE::Process down', 'SERVICE MSG')
  102. # Hadoop_Percent_DataNodes_Storage_Full
  103. test('Hadoop_Percent_DataNodes_Storage_Full',
  104. 'Critical: Hadoop: datanodes_percent_storage_full# SERVICE MSG',
  105. 'HARD', '1', 'CRITICAL', 'HDFS::Percent DataNodes storage full', 'SERVICE MSG')
  106. test('Hadoop_Percent_DataNodes_Storage_Full:OK',
  107. 'OK: Hadoop: datanodes_percent_storage_full_ok# SERVICE MSG',
  108. 'HARD', '1', 'OK', 'HDFS::Percent DataNodes storage full', 'SERVICE MSG')
# NameNode, Secondary NameNode, JobTracker and TaskTracker checks.
# Each call is test(label, expected log line, five arguments forwarded to
# sys_logger.generate_tvi_log_msg).
  109. # Hadoop_NameNode_Process_Down
# For this service description the CRITICAL, WARNING and UNKNOWN states are
# all expected to be reported with the 'Fatal' severity.
  110. test('Hadoop_NameNode_Process_Down:CRITICAL',
  111. 'Fatal: Hadoop: namenode_process_down# SERVICE MSG',
  112. 'HARD', '1', 'CRITICAL', 'NAMENODE::Namenode Process down', 'SERVICE MSG')
  113. test('Hadoop_NameNode_Process_Down:WARNING',
  114. 'Fatal: Hadoop: namenode_process_down# SERVICE MSG',
  115. 'HARD', '1', 'WARNING', 'NAMENODE::Namenode Process down', 'SERVICE MSG')
  116. test('Hadoop_NameNode_Process_Down:UNKNOWN',
  117. 'Fatal: Hadoop: namenode_process_down# SERVICE MSG',
  118. 'HARD', '1', 'UNKNOWN', 'NAMENODE::Namenode Process down', 'SERVICE MSG')
  119. test('Hadoop_NameNode_Process_Down:OK',
  120. 'OK: Hadoop: namenode_process_down_ok# SERVICE MSG',
  121. 'HARD', '1', 'OK', 'NAMENODE::Namenode Process down', 'SERVICE MSG')
  122. # Hadoop_Secondary_NameNode_Process_Down
  123. test('Hadoop_Secondary_NameNode_Process_Down',
  124. 'Critical: Hadoop: secondary_namenode_process_down# SERVICE MSG',
  125. 'HARD', '1', 'CRITICAL', 'NAMENODE::Secondary Namenode Process down', 'SERVICE MSG')
  126. test('Hadoop_Secondary_NameNode_Process_Down:OK',
  127. 'OK: Hadoop: secondary_namenode_process_down_ok# SERVICE MSG',
  128. 'HARD', '1', 'OK', 'NAMENODE::Secondary Namenode Process down', 'SERVICE MSG')
  129. # Hadoop_NameNode_RPC_Latency
  130. test('Hadoop_NameNode_RPC_Latency',
  131. 'Critical: Hadoop: namenode_rpc_latency# SERVICE MSG',
  132. 'HARD', '1', 'CRITICAL', 'HDFS::Namenode RPC Latency', 'SERVICE MSG')
  133. test('Hadoop_NameNode_RPC_Latency:Degraded',
  134. 'Degraded: Hadoop: namenode_rpc_latency# SERVICE MSG',
  135. 'HARD', '1', 'WARNING', 'HDFS::Namenode RPC Latency', 'SERVICE MSG')
  136. test('Hadoop_NameNode_RPC_Latency:OK',
  137. 'OK: Hadoop: namenode_rpc_latency_ok# SERVICE MSG',
  138. 'HARD', '1', 'OK', 'HDFS::Namenode RPC Latency', 'SERVICE MSG')
  139. # Hadoop_DataNodes_Storage_Full
  140. test('Hadoop_DataNodes_Storage_Full',
  141. 'Critical: Hadoop: datanodes_storage_full# SERVICE MSG',
  142. 'HARD', '1', 'CRITICAL', 'DATANODE::Storage full', 'SERVICE MSG')
  143. test('Hadoop_DataNodes_Storage_Full:OK',
  144. 'OK: Hadoop: datanodes_storage_full_ok# SERVICE MSG',
  145. 'HARD', '1', 'OK', 'DATANODE::Storage full', 'SERVICE MSG')
  146. # Hadoop_JobTracker_Process_Down
  147. test('Hadoop_JobTracker_Process_Down',
  148. 'Critical: Hadoop: jobtracker_process_down# SERVICE MSG',
  149. 'HARD', '1', 'CRITICAL', 'JOBTRACKER::Jobtracker Process down', 'SERVICE MSG')
  150. test('Hadoop_JobTracker_Process_Down:OK',
  151. 'OK: Hadoop: jobtracker_process_down_ok# SERVICE MSG',
  152. 'HARD', '1', 'OK', 'JOBTRACKER::Jobtracker Process down', 'SERVICE MSG')
  153. # Hadoop_JobTracker_RPC_Latency
  154. test('Hadoop_JobTracker_RPC_Latency',
  155. 'Critical: Hadoop: jobtracker_rpc_latency# SERVICE MSG',
  156. 'HARD', '1', 'CRITICAL', 'MAPREDUCE::JobTracker RPC Latency', 'SERVICE MSG')
  157. test('Hadoop_JobTracker_RPC_Latency:Degraded',
  158. 'Degraded: Hadoop: jobtracker_rpc_latency# SERVICE MSG',
  159. 'HARD', '1', 'WARNING', 'MAPREDUCE::JobTracker RPC Latency', 'SERVICE MSG')
  160. test('Hadoop_JobTracker_RPC_Latency:OK',
  161. 'OK: Hadoop: jobtracker_rpc_latency_ok# SERVICE MSG',
  162. 'HARD', '1', 'OK', 'MAPREDUCE::JobTracker RPC Latency', 'SERVICE MSG')
  163. # Hadoop_JobTracker_CPU_Utilization
  164. test('Hadoop_JobTracker_CPU_Utilization',
  165. 'Critical: Hadoop: jobtracker_cpu_utilization# SERVICE MSG',
  166. 'HARD', '1', 'CRITICAL', 'JOBTRACKER::Jobtracker CPU utilization', 'SERVICE MSG')
  167. test('Hadoop_JobTracker_CPU_Utilization:Degraded',
  168. 'Degraded: Hadoop: jobtracker_cpu_utilization# SERVICE MSG',
  169. 'HARD', '1', 'WARNING', 'JOBTRACKER::Jobtracker CPU utilization', 'SERVICE MSG')
  170. test('Hadoop_JobTracker_CPU_Utilization:OK',
  171. 'OK: Hadoop: jobtracker_cpu_utilization_ok# SERVICE MSG',
  172. 'HARD', '1', 'OK', 'JOBTRACKER::Jobtracker CPU utilization', 'SERVICE MSG')
  173. # Hadoop_TaskTracker_Down
  174. test('Hadoop_TaskTracker_Down',
  175. 'Critical: Hadoop: tasktrackers_down# SERVICE MSG',
  176. 'HARD', '1', 'CRITICAL', 'MAPREDUCE::Percent TaskTrackers down', 'SERVICE MSG')
  177. test('Hadoop_TaskTracker_Down:OK',
  178. 'OK: Hadoop: tasktrackers_down_ok# SERVICE MSG',
  179. 'HARD', '1', 'OK', 'MAPREDUCE::Percent TaskTrackers down', 'SERVICE MSG')
  180. # Hadoop_TaskTracker_Process_Down
  181. test('Hadoop_TaskTracker_Process_Down',
  182. 'Critical: Hadoop: tasktracker_process_down# SERVICE MSG',
  183. 'HARD', '1', 'CRITICAL', 'TASKTRACKER::Process down', 'SERVICE MSG')
  184. test('Hadoop_TaskTracker_Process_Down:OK',
  185. 'OK: Hadoop: tasktracker_process_down_ok# SERVICE MSG',
  186. 'HARD', '1', 'OK', 'TASKTRACKER::Process down', 'SERVICE MSG')
# HBase, Hive Metastore, ZooKeeper, Oozie, Templeton and Puppet checks.
# Each call is test(label, expected log line, five arguments forwarded to
# sys_logger.generate_tvi_log_msg).
  187. # Hadoop_HBaseMaster_Process_Down
  188. test('Hadoop_HBaseMaster_Process_Down',
  189. 'Critical: Hadoop: hbasemaster_process_down# SERVICE MSG',
  190. 'HARD', '1', 'CRITICAL', 'HBASEMASTER::HBaseMaster Process down', 'SERVICE MSG')
  191. test('Hadoop_HBaseMaster_Process_Down:OK',
  192. 'OK: Hadoop: hbasemaster_process_down_ok# SERVICE MSG',
  193. 'HARD', '1', 'OK', 'HBASEMASTER::HBaseMaster Process down', 'SERVICE MSG')
  194. # Hadoop_RegionServer_Process_Down
  195. test('Hadoop_RegionServer_Process_Down',
  196. 'Critical: Hadoop: regionserver_process_down# SERVICE MSG',
  197. 'HARD', '1', 'CRITICAL', 'REGIONSERVER::Process down', 'SERVICE MSG')
  198. test('Hadoop_RegionServer_Process_Down:OK',
  199. 'OK: Hadoop: regionserver_process_down_ok# SERVICE MSG',
  200. 'HARD', '1', 'OK', 'REGIONSERVER::Process down', 'SERVICE MSG')
  201. # Hadoop_RegionServer_Down
  202. test('Hadoop_RegionServer_Down',
  203. 'Critical: Hadoop: regionservers_down# SERVICE MSG',
  204. 'HARD', '1', 'CRITICAL', 'HBASE::Percent region servers down', 'SERVICE MSG')
  205. test('Hadoop_RegionServer_Down:OK',
  206. 'OK: Hadoop: regionservers_down_ok# SERVICE MSG',
  207. 'HARD', '1', 'OK', 'HBASE::Percent region servers down', 'SERVICE MSG')
# 'HBASE::Percent RegionServers live' is expected to yield the same event ids
# as 'HBASE::Percent region servers down' above.
  208. test('HBASE_RegionServer_live',
  209. 'Critical: Hadoop: regionservers_down# SERVICE MSG',
  210. 'HARD', '1', 'CRITICAL', 'HBASE::Percent RegionServers live', 'SERVICE MSG')
  211. test('HBASE_RegionServer_live:OK',
  212. 'OK: Hadoop: regionservers_down_ok# SERVICE MSG',
  213. 'HARD', '1', 'OK', 'HBASE::Percent RegionServers live', 'SERVICE MSG')
  214. # Hadoop_Hive_Metastore_Process_Down
  215. test('Hadoop_Hive_Metastore_Process_Down',
  216. 'Critical: Hadoop: hive_metastore_process_down# SERVICE MSG',
  217. 'HARD', '1', 'CRITICAL', 'HIVE-METASTORE::HIVE-METASTORE status check', 'SERVICE MSG')
  218. test('Hadoop_Hive_Metastore_Process_Down:OK',
  219. 'OK: Hadoop: hive_metastore_process_down_ok# SERVICE MSG',
  220. 'HARD', '1', 'OK', 'HIVE-METASTORE::HIVE-METASTORE status check', 'SERVICE MSG')
  221. # Hadoop_Zookeeper_Down
  222. test('Hadoop_Zookeeper_Down',
  223. 'Critical: Hadoop: zookeepers_down# SERVICE MSG',
  224. 'HARD', '1', 'CRITICAL', 'ZOOKEEPER::Percent zookeeper servers down', 'SERVICE MSG')
  225. test('Hadoop_Zookeeper_Down:OK',
  226. 'OK: Hadoop: zookeepers_down_ok# SERVICE MSG',
  227. 'HARD', '1', 'OK', 'ZOOKEEPER::Percent zookeeper servers down', 'SERVICE MSG')
  228. # Hadoop_Zookeeper_Process_Down
  229. test('Hadoop_Zookeeper_Process_Down',
  230. 'Critical: Hadoop: zookeeper_process_down# SERVICE MSG',
  231. 'HARD', '1', 'CRITICAL', 'ZKSERVERS::ZKSERVERS Process down', 'SERVICE MSG')
  232. test('Hadoop_Zookeeper_Process_Down:OK',
  233. 'OK: Hadoop: zookeeper_process_down_ok# SERVICE MSG',
  234. 'HARD', '1', 'OK', 'ZKSERVERS::ZKSERVERS Process down', 'SERVICE MSG')
  235. # Hadoop_Oozie_Down
  236. test('Hadoop_Oozie_Down',
  237. 'Critical: Hadoop: oozie_down# SERVICE MSG',
  238. 'HARD', '1', 'CRITICAL', 'OOZIE::Oozie status check', 'SERVICE MSG')
  239. test('Hadoop_Oozie_Down:OK',
  240. 'OK: Hadoop: oozie_down_ok# SERVICE MSG',
  241. 'HARD', '1', 'OK', 'OOZIE::Oozie status check', 'SERVICE MSG')
  242. # Hadoop_Templeton_Down
  243. test('Hadoop_Templeton_Down',
  244. 'Critical: Hadoop: templeton_down# SERVICE MSG',
  245. 'HARD', '1', 'CRITICAL', 'TEMPLETON::Templeton status check', 'SERVICE MSG')
  246. test('Hadoop_Templeton_Down:OK',
  247. 'OK: Hadoop: templeton_down_ok# SERVICE MSG',
  248. 'HARD', '1', 'OK', 'TEMPLETON::Templeton status check', 'SERVICE MSG')
  249. # Hadoop_Puppet_Down
  250. test('Hadoop_Puppet_Down',
  251. 'Critical: Hadoop: puppet_down# SERVICE MSG',
  252. 'HARD', '1', 'CRITICAL', 'PUPPET::Puppet agent down', 'SERVICE MSG')
  253. test('Hadoop_Puppet_Down:OK',
  254. 'OK: Hadoop: puppet_down_ok# SERVICE MSG',
  255. 'HARD', '1', 'OK', 'PUPPET::Puppet agent down', 'SERVICE MSG')
# Nagios, Ganglia, unknown-service and web UI checks.
# Each call is test(label, expected log line, five arguments forwarded to
# sys_logger.generate_tvi_log_msg).
  256. # Hadoop_Nagios_Status_Log_Stale
  257. test('Hadoop_Nagios_Status_Log_Stale',
  258. 'Critical: Hadoop: nagios_status_log_stale# SERVICE MSG',
  259. 'HARD', '1', 'CRITICAL', 'NAGIOS::Nagios status log staleness', 'SERVICE MSG')
  260. test('Hadoop_Nagios_Status_Log_Stale:OK',
  261. 'OK: Hadoop: nagios_status_log_stale_ok# SERVICE MSG',
  262. 'HARD', '1', 'OK', 'NAGIOS::Nagios status log staleness', 'SERVICE MSG')
  263. # Hadoop_Ganglia_Process_Down
  264. test('Hadoop_Ganglia_Process_Down',
  265. 'Critical: Hadoop: ganglia_process_down# SERVICE MSG',
  266. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia [gmetad] Process down', 'SERVICE MSG')
  267. test('Hadoop_Ganglia_Process_Down:OK',
  268. 'OK: Hadoop: ganglia_process_down_ok# SERVICE MSG',
  269. 'HARD', '1', 'OK', 'GANGLIA::Ganglia [gmetad] Process down', 'SERVICE MSG')
# The next four groups use the same label and expected event id; they differ
# only in the component named at the end of the service description.
  270. # Hadoop_Ganglia_Collector_Process_Down (hbasemaster)
  271. test('Hadoop_Ganglia_Collector_Process_Down',
  272. 'Critical: Hadoop: ganglia_collector_process_down# SERVICE MSG',
  273. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia collector [gmond] Process down alert for hbasemaster', 'SERVICE MSG')
  274. test('Hadoop_Ganglia_Collector_Process_Down:OK',
  275. 'OK: Hadoop: ganglia_collector_process_down_ok# SERVICE MSG',
  276. 'HARD', '1', 'OK', 'GANGLIA::Ganglia collector [gmond] Process down alert for hbasemaster', 'SERVICE MSG')
  277. # Hadoop_Ganglia_Collector_Process_Down (jobtracker)
  278. test('Hadoop_Ganglia_Collector_Process_Down',
  279. 'Critical: Hadoop: ganglia_collector_process_down# SERVICE MSG',
  280. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia collector [gmond] Process down alert for jobtracker', 'SERVICE MSG')
  281. test('Hadoop_Ganglia_Collector_Process_Down:OK',
  282. 'OK: Hadoop: ganglia_collector_process_down_ok# SERVICE MSG',
  283. 'HARD', '1', 'OK', 'GANGLIA::Ganglia collector [gmond] Process down alert for jobtracker', 'SERVICE MSG')
  284. # Hadoop_Ganglia_Collector_Process_Down (namenode)
  285. test('Hadoop_Ganglia_Collector_Process_Down',
  286. 'Critical: Hadoop: ganglia_collector_process_down# SERVICE MSG',
  287. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia collector [gmond] Process down alert for namenode', 'SERVICE MSG')
  288. test('Hadoop_Ganglia_Collector_Process_Down:OK',
  289. 'OK: Hadoop: ganglia_collector_process_down_ok# SERVICE MSG',
  290. 'HARD', '1', 'OK', 'GANGLIA::Ganglia collector [gmond] Process down alert for namenode', 'SERVICE MSG')
  291. # Hadoop_Ganglia_Collector_Process_Down (slaves)
  292. test('Hadoop_Ganglia_Collector_Process_Down',
  293. 'Critical: Hadoop: ganglia_collector_process_down# SERVICE MSG',
  294. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia collector [gmond] Process down alert for slaves', 'SERVICE MSG')
  295. test('Hadoop_Ganglia_Collector_Process_Down:OK',
  296. 'OK: Hadoop: ganglia_collector_process_down_ok# SERVICE MSG',
  297. 'HARD', '1', 'OK', 'GANGLIA::Ganglia collector [gmond] Process down alert for slaves', 'SERVICE MSG')
  298. # Hadoop_UNKNOWN_MSG
# The service description 'ANY UNKNOWN SERVICE' is expected to produce the
# HADOOP_UNKNOWN_MSG event id.
# NOTE(review): presumably the fallback for any unmapped description --
# confirm in sys_logger.
  299. test('Hadoop_UNKNOWN_MSG',
  300. 'Critical: Hadoop: HADOOP_UNKNOWN_MSG# SERVICE MSG',
  301. 'HARD', '1', 'CRITICAL', 'ANY UNKNOWN SERVICE', 'SERVICE MSG')
  302. # HBase UI Down
  303. test('Hadoop_HBase_UI_Down',
  304. 'Critical: Hadoop: hbase_ui_down# SERVICE MSG',
  305. 'HARD', '1', 'CRITICAL', 'HBASEMASTER::HBase Web UI down', 'SERVICE MSG')
  306. test('Hadoop_HBase_UI_Down:OK',
  307. 'OK: Hadoop: hbase_ui_down_ok# SERVICE MSG',
  308. 'HARD', '1', 'OK', 'HBASEMASTER::HBase Web UI down', 'SERVICE MSG')
  309. # Namenode UI Down
  310. test('Hadoop_NameNode_UI_Down',
  311. 'Critical: Hadoop: namenode_ui_down# SERVICE MSG',
  312. 'HARD', '1', 'CRITICAL', 'NAMENODE::Namenode Web UI down', 'SERVICE MSG')
  313. test('Hadoop_NameNode_UI_Down:OK',
  314. 'OK: Hadoop: namenode_ui_down_ok# SERVICE MSG',
  315. 'HARD', '1', 'OK', 'NAMENODE::Namenode Web UI down', 'SERVICE MSG')
  316. # JobHistory UI Down
  317. test('Hadoop_JobHistory_UI_Down',
  318. 'Critical: Hadoop: jobhistory_ui_down# SERVICE MSG',
  319. 'HARD', '1', 'CRITICAL', 'JOBTRACKER::JobHistory Web UI down', 'SERVICE MSG')
  320. test('Hadoop_JobHistory_UI_Down:OK',
  321. 'OK: Hadoop: jobhistory_ui_down_ok# SERVICE MSG',
  322. 'HARD', '1', 'OK', 'JOBTRACKER::JobHistory Web UI down', 'SERVICE MSG')
  323. # JobTracker UI Down
  324. test('Hadoop_JobTracker_UI_Down',
  325. 'Critical: Hadoop: jobtracker_ui_down# SERVICE MSG',
  326. 'HARD', '1', 'CRITICAL', 'JOBTRACKER::JobTracker Web UI down', 'SERVICE MSG')
  327. test('Hadoop_JobTracker_UI_Down:OK',
  328. 'OK: Hadoop: jobtracker_ui_down_ok# SERVICE MSG',
  329. 'HARD', '1', 'OK', 'JOBTRACKER::JobTracker Web UI down', 'SERVICE MSG')
  330. # Tests for ambari nagios service check
# Each call is test(label, expected log line, five arguments forwarded to
# sys_logger.generate_tvi_log_msg). Checks come in pairs: a CRITICAL state
# first, then the matching OK state with an '_ok' event id.
  331. test('DataNode_process',
  332. 'Critical: Hadoop: datanode_process# SERVICE MSG',
  333. 'HARD', '1', 'CRITICAL', 'DATANODE::DataNode process', 'SERVICE MSG')
  334. test('DataNode_process:OK',
  335. 'OK: Hadoop: datanode_process_ok# SERVICE MSG',
  336. 'HARD', '1', 'OK', 'DATANODE::DataNode process', 'SERVICE MSG')
# For 'NAMENODE::NameNode process' both CRITICAL and WARNING are expected to
# be reported with the 'Fatal' severity.
  337. test('NameNode_process',
  338. 'Fatal: Hadoop: namenode_process# SERVICE MSG',
  339. 'HARD', '1', 'CRITICAL', 'NAMENODE::NameNode process', 'SERVICE MSG')
  340. test('NameNode_process:WARNING',
  341. 'Fatal: Hadoop: namenode_process# SERVICE MSG',
  342. 'HARD', '1', 'WARNING', 'NAMENODE::NameNode process', 'SERVICE MSG')
  343. test('NameNode_process:OK',
  344. 'OK: Hadoop: namenode_process_ok# SERVICE MSG',
  345. 'HARD', '1', 'OK', 'NAMENODE::NameNode process', 'SERVICE MSG')
  346. test('Secondary_NameNode_process',
  347. 'Critical: Hadoop: secondary_namenode_process# SERVICE MSG',
  348. 'HARD', '1', 'CRITICAL', 'NAMENODE::Secondary NameNode process', 'SERVICE MSG')
  349. test('Secondary_NameNode_process:OK',
  350. 'OK: Hadoop: secondary_namenode_process_ok# SERVICE MSG',
  351. 'HARD', '1', 'OK', 'NAMENODE::Secondary NameNode process', 'SERVICE MSG')
  352. test('JournalNode_process',
  353. 'Critical: Hadoop: journalnode_process# SERVICE MSG',
  354. 'HARD', '1', 'CRITICAL', 'JOURNALNODE::JournalNode process', 'SERVICE MSG')
  355. test('JournalNode_process:OK',
  356. 'OK: Hadoop: journalnode_process_ok# SERVICE MSG',
  357. 'HARD', '1', 'OK', 'JOURNALNODE::JournalNode process', 'SERVICE MSG')
  358. test('ZooKeeper_Server_process',
  359. 'Critical: Hadoop: zookeeper_server_process# SERVICE MSG',
  360. 'HARD', '1', 'CRITICAL', 'ZOOKEEPER::ZooKeeper Server process', 'SERVICE MSG')
  361. test('ZooKeeper_Server_process:OK',
  362. 'OK: Hadoop: zookeeper_server_process_ok# SERVICE MSG',
  363. 'HARD', '1', 'OK', 'ZOOKEEPER::ZooKeeper Server process', 'SERVICE MSG')
  364. test('JobTracker_process',
  365. 'Critical: Hadoop: jobtracker_process# SERVICE MSG',
  366. 'HARD', '1', 'CRITICAL', 'JOBTRACKER::JobTracker process', 'SERVICE MSG')
  367. test('JobTracker_process:OK',
  368. 'OK: Hadoop: jobtracker_process_ok# SERVICE MSG',
  369. 'HARD', '1', 'OK', 'JOBTRACKER::JobTracker process', 'SERVICE MSG')
  370. test('TaskTracker_process',
  371. 'Critical: Hadoop: tasktracker_process# SERVICE MSG',
  372. 'HARD', '1', 'CRITICAL', 'TASKTRACKER::TaskTracker process', 'SERVICE MSG')
  373. test('TaskTracker_process:OK',
  374. 'OK: Hadoop: tasktracker_process_ok# SERVICE MSG',
  375. 'HARD', '1', 'OK', 'TASKTRACKER::TaskTracker process', 'SERVICE MSG')
  376. test('Ganglia_Server_process',
  377. 'Critical: Hadoop: ganglia_server_process# SERVICE MSG',
  378. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia Server process', 'SERVICE MSG')
  379. test('Ganglia_Server_process:OK',
  380. 'OK: Hadoop: ganglia_server_process_ok# SERVICE MSG',
  381. 'HARD', '1', 'OK', 'GANGLIA::Ganglia Server process', 'SERVICE MSG')
# Every 'GANGLIA::Ganglia Monitor process for <component>' description below
# is expected to share the ganglia_monitor_process event id.
  382. test('Ganglia_Monitor_process_for_Slaves',
  383. 'Critical: Hadoop: ganglia_monitor_process# SERVICE MSG',
  384. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia Monitor process for Slaves', 'SERVICE MSG')
  385. test('Ganglia_Monitor_process_for_Slaves:OK',
  386. 'OK: Hadoop: ganglia_monitor_process_ok# SERVICE MSG',
  387. 'HARD', '1', 'OK', 'GANGLIA::Ganglia Monitor process for Slaves', 'SERVICE MSG')
  388. test('Ganglia_Monitor_process_for_NameNode',
  389. 'Critical: Hadoop: ganglia_monitor_process# SERVICE MSG',
  390. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia Monitor process for NameNode', 'SERVICE MSG')
  391. test('Ganglia_Monitor_process_for_NameNode:OK',
  392. 'OK: Hadoop: ganglia_monitor_process_ok# SERVICE MSG',
  393. 'HARD', '1', 'OK', 'GANGLIA::Ganglia Monitor process for NameNode', 'SERVICE MSG')
  394. test('Ganglia_Monitor_process_for_JobTracker',
  395. 'Critical: Hadoop: ganglia_monitor_process# SERVICE MSG',
  396. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia Monitor process for JobTracker', 'SERVICE MSG')
  397. test('Ganglia_Monitor_process_for_JobTracker:OK',
  398. 'OK: Hadoop: ganglia_monitor_process_ok# SERVICE MSG',
  399. 'HARD', '1', 'OK', 'GANGLIA::Ganglia Monitor process for JobTracker', 'SERVICE MSG')
  400. test('Ganglia_Monitor_process_for_HBase_Master',
  401. 'Critical: Hadoop: ganglia_monitor_process# SERVICE MSG',
  402. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia Monitor process for HBase Master', 'SERVICE MSG')
  403. test('Ganglia_Monitor_process_for_HBase_Master:OK',
  404. 'OK: Hadoop: ganglia_monitor_process_ok# SERVICE MSG',
  405. 'HARD', '1', 'OK', 'GANGLIA::Ganglia Monitor process for HBase Master', 'SERVICE MSG')
  406. test('Ganglia_Monitor_process_for_ResourceManager',
  407. 'Critical: Hadoop: ganglia_monitor_process# SERVICE MSG',
  408. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia Monitor process for ResourceManager', 'SERVICE MSG')
  409. test('Ganglia_Monitor_process_for_ResourceManager:OK',
  410. 'OK: Hadoop: ganglia_monitor_process_ok# SERVICE MSG',
  411. 'HARD', '1', 'OK', 'GANGLIA::Ganglia Monitor process for ResourceManager', 'SERVICE MSG')
  412. test('Ganglia_Monitor_process_for_HistoryServer',
  413. 'Critical: Hadoop: ganglia_monitor_process# SERVICE MSG',
  414. 'HARD', '1', 'CRITICAL', 'GANGLIA::Ganglia Monitor process for HistoryServer', 'SERVICE MSG')
  415. test('Ganglia_Monitor_process_for_HistoryServer:OK',
  416. 'OK: Hadoop: ganglia_monitor_process_ok# SERVICE MSG',
  417. 'HARD', '1', 'OK', 'GANGLIA::Ganglia Monitor process for HistoryServer', 'SERVICE MSG')
# HBase, Nagios, Flume, Oozie, Hive, WebHCat, YARN and HistoryServer checks,
# followed by the final report.
# Each call is test(label, expected log line, five arguments forwarded to
# sys_logger.generate_tvi_log_msg).
  418. test('HBase_Master_process',
  419. 'Critical: Hadoop: hbase_master_process# SERVICE MSG',
  420. 'HARD', '1', 'CRITICAL', 'HBASEMASTER::HBase Master process', 'SERVICE MSG')
  421. test('HBase_Master_process:OK',
  422. 'OK: Hadoop: hbase_master_process_ok# SERVICE MSG',
  423. 'HARD', '1', 'OK', 'HBASEMASTER::HBase Master process', 'SERVICE MSG')
  424. test('RegionServer_process',
  425. 'Critical: Hadoop: regionserver_process# SERVICE MSG',
  426. 'HARD', '1', 'CRITICAL', 'REGIONSERVER::RegionServer process', 'SERVICE MSG')
  427. test('RegionServer_process:OK',
  428. 'OK: Hadoop: regionserver_process_ok# SERVICE MSG',
  429. 'HARD', '1', 'OK', 'REGIONSERVER::RegionServer process', 'SERVICE MSG')
# 'NAGIOS::Nagios status log freshness' is expected to produce the
# nagios_process event id.
  430. test('Nagios_status_log_freshness',
  431. 'Critical: Hadoop: nagios_process# SERVICE MSG',
  432. 'HARD', '1', 'CRITICAL', 'NAGIOS::Nagios status log freshness', 'SERVICE MSG')
  433. test('Nagios_status_log_freshness:OK',
  434. 'OK: Hadoop: nagios_process_ok# SERVICE MSG',
  435. 'HARD', '1', 'OK', 'NAGIOS::Nagios status log freshness', 'SERVICE MSG')
  436. test('Flume_Agent_process',
  437. 'Critical: Hadoop: flume_agent_process# SERVICE MSG',
  438. 'HARD', '1', 'CRITICAL', 'FLUME::Flume Agent process', 'SERVICE MSG')
  439. test('Flume_Agent_process:OK',
  440. 'OK: Hadoop: flume_agent_process_ok# SERVICE MSG',
  441. 'HARD', '1', 'OK', 'FLUME::Flume Agent process', 'SERVICE MSG')
  442. test('Oozie_Server_status',
  443. 'Critical: Hadoop: oozie_server_process# SERVICE MSG',
  444. 'HARD', '1', 'CRITICAL', 'OOZIE::Oozie Server status', 'SERVICE MSG')
  445. test('Oozie_Server_status:OK',
  446. 'OK: Hadoop: oozie_server_process_ok# SERVICE MSG',
  447. 'HARD', '1', 'OK', 'OOZIE::Oozie Server status', 'SERVICE MSG')
  448. test('Hive_Metastore_status',
  449. 'Critical: Hadoop: hive_metastore_process# SERVICE MSG',
  450. 'HARD', '1', 'CRITICAL', 'HIVE-METASTORE::Hive Metastore status', 'SERVICE MSG')
  451. test('Hive_Metastore_status:OK',
  452. 'OK: Hadoop: hive_metastore_process_ok# SERVICE MSG',
  453. 'HARD', '1', 'OK', 'HIVE-METASTORE::Hive Metastore status', 'SERVICE MSG')
  454. test('WebHCat_Server_status',
  455. 'Critical: Hadoop: webhcat_down# SERVICE MSG',
  456. 'HARD', '1', 'CRITICAL', 'WEBHCAT::WebHCat Server status', 'SERVICE MSG')
  457. test('WebHCat_Server_status:OK',
  458. 'OK: Hadoop: webhcat_down_ok# SERVICE MSG',
  459. 'HARD', '1', 'OK', 'WEBHCAT::WebHCat Server status', 'SERVICE MSG')
  460. test('ResourceManager_process',
  461. 'Critical: Hadoop: resourcemanager_process_down# SERVICE MSG',
  462. 'HARD', '1', 'CRITICAL', 'RESOURCEMANAGER::ResourceManager process', 'SERVICE MSG')
  463. test('ResourceManager_process:OK',
  464. 'OK: Hadoop: resourcemanager_process_down_ok# SERVICE MSG',
  465. 'HARD', '1', 'OK', 'RESOURCEMANAGER::ResourceManager process', 'SERVICE MSG')
  466. test('AppTimeline_process',
  467. 'Critical: Hadoop: timelineserver_process# SERVICE MSG',
  468. 'HARD', '1', 'CRITICAL', 'APP_TIMELINE_SERVER::App Timeline Server process', 'SERVICE MSG')
  469. test('AppTimeline_process:OK',
  470. 'OK: Hadoop: timelineserver_process_ok# SERVICE MSG',
  471. 'HARD', '1', 'OK', 'APP_TIMELINE_SERVER::App Timeline Server process', 'SERVICE MSG')
  472. test('NodeManager_process',
  473. 'Critical: Hadoop: nodemanager_process_down# SERVICE MSG',
  474. 'HARD', '1', 'CRITICAL', 'NODEMANAGER::NodeManager process', 'SERVICE MSG')
  475. test('NodeManager_process:OK',
  476. 'OK: Hadoop: nodemanager_process_down_ok# SERVICE MSG',
  477. 'HARD', '1', 'OK', 'NODEMANAGER::NodeManager process', 'SERVICE MSG')
  478. test('NodeManager_health',
  479. 'Critical: Hadoop: nodemanager_health# SERVICE MSG',
  480. 'HARD', '1', 'CRITICAL', 'NODEMANAGER::NodeManager health', 'SERVICE MSG')
  481. test('NodeManager_health:OK',
  482. 'OK: Hadoop: nodemanager_health_ok# SERVICE MSG',
  483. 'HARD', '1', 'OK', 'NODEMANAGER::NodeManager health', 'SERVICE MSG')
# 'NODEMANAGER::Percent NodeManagers live' is expected to produce the
# nodemanagers_down event id.
  484. test('NodeManager_live',
  485. 'Critical: Hadoop: nodemanagers_down# SERVICE MSG',
  486. 'HARD', '1', 'CRITICAL', 'NODEMANAGER::Percent NodeManagers live', 'SERVICE MSG')
  487. test('NodeManager_live:OK',
  488. 'OK: Hadoop: nodemanagers_down_ok# SERVICE MSG',
  489. 'HARD', '1', 'OK', 'NODEMANAGER::Percent NodeManagers live', 'SERVICE MSG')
  490. test('HistoryServer_process',
  491. 'Critical: Hadoop: historyserver_process# SERVICE MSG',
  492. 'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer process', 'SERVICE MSG')
  493. test('HistoryServer_process:OK',
  494. 'OK: Hadoop: historyserver_process_ok# SERVICE MSG',
  495. 'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer process', 'SERVICE MSG')
  496. test('HistoryServer_RPC_latency',
  497. 'Critical: Hadoop: historyserver_rpc_latency# SERVICE MSG',
  498. 'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer RPC latency', 'SERVICE MSG')
  499. test('HistoryServer_RPC_latency:OK',
  500. 'OK: Hadoop: historyserver_rpc_latency_ok# SERVICE MSG',
  501. 'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer RPC latency', 'SERVICE MSG')
  502. test('HistoryServer_CPU_utilization',
  503. 'Critical: Hadoop: historyserver_cpu_utilization# SERVICE MSG',
  504. 'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer CPU utilization', 'SERVICE MSG')
  505. test('HistoryServer_CPU_utilization:OK',
  506. 'OK: Hadoop: historyserver_cpu_utilization_ok# SERVICE MSG',
  507. 'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer CPU utilization', 'SERVICE MSG')
  508. test('HistoryServer_Web_UI',
  509. 'Critical: Hadoop: historyserver_ui# SERVICE MSG',
  510. 'HARD', '1', 'CRITICAL', 'JOBHISTORY::HistoryServer Web UI', 'SERVICE MSG')
  511. test('HistoryServer_Web_UI:OK',
  512. 'OK: Hadoop: historyserver_ui_ok# SERVICE MSG',
  513. 'HARD', '1', 'OK', 'JOBHISTORY::HistoryServer Web UI', 'SERVICE MSG')
  514. test('ResourceManager_rpc_latency',
  515. 'Critical: Hadoop: resourcemanager_rpc_latency# SERVICE MSG',
  516. 'HARD', '1', 'CRITICAL', 'RESOURCEMANAGER::ResourceManager RPC latency', 'SERVICE MSG')
  517. test('ResourceManager_rpc_latency:OK',
  518. 'OK: Hadoop: resourcemanager_rpc_latency_ok# SERVICE MSG',
  519. 'HARD', '1', 'OK', 'RESOURCEMANAGER::ResourceManager RPC latency', 'SERVICE MSG')
  520. test('ResourceManager_cpu_utilization',
  521. 'Critical: Hadoop: resourcemanager_cpu_utilization# SERVICE MSG',
  522. 'HARD', '1', 'CRITICAL', 'RESOURCEMANAGER::ResourceManager CPU utilization', 'SERVICE MSG')
  523. test('ResourceManager_cpu_utilization:OK',
  524. 'OK: Hadoop: resourcemanager_cpu_utilization_ok# SERVICE MSG',
  525. 'HARD', '1', 'OK', 'RESOURCEMANAGER::ResourceManager CPU utilization', 'SERVICE MSG')
# Report the pass/fail totals accumulated by all the checks above.
  526. summary()