alertDefinitions.json 39 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244
  1. {
  2. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions?fields=*",
  3. "items" : [
  4. {
  5. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/1",
  6. "AlertDefinition" : {
  7. "cluster_name" : "tdk",
  8. "component_name" : "RESOURCEMANAGER",
  9. "enabled" : true,
  10. "id" : 1,
  11. "ignore_host" : false,
  12. "interval" : 5,
  13. "label" : "ResourceManager RPC Latency",
  14. "name" : "yarn_resourcemanager_rpc_latency",
  15. "scope" : "ANY",
  16. "service_name" : "YARN",
  17. "source" : {
  18. "jmx" : {
  19. "property_list" : [
  20. "Hadoop:service=ResourceManager,name=RpcActivityForPort*/RpcQueueTimeAvgTime",
  21. "Hadoop:service=ResourceManager,name=RpcActivityForPort*/RpcProcessingTimeAvgTime"
  22. ],
  23. "value" : "{0}"
  24. },
  25. "reporting" : {
  26. "ok" : {
  27. "text" : "Average Queue Time:[{0}], Average Processing Time:[{1}]"
  28. },
  29. "warning" : {
  30. "text" : "Average Queue Time:[{0}], Average Processing Time:[{1}]",
  31. "value" : 3000.0
  32. },
  33. "critical" : {
  34. "text" : "Average Queue Time:[{0}], Average Processing Time:[{1}]",
  35. "value" : 5000.0
  36. }
  37. },
  38. "type" : "METRIC",
  39. "uri" : {
  40. "http" : "{{yarn-site/yarn.resourcemanager.webapp.address}}",
  41. "https" : "{{yarn-site/yarn.resourcemanager.webapp.https.address}}",
  42. "https_property" : "{{yarn-site/yarn.http.policy}}",
  43. "https_property_value" : "HTTPS_ONLY",
  44. "default_port" : 0.0
  45. }
  46. }
  47. }
  48. },
  49. {
  50. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/2",
  51. "AlertDefinition" : {
  52. "cluster_name" : "tdk",
  53. "component_name" : "RESOURCEMANAGER",
  54. "enabled" : true,
  55. "id" : 2,
  56. "ignore_host" : false,
  57. "interval" : 1,
  58. "label" : "ResourceManager Web UI",
  59. "name" : "yarn_resourcemanager_webui",
  60. "scope" : "ANY",
  61. "service_name" : "YARN",
  62. "source" : {
  63. "reporting" : {
  64. "ok" : {
  65. "text" : "HTTP {0} response in {2:.4f} seconds"
  66. },
  67. "warning" : {
  68. "text" : "HTTP {0} response in {2:.4f} seconds"
  69. },
  70. "critical" : {
  71. "text" : "Connection failed to {1}"
  72. }
  73. },
  74. "type" : "WEB",
  75. "uri" : {
  76. "http" : "{{yarn-site/yarn.resourcemanager.webapp.address}}",
  77. "https" : "{{yarn-site/yarn.resourcemanager.webapp.https.address}}",
  78. "https_property" : "{{yarn-site/yarn.http.policy}}",
  79. "https_property_value" : "HTTPS_ONLY",
  80. "default_port" : 0.0
  81. }
  82. }
  83. }
  84. },
  85. {
  86. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/3",
  87. "AlertDefinition" : {
  88. "cluster_name" : "tdk",
  89. "component_name" : "APP_TIMELINE_SERVER",
  90. "enabled" : true,
  91. "id" : 3,
  92. "ignore_host" : false,
  93. "interval" : 1,
  94. "label" : "App Timeline Web UI",
  95. "name" : "yarn_app_timeline_server_webui",
  96. "scope" : "ANY",
  97. "service_name" : "YARN",
  98. "source" : {
  99. "reporting" : {
  100. "ok" : {
  101. "text" : "HTTP {0} response in {2:.4f} seconds"
  102. },
  103. "warning" : {
  104. "text" : "HTTP {0} response in {2:.4f} seconds"
  105. },
  106. "critical" : {
  107. "text" : "Connection failed to {1}"
  108. }
  109. },
  110. "type" : "WEB",
  111. "uri" : {
  112. "http" : "{{yarn-site/yarn.timeline-service.webapp.address}}",
  113. "https" : "{{yarn-site/yarn.timeline-service.webapp.https.address}}",
  114. "https_property" : "{{yarn-site/yarn.http.policy}}",
  115. "https_property_value" : "HTTPS_ONLY",
  116. "default_port" : 0.0
  117. }
  118. }
  119. }
  120. },
  121. {
  122. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/4",
  123. "AlertDefinition" : {
  124. "cluster_name" : "tdk",
  125. "component_name" : null,
  126. "description" : null,
  127. "enabled" : true,
  128. "id" : 4,
  129. "ignore_host" : false,
  130. "interval" : 1,
  131. "label" : "Percent NodeManagers Available",
  132. "name" : "yarn_nodemanager_webui_percent",
  133. "scope" : "SERVICE",
  134. "service_name" : "YARN",
  135. "source" : {
  136. "alert_name" : "yarn_nodemanager_webui",
  137. "reporting" : {
  138. "ok" : {
  139. "text" : "affected: [{1}], total: [{0}]"
  140. },
  141. "warning" : {
  142. "text" : "affected: [{1}], total: [{0}]",
  143. "value" : 0.1
  144. },
  145. "critical" : {
  146. "text" : "affected: [{1}], total: [{0}]",
  147. "value" : 0.3
  148. }
  149. },
  150. "type" : "AGGREGATE"
  151. }
  152. }
  153. },
  154. {
  155. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/5",
  156. "AlertDefinition" : {
  157. "cluster_name" : "tdk",
  158. "component_name" : "RESOURCEMANAGER",
  159. "enabled" : true,
  160. "id" : 5,
  161. "ignore_host" : false,
  162. "interval" : 5,
  163. "label" : "ResourceManager CPU Utilization",
  164. "name" : "yarn_resourcemanager_cpu",
  165. "scope" : "ANY",
  166. "service_name" : "YARN",
  167. "source" : {
  168. "jmx" : {
  169. "property_list" : [
  170. "java.lang:type=OperatingSystem/SystemCpuLoad",
  171. "java.lang:type=OperatingSystem/AvailableProcessors"
  172. ],
  173. "value" : "{0} * 100"
  174. },
  175. "reporting" : {
  176. "ok" : {
  177. "text" : "{1} CPU, load {0:.1%}"
  178. },
  179. "warning" : {
  180. "text" : "{1} CPU, load {0:.1%}",
  181. "value" : 200.0
  182. },
  183. "critical" : {
  184. "text" : "{1} CPU, load {0:.1%}",
  185. "value" : 250.0
  186. }
  187. },
  188. "type" : "METRIC",
  189. "uri" : {
  190. "http" : "{{yarn-site/yarn.resourcemanager.webapp.address}}",
  191. "https" : "{{yarn-site/yarn.resourcemanager.webapp.https.address}}",
  192. "https_property" : "{{yarn-site/yarn.http.policy}}",
  193. "https_property_value" : "HTTPS_ONLY",
  194. "default_port" : 0.0
  195. }
  196. }
  197. }
  198. },
  199. {
  200. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/6",
  201. "AlertDefinition" : {
  202. "cluster_name" : "tdk",
  203. "component_name" : "NODEMANAGER",
  204. "enabled" : true,
  205. "id" : 6,
  206. "ignore_host" : false,
  207. "interval" : 1,
  208. "label" : "NodeManager Health",
  209. "name" : "yarn_nodemanager_health",
  210. "scope" : "HOST",
  211. "service_name" : "YARN",
  212. "source" : {
  213. "path" : "HDP/2.0.6/services/YARN/package/files/alert_nodemanager_health.py",
  214. "type" : "SCRIPT"
  215. }
  216. }
  217. },
  218. {
  219. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/7",
  220. "AlertDefinition" : {
  221. "cluster_name" : "tdk",
  222. "component_name" : "NODEMANAGER",
  223. "enabled" : true,
  224. "id" : 7,
  225. "ignore_host" : false,
  226. "interval" : 1,
  227. "label" : "NodeManager Web UI",
  228. "name" : "yarn_nodemanager_webui",
  229. "scope" : "HOST",
  230. "service_name" : "YARN",
  231. "source" : {
  232. "reporting" : {
  233. "ok" : {
  234. "text" : "HTTP {0} response in {2:.4f} seconds"
  235. },
  236. "warning" : {
  237. "text" : "HTTP {0} response in {2:.4f} seconds"
  238. },
  239. "critical" : {
  240. "text" : "Connection failed to {1}"
  241. }
  242. },
  243. "type" : "WEB",
  244. "uri" : {
  245. "http" : "{{yarn-site/yarn.nodemanager.webapp.address}}",
  246. "https" : "{{yarn-site/yarn.nodemanager.webapp.https.address}}",
  247. "https_property" : "{{yarn-site/yarn.http.policy}}",
  248. "https_property_value" : "HTTPS_ONLY",
  249. "default_port" : 8042.0
  250. }
  251. }
  252. }
  253. },
  254. {
  255. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/8",
  256. "AlertDefinition" : {
  257. "cluster_name" : "tdk",
  258. "component_name" : "ZOOKEEPER_SERVER",
  259. "enabled" : true,
  260. "id" : 8,
  261. "ignore_host" : false,
  262. "interval" : 1,
  263. "label" : "ZooKeeper Server Process",
  264. "name" : "zookeeper_server_process",
  265. "scope" : "ANY",
  266. "service_name" : "ZOOKEEPER",
  267. "source" : {
  268. "default_port" : 2181.0,
  269. "reporting" : {
  270. "ok" : {
  271. "text" : "TCP OK - {0:.4f} response on port {1}"
  272. },
  273. "critical" : {
  274. "text" : "Connection failed: {0} to {1}:{2}"
  275. }
  276. },
  277. "type" : "PORT",
  278. "uri" : "{{zookeeper-env/clientPort}}"
  279. }
  280. }
  281. },
  282. {
  283. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/9",
  284. "AlertDefinition" : {
  285. "cluster_name" : "tdk",
  286. "component_name" : null,
  287. "enabled" : true,
  288. "id" : 9,
  289. "ignore_host" : false,
  290. "interval" : 1,
  291. "label" : "Percent ZooKeeper Servers Available",
  292. "name" : "zookeeper_server_process_percent",
  293. "scope" : "SERVICE",
  294. "service_name" : "ZOOKEEPER",
  295. "source" : {
  296. "alert_name" : "zookeeper_server_process",
  297. "reporting" : {
  298. "ok" : {
  299. "text" : "affected: [{1}], total: [{0}]"
  300. },
  301. "warning" : {
  302. "text" : "affected: [{1}], total: [{0}]",
  303. "value" : 0.35
  304. },
  305. "critical" : {
  306. "text" : "affected: [{1}], total: [{0}]",
  307. "value" : 0.7
  308. }
  309. },
  310. "type" : "AGGREGATE"
  311. }
  312. }
  313. },
  314. {
  315. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/10",
  316. "AlertDefinition" : {
  317. "cluster_name" : "tdk",
  318. "component_name" : "HISTORYSERVER",
  319. "enabled" : true,
  320. "id" : 10,
  321. "ignore_host" : false,
  322. "interval" : 1,
  323. "label" : "History Server Web UI",
  324. "name" : "mapreduce_history_server_webui",
  325. "scope" : "ANY",
  326. "service_name" : "MAPREDUCE2",
  327. "source" : {
  328. "reporting" : {
  329. "ok" : {
  330. "text" : "HTTP {0} response in {2:.4f} seconds"
  331. },
  332. "warning" : {
  333. "text" : "HTTP {0} response in {2:.4f} seconds"
  334. },
  335. "critical" : {
  336. "text" : "Connection failed to {1}"
  337. }
  338. },
  339. "type" : "WEB",
  340. "uri" : {
  341. "http" : "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
  342. "https" : "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}",
  343. "https_property" : "{{mapred-site/mapreduce.jobhistory.http.policy}}",
  344. "https_property_value" : "HTTPS_ONLY",
  345. "default_port" : 0.0
  346. }
  347. }
  348. }
  349. },
  350. {
  351. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/11",
  352. "AlertDefinition" : {
  353. "cluster_name" : "tdk",
  354. "component_name" : "HISTORYSERVER",
  355. "enabled" : true,
  356. "id" : 11,
  357. "ignore_host" : false,
  358. "interval" : 1,
  359. "label" : "History Server Process",
  360. "name" : "mapreduce_history_server_process",
  361. "scope" : "ANY",
  362. "service_name" : "MAPREDUCE2",
  363. "source" : {
  364. "default_port" : 19888.0,
  365. "reporting" : {
  366. "ok" : {
  367. "text" : "TCP OK - {0:.4f} response on port {1}"
  368. },
  369. "critical" : {
  370. "text" : "Connection failed: {0} to {1}:{2}"
  371. }
  372. },
  373. "type" : "PORT",
  374. "uri" : "{{mapred-site/mapreduce.jobhistory.webapp.address}}"
  375. }
  376. }
  377. },
  378. {
  379. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/12",
  380. "AlertDefinition" : {
  381. "cluster_name" : "tdk",
  382. "component_name" : "HISTORYSERVER",
  383. "enabled" : true,
  384. "id" : 12,
  385. "ignore_host" : false,
  386. "interval" : 5,
  387. "label" : "History Server RPC Latency",
  388. "name" : "mapreduce_history_server_rpc_latency",
  389. "scope" : "ANY",
  390. "service_name" : "MAPREDUCE2",
  391. "source" : {
  392. "jmx" : {
  393. "property_list" : [
  394. "Hadoop:service=JobHistoryServer,name=RpcActivityForPort*/RpcQueueTimeAvgTime",
  395. "Hadoop:service=JobHistoryServer,name=RpcActivityForPort*/RpcProcessingTimeAvgTime"
  396. ],
  397. "value" : "{0}"
  398. },
  399. "reporting" : {
  400. "ok" : {
  401. "text" : "Average Queue Time:[{0}], Average Processing Time:[{1}]"
  402. },
  403. "warning" : {
  404. "text" : "Average Queue Time:[{0}], Average Processing Time:[{1}]",
  405. "value" : 3000.0
  406. },
  407. "critical" : {
  408. "text" : "Average Queue Time:[{0}], Average Processing Time:[{1}]",
  409. "value" : 5000.0
  410. }
  411. },
  412. "type" : "METRIC",
  413. "uri" : {
  414. "http" : "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
  415. "https" : "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}",
  416. "https_property" : "{{mapred-site/mapreduce.jobhistory.http.policy}}",
  417. "https_property_value" : "HTTPS_ONLY",
  418. "default_port" : 0.0
  419. }
  420. }
  421. }
  422. },
  423. {
  424. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/13",
  425. "AlertDefinition" : {
  426. "cluster_name" : "tdk",
  427. "component_name" : "HISTORYSERVER",
  428. "enabled" : true,
  429. "id" : 13,
  430. "ignore_host" : false,
  431. "interval" : 5,
  432. "label" : "History Server CPU Utilization",
  433. "name" : "mapreduce_history_server_cpu",
  434. "scope" : "ANY",
  435. "service_name" : "MAPREDUCE2",
  436. "source" : {
  437. "jmx" : {
  438. "property_list" : [
  439. "java.lang:type=OperatingSystem/SystemCpuLoad",
  440. "java.lang:type=OperatingSystem/AvailableProcessors"
  441. ],
  442. "value" : "{0} * 100"
  443. },
  444. "reporting" : {
  445. "ok" : {
  446. "text" : "{1} CPU, load {0:.1%}"
  447. },
  448. "warning" : {
  449. "text" : "{1} CPU, load {0:.1%}",
  450. "value" : 200.0
  451. },
  452. "critical" : {
  453. "text" : "{1} CPU, load {0:.1%}",
  454. "value" : 250.0
  455. }
  456. },
  457. "type" : "METRIC",
  458. "uri" : {
  459. "http" : "{{mapred-site/mapreduce.jobhistory.webapp.address}}",
  460. "https" : "{{mapred-site/mapreduce.jobhistory.webapp.https.address}}",
  461. "https_property" : "{{mapred-site/mapreduce.jobhistory.http.policy}}",
  462. "https_property_value" : "HTTPS_ONLY",
  463. "default_port" : 0.0
  464. }
  465. }
  466. }
  467. },
  468. {
  469. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/14",
  470. "AlertDefinition" : {
  471. "cluster_name" : "tdk",
  472. "component_name" : "GANGLIA_SERVER",
  473. "enabled" : true,
  474. "id" : 14,
  475. "ignore_host" : false,
  476. "interval" : 1,
  477. "label" : "Ganglia History Server Process Monitor",
  478. "name" : "ganglia_monitor_mapreduce_history_server",
  479. "scope" : "ANY",
  480. "service_name" : "GANGLIA",
  481. "source" : {
  482. "default_port" : 8666.0,
  483. "reporting" : {
  484. "ok" : {
  485. "text" : "TCP OK - {0:.4f} response on port {1}"
  486. },
  487. "critical" : {
  488. "text" : "Connection failed: {0} to {1}:{2}"
  489. }
  490. },
  491. "type" : "PORT",
  492. "uri" : "8666"
  493. }
  494. }
  495. },
  496. {
  497. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/15",
  498. "AlertDefinition" : {
  499. "cluster_name" : "tdk",
  500. "component_name" : "GANGLIA_SERVER",
  501. "enabled" : true,
  502. "id" : 15,
  503. "ignore_host" : false,
  504. "interval" : 1,
  505. "label" : "Ganglia ResourceManager Process Monitor",
  506. "name" : "ganglia_monitor_yarn_resourcemanager",
  507. "scope" : "ANY",
  508. "service_name" : "GANGLIA",
  509. "source" : {
  510. "default_port" : 8664.0,
  511. "reporting" : {
  512. "ok" : {
  513. "text" : "TCP OK - {0:.4f} response on port {1}"
  514. },
  515. "critical" : {
  516. "text" : "Connection failed: {0} to {1}:{2}"
  517. }
  518. },
  519. "type" : "PORT",
  520. "uri" : "8664"
  521. }
  522. }
  523. },
  524. {
  525. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/16",
  526. "AlertDefinition" : {
  527. "cluster_name" : "tdk",
  528. "component_name" : "GANGLIA_SERVER",
  529. "enabled" : true,
  530. "id" : 16,
  531. "ignore_host" : false,
  532. "interval" : 1,
  533. "label" : "Ganglia NameNode Process Monitor",
  534. "name" : "ganglia_monitor_hdfs_namenode",
  535. "scope" : "ANY",
  536. "service_name" : "GANGLIA",
  537. "source" : {
  538. "default_port" : 8661.0,
  539. "reporting" : {
  540. "ok" : {
  541. "text" : "TCP OK - {0:.4f} response on port {1}"
  542. },
  543. "critical" : {
  544. "text" : "Connection failed: {0} to {1}:{2}"
  545. }
  546. },
  547. "type" : "PORT",
  548. "uri" : "8661"
  549. }
  550. }
  551. },
  552. {
  553. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/17",
  554. "AlertDefinition" : {
  555. "cluster_name" : "tdk",
  556. "component_name" : "GANGLIA_SERVER",
  557. "enabled" : true,
  558. "id" : 17,
  559. "ignore_host" : false,
  560. "interval" : 1,
  561. "label" : "Ganglia HBase Master Process Monitor",
  562. "name" : "ganglia_monitor_hbase_master",
  563. "scope" : "ANY",
  564. "service_name" : "GANGLIA",
  565. "source" : {
  566. "default_port" : 8663.0,
  567. "reporting" : {
  568. "ok" : {
  569. "text" : "TCP OK - {0:.4f} response on port {1}"
  570. },
  571. "critical" : {
  572. "text" : "Connection failed: {0} to {1}:{2}"
  573. }
  574. },
  575. "type" : "PORT",
  576. "uri" : "8663"
  577. }
  578. }
  579. },
  580. {
  581. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/18",
  582. "AlertDefinition" : {
  583. "cluster_name" : "tdk",
  584. "component_name" : "GANGLIA_SERVER",
  585. "enabled" : true,
  586. "id" : 18,
  587. "ignore_host" : false,
  588. "interval" : 1,
  589. "label" : "Ganglia Server Process",
  590. "name" : "ganglia_server_process",
  591. "scope" : "ANY",
  592. "service_name" : "GANGLIA",
  593. "source" : {
  594. "default_port" : 8651.0,
  595. "reporting" : {
  596. "ok" : {
  597. "text" : "TCP OK - {0:.4f} response on port {1}"
  598. },
  599. "critical" : {
  600. "text" : "Connection failed: {0} to {1}:{2}"
  601. }
  602. },
  603. "type" : "PORT",
  604. "uri" : "8651"
  605. }
  606. }
  607. },
  608. {
  609. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/19",
  610. "AlertDefinition" : {
  611. "cluster_name" : "tdk",
  612. "component_name" : "SECONDARY_NAMENODE",
  613. "enabled" : true,
  614. "id" : 19,
  615. "ignore_host" : false,
  616. "interval" : 1,
  617. "label" : "Secondary NameNode Process",
  618. "name" : "secondary_namenode_process",
  619. "scope" : "ANY",
  620. "service_name" : "HDFS",
  621. "source" : {
  622. "default_port" : 50071.0,
  623. "reporting" : {
  624. "ok" : {
  625. "text" : "TCP OK - {0:.4f} response on port {1}"
  626. },
  627. "critical" : {
  628. "text" : "Connection failed: {0} to {1}:{2}"
  629. }
  630. },
  631. "type" : "PORT",
  632. "uri" : "{{hdfs-site/dfs.namenode.secondary.http-address}}"
  633. }
  634. }
  635. },
  636. {
  637. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/20",
  638. "AlertDefinition" : {
  639. "cluster_name" : "tdk",
  640. "component_name" : "NAMENODE",
  641. "enabled" : true,
  642. "id" : 20,
  643. "ignore_host" : true,
  644. "interval" : 1,
  645. "label" : "NameNode High Availability Health",
  646. "name" : "namenode_ha_health",
  647. "scope" : "ANY",
  648. "service_name" : "HDFS",
  649. "source" : {
  650. "path" : "HDP/2.0.6/services/HDFS/package/files/alert_ha_namenode_health.py",
  651. "type" : "SCRIPT"
  652. }
  653. }
  654. },
  655. {
  656. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/21",
  657. "AlertDefinition" : {
  658. "cluster_name" : "tdk",
  659. "component_name" : "DATANODE",
  660. "enabled" : true,
  661. "id" : 21,
  662. "ignore_host" : false,
  663. "interval" : 1,
  664. "label" : "DataNode Web UI",
  665. "name" : "datanode_webui",
  666. "scope" : "HOST",
  667. "service_name" : "HDFS",
  668. "source" : {
  669. "reporting" : {
  670. "ok" : {
  671. "text" : "HTTP {0} response in {2:.4f} seconds"
  672. },
  673. "warning" : {
  674. "text" : "HTTP {0} response in {2:.4f} seconds"
  675. },
  676. "critical" : {
  677. "text" : "Connection failed to {1}"
  678. }
  679. },
  680. "type" : "WEB",
  681. "uri" : {
  682. "http" : "{{hdfs-site/dfs.datanode.http.address}}",
  683. "https" : "{{hdfs-site/dfs.datanode.https.address}}",
  684. "https_property" : "{{hdfs-site/dfs.http.policy}}",
  685. "https_property_value" : "HTTPS_ONLY",
  686. "default_port" : 0.0
  687. }
  688. }
  689. }
  690. },
  691. {
  692. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/22",
  693. "AlertDefinition" : {
  694. "cluster_name" : "tdk",
  695. "component_name" : "NAMENODE",
  696. "enabled" : true,
  697. "id" : 22,
  698. "ignore_host" : false,
  699. "interval" : 5,
  700. "label" : "NameNode Host CPU Utilization",
  701. "name" : "namenode_cpu",
  702. "scope" : "ANY",
  703. "service_name" : "HDFS",
  704. "source" : {
  705. "jmx" : {
  706. "property_list" : [
  707. "java.lang:type=OperatingSystem/SystemCpuLoad",
  708. "java.lang:type=OperatingSystem/AvailableProcessors"
  709. ],
  710. "value" : "{0} * 100"
  711. },
  712. "reporting" : {
  713. "ok" : {
  714. "text" : "{1} CPU, load {0:.1%}"
  715. },
  716. "warning" : {
  717. "text" : "{1} CPU, load {0:.1%}",
  718. "value" : 200.0
  719. },
  720. "critical" : {
  721. "text" : "{1} CPU, load {0:.1%}",
  722. "value" : 250.0
  723. }
  724. },
  725. "type" : "METRIC",
  726. "uri" : {
  727. "http" : "{{hdfs-site/dfs.namenode.http-address}}",
  728. "https" : "{{hdfs-site/dfs.namenode.https-address}}",
  729. "https_property" : "{{hdfs-site/dfs.http.policy}}",
  730. "https_property_value" : "HTTPS_ONLY",
  731. "default_port" : 0.0
  732. }
  733. }
  734. }
  735. },
  736. {
  737. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/23",
  738. "AlertDefinition" : {
  739. "cluster_name" : "tdk",
  740. "component_name" : "NAMENODE",
  741. "enabled" : true,
  742. "id" : 23,
  743. "ignore_host" : false,
  744. "interval" : 2,
  745. "label" : "NameNode RPC Latency",
  746. "name" : "namenode_rpc_latency",
  747. "scope" : "ANY",
  748. "service_name" : "HDFS",
  749. "source" : {
  750. "jmx" : {
  751. "property_list" : [
  752. "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcQueueTimeAvgTime",
  753. "Hadoop:service=NameNode,name=RpcActivityForPort*/RpcProcessingTimeAvgTime"
  754. ],
  755. "value" : "{0}"
  756. },
  757. "reporting" : {
  758. "ok" : {
  759. "text" : "Average Queue Time:[{0}], Average Processing Time:[{1}]"
  760. },
  761. "warning" : {
  762. "text" : "Average Queue Time:[{0}], Average Processing Time:[{1}]",
  763. "value" : 3000.0
  764. },
  765. "critical" : {
  766. "text" : "Average Queue Time:[{0}], Average Processing Time:[{1}]",
  767. "value" : 5000.0
  768. }
  769. },
  770. "type" : "METRIC",
  771. "uri" : {
  772. "http" : "{{hdfs-site/dfs.namenode.http-address}}",
  773. "https" : "{{hdfs-site/dfs.namenode.https-address}}",
  774. "https_property" : "{{hdfs-site/dfs.http.policy}}",
  775. "https_property_value" : "HTTPS_ONLY",
  776. "default_port" : 0.0
  777. }
  778. }
  779. }
  780. },
  781. {
  782. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/24",
  783. "AlertDefinition" : {
  784. "cluster_name" : "tdk",
  785. "component_name" : "NAMENODE",
  786. "enabled" : true,
  787. "id" : 24,
  788. "ignore_host" : false,
  789. "interval" : 2,
  790. "label" : "NameNode Blocks Health",
  791. "name" : "namenode_hdfs_blocks_health",
  792. "scope" : "ANY",
  793. "service_name" : "HDFS",
  794. "source" : {
  795. "jmx" : {
  796. "property_list" : [
  797. "Hadoop:service=NameNode,name=FSNamesystem/MissingBlocks",
  798. "Hadoop:service=NameNode,name=FSNamesystem/BlocksTotal"
  799. ],
  800. "value" : "{0}"
  801. },
  802. "reporting" : {
  803. "ok" : {
  804. "text" : "Total Blocks:[{1}], Missing Blocks:[{0}]"
  805. },
  806. "warning" : {
  807. "text" : "Total Blocks:[{1}], Missing Blocks:[{0}]",
  808. "value" : 1.0
  809. },
  810. "critical" : {
  811. "text" : "Total Blocks:[{1}], Missing Blocks:[{0}]",
  812. "value" : 1.0
  813. }
  814. },
  815. "type" : "METRIC",
  816. "uri" : {
  817. "http" : "{{hdfs-site/dfs.namenode.http-address}}",
  818. "https" : "{{hdfs-site/dfs.namenode.https-address}}",
  819. "https_property" : "{{hdfs-site/dfs.http.policy}}",
  820. "https_property_value" : "HTTPS_ONLY",
  821. "default_port" : 0.0
  822. }
  823. }
  824. }
  825. },
  826. {
  827. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/25",
  828. "AlertDefinition" : {
  829. "cluster_name" : "tdk",
  830. "component_name" : "NAMENODE",
  831. "enabled" : true,
  832. "id" : 25,
  833. "ignore_host" : false,
  834. "interval" : 1,
  835. "label" : "NameNode Web UI",
  836. "name" : "namenode_webui",
  837. "scope" : "ANY",
  838. "service_name" : "HDFS",
  839. "source" : {
  840. "reporting" : {
  841. "ok" : {
  842. "text" : "HTTP {0} response in {2:.4f} seconds"
  843. },
  844. "warning" : {
  845. "text" : "HTTP {0} response in {2:.4f} seconds"
  846. },
  847. "critical" : {
  848. "text" : "Connection failed to {1}"
  849. }
  850. },
  851. "type" : "WEB",
  852. "uri" : {
  853. "http" : "{{hdfs-site/dfs.namenode.http-address}}",
  854. "https" : "{{hdfs-site/dfs.namenode.https-address}}",
  855. "https_property" : "{{hdfs-site/dfs.http.policy}}",
  856. "https_property_value" : "HTTPS_ONLY",
  857. "default_port" : 0.0
  858. }
  859. }
  860. }
  861. },
  862. {
  863. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/26",
  864. "AlertDefinition" : {
  865. "cluster_name" : "tdk",
  866. "component_name" : "DATANODE",
  867. "enabled" : true,
  868. "id" : 26,
  869. "ignore_host" : false,
  870. "interval" : 1,
  871. "label" : "DataNode Process",
  872. "name" : "datanode_process",
  873. "scope" : "HOST",
  874. "service_name" : "HDFS",
  875. "source" : {
  876. "default_port" : 50010.0,
  877. "reporting" : {
  878. "ok" : {
  879. "text" : "TCP OK - {0:.4f} response on port {1}"
  880. },
  881. "critical" : {
  882. "text" : "Connection failed: {0} to {1}:{2}"
  883. }
  884. },
  885. "type" : "PORT",
  886. "uri" : "{{hdfs-site/dfs.datanode.address}}"
  887. }
  888. }
  889. },
  890. {
  891. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/27",
  892. "AlertDefinition" : {
  893. "cluster_name" : "tdk",
  894. "component_name" : null,
  895. "enabled" : true,
  896. "id" : 27,
  897. "ignore_host" : false,
  898. "interval" : 1,
  899. "label" : "Percent DataNodes Available",
  900. "name" : "datanode_process_percent",
  901. "scope" : "SERVICE",
  902. "service_name" : "HDFS",
  903. "source" : {
  904. "alert_name" : "datanode_process",
  905. "reporting" : {
  906. "ok" : {
  907. "text" : "affected: [{1}], total: [{0}]"
  908. },
  909. "warning" : {
  910. "text" : "affected: [{1}], total: [{0}]",
  911. "value" : 0.1
  912. },
  913. "critical" : {
  914. "text" : "affected: [{1}], total: [{0}]",
  915. "value" : 0.3
  916. }
  917. },
  918. "type" : "AGGREGATE"
  919. }
  920. }
  921. },
  922. {
  923. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/28",
  924. "AlertDefinition" : {
  925. "cluster_name" : "tdk",
  926. "component_name" : "NAMENODE",
  927. "enabled" : true,
  928. "id" : 28,
  929. "ignore_host" : false,
  930. "interval" : 1,
  931. "label" : "NameNode Process",
  932. "name" : "namenode_process",
  933. "scope" : "ANY",
  934. "service_name" : "HDFS",
  935. "source" : {
  936. "default_port" : 50070.0,
  937. "reporting" : {
  938. "ok" : {
  939. "text" : "TCP OK - {0:.4f} response on port {1}"
  940. },
  941. "critical" : {
  942. "text" : "Connection failed: {0} to {1}:{2}"
  943. }
  944. },
  945. "type" : "PORT",
  946. "uri" : "{{hdfs-site/dfs.namenode.http-address}}"
  947. }
  948. }
  949. },
  950. {
  951. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/29",
  952. "AlertDefinition" : {
  953. "cluster_name" : "tdk",
  954. "component_name" : "NAMENODE",
  955. "enabled" : true,
  956. "id" : 29,
  957. "ignore_host" : false,
  958. "interval" : 2,
  959. "label" : "HDFS Capacity Utilization",
  960. "name" : "namenode_hdfs_capacity_utilization",
  961. "scope" : "ANY",
  962. "service_name" : "HDFS",
  963. "source" : {
  964. "jmx" : {
  965. "property_list" : [
  966. "Hadoop:service=NameNode,name=FSNamesystemState/CapacityUsed",
  967. "Hadoop:service=NameNode,name=FSNamesystemState/CapacityRemaining"
  968. ],
  969. "value" : "{0}/({0} + {1}) * 100"
  970. },
  971. "reporting" : {
  972. "ok" : {
  973. "text" : "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]"
  974. },
  975. "warning" : {
  976. "text" : "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]",
  977. "value" : 80.0
  978. },
  979. "critical" : {
  980. "text" : "Capacity Used:[{2:d}%, {0}], Capacity Remaining:[{1}]",
  981. "value" : 90.0
  982. }
  983. },
  984. "type" : "METRIC",
  985. "uri" : {
  986. "http" : "{{hdfs-site/dfs.namenode.http-address}}",
  987. "https" : "{{hdfs-site/dfs.namenode.https-address}}",
  988. "https_property" : "{{hdfs-site/dfs.http.policy}}",
  989. "https_property_value" : "HTTPS_ONLY",
  990. "default_port" : 0.0
  991. }
  992. }
  993. }
  994. },
  995. {
  996. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/30",
  997. "AlertDefinition" : {
  998. "cluster_name" : "tdk",
  999. "component_name" : "NAMENODE",
  1000. "enabled" : true,
  1001. "id" : 30,
  1002. "ignore_host" : false,
  1003. "interval" : 1,
  1004. "label" : "NameNode Last Checkpoint",
  1005. "name" : "namenode_last_checkpoint",
  1006. "scope" : "ANY",
  1007. "service_name" : "HDFS",
  1008. "source" : {
  1009. "path" : "HDP/2.0.6/services/HDFS/package/files/alert_checkpoint_time.py",
  1010. "type" : "SCRIPT"
  1011. }
  1012. }
  1013. },
  1014. {
  1015. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/31",
  1016. "AlertDefinition" : {
  1017. "cluster_name" : "tdk",
  1018. "component_name" : "JOURNALNODE",
  1019. "enabled" : true,
  1020. "id" : 31,
  1021. "ignore_host" : false,
  1022. "interval" : 1,
  1023. "label" : "JournalNode Process",
  1024. "name" : "journalnode_process",
  1025. "scope" : "HOST",
  1026. "service_name" : "HDFS",
  1027. "source" : {
  1028. "default_port" : 8480.0,
  1029. "reporting" : {
  1030. "ok" : {
  1031. "text" : "TCP OK - {0:.4f} response on port {1}"
  1032. },
  1033. "critical" : {
  1034. "text" : "Connection failed: {0} to {1}:{2}"
  1035. }
  1036. },
  1037. "type" : "PORT",
  1038. "uri" : "{{hdfs-site/dfs.journalnode.http-address}}"
  1039. }
  1040. }
  1041. },
  1042. {
  1043. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/32",
  1044. "AlertDefinition" : {
  1045. "cluster_name" : "tdk",
  1046. "component_name" : "ZKFC",
  1047. "enabled" : true,
  1048. "id" : 32,
  1049. "ignore_host" : false,
  1050. "interval" : 1,
  1051. "label" : "ZooKeeper Failover Controller Process",
  1052. "name" : "hdfs_zookeeper_failover_controller_process",
  1053. "scope" : "ANY",
  1054. "service_name" : "HDFS",
  1055. "source" : {
  1056. "default_port" : 2181.0,
  1057. "reporting" : {
  1058. "ok" : {
  1059. "text" : "TCP OK - {0:.4f} response on port {1}"
  1060. },
  1061. "critical" : {
  1062. "text" : "Connection failed: {0} on host {1}:{2}"
  1063. }
  1064. },
  1065. "type" : "PORT",
  1066. "uri" : "{{core-site/ha.zookeeper.quorum}}"
  1067. }
  1068. }
  1069. },
  1070. {
  1071. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/33",
  1072. "AlertDefinition" : {
  1073. "cluster_name" : "tdk",
  1074. "component_name" : "NAMENODE",
  1075. "enabled" : true,
  1076. "id" : 33,
  1077. "ignore_host" : false,
  1078. "interval" : 1,
  1079. "label" : "NameNode Directory Status",
  1080. "name" : "namenode_directory_status",
  1081. "scope" : "ANY",
  1082. "service_name" : "HDFS",
  1083. "source" : {
  1084. "jmx" : {
  1085. "property_list" : [
  1086. "Hadoop:service=NameNode,name=NameNodeInfo/NameDirStatuses"
  1087. ],
  1088. "value" : "calculate(args)\ndef calculate(args):\n import json\n json_statuses = json.loads({0})\n return len(json_statuses['failed']) if 'failed' in json_statuses else 0"
  1089. },
  1090. "reporting" : {
  1091. "ok" : {
  1092. "text" : "Directories are healthy"
  1093. },
  1094. "warning" : {
  1095. "text" : "Failed directory count: {1}",
  1096. "value" : 1.0
  1097. },
  1098. "critical" : {
  1099. "text" : "Failed directory count: {1}",
  1100. "value" : 1.0
  1101. }
  1102. },
  1103. "type" : "METRIC",
  1104. "uri" : {
  1105. "http" : "{{hdfs-site/dfs.namenode.http-address}}",
  1106. "https" : "{{hdfs-site/dfs.namenode.https-address}}",
  1107. "https_property" : "{{hdfs-site/dfs.http.policy}}",
  1108. "https_property_value" : "HTTPS_ONLY",
  1109. "default_port" : 0.0
  1110. }
  1111. }
  1112. }
  1113. },
  1114. {
  1115. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/34",
  1116. "AlertDefinition" : {
  1117. "cluster_name" : "tdk",
  1118. "component_name" : "DATANODE",
  1119. "enabled" : true,
  1120. "id" : 34,
  1121. "ignore_host" : false,
  1122. "interval" : 2,
  1123. "label" : "DataNode Storage",
  1124. "name" : "datanode_storage",
  1125. "scope" : "HOST",
  1126. "service_name" : "HDFS",
  1127. "source" : {
  1128. "jmx" : {
  1129. "property_list" : [
  1130. "Hadoop:service=DataNode,name=FSDatasetState-*/Remaining",
  1131. "Hadoop:service=DataNode,name=FSDatasetState-*/Capacity"
  1132. ],
  1133. "value" : "({1} - {0})/{1} * 100"
  1134. },
  1135. "reporting" : {
  1136. "ok" : {
  1137. "text" : "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]"
  1138. },
  1139. "warning" : {
  1140. "text" : "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]",
  1141. "value" : 80.0
  1142. },
  1143. "critical" : {
  1144. "text" : "Remaining Capacity:[{0}], Total Capacity:[{2:d}% Used, {1}]",
  1145. "value" : 90.0
  1146. }
  1147. },
  1148. "type" : "METRIC",
  1149. "uri" : {
  1150. "http" : "{{hdfs-site/dfs.datanode.http.address}}",
  1151. "https" : "{{hdfs-site/dfs.datanode.https.address}}",
  1152. "https_property" : "{{hdfs-site/dfs.http.policy}}",
  1153. "https_property_value" : "HTTPS_ONLY",
  1154. "default_port" : 0.0
  1155. }
  1156. }
  1157. }
  1158. },
  1159. {
  1160. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/35",
  1161. "AlertDefinition" : {
  1162. "cluster_name" : "tdk",
  1163. "component_name" : null,
  1164. "enabled" : true,
  1165. "id" : 35,
  1166. "ignore_host" : false,
  1167. "interval" : 1,
  1168. "label" : "Percent DataNodes With Available Space",
  1169. "name" : "datanode_storage_percent",
  1170. "scope" : "SERVICE",
  1171. "service_name" : "HDFS",
  1172. "source" : {
  1173. "alert_name" : "datanode_storage",
  1174. "reporting" : {
  1175. "ok" : {
  1176. "text" : "affected: [{1}], total: [{0}]"
  1177. },
  1178. "warning" : {
  1179. "text" : "affected: [{1}], total: [{0}]",
  1180. "value" : 0.1
  1181. },
  1182. "critical" : {
  1183. "text" : "affected: [{1}], total: [{0}]",
  1184. "value" : 0.3
  1185. }
  1186. },
  1187. "type" : "AGGREGATE"
  1188. }
  1189. }
  1190. },
  1191. {
  1192. "href" : "http://host:8080/api/v1/clusters/tdk/alert_definitions/36",
  1193. "AlertDefinition" : {
  1194. "cluster_name" : "tdk",
  1195. "component_name" : null,
  1196. "enabled" : true,
  1197. "id" : 36,
  1198. "ignore_host" : false,
  1199. "interval" : 1,
  1200. "label" : "Percent JournalNodes Available",
  1201. "name" : "journalnode_process_percent",
  1202. "scope" : "SERVICE",
  1203. "service_name" : "HDFS",
  1204. "source" : {
  1205. "alert_name" : "journalnode_process",
  1206. "reporting" : {
  1207. "ok" : {
  1208. "text" : "affected: [{1}], total: [{0}]"
  1209. },
  1210. "warning" : {
  1211. "text" : "affected: [{1}], total: [{0}]",
  1212. "value" : 0.33
  1213. },
  1214. "critical" : {
  1215. "text" : "affected: [{1}], total: [{0}]",
  1216. "value" : 0.5
  1217. }
  1218. },
  1219. "type" : "AGGREGATE"
  1220. }
  1221. }
  1222. },
  1223. {
  1224. "href" : "http://host:8080/api/v1/clusters/c1/alert_definitions/37",
  1225. "AlertDefinition" : {
  1226. "cluster_name" : "c1",
  1227. "component_name" : "AMBARI_AGENT",
  1228. "description" : null,
  1229. "enabled" : true,
  1230. "id" : 37,
  1231. "ignore_host" : false,
  1232. "interval" : 1,
  1233. "label" : "Ambari Agent Disk Usage",
  1234. "name" : "ambari_agent_disk_usage",
  1235. "scope" : "HOST",
  1236. "service_name" : "AMBARI",
  1237. "source" : {
  1238. "path" : "alert_disk_space.py",
  1239. "type" : "SCRIPT"
  1240. }
  1241. }
  1242. }
  1243. ]
  1244. }