|
@@ -298,11 +298,11 @@
|
|
|
},
|
|
|
"warning": {
|
|
|
"text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]",
|
|
|
- "value": 75
|
|
|
+ "value": 80
|
|
|
},
|
|
|
"critical": {
|
|
|
"text": "Capacity Used:[{2:.0f}%, {0}], Capacity Remaining:[{1}]",
|
|
|
- "value": 80
|
|
|
+ "value": 90
|
|
|
},
|
|
|
"units" : "%"
|
|
|
},
|
|
@@ -522,12 +522,12 @@
|
|
|
}
|
|
|
},
|
|
|
{
|
|
|
- "name": "namenode_service_rpc_queue_latency_hourly",
|
|
|
- "label": "NameNode Service RPC Queue Latency (Hourly)",
|
|
|
- "description": "This service-level alert is triggered if the deviation of RPC queue latency on datanode port has grown beyond the specified threshold within a given time interval.",
|
|
|
+ "name": "increase_nn_heap_usage_hourly",
|
|
|
+ "label": "NameNode Heap Usage (Hourly)",
|
|
|
+ "description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
|
|
|
"interval": 5,
|
|
|
"scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
+ "enabled": false,
|
|
|
"source": {
|
|
|
"type": "SCRIPT",
|
|
|
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
@@ -556,7 +556,7 @@
|
|
|
{
|
|
|
"name": "metricName",
|
|
|
"display_name": "Metric Name",
|
|
|
- "value": "rpc.rpc.datanode.RpcQueueTimeAvgTime",
|
|
|
+ "value": "jvm.JvmMetrics.MemHeapUsedM",
|
|
|
"type": "STRING",
|
|
|
"description": "The metric to monitor."
|
|
|
},
|
|
@@ -575,24 +575,17 @@
|
|
|
"units": "%",
|
|
|
"value": 200,
|
|
|
"threshold": "CRITICAL"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "minimumValue",
|
|
|
- "display_name": "Minimum Latency (in seconds)",
|
|
|
- "value": 30,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Minimum latency time to measure (in seconds)."
|
|
|
}
|
|
|
]
|
|
|
}
|
|
|
},
|
|
|
{
|
|
|
- "name": "namenode_client_rpc_queue_latency_hourly",
|
|
|
- "label": "NameNode Client RPC Queue Latency (Hourly)",
|
|
|
- "description": "This service-level alert is triggered if the deviation of RPC queue latency on client port has grown beyond the specified threshold within a given time interval.",
|
|
|
+ "name": "namenode_service_rpc_latency_hourly",
|
|
|
+ "label": "NameNode RPC Latency (Hourly)",
|
|
|
+ "description": "This service-level alert is triggered if the Service-RPC latency deviation has grown beyond the specified threshold within a given time interval.",
|
|
|
"interval": 5,
|
|
|
"scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
+ "enabled": false,
|
|
|
"source": {
|
|
|
"type": "SCRIPT",
|
|
|
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
@@ -621,7 +614,7 @@
|
|
|
{
|
|
|
"name": "metricName",
|
|
|
"display_name": "Metric Name",
|
|
|
- "value": "rpc.rpc.client.RpcQueueTimeAvgTime",
|
|
|
+ "value": "rpc.rpc.RpcProcessingTimeAvgTime",
|
|
|
"type": "STRING",
|
|
|
"description": "The metric to monitor."
|
|
|
},
|
|
@@ -640,24 +633,17 @@
|
|
|
"units": "%",
|
|
|
"value": 200,
|
|
|
"threshold": "CRITICAL"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "minimumValue",
|
|
|
- "display_name": "Minimum Latency (in seconds)",
|
|
|
- "value": 30,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Minimum latency time to measure (in seconds)."
|
|
|
}
|
|
|
]
|
|
|
}
|
|
|
},
|
|
|
{
|
|
|
- "name": "namenode_service_rpc_processing_latency_hourly",
|
|
|
- "label": "NameNode Service RPC Processing Latency (Hourly)",
|
|
|
- "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a given time interval.",
|
|
|
+ "name": "namenode_increase_in_storage_capacity_usage_hourly",
|
|
|
+ "label": "HDFS Storage Capacity Usage (Hourly)",
|
|
|
+ "description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
|
|
|
"interval": 5,
|
|
|
"scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
+ "enabled": false,
|
|
|
"source": {
|
|
|
"type": "SCRIPT",
|
|
|
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
@@ -665,7 +651,7 @@
|
|
|
{
|
|
|
"name": "mergeHaMetrics",
|
|
|
"display_name": "Whether active and stanby NameNodes metrics should be merged",
|
|
|
- "value": "false",
|
|
|
+ "value": "true",
|
|
|
"type": "STRING",
|
|
|
"description": "Whether active and stanby NameNodes metrics should be merged."
|
|
|
},
|
|
@@ -686,7 +672,7 @@
|
|
|
{
|
|
|
"name": "metricName",
|
|
|
"display_name": "Metric Name",
|
|
|
- "value": "rpc.rpc.datanode.RpcProcessingTimeAvgTime",
|
|
|
+ "value": "dfs.FSNamesystem.CapacityUsed",
|
|
|
"type": "STRING",
|
|
|
"description": "The metric to monitor."
|
|
|
},
|
|
@@ -705,78 +691,6 @@
|
|
|
"units": "%",
|
|
|
"value": 200,
|
|
|
"threshold": "CRITICAL"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "minimumValue",
|
|
|
- "display_name": "Minimum Latency (in seconds)",
|
|
|
- "value": 30,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Minimum latency time to measure (in seconds)."
|
|
|
- }
|
|
|
- ]
|
|
|
- }
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "namenode_client_rpc_processing_latency_hourly",
|
|
|
- "label": "NameNode Client RPC Processing Latency (Hourly)",
|
|
|
- "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a given time interval.",
|
|
|
- "interval": 5,
|
|
|
- "scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
- "source": {
|
|
|
- "type": "SCRIPT",
|
|
|
- "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
|
- "parameters": [
|
|
|
- {
|
|
|
- "name": "mergeHaMetrics",
|
|
|
- "display_name": "Whether active and stanby NameNodes metrics should be merged",
|
|
|
- "value": "false",
|
|
|
- "type": "STRING",
|
|
|
- "description": "Whether active and stanby NameNodes metrics should be merged."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "interval",
|
|
|
- "display_name": "Time interval in minutes",
|
|
|
- "value": 60,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Time interval in minutes."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "appId",
|
|
|
- "display_name": "AMS application id",
|
|
|
- "value": "NAMENODE",
|
|
|
- "type": "STRING",
|
|
|
- "description": "The application id used to retrieve the metric."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "metricName",
|
|
|
- "display_name": "Metric Name",
|
|
|
- "value": "rpc.rpc.client.RpcProcessingTimeAvgTime",
|
|
|
- "type": "STRING",
|
|
|
- "description": "The metric to monitor."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "metric.deviation.warning.threshold",
|
|
|
- "display_name": "The standard deviation threshold above which a warning is produced.",
|
|
|
- "type": "PERCENT",
|
|
|
- "units": "%",
|
|
|
- "value": 100,
|
|
|
- "threshold": "WARNING"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "metric.deviation.critical.threshold",
|
|
|
- "display_name": "The standard deviation threshold above which a critical alert is produced.",
|
|
|
- "type": "PERCENT",
|
|
|
- "units": "%",
|
|
|
- "value": 200,
|
|
|
- "threshold": "CRITICAL"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "minimumValue",
|
|
|
- "display_name": "Minimum Latency (in seconds)",
|
|
|
- "value": 30,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Minimum latency time to measure (in seconds)."
|
|
|
}
|
|
|
]
|
|
|
}
|
|
@@ -787,7 +701,7 @@
|
|
|
"description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
|
|
|
"interval": 480,
|
|
|
"scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
+ "enabled": false,
|
|
|
"source": {
|
|
|
"type": "SCRIPT",
|
|
|
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
@@ -820,64 +734,6 @@
|
|
|
"type": "STRING",
|
|
|
"description": "The metric to monitor."
|
|
|
},
|
|
|
- {
|
|
|
- "name": "metric.deviation.warning.threshold",
|
|
|
- "display_name": "The standard deviation threshold above which a warning is produced.",
|
|
|
- "type": "PERCENT",
|
|
|
- "units": "%",
|
|
|
- "value": 20,
|
|
|
- "threshold": "WARNING"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "metric.deviation.critical.threshold",
|
|
|
- "display_name": "The standard deviation threshold above which a critical alert is produced.",
|
|
|
- "type": "PERCENT",
|
|
|
- "units": "%",
|
|
|
- "value": 50,
|
|
|
- "threshold": "CRITICAL"
|
|
|
- }
|
|
|
- ]
|
|
|
- }
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "namenode_service_rpc_processing_latency_daily",
|
|
|
- "label": "NameNode Service RPC Processing Latency (Daily)",
|
|
|
- "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a given time interval.",
|
|
|
- "interval": 480,
|
|
|
- "scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
- "source": {
|
|
|
- "type": "SCRIPT",
|
|
|
- "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
|
- "parameters": [
|
|
|
- {
|
|
|
- "name": "mergeHaMetrics",
|
|
|
- "display_name": "Whether active and stanby NameNodes metrics should be merged",
|
|
|
- "value": "false",
|
|
|
- "type": "STRING",
|
|
|
- "description": "Whether active and stanby NameNodes metrics should be merged."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "interval",
|
|
|
- "display_name": "Time interval in minutes",
|
|
|
- "value": 1440,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Time interval in minutes."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "appId",
|
|
|
- "display_name": "AMS application id",
|
|
|
- "value": "NAMENODE",
|
|
|
- "type": "STRING",
|
|
|
- "description": "The application id used to retrieve the metric."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "metricName",
|
|
|
- "display_name": "Metric Name",
|
|
|
- "value": "rpc.rpc.datanode.RpcProcessingTimeAvgTime",
|
|
|
- "type": "STRING",
|
|
|
- "description": "The metric to monitor."
|
|
|
- },
|
|
|
{
|
|
|
"name": "metric.deviation.warning.threshold",
|
|
|
"display_name": "The standard deviation threshold above which a warning is produced.",
|
|
@@ -893,24 +749,17 @@
|
|
|
"units": "%",
|
|
|
"value": 200,
|
|
|
"threshold": "CRITICAL"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "minimumValue",
|
|
|
- "display_name": "Minimum Latency (in seconds)",
|
|
|
- "value": 30,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Minimum latency time to measure (in seconds)."
|
|
|
}
|
|
|
]
|
|
|
}
|
|
|
},
|
|
|
{
|
|
|
- "name": "namenode_client_rpc_processing_latency_daily",
|
|
|
- "label": "NameNode Client RPC Processing Latency (Daily)",
|
|
|
- "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a given time interval.",
|
|
|
+ "name": "namenode_service_rpc_latency_daily",
|
|
|
+ "label": "NameNode RPC Latency (Daily)",
|
|
|
+ "description": "This service-level alert is triggered if the Service-RPC latency deviation has grown beyond the specified threshold within a given time interval.",
|
|
|
"interval": 480,
|
|
|
"scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
+ "enabled": false,
|
|
|
"source": {
|
|
|
"type": "SCRIPT",
|
|
|
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
@@ -939,7 +788,7 @@
|
|
|
{
|
|
|
"name": "metricName",
|
|
|
"display_name": "Metric Name",
|
|
|
- "value": "rpc.rpc.client.RpcProcessingTimeAvgTime",
|
|
|
+ "value": "rpc.rpc.RpcProcessingTimeAvgTime",
|
|
|
"type": "STRING",
|
|
|
"description": "The metric to monitor."
|
|
|
},
|
|
@@ -958,143 +807,6 @@
|
|
|
"units": "%",
|
|
|
"value": 200,
|
|
|
"threshold": "CRITICAL"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "minimumValue",
|
|
|
- "display_name": "Minimum Latency (in seconds)",
|
|
|
- "value": 30,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Minimum latency time to measure (in seconds)."
|
|
|
- }
|
|
|
- ]
|
|
|
- }
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "namenode_service_rpc_queue_latency_daily",
|
|
|
- "label": "NameNode Service RPC Queue Latency (Daily)",
|
|
|
- "description": "This service-level alert is triggered if the deviation of RPC latency on datanode port has grown beyond the specified threshold within a given time interval.",
|
|
|
- "interval": 480,
|
|
|
- "scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
- "source": {
|
|
|
- "type": "SCRIPT",
|
|
|
- "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
|
- "parameters": [
|
|
|
- {
|
|
|
- "name": "mergeHaMetrics",
|
|
|
- "display_name": "Whether active and stanby NameNodes metrics should be merged",
|
|
|
- "value": "false",
|
|
|
- "type": "STRING",
|
|
|
- "description": "Whether active and stanby NameNodes metrics should be merged."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "interval",
|
|
|
- "display_name": "Time interval in minutes",
|
|
|
- "value": 1440,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Time interval in minutes."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "appId",
|
|
|
- "display_name": "AMS application id",
|
|
|
- "value": "NAMENODE",
|
|
|
- "type": "STRING",
|
|
|
- "description": "The application id used to retrieve the metric."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "metricName",
|
|
|
- "display_name": "Metric Name",
|
|
|
- "value": "rpc.rpc.datanode.RpcQueueTimeAvgTime",
|
|
|
- "type": "STRING",
|
|
|
- "description": "The metric to monitor."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "metric.deviation.warning.threshold",
|
|
|
- "display_name": "The standard deviation threshold above which a warning is produced.",
|
|
|
- "type": "PERCENT",
|
|
|
- "units": "%",
|
|
|
- "value": 100,
|
|
|
- "threshold": "WARNING"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "metric.deviation.critical.threshold",
|
|
|
- "display_name": "The standard deviation threshold above which a critical alert is produced.",
|
|
|
- "type": "PERCENT",
|
|
|
- "units": "%",
|
|
|
- "value": 200,
|
|
|
- "threshold": "CRITICAL"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "minimumValue",
|
|
|
- "display_name": "Minimum Latency (in seconds)",
|
|
|
- "value": 30,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Minimum latency time to measure (in seconds)."
|
|
|
- }
|
|
|
- ]
|
|
|
- }
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "namenode_client_rpc_queue_latency_daily",
|
|
|
- "label": "NameNode Client RPC Queue Latency (Daily)",
|
|
|
- "description": "This service-level alert is triggered if the deviation of RPC latency on client port has grown beyond the specified threshold within a given time interval.",
|
|
|
- "interval": 480,
|
|
|
- "scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
- "source": {
|
|
|
- "type": "SCRIPT",
|
|
|
- "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
|
- "parameters": [
|
|
|
- {
|
|
|
- "name": "mergeHaMetrics",
|
|
|
- "display_name": "Whether active and stanby NameNodes metrics should be merged",
|
|
|
- "value": "false",
|
|
|
- "type": "STRING",
|
|
|
- "description": "Whether active and stanby NameNodes metrics should be merged."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "interval",
|
|
|
- "display_name": "Time interval in minutes",
|
|
|
- "value": 1440,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Time interval in minutes."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "appId",
|
|
|
- "display_name": "AMS application id",
|
|
|
- "value": "NAMENODE",
|
|
|
- "type": "STRING",
|
|
|
- "description": "The application id used to retrieve the metric."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "metricName",
|
|
|
- "display_name": "Metric Name",
|
|
|
- "value": "rpc.rpc.client.RpcQueueTimeAvgTime",
|
|
|
- "type": "STRING",
|
|
|
- "description": "The metric to monitor."
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "metric.deviation.warning.threshold",
|
|
|
- "display_name": "The standard deviation threshold above which a warning is produced.",
|
|
|
- "type": "PERCENT",
|
|
|
- "units": "%",
|
|
|
- "value": 100,
|
|
|
- "threshold": "WARNING"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "metric.deviation.critical.threshold",
|
|
|
- "display_name": "The standard deviation threshold above which a critical alert is produced.",
|
|
|
- "type": "PERCENT",
|
|
|
- "units": "%",
|
|
|
- "value": 200,
|
|
|
- "threshold": "CRITICAL"
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "minimumValue",
|
|
|
- "display_name": "Minimum Latency (in seconds)",
|
|
|
- "value": 30,
|
|
|
- "type": "NUMERIC",
|
|
|
- "description": "Minimum latency time to measure (in seconds)."
|
|
|
}
|
|
|
]
|
|
|
}
|
|
@@ -1105,7 +817,7 @@
|
|
|
"description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
|
|
|
"interval": 480,
|
|
|
"scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
+ "enabled": false,
|
|
|
"source": {
|
|
|
"type": "SCRIPT",
|
|
|
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
@@ -1113,7 +825,7 @@
|
|
|
{
|
|
|
"name": "mergeHaMetrics",
|
|
|
"display_name": "Whether active and stanby NameNodes metrics should be merged",
|
|
|
- "value": "false",
|
|
|
+ "value": "true",
|
|
|
"type": "STRING",
|
|
|
"description": "Whether active and stanby NameNodes metrics should be merged."
|
|
|
},
|
|
@@ -1143,7 +855,7 @@
|
|
|
"display_name": "The standard deviation threshold above which a warning is produced.",
|
|
|
"type": "PERCENT",
|
|
|
"units": "%",
|
|
|
- "value": 30,
|
|
|
+ "value": 100,
|
|
|
"threshold": "WARNING"
|
|
|
},
|
|
|
{
|
|
@@ -1151,7 +863,7 @@
|
|
|
"display_name": "The standard deviation threshold above which a critical alert is produced.",
|
|
|
"type": "PERCENT",
|
|
|
"units": "%",
|
|
|
- "value": 50,
|
|
|
+ "value": 200,
|
|
|
"threshold": "CRITICAL"
|
|
|
}
|
|
|
]
|
|
@@ -1163,7 +875,7 @@
|
|
|
"description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
|
|
|
"interval": 1440,
|
|
|
"scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
+ "enabled": false,
|
|
|
"source": {
|
|
|
"type": "SCRIPT",
|
|
|
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
@@ -1201,7 +913,7 @@
|
|
|
"display_name": "The standard deviation threshold above which a warning is produced.",
|
|
|
"type": "PERCENT",
|
|
|
"units": "%",
|
|
|
- "value": 20,
|
|
|
+ "value": 100,
|
|
|
"threshold": "WARNING"
|
|
|
},
|
|
|
{
|
|
@@ -1209,7 +921,7 @@
|
|
|
"display_name": "The standard deviation threshold above which a critical alert is produced.",
|
|
|
"type": "PERCENT",
|
|
|
"units": "%",
|
|
|
- "value": 50,
|
|
|
+ "value": 200,
|
|
|
"threshold": "CRITICAL"
|
|
|
}
|
|
|
]
|
|
@@ -1221,7 +933,7 @@
|
|
|
"description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
|
|
|
"interval": 1440,
|
|
|
"scope": "ANY",
|
|
|
- "enabled": true,
|
|
|
+ "enabled": false,
|
|
|
"source": {
|
|
|
"type": "SCRIPT",
|
|
|
"path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
|
|
@@ -1229,7 +941,7 @@
|
|
|
{
|
|
|
"name": "mergeHaMetrics",
|
|
|
"display_name": "Whether active and stanby NameNodes metrics should be merged",
|
|
|
- "value": "false",
|
|
|
+ "value": "true",
|
|
|
"type": "STRING",
|
|
|
"description": "Whether active and stanby NameNodes metrics should be merged."
|
|
|
},
|
|
@@ -1259,7 +971,7 @@
|
|
|
"display_name": "The standard deviation threshold above which a warning is produced.",
|
|
|
"type": "PERCENT",
|
|
|
"units": "%",
|
|
|
- "value": 10,
|
|
|
+ "value": 100,
|
|
|
"threshold": "WARNING"
|
|
|
},
|
|
|
{
|
|
@@ -1267,7 +979,7 @@
|
|
|
"display_name": "The standard deviation threshold above which a critical alert is produced.",
|
|
|
"type": "PERCENT",
|
|
|
"units": "%",
|
|
|
- "value": 20,
|
|
|
+ "value": 200,
|
|
|
"threshold": "CRITICAL"
|
|
|
}
|
|
|
]
|
|
@@ -1449,11 +1161,11 @@
|
|
|
},
|
|
|
"warning": {
|
|
|
"text": "Remaining Capacity:[{0}], Total Capacity:[{2:.0f}% Used, {1}]",
|
|
|
- "value": 75
|
|
|
+ "value": 80
|
|
|
},
|
|
|
"critical": {
|
|
|
"text": "Remaining Capacity:[{0}], Total Capacity:[{2:.0f}% Used, {1}]",
|
|
|
- "value": 80
|
|
|
+ "value": 90
|
|
|
},
|
|
|
"units" : "%"
|
|
|
},
|
|
@@ -1477,47 +1189,6 @@
|
|
|
"type": "SCRIPT",
|
|
|
"path": "HDFS/2.1.0.2.0/package/alerts/alert_datanode_unmounted_data_dir.py"
|
|
|
}
|
|
|
- },
|
|
|
- {
|
|
|
- "name": "datanode_heap_usage",
|
|
|
- "label": "DataNode Heap Usage",
|
|
|
- "description": "This host-level alert is triggered if heap usage goes past thresholds on the DataNode. It checks the DataNode JMXServlet for the MemHeapUsedM and MemHeapMaxM properties. The threshold values are in percent.",
|
|
|
- "interval": 2,
|
|
|
- "scope": "HOST",
|
|
|
- "enabled": true,
|
|
|
- "source": {
|
|
|
- "type": "METRIC",
|
|
|
- "uri": {
|
|
|
- "http": "{{hdfs-site/dfs.datanode.http.address}}",
|
|
|
- "https": "{{hdfs-site/dfs.datanode.https.address}}",
|
|
|
- "kerberos_keytab": "{{hdfs-site/dfs.web.authentication.kerberos.keytab}}",
|
|
|
- "kerberos_principal": "{{hdfs-site/dfs.web.authentication.kerberos.principal}}",
|
|
|
- "https_property": "{{hdfs-site/dfs.http.policy}}",
|
|
|
- "https_property_value": "HTTPS_ONLY",
|
|
|
- "connection_timeout": 5.0
|
|
|
- },
|
|
|
- "reporting": {
|
|
|
- "ok": {
|
|
|
- "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB"
|
|
|
- },
|
|
|
- "warning": {
|
|
|
- "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB",
|
|
|
- "value": 80
|
|
|
- },
|
|
|
- "critical": {
|
|
|
- "text": "Used Heap:[{2:.0f}%, {0} MB], Max Heap: {1} MB",
|
|
|
- "value": 90
|
|
|
- },
|
|
|
- "units" : "%"
|
|
|
- },
|
|
|
- "jmx": {
|
|
|
- "property_list": [
|
|
|
- "Hadoop:service=DataNode,name=JvmMetrics/MemHeapUsedM",
|
|
|
- "Hadoop:service=DataNode,name=JvmMetrics/MemHeapMaxM"
|
|
|
- ],
|
|
|
- "value": "100.0 - (({1} - {0})/{1} * 100.0)"
|
|
|
- }
|
|
|
- }
|
|
|
}
|
|
|
],
|
|
|
"ZKFC": [
|