|
@@ -0,0 +1,196 @@
|
|
|
+/*
|
|
|
+ * Licensed to the Apache Software Foundation (ASF) under one
|
|
|
+ * or more contributor license agreements. See the NOTICE file
|
|
|
+ * distributed with this work for additional information
|
|
|
+ * regarding copyright ownership. The ASF licenses this file
|
|
|
+ * to you under the Apache License, Version 2.0 (the
|
|
|
+ * "License"); you may not use this file except in compliance
|
|
|
+ * with the License. You may obtain a copy of the License at
|
|
|
+ *
|
|
|
+ * http://www.apache.org/licenses/LICENSE-2.0
|
|
|
+ *
|
|
|
+ * Unless required by applicable law or agreed to in writing, software
|
|
|
+ * distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
+ * See the License for the specific language governing permissions and
|
|
|
+ * limitations under the License.
|
|
|
+ */
|
|
|
+package org.apache.hadoop.util;
|
|
|
+
|
|
|
+import java.lang.management.GarbageCollectorMXBean;
|
|
|
+import java.lang.management.ManagementFactory;
|
|
|
+import java.util.List;
|
|
|
+import java.util.Map;
|
|
|
+import java.util.Set;
|
|
|
+
|
|
|
+import org.apache.commons.logging.Log;
|
|
|
+import org.apache.commons.logging.LogFactory;
|
|
|
+import org.apache.hadoop.classification.InterfaceAudience;
|
|
|
+import org.apache.hadoop.conf.Configuration;
|
|
|
+
|
|
|
+import com.google.common.base.Joiner;
|
|
|
+import com.google.common.base.Preconditions;
|
|
|
+import com.google.common.base.Stopwatch;
|
|
|
+import com.google.common.collect.Lists;
|
|
|
+import com.google.common.collect.Maps;
|
|
|
+import com.google.common.collect.Sets;
|
|
|
+
|
|
|
+/**
|
|
|
+ * Class which sets up a simple thread which runs in a loop sleeping
|
|
|
+ * for a short interval of time. If the sleep takes significantly longer
|
|
|
+ * than its target time, it implies that the JVM or host machine has
|
|
|
+ * paused processing, which may cause other problems. If such a pause is
|
|
|
+ * detected, the thread logs a message.
|
|
|
+ */
|
|
|
+@InterfaceAudience.Private
|
|
|
+public class JvmPauseMonitor {
|
|
|
+ private static final Log LOG = LogFactory.getLog(
|
|
|
+ JvmPauseMonitor.class);
|
|
|
+
|
|
|
+ /** The target sleep time */
|
|
|
+ private static final long SLEEP_INTERVAL_MS = 500;
|
|
|
+
|
|
|
+ /** log WARN if we detect a pause longer than this threshold */
|
|
|
+ private final long warnThresholdMs;
|
|
|
+ private static final String WARN_THRESHOLD_KEY =
|
|
|
+ "jvm.pause.warn-threshold.ms";
|
|
|
+ private static final long WARN_THRESHOLD_DEFAULT = 10000;
|
|
|
+
|
|
|
+ /** log INFO if we detect a pause longer than this threshold */
|
|
|
+ private final long infoThresholdMs;
|
|
|
+ private static final String INFO_THRESHOLD_KEY =
|
|
|
+ "jvm.pause.info-threshold.ms";
|
|
|
+ private static final long INFO_THRESHOLD_DEFAULT = 1000;
|
|
|
+
|
|
|
+
|
|
|
+ private Thread monitorThread;
|
|
|
+ private volatile boolean shouldRun = true;
|
|
|
+
|
|
|
+ public JvmPauseMonitor(Configuration conf) {
|
|
|
+ this.warnThresholdMs = conf.getLong(WARN_THRESHOLD_KEY, WARN_THRESHOLD_DEFAULT);
|
|
|
+ this.infoThresholdMs = conf.getLong(INFO_THRESHOLD_KEY, INFO_THRESHOLD_DEFAULT);
|
|
|
+ }
|
|
|
+
|
|
|
+ public void start() {
|
|
|
+ Preconditions.checkState(monitorThread == null,
|
|
|
+ "Already started");
|
|
|
+ monitorThread = new Daemon(new Monitor());
|
|
|
+ monitorThread.start();
|
|
|
+ }
|
|
|
+
|
|
|
+ public void stop() {
|
|
|
+ shouldRun = false;
|
|
|
+ monitorThread.interrupt();
|
|
|
+ try {
|
|
|
+ monitorThread.join();
|
|
|
+ } catch (InterruptedException e) {
|
|
|
+ Thread.currentThread().interrupt();
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private String formatMessage(long extraSleepTime,
|
|
|
+ Map<String, GcTimes> gcTimesAfterSleep,
|
|
|
+ Map<String, GcTimes> gcTimesBeforeSleep) {
|
|
|
+
|
|
|
+ Set<String> gcBeanNames = Sets.intersection(
|
|
|
+ gcTimesAfterSleep.keySet(),
|
|
|
+ gcTimesBeforeSleep.keySet());
|
|
|
+ List<String> gcDiffs = Lists.newArrayList();
|
|
|
+ for (String name : gcBeanNames) {
|
|
|
+ GcTimes diff = gcTimesAfterSleep.get(name).subtract(
|
|
|
+ gcTimesBeforeSleep.get(name));
|
|
|
+ if (diff.gcCount != 0) {
|
|
|
+ gcDiffs.add("GC pool '" + name + "' had collection(s): " +
|
|
|
+ diff.toString());
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ String ret = "Detected pause in JVM or host machine (eg GC): " +
|
|
|
+ "pause of approximately " + extraSleepTime + "ms\n";
|
|
|
+ if (gcDiffs.isEmpty()) {
|
|
|
+ ret += "No GCs detected";
|
|
|
+ } else {
|
|
|
+ ret += Joiner.on("\n").join(gcDiffs);
|
|
|
+ }
|
|
|
+ return ret;
|
|
|
+ }
|
|
|
+
|
|
|
+ private Map<String, GcTimes> getGcTimes() {
|
|
|
+ Map<String, GcTimes> map = Maps.newHashMap();
|
|
|
+ List<GarbageCollectorMXBean> gcBeans =
|
|
|
+ ManagementFactory.getGarbageCollectorMXBeans();
|
|
|
+ for (GarbageCollectorMXBean gcBean : gcBeans) {
|
|
|
+ map.put(gcBean.getName(), new GcTimes(gcBean));
|
|
|
+ }
|
|
|
+ return map;
|
|
|
+ }
|
|
|
+
|
|
|
+ private static class GcTimes {
|
|
|
+ private GcTimes(GarbageCollectorMXBean gcBean) {
|
|
|
+ gcCount = gcBean.getCollectionCount();
|
|
|
+ gcTimeMillis = gcBean.getCollectionTime();
|
|
|
+ }
|
|
|
+
|
|
|
+ private GcTimes(long count, long time) {
|
|
|
+ this.gcCount = count;
|
|
|
+ this.gcTimeMillis = time;
|
|
|
+ }
|
|
|
+
|
|
|
+ private GcTimes subtract(GcTimes other) {
|
|
|
+ return new GcTimes(this.gcCount - other.gcCount,
|
|
|
+ this.gcTimeMillis - other.gcTimeMillis);
|
|
|
+ }
|
|
|
+
|
|
|
+ @Override
|
|
|
+ public String toString() {
|
|
|
+ return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
|
|
|
+ }
|
|
|
+
|
|
|
+ private long gcCount;
|
|
|
+ private long gcTimeMillis;
|
|
|
+ }
|
|
|
+
|
|
|
+ private class Monitor implements Runnable {
|
|
|
+ @Override
|
|
|
+ public void run() {
|
|
|
+ Stopwatch sw = new Stopwatch();
|
|
|
+ Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
|
|
|
+ while (shouldRun) {
|
|
|
+ sw.reset().start();
|
|
|
+ try {
|
|
|
+ Thread.sleep(SLEEP_INTERVAL_MS);
|
|
|
+ } catch (InterruptedException ie) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ long extraSleepTime = sw.elapsedMillis() - SLEEP_INTERVAL_MS;
|
|
|
+ Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
|
|
|
+
|
|
|
+ if (extraSleepTime > warnThresholdMs) {
|
|
|
+ LOG.warn(formatMessage(
|
|
|
+ extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
|
|
|
+ } else if (extraSleepTime > infoThresholdMs) {
|
|
|
+ LOG.info(formatMessage(
|
|
|
+ extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
|
|
|
+ }
|
|
|
+
|
|
|
+ gcTimesBeforeSleep = gcTimesAfterSleep;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Simple 'main' to facilitate manual testing of the pause monitor.
|
|
|
+ *
|
|
|
+ * This main function just leaks memory into a list. Running this class
|
|
|
+ * with a 1GB heap will very quickly go into "GC hell" and result in
|
|
|
+ * log messages about the GC pauses.
|
|
|
+ */
|
|
|
+ public static void main(String []args) throws Exception {
|
|
|
+ new JvmPauseMonitor(new Configuration()).start();
|
|
|
+ List<String> list = Lists.newArrayList();
|
|
|
+ int i = 0;
|
|
|
+ while (true) {
|
|
|
+ list.add(String.valueOf(i++));
|
|
|
+ }
|
|
|
+ }
|
|
|
+}
|