Browse Source

ZOOKEEPER-3037: Add JVMPauseMonitor

https://issues.apache.org/jira/browse/ZOOKEEPER-3037

Author: Norbert Kalmar <nkalmar@yahoo.com>

Reviewers: andor@apache.org

Closes #904 from nkalmar/ZOOKEEPER-3037 and squashes the following commits:

a6105324b [Norbert Kalmar] ZOOKEEPER-3037 - add serviceStop() and improve unit tests
7d0baaa46 [Norbert Kalmar] ZOOKEEPER-3037 - refactor unit tests
97d2c6103 [Norbert Kalmar] ZOOKEEPER-3037 - cite hadoop-common as source
3661389e4 [Norbert Kalmar] ZOOKEEPER-3037 - Add unit test and various improvements
f30975765 [Norbert Kalmar] ZOOKEEPER-3037 - Add JvmPauseMonitor
Norbert Kalmar 6 years ago
parent
commit
e9adf6ee09

+ 26 - 0
zookeeper-server/src/main/java/org/apache/zookeeper/server/ServerConfig.java

@@ -55,6 +55,15 @@ public class ServerConfig {
     /** defaults to -1 if not set explicitly */
     /** defaults to -1 if not set explicitly */
     protected int listenBacklog = -1;
     protected int listenBacklog = -1;
 
 
+    /** JVM Pause Monitor feature switch; the monitor is disabled unless this is set to true */
+    protected boolean jvmPauseMonitorToRun = false;
+    /**
+     * JVM Pause Monitor warn threshold in ms. Initialised to the same default
+     * QuorumPeerConfig uses, so a ServerConfig that was not populated via
+     * readFrom() does not carry a 0 ms threshold.
+     */
+    protected long jvmPauseWarnThresholdMs =
+            org.apache.zookeeper.server.util.JvmPauseMonitor.WARN_THRESHOLD_DEFAULT;
+    /** JVM Pause Monitor info threshold in ms; defaults like the warn threshold above */
+    protected long jvmPauseInfoThresholdMs =
+            org.apache.zookeeper.server.util.JvmPauseMonitor.INFO_THRESHOLD_DEFAULT;
+    /** JVM Pause Monitor sleep time in ms; a 0 ms default would make the monitor loop spin */
+    protected long jvmPauseSleepTimeMs =
+            org.apache.zookeeper.server.util.JvmPauseMonitor.SLEEP_TIME_MS_DEFAULT;
+
     /**
     /**
      * Parse arguments for server configuration
      * Parse arguments for server configuration
      * @param args clientPort dataDir and optional tickTime and maxClientCnxns
      * @param args clientPort dataDir and optional tickTime and maxClientCnxns
@@ -105,6 +114,10 @@ public class ServerConfig {
         maxClientCnxns = config.getMaxClientCnxns();
         maxClientCnxns = config.getMaxClientCnxns();
         minSessionTimeout = config.getMinSessionTimeout();
         minSessionTimeout = config.getMinSessionTimeout();
         maxSessionTimeout = config.getMaxSessionTimeout();
         maxSessionTimeout = config.getMaxSessionTimeout();
+        jvmPauseMonitorToRun = config.isJvmPauseMonitorToRun();
+        jvmPauseInfoThresholdMs = config.getJvmPauseInfoThresholdMs();
+        jvmPauseWarnThresholdMs = config.getJvmPauseWarnThresholdMs();
+        jvmPauseSleepTimeMs = config.getJvmPauseSleepTimeMs();
         metricsProviderClassName = config.getMetricsProviderClassName();
         metricsProviderClassName = config.getMetricsProviderClassName();
         metricsProviderConfiguration = config.getMetricsProviderConfiguration();
         metricsProviderConfiguration = config.getMetricsProviderConfiguration();
         listenBacklog = config.getClientPortListenBacklog();
         listenBacklog = config.getClientPortListenBacklog();
@@ -124,6 +137,19 @@ public class ServerConfig {
     public int getMinSessionTimeout() { return minSessionTimeout; }
     public int getMinSessionTimeout() { return minSessionTimeout; }
     /** maximum session timeout in milliseconds, -1 if unset */
     /** maximum session timeout in milliseconds, -1 if unset */
     public int getMaxSessionTimeout() { return maxSessionTimeout; }
     public int getMaxSessionTimeout() { return maxSessionTimeout; }
+
+    /** JVM Pause Monitor info threshold in milliseconds */
+    public long getJvmPauseInfoThresholdMs() {
+        return this.jvmPauseInfoThresholdMs;
+    }
+    /** JVM Pause Monitor warn threshold in milliseconds */
+    public long getJvmPauseWarnThresholdMs() {
+        return this.jvmPauseWarnThresholdMs;
+    }
+    /** JVM Pause Monitor sleep time in milliseconds */
+    public long getJvmPauseSleepTimeMs() {
+        return this.jvmPauseSleepTimeMs;
+    }
+    /** true if the JVM Pause Monitor feature switch is on */
+    public boolean isJvmPauseMonitorToRun() {
+        return this.jvmPauseMonitorToRun;
+    }
     public String getMetricsProviderClassName() { return metricsProviderClassName; }
     public String getMetricsProviderClassName() { return metricsProviderClassName; }
     public Properties getMetricsProviderConfiguration() { return metricsProviderConfiguration; }
     public Properties getMetricsProviderConfiguration() { return metricsProviderConfiguration; }
     /** Maximum number of pending socket connections to read, -1 if unset */
     /** Maximum number of pending socket connections to read, -1 if unset */

+ 27 - 0
zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServer.java

@@ -70,6 +70,7 @@ import org.apache.zookeeper.server.auth.ProviderRegistry;
 import org.apache.zookeeper.server.auth.ServerAuthenticationProvider;
 import org.apache.zookeeper.server.auth.ServerAuthenticationProvider;
 import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
 import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
 import org.apache.zookeeper.server.quorum.ReadOnlyZooKeeperServer;
 import org.apache.zookeeper.server.quorum.ReadOnlyZooKeeperServer;
+import org.apache.zookeeper.server.util.JvmPauseMonitor;
 import org.apache.zookeeper.txn.CreateSessionTxn;
 import org.apache.zookeeper.txn.CreateSessionTxn;
 import org.apache.zookeeper.txn.TxnHeader;
 import org.apache.zookeeper.txn.TxnHeader;
 import org.slf4j.Logger;
 import org.slf4j.Logger;
@@ -112,6 +113,7 @@ public class ZooKeeperServer implements SessionExpirer, ServerStats.Provider {
     private final AtomicLong hzxid = new AtomicLong(0);
     private final AtomicLong hzxid = new AtomicLong(0);
     public final static Exception ok = new Exception("No prob");
     public final static Exception ok = new Exception("No prob");
     protected RequestProcessor firstProcessor;
     protected RequestProcessor firstProcessor;
+    protected JvmPauseMonitor jvmPauseMonitor;
     protected volatile State state = State.INITIAL;
     protected volatile State state = State.INITIAL;
     private boolean isResponseCachingEnabled = true;
     private boolean isResponseCachingEnabled = true;
 
 
@@ -216,6 +218,20 @@ public class ZooKeeperServer implements SessionExpirer, ServerStats.Provider {
                 + " snapdir " + txnLogFactory.getSnapDir());
                 + " snapdir " + txnLogFactory.getSnapDir());
     }
     }
 
 
+    /**
+     * Stores the given JvmPauseMonitor on this server and forwards every other
+     * argument unchanged to
+     * {@link #ZooKeeperServer(FileTxnSnapLog, int, int, int, int, ZKDatabase)}.
+     * The monitor itself is only started later, by {@link #startJvmPauseMonitor()}.
+     *
+     * @param jvmPauseMonitor monitor to attach; may be null, in which case
+     *                        nothing is attached and nothing is logged
+     */
+    public ZooKeeperServer(JvmPauseMonitor jvmPauseMonitor, FileTxnSnapLog txnLogFactory, int tickTime,
+                           int minSessionTimeout, int maxSessionTimeout, int clientPortListenBacklog, ZKDatabase zkDb) {
+        this(txnLogFactory, tickTime, minSessionTimeout, maxSessionTimeout, clientPortListenBacklog, zkDb);
+        this.jvmPauseMonitor = jvmPauseMonitor;
+        if (this.jvmPauseMonitor == null) {
+            return;
+        }
+        LOG.info("Added JvmPauseMonitor to server");
+    }
+
     /**
     /**
      * creates a zookeeperserver instance.
      * creates a zookeeperserver instance.
      * @param txnLogFactory the file transaction snapshot logging class
      * @param txnLogFactory the file transaction snapshot logging class
@@ -521,10 +537,18 @@ public class ZooKeeperServer implements SessionExpirer, ServerStats.Provider {
 
 
         registerJMX();
         registerJMX();
 
 
+        startJvmPauseMonitor();
+
         setState(State.RUNNING);
         setState(State.RUNNING);
         notifyAll();
         notifyAll();
     }
     }
 
 
+    /**
+     * Starts the attached JvmPauseMonitor; does nothing when no monitor was
+     * supplied to this server.
+     */
+    protected void startJvmPauseMonitor() {
+        final JvmPauseMonitor monitor = this.jvmPauseMonitor;
+        if (monitor == null) {
+            return;
+        }
+        monitor.serviceStart();
+    }
+
     protected void setupRequestProcessors() {
     protected void setupRequestProcessors() {
         RequestProcessor finalProcessor = new FinalRequestProcessor(this);
         RequestProcessor finalProcessor = new FinalRequestProcessor(this);
         RequestProcessor syncProcessor = new SyncRequestProcessor(this,
         RequestProcessor syncProcessor = new SyncRequestProcessor(this,
@@ -629,6 +653,9 @@ public class ZooKeeperServer implements SessionExpirer, ServerStats.Provider {
         if (firstProcessor != null) {
         if (firstProcessor != null) {
             firstProcessor.shutdown();
             firstProcessor.shutdown();
         }
         }
+        if(jvmPauseMonitor != null) {
+            jvmPauseMonitor.serviceStop();
+        }
 
 
         if (zkDb != null) {
         if (zkDb != null) {
             if (fullyShutDown) {
             if (fullyShutDown) {

+ 6 - 1
zookeeper-server/src/main/java/org/apache/zookeeper/server/ZooKeeperServerMain.java

@@ -36,6 +36,7 @@ import org.apache.zookeeper.server.admin.AdminServerFactory;
 import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
 import org.apache.zookeeper.server.persistence.FileTxnSnapLog;
 import org.apache.zookeeper.server.persistence.FileTxnSnapLog.DatadirException;
 import org.apache.zookeeper.server.persistence.FileTxnSnapLog.DatadirException;
 import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
 import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
+import org.apache.zookeeper.server.util.JvmPauseMonitor;
 import org.slf4j.Logger;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.LoggerFactory;
 
 
@@ -135,7 +136,11 @@ public class ZooKeeperServerMain {
             // run() in this thread.
             // run() in this thread.
             // create a file logger url from the command line args
             // create a file logger url from the command line args
             txnLog = new FileTxnSnapLog(config.dataLogDir, config.dataDir);
             txnLog = new FileTxnSnapLog(config.dataLogDir, config.dataDir);
-            final ZooKeeperServer zkServer = new ZooKeeperServer(txnLog,
+            JvmPauseMonitor jvmPauseMonitor = null;
+            if(config.jvmPauseMonitorToRun) {
+                jvmPauseMonitor = new JvmPauseMonitor(config);
+            }
+            final ZooKeeperServer zkServer = new ZooKeeperServer(jvmPauseMonitor, txnLog,
                     config.tickTime, config.minSessionTimeout, config.maxSessionTimeout,
                     config.tickTime, config.minSessionTimeout, config.maxSessionTimeout,
                     config.listenBacklog, null);
                     config.listenBacklog, null);
             txnLog.setServerStats(zkServer.serverStats());
             txnLog.setServerStats(zkServer.serverStats());

+ 16 - 0
zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java

@@ -73,6 +73,7 @@ import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
 import org.apache.zookeeper.server.quorum.flexible.QuorumMaj;
 import org.apache.zookeeper.server.quorum.flexible.QuorumMaj;
 import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier;
 import org.apache.zookeeper.server.quorum.flexible.QuorumVerifier;
 import org.apache.zookeeper.server.util.ConfigUtils;
 import org.apache.zookeeper.server.util.ConfigUtils;
+import org.apache.zookeeper.server.util.JvmPauseMonitor;
 import org.apache.zookeeper.server.util.ZxidUtils;
 import org.apache.zookeeper.server.util.ZxidUtils;
 import org.slf4j.Logger;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.LoggerFactory;
@@ -133,6 +134,7 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
      */
      */
     private ZKDatabase zkDb;
     private ZKDatabase zkDb;
 
 
+    private JvmPauseMonitor jvmPauseMonitor;
     public static final class AddressTuple {
     public static final class AddressTuple {
         public final InetSocketAddress quorumAddr;
         public final InetSocketAddress quorumAddr;
         public final InetSocketAddress electionAddr;
         public final InetSocketAddress electionAddr;
@@ -465,6 +467,10 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
         return getVotingView().size();
         return getVotingView().size();
     }
     }
 
 
+    /**
+     * Attaches the JvmPauseMonitor this peer should run. The monitor is not
+     * started here; startJvmPauseMonitor() starts it only when one has been set.
+     *
+     * @param jvmPauseMonitor the monitor to attach
+     */
+    public void setJvmPauseMonitor(JvmPauseMonitor jvmPauseMonitor) {
+        this.jvmPauseMonitor = jvmPauseMonitor;
+    }
+
     /**
     /**
      * QuorumVerifier implementation; default (majority).
      * QuorumVerifier implementation; default (majority).
      */
      */
@@ -915,6 +921,7 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
             System.out.println(e);
             System.out.println(e);
         }
         }
         startLeaderElection();
         startLeaderElection();
+        startJvmPauseMonitor();
         super.start();
         super.start();
     }
     }
 
 
@@ -981,6 +988,12 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
         this.electionAlg = createElectionAlgorithm(electionType);
         this.electionAlg = createElectionAlgorithm(electionType);
     }
     }
 
 
+    /**
+     * Starts the JvmPauseMonitor attached via setJvmPauseMonitor(); a peer
+     * without a monitor is left untouched.
+     */
+    private void startJvmPauseMonitor() {
+        final JvmPauseMonitor monitor = this.jvmPauseMonitor;
+        if (monitor == null) {
+            return;
+        }
+        monitor.serviceStart();
+    }
+
     /**
     /**
      * Count the number of nodes in the map that could be followers.
      * Count the number of nodes in the map that could be followers.
      * @param peers
      * @param peers
@@ -1341,6 +1354,9 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
         if(udpSocket != null) {
         if(udpSocket != null) {
             udpSocket.close();
             udpSocket.close();
         }
         }
+        if(jvmPauseMonitor != null) {
+            jvmPauseMonitor.serviceStop();
+        }
 
 
         try {
         try {
             adminServer.shutdown();
             adminServer.shutdown();

+ 39 - 0
zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeerConfig.java

@@ -39,6 +39,7 @@ import java.util.Map.Entry;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.apache.zookeeper.common.ClientX509Util;
 import org.apache.zookeeper.common.ClientX509Util;
 import org.apache.zookeeper.common.StringUtils;
 import org.apache.zookeeper.common.StringUtils;
+import org.apache.zookeeper.server.util.JvmPauseMonitor;
 import org.slf4j.Logger;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.LoggerFactory;
 import org.slf4j.MDC;
 import org.slf4j.MDC;
@@ -122,6 +123,23 @@ public class QuorumPeerConfig {
      */
      */
     private final int MIN_SNAP_RETAIN_COUNT = 3;
     private final int MIN_SNAP_RETAIN_COUNT = 3;
 
 
+    /**
+     * JVM Pause Monitor feature switch, read from the "jvm.pause.monitor" property; off by default
+     */
+    protected boolean jvmPauseMonitorToRun = false;
+    /**
+     * JVM Pause Monitor warn threshold in ms, read from "jvm.pause.warn-threshold.ms" (default 10000)
+     */
+    protected long jvmPauseWarnThresholdMs = JvmPauseMonitor.WARN_THRESHOLD_DEFAULT;
+    /**
+     * JVM Pause Monitor info threshold in ms, read from "jvm.pause.info-threshold.ms" (default 1000)
+     */
+    protected long jvmPauseInfoThresholdMs = JvmPauseMonitor.INFO_THRESHOLD_DEFAULT;
+    /**
+     * JVM Pause Monitor sleep time in ms, read from "jvm.pause.sleep.time.ms" (default 500)
+     */
+    protected long jvmPauseSleepTimeMs = JvmPauseMonitor.SLEEP_TIME_MS_DEFAULT;
+
     @SuppressWarnings("serial")
     @SuppressWarnings("serial")
     public static class ConfigException extends Exception {
     public static class ConfigException extends Exception {
         public ConfigException(String msg) {
         public ConfigException(String msg) {
@@ -344,6 +362,14 @@ public class QuorumPeerConfig {
                 quorumServicePrincipal = value;
                 quorumServicePrincipal = value;
             } else if (key.equals("quorum.cnxn.threads.size")) {
             } else if (key.equals("quorum.cnxn.threads.size")) {
                 quorumCnxnThreadsSize = Integer.parseInt(value);
                 quorumCnxnThreadsSize = Integer.parseInt(value);
+            } else if (key.equals(JvmPauseMonitor.INFO_THRESHOLD_KEY)) {
+                jvmPauseInfoThresholdMs = Long.parseLong(value);
+            } else if (key.equals(JvmPauseMonitor.WARN_THRESHOLD_KEY)) {
+                jvmPauseWarnThresholdMs = Long.parseLong(value);
+            } else if (key.equals(JvmPauseMonitor.SLEEP_TIME_MS_KEY)) {
+                jvmPauseSleepTimeMs = Long.parseLong(value);
+            } else if (key.equals(JvmPauseMonitor.JVM_PAUSE_MONITOR_FEATURE_SWITCH_KEY)) {
+                jvmPauseMonitorToRun = Boolean.parseBoolean(value);
             } else if (key.equals("metricsProvider.className")) {
             } else if (key.equals("metricsProvider.className")) {
                 metricsProviderClassName = value;
                 metricsProviderClassName = value;
             } else if (key.startsWith("metricsProvider.")) {
             } else if (key.startsWith("metricsProvider.")) {
@@ -825,6 +851,19 @@ public class QuorumPeerConfig {
         return Collections.unmodifiableMap(quorumVerifier.getAllMembers());
         return Collections.unmodifiableMap(quorumVerifier.getAllMembers());
     }
     }
 
 
+    /** JVM Pause Monitor info threshold in milliseconds */
+    public long getJvmPauseInfoThresholdMs() {
+        return this.jvmPauseInfoThresholdMs;
+    }
+    /** JVM Pause Monitor warn threshold in milliseconds */
+    public long getJvmPauseWarnThresholdMs() {
+        return this.jvmPauseWarnThresholdMs;
+    }
+    /** JVM Pause Monitor sleep time in milliseconds */
+    public long getJvmPauseSleepTimeMs() {
+        return this.jvmPauseSleepTimeMs;
+    }
+    /** true if the JVM Pause Monitor feature switch is on */
+    public boolean isJvmPauseMonitorToRun() {
+        return this.jvmPauseMonitorToRun;
+    }
+
     public long getServerId() { return serverId; }
     public long getServerId() { return serverId; }
 
 
     public boolean isDistributed() {
     public boolean isDistributed() {

+ 6 - 1
zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeerMain.java

@@ -24,6 +24,7 @@ import javax.management.JMException;
 import javax.security.sasl.SaslException;
 import javax.security.sasl.SaslException;
 
 
 import org.apache.yetus.audience.InterfaceAudience;
 import org.apache.yetus.audience.InterfaceAudience;
+import org.apache.zookeeper.server.util.JvmPauseMonitor;
 import org.slf4j.Logger;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.LoggerFactory;
 import org.apache.zookeeper.jmx.ManagedUtil;
 import org.apache.zookeeper.jmx.ManagedUtil;
@@ -219,7 +220,11 @@ public class QuorumPeerMain {
           }
           }
           quorumPeer.setQuorumCnxnThreadsSize(config.quorumCnxnThreadsSize);
           quorumPeer.setQuorumCnxnThreadsSize(config.quorumCnxnThreadsSize);
           quorumPeer.initialize();
           quorumPeer.initialize();
-          
+
+          if(config.jvmPauseMonitorToRun) {
+              quorumPeer.setJvmPauseMonitor(new JvmPauseMonitor(config));
+          }
+
           quorumPeer.start();
           quorumPeer.start();
           quorumPeer.join();
           quorumPeer.join();
       } catch (InterruptedException e) {
       } catch (InterruptedException e) {

+ 209 - 0
zookeeper-server/src/main/java/org/apache/zookeeper/server/util/JvmPauseMonitor.java

@@ -0,0 +1,209 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.zookeeper.server.util;
+
+import org.apache.zookeeper.server.ServerConfig;
+import org.apache.zookeeper.server.quorum.QuorumPeerConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.lang.management.GarbageCollectorMXBean;
+import java.lang.management.ManagementFactory;
+import java.time.Instant;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.List;
+
+/**
+ * This code is originally from hadoop-common, see:
+ * https://github.com/apache/hadoop/blob/trunk/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/util/JvmPauseMonitor.java
+ *
+ * Class which sets up a simple thread which runs in a loop sleeping
+ * for a short interval of time. If the sleep takes significantly longer
+ * than its target time, it implies that the JVM or host machine has
+ * paused processing, which may cause other problems. If such a pause is
+ * detected, the thread logs a message.
+ *
+ * <p>The statistics counters are written only by the monitor thread and are
+ * declared volatile so that the getters can be polled from other threads.
+ */
+public class JvmPauseMonitor {
+    private static final Logger LOG = LoggerFactory.getLogger(JvmPauseMonitor.class);
+
+    /** Configuration key of the boolean switch that turns the monitor on */
+    public static final String JVM_PAUSE_MONITOR_FEATURE_SWITCH_KEY = "jvm.pause.monitor";
+
+    /** Converts the monotonic nanosecond readings taken around each sleep to milliseconds */
+    private static final long NANOS_PER_MILLI = 1_000_000L;
+
+    /** The target sleep time */
+    protected long sleepTimeMs;
+    public static final String SLEEP_TIME_MS_KEY = "jvm.pause.sleep.time.ms";
+    public static final long SLEEP_TIME_MS_DEFAULT = 500;
+
+    /** log WARN if we detect a pause longer than this threshold */
+    protected long warnThresholdMs;
+    public static final String WARN_THRESHOLD_KEY = "jvm.pause.warn-threshold.ms";
+    public static final long WARN_THRESHOLD_DEFAULT = 10000;
+
+    /** log INFO if we detect a pause longer than this threshold */
+    protected long infoThresholdMs;
+    public static final String INFO_THRESHOLD_KEY = "jvm.pause.info-threshold.ms";
+    public static final long INFO_THRESHOLD_DEFAULT = 1000;
+
+    // Single writer (the monitor thread), many readers: volatile is enough to
+    // publish the updates, and ++/+= are safe because nothing else writes.
+    private volatile long numGcWarnThresholdExceeded = 0;
+    private volatile long numGcInfoThresholdExceeded = 0;
+    private volatile long totalGcExtraSleepTime = 0;
+
+    private Thread monitorThread;
+    private volatile boolean shouldRun = true;
+
+    /**
+     * Creates a monitor configured from a quorum peer configuration.
+     *
+     * @param config source of the warn/info thresholds and the sleep time
+     */
+    public JvmPauseMonitor(QuorumPeerConfig config) {
+        this.warnThresholdMs = config.getJvmPauseWarnThresholdMs();
+        this.infoThresholdMs = config.getJvmPauseInfoThresholdMs();
+        this.sleepTimeMs = config.getJvmPauseSleepTimeMs();
+    }
+
+    /**
+     * Creates a monitor configured from a standalone server configuration.
+     *
+     * @param config source of the warn/info thresholds and the sleep time
+     */
+    public JvmPauseMonitor(ServerConfig config) {
+        this.warnThresholdMs = config.getJvmPauseWarnThresholdMs();
+        this.infoThresholdMs = config.getJvmPauseInfoThresholdMs();
+        this.sleepTimeMs = config.getJvmPauseSleepTimeMs();
+    }
+
+    /**
+     * Starts the daemon monitor thread. Calling this while the monitor thread
+     * is still alive is a no-op, so a repeated start does not leave a second,
+     * unreachable thread behind. A start after {@link #serviceStop()} launches
+     * a fresh thread.
+     */
+    public void serviceStart() {
+        if (monitorThread != null && monitorThread.isAlive()) {
+            return;
+        }
+        // serviceStop() leaves the flag cleared; without this reset a restarted
+        // monitor thread would exit before its first sleep.
+        shouldRun = true;
+        monitorThread = new Thread(new JVMMonitor(), "JvmPauseMonitor");
+        monitorThread.setDaemon(true);
+        monitorThread.start();
+    }
+
+    /**
+     * Asks the monitor thread to stop, interrupts its sleep and waits for it to
+     * finish. If the calling thread is interrupted while waiting, its interrupt
+     * status is restored and the method returns.
+     */
+    public void serviceStop() {
+        shouldRun = false;
+        if (monitorThread != null) {
+            monitorThread.interrupt();
+            try {
+                monitorThread.join();
+            } catch (InterruptedException e) {
+                Thread.currentThread().interrupt();
+            }
+        }
+    }
+
+    /**
+     * @return true once {@link #serviceStart()} has created the monitor thread;
+     *         the value stays true after {@link #serviceStop()}
+     */
+    public boolean isStarted() {
+        return monitorThread != null;
+    }
+
+    /** @return how many detected pauses exceeded the warn threshold */
+    public long getNumGcWarnThresholdExceeded() {
+        return numGcWarnThresholdExceeded;
+    }
+
+    /** @return how many detected pauses exceeded the info threshold but not the warn threshold */
+    public long getNumGcInfoThresholdExceeded() {
+        return numGcInfoThresholdExceeded;
+    }
+
+    /** @return the sum, in ms, of the extra sleep time measured over all iterations */
+    public long getTotalGcExtraSleepTime() {
+        return totalGcExtraSleepTime;
+    }
+
+    /**
+     * Builds the log message for one detected pause: the pause length, the
+     * running info/warn counters, and the per-collector GC activity between the
+     * two snapshots.
+     *
+     * @param extraSleepTime     how much longer than requested the sleep took, in ms
+     * @param gcTimesAfterSleep  GC counters sampled after the sleep
+     * @param gcTimesBeforeSleep GC counters sampled before the sleep
+     */
+    private String formatMessage(long extraSleepTime,
+                                 Map<String, GcTimes> gcTimesAfterSleep,
+                                 Map<String, GcTimes> gcTimesBeforeSleep) {
+
+        // Only collectors present in both snapshots can be diffed.
+        Set<String> gcBeanNames = new HashSet<>(gcTimesAfterSleep.keySet());
+        gcBeanNames.retainAll(gcTimesBeforeSleep.keySet());
+        List<String> gcDiffs = new ArrayList<>();
+
+        for (String name : gcBeanNames) {
+            GcTimes diff = gcTimesAfterSleep.get(name).subtract(gcTimesBeforeSleep.get(name));
+            if (diff.gcCount != 0) {
+                gcDiffs.add("GC pool '" + name + "' had collection(s): " + diff.toString());
+            }
+        }
+
+        String ret = String.format("Detected pause in JVM or host machine (eg GC): pause of approximately %d ms, " +
+                "total pause: info level: %d, warn level: %d %n",
+                extraSleepTime, numGcInfoThresholdExceeded, numGcWarnThresholdExceeded);
+        if (gcDiffs.isEmpty()) {
+            ret += ("No GCs detected");
+        } else {
+            ret += String.join("\n", gcDiffs);
+        }
+        return ret;
+    }
+
+    /** Samples collection count and time of every GarbageCollectorMXBean, keyed by bean name */
+    private Map<String, GcTimes> getGcTimes() {
+        Map<String, GcTimes> map = new HashMap<>();
+        List<GarbageCollectorMXBean> gcBeans = ManagementFactory.getGarbageCollectorMXBeans();
+        for (GarbageCollectorMXBean gcBean : gcBeans) {
+            map.put(gcBean.getName(), new GcTimes(gcBean));
+        }
+        return map;
+    }
+
+    /** Immutable snapshot (or difference of two snapshots) of one collector's counters */
+    private static class GcTimes {
+
+        private final long gcCount;
+        private final long gcTimeMillis;
+
+        private GcTimes(GarbageCollectorMXBean gcBean) {
+            gcCount = gcBean.getCollectionCount();
+            gcTimeMillis = gcBean.getCollectionTime();
+        }
+
+        private GcTimes(long count, long time) {
+            this.gcCount = count;
+            this.gcTimeMillis = time;
+        }
+
+        private GcTimes subtract(GcTimes other) {
+            return new GcTimes(this.gcCount - other.gcCount,
+                    this.gcTimeMillis - other.gcTimeMillis);
+        }
+
+        @Override
+        public String toString() {
+            return "count=" + gcCount + " time=" + gcTimeMillis + "ms";
+        }
+
+    }
+
+    /** Body of the monitor thread: sleep, measure the overshoot, log and count it */
+    private class JVMMonitor implements Runnable {
+        @Override
+        public void run() {
+            Map<String, GcTimes> gcTimesBeforeSleep = getGcTimes();
+            LOG.info("Starting JVM Pause Monitor with infoThresholdMs:{} warnThresholdMs:{} and sleepTimeMs:{}",
+                    infoThresholdMs, warnThresholdMs, sleepTimeMs);
+            while (shouldRun) {
+                // System.nanoTime() is monotonic; a wall-clock reading would turn
+                // a system clock adjustment into a bogus (or negative) pause.
+                long startNanos = System.nanoTime();
+                try {
+                    Thread.sleep(sleepTimeMs);
+                } catch (InterruptedException ie) {
+                    // serviceStop() interrupts the sleep to end the thread promptly.
+                    return;
+                }
+                long elapsedMs = (System.nanoTime() - startNanos) / NANOS_PER_MILLI;
+                long extraSleepTime = elapsedMs - sleepTimeMs;
+                Map<String, GcTimes> gcTimesAfterSleep = getGcTimes();
+
+                if (extraSleepTime > warnThresholdMs) {
+                    ++numGcWarnThresholdExceeded;
+                    LOG.warn(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
+                } else if (extraSleepTime > infoThresholdMs) {
+                    ++numGcInfoThresholdExceeded;
+                    LOG.info(formatMessage(extraSleepTime, gcTimesAfterSleep, gcTimesBeforeSleep));
+                }
+                totalGcExtraSleepTime += extraSleepTime;
+                gcTimesBeforeSleep = gcTimesAfterSleep;
+            }
+        }
+    }
+}

+ 23 - 0
zookeeper-server/src/test/java/org/apache/zookeeper/ServerConfigTest.java

@@ -19,12 +19,15 @@
 package org.apache.zookeeper;
 package org.apache.zookeeper;
 
 
 import org.apache.zookeeper.server.ServerConfig;
 import org.apache.zookeeper.server.ServerConfig;
+import org.apache.zookeeper.server.quorum.QuorumPeerConfig;
 import org.junit.Before;
 import org.junit.Before;
 import org.junit.Test;
 import org.junit.Test;
 
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
 
 
 import java.io.File;
 import java.io.File;
 
 
@@ -60,6 +63,26 @@ public class ServerConfigTest {
         serverConfig.parse(args);
         serverConfig.parse(args);
     }
     }
 
 
+    /**
+     * readFrom() must copy the JVM pause monitor switch, thresholds and sleep
+     * time from the QuorumPeerConfig into the ServerConfig.
+     */
+    @Test
+    public void testJvmPauseMonitorConfigured() {
+        final long expectedSleepMs = 444L;
+        final long expectedWarnMs = 5555L;
+        final long expectedInfoMs = 555L;
+
+        QuorumPeerConfig mockedConfig = mock(QuorumPeerConfig.class);
+        when(mockedConfig.isJvmPauseMonitorToRun()).thenReturn(true);
+        when(mockedConfig.getJvmPauseSleepTimeMs()).thenReturn(expectedSleepMs);
+        when(mockedConfig.getJvmPauseWarnThresholdMs()).thenReturn(expectedWarnMs);
+        when(mockedConfig.getJvmPauseInfoThresholdMs()).thenReturn(expectedInfoMs);
+
+        serverConfig.readFrom(mockedConfig);
+
+        assertEquals(expectedSleepMs, serverConfig.getJvmPauseSleepTimeMs());
+        assertEquals(expectedWarnMs, serverConfig.getJvmPauseWarnThresholdMs());
+        assertEquals(expectedInfoMs, serverConfig.getJvmPauseInfoThresholdMs());
+        assertTrue(serverConfig.isJvmPauseMonitorToRun());
+    }
+
     boolean checkEquality(String a, String b) {
     boolean checkEquality(String a, String b) {
         assertNotNull(a);
         assertNotNull(a);
         assertNotNull(b);
         assertNotNull(b);

+ 24 - 4
zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/QuorumPeerConfigTest.java

@@ -18,10 +18,6 @@
 
 
 package org.apache.zookeeper.server.quorum;
 package org.apache.zookeeper.server.quorum;
 
 
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.fail;
-
 import java.io.File;
 import java.io.File;
 import java.io.IOException;
 import java.io.IOException;
 import java.net.InetSocketAddress;
 import java.net.InetSocketAddress;
@@ -32,6 +28,8 @@ import org.apache.zookeeper.server.quorum.QuorumPeerConfig.ConfigException;
 import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
 import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
 import org.junit.Test;
 import org.junit.Test;
 
 
+import static org.junit.Assert.*;
+
 public class QuorumPeerConfigTest {
 public class QuorumPeerConfigTest {
 
 
     /**
     /**
@@ -148,6 +146,28 @@ public class QuorumPeerConfigTest {
         assertEquals(quorumPeerConfig.getClientPortAddress(), qs.clientAddr);
         assertEquals(quorumPeerConfig.getClientPortAddress(), qs.clientAddr);
     }
     }
 
 
+    /**
+     * The four jvm.pause.* properties must be parsed into the matching
+     * QuorumPeerConfig getters.
+     */
+    @Test
+    public void testJvmPauseMonitorConfigured()
+            throws IOException, ConfigException {
+        final long expectedSleepMs = 444L;
+        final long expectedWarnMs = 5555L;
+        final long expectedInfoMs = 555L;
+
+        Properties props = getDefaultZKProperties();
+        props.setProperty("dataDir", new File("myDataDir").getAbsolutePath());
+        props.setProperty("jvm.pause.monitor", "true");
+        props.setProperty("jvm.pause.sleep.time.ms", Long.toString(expectedSleepMs));
+        props.setProperty("jvm.pause.warn-threshold.ms", Long.toString(expectedWarnMs));
+        props.setProperty("jvm.pause.info-threshold.ms", Long.toString(expectedInfoMs));
+
+        QuorumPeerConfig parsedConfig = new QuorumPeerConfig();
+        parsedConfig.parseProperties(props);
+
+        assertEquals(expectedSleepMs, parsedConfig.getJvmPauseSleepTimeMs());
+        assertEquals(expectedWarnMs, parsedConfig.getJvmPauseWarnThresholdMs());
+        assertEquals(expectedInfoMs, parsedConfig.getJvmPauseInfoThresholdMs());
+        assertTrue(parsedConfig.isJvmPauseMonitorToRun());
+    }
+
     private Properties getDefaultZKProperties() {
     private Properties getDefaultZKProperties() {
         Properties zkProp = new Properties();
         Properties zkProp = new Properties();
         zkProp.setProperty("dataDir", new File("myDataDir").getAbsolutePath());
         zkProp.setProperty("dataDir", new File("myDataDir").getAbsolutePath());

+ 75 - 0
zookeeper-server/src/test/java/org/apache/zookeeper/server/util/JvmPauseMonitorTest.java

@@ -0,0 +1,75 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.zookeeper.server.util;
+
+import org.apache.zookeeper.server.quorum.QuorumPeerConfig;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Test;
+
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+/**
+ * Starts a JvmPauseMonitor with thresholds of -1 ms, so that every loop
+ * iteration whose sleep did not return early counts as a detected pause, and
+ * waits for the corresponding counter to move.
+ */
+public class JvmPauseMonitorTest {
+
+    private final Long sleepTime = 100L;
+    private final Long infoTH = -1L;
+    private final Long warnTH = -1L;
+    private JvmPauseMonitor pauseMonitor;
+
+    @Test(timeout=5000)
+    public void testJvmPauseMonitorExceedInfoThreshold() throws InterruptedException {
+        QuorumPeerConfig qpConfig = mock(QuorumPeerConfig.class);
+        when(qpConfig.getJvmPauseSleepTimeMs()).thenReturn(sleepTime);
+        when(qpConfig.getJvmPauseInfoThresholdMs()).thenReturn(infoTH);
+        // An unstubbed long getter on a mock returns 0, which would classify any
+        // overshoot above 0 ms as WARN and leave the INFO counter untouched.
+        // Put the WARN threshold out of reach so this test only sees INFO.
+        when(qpConfig.getJvmPauseWarnThresholdMs()).thenReturn(Long.MAX_VALUE);
+
+        pauseMonitor = new JvmPauseMonitor(qpConfig);
+        pauseMonitor.serviceStart();
+
+        Assert.assertEquals(sleepTime, Long.valueOf(pauseMonitor.sleepTimeMs));
+        Assert.assertEquals(infoTH, Long.valueOf(pauseMonitor.infoThresholdMs));
+
+        // Bounded by the @Test timeout.
+        while(pauseMonitor.getNumGcInfoThresholdExceeded() == 0) {
+            Thread.sleep(200);
+        }
+    }
+
+    @Test(timeout=5000)
+    public void testJvmPauseMonitorExceedWarnThreshold() throws InterruptedException {
+        QuorumPeerConfig qpConfig = mock(QuorumPeerConfig.class);
+        when(qpConfig.getJvmPauseSleepTimeMs()).thenReturn(sleepTime);
+        when(qpConfig.getJvmPauseWarnThresholdMs()).thenReturn(warnTH);
+
+        pauseMonitor = new JvmPauseMonitor(qpConfig);
+        pauseMonitor.serviceStart();
+
+        Assert.assertEquals(sleepTime, Long.valueOf(pauseMonitor.sleepTimeMs));
+        Assert.assertEquals(warnTH, Long.valueOf(pauseMonitor.warnThresholdMs));
+
+        // Bounded by the @Test timeout.
+        while(pauseMonitor.getNumGcWarnThresholdExceeded() == 0) {
+            Thread.sleep(200);
+        }
+
+    }
+
+    @After
+    public void teardown() {
+        // A test that failed before creating the monitor must not fail again here.
+        if (pauseMonitor != null) {
+            pauseMonitor.serviceStop();
+        }
+    }
+}