Переглянути джерело

ZOOKEEPER-3019: add metric for slow fsyncs count

Added metric to count fsyncWarningThresholdMS exceed in FileTxnLog.commit(). Both jmx bean and Command can be used to get the stat.

Also updated the python standalone client and documentation.

Tested the py script, i.e.:
src/contrib/monitoring/check_zookeeper.py -o cacti -s "localhost:2181" -k zk_num_alive_connections
localhost_2181:1

rc/contrib/monitoring/check_zookeeper.py -o cacti -s "localhost:2181" -k zk_fsync_threshold_exceed_count
localhost_2181:0

mntr command now returns:
zk_version	3.6.0-SNAPSHOT-13c9f899ff62be6a2eacb28e522d06703f1dab08, built on 04/04/2018 12:09 GMT
zk_avg_latency	0
zk_max_latency	0
zk_min_latency	0
zk_packets_received	7
zk_packets_sent	6
zk_num_alive_connections	1
zk_outstanding_requests	0
zk_server_state	standalone
zk_znode_count	5
zk_watch_count	0
zk_ephemerals_count	0
zk_approximate_data_size	44
zk_open_file_descriptor_count	61
zk_max_file_descriptor_count	10240
zk_fsync_threshold_exceed_count	0

Author: Norbert Kalmar <nkalmar@yahoo.com>

Reviewers: Andor Molnar <andor@apache.org>

Closes #501 from nkalmar/ZOOKEEPER-3019 and squashes the following commits:

067edf6a [Norbert Kalmar] ZOOKEEPER-3019 remove redundant log
7430dbce [Norbert Kalmar] ZOOKEEPER-3019 remove metric from 4ltr command as no longer supported
1916a418 [Norbert Kalmar] ZOOKEEPER-3019 add extra warning log
761450f9 [Norbert Kalmar] ZOOKEEPER-3019 remove fsync from py script stat check
3b22bb7b [Norbert Kalmar] ZOOKEEPER-3019 remove fsyncEsceed from stats
0be003a8 [Norbert Kalmar] ZOOKEEPER-3019 exclude serverStats IS chec from findbugs
b48d2958 [Norbert Kalmar] ZOOKEEPER-3019 add unit test and fix PR comments
fb4c7a31 [Norbert Kalmar] ZOOKEEPER-3019 add metric to track slow fsyncs count + update py script and docs
Norbert Kalmar 7 роки тому
батько
коміт
a2623a625a

+ 2 - 0
build.xml

@@ -230,6 +230,8 @@ xmlns:cs="antlib:com.puppycrawl.tools.checkstyle.ant">
     <property name="javadoc-jar" value="${dist.maven.dir}/${final.name}-javadoc.jar"/>
 
 
+    <property name="hamcrest.version" value="1.3"/>
+
     <!-- ====================================================== -->
     <!-- Macro definitions                                      -->
     <!-- ====================================================== -->

+ 3 - 0
ivy.xml

@@ -136,6 +136,9 @@
 
     <dependency org="org.openclover" name="clover" rev="${clover.version}" conf="clover->default"/>
 
+    <dependency org="org.hamcrest" name="hamcrest-all" rev="${hamcrest.version}"
+                  conf="test->default" />
+
     <conflict manager="strict"/>
 
   </dependencies>

+ 1 - 0
src/contrib/monitoring/README

@@ -26,6 +26,7 @@ zk_max_latency  1788
 zk_min_latency  0
 zk_packets_received 385466
 zk_packets_sent 435364
+zk_num_alive_connections	1
 zk_outstanding_requests 0
 zk_server_state follower
 zk_znode_count  5

+ 30 - 0
src/contrib/monitoring/check_zookeeper.py

@@ -241,6 +241,11 @@ class ZooKeeperServer(object):
                 result['zk_packets_sent'] = int(m.group(1))
                 continue
 
+            m = re.match('Alive connections: (\d+)', line)
+            if m is not None:
+                result['zk_num_alive_connections'] = int(m.group(1))
+                continue
+
             m = re.match('Outstanding: (\d+)', line)
             if m is not None:
                 result['zk_outstanding_requests'] = int(m.group(1))
@@ -256,6 +261,31 @@ class ZooKeeperServer(object):
                 result['zk_znode_count'] = int(m.group(1))
                 continue
 
+            m = re.match('Watch count: (\d+)', line)
+            if m is not None:
+                result['zk_watch_count'] = int(m.group(1))
+                continue
+
+            m = re.match('Ephemerals count: (\d+)', line)
+            if m is not None:
+                result['zk_ephemerals_count'] = int(m.group(1))
+                continue
+
+            m = re.match('Approximate data size: (\d+)', line)
+            if m is not None:
+                result['zk_approximate_data_size'] = int(m.group(1))
+                continue
+
+            m = re.match('Open file descriptor count: (\d+)', line)
+            if m is not None:
+                result['zk_open_file_descriptor_count'] = int(m.group(1))
+                continue
+
+            m = re.match('Max file descriptor count: (\d+)', line)
+            if m is not None:
+                result['zk_max_file_descriptor_count'] = int(m.group(1))
+                continue
+
             m = re.match('Zxid: (0x[0-9a-fA-F]+)', line)
             if m is not None:
                 result['zk_zxid']         = m.group(1)

+ 1 - 0
src/contrib/monitoring/ganglia/zookeeper.pyconf

@@ -35,6 +35,7 @@ collection_group {
   metric { name = "zk_min_latency" }
   metric { name = "zk_packets_received" }
   metric { name = "zk_packets_sent" }
+  metric { name = "zk_num_alive_connections" }
   metric { name = "zk_outstanding_requests" }
   metric { name = "zk_znode_count" }
   metric { name = "zk_watch_count" }

+ 1 - 0
src/contrib/monitoring/ganglia/zookeeper_ganglia.py

@@ -204,6 +204,7 @@ def metric_init(params=None):
             'value_type': 'double',
             'format': '%f'
         },
+        'zk_num_alive_connections': {'units': 'connections'},
         'zk_outstanding_requests': {'units': 'connections'},
         'zk_znode_count': {'units': 'znodes'},
         'zk_watch_count': {'units': 'watches'},

+ 1 - 0
src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml

@@ -1921,6 +1921,7 @@ server.3=zoo3:2888:3888</programlisting>
               zk_min_latency  0
               zk_packets_received 70
               zk_packets_sent 69
+              zk_num_alive_connections	1
               zk_outstanding_requests 0
               zk_server_state leader
               zk_znode_count   4

+ 16 - 0
src/java/main/org/apache/zookeeper/server/ServerStats.java

@@ -22,6 +22,8 @@ package org.apache.zookeeper.server;
 
 import org.apache.zookeeper.common.Time;
 
+import java.util.concurrent.atomic.AtomicLong;
+
 /**
  * Basic Server Statistics
  */
@@ -32,6 +34,7 @@ public class ServerStats {
     private long minLatency = Long.MAX_VALUE;
     private long totalLatency = 0;
     private long count = 0;
+    private AtomicLong fsyncThresholdExceedCount = new AtomicLong(0);
 
     private final Provider provider;
 
@@ -148,6 +151,19 @@ public class ServerStats {
         packetsReceived = 0;
         packetsSent = 0;
     }
+
+    public long getFsyncThresholdExceedCount() {
+        return fsyncThresholdExceedCount.get();
+    }
+
+    public void incrementFsyncThresholdExceedCount() {
+        fsyncThresholdExceedCount.incrementAndGet();
+    }
+
+    public void resetFsyncThresholdExceedCount() {
+        fsyncThresholdExceedCount.set(0);
+    }
+
     synchronized public void reset() {
         resetLatency();
         resetRequestCounters();

+ 1 - 0
src/java/main/org/apache/zookeeper/server/ZooKeeperServer.java

@@ -153,6 +153,7 @@ public class ZooKeeperServer implements SessionExpirer, ServerStats.Provider {
             int minSessionTimeout, int maxSessionTimeout, ZKDatabase zkDb) {
         serverStats = new ServerStats(this);
         this.txnLogFactory = txnLogFactory;
+        this.txnLogFactory.setServerStats(this.serverStats);
         this.zkDb = zkDb;
         this.tickTime = tickTime;
         setMinSessionTimeout(minSessionTimeout);

+ 9 - 0
src/java/main/org/apache/zookeeper/server/ZooKeeperServerBean.java

@@ -127,6 +127,10 @@ public class ZooKeeperServerBean implements ZooKeeperServerMXBean, ZKMBeanInfo {
     public long getPacketsSent() {
         return zks.serverStats().getPacketsSent();
     }
+
+    public long getFsyncThresholdExceedCount() {
+        return zks.serverStats().getFsyncThresholdExceedCount();
+    }
     
     public void resetLatency() {
         zks.serverStats().resetLatency();
@@ -136,10 +140,15 @@ public class ZooKeeperServerBean implements ZooKeeperServerMXBean, ZKMBeanInfo {
         zks.serverStats().resetMaxLatency();
     }
 
+    public void resetFsyncThresholdExceedCount() {
+        zks.serverStats().resetFsyncThresholdExceedCount();
+    }
+
     public void resetStatistics() {
         ServerStats serverStats = zks.serverStats();
         serverStats.resetRequestCounters();
         serverStats.resetLatency();
+        serverStats.resetFsyncThresholdExceedCount();
     }
 
     public long getNumAliveConnections() {

+ 8 - 0
src/java/main/org/apache/zookeeper/server/ZooKeeperServerMXBean.java

@@ -54,6 +54,10 @@ public interface ZooKeeperServerMXBean {
      * @return number of packets sent so far
      */
     public long getPacketsSent();
+    /**
+     * @return number of fsync threshold exceeds so far
+     */
+    public long getFsyncThresholdExceedCount();
     /**
      * @return number of outstanding requests.
      */
@@ -103,6 +107,10 @@ public interface ZooKeeperServerMXBean {
      * Reset max latency statistics only.
      */
     public void resetMaxLatency();
+    /**
+     * Reset Fsync Threshold Exceed Count statistics only.
+     */
+    public void resetFsyncThresholdExceedCount();
     /**
      * @return number of alive client connections
      */

+ 1 - 0
src/java/main/org/apache/zookeeper/server/ZooKeeperServerMain.java

@@ -124,6 +124,7 @@ public class ZooKeeperServerMain {
             txnLog = new FileTxnSnapLog(config.dataLogDir, config.dataDir);
             final ZooKeeperServer zkServer = new ZooKeeperServer(txnLog,
                     config.tickTime, config.minSessionTimeout, config.maxSessionTimeout, null);
+            txnLog.setServerStats(zkServer.serverStats());
 
             // Registers shutdown handler which will be used to know the
             // server error or shutdown state changes.

+ 1 - 0
src/java/main/org/apache/zookeeper/server/admin/Commands.java

@@ -290,6 +290,7 @@ public class Commands {
      *   - "approximate_data_size": Long
      *   - "open_file_descriptor_count": Long (unix only)
      *   - "max_file_descritpor_count": Long (unix only)
+     *   - "fsync_threshold_exceed_count": Long
      *   - "followers": Integer (leader only)
      *   - "synced_followers": Integer (leader only)
      *   - "pending_syncs": Integer (leader only)

+ 15 - 0
src/java/main/org/apache/zookeeper/server/persistence/FileTxnLog.java

@@ -41,6 +41,7 @@ import org.apache.jute.BinaryOutputArchive;
 import org.apache.jute.InputArchive;
 import org.apache.jute.OutputArchive;
 import org.apache.jute.Record;
+import org.apache.zookeeper.server.ServerStats;
 import org.apache.zookeeper.server.util.SerializeUtils;
 import org.apache.zookeeper.txn.TxnHeader;
 import org.slf4j.Logger;
@@ -124,6 +125,8 @@ public class FileTxnLog implements TxnLog {
     File logFileWrite = null;
     private FilePadding filePadding = new FilePadding();
 
+    private ServerStats serverStats;
+
     private volatile long syncElapsedMS = -1L;
 
     /**
@@ -144,6 +147,15 @@ public class FileTxnLog implements TxnLog {
         FilePadding.setPreallocSize(size);
     }
 
+    /**
+     * Setter for ServerStats to monitor fsync threshold exceed
+     * @param serverStats used to update fsyncThresholdExceedCount
+     */
+    @Override
+    public void setServerStats(ServerStats serverStats) {
+        this.serverStats = serverStats;
+    }
+
     /**
      * creates a checksum algorithm to be used
      * @return the checksum used for this txnlog
@@ -320,6 +332,9 @@ public class FileTxnLog implements TxnLog {
 
                 syncElapsedMS = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startSyncNS);
                 if (syncElapsedMS > fsyncWarningThresholdMS) {
+                    if(serverStats != null) {
+                        serverStats.incrementFsyncThresholdExceedCount();
+                    }
                     LOG.warn("fsync-ing the write ahead log in "
                             + Thread.currentThread().getName()
                             + " took " + syncElapsedMS

+ 6 - 1
src/java/main/org/apache/zookeeper/server/persistence/FileTxnSnapLog.java

@@ -33,6 +33,7 @@ import org.apache.zookeeper.ZooDefs.OpCode;
 import org.apache.zookeeper.server.DataTree;
 import org.apache.zookeeper.server.DataTree.ProcessTxnResult;
 import org.apache.zookeeper.server.Request;
+import org.apache.zookeeper.server.ServerStats;
 import org.apache.zookeeper.server.ZooTrace;
 import org.apache.zookeeper.server.persistence.TxnLog.TxnIterator;
 import org.apache.zookeeper.txn.CreateSessionTxn;
@@ -151,6 +152,10 @@ public class FileTxnSnapLog {
                 ZOOKEEPER_DB_AUTOCREATE_DEFAULT));
     }
 
+    public void setServerStats(ServerStats serverStats) {
+        txnLog.setServerStats(serverStats);
+    }
+
     private void checkLogDir() throws LogDirContentCheckException {
         File[] files = this.dataDir.listFiles(new FilenameFilter() {
             @Override
@@ -205,7 +210,7 @@ public class FileTxnSnapLog {
      * @throws IOException
      */
     public long restore(DataTree dt, Map<Long, Integer> sessions,
-            PlayBackListener listener) throws IOException {
+                        PlayBackListener listener) throws IOException {
         long deserializeResult = snapLog.deserialize(dt, sessions);
         FileTxnLog txnLog = new FileTxnLog(dataDir);
         boolean trustEmptyDB;

+ 7 - 0
src/java/main/org/apache/zookeeper/server/persistence/TxnLog.java

@@ -21,6 +21,7 @@ package org.apache.zookeeper.server.persistence;
 import java.io.IOException;
 
 import org.apache.jute.Record;
+import org.apache.zookeeper.server.ServerStats;
 import org.apache.zookeeper.txn.TxnHeader;
 
 /**
@@ -28,6 +29,12 @@ import org.apache.zookeeper.txn.TxnHeader;
  *
  */
 public interface TxnLog {
+
+    /**
+     * Setter for ServerStats to monitor fsync threshold exceed
+     * @param serverStats used to update fsyncThresholdExceedCount
+     */
+    void setServerStats(ServerStats serverStats);
     
     /**
      * roll the current

+ 5 - 0
src/java/test/config/findbugsExcludeFile.xml

@@ -86,6 +86,11 @@
      <Field name="lastProposed"/>
      <Bug code="IS"/>
   </Match>
+  <Match>
+    <Class name="org.apache.zookeeper.server.persistence.FileTxnLog"/>
+    <Field name="serverStats"/>
+    <Bug code="IS"/>
+  </Match>
   <Match>
      <Class name="org.apache.zookeeper.server.quorum.LearnerSessionTracker"/>
        <Bug code="UrF"/>

+ 145 - 0
src/java/test/org/apache/zookeeper/server/ServerStatsTest.java

@@ -0,0 +1,145 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.zookeeper.server;
+
+import org.apache.zookeeper.ZKTestCase;
+import org.apache.zookeeper.common.Time;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.mockito.Mockito.mock;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
+
+public class ServerStatsTest extends ZKTestCase {
+
+    private ServerStats.Provider providerMock;
+
+    @Before
+    public void setUp() {
+        providerMock = mock(ServerStats.Provider.class);
+    }
+
+    @Test
+    public void testPacketsMetrics() {
+        // Given ...
+        ServerStats serverStats = new ServerStats(providerMock);
+        int incrementCount = 20;
+
+        // When increment ...
+        for (int i = 0; i < incrementCount; i++) {
+            serverStats.incrementPacketsSent();
+            serverStats.incrementPacketsReceived();
+            serverStats.incrementPacketsReceived();
+        }
+
+        // Then ...
+        Assert.assertEquals(incrementCount, serverStats.getPacketsSent());
+        Assert.assertEquals(incrementCount*2, serverStats.getPacketsReceived());
+
+        // When reset ...
+        serverStats.resetRequestCounters();
+
+        // Then ...
+        assertAllPacketsZero(serverStats);
+
+    }
+
+    @Test
+    public void testLatencyMetrics() {
+        // Given ...
+        ServerStats serverStats = new ServerStats(providerMock);
+
+        // When incremented...
+        serverStats.updateLatency(Time.currentElapsedTime()-1000);
+        serverStats.updateLatency(Time.currentElapsedTime()-2000);
+
+        // Then ...
+        assertThat("Max latency check", 2000L,
+                greaterThanOrEqualTo(serverStats.getMaxLatency()));
+        assertThat("Min latency check", 1000L,
+                greaterThanOrEqualTo(serverStats.getMinLatency()));
+        assertThat("Avg latency check", 1500L,
+                greaterThanOrEqualTo(serverStats.getAvgLatency()));
+
+        // When reset...
+        serverStats.resetLatency();
+
+        // Then ...
+        assertAllLatencyZero(serverStats);
+    }
+
+    @Test
+    public void testFsyncThresholdExceedMetrics() {
+        // Given ...
+        ServerStats serverStats = new ServerStats(providerMock);
+        int incrementCount = 30;
+
+        // When increment ...
+        for (int i = 0; i < incrementCount; i++) {
+            serverStats.incrementFsyncThresholdExceedCount();
+        }
+
+        // Then ...
+        Assert.assertEquals(incrementCount, serverStats.getFsyncThresholdExceedCount());
+
+        // When reset ...
+        serverStats.resetFsyncThresholdExceedCount();
+
+        // Then ...
+        assertFsyncThresholdExceedCountZero(serverStats);
+
+    }
+
+    @Test
+    public void testReset() {
+        // Given ...
+        ServerStats serverStats = new ServerStats(providerMock);
+
+        assertAllPacketsZero(serverStats);
+        assertAllLatencyZero(serverStats);
+
+        // When ...
+        serverStats.incrementPacketsSent();
+        serverStats.incrementPacketsReceived();
+        serverStats.updateLatency(Time.currentElapsedTime()-1000);
+
+        serverStats.reset();
+
+        // Then ...
+        assertAllPacketsZero(serverStats);
+        assertAllLatencyZero(serverStats);
+    }
+
+    private void assertAllPacketsZero(ServerStats serverStats) {
+        Assert.assertEquals(0L, serverStats.getPacketsSent());
+        Assert.assertEquals(0L, serverStats.getPacketsReceived());
+    }
+
+    private void assertAllLatencyZero(ServerStats serverStats) {
+        Assert.assertEquals(0L, serverStats.getMaxLatency());
+        Assert.assertEquals(0L, serverStats.getMinLatency());
+        Assert.assertEquals(0L, serverStats.getAvgLatency());
+    }
+
+    private void assertFsyncThresholdExceedCountZero(ServerStats serverStats) {
+        Assert.assertEquals(0L, serverStats.getFsyncThresholdExceedCount());
+    }
+}

+ 27 - 0
src/java/test/org/apache/zookeeper/server/persistence/FileTxnLogTest.java

@@ -19,6 +19,7 @@ package org.apache.zookeeper.server.persistence;
 
 import org.apache.zookeeper.ZKTestCase;
 import org.apache.zookeeper.ZooDefs;
+import org.apache.zookeeper.server.ServerStats;
 import org.apache.zookeeper.test.ClientBase;
 import org.apache.zookeeper.txn.CreateTxn;
 import org.apache.zookeeper.txn.TxnHeader;
@@ -33,6 +34,7 @@ import java.util.Arrays;
 
 import static org.hamcrest.core.Is.is;
 import static org.hamcrest.core.IsEqual.equalTo;
+import static org.mockito.Mockito.mock;
 
 public class FileTxnLogTest  extends ZKTestCase {
   protected static final Logger LOG = LoggerFactory.getLogger(FileTxnLogTest.class);
@@ -108,4 +110,29 @@ public class FileTxnLogTest  extends ZKTestCase {
     FileTxnLog.setPreallocSize(customPreallocSize);
     Assert.assertThat(FilePadding.getPreAllocSize(), is(equalTo(customPreallocSize)));
   }
+
+  public void testSyncThresholdExceedCount() throws IOException {
+    // Given ...
+
+    // Set threshold to -1, as after the first commit it takes 0ms to commit to disk.
+    java.lang.System.setProperty("zookeeper.fsync.warningthresholdms", "-1");
+    ServerStats.Provider providerMock = mock(ServerStats.Provider.class);
+    ServerStats serverStats = new ServerStats(providerMock);
+
+    File logDir = ClientBase.createTmpDir();
+    FileTxnLog fileTxnLog = new FileTxnLog(logDir);
+    fileTxnLog.setServerStats(serverStats);
+
+    // Verify serverStats is 0 before any commit
+    Assert.assertEquals(0L, serverStats.getFsyncThresholdExceedCount());
+
+    // When ...
+    for (int i = 0; i < 50; i++) {
+      fileTxnLog.append(new TxnHeader(1, 1, 1, 1, ZooDefs.OpCode.create),
+              new CreateTxn("/testFsyncThresholdCountIncreased", new byte[]{}, ZooDefs.Ids.OPEN_ACL_UNSAFE, false, 0));
+      fileTxnLog.commit(); // only 1 commit, otherwise it will be flaky
+      // Then ... verify serverStats is updated to the number of commits (as threshold is set to 0)
+      Assert.assertEquals((long) i + 1 , serverStats.getFsyncThresholdExceedCount());
+    }
+  }
 }