Browse Source

ZOOKEEPER-1239. add logging/stats to identify fsync stalls. (phunt via camille)

git-svn-id: https://svn.apache.org/repos/asf/zookeeper/trunk@1202360 13f79535-47bb-0310-9956-ffa450edef68
Camille Fournier 13 years ago
parent
commit
dad9038097

+ 3 - 1
CHANGES.txt

@@ -55,7 +55,9 @@ BUGFIXES:
   ZOOKEEPER-1291. AcceptedEpoch not updated at leader before it proposes the epoch to followers. (Alex Shraer via camille)
 
   ZOOKEEPER-1208. Ephemeral node not removed after the client session is long gone. (phunt via camille)
-  
+ 
+  ZOOKEEPER-1239. add logging/stats to identify fsync stalls. (phunt via camille)
+   
 IMPROVEMENTS:
 
   ZOOKEEPER-1170. Fix compiler (eclipse) warnings: unused imports,

+ 15 - 0
src/docs/src/documentation/content/xdocs/zookeeperAdmin.xml

@@ -783,6 +783,21 @@ server.3=zoo3:2888:3888</programlisting>
             </listitem>
            </varlistentry>
            
+           <varlistentry>
+             <term>fsync.warningthresholdms</term>
+             <listitem>
+               <para>(Java system property: <emphasis
+               role="bold">fsync.warningthresholdms</emphasis>)</para>
+
+               <para><emphasis role="bold">New in 3.3.4:</emphasis> A
+               warning message will be output to the log whenever an
+               fsync in the Transactional Log (WAL) takes longer than
+               this value. The values is specified in milliseconds and
+               defaults to 1000. This value can only be set as a
+               system property.</para>
+             </listitem>
+           </varlistentry>
+
           <varlistentry>
             <term>autopurge.snapRetainCount</term>
 

+ 17 - 0
src/java/main/org/apache/zookeeper/server/persistence/FileTxnLog.java

@@ -31,6 +31,7 @@ import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.concurrent.TimeUnit;
 import java.util.zip.Adler32;
 import java.util.zip.Checksum;
 
@@ -96,6 +97,9 @@ public class FileTxnLog implements TxnLog {
 
     public final static int VERSION = 2;
 
+    /** Maximum time we allow for elapsed fsync before WARNing */
+    private final static long fsyncWarningThresholdMS;
+
     static {
         LOG = LoggerFactory.getLogger(FileTxnLog.class);
 
@@ -107,6 +111,7 @@ public class FileTxnLog implements TxnLog {
                 LOG.warn(size + " is not a valid value for preAllocSize");
             }
         }
+        fsyncWarningThresholdMS = Long.getLong("fsync.warningthresholdms", 1000);
     }
 
     long lastZxidSeen;
@@ -306,7 +311,19 @@ public class FileTxnLog implements TxnLog {
         for (FileOutputStream log : streamsToFlush) {
             log.flush();
             if (forceSync) {
+                long startSyncNS = System.nanoTime();
+
                 log.getChannel().force(false);
+
+                long syncElapsedMS =
+                    TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startSyncNS);
+                if (syncElapsedMS > fsyncWarningThresholdMS) {
+                    LOG.warn("fsync-ing the write ahead log in "
+                            + Thread.currentThread().getName()
+                            + " took " + syncElapsedMS
+                            + "ms which will adversely effect operation latency. "
+                            + "See the ZooKeeper troubleshooting guide");
+                }
             }
         }
         while (streamsToFlush.size() > 1) {