Browse Source

HDFS-13946. Log longest FSN write/read lock held stack trace.

Yiqun Lin 6 years ago
parent
commit
feb2664ac4

+ 16 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/log/LogThrottlingHelper.java

@@ -272,6 +272,22 @@ public class LogThrottlingHelper {
     }
   }
 
+  /**
+   * Return the summary information for given index.
+   *
+   * @param recorderName The name of the recorder.
+   * @param idx The index value.
+   * @return The summary information.
+   */
+  public SummaryStatistics getCurrentStats(String recorderName, int idx) {
+    LoggingAction currentLog = currentLogs.get(recorderName);
+    if (currentLog != null) {
+      return currentLog.getStats(idx);
+    }
+
+    return null;
+  }
+
   /**
    * A standard log action which keeps track of all of the values which have
    * been logged. This is also used for internal bookkeeping via its private

+ 3 - 0
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/log/TestLogThrottlingHelper.java

@@ -167,6 +167,9 @@ public class TestLogThrottlingHelper {
     assertEquals(2.0, bar.getStats(0).getMean(), 0.01);
     assertEquals(3.0, baz.getStats(0).getMean(), 0.01);
     assertEquals(3.0, baz.getStats(1).getMean(), 0.01);
+
+    assertEquals(2.0, helper.getCurrentStats("bar", 0).getMax(), 0);
+    assertEquals(3.0, helper.getCurrentStats("baz", 0).getMax(), 0);
   }
 
 }

+ 74 - 19
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java

@@ -21,10 +21,12 @@ package org.apache.hadoop.hdfs.server.namenode;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.atomic.AtomicReference;
 import java.util.concurrent.locks.Condition;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 
 import com.google.common.annotations.VisibleForTesting;
+import org.apache.commons.math3.stat.descriptive.SummaryStatistics;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.log.LogThrottlingHelper;
 import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
@@ -97,8 +99,18 @@ class FSNamesystemLock {
       new AtomicInteger(0);
   /** Time stamp (ms) of the last time a read lock report was written. */
   private final AtomicLong timeStampOfLastReadLockReportMs = new AtomicLong(0);
-  /** Longest time (ms) a read lock was held since the last report. */
-  private final AtomicLong longestReadLockHeldIntervalMs = new AtomicLong(0);
+  /**
+   * The info (lock held time and stack trace) when longest time (ms) a read
+   * lock was held since the last report.
+   */
+  private final AtomicReference<ReadLockHeldInfo> longestReadLockHeldInfo =
+      new AtomicReference<>(new ReadLockHeldInfo(0, null));
+
+  /**
+   * The stack trace when longest time of the write lock
+   * was held since the last report.
+   */
+  private volatile String longestWriteLockHeldStackTrace;
 
   @VisibleForTesting
   static final String OP_NAME_OTHER = "OTHER";
@@ -172,12 +184,13 @@ class FSNamesystemLock {
     final long readLockIntervalMs =
         TimeUnit.NANOSECONDS.toMillis(readLockIntervalNanos);
     if (needReport && readLockIntervalMs >= this.readLockReportingThresholdMs) {
-      long localLongestReadLock;
+      ReadLockHeldInfo localLockHeldInfo;
       do {
-        localLongestReadLock = longestReadLockHeldIntervalMs.get();
-      } while (localLongestReadLock - readLockIntervalMs < 0 &&
-          !longestReadLockHeldIntervalMs.compareAndSet(localLongestReadLock,
-              readLockIntervalMs));
+        localLockHeldInfo = longestReadLockHeldInfo.get();
+      } while (localLockHeldInfo.getIntervalMs() - readLockIntervalMs < 0 &&
+          !longestReadLockHeldInfo.compareAndSet(localLockHeldInfo,
+              new ReadLockHeldInfo(readLockIntervalMs,
+                  StringUtils.getStackTrace(Thread.currentThread()))));
 
       long localTimeStampOfLastReadLockReport;
       long nowMs;
@@ -193,13 +206,13 @@ class FSNamesystemLock {
       } while (!timeStampOfLastReadLockReportMs.compareAndSet(
           localTimeStampOfLastReadLockReport, nowMs));
       int numSuppressedWarnings = numReadLockWarningsSuppressed.getAndSet(0);
-      long longestLockHeldIntervalMs =
-          longestReadLockHeldIntervalMs.getAndSet(0);
-      FSNamesystem.LOG.info("FSNamesystem read lock held for " +
-          readLockIntervalMs + " ms via\n" +
-          StringUtils.getStackTrace(Thread.currentThread()) +
-          "\tNumber of suppressed read-lock reports: " + numSuppressedWarnings +
-          "\n\tLongest read-lock held interval: " + longestLockHeldIntervalMs);
+      ReadLockHeldInfo lockHeldInfo = longestReadLockHeldInfo
+          .getAndSet(new ReadLockHeldInfo(0, null));
+      FSNamesystem.LOG.info(
+          "\tNumber of suppressed read-lock reports: {}" +
+          "\n\tLongest read-lock held interval: {}ms via {}",
+          numSuppressedWarnings, lockHeldInfo.getIntervalMs(),
+          lockHeldInfo.getStackTrace());
     }
   }
   
@@ -255,6 +268,14 @@ class FSNamesystemLock {
     LogAction logAction = LogThrottlingHelper.DO_NOT_LOG;
     if (needReport &&
         writeLockIntervalMs >= this.writeLockReportingThresholdMs) {
+      SummaryStatistics currentStats =
+          writeLockReportLogger.getCurrentStats("write", 0);
+      double currentMaxTime = currentStats != null ? currentStats.getMax() : 0;
+      if (currentMaxTime < writeLockIntervalMs) {
+        longestWriteLockHeldStackTrace =
+            StringUtils.getStackTrace(Thread.currentThread());
+      }
+
       logAction = writeLockReportLogger
           .record("write", currentTimeMs, writeLockIntervalMs);
     }
@@ -266,12 +287,12 @@ class FSNamesystemLock {
     }
 
     if (logAction.shouldLog()) {
-      FSNamesystem.LOG.info("FSNamesystem write lock held for {} ms via {}\t" +
-          "Number of suppressed write-lock reports: {}\n\tLongest write-lock " +
-          "held interval: {} \n\tTotal suppressed write-lock held time: {}",
-          writeLockIntervalMs,
-          StringUtils.getStackTrace(Thread.currentThread()),
+      FSNamesystem.LOG.info(
+          "\tNumber of suppressed write-lock reports: {}" +
+          "\n\tLongest write-lock held interval: {}ms via {} " +
+          "\n\tTotal suppressed write-lock held time: {}",
           logAction.getCount() - 1, logAction.getStats(0).getMax(),
+          longestWriteLockHeldStackTrace,
           logAction.getStats(0).getSum() - writeLockIntervalMs);
     }
   }
@@ -323,4 +344,38 @@ class FSNamesystemLock {
         org.apache.commons.lang3.StringUtils.capitalize(operationName) +
         LOCK_METRIC_SUFFIX;
   }
+
+  /**
+   * Read lock Held Info.
+   */
+  private static class ReadLockHeldInfo {
+    /** Read lock held time. */
+    private Long intervalMs;
+    /** The stack trace read lock was held. */
+    private String stackTrace;
+
+    ReadLockHeldInfo(long intervalMs, String stackTrace) {
+      this.intervalMs = intervalMs;
+      this.stackTrace = stackTrace;
+    }
+
+    public Long getIntervalMs() {
+      return this.intervalMs;
+    }
+
+    public String getStackTrace() {
+      return this.stackTrace;
+    }
+
+    @Override
+    public int hashCode() {
+      return this.intervalMs.hashCode();
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+      return obj instanceof ReadLockHeldInfo && ((ReadLockHeldInfo) obj)
+          .getIntervalMs().compareTo(intervalMs) == 0;
+    }
+  }
 }

+ 33 - 8
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java

@@ -202,7 +202,11 @@ public class TestFSNamesystemLock {
     timer.advance(writeLockReportingThreshold + 100);
     logs.clearOutput();
     fsnLock.writeUnlock();
+    // look for the method name in the stack trace
     assertTrue(logs.getOutput().contains(GenericTestUtils.getMethodName()));
+    // find the held interval time in the log
+    Pattern pattern = Pattern.compile(".*[\n].*\\d+ms(.*[\n].*){1,}");
+    assertTrue(pattern.matcher(logs.getOutput()).find());
     assertTrue(logs.getOutput().contains(
         "Number of suppressed write-lock reports: 2"));
   }
@@ -215,7 +219,7 @@ public class TestFSNamesystemLock {
   public void testFSReadLockLongHoldingReport() throws Exception {
     final long readLockReportingThreshold = 100L;
     final long readLockSuppressWarningInterval = 10000L;
-    final String readLockLogStmt = "FSNamesystem read lock held for ";
+    final String readLockLogStmt = "Number of suppressed read-lock reports";
     Configuration conf = new Configuration();
     conf.setLong(
         DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY,
@@ -256,6 +260,18 @@ public class TestFSNamesystemLock {
     // Track but do not Report if it's held for a long time when re-entering
     // read lock but time since last report does not exceed the suppress
     // warning interval
+    Thread tLong = new Thread() {
+      @Override
+      public void run() {
+        fsnLock.readLock();
+        // Add one lock hold which is the longest, but occurs under a different
+        // stack trace, to ensure this is the one that gets logged
+        timer.advance(readLockReportingThreshold + 20);
+        fsnLock.readUnlock();
+      }
+    };
+    tLong.start();
+    tLong.join();
     fsnLock.readLock();
     timer.advance(readLockReportingThreshold / 2 + 1);
     fsnLock.readLock();
@@ -268,6 +284,18 @@ public class TestFSNamesystemLock {
     fsnLock.readUnlock();
     assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()) &&
         logs.getOutput().contains(readLockLogStmt));
+    timer.advance(readLockSuppressWarningInterval);
+    fsnLock.readLock();
+    timer.advance(readLockReportingThreshold + 1);
+    fsnLock.readUnlock();
+    // Assert that stack trace eventually logged is the one for the longest hold
+    String stackTracePatternString =
+        String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt);
+    Pattern tLongPattern = Pattern.compile(
+        String.format(stackTracePatternString, tLong.getClass().getName()));
+    assertTrue(tLongPattern.matcher(logs.getOutput()).find());
+    assertTrue(logs.getOutput().contains(
+        "Number of suppressed read-lock reports: 3"));
 
     // Report if it's held for a long time (and time since last report
     // exceeds the suppress warning interval) while another thread also has the
@@ -310,16 +338,15 @@ public class TestFSNamesystemLock {
     t1.join();
     t2.join();
     // Look for the differentiating class names in the stack trace
-    String stackTracePatternString =
-        String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt);
     Pattern t1Pattern = Pattern.compile(
         String.format(stackTracePatternString, t1.getClass().getName()));
     assertTrue(t1Pattern.matcher(logs.getOutput()).find());
     Pattern t2Pattern = Pattern.compile(
         String.format(stackTracePatternString, t2.getClass().getName()));
     assertFalse(t2Pattern.matcher(logs.getOutput()).find());
-    assertTrue(logs.getOutput().contains(
-        "Number of suppressed read-lock reports: 2"));
+    // match the held interval time in the log
+    Pattern pattern = Pattern.compile(".*[\n].*\\d+ms(.*[\n].*){1,}");
+    assertTrue(pattern.matcher(logs.getOutput()).find());
   }
 
   @Test
@@ -396,7 +423,7 @@ public class TestFSNamesystemLock {
     timer.advance(writeLockReportingThreshold + 100);
     fsnLock.writeUnlock();
     assertTrue(logs.getOutput().contains(
-        "FSNamesystem write lock held for"));
+        "Number of suppressed write-lock reports"));
 
     logs.clearOutput();
 
@@ -407,8 +434,6 @@ public class TestFSNamesystemLock {
     assertFalse(logs.getOutput().contains(GenericTestUtils.getMethodName()));
     assertFalse(logs.getOutput().contains(
         "Number of suppressed write-lock reports:"));
-    assertFalse(logs.getOutput().contains(
-        "FSNamesystem write lock held for"));
   }
 
 }