فهرست منبع

HDFS-10872. Add MutableRate metrics for FSNamesystemLock operations. Contributed by Erik Krogen.

Zhe Zhang 8 سال پیش
والد
کامیت
818915aabb
13فایلهای تغییر یافته به همراه278 افزوده شده و 128 حذف شده
  1. 8 0
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MetricsRegistry.java
  2. 1 3
      hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableMetricsFactory.java
  3. 4 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java
  4. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java
  5. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java
  6. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ContentSummaryComputationContext.java
  7. 2 2
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java
  8. 142 108
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java
  9. 56 3
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java
  10. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/LeaseManager.java
  11. 1 1
      hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java
  12. 8 0
      hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml
  13. 52 7
      hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java

+ 8 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MetricsRegistry.java

@@ -281,6 +281,14 @@ public class MetricsRegistry {
     return ret;
   }
 
+  public synchronized MutableRatesWithAggregation newRatesWithAggregation(
+      String name) {
+    checkMetricName(name);
+    MutableRatesWithAggregation rates = new MutableRatesWithAggregation();
+    metricsMap.put(name, rates);
+    return rates;
+  }
+
   synchronized void add(String name, MutableMetric metric) {
     checkMetricName(name);
     metricsMap.put(name, metric);

+ 1 - 3
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/lib/MutableMetricsFactory.java

@@ -68,9 +68,7 @@ public class MutableMetricsFactory {
       return new MutableRates(registry);
     }
     if (cls == MutableRatesWithAggregation.class) {
-      MutableRatesWithAggregation rates = new MutableRatesWithAggregation();
-      registry.add(info.name(), rates);
-      return rates;
+      return registry.newRatesWithAggregation(info.name());
     }
     if (cls == MutableStat.class) {
       return registry.newStat(info.name(), info.description(),

+ 4 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/DFSConfigKeys.java

@@ -391,6 +391,10 @@ public class DFSConfigKeys extends CommonConfigurationKeys {
   public static final long
       DFS_NAMENODE_MAX_LOCK_HOLD_TO_RELEASE_LEASE_MS_DEFAULT = 25;
 
+  public static final String  DFS_NAMENODE_LOCK_DETAILED_METRICS_KEY =
+      "dfs.namenode.lock.detailed-metrics.enabled";
+  public static final boolean DFS_NAMENODE_LOCK_DETAILED_METRICS_DEFAULT =
+      false;
   // Threshold for how long namenode locks must be held for the
   // event to be logged
   public static final String  DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_KEY =

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CacheManager.java

@@ -959,7 +959,7 @@ public final class CacheManager {
       processCacheReportImpl(datanode, blockIds);
     } finally {
       endTime = Time.monotonicNow();
-      namesystem.writeUnlock();
+      namesystem.writeUnlock("processCacheReport");
     }
 
     // Log the block report processing stats from Namenode perspective

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/Checkpointer.java

@@ -264,7 +264,7 @@ class Checkpointer extends Daemon {
         bnStorage.writeAll();
       }
     } finally {
-      backupNode.namesystem.writeUnlock();
+      backupNode.namesystem.writeUnlock("doCheckpoint");
     }
 
     if(cpCmd.needToReturnImage()) {

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ContentSummaryComputationContext.java

@@ -115,7 +115,7 @@ public class ContentSummaryComputationContext {
 
     // unlock
     dir.readUnlock();
-    fsn.readUnlock();
+    fsn.readUnlock("contentSummary");
 
     try {
       Thread.sleep(sleepMilliSec, sleepNanoSec);

+ 2 - 2
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLogLoader.java

@@ -151,7 +151,7 @@ public class FSEditLogLoader {
       return numEdits;
     } finally {
       edits.close();
-      fsNamesys.writeUnlock();
+      fsNamesys.writeUnlock("loadFSEdits");
       prog.endStep(Phase.LOADING_EDITS, step);
     }
   }
@@ -288,7 +288,7 @@ public class FSEditLogLoader {
         in.close();
       }
       fsDir.writeUnlock();
-      fsNamesys.writeUnlock();
+      fsNamesys.writeUnlock("loadEditRecords");
 
       if (LOG.isTraceEnabled()) {
         LOG.trace("replaying edit log finished");

تفاوت فایلی نمایش داده نمی شود زیرا این فایل بسیار بزرگ است
+ 142 - 108
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java


+ 56 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java

@@ -27,11 +27,14 @@ import java.util.concurrent.locks.ReentrantReadWriteLock;
 
 import com.google.common.annotations.VisibleForTesting;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.util.Timer;
 
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_LOCK_SUPPRESS_WARNING_INTERVAL_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_LOCK_SUPPRESS_WARNING_INTERVAL_KEY;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LOCK_DETAILED_METRICS_DEFAULT;
+import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_LOCK_DETAILED_METRICS_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_DEFAULT;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_READ_LOCK_REPORTING_THRESHOLD_MS_KEY;
 import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_WRITE_LOCK_REPORTING_THRESHOLD_MS_DEFAULT;
@@ -40,11 +43,20 @@ import static org.apache.hadoop.hdfs.DFSConfigKeys.DFS_NAMENODE_WRITE_LOCK_REPOR
 /**
  * Mimics a ReentrantReadWriteLock but does not directly implement the interface
  * so more sophisticated locking capabilities and logging/metrics are possible.
+ * If {@link org.apache.hadoop.hdfs.DFSConfigKeys#DFS_NAMENODE_LOCK_DETAILED_METRICS_KEY}
+ * is set to be true, metrics will be emitted into the FSNamesystem metrics
+ * registry for each operation which acquires this lock indicating how long
+ * the operation held the lock for. Note that if a thread dies, metrics produced
+ * after the most recent snapshot will be lost due to the use of
+ * {@link MutableRatesWithAggregation}. However since threads are re-used
+ * between operations this should not generally be an issue.
  */
 class FSNamesystemLock {
   @VisibleForTesting
   protected ReentrantReadWriteLock coarseLock;
 
+  private final boolean metricsEnabled;
+  private final MutableRatesWithAggregation detailedHoldTimeMetrics;
   private final Timer timer;
 
   /**
@@ -80,12 +92,19 @@ class FSNamesystemLock {
   private final AtomicLong timeStampOfLastReadLockReport = new AtomicLong(0);
   private final AtomicLong longestReadLockHeldInterval = new AtomicLong(0);
 
-  FSNamesystemLock(Configuration conf) {
-    this(conf, new Timer());
+  @VisibleForTesting
+  static final String OP_NAME_OTHER = "OTHER";
+  private static final String READ_LOCK_METRIC_PREFIX = "FSNReadLock";
+  private static final String WRITE_LOCK_METRIC_PREFIX = "FSNWriteLock";
+
+  FSNamesystemLock(Configuration conf,
+      MutableRatesWithAggregation detailedHoldTimeMetrics) {
+    this(conf, detailedHoldTimeMetrics, new Timer());
   }
 
   @VisibleForTesting
-  FSNamesystemLock(Configuration conf, Timer timer) {
+  FSNamesystemLock(Configuration conf,
+      MutableRatesWithAggregation detailedHoldTimeMetrics, Timer timer) {
     boolean fair = conf.getBoolean("dfs.namenode.fslock.fair", true);
     FSNamesystem.LOG.info("fsLock is fair: " + fair);
     this.coarseLock = new ReentrantReadWriteLock(fair);
@@ -100,6 +119,12 @@ class FSNamesystemLock {
     this.lockSuppressWarningInterval = conf.getTimeDuration(
         DFS_LOCK_SUPPRESS_WARNING_INTERVAL_KEY,
         DFS_LOCK_SUPPRESS_WARNING_INTERVAL_DEFAULT, TimeUnit.MILLISECONDS);
+    this.metricsEnabled = conf.getBoolean(
+        DFS_NAMENODE_LOCK_DETAILED_METRICS_KEY,
+        DFS_NAMENODE_LOCK_DETAILED_METRICS_DEFAULT);
+    FSNamesystem.LOG.info("Detailed lock hold time metrics enabled: " +
+        this.metricsEnabled);
+    this.detailedHoldTimeMetrics = detailedHoldTimeMetrics;
   }
 
   public void readLock() {
@@ -110,12 +135,17 @@ class FSNamesystemLock {
   }
 
   public void readUnlock() {
+    readUnlock(OP_NAME_OTHER);
+  }
+
+  public void readUnlock(String opName) {
     final boolean needReport = coarseLock.getReadHoldCount() == 1;
     final long readLockInterval =
         timer.monotonicNow() - readLockHeldTimeStamp.get();
     coarseLock.readLock().unlock();
 
     if (needReport) {
+      addMetric(opName, readLockInterval, false);
       readLockHeldTimeStamp.remove();
     }
     if (needReport && readLockInterval >= this.readLockReportingThreshold) {
@@ -164,6 +194,10 @@ class FSNamesystemLock {
   }
 
   public void writeUnlock() {
+    writeUnlock(OP_NAME_OTHER);
+  }
+
+  public void writeUnlock(String opName) {
     final boolean needReport = coarseLock.getWriteHoldCount() == 1 &&
         coarseLock.isWriteLockedByCurrentThread();
     final long currentTime = timer.monotonicNow();
@@ -191,6 +225,10 @@ class FSNamesystemLock {
 
     coarseLock.writeLock().unlock();
 
+    if (needReport) {
+      addMetric(opName, writeLockInterval, true);
+    }
+
     if (logReport) {
       FSNamesystem.LOG.info("FSNamesystem write lock held for " +
           writeLockInterval + " ms via\n" +
@@ -227,4 +265,19 @@ class FSNamesystemLock {
   public int getQueueLength() {
     return coarseLock.getQueueLength();
   }
+
+  /**
+   * Add the lock hold time for a recent operation to the metrics.
+   * @param operationName Name of the operation for which to record the time
+   * @param value Length of time the lock was held
+   */
+  private void addMetric(String operationName, long value, boolean isWrite) {
+    if (metricsEnabled) {
+      String metricName =
+          (isWrite ? WRITE_LOCK_METRIC_PREFIX : READ_LOCK_METRIC_PREFIX) +
+          org.apache.commons.lang.StringUtils.capitalize(operationName);
+      detailedHoldTimeMetrics.add(metricName, value);
+    }
+  }
+
 }

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/LeaseManager.java

@@ -329,7 +329,7 @@ public class LeaseManager {
               needSync = checkLeases();
             }
           } finally {
-            fsnamesystem.writeUnlock();
+            fsnamesystem.writeUnlock("leaseManager");
             // lease reassignments should to be sync'ed.
             if (needSync) {
               fsnamesystem.getEditLog().logSync();

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NamenodeFsck.java

@@ -498,7 +498,7 @@ public class NamenodeFsck implements DataEncryptionKeyFactory {
     } catch (FileNotFoundException fnfe) {
       blocks = null;
     } finally {
-      fsn.readUnlock();
+      fsn.readUnlock("fsckGetBlockLocations");
     }
     return blocks;
   }

+ 8 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/resources/hdfs-default.xml

@@ -2652,6 +2652,14 @@
   </description>
 </property>
 
+<property>
+  <name>dfs.namenode.lock.detailed-metrics.enabled</name>
+  <value>false</value>
+  <description>If true, the namenode will keep track of how long various
+    operations hold the Namesystem lock for and emit this as metrics.
+  </description>
+</property>
+
 <property>
   <name>dfs.namenode.startup.delay.block.deletion.sec</name>
   <value>0</value>

+ 52 - 7
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestFSNamesystemLock.java

@@ -21,8 +21,12 @@ package org.apache.hadoop.hdfs.server.namenode;
 import com.google.common.base.Supplier;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.metrics2.MetricsRecordBuilder;
+import org.apache.hadoop.metrics2.lib.MetricsRegistry;
+import org.apache.hadoop.metrics2.lib.MutableRatesWithAggregation;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.test.GenericTestUtils.LogCapturer;
+import org.apache.hadoop.test.MetricsAsserts;
 import org.apache.hadoop.util.FakeTimer;
 import org.apache.log4j.Level;
 import org.junit.Test;
@@ -36,6 +40,8 @@ import java.util.concurrent.TimeoutException;
 import java.util.regex.Pattern;
 
 import static org.junit.Assert.*;
+import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
+import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
 
 /**
  * Tests the FSNamesystemLock, looking at lock compatibilities and
@@ -48,17 +54,17 @@ public class TestFSNamesystemLock {
     Configuration conf = new Configuration();
 
     conf.setBoolean("dfs.namenode.fslock.fair", true);
-    FSNamesystemLock fsnLock = new FSNamesystemLock(conf);
+    FSNamesystemLock fsnLock = new FSNamesystemLock(conf, null);
     assertTrue(fsnLock.coarseLock.isFair());
 
     conf.setBoolean("dfs.namenode.fslock.fair", false);
-    fsnLock = new FSNamesystemLock(conf);
+    fsnLock = new FSNamesystemLock(conf, null);
     assertFalse(fsnLock.coarseLock.isFair());
   }
 
   @Test
   public void testFSNamesystemLockCompatibility() {
-    FSNamesystemLock rwLock = new FSNamesystemLock(new Configuration());
+    FSNamesystemLock rwLock = new FSNamesystemLock(new Configuration(), null);
 
     assertEquals(0, rwLock.getReadHoldCount());
     rwLock.readLock();
@@ -98,7 +104,7 @@ public class TestFSNamesystemLock {
     final CountDownLatch latch = new CountDownLatch(threadCount);
     final Configuration conf = new Configuration();
     conf.setBoolean("dfs.namenode.fslock.fair", true);
-    final FSNamesystemLock rwLock = new FSNamesystemLock(conf);
+    final FSNamesystemLock rwLock = new FSNamesystemLock(conf, null);
     rwLock.writeLock();
     ExecutorService helper = Executors.newFixedThreadPool(threadCount);
 
@@ -141,7 +147,7 @@ public class TestFSNamesystemLock {
         writeLockSuppressWarningInterval, TimeUnit.MILLISECONDS);
 
     final FakeTimer timer = new FakeTimer();
-    final FSNamesystemLock fsnLock = new FSNamesystemLock(conf, timer);
+    final FSNamesystemLock fsnLock = new FSNamesystemLock(conf, null, timer);
     timer.advance(writeLockSuppressWarningInterval);
 
     LogCapturer logs = LogCapturer.captureLogs(FSNamesystem.LOG);
@@ -216,7 +222,7 @@ public class TestFSNamesystemLock {
         readLockSuppressWarningInterval, TimeUnit.MILLISECONDS);
 
     final FakeTimer timer = new FakeTimer();
-    final FSNamesystemLock fsnLock = new FSNamesystemLock(conf, timer);
+    final FSNamesystemLock fsnLock = new FSNamesystemLock(conf, null, timer);
     timer.advance(readLockSuppressWarningInterval);
 
     LogCapturer logs = LogCapturer.captureLogs(FSNamesystem.LOG);
@@ -303,7 +309,7 @@ public class TestFSNamesystemLock {
     t2.join();
     // Look for the differentiating class names in the stack trace
     String stackTracePatternString =
-        String.format("INFO.+%s(.+\n){4}\\Q%%s\\E\\.run", readLockLogStmt);
+        String.format("INFO.+%s(.+\n){5}\\Q%%s\\E\\.run", readLockLogStmt);
     Pattern t1Pattern = Pattern.compile(
         String.format(stackTracePatternString, t1.getClass().getName()));
     assertTrue(t1Pattern.matcher(logs.getOutput()).find());
@@ -314,4 +320,43 @@ public class TestFSNamesystemLock {
         "Number of suppressed read-lock reports: 2"));
   }
 
+  @Test
+  public void testDetailedHoldMetrics() throws Exception {
+    Configuration conf = new Configuration();
+    conf.setBoolean(DFSConfigKeys.DFS_NAMENODE_LOCK_DETAILED_METRICS_KEY, true);
+    FakeTimer timer = new FakeTimer();
+    MetricsRegistry registry = new MetricsRegistry("Test");
+    MutableRatesWithAggregation rates =
+        registry.newRatesWithAggregation("Test");
+    FSNamesystemLock fsLock = new FSNamesystemLock(conf, rates, timer);
+
+    fsLock.readLock();
+    timer.advance(1);
+    fsLock.readUnlock("foo");
+    fsLock.readLock();
+    timer.advance(2);
+    fsLock.readUnlock("foo");
+
+    fsLock.readLock();
+    timer.advance(1);
+    fsLock.readLock();
+    timer.advance(1);
+    fsLock.readUnlock("bar");
+    fsLock.readUnlock("bar");
+
+    fsLock.writeLock();
+    timer.advance(1);
+    fsLock.writeUnlock("baz");
+
+    MetricsRecordBuilder rb = MetricsAsserts.mockMetricsRecordBuilder();
+    rates.snapshot(rb, true);
+
+    assertGauge("FSNReadLockFooAvgTime", 1.5, rb);
+    assertCounter("FSNReadLockFooNumOps", 2L, rb);
+    assertGauge("FSNReadLockBarAvgTime", 2.0, rb);
+    assertCounter("FSNReadLockBarNumOps", 1L, rb);
+    assertGauge("FSNWriteLockBazAvgTime", 1.0, rb);
+    assertCounter("FSNWriteLockBazNumOps", 1L, rb);
+  }
+
 }

برخی فایل ها در این مقایسه diff نمایش داده نمی شوند زیرا تعداد فایل ها بسیار زیاد است