Forráskód Böngészése

HDFS-11907. Add metric for time taken by NameNode resource check. Contributed by Chen Liang.

Arpit Agarwal 8 éve
szülő
commit
097b1f74e5

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -3711,9 +3711,12 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
    * Perform resource checks and cache the results.
    */
   void checkAvailableResources() {
+    long resourceCheckTime = monotonicNow(); // start timestamp; reused below to hold the elapsed time
     Preconditions.checkState(nnResourceChecker != null,
         "nnResourceChecker not initialized");
     hasResourcesAvailable = nnResourceChecker.hasAvailableDiskSpace(); // cache the result of the check in the field
+    resourceCheckTime = monotonicNow() - resourceCheckTime; // elapsed time, precondition check included (presumably ms - confirm)
+    NameNode.getNameNodeMetrics().addResourceCheckTime(resourceCheckTime); // not reached if anything above throws
   }
 
   /**

+ 10 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java

@@ -86,6 +86,7 @@ import org.apache.hadoop.util.GenericOptionsParser;
 import org.apache.hadoop.util.JvmPauseMonitor;
 import org.apache.hadoop.util.ServicePlugin;
 import org.apache.hadoop.util.StringUtils;
+import org.apache.hadoop.util.Time;
 import org.apache.htrace.core.Tracer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -366,8 +367,9 @@ public class NameNode extends ReconfigurableBase implements
   private final boolean haEnabled;
   private final HAContext haContext;
   protected final boolean allowStaleStandbyReads;
-  private AtomicBoolean started = new AtomicBoolean(false); 
+  private AtomicBoolean started = new AtomicBoolean(false);
 
+  private final static int HEALTH_MONITOR_WARN_THRESHOLD_MS = 5000;
   
   /** httpServer */
   protected NameNodeHttpServer httpServer;
@@ -1736,7 +1738,14 @@ public class NameNode extends ReconfigurableBase implements
     if (!haEnabled) {
       return; // no-op, if HA is not enabled
     }
+    long start = Time.monotonicNow();
     getNamesystem().checkAvailableResources();
+    long end = Time.monotonicNow();
+    if (end - start >= HEALTH_MONITOR_WARN_THRESHOLD_MS) {
+      // Log a warning if the resource check takes >= 5 seconds.
+      LOG.warn("Remote IP {} checking available resources took {}ms",
+          Server.getRemoteIp(), end - start);
+    }
     if (!getNamesystem().nameNodeHasResourcesAvailable()) {
       throw new HealthCheckFailedException(
           "The NameNode has no resources available");

+ 13 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/metrics/NameNodeMetrics.java

@@ -119,6 +119,8 @@ public class NameNodeMetrics {
   private final MutableQuantiles[] generateEDEKTimeQuantiles;
   @Metric("Warm-up EDEK time") private MutableRate warmUpEDEKTime;
   private final MutableQuantiles[] warmUpEDEKTimeQuantiles;
+  @Metric("Resource check time") private MutableRate resourceCheckTime;
+  private final MutableQuantiles[] resourceCheckTimeQuantiles;
 
   @Metric("Duration in SafeMode at startup in msec")
   MutableGaugeInt safeModeTime;
@@ -145,6 +147,7 @@ public class NameNodeMetrics {
     cacheReportQuantiles = new MutableQuantiles[len];
     generateEDEKTimeQuantiles = new MutableQuantiles[len];
     warmUpEDEKTimeQuantiles = new MutableQuantiles[len];
+    resourceCheckTimeQuantiles = new MutableQuantiles[len];
     
     for (int i = 0; i < len; i++) {
       int interval = intervals[i];
@@ -163,6 +166,9 @@ public class NameNodeMetrics {
       warmUpEDEKTimeQuantiles[i] = registry.newQuantiles(
           "warmupEDEKTime" + interval + "s",
           "Warm up EDEK time", "ops", "latency", interval);
+      resourceCheckTimeQuantiles[i] = registry.newQuantiles(
+          "resourceCheckTime" + interval + "s",
+          "resource check time", "ops", "latency", interval);
     }
   }
 
@@ -353,4 +359,11 @@ public class NameNodeMetrics {
       q.add(latency);
     }
   }
+
+  public void addResourceCheckTime(long latency) { // records one resource-check duration sample
+    resourceCheckTime.add(latency); // feeds the "Resource check time" MutableRate
+    for (MutableQuantiles q : resourceCheckTimeQuantiles) { // array holds one MutableQuantiles per configured interval
+      q.add(latency); // same sample goes to every interval's quantiles
+    }
+  }
 }

+ 37 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/metrics/TestNameNodeMetrics.java

@@ -22,8 +22,13 @@ import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.fs.FileSystemTestHelper;
 import org.apache.hadoop.fs.FileSystemTestWrapper;
 import org.apache.hadoop.fs.permission.FsPermission;
+import org.apache.hadoop.ha.HAServiceProtocol;
+import org.apache.hadoop.hdfs.DFSUtil;
 import org.apache.hadoop.hdfs.client.CreateEncryptionZoneFlag;
 import org.apache.hadoop.hdfs.client.HdfsAdmin;
+
+import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_DEFAULT;
+import static org.apache.hadoop.fs.CommonConfigurationKeys.HA_HM_RPC_TIMEOUT_KEY;
 import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
 import static org.apache.hadoop.test.MetricsAsserts.assertGauge;
 import static org.apache.hadoop.test.MetricsAsserts.assertQuantileGauges;
@@ -60,9 +65,11 @@ import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
+import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
 import org.apache.hadoop.metrics2.MetricsRecordBuilder;
 import org.apache.hadoop.metrics2.MetricsSource;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -684,4 +691,34 @@ public class TestNameNodeMetrics {
       }
     }
   }
+
+  @Test
+  public void testResourceCheck() throws Exception { // calls monitorHealth() over the health-monitor proxy and asserts on the resource-check quantile gauges
+    HdfsConfiguration conf = new HdfsConfiguration();
+    MiniDFSCluster tmpCluster = new MiniDFSCluster.Builder(conf) // dedicated cluster for this test
+        .numDataNodes(0) // no DataNodes are started
+        .nnTopology(MiniDFSNNTopology.simpleHATopology()) // HA topology; the visible health-check hunk returns early when HA is disabled
+        .build();
+    try {
+      MockNameNodeResourceChecker mockResourceChecker =
+          new MockNameNodeResourceChecker(conf);
+      tmpCluster.getNameNode(0).getNamesystem()
+          .setNNResourceChecker(mockResourceChecker); // install the mock checker on NameNode 0
+      NNHAServiceTarget haTarget = new NNHAServiceTarget(conf,
+          DFSUtil.getNamenodeNameServiceId(
+              new HdfsConfiguration()), "nn1"); // NOTE(review): nameservice id is read from a fresh HdfsConfiguration, not conf - confirm intended
+      HAServiceProtocol rpc = haTarget.getHealthMonitorProxy(conf, conf.getInt(
+          HA_HM_RPC_TIMEOUT_KEY, HA_HM_RPC_TIMEOUT_DEFAULT)); // RPC timeout read from conf, falling back to the default constant
+
+      MetricsRecordBuilder rb = getMetrics(NN_METRICS); // NOTE(review): fetched once, before any monitorHealth() call - confirm the assertions below can observe new samples
+      for (long i = 0; i < 10; i++) {
+        rpc.monitorHealth(); // expected to reach checkAvailableResources() on the NameNode - TODO confirm call path
+        assertQuantileGauges("ResourceCheckTime1s", rb); // asserts against the rb obtained before the loop
+      }
+    } finally {
+      if (tmpCluster != null) { // defensive guard; tmpCluster is assigned before the try block is entered
+        tmpCluster.shutdown();
+      }
+    }
+  }
 }