Bladeren bron

YARN-10208. Add capacityScheduler metric for NODE_UPDATE interval. Contributed by Pranjal Protim Borah.

bibinchundatt 5 jaren geleden
bovenliggende
commit
5dadf963d3

+ 14 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacityScheduler.java

@@ -1828,6 +1828,7 @@ public class CapacityScheduler extends
     case NODE_UPDATE:
     {
       NodeUpdateSchedulerEvent nodeUpdatedEvent = (NodeUpdateSchedulerEvent)event;
+      updateSchedulerNodeHBIntervalMetrics(nodeUpdatedEvent);
       nodeUpdate(nodeUpdatedEvent.getRMNode());
     }
     break;
@@ -2114,6 +2115,19 @@ public class CapacityScheduler extends
     }
   }
 
+  private void updateSchedulerNodeHBIntervalMetrics(
+      NodeUpdateSchedulerEvent nodeUpdatedEvent) {
+    // Add metrics for evaluating the time difference between heartbeats.
+    SchedulerNode node =
+        nodeTracker.getNode(nodeUpdatedEvent.getRMNode().getNodeID());
+    if (node != null) {
+      long lastInterval =
+          Time.monotonicNow() - node.getLastHeartbeatMonotonicTime();
+      CapacitySchedulerMetrics.getMetrics()
+          .addSchedulerNodeHBInterval(lastInterval);
+    }
+  }
+
   @Override
   protected void completedContainerInternal(
       RMContainer rmContainer, ContainerStatus containerStatus,

+ 12 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CapacitySchedulerMetrics.java

@@ -26,6 +26,7 @@ import org.apache.hadoop.metrics2.annotation.Metric;
 import org.apache.hadoop.metrics2.annotation.Metrics;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.metrics2.lib.MetricsRegistry;
+import org.apache.hadoop.metrics2.lib.MutableQuantiles;
 import org.apache.hadoop.metrics2.lib.MutableRate;
 
 import java.util.concurrent.atomic.AtomicBoolean;
@@ -49,6 +50,8 @@ public class CapacitySchedulerMetrics {
   @Metric("Scheduler commit success") MutableRate commitSuccess;
   @Metric("Scheduler commit failure") MutableRate commitFailure;
   @Metric("Scheduler node update") MutableRate nodeUpdate;
+  @Metric("Scheduler node heartbeat interval") MutableQuantiles
+      schedulerNodeHBInterval;
 
   private static volatile CapacitySchedulerMetrics INSTANCE = null;
   private static MetricsRegistry registry;
@@ -116,4 +119,13 @@ public class CapacitySchedulerMetrics {
   public long getNumOfCommitSuccess() {
     return this.commitSuccess.lastStat().numSamples();
   }
+
+  public void addSchedulerNodeHBInterval(long heartbeatInterval) {
+    schedulerNodeHBInterval.add(heartbeatInterval);
+  }
+
+  @VisibleForTesting
+  public long getNumOfSchedulerNodeHBInterval() {
+    return this.schedulerNodeHBInterval.getEstimator().getCount();
+  }
 }

+ 6 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestCapacitySchedulerMetrics.java

@@ -71,6 +71,9 @@ public class TestCapacitySchedulerMetrics {
     try {
       GenericTestUtils.waitFor(()
           -> csMetrics.getNumOfNodeUpdate() == 2, 100, 3000);
+      GenericTestUtils
+          .waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 2,
+              100, 3000);
     } catch(TimeoutException e) {
       Assert.fail("CS metrics not updated on node-update events.");
     }
@@ -101,6 +104,9 @@ public class TestCapacitySchedulerMetrics {
       // Verify HB metrics updated
       GenericTestUtils.waitFor(()
           -> csMetrics.getNumOfNodeUpdate() == 4, 100, 3000);
+      GenericTestUtils
+          .waitFor(() -> csMetrics.getNumOfSchedulerNodeHBInterval() == 4,
+              100, 3000);
       // For async mode, the number of alloc might be bigger than 1
       Assert.assertTrue(csMetrics.getNumOfAllocates() > 0);
       // But there will be only 2 successful commit (1 AM + 1 task)