Przeglądaj źródła

YARN-10893. Adding metrics for getClusterMetrics and getApplications APIs in FederationClientInterceptor (#3325)

Akshat Bordia 3 lat temu
rodzic
commit
dee6dc2f89

+ 33 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/RouterMetrics.java

@@ -53,6 +53,8 @@ public final class RouterMetrics {
   private MutableGaugeInt numMultipleAppsFailedRetrieved;
   @Metric("# of applicationAttempt reports failed to be retrieved")
   private MutableGaugeInt numAppAttemptsFailedRetrieved;
+  @Metric("# of getClusterMetrics failed to be retrieved")
+  private MutableGaugeInt numGetClusterMetricsFailedRetrieved;
 
   // Aggregate metrics are shared, and don't have to be looked up per call
   @Metric("Total number of successful Submitted apps and latency(ms)")
@@ -69,6 +71,9 @@ public final class RouterMetrics {
   @Metric("Total number of successful Retrieved " +
           "appAttempt reports and latency(ms)")
   private MutableRate totalSucceededAppAttemptsRetrieved;
+  @Metric("Total number of successful Retrieved getClusterMetrics and "
+      + "latency(ms)")
+  private MutableRate totalSucceededGetClusterMetricsRetrieved;
 
 
   /**
@@ -80,6 +85,7 @@ public final class RouterMetrics {
   private MutableQuantiles getApplicationReportLatency;
   private MutableQuantiles getApplicationsReportLatency;
   private MutableQuantiles getApplicationAttemptReportLatency;
+  private MutableQuantiles getClusterMetricsLatency;
 
   private static volatile RouterMetrics INSTANCE = null;
   private static MetricsRegistry registry;
@@ -103,6 +109,9 @@ public final class RouterMetrics {
         registry.newQuantiles("getApplicationAttemptReportLatency",
                     "latency of get applicationattempt " +
                             "report", "ops", "latency", 10);
+    getClusterMetricsLatency =
+        registry.newQuantiles("getClusterMetricsLatency",
+            "latency of get cluster metrics", "ops", "latency", 10);
   }
 
   public static RouterMetrics getMetrics() {
@@ -154,6 +163,11 @@ public final class RouterMetrics {
     return totalSucceededMultipleAppsRetrieved.lastStat().numSamples();
   }
 
+  @VisibleForTesting
+  public long getNumSucceededGetClusterMetricsRetrieved(){
+    return totalSucceededGetClusterMetricsRetrieved.lastStat().numSamples();
+  }
+
   @VisibleForTesting
   public double getLatencySucceededAppsCreated() {
     return totalSucceededAppsCreated.lastStat().mean();
@@ -184,6 +198,11 @@ public final class RouterMetrics {
     return totalSucceededMultipleAppsRetrieved.lastStat().mean();
   }
 
+  @VisibleForTesting
+  public double getLatencySucceededGetClusterMetricsRetrieved() {
+    return totalSucceededGetClusterMetricsRetrieved.lastStat().mean();
+  }
+
   @VisibleForTesting
   public int getAppsFailedCreated() {
     return numAppsFailedCreated.value();
@@ -214,6 +233,11 @@ public final class RouterMetrics {
     return numMultipleAppsFailedRetrieved.value();
   }
 
+  @VisibleForTesting
+  public int getClusterMetricsFailedRetrieved() {
+    return numGetClusterMetricsFailedRetrieved.value();
+  }
+
   public void succeededAppsCreated(long duration) {
     totalSucceededAppsCreated.add(duration);
     getNewApplicationLatency.add(duration);
@@ -244,6 +268,11 @@ public final class RouterMetrics {
     getApplicationAttemptReportLatency.add(duration);
   }
 
+  public void succeededGetClusterMetricsRetrieved(long duration) {
+    totalSucceededGetClusterMetricsRetrieved.add(duration);
+    getClusterMetricsLatency.add(duration);
+  }
+
   public void incrAppsFailedCreated() {
     numAppsFailedCreated.incr();
   }
@@ -268,4 +297,8 @@ public final class RouterMetrics {
     numAppAttemptsFailedRetrieved.incr();
   }
 
+  public void incrGetClusterMetricsFailedRetrieved() {
+    numGetClusterMetricsFailedRetrieved.incr();
+  }
+
 }

+ 29 - 6
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/main/java/org/apache/hadoop/yarn/server/router/clientrm/FederationClientInterceptor.java

@@ -628,18 +628,29 @@ public class FederationClientInterceptor
   public GetApplicationsResponse getApplications(GetApplicationsRequest request)
       throws YarnException, IOException {
     if (request == null) {
+      routerMetrics.incrMultipleAppsFailedRetrieved();
       RouterServerUtil.logAndThrowException(
           "Missing getApplications request.",
           null);
     }
+    long startTime = clock.getTime();
     Map<SubClusterId, SubClusterInfo> subclusters =
         federationFacade.getSubClusters(true);
     ClientMethod remoteMethod = new ClientMethod("getApplications",
         new Class[] {GetApplicationsRequest.class}, new Object[] {request});
-    Map<SubClusterId, GetApplicationsResponse> applications =
-        invokeConcurrent(subclusters.keySet(), remoteMethod,
-            GetApplicationsResponse.class);
+    Map<SubClusterId, GetApplicationsResponse> applications;
+
+    try {
+      applications = invokeConcurrent(subclusters.keySet(), remoteMethod,
+          GetApplicationsResponse.class);
 
+    } catch (Exception ex) {
+      routerMetrics.incrMultipleAppsFailedRetrieved();
+      LOG.error("Unable to get applications due to exception.", ex);
+      throw ex;
+    }
+    long stopTime = clock.getTime();
+    routerMetrics.succeededMultipleAppsRetrieved(stopTime - startTime);
     // Merge the Application Reports
     return RouterYarnClientUtils.mergeApplications(applications.values(),
         returnPartialReport);
@@ -648,14 +659,26 @@ public class FederationClientInterceptor
   @Override
   public GetClusterMetricsResponse getClusterMetrics(
       GetClusterMetricsRequest request) throws YarnException, IOException {
+    long startTime = clock.getTime();
     Map<SubClusterId, SubClusterInfo> subclusters =
         federationFacade.getSubClusters(true);
     ClientMethod remoteMethod = new ClientMethod("getClusterMetrics",
         new Class[] {GetClusterMetricsRequest.class}, new Object[] {request});
     ArrayList<SubClusterId> clusterList = new ArrayList<>(subclusters.keySet());
-    Map<SubClusterId, GetClusterMetricsResponse> clusterMetrics =
-        invokeConcurrent(clusterList, remoteMethod,
-            GetClusterMetricsResponse.class);
+    Map<SubClusterId, GetClusterMetricsResponse> clusterMetrics;
+
+    try {
+      clusterMetrics = invokeConcurrent(clusterList, remoteMethod,
+          GetClusterMetricsResponse.class);
+
+    } catch (Exception ex) {
+      routerMetrics.incrGetClusterMetricsFailedRetrieved();
+      LOG.error("Unable to get cluster metrics due to exception.", ex);
+      throw ex;
+    }
+
+    long stopTime = clock.getTime();
+    routerMetrics.succeededGetClusterMetricsRetrieved(stopTime - startTime);
     return RouterYarnClientUtils.merge(clusterMetrics.values());
   }
 

+ 42 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-router/src/test/java/org/apache/hadoop/yarn/server/router/TestRouterMetrics.java

@@ -279,6 +279,37 @@ public class TestRouterMetrics {
         metrics.getMultipleAppsFailedRetrieved());
   }
 
+  /**
+   * This test validates the correctness of the metric: Retrieved getClusterMetrics
+   * multiple times successfully.
+   */
+  @Test
+  public void testSucceededGetClusterMetrics() {
+    long totalGoodBefore = metrics.getNumSucceededGetClusterMetricsRetrieved();
+    goodSubCluster.getClusterMetrics(100);
+    Assert.assertEquals(totalGoodBefore + 1,
+        metrics.getNumSucceededGetClusterMetricsRetrieved());
+    Assert.assertEquals(100, metrics.getLatencySucceededGetClusterMetricsRetrieved(),
+        0);
+    goodSubCluster.getClusterMetrics(200);
+    Assert.assertEquals(totalGoodBefore + 2,
+        metrics.getNumSucceededGetClusterMetricsRetrieved());
+    Assert.assertEquals(150, metrics.getLatencySucceededGetClusterMetricsRetrieved(),
+        0);
+  }
+
+  /**
+   * This test validates the correctness of the metric: Failed to
+   * retrieve getClusterMetrics.
+   */
+  @Test
+  public void testGetClusterMetricsFailed() {
+    long totalBadbefore = metrics.getClusterMetricsFailedRetrieved();
+    badSubCluster.getClusterMetrics();
+    Assert.assertEquals(totalBadbefore + 1,
+        metrics.getClusterMetricsFailedRetrieved());
+  }
+
   // Records failures for all calls
   private class MockBadSubCluster {
     public void getNewApplication() {
@@ -310,6 +341,11 @@ public class TestRouterMetrics {
       LOG.info("Mocked: failed getApplicationsReport call");
       metrics.incrMultipleAppsFailedRetrieved();
     }
+
+    public void getClusterMetrics() {
+      LOG.info("Mocked: failed getClusterMetrics call");
+      metrics.incrGetClusterMetricsFailedRetrieved();
+    }
   }
 
   // Records successes for all calls
@@ -350,5 +386,11 @@ public class TestRouterMetrics {
           duration);
       metrics.succeededMultipleAppsRetrieved(duration);
     }
+
+    public void getClusterMetrics(long duration){
+      LOG.info("Mocked: successful getClusterMetrics call with duration {}",
+              duration);
+      metrics.succeededGetClusterMetricsRetrieved(duration);
+    }
   }
 }