Selaa lähdekoodia

YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. Contributed by Rohith Sharmaks
(cherry picked from commit bdd90110e6904b59746812d9a093924a65e72280)

Jian He 10 vuotta sitten
vanhempi
commit
dc4698bb33

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -103,6 +103,9 @@ Release 2.8.0 - UNRELEASED
     YARN-3451. Display attempt start time and elapsed time on the web UI.
     (Rohith Sharmaks via jianhe)
 
+    YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. (Rohith
+    Sharmaks via jianhe)
+
   OPTIMIZATIONS
 
     YARN-3339. TestDockerContainerExecutor should pull a single image and not

+ 8 - 7
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java

@@ -83,16 +83,17 @@ public class QueueMetrics implements MetricsSource {
   static final MetricsInfo RECORD_INFO = info("QueueMetrics",
       "Metrics for the resource scheduler");
   protected static final MetricsInfo QUEUE_INFO = info("Queue", "Metrics by queue");
-  static final MetricsInfo USER_INFO = info("User", "Metrics by user");
+  protected static final MetricsInfo USER_INFO =
+      info("User", "Metrics by user");
   static final Splitter Q_SPLITTER =
       Splitter.on('.').omitEmptyStrings().trimResults();
 
-  final MetricsRegistry registry;
-  final String queueName;
-  final QueueMetrics parent;
-  final MetricsSystem metricsSystem;
-  private final Map<String, QueueMetrics> users;
-  private final Configuration conf;
+  protected final MetricsRegistry registry;
+  protected final String queueName;
+  protected final QueueMetrics parent;
+  protected final MetricsSystem metricsSystem;
+  protected final Map<String, QueueMetrics> users;
+  protected final Configuration conf;
 
   protected QueueMetrics(MetricsSystem ms, String queueName, Queue parent, 
 	       boolean enableUserMetrics, Configuration conf) {

+ 6 - 7
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/AbstractCSQueue.java

@@ -43,7 +43,6 @@ import org.apache.hadoop.yarn.security.PrivilegedEntity.EntityType;
 import org.apache.hadoop.yarn.security.YarnAuthorizationProvider;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
-import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceUsage;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
@@ -67,7 +66,7 @@ public abstract class AbstractCSQueue implements CSQueue {
   final Resource minimumAllocation;
   Resource maximumAllocation;
   QueueState state;
-  final QueueMetrics metrics;
+  final CSQueueMetrics metrics;
   protected final PrivilegedEntity queueEntity;
 
   final ResourceCalculator resourceCalculator;
@@ -100,10 +99,10 @@ public abstract class AbstractCSQueue implements CSQueue {
     this.resourceCalculator = cs.getResourceCalculator();
     
     // must be called after parent and queueName is set
-    this.metrics = old != null ? old.getMetrics() :
-        QueueMetrics.forQueue(getQueuePath(), parent,
-            cs.getConfiguration().getEnableUserMetrics(),
-            cs.getConf());
+    this.metrics =
+        old != null ? (CSQueueMetrics) old.getMetrics() : CSQueueMetrics
+            .forQueue(getQueuePath(), parent, cs.getConfiguration()
+                .getEnableUserMetrics(), cs.getConf());
 
     this.csContext = cs;
     this.minimumAllocation = csContext.getMinimumResourceCapability();
@@ -171,7 +170,7 @@ public abstract class AbstractCSQueue implements CSQueue {
   }
   
   @Override
-  public QueueMetrics getMetrics() {
+  public CSQueueMetrics getMetrics() {
     return metrics;
   }
   

+ 133 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/CSQueueMetrics.java

@@ -0,0 +1,133 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.annotation.Metrics;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.Queue;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
+
+@Metrics(context = "yarn")
+public class CSQueueMetrics extends QueueMetrics {
+
+  @Metric("AM memory limit in MB")
+  MutableGaugeInt AMResourceLimitMB;
+  @Metric("AM CPU limit in virtual cores")
+  MutableGaugeInt AMResourceLimitVCores;
+  @Metric("Used AM memory limit in MB")
+  MutableGaugeInt usedAMResourceMB;
+  @Metric("Used AM CPU limit in virtual cores")
+  MutableGaugeInt usedAMResourceVCores;
+
+  CSQueueMetrics(MetricsSystem ms, String queueName, Queue parent,
+      boolean enableUserMetrics, Configuration conf) {
+    super(ms, queueName, parent, enableUserMetrics, conf);
+  }
+
+  public int getAMResourceLimitMB() {
+    return AMResourceLimitMB.value();
+  }
+
+  public int getAMResourceLimitVCores() {
+    return AMResourceLimitVCores.value();
+  }
+
+  public int getUsedAMResourceMB() {
+    return usedAMResourceMB.value();
+  }
+
+  public int getUsedAMResourceVCores() {
+    return usedAMResourceVCores.value();
+  }
+
+  public void setAMResouceLimit(Resource res) {
+    AMResourceLimitMB.set(res.getMemory());
+    AMResourceLimitVCores.set(res.getVirtualCores());
+  }
+
+  public void setAMResouceLimitForUser(String user, Resource res) {
+    CSQueueMetrics userMetrics = (CSQueueMetrics) getUserMetrics(user);
+    if (userMetrics != null) {
+      userMetrics.setAMResouceLimit(res);
+    }
+  }
+
+  public void incAMUsed(String user, Resource res) {
+    usedAMResourceMB.incr(res.getMemory());
+    usedAMResourceVCores.incr(res.getVirtualCores());
+    CSQueueMetrics userMetrics = (CSQueueMetrics) getUserMetrics(user);
+    if (userMetrics != null) {
+      userMetrics.incAMUsed(user, res);
+    }
+  }
+
+  public void decAMUsed(String user, Resource res) {
+    usedAMResourceMB.decr(res.getMemory());
+    usedAMResourceVCores.decr(res.getVirtualCores());
+    CSQueueMetrics userMetrics = (CSQueueMetrics) getUserMetrics(user);
+    if (userMetrics != null) {
+      userMetrics.decAMUsed(user, res);
+    }
+  }
+
+  public synchronized static CSQueueMetrics forQueue(String queueName,
+      Queue parent, boolean enableUserMetrics, Configuration conf) {
+    MetricsSystem ms = DefaultMetricsSystem.instance();
+    QueueMetrics metrics = queueMetrics.get(queueName);
+    if (metrics == null) {
+      metrics =
+          new CSQueueMetrics(ms, queueName, parent, enableUserMetrics, conf)
+              .tag(QUEUE_INFO, queueName);
+
+      // Register with the MetricsSystems
+      if (ms != null) {
+        metrics =
+            ms.register(sourceName(queueName).toString(), "Metrics for queue: "
+                + queueName, metrics);
+      }
+      queueMetrics.put(queueName, metrics);
+    }
+
+    return (CSQueueMetrics) metrics;
+  }
+
+  @Override
+  public synchronized QueueMetrics getUserMetrics(String userName) {
+    if (users == null) {
+      return null;
+    }
+    CSQueueMetrics metrics = (CSQueueMetrics) users.get(userName);
+    if (metrics == null) {
+      metrics = new CSQueueMetrics(metricsSystem, queueName, null, false, conf);
+      users.put(userName, metrics);
+      metricsSystem.register(
+          sourceName(queueName).append(",user=").append(userName).toString(),
+          "Metrics for user '" + userName + "' in queue '" + queueName + "'",
+          ((CSQueueMetrics) metrics.tag(QUEUE_INFO, queueName)).tag(USER_INFO,
+              userName));
+    }
+    return metrics;
+  }
+
+}

+ 9 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/LeafQueue.java

@@ -563,10 +563,12 @@ public class LeafQueue extends AbstractCSQueue {
      }
      Resource queueCap = Resources.max(resourceCalculator, lastClusterResource,
        absoluteCapacityResource, queueCurrentLimit);
-     return Resources.multiplyAndNormalizeUp( 
-          resourceCalculator,
-          queueCap, 
-          maxAMResourcePerQueuePercent, minimumAllocation);
+    Resource amResouceLimit =
+        Resources.multiplyAndNormalizeUp(resourceCalculator, queueCap,
+            maxAMResourcePerQueuePercent, minimumAllocation);
+
+    metrics.setAMResouceLimit(amResouceLimit);
+    return amResouceLimit;
   }
   
   public synchronized Resource getUserAMResourceLimit() {
@@ -645,6 +647,8 @@ public class LeafQueue extends AbstractCSQueue {
       orderingPolicy.addSchedulableEntity(application);
       queueUsage.incAMUsed(application.getAMResource());
       user.getResourceUsage().incAMUsed(application.getAMResource());
+      metrics.incAMUsed(application.getUser(), application.getAMResource());
+      metrics.setAMResouceLimitForUser(application.getUser(), userAMLimit);
       i.remove();
       LOG.info("Application " + application.getApplicationId() +
           " from user: " + application.getUser() + 
@@ -698,6 +702,7 @@ public class LeafQueue extends AbstractCSQueue {
     } else {
       queueUsage.decAMUsed(application.getAMResource());
       user.getResourceUsage().decAMUsed(application.getAMResource());
+      metrics.decAMUsed(application.getUser(), application.getAMResource());
     }
     applicationAttemptMap.remove(application.getApplicationAttemptId());
 

+ 9 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestApplicationLimits.java

@@ -278,10 +278,18 @@ public class TestApplicationLimits {
     		" UserAMResourceLimit=" + 
     		queue.getUserAMResourceLimit());
     
-    assertEquals(queue.getAMResourceLimit(), Resource.newInstance(160*GB, 1));
+    Resource amResourceLimit = Resource.newInstance(160 * GB, 1);
+    assertEquals(queue.getAMResourceLimit(), amResourceLimit);
+    assertEquals(queue.getAMResourceLimit(), amResourceLimit);
     assertEquals(queue.getUserAMResourceLimit(), 
       Resource.newInstance(80*GB, 1));
     
+    // Assert in metrics
+    assertEquals(queue.getMetrics().getAMResourceLimitMB(),
+        amResourceLimit.getMemory());
+    assertEquals(queue.getMetrics().getAMResourceLimitVCores(),
+        amResourceLimit.getVirtualCores());
+
     assertEquals(
         (int)(clusterResource.getMemory() * queue.getAbsoluteCapacity()),
         queue.getMetrics().getAvailableMB()

+ 6 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestLeafQueue.java

@@ -432,10 +432,16 @@ public class TestLeafQueue {
         .getMockApplicationAttemptId(0, 2);
     FiCaSchedulerApp app_1 = new FiCaSchedulerApp(appAttemptId_1, user_0, a, null,
         spyRMContext);
+    app_1.setAMResource(Resource.newInstance(100, 1));
     a.submitApplicationAttempt(app_1, user_0); // same user
 
     assertEquals(1, a.getMetrics().getAppsSubmitted());
     assertEquals(1, a.getMetrics().getAppsPending());
+    assertEquals(1, a.getUser(user_0).getActiveApplications());
+    assertEquals(app_1.getAMResource().getMemory(), a.getMetrics()
+        .getUsedAMResourceMB());
+    assertEquals(app_1.getAMResource().getVirtualCores(), a.getMetrics()
+        .getUsedAMResourceVCores());
     
     event = new AppAttemptRemovedSchedulerEvent(appAttemptId_0,
         RMAppAttemptState.FINISHED, false);