Ver código fonte

YARN-10628. Add node usage metrics in SLS. Contributed by Vadaga Ananyo Rao

Szilard Nemeth 3 anos atrás
pai
commit
54f9fff218

+ 21 - 0
hadoop-tools/hadoop-sls/src/main/html/showSimulationTrace.html

@@ -100,6 +100,10 @@
     <div class="divborder span8 " style="margin-left:50px" id="area7"></div>
     <div class="span7 chart-area" id="area8"></div>
   </div>
+  <div class="row">
+    <div class="divborder span8 chart-area" style="margin-left:50px" id="area9"></div>
+    <div class="divborder span8 chart-area" id="area10"></div>
+  </div>
 </div>
 <p>&nbsp;</p>
 <script>
@@ -162,6 +166,23 @@ function drawCharts(filepath) {
         "scheduler.handle-APP_REMOVED.timecost", "scheduler.handle-CONTAINER_EXPIRED.timecost"
       ];
       drawEachChart("#area7", data, legends, "Scheduler allocate & handle operations timecost", "Timecost (ms)", 0, 210);
+
+      // Node usage stats
+      legends = [
+        "nodes.memory.unused", "nodes.memory.1to19pctUsed",
+        "nodes.memory.20to39pctUsed", "nodes.memory.40to59pctUsed",
+        "nodes.memory.60to79pctUsed","nodes.memory.80to99pctUsed",
+        "nodes.memory.full"
+      ];
+      drawEachChart("#area9", data, legends, "Cluster nodes memory usage", "Node count", 1, 0);
+
+      legends = [
+        "nodes.vcores.unused", "nodes.vcores.1to19pctUsed",
+        "nodes.vcores.20to39pctUsed", "nodes.vcores.40to59pctUsed",
+        "nodes.vcores.60to79pctUsed", "nodes.vcores.80to99pctUsed",
+        "nodes.vcores.full"
+      ];
+      drawEachChart("#area10", data, legends, "Cluster nodes vcores usage", "Node count", 1, 0);
     });
   }).done(function() {
     $("#data").css("display", "block");

+ 25 - 3
hadoop-tools/hadoop-sls/src/main/html/simulate.html.template

@@ -49,6 +49,10 @@
     <div class="row">
       <div class="divborder span8" style="margin-left:50px" id="area7"></div>
       <div class="divborder span8" style="margin-left:50px" id="area8"></div>
+    </div>
+    <div class="row">
+      <div class="divborder span8" style="margin-left:50px" id="area9"></div>
+      <div class="divborder span8" style="margin-left:50px" id="area10"></div>
     </div><br/><br/>
 
     <script>
@@ -87,6 +91,20 @@
                     ''scheduler.commit.failure.timecost''];
       legends[7] = [''scheduler.commit.success.throughput'',
                     ''scheduler.commit.failure.throughput''];
+      legends[8] = [''nodes.memory.unused'',
+                    ''nodes.memory.1to19pctUsed'',
+                    ''nodes.memory.20to39pctUsed'',
+                    ''nodes.memory.40to59pctUsed'',
+                    ''nodes.memory.60to79pctUsed'',
+                    ''nodes.memory.80to99pctUsed'',
+                    ''nodes.memory.full''];
+      legends[9] = [''nodes.vcores.unused'',
+                    ''nodes.vcores.1to19pctUsed'',
+                    ''nodes.vcores.20to39pctUsed'',
+                    ''nodes.vcores.40to59pctUsed'',
+                    ''nodes.vcores.60to79pctUsed'',
+                    ''nodes.vcores.80to99pctUsed'',
+                    ''nodes.vcores.full''];
 
       // title
       titles[0] = ''Cluster running applications & containers'';
@@ -97,6 +115,8 @@
       titles[5] = ''Queue allocated vcores'';
       titles[6] = ''Scheduler allocate & handle & commit operation timecost'';
       titles[7] = ''Scheduler commit success/failure operation throughput'';
+      titles[8] = ''Cluster nodes memory usage'';
+      titles[9] = ''Cluster nodes vcores usage'';
 
       // ylabels
       yLabels[0] = ''Number'';
@@ -107,12 +127,14 @@
       yLabels[5] = ''Number'';
       yLabels[6] = ''Timecost (ms)'';
       yLabels[7] = ''Number'';
+      yLabels[8] = ''Number'';
+      yLabels[9] = ''Number'';
 
       // is area?
-      isAreas = [0, 0, 0, 0, 1, 1, 0, 0];
+      isAreas = [0, 0, 0, 0, 1, 1, 0, 0, 1, 1];
 
       // draw all charts
-      for (var i = 0; i < 8; i ++) '{'
+      for (var i = 0; i < 10; i ++) '{'
         drawEachChart(i);
       '}'
 
@@ -180,7 +202,7 @@
           data.push(point);
 
           // clear old
-          for (var i = 0; i < 8; i ++) '{'
+          for (var i = 0; i < 10; i ++) '{'
             svgs[i].selectAll(''g.tick'').remove();
             svgs[i].selectAll(''g'').remove();
             var color = d3.scale.category10();

+ 47 - 0
hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/scheduler/SchedulerMetrics.java

@@ -54,11 +54,13 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnSched
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplication;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.fifo.FifoScheduler;
 import org.apache.hadoop.yarn.sls.conf.SLSConfiguration;
+import org.apache.hadoop.yarn.sls.utils.NodeUsageRanges;
 import org.apache.hadoop.yarn.sls.web.SLSWebApp;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -157,6 +159,8 @@ public abstract class SchedulerMetrics {
     registerClusterResourceMetrics();
     registerContainerAppNumMetrics();
     registerSchedulerMetrics();
+    registerNodesUsageMetrics("memory");
+    registerNodesUsageMetrics("vcores");
 
     // .csv output
     initMetricsCSVOutput();
@@ -463,6 +467,49 @@ public abstract class SchedulerMetrics {
     }
   }
 
+  /**
+   * Registers one gauge per usage bucket returned by
+   * {@link NodeUsageRanges#getRanges()} under the name
+   * {@code nodes.<resourceType>.<bucket keyword>}. Each gauge reports how
+   * many scheduler nodes currently fall into its bucket.
+   *
+   * Registration happens while holding {@code samplerLock}. A gauge returns
+   * 0 when the scheduler is not an {@link AbstractYarnScheduler}.
+   *
+   * @param resourceType "memory" or "vcores"; for any other value no node
+   *                     is ever counted
+   */
+  private void registerNodesUsageMetrics(String resourceType) {
+    samplerLock.lock();
+    try {
+      for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
+        String metricName = "nodes." + resourceType + "." + range.getKeyword();
+        metrics.register(metricName,
+            new Gauge<Integer>() {
+              @Override
+              public Integer getValue() {
+                if (!(scheduler instanceof AbstractYarnScheduler)) {
+                  return 0;
+                }
+                int count = 0;
+                AbstractYarnScheduler sch = (AbstractYarnScheduler) scheduler;
+                for (Object node : sch.getNodeTracker().getAllNodes()) {
+                  SchedulerNode sNode = (SchedulerNode) node;
+                  long allocated;
+                  long total;
+                  if ("memory".equals(resourceType)) {
+                    allocated = sNode.getAllocatedResource().getMemorySize();
+                    total = sNode.getTotalResource().getMemorySize();
+                  } else if ("vcores".equals(resourceType)) {
+                    allocated =
+                        sNode.getAllocatedResource().getVirtualCores();
+                    total =
+                        sNode.getTotalResource().getVirtualCores();
+                  } else {
+                    // Unknown resource type: nothing to measure.
+                    continue;
+                  }
+                  // Snap to a whole percentage so that fractional usage
+                  // (e.g. 0.5% or 19.5%) cannot fall between two buckets.
+                  float usedPct = toUsageBucketPct(allocated, total);
+                  if (range.getLowerLimit() <= usedPct
+                      && usedPct <= range.getUpperLimit()) {
+                    count++;
+                  }
+                }
+                return count;
+              }
+            }
+        );
+      }
+    } finally {
+      samplerLock.unlock();
+    }
+  }
+
+  /**
+   * Converts an allocated/total pair into a whole usage percentage suitable
+   * for matching against inclusive, whole-number bucket limits.
+   *
+   * Nothing allocated maps to 0 and a fully (or over-) allocated node, or a
+   * node with no capacity but some allocation, maps to 100. Anything in
+   * between is rounded down and clamped to [1, 99] so that a partially used
+   * node is never reported as unused or full.
+   *
+   * @param allocated amount of the resource currently allocated on the node
+   * @param total total amount of the resource on the node
+   * @return a whole percentage in the range [0, 100]
+   */
+  private static float toUsageBucketPct(long allocated, long total) {
+    if (allocated <= 0) {
+      return 0f;
+    }
+    if (total <= 0 || allocated >= total) {
+      return 100f;
+    }
+    long wholePct = (long) Math.floor(allocated * 100.0 / total);
+    return Math.max(1L, Math.min(99L, wholePct));
+  }
+
   private void initMetricsCSVOutput() {
     int timeIntervalMS = conf.getInt(
         SLSConfiguration.METRICS_RECORD_INTERVAL_MS,

+ 68 - 0
hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/utils/NodeUsageRanges.java

@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.sls.utils;
+
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+/**
+ * Holder for the fixed set of node-usage buckets (unused, 1-19%, 20-39%,
+ * 40-59%, 60-79%, 80-99% and full). Not instantiable.
+ */
+public final class NodeUsageRanges {
+  private NodeUsageRanges() {}
+
+  /**
+   * A named usage bucket. A usage percentage belongs to the bucket when it
+   * lies between the lower limit (inclusive) and the upper limit
+   * (inclusive). Instances are immutable.
+   */
+  public static class Range {
+    private final String keyword;
+    private final float lowerLimit;
+    private final float upperLimit;
+
+    /**
+     * @param keyword short name of the bucket, e.g. "1to19pctUsed"
+     * @param lowerLimit lowest percentage in the bucket (inclusive)
+     * @param upperLimit highest percentage in the bucket (inclusive)
+     */
+    public Range(String keyword, float lowerLimit, float upperLimit) {
+      this.keyword = keyword;
+      this.lowerLimit = lowerLimit;
+      this.upperLimit = upperLimit;
+    }
+
+    /** @return the short name of this bucket */
+    public String getKeyword() {
+      return keyword;
+    }
+
+    /** @return the inclusive lower limit, in percent */
+    public float getLowerLimit() {
+      return lowerLimit;
+    }
+
+    /** @return the inclusive upper limit, in percent */
+    public float getUpperLimit() {
+      return upperLimit;
+    }
+  }
+
+  // Insertion-ordered so that callers always see the buckets from
+  // "unused" up to "full"; wrapped so the shared set cannot be modified.
+  private static final Set<Range> RANGES;
+  static {
+    Set<Range> ranges = new LinkedHashSet<>();
+    ranges.add(new Range("unused", 0, 0));
+    ranges.add(new Range("1to19pctUsed", 1, 19));
+    ranges.add(new Range("20to39pctUsed", 20, 39));
+    ranges.add(new Range("40to59pctUsed", 40, 59));
+    ranges.add(new Range("60to79pctUsed", 60, 79));
+    ranges.add(new Range("80to99pctUsed", 80, 99));
+    ranges.add(new Range("full", 100, 100));
+    RANGES = java.util.Collections.unmodifiableSet(ranges);
+  }
+
+  /**
+   * @return the usage buckets in ascending order; the returned set is
+   *         read-only and throws {@link UnsupportedOperationException} on
+   *         any attempt to modify it
+   */
+  public static Set<Range> getRanges() {
+    return RANGES;
+  }
+}

+ 21 - 0
hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/utils/package-info.java

@@ -0,0 +1,21 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/**
+ * Utility classes for SLS.
+ */
+package org.apache.hadoop.yarn.sls.utils;

+ 33 - 0
hadoop-tools/hadoop-sls/src/main/java/org/apache/hadoop/yarn/sls/web/SLSWebApp.java

@@ -40,6 +40,7 @@ import org.apache.hadoop.yarn.sls.scheduler.FairSchedulerMetrics;
 import org.apache.hadoop.yarn.sls.scheduler.SchedulerMetrics;
 import org.apache.hadoop.yarn.sls.scheduler.SchedulerWrapper;
 
+import org.apache.hadoop.yarn.sls.utils.NodeUsageRanges;
 import org.eclipse.jetty.http.MimeTypes;
 import org.eclipse.jetty.server.Handler;
 import org.eclipse.jetty.server.Request;
@@ -73,6 +74,7 @@ public class SLSWebApp extends HttpServlet {
   private transient Gauge allocatedVCoresGauge;
   private transient Gauge availableMemoryGauge;
   private transient Gauge availableVCoresGauge;
+  private transient Map<String, Gauge> perNodeUsageGaugeMap;
   private transient Histogram allocateTimecostHistogram;
   private transient Histogram commitSuccessTimecostHistogram;
   private transient Histogram commitFailureTimecostHistogram;
@@ -122,6 +124,7 @@ public class SLSWebApp extends HttpServlet {
     handleOperTimecostHistogramMap = new HashMap<>();
     queueAllocatedMemoryCounterMap = new HashMap<>();
     queueAllocatedVCoresCounterMap = new HashMap<>();
+    perNodeUsageGaugeMap = new HashMap<>();
     schedulerMetrics = wrapper.getSchedulerMetrics();
     metrics = schedulerMetrics.getMetrics();
     port = metricsAddressPort;
@@ -547,10 +550,40 @@ public class SLSWebApp extends HttpServlet {
       sb.append(",\"scheduler.handle-").append(e).append(".timecost\":")
               .append(handleOperTimecostMap.get(e));
     }
+    sb.append(generateNodeUsageMetrics("memory"));
+    sb.append(generateNodeUsageMetrics("vcores"));
     sb.append("}");
     return sb.toString();
   }
 
+  /**
+   * Builds the JSON fragment holding the per-bucket node counts for one
+   * resource type. For every range from {@link NodeUsageRanges#getRanges()}
+   * it emits {@code ,"nodes.<resourceType>.<keyword>":<count>}, in the
+   * iteration order of the ranges.
+   *
+   * Gauges are looked up in the metric registry the first time they are
+   * needed and then cached in {@code perNodeUsageGaugeMap}. A metric whose
+   * gauge is not registered is reported as 0.
+   *
+   * @param resourceType resource name used in the metric key, e.g. "memory"
+   * @return the fragment, starting with a comma, ready to be appended to an
+   *         open JSON object
+   */
+  private String generateNodeUsageMetrics(String resourceType) {
+    StringBuilder sb = new StringBuilder();
+    for (NodeUsageRanges.Range range : NodeUsageRanges.getRanges()) {
+      String metricName = "nodes." + resourceType + "." + range.getKeyword();
+
+      Gauge gauge = perNodeUsageGaugeMap.get(metricName);
+      if (gauge == null) {
+        // Not cached yet: take a single look at the registry.
+        gauge = metrics.getGauges().get(metricName);
+        if (gauge != null) {
+          perNodeUsageGaugeMap.put(metricName, gauge);
+        }
+      }
+
+      int perNodeUsageCount = gauge == null
+          ? 0 : Integer.parseInt(gauge.getValue().toString());
+
+      sb.append(",\"").append(metricName).append("\":")
+          .append(perNodeUsageCount);
+    }
+    return sb.toString();
+  }
+
   /**
    * package metrics information for one tracked queue/app
    * only support FairScheduler currently