Browse Source

YARN-3363. add localization and container launch time to ContainerMetrics at NM to show these timing information for each active container. (zxu via rkanter)

(cherry picked from commit ac7d152901e29b1f444507fe4e421eb6e1402b5a)
Robert Kanter 10 years ago
parent
commit
32b3b8e2ef

+ 4 - 0
hadoop-yarn-project/CHANGES.txt

@@ -129,6 +129,10 @@ Release 2.8.0 - UNRELEASED
     YARN-3406. Display count of running containers in the RM's Web UI.
     YARN-3406. Display count of running containers in the RM's Web UI.
     (Ryu Kobayashi via ozawa)
     (Ryu Kobayashi via ozawa)
 
 
+    YARN-3363. add localization and container launch time to ContainerMetrics
+    at NM to show these timing information for each active container.
+    (zxu via rkanter)
+
   OPTIMIZATIONS
   OPTIMIZATIONS
 
 
     YARN-3339. TestDockerContainerExecutor should pull a single image and not
     YARN-3339. TestDockerContainerExecutor should pull a single image and not

+ 17 - 12
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java

@@ -96,6 +96,7 @@ public class ContainerImpl implements Container {
   private int exitCode = ContainerExitStatus.INVALID;
   private int exitCode = ContainerExitStatus.INVALID;
   private final StringBuilder diagnostics;
   private final StringBuilder diagnostics;
   private boolean wasLaunched;
   private boolean wasLaunched;
+  private long containerLocalizationStartTime;
   private long containerLaunchStartTime;
   private long containerLaunchStartTime;
   private static Clock clock = new SystemClock();
   private static Clock clock = new SystemClock();
 
 
@@ -493,16 +494,21 @@ public class ContainerImpl implements Container {
   // resource usage.
   // resource usage.
   @SuppressWarnings("unchecked") // dispatcher not typed
   @SuppressWarnings("unchecked") // dispatcher not typed
   private void sendContainerMonitorStartEvent() {
   private void sendContainerMonitorStartEvent() {
-      long pmemBytes = getResource().getMemory() * 1024 * 1024L;
-      float pmemRatio = daemonConf.getFloat(
-          YarnConfiguration.NM_VMEM_PMEM_RATIO,
-          YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO);
-      long vmemBytes = (long) (pmemRatio * pmemBytes);
-      int cpuVcores = getResource().getVirtualCores();
-
-      dispatcher.getEventHandler().handle(
-          new ContainerStartMonitoringEvent(containerId,
-              vmemBytes, pmemBytes, cpuVcores));
+    long launchDuration = clock.getTime() - containerLaunchStartTime;
+    metrics.addContainerLaunchDuration(launchDuration);
+
+    long pmemBytes = getResource().getMemory() * 1024 * 1024L;
+    float pmemRatio = daemonConf.getFloat(
+        YarnConfiguration.NM_VMEM_PMEM_RATIO,
+        YarnConfiguration.DEFAULT_NM_VMEM_PMEM_RATIO);
+    long vmemBytes = (long) (pmemRatio * pmemBytes);
+    int cpuVcores = getResource().getVirtualCores();
+    long localizationDuration = containerLaunchStartTime -
+        containerLocalizationStartTime;
+    dispatcher.getEventHandler().handle(
+        new ContainerStartMonitoringEvent(containerId,
+        vmemBytes, pmemBytes, cpuVcores, launchDuration,
+        localizationDuration));
   }
   }
 
 
   private void addDiagnostics(String... diags) {
   private void addDiagnostics(String... diags) {
@@ -601,6 +607,7 @@ public class ContainerImpl implements Container {
         }
         }
       }
       }
 
 
+      container.containerLocalizationStartTime = clock.getTime();
       // Send requests for public, private resources
       // Send requests for public, private resources
       Map<String,LocalResource> cntrRsrc = ctxt.getLocalResources();
       Map<String,LocalResource> cntrRsrc = ctxt.getLocalResources();
       if (!cntrRsrc.isEmpty()) {
       if (!cntrRsrc.isEmpty()) {
@@ -756,8 +763,6 @@ public class ContainerImpl implements Container {
       container.sendContainerMonitorStartEvent();
       container.sendContainerMonitorStartEvent();
       container.metrics.runningContainer();
       container.metrics.runningContainer();
       container.wasLaunched  = true;
       container.wasLaunched  = true;
-      long duration = clock.getTime() - container.containerLaunchStartTime;
-      container.metrics.addContainerLaunchDuration(duration);
 
 
       if (container.recoveredAsKilled) {
       if (container.recoveredAsKilled) {
         LOG.info("Killing " + container.containerId
         LOG.info("Killing " + container.containerId

+ 20 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerMetrics.java

@@ -28,6 +28,7 @@ import org.apache.hadoop.metrics2.annotation.Metrics;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.metrics2.lib.MetricsRegistry;
 import org.apache.hadoop.metrics2.lib.MetricsRegistry;
 import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
 import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
+import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
 import org.apache.hadoop.metrics2.lib.MutableStat;
 import org.apache.hadoop.metrics2.lib.MutableStat;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 
 
@@ -46,6 +47,9 @@ public class ContainerMetrics implements MetricsSource {
   public static final String VMEM_LIMIT_METRIC_NAME = "vMemLimitMBs";
   public static final String VMEM_LIMIT_METRIC_NAME = "vMemLimitMBs";
   public static final String VCORE_LIMIT_METRIC_NAME = "vCoreLimit";
   public static final String VCORE_LIMIT_METRIC_NAME = "vCoreLimit";
   public static final String PMEM_USAGE_METRIC_NAME = "pMemUsageMBs";
   public static final String PMEM_USAGE_METRIC_NAME = "pMemUsageMBs";
+  public static final String LAUNCH_DURATION_METRIC_NAME = "launchDurationMs";
+  public static final String LOCALIZATION_DURATION_METRIC_NAME =
+      "localizationDurationMs";
   private static final String PHY_CPU_USAGE_METRIC_NAME = "pCpuUsagePercent";
   private static final String PHY_CPU_USAGE_METRIC_NAME = "pCpuUsagePercent";
 
 
   // Use a multiplier of 1000 to avoid losing too much precision when
   // Use a multiplier of 1000 to avoid losing too much precision when
@@ -74,6 +78,12 @@ public class ContainerMetrics implements MetricsSource {
   @Metric
   @Metric
   public MutableGaugeInt cpuVcoreLimit;
   public MutableGaugeInt cpuVcoreLimit;
 
 
+  @Metric
+  public MutableGaugeLong launchDurationMs;
+
+  @Metric
+  public MutableGaugeLong localizationDurationMs;
+
   static final MetricsInfo RECORD_INFO =
   static final MetricsInfo RECORD_INFO =
       info("ContainerResource", "Resource limit and usage by container");
       info("ContainerResource", "Resource limit and usage by container");
 
 
@@ -122,6 +132,10 @@ public class ContainerMetrics implements MetricsSource {
         VMEM_LIMIT_METRIC_NAME, "Virtual memory limit in MBs", 0);
         VMEM_LIMIT_METRIC_NAME, "Virtual memory limit in MBs", 0);
     this.cpuVcoreLimit = registry.newGauge(
     this.cpuVcoreLimit = registry.newGauge(
         VCORE_LIMIT_METRIC_NAME, "CPU limit in number of vcores", 0);
         VCORE_LIMIT_METRIC_NAME, "CPU limit in number of vcores", 0);
+    this.launchDurationMs = registry.newGauge(
+        LAUNCH_DURATION_METRIC_NAME, "Launch duration in MS", 0L);
+    this.localizationDurationMs = registry.newGauge(
+        LOCALIZATION_DURATION_METRIC_NAME, "Localization duration in MS", 0L);
   }
   }
 
 
   ContainerMetrics tag(MetricsInfo info, ContainerId containerId) {
   ContainerMetrics tag(MetricsInfo info, ContainerId containerId) {
@@ -213,6 +227,12 @@ public class ContainerMetrics implements MetricsSource {
     this.cpuVcoreLimit.set(cpuVcores);
     this.cpuVcoreLimit.set(cpuVcores);
   }
   }
 
 
+  public void recordStateChangeDurations(long launchDuration,
+      long localizationDuration) {
+    this.launchDurationMs.set(launchDuration);
+    this.localizationDurationMs.set(localizationDuration);
+  }
+
   private synchronized void scheduleTimerTaskIfRequired() {
   private synchronized void scheduleTimerTaskIfRequired() {
     if (flushPeriodMs > 0) {
     if (flushPeriodMs > 0) {
       // Lazily initialize timer
       // Lazily initialize timer

+ 14 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainerStartMonitoringEvent.java

@@ -25,13 +25,18 @@ public class ContainerStartMonitoringEvent extends ContainersMonitorEvent {
   private final long vmemLimit;
   private final long vmemLimit;
   private final long pmemLimit;
   private final long pmemLimit;
   private final int cpuVcores;
   private final int cpuVcores;
+  private final long launchDuration;
+  private final long localizationDuration;
 
 
   public ContainerStartMonitoringEvent(ContainerId containerId,
   public ContainerStartMonitoringEvent(ContainerId containerId,
-      long vmemLimit, long pmemLimit, int cpuVcores) {
+      long vmemLimit, long pmemLimit, int cpuVcores, long launchDuration,
+      long localizationDuration) {
     super(containerId, ContainersMonitorEventType.START_MONITORING_CONTAINER);
     super(containerId, ContainersMonitorEventType.START_MONITORING_CONTAINER);
     this.vmemLimit = vmemLimit;
     this.vmemLimit = vmemLimit;
     this.pmemLimit = pmemLimit;
     this.pmemLimit = pmemLimit;
     this.cpuVcores = cpuVcores;
     this.cpuVcores = cpuVcores;
+    this.launchDuration = launchDuration;
+    this.localizationDuration = localizationDuration;
   }
   }
 
 
   public long getVmemLimit() {
   public long getVmemLimit() {
@@ -45,4 +50,12 @@ public class ContainerStartMonitoringEvent extends ContainersMonitorEvent {
   public int getCpuVcores() {
   public int getCpuVcores() {
     return this.cpuVcores;
     return this.cpuVcores;
   }
   }
+
+  public long getLaunchDuration() {
+    return this.launchDuration;
+  }
+
+  public long getLocalizationDuration() {
+    return this.localizationDuration;
+  }
 }
 }

+ 9 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java

@@ -617,6 +617,15 @@ public class ContainersMonitorImpl extends AbstractService implements
     case START_MONITORING_CONTAINER:
     case START_MONITORING_CONTAINER:
       ContainerStartMonitoringEvent startEvent =
       ContainerStartMonitoringEvent startEvent =
           (ContainerStartMonitoringEvent) monitoringEvent;
           (ContainerStartMonitoringEvent) monitoringEvent;
+
+      if (containerMetricsEnabled) {
+        ContainerMetrics usageMetrics = ContainerMetrics
+            .forContainer(containerId, containerMetricsPeriodMs);
+        usageMetrics.recordStateChangeDurations(
+            startEvent.getLaunchDuration(),
+            startEvent.getLocalizationDuration());
+      }
+
       synchronized (this.containersToBeAdded) {
       synchronized (this.containersToBeAdded) {
         ProcessTreeInfo processTreeInfo =
         ProcessTreeInfo processTreeInfo =
             new ProcessTreeInfo(containerId, null, null,
             new ProcessTreeInfo(containerId, null, null,

+ 10 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainerMetrics.java

@@ -87,10 +87,14 @@ public class TestContainerMetrics {
     int anyPmemLimit = 1024;
     int anyPmemLimit = 1024;
     int anyVmemLimit = 2048;
     int anyVmemLimit = 2048;
     int anyVcores = 10;
     int anyVcores = 10;
+    long anyLaunchDuration = 20L;
+    long anyLocalizationDuration = 1000L;
     String anyProcessId = "1234";
     String anyProcessId = "1234";
 
 
     metrics.recordResourceLimit(anyVmemLimit, anyPmemLimit, anyVcores);
     metrics.recordResourceLimit(anyVmemLimit, anyPmemLimit, anyVcores);
     metrics.recordProcessId(anyProcessId);
     metrics.recordProcessId(anyProcessId);
+    metrics.recordStateChangeDurations(anyLaunchDuration,
+        anyLocalizationDuration);
 
 
     Thread.sleep(110);
     Thread.sleep(110);
     metrics.getMetrics(collector, true);
     metrics.getMetrics(collector, true);
@@ -105,6 +109,12 @@ public class TestContainerMetrics {
     MetricsRecords.assertMetric(record, ContainerMetrics.VMEM_LIMIT_METRIC_NAME, anyVmemLimit);
     MetricsRecords.assertMetric(record, ContainerMetrics.VMEM_LIMIT_METRIC_NAME, anyVmemLimit);
     MetricsRecords.assertMetric(record, ContainerMetrics.VCORE_LIMIT_METRIC_NAME, anyVcores);
     MetricsRecords.assertMetric(record, ContainerMetrics.VCORE_LIMIT_METRIC_NAME, anyVcores);
 
 
+    MetricsRecords.assertMetric(record,
+        ContainerMetrics.LAUNCH_DURATION_METRIC_NAME, anyLaunchDuration);
+    MetricsRecords.assertMetric(record,
+        ContainerMetrics.LOCALIZATION_DURATION_METRIC_NAME,
+        anyLocalizationDuration);
+
     collector.clear();
     collector.clear();
   }
   }
 }
 }