浏览代码

YARN-7625. Expose NM node/containers resource utilization in JVM metrics. Contributed by Weiwei Yang

Jason Lowe 7 年之前
父节点
当前提交
06f0eb2dce
共有 9 个文件被更改,包括 170 次插入和 7 次删除
  1. 3 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java
  2. 18 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java
  3. 14 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java
  4. 13 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java
  5. 61 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java
  6. 31 3
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeResourceMonitor.java
  7. 6 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java
  8. 5 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockResourceCalculatorPlugin.java
  9. 19 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java

+ 3 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/Context.java

@@ -35,6 +35,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Ap
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
 
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
+import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
 import org.apache.hadoop.yarn.server.scheduler.OpportunisticContainerAllocator;
 import org.apache.hadoop.yarn.server.nodemanager.security.NMContainerTokenSecretManager;
@@ -125,4 +126,6 @@ public interface Context {
   ContainerStateTransitionListener getContainerStateTransitionListener();
 
   ResourcePluginManager getResourcePluginManager();
+
+  NodeManagerMetrics getNodeManagerMetrics();
 }

+ 18 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java

@@ -205,7 +205,7 @@ public class NodeManager extends CompositeService
   }
 
   protected NodeResourceMonitor createNodeResourceMonitor() {
-    return new NodeResourceMonitorImpl();
+    return new NodeResourceMonitorImpl(context);
   }
 
   protected ContainerManagerImpl createContainerManager(Context context,
@@ -242,6 +242,7 @@ public class NodeManager extends CompositeService
     NMContext nmContext = new NMContext(containerTokenSecretManager,
         nmTokenSecretManager, dirsHandler, aclsManager, stateStore,
         isDistSchedulerEnabled, conf);
+    nmContext.setNodeManagerMetrics(metrics);
     DefaultContainerStateListener defaultListener =
         new DefaultContainerStateListener();
     nmContext.setContainerStateTransitionListener(defaultListener);
@@ -574,6 +575,8 @@ public class NodeManager extends CompositeService
 
     private Configuration conf = null;
 
+    private NodeManagerMetrics metrics = null;
+
     protected final ConcurrentMap<ApplicationId, Application> applications =
         new ConcurrentHashMap<ApplicationId, Application>();
 
@@ -823,6 +826,20 @@ public class NodeManager extends CompositeService
       return resourcePluginManager;
     }
 
+    /**
+     * Returns the {@link NodeManagerMetrics} instance of this node.
+     * This might return null if the instance was not set on the context.
+     * @return node manager metrics.
+     */
+    @Override
+    public NodeManagerMetrics getNodeManagerMetrics() {
+      return metrics;
+    }
+
+    public void setNodeManagerMetrics(NodeManagerMetrics nmMetrics) {
+      this.metrics = nmMetrics;
+    }
+
     public void setResourcePluginManager(
         ResourcePluginManager resourcePluginManager) {
       this.resourcePluginManager = resourcePluginManager;

+ 14 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeResourceMonitorImpl.java

@@ -22,6 +22,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.service.AbstractService;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.api.records.ResourceUtilization;
+import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -48,12 +49,14 @@ public class NodeResourceMonitorImpl extends AbstractService implements
   /** Current <em>resource utilization</em> of the node. */
   private ResourceUtilization nodeUtilization;
 
+  private Context nmContext;
+
   /**
    * Initialize the node resource monitor.
    */
-  public NodeResourceMonitorImpl() {
+  public NodeResourceMonitorImpl(Context context) {
     super(NodeResourceMonitorImpl.class.getName());
-
+    this.nmContext = context;
     this.monitoringThread = new MonitoringThread();
   }
 
@@ -149,6 +152,15 @@ public class NodeResourceMonitorImpl extends AbstractService implements
                 (int) (vmem >> 20), // B -> MB
                 vcores); // Used Virtual Cores
 
+        // Publish the node utilization metrics to node manager
+        // metrics system.
+        NodeManagerMetrics nmMetrics = nmContext.getNodeManagerMetrics();
+        if (nmMetrics != null) {
+          nmMetrics.setNodeUsedMemGB(nodeUtilization.getPhysicalMemory());
+          nmMetrics.setNodeUsedVMemGB(nodeUtilization.getVirtualMemory());
+          nmMetrics.setNodeCpuUtilization(nodeUtilization.getCPU());
+        }
+
         try {
           Thread.sleep(monitoringInterval);
         } catch (InterruptedException e) {

+ 13 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java

@@ -20,6 +20,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor;
 
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.classification.InterfaceAudience.Private;
@@ -488,6 +489,18 @@ public class ContainersMonitorImpl extends AbstractService implements
         // Save the aggregated utilization of the containers
         setContainersUtilization(trackedContainersUtilization);
 
+        // Publish the container utilization metrics to node manager
+        // metrics system.
+        NodeManagerMetrics nmMetrics = context.getNodeManagerMetrics();
+        if (nmMetrics != null) {
+          nmMetrics.setContainerUsedMemGB(
+              trackedContainersUtilization.getPhysicalMemory());
+          nmMetrics.setContainerUsedVMemGB(
+              trackedContainersUtilization.getVirtualMemory());
+          nmMetrics.setContainerCpuUtilization(
+              trackedContainersUtilization.getCPU());
+        }
+
         try {
           Thread.sleep(monitoringInterval);
         } catch (InterruptedException e) {

+ 61 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java

@@ -24,6 +24,7 @@ import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.metrics2.lib.MutableCounterInt;
 import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
 import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
+import org.apache.hadoop.metrics2.lib.MutableGaugeFloat;
 import org.apache.hadoop.metrics2.lib.MutableRate;
 import org.apache.hadoop.metrics2.source.JvmMetrics;
 import org.apache.hadoop.yarn.api.records.Resource;
@@ -77,6 +78,18 @@ public class NodeManagerMetrics {
   MutableGaugeLong publicBytesDeleted;
   @Metric("# of bytes deleted from the private local cache")
   MutableGaugeLong privateBytesDeleted;
+  @Metric("Current used physical memory by all containers in GB")
+  MutableGaugeInt containerUsedMemGB;
+  @Metric("Current used virtual memory by all containers in GB")
+  MutableGaugeInt containerUsedVMemGB;
+  @Metric("Aggregated CPU utilization of all containers")
+  MutableGaugeFloat containerCpuUtilization;
+  @Metric("Current used memory by this node in GB")
+  MutableGaugeInt nodeUsedMemGB;
+  @Metric("Current used virtual memory by this node in GB")
+  MutableGaugeInt nodeUsedVMemGB;
+  @Metric("Current CPU utilization")
+  MutableGaugeFloat nodeCpuUtilization;
 
   // CHECKSTYLE:ON:VisibilityModifier
 
@@ -316,4 +329,52 @@ public class NodeManagerMetrics {
   public long getPrivateBytesDeleted() {
     return this.privateBytesDeleted.value();
   }
+
+  public void setContainerUsedMemGB(long usedMem) {
+    this.containerUsedMemGB.set((int)Math.floor(usedMem/1024d));
+  }
+
+  public int getContainerUsedMemGB() {
+    return this.containerUsedMemGB.value();
+  }
+
+  public void setContainerUsedVMemGB(long usedVMem) {
+    this.containerUsedVMemGB.set((int)Math.floor(usedVMem/1024d));
+  }
+
+  public int getContainerUsedVMemGB() {
+    return this.containerUsedVMemGB.value();
+  }
+
+  public void setContainerCpuUtilization(float cpuUtilization) {
+    this.containerCpuUtilization.set(cpuUtilization);
+  }
+
+  public float getContainerCpuUtilization() {
+    return this.containerCpuUtilization.value();
+  }
+
+  public void setNodeUsedMemGB(long totalUsedMemGB) {
+    this.nodeUsedMemGB.set((int)Math.floor(totalUsedMemGB/1024d));
+  }
+
+  public int getNodeUsedMemGB() {
+    return nodeUsedMemGB.value();
+  }
+
+  public void setNodeUsedVMemGB(long totalUsedVMemGB) {
+    this.nodeUsedVMemGB.set((int)Math.floor(totalUsedVMemGB/1024d));
+  }
+
+  public int getNodeUsedVMemGB() {
+    return nodeUsedVMemGB.value();
+  }
+
+  public float getNodeCpuUtilization() {
+    return nodeCpuUtilization.value();
+  }
+
+  public void setNodeCpuUtilization(float cpuUtilization) {
+    this.nodeCpuUtilization.set(cpuUtilization);
+  }
 }

+ 31 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeResourceMonitor.java

@@ -18,18 +18,46 @@
 
 package org.apache.hadoop.yarn.server.nodemanager;
 
+import java.io.IOException;
+
 import org.apache.hadoop.fs.UnsupportedFileSystemException;
-import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager
+    .BaseContainerManagerTest;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager
+    .monitor.MockResourceCalculatorPlugin;
 
+import org.junit.Before;
 import org.junit.Test;
+import org.mockito.Mockito;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.timeout;
 
 public class TestNodeResourceMonitor extends BaseContainerManagerTest {
   public TestNodeResourceMonitor() throws UnsupportedFileSystemException {
     super();
   }
 
+  @Before
+  public void setup() throws IOException {
+    // Enable node resource monitor with a mocked resource calculator.
+    conf.set(
+        YarnConfiguration.NM_MON_RESOURCE_CALCULATOR,
+        MockResourceCalculatorPlugin.class.getCanonicalName());
+    super.setup();
+  }
+
   @Test
-  public void testNodeResourceMonitor() {
-    NodeResourceMonitor nrm = new NodeResourceMonitorImpl();
+  public void testMetricsUpdate() throws Exception {
+    // This test doesn't verify the correctness of the metrics
+    // updated by the monitor; it only verifies that the monitor
+    // does publish this info to the node manager metrics system
+    // in each monitor interval.
+    Context spyContext = spy(context);
+    NodeResourceMonitor nrm = new NodeResourceMonitorImpl(spyContext);
+    nrm.init(conf);
+    nrm.start();
+    Mockito.verify(spyContext, timeout(500).atLeastOnce())
+        .getNodeManagerMetrics();
   }
 }

+ 6 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/amrmproxy/BaseAMRMProxyTest.java

@@ -55,6 +55,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManag
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.application.Application;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.ResourcePluginManager;
+import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMMemoryStateStoreService;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService.RecoveredAMRMProxyState;
@@ -802,5 +803,10 @@ public abstract class BaseAMRMProxyTest {
     public ResourcePluginManager getResourcePluginManager() {
       return null;
     }
+
+    @Override
+    public NodeManagerMetrics getNodeManagerMetrics() {
+      return null;
+    }
   }
 }

+ 5 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockResourceCalculatorPlugin.java

@@ -70,4 +70,9 @@ public class MockResourceCalculatorPlugin extends ResourceCalculatorPlugin {
   public float getCpuUsagePercentage() {
     return 0;
   }
+
+  @Override
+  public float getNumVCoresUsed() {
+    return 0;
+  }
 }

+ 19 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitor.java

@@ -22,6 +22,8 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.timeout;
 
 import java.io.BufferedReader;
 import java.io.File;
@@ -67,7 +69,6 @@ import org.apache.hadoop.yarn.server.nodemanager.Context;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.BaseContainerManagerTest;
 import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerSignalContext;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
-import org.apache.hadoop.yarn.util.ConverterUtils;
 import org.apache.hadoop.yarn.util.LinuxResourceCalculatorPlugin;
 import org.apache.hadoop.yarn.util.ProcfsBasedProcessTree;
 import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
@@ -75,6 +76,7 @@ import org.apache.hadoop.yarn.util.TestProcfsBasedProcessTree;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
+import org.mockito.Mockito;
 import org.slf4j.LoggerFactory;
 
 public class TestContainersMonitor extends BaseContainerManagerTest {
@@ -95,6 +97,22 @@ public class TestContainersMonitor extends BaseContainerManagerTest {
     super.setup();
   }
 
+  @Test
+  public void testMetricsUpdate() throws Exception {
+    // This test doesn't verify the correctness of the metrics
+    // updated by the monitor; it only verifies that the monitor
+    // does publish this info to the node manager metrics system
+    // in each monitor interval.
+    Context spyContext = spy(context);
+    ContainersMonitorImpl cm =
+        new ContainersMonitorImpl(mock(ContainerExecutor.class),
+            mock(AsyncDispatcher.class), spyContext);
+    cm.init(getConfForCM(false, true, 1024, 2.1f));
+    cm.start();
+    Mockito.verify(spyContext, timeout(500).atLeastOnce())
+        .getNodeManagerMetrics();
+  }
+
   /**
    * Test to verify the check for whether a process tree is over limit or not.
    *