Pārlūkot izejas kodu

YARN-8813. Improve debug messages for NM preemption of OPPORTUNISTIC containers (haibochen via rkanter)

Robert Kanter 6 gadi atpakaļ
vecāks
revīzija
bb59914237

+ 8 - 5
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupElasticMemoryController.java

@@ -94,6 +94,7 @@ public class CGroupElasticMemoryController extends Thread {
     boolean controlVirtual = controlVirtualMemory && !controlPhysicalMemory;
     Runnable oomHandlerTemp =
         getDefaultOOMHandler(conf, context, oomHandlerOverride, controlVirtual);
+    LOG.info("Using OOMHandler: " + oomHandlerTemp.getClass().getName());
     if (controlPhysicalMemory && controlVirtualMemory) {
       LOG.warn(
           NM_ELASTIC_MEMORY_CONTROL_ENABLED + " is on. " +
@@ -138,11 +139,10 @@ public class CGroupElasticMemoryController extends Thread {
       Configuration conf, Context context, Runnable oomHandlerLocal,
       boolean controlVirtual)
       throws YarnException {
-    Class oomHandlerClass =
-        conf.getClass(
-            YarnConfiguration.NM_ELASTIC_MEMORY_CONTROL_OOM_HANDLER,
-            DefaultOOMHandler.class);
     if (oomHandlerLocal == null) {
+      Class oomHandlerClass = conf.getClass(
+          YarnConfiguration.NM_ELASTIC_MEMORY_CONTROL_OOM_HANDLER,
+          DefaultOOMHandler.class);
       try {
         Constructor constr = oomHandlerClass.getConstructor(
             Context.class, boolean.class);
@@ -284,12 +284,15 @@ public class CGroupElasticMemoryController extends Thread {
       // This loop can be exited by terminating the process
       // with stopListening()
       while ((read = events.read(event)) == event.length) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("OOM event notification received from oom-listener");
+        }
         // An OOM event has occurred
         resolveOOM(executor);
       }
 
       if (read != -1) {
-        LOG.warn(String.format("Characters returned from event hander: %d",
+        LOG.warn(String.format("Characters returned from event handler: %d",
             read));
       }
 

+ 3 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/DefaultOOMHandler.java

@@ -181,6 +181,9 @@ public class DefaultOOMHandler implements Runnable {
             CGroupsHandler.CGroupController.MEMORY,
             "",
             CGROUP_PARAM_MEMORY_OOM_CONTROL);
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("OOM status read from cgroups: " + status);
+        }
         if (!status.contains(CGroupsHandler.UNDER_OOM)) {
           break;
         }

+ 4 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java

@@ -1096,6 +1096,10 @@ public class ContainersMonitorImpl extends AbstractService implements
   private void setLatestContainersUtilization(ResourceUtilization utilization) {
     this.latestContainersUtilization = new ContainersResourceUtilization(
         utilization, Time.now());
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Updated latest containers resource utilization to " +
+          latestContainersUtilization.getUtilization());
+    }
   }
 
   /**

+ 12 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/SnapshotBasedOverAllocationPreemptionPolicy.java

@@ -20,6 +20,8 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.scheduler;
 import org.apache.hadoop.yarn.api.records.ResourceUtilization;
 import org.apache.hadoop.yarn.server.api.records.ResourceThresholds;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
  * An implementation of {@link NMAllocationPreemptionPolicy} based on the
@@ -29,6 +31,8 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.Contai
  */
 public class SnapshotBasedOverAllocationPreemptionPolicy
     extends NMAllocationPreemptionPolicy {
+  private static final Logger LOG = LoggerFactory.getLogger(
+      SnapshotBasedOverAllocationPreemptionPolicy.class);
   private final int absoluteMemoryPreemptionThresholdMb;
   private final float cpuPreemptionThreshold;
   private final int maxTimesCpuOverPreemption;
@@ -52,6 +56,10 @@ public class SnapshotBasedOverAllocationPreemptionPolicy
     ResourceUtilization utilization =
         getContainersMonitor().getContainersUtilization(true).getUtilization();
 
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("The latest container utilization is " + utilization);
+    }
+
     int memoryOverLimit = utilization.getPhysicalMemory() -
         absoluteMemoryPreemptionThresholdMb;
     float vcoreOverLimit = utilization.getCPU() - cpuPreemptionThreshold;
@@ -59,6 +67,10 @@ public class SnapshotBasedOverAllocationPreemptionPolicy
     if (vcoreOverLimit > 0) {
       timesCpuOverPreemption++;
       if (timesCpuOverPreemption > maxTimesCpuOverPreemption) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("CPU utilization is over the preemption threshold " +
+              timesCpuOverPreemption + " times consecutively.");
+        }
         timesCpuOverPreemption = 0;
       } else {
         // report no over limit for cpu if # of times CPU is over the preemption