|
@@ -18,32 +18,43 @@
|
|
|
|
|
|
package org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor;
|
|
|
|
|
|
+import java.io.IOException;
|
|
|
import java.util.Map;
|
|
|
import java.util.Map.Entry;
|
|
|
import java.util.concurrent.ConcurrentHashMap;
|
|
|
+import java.util.concurrent.ExecutorService;
|
|
|
+import java.util.concurrent.Executors;
|
|
|
+import java.util.concurrent.TimeUnit;
|
|
|
|
|
|
import com.google.common.annotations.VisibleForTesting;
|
|
|
import org.apache.commons.logging.Log;
|
|
|
import org.apache.commons.logging.LogFactory;
|
|
|
+import org.apache.hadoop.classification.InterfaceAudience.Private;
|
|
|
import org.apache.hadoop.conf.Configuration;
|
|
|
import org.apache.hadoop.service.AbstractService;
|
|
|
import org.apache.hadoop.util.StringUtils.TraditionalBinaryPrefix;
|
|
|
import org.apache.hadoop.yarn.api.records.ContainerExitStatus;
|
|
|
import org.apache.hadoop.yarn.api.records.ContainerId;
|
|
|
import org.apache.hadoop.yarn.api.records.Resource;
|
|
|
+import org.apache.hadoop.yarn.api.records.timelineservice.ContainerEntity;
|
|
|
+import org.apache.hadoop.yarn.api.records.timelineservice.TimelineEntity;
|
|
|
+import org.apache.hadoop.yarn.api.records.timelineservice.TimelineMetric;
|
|
|
+import org.apache.hadoop.yarn.client.api.TimelineClient;
|
|
|
import org.apache.hadoop.yarn.conf.YarnConfiguration;
|
|
|
import org.apache.hadoop.yarn.event.AsyncDispatcher;
|
|
|
import org.apache.hadoop.yarn.event.Dispatcher;
|
|
|
import org.apache.hadoop.yarn.api.records.ResourceUtilization;
|
|
|
+import org.apache.hadoop.yarn.exceptions.YarnException;
|
|
|
import org.apache.hadoop.yarn.server.nodemanager.ContainerExecutor;
|
|
|
import org.apache.hadoop.yarn.server.nodemanager.Context;
|
|
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.Container;
|
|
|
import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerKillEvent;
|
|
|
import org.apache.hadoop.yarn.server.nodemanager.util.NodeManagerHardwareUtils;
|
|
|
-import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
|
|
|
import org.apache.hadoop.yarn.util.ResourceCalculatorPlugin;
|
|
|
+import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
|
|
|
|
|
|
import com.google.common.base.Preconditions;
|
|
|
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
|
|
|
|
|
public class ContainersMonitorImpl extends AbstractService implements
|
|
|
ContainersMonitor {
|
|
@@ -75,11 +86,25 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|
|
private boolean pmemCheckEnabled;
|
|
|
private boolean vmemCheckEnabled;
|
|
|
private boolean containersMonitorEnabled;
|
|
|
+
|
|
|
+ private boolean publishContainerMetricsToTimelineService;
|
|
|
|
|
|
private long maxVCoresAllottedForContainers;
|
|
|
|
|
|
private static final long UNKNOWN_MEMORY_LIMIT = -1L;
|
|
|
private int nodeCpuPercentageForYARN;
|
|
|
+
|
|
|
+ // For posting entities in new timeline service in a non-blocking way
|
|
|
+ // TODO replace with event loop in TimelineClient.
|
|
|
+ private static ExecutorService threadPool =
|
|
|
+ Executors.newCachedThreadPool(
|
|
|
+ new ThreadFactoryBuilder().setNameFormat("TimelineService #%d")
|
|
|
+ .build());
|
|
|
+
|
|
|
+ @Private
|
|
|
+ public static enum ContainerMetric {
|
|
|
+ CPU, MEMORY
|
|
|
+ }
|
|
|
|
|
|
private ResourceUtilization containersUtilization;
|
|
|
|
|
@@ -188,6 +213,18 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|
|
1) + "). Thrashing might happen.");
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ publishContainerMetricsToTimelineService =
|
|
|
+ YarnConfiguration.systemMetricsPublisherEnabled(conf);
|
|
|
+
|
|
|
+ if (publishContainerMetricsToTimelineService) {
|
|
|
+ LOG.info("NodeManager has been configured to publish container " +
|
|
|
+ "metrics to Timeline Service V2.");
|
|
|
+ } else {
|
|
|
+ LOG.warn("NodeManager has not been configured to publish container " +
|
|
|
+ "metrics to Timeline Service V2.");
|
|
|
+ }
|
|
|
+
|
|
|
super.serviceInit(conf);
|
|
|
}
|
|
|
|
|
@@ -230,8 +267,27 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|
|
;
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ shutdownAndAwaitTermination();
|
|
|
+
|
|
|
super.serviceStop();
|
|
|
}
|
|
|
+
|
|
|
+ // TODO remove threadPool after adding non-blocking call in TimelineClient
|
|
|
+ private static void shutdownAndAwaitTermination() {
|
|
|
+ threadPool.shutdown();
|
|
|
+ try {
|
|
|
+ if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
|
|
|
+ threadPool.shutdownNow();
|
|
|
+ if (!threadPool.awaitTermination(60, TimeUnit.SECONDS))
|
|
|
+ LOG.error("ThreadPool did not terminate");
|
|
|
+ }
|
|
|
+ } catch (InterruptedException ie) {
|
|
|
+ threadPool.shutdownNow();
|
|
|
+ // Preserve interrupt status
|
|
|
+ Thread.currentThread().interrupt();
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
@VisibleForTesting
|
|
|
static class ProcessTreeInfo {
|
|
@@ -409,6 +465,10 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|
|
.entrySet()) {
|
|
|
ContainerId containerId = entry.getKey();
|
|
|
ProcessTreeInfo ptInfo = entry.getValue();
|
|
|
+
|
|
|
+ ContainerEntity entity = new ContainerEntity();
|
|
|
+ entity.setId(containerId.toString());
|
|
|
+
|
|
|
try {
|
|
|
String pId = ptInfo.getPID();
|
|
|
|
|
@@ -423,7 +483,8 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|
|
+ " for the first time");
|
|
|
|
|
|
ResourceCalculatorProcessTree pt =
|
|
|
- ResourceCalculatorProcessTree.getResourceCalculatorProcessTree(pId, processTreeClass, conf);
|
|
|
+ ResourceCalculatorProcessTree.getResourceCalculatorProcessTree(
|
|
|
+ pId, processTreeClass, conf);
|
|
|
ptInfo.setPid(pId);
|
|
|
ptInfo.setProcessTree(pt);
|
|
|
|
|
@@ -447,13 +508,15 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|
|
pTree.updateProcessTree(); // update process-tree
|
|
|
long currentVmemUsage = pTree.getVirtualMemorySize();
|
|
|
long currentPmemUsage = pTree.getRssMemorySize();
|
|
|
+ long currentTime = System.currentTimeMillis();
|
|
|
+
|
|
|
// if machine has 6 cores and 3 are used,
|
|
|
// cpuUsagePercentPerCore should be 300% and
|
|
|
// cpuUsageTotalCoresPercentage should be 50%
|
|
|
float cpuUsagePercentPerCore = pTree.getCpuUsagePercent();
|
|
|
float cpuUsageTotalCoresPercentage = cpuUsagePercentPerCore /
|
|
|
resourceCalculatorPlugin.getNumProcessors();
|
|
|
-
|
|
|
+
|
|
|
// Multiply by 1000 to avoid losing data when converting to int
|
|
|
int milliVcoresUsed = (int) (cpuUsageTotalCoresPercentage * 1000
|
|
|
* maxVCoresAllottedForContainers /nodeCpuPercentageForYARN);
|
|
@@ -490,6 +553,26 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|
|
((int)cpuUsagePercentPerCore, milliVcoresUsed);
|
|
|
}
|
|
|
|
|
|
+ if (publishContainerMetricsToTimelineService) {
|
|
|
+ // if currentPmemUsage data is available
|
|
|
+ if (currentPmemUsage !=
|
|
|
+ ResourceCalculatorProcessTree.UNAVAILABLE) {
|
|
|
+ TimelineMetric memoryMetric = new TimelineMetric();
|
|
|
+ memoryMetric.setId(ContainerMetric.MEMORY.toString() + pId);
|
|
|
+ memoryMetric.addTimeSeriesData(currentTime, currentPmemUsage);
|
|
|
+ entity.addMetric(memoryMetric);
|
|
|
+ }
|
|
|
+ // if cpuUsageTotalCoresPercentage data is available
|
|
|
+ if (cpuUsageTotalCoresPercentage !=
|
|
|
+ ResourceCalculatorProcessTree.UNAVAILABLE) {
|
|
|
+ TimelineMetric cpuMetric = new TimelineMetric();
|
|
|
+ cpuMetric.setId(ContainerMetric.CPU.toString() + pId);
|
|
|
+ cpuMetric.addTimeSeriesData(currentTime,
|
|
|
+ cpuUsageTotalCoresPercentage);
|
|
|
+ entity.addMetric(cpuMetric);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
boolean isMemoryOverLimit = false;
|
|
|
String msg = "";
|
|
|
int containerExitStatus = ContainerExitStatus.INVALID;
|
|
@@ -544,10 +627,23 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|
|
trackingContainers.remove(containerId);
|
|
|
LOG.info("Removed ProcessTree with root " + pId);
|
|
|
}
|
|
|
+
|
|
|
} catch (Exception e) {
|
|
|
// Log the exception and proceed to the next container.
|
|
|
- LOG.warn("Uncaught exception in ContainerMemoryManager "
|
|
|
- + "while managing memory of " + containerId, e);
|
|
|
+ LOG.warn("Uncaught exception in ContainersMonitorImpl "
|
|
|
+ + "while monitoring resource of " + containerId, e);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (publishContainerMetricsToTimelineService) {
|
|
|
+ try {
|
|
|
+ TimelineClient timelineClient = context.getApplications().get(
|
|
|
+ containerId.getApplicationAttemptId().getApplicationId()).
|
|
|
+ getTimelineClient();
|
|
|
+ putEntityWithoutBlocking(timelineClient, entity);
|
|
|
+ } catch (Exception e) {
|
|
|
+ LOG.error("Exception in ContainersMonitorImpl in putting " +
|
|
|
+ "resource usage metrics to timeline service.", e);
|
|
|
+ }
|
|
|
}
|
|
|
}
|
|
|
if (LOG.isDebugEnabled()) {
|
|
@@ -571,6 +667,21 @@ public class ContainersMonitorImpl extends AbstractService implements
|
|
|
}
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ private void putEntityWithoutBlocking(final TimelineClient timelineClient,
|
|
|
+ final TimelineEntity entity) {
|
|
|
+ Runnable publishWrapper = new Runnable() {
|
|
|
+ public void run() {
|
|
|
+ try {
|
|
|
+ timelineClient.putEntities(entity);
|
|
|
+ } catch (IOException|YarnException e) {
|
|
|
+ LOG.error("putEntityNonBlocking get failed: " + e);
|
|
|
+ throw new RuntimeException(e.toString());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ };
|
|
|
+ threadPool.execute(publishWrapper);
|
|
|
+ }
|
|
|
|
|
|
private String formatErrorMessage(String memTypeExceeded,
|
|
|
long currentVmemUsage, long vmemLimit,
|