Explorar o código

YARN-6862. Nodemanager resource usage metrics sometimes are negative. Contributed by Benjamin Teke

Szilard Nemeth hai 3 anos
pai
achega
d3fded12dc

+ 8 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java

@@ -537,6 +537,14 @@ public class ContainersMonitorImpl extends AbstractService implements
             pTree.updateProcessTree();    // update process-tree
             long currentVmemUsage = pTree.getVirtualMemorySize();
             long currentPmemUsage = pTree.getRssMemorySize();
+            if (currentVmemUsage < 0 || currentPmemUsage < 0) {
+              // YARN-6862/YARN-5021 If the container just exited or for
+              // another reason the physical/virtual memory is UNAVAILABLE (-1)
+              // the values shouldn't be aggregated.
+              LOG.info("Skipping monitoring container {} because "
+                  + "memory usage is not available.", containerId);
+              continue;
+            }
 
             // if machine has 6 cores and 3 are used,
             // cpuUsagePercentPerCore should be 300%

+ 10 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockCPUResourceCalculatorProcessTree.java

@@ -56,6 +56,16 @@ public class MockCPUResourceCalculatorProcessTree
     return true;
   }
 
+  @Override
+  public long getVirtualMemorySize(int olderThanAge) {
+    return 0;
+  }
+
+  @Override
+  public long getRssMemorySize(int olderThanAge) {
+    return 0;
+  }
+
   @Override
   public float getCpuUsagePercent() {
     long cpu = this.cpuPercentage;

+ 89 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockMemoryResourceCalculatorProcessTree.java

@@ -0,0 +1,89 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor;
+
+import org.apache.hadoop.yarn.util.ResourceCalculatorProcessTree;
+
+/**
+ * Mock class to obtain resource usage (Memory).
+ */
+public class MockMemoryResourceCalculatorProcessTree extends ResourceCalculatorProcessTree {
+  private final long memorySize = 500000000L;
+
+  private long rssMemorySize = memorySize;
+  private long virtualMemorySize = ResourceCalculatorProcessTree.UNAVAILABLE;
+
+  /**
+   * Constructor for MockMemoryResourceCalculatorProcessTree with specified root
+   * process.
+   * @param root the root process, passed through to the superclass constructor
+   */
+  public MockMemoryResourceCalculatorProcessTree(String root) {
+    super(root);
+  }
+
+  @Override
+  public void updateProcessTree() {
+  }
+
+  @Override
+  public String getProcessTreeDump() {
+    return "";
+  }
+
+  @Override
+  public long getCumulativeCpuTime() {
+    return 0;
+  }
+
+  @Override
+  public boolean checkPidPgrpidForMatch() {
+    return true;
+  }
+
+  @Override
+  public long getRssMemorySize(int olderThanAge) {
+    long rssMemory = this.rssMemorySize;
+    // First getter call will return with 500000000, and second call will
+    // return -1, rest of the calls will return a valid value.
+    if (rssMemory == memorySize) {
+      this.rssMemorySize = ResourceCalculatorProcessTree.UNAVAILABLE;
+    }
+    if (rssMemory == ResourceCalculatorProcessTree.UNAVAILABLE) {
+      this.rssMemorySize = 2 * memorySize;
+    }
+    return rssMemory;
+  }
+
+  @Override
+  public long getVirtualMemorySize(int olderThanAge) {
+    long virtualMemory = this.virtualMemorySize;
+    // First getter call will return with -1, and rest of the calls will
+    // return a valid value.
+    if (virtualMemory == ResourceCalculatorProcessTree.UNAVAILABLE) {
+      this.virtualMemorySize = 3 * memorySize;
+    }
+    return virtualMemory;
+  }
+
+  @Override
+  public float getCpuUsagePercent() {
+    return 0;
+  }
+}

+ 6 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/MockResourceCalculatorProcessTree.java

@@ -51,10 +51,16 @@ public class MockResourceCalculatorProcessTree extends ResourceCalculatorProcess
     this.rssMemorySize = rssMemorySize;
   }
 
+  @Override
   public long getRssMemorySize() {
     return this.rssMemorySize;
   }
 
+  @Override
+  public long getVirtualMemorySize() {
+    return 0;
+  }
+
   @Override
   public float getCpuUsagePercent() {
     return 0;

+ 18 - 6
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/TestContainersMonitorResourceChange.java

@@ -282,13 +282,24 @@ public class TestContainersMonitorResourceChange {
 
   @Test
   public void testContainersCPUResourceForDefaultValue() throws Exception {
+    testContainerMonitoringInvalidResources(
+        MockCPUResourceCalculatorProcessTree.class.getCanonicalName());
+  }
+
+  @Test
+  public void testContainersMemoryResourceUnavailable() throws Exception {
+    testContainerMonitoringInvalidResources(
+        MockMemoryResourceCalculatorProcessTree.class.getCanonicalName());
+  }
+
+  private void testContainerMonitoringInvalidResources(
+      String processTreeClassName) throws Exception {
     Configuration newConf = new Configuration(conf);
-    // set container monitor interval to be 20s
+    // set container monitor interval to be 20ms
     newConf.setLong(YarnConfiguration.NM_CONTAINER_MON_INTERVAL_MS, 20L);
     containersMonitor = createContainersMonitor(executor, dispatcher, context);
     newConf.set(YarnConfiguration.NM_CONTAINER_MON_PROCESS_TREE,
-        MockCPUResourceCalculatorProcessTree.class.getCanonicalName());
-    // set container monitor interval to be 20ms
+        processTreeClassName);
     containersMonitor.init(newConf);
     containersMonitor.start();
 
@@ -305,7 +316,7 @@ public class TestContainersMonitorResourceChange {
         0, containersMonitor.getContainersUtilization()
             .compareTo(ResourceUtilization.newInstance(0, 0, 0.0f)));
 
-    // Verify the container utilization value. Since atleast one round is done,
+    // Verify the container utilization value. Since at least one round is done,
     // we can expect a non-zero value for container utilization as
     // MockCPUResourceCalculatorProcessTree#getCpuUsagePercent will return 50.
     waitForContainerResourceUtilizationChange(containersMonitor, 100);
@@ -324,12 +335,13 @@ public class TestContainersMonitorResourceChange {
       }
 
       LOG.info(
-          "Monitor thread is waiting for resource utlization change.");
+          "Monitor thread is waiting for resource utilization change.");
       Thread.sleep(WAIT_MS_PER_LOOP);
       timeWaiting += WAIT_MS_PER_LOOP;
     }
 
-    assertTrue("Resource utilization is not changed from second run onwards",
+    assertTrue("Resource utilization is not changed after " +
+            timeoutMsecs / WAIT_MS_PER_LOOP + " updates",
         0 != containersMonitor.getContainersUtilization()
             .compareTo(ResourceUtilization.newInstance(0, 0, 0.0f)));
   }