瀏覽代碼

YARN-8911. ContainerScheduler incorrectly uses percentage number as the cpu resource utilization.

Haibo Chen 6 年之前
父節點
當前提交
766b78ee07
共有 8 個文件被更改,包括 57 次插入和 91 次删除
  1. 4 4
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceUtilization.java
  2. 2 4
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitor.java
  3. 2 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java
  4. 3 24
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/AllocationBasedResourceUtilizationTracker.java
  5. 1 4
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java
  6. 1 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java
  7. 0 18
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/TestAllocationBasedResourceUtilizationTracker.java
  8. 44 35
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/TestContainerSchedulerRecovery.java

+ 4 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ResourceUtilization.java

@@ -89,18 +89,18 @@ public abstract class ResourceUtilization implements
   public abstract void setPhysicalMemory(int pmem);
 
   /**
-   * Get <em>CPU</em> utilization.
+   * Get <em>CPU</em> utilization (The amount of vcores used).
    *
-   * @return <em>CPU utilization</em> normalized to 1 CPU
+   * @return <em>CPU utilization</em>
    */
   @Public
   @Unstable
   public abstract float getCPU();
 
   /**
-   * Set <em>CPU</em> utilization.
+   * Set <em>CPU</em> utilization (The amount of vcores used).
    *
-   * @param cpu <em>CPU utilization</em> normalized to 1 CPU
+   * @param cpu <em>CPU utilization</em>
    */
   @Public
   @Unstable

+ 2 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitor.java

@@ -43,8 +43,7 @@ public interface ContainersMonitor extends Service,
   static void increaseResourceUtilization(
       ContainersMonitor containersMonitor, ResourceUtilization resourceUtil,
       Resource resource) {
-    float vCores = (float) resource.getVirtualCores() /
-        containersMonitor.getVCoresAllocatedForContainers();
+    float vCores = (float) resource.getVirtualCores();
     int vmem = (int) (resource.getMemorySize()
         * containersMonitor.getVmemRatio());
     resourceUtil.addTo((int)resource.getMemorySize(), vmem, vCores);
@@ -60,8 +59,7 @@ public interface ContainersMonitor extends Service,
   static void decreaseResourceUtilization(
       ContainersMonitor containersMonitor, ResourceUtilization resourceUtil,
       Resource resource) {
-    float vCores = (float) resource.getVirtualCores() /
-        containersMonitor.getVCoresAllocatedForContainers();
+    float vCores = (float) resource.getVirtualCores();
     int vmem = (int) (resource.getMemorySize()
         * containersMonitor.getVmemRatio());
     resourceUtil.subtractFrom((int)resource.getMemorySize(), vmem, vCores);

+ 2 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java

@@ -949,7 +949,8 @@ public class ContainersMonitorImpl extends AbstractService implements
   public void subtractNodeResourcesFromResourceUtilization(
       ResourceUtilization resourceUtil) {
     resourceUtil.subtractFrom((int) (getPmemAllocatedForContainers() >> 20),
-        (int) (getVmemAllocatedForContainers() >> 20), 1.0f);
+        (int) (getVmemAllocatedForContainers() >> 20),
+        getVCoresAllocatedForContainers());
   }
 
   @Override

+ 3 - 24
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/AllocationBasedResourceUtilizationTracker.java

@@ -123,35 +123,14 @@ public class AllocationBasedResourceUtilizationTracker implements
           this.containersAllocation.getCPU(),
           getContainersMonitor().getVCoresAllocatedForContainers());
     }
-    // Check CPU. Compare using integral values of cores to avoid decimal
-    // inaccuracies.
-    if (!hasEnoughCpu(this.containersAllocation.getCPU(),
-        getContainersMonitor().getVCoresAllocatedForContainers(), cpuVcores)) {
+    // Check CPU.
+    if (this.containersAllocation.getCPU() + cpuVcores >
+        getContainersMonitor().getVCoresAllocatedForContainers()) {
       return false;
     }
     return true;
   }
 
-  /**
-   * Returns whether there is enough space for coresRequested in totalCores.
-   * Converts currentAllocation usage to nearest integer count before comparing,
-   * as floats are inherently imprecise. NOTE: this calculation assumes that
-   * requested core counts must be integers, and currentAllocation core count
-   * must also be an integer.
-   *
-   * @param currentAllocation The current allocation, a float value from 0 to 1.
-   * @param totalCores The total cores in the system.
-   * @param coresRequested The number of cores requested.
-   * @return True if currentAllocationtotalCores*coresRequested &lt;=
-   *         totalCores.
-   */
-  public boolean hasEnoughCpu(float currentAllocation, long totalCores,
-      int coresRequested) {
-    // Must not cast here, as it would truncate the decimal digits.
-    return Math.round(currentAllocation * totalCores)
-        + coresRequested <= totalCores;
-  }
-
   public ContainersMonitor getContainersMonitor() {
     return this.scheduler.getContainersMonitor();
   }

+ 1 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java

@@ -594,10 +594,7 @@ public class ContainerScheduler extends AbstractService implements
       ResourceUtilization resourcesToFreeUp) {
     return resourcesToFreeUp.getPhysicalMemory() <= 0 &&
         resourcesToFreeUp.getVirtualMemory() <= 0 &&
-        // Convert the number of cores to nearest integral number, due to
-        // imprecision of direct float comparison.
-        Math.round(resourcesToFreeUp.getCPU()
-            * getContainersMonitor().getVCoresAllocatedForContainers()) <= 0;
+        resourcesToFreeUp.getCPU() <= 0;
   }
 
   private ResourceUtilization resourcesToFreeUp(

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManagerRecovery.java

@@ -527,7 +527,7 @@ public class TestContainerManagerRecovery extends BaseContainerManagerTest {
     assertNotNull(app);
 
     ResourceUtilization utilization =
-        ResourceUtilization.newInstance(1024, 2048, 0.25F);
+        ResourceUtilization.newInstance(1024, 2048, 1.0F);
     assertEquals(cm.getContainerScheduler().getNumRunningContainers(), 1);
     assertEquals(utilization,
         cm.getContainerScheduler().getCurrentUtilization());

+ 0 - 18
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/TestAllocationBasedResourceUtilizationTracker.java

@@ -72,22 +72,4 @@ public class TestAllocationBasedResourceUtilizationTracker {
     }
     Assert.assertFalse(tracker.hasResourcesAvailable(testContainer));
   }
-
-  /**
-   * Test the case where the current allocation has been truncated to 0.8888891
-   * (8/9 cores used). Request 1 additional core - hasEnoughCpu should return
-   * true.
-   */
-  @Test
-  public void testHasEnoughCpu() {
-    AllocationBasedResourceUtilizationTracker tracker =
-        new AllocationBasedResourceUtilizationTracker(mockContainerScheduler);
-    float currentAllocation = 0.8888891f;
-    long totalCores = 9;
-    int alreadyUsedCores = 8;
-    Assert.assertTrue(tracker.hasEnoughCpu(currentAllocation, totalCores,
-        (int) totalCores - alreadyUsedCores));
-    Assert.assertFalse(tracker.hasEnoughCpu(currentAllocation, totalCores,
-        (int) totalCores - alreadyUsedCores + 1));
-  }
 }

+ 44 - 35
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/TestContainerSchedulerRecovery.java

@@ -20,17 +20,19 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.scheduler;
 import static org.junit.Assert.assertEquals;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.when;
-import static org.mockito.Mockito.spy;
-import static org.mockito.Mockito.doNothing;
 
 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.ExecutionType;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceUtilization;
 import org.apache.hadoop.yarn.event.AsyncDispatcher;
 import org.apache.hadoop.yarn.security.ContainerTokenIdentifier;
 import org.apache.hadoop.yarn.server.nodemanager.NodeManager.NMContext;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.ContainerManager;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.ContainerImpl;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.monitor.ContainersMonitor;
 import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService
         .RecoveredContainerState;
@@ -40,7 +42,6 @@ import org.junit.Before;
 import org.junit.Test;
 import org.mockito.InjectMocks;
 import org.mockito.Mock;
-import org.mockito.Mockito;
 import org.mockito.MockitoAnnotations;
 
 /**
@@ -49,6 +50,10 @@ import org.mockito.MockitoAnnotations;
  * ExecutionType.
  */
 public class TestContainerSchedulerRecovery {
+  private static final Resource CONTAINER_SIZE =
+      Resource.newInstance(1024, 4);
+  private static final ResourceUtilization ZERO =
+      ResourceUtilization.newInstance(0, 0, 0.0f);
 
   @Mock private NMContext context;
 
@@ -66,13 +71,9 @@ public class TestContainerSchedulerRecovery {
 
   @Mock private ContainerId containerId;
 
-  @Mock private AllocationBasedResourceUtilizationTracker
-      allocationBasedResourceUtilizationTracker;
-
-  @InjectMocks private ContainerScheduler tempContainerScheduler =
+  @InjectMocks private ContainerScheduler spy =
       new ContainerScheduler(context, dispatcher, metrics, 0);
 
-  private ContainerScheduler spy;
 
   private RecoveredContainerState createRecoveredContainerState(
       RecoveredContainerStatus status) {
@@ -81,16 +82,32 @@ public class TestContainerSchedulerRecovery {
     return mockState;
   }
 
+  /**
+   * Set up the {@link ContainersMonitor} dependency of
+   * {@link ResourceUtilizationTracker} so that we can
+   * verify the resource utilization.
+   */
+  private void setupContainerMonitor() {
+    ContainersMonitor containersMonitor = mock(ContainersMonitor.class);
+    when(containersMonitor.getVCoresAllocatedForContainers()).thenReturn(10L);
+    when(containersMonitor.getPmemAllocatedForContainers()).thenReturn(10240L);
+    when(containersMonitor.getVmemRatio()).thenReturn(1.0f);
+    when(containersMonitor.getVmemAllocatedForContainers()).thenReturn(10240L);
+
+    ContainerManager cm = mock(ContainerManager.class);
+    when(cm.getContainersMonitor()).thenReturn(containersMonitor);
+    when(context.getContainerManager()).thenReturn(cm);
+  }
+
   @Before public void setUp() throws Exception {
     MockitoAnnotations.initMocks(this);
-    spy = spy(tempContainerScheduler);
+    setupContainerMonitor();
     when(container.getContainerId()).thenReturn(containerId);
+    when(container.getResource()).thenReturn(CONTAINER_SIZE);
     when(containerId.getApplicationAttemptId()).thenReturn(appAttemptId);
     when(containerId.getApplicationAttemptId().getApplicationId())
         .thenReturn(appId);
     when(containerId.getContainerId()).thenReturn(123L);
-    doNothing().when(allocationBasedResourceUtilizationTracker)
-        .addContainerResources(container);
   }
 
   @After public void tearDown() {
@@ -112,8 +129,7 @@ public class TestContainerSchedulerRecovery {
     assertEquals(1, spy.getNumQueuedGuaranteedContainers());
     assertEquals(0, spy.getNumQueuedOpportunisticContainers());
     assertEquals(0, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(0))
-        .addContainerResources(container);
+    assertEquals(ZERO, spy.getCurrentUtilization());
   }
 
   /*Test if a container is recovered as QUEUED, OPPORTUNISTIC,
@@ -132,8 +148,7 @@ public class TestContainerSchedulerRecovery {
     assertEquals(0, spy.getNumQueuedGuaranteedContainers());
     assertEquals(1, spy.getNumQueuedOpportunisticContainers());
     assertEquals(0, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(0))
-        .addContainerResources(container);
+    assertEquals(ZERO, spy.getCurrentUtilization());
   }
 
   /*Test if a container is recovered as PAUSED, GUARANTEED,
@@ -152,8 +167,7 @@ public class TestContainerSchedulerRecovery {
     assertEquals(1, spy.getNumQueuedGuaranteedContainers());
     assertEquals(0, spy.getNumQueuedOpportunisticContainers());
     assertEquals(0, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(0))
-        .addContainerResources(container);
+    assertEquals(ZERO, spy.getCurrentUtilization());
   }
 
   /*Test if a container is recovered as PAUSED, OPPORTUNISTIC,
@@ -172,8 +186,7 @@ public class TestContainerSchedulerRecovery {
     assertEquals(0, spy.getNumQueuedGuaranteedContainers());
     assertEquals(1, spy.getNumQueuedOpportunisticContainers());
     assertEquals(0, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(0))
-        .addContainerResources(container);
+    assertEquals(ZERO, spy.getCurrentUtilization());
   }
 
   /*Test if a container is recovered as LAUNCHED, GUARANTEED,
@@ -192,8 +205,9 @@ public class TestContainerSchedulerRecovery {
     assertEquals(0, spy.getNumQueuedGuaranteedContainers());
     assertEquals(0, spy.getNumQueuedOpportunisticContainers());
     assertEquals(1, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(1))
-        .addContainerResources(container);
+    assertEquals(
+        ResourceUtilization.newInstance(1024, 1024, 4.0f),
+        spy.getCurrentUtilization());
   }
 
   /*Test if a container is recovered as LAUNCHED, OPPORTUNISTIC,
@@ -212,8 +226,9 @@ public class TestContainerSchedulerRecovery {
     assertEquals(0, spy.getNumQueuedGuaranteedContainers());
     assertEquals(0, spy.getNumQueuedOpportunisticContainers());
     assertEquals(1, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(1))
-        .addContainerResources(container);
+    assertEquals(
+        ResourceUtilization.newInstance(1024, 1024, 4.0f),
+        spy.getCurrentUtilization());
   }
 
   /*Test if a container is recovered as REQUESTED, GUARANTEED,
@@ -232,8 +247,7 @@ public class TestContainerSchedulerRecovery {
     assertEquals(0, spy.getNumQueuedGuaranteedContainers());
     assertEquals(0, spy.getNumQueuedOpportunisticContainers());
     assertEquals(0, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(0))
-        .addContainerResources(container);
+    assertEquals(ZERO, spy.getCurrentUtilization());
   }
 
   /*Test if a container is recovered as REQUESTED, OPPORTUNISTIC,
@@ -252,8 +266,7 @@ public class TestContainerSchedulerRecovery {
     assertEquals(0, spy.getNumQueuedGuaranteedContainers());
     assertEquals(0, spy.getNumQueuedOpportunisticContainers());
     assertEquals(0, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(0))
-        .addContainerResources(container);
+    assertEquals(ZERO, spy.getCurrentUtilization());
   }
 
   /*Test if a container is recovered as COMPLETED, GUARANTEED,
@@ -272,8 +285,7 @@ public class TestContainerSchedulerRecovery {
     assertEquals(0, spy.getNumQueuedGuaranteedContainers());
     assertEquals(0, spy.getNumQueuedOpportunisticContainers());
     assertEquals(0, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(0))
-        .addContainerResources(container);
+    assertEquals(ZERO, spy.getCurrentUtilization());
   }
 
   /*Test if a container is recovered as COMPLETED, OPPORTUNISTIC,
@@ -292,8 +304,7 @@ public class TestContainerSchedulerRecovery {
     assertEquals(0, spy.getNumQueuedGuaranteedContainers());
     assertEquals(0, spy.getNumQueuedOpportunisticContainers());
     assertEquals(0, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(0))
-        .addContainerResources(container);
+    assertEquals(ZERO, spy.getCurrentUtilization());
   }
 
   /*Test if a container is recovered as GUARANTEED but no executionType set,
@@ -311,8 +322,7 @@ public class TestContainerSchedulerRecovery {
     assertEquals(0, spy.getNumQueuedGuaranteedContainers());
     assertEquals(0, spy.getNumQueuedOpportunisticContainers());
     assertEquals(0, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(0))
-        .addContainerResources(container);
+    assertEquals(ZERO, spy.getCurrentUtilization());
   }
 
   /*Test if a container is recovered as PAUSED but no executionType set,
@@ -330,7 +340,6 @@ public class TestContainerSchedulerRecovery {
     assertEquals(0, spy.getNumQueuedGuaranteedContainers());
     assertEquals(0, spy.getNumQueuedOpportunisticContainers());
     assertEquals(0, spy.getNumRunningContainers());
-    Mockito.verify(allocationBasedResourceUtilizationTracker, Mockito.times(0))
-        .addContainerResources(container);
+    assertEquals(ZERO, spy.getCurrentUtilization());
   }
 }