Browse Source

YARN-6920. Fix resource leak that happens during container re-initialization. (asuresh)

Arun Suresh 7 năm trước cách đây
mục cha
commit
8d3fd81980

+ 17 - 20
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-client/src/test/java/org/apache/hadoop/yarn/client/api/impl/TestNMClient.java

@@ -398,6 +398,8 @@ public class TestNMClient {
               "will be Rolled-back", Arrays.asList(new Integer[] {-1000}));
           testCommitContainer(container.getId(), true);
           testReInitializeContainer(container.getId(), clc, false);
+          testGetContainerStatus(container, i, ContainerState.RUNNING,
+              "will be Re-initialized", Arrays.asList(new Integer[] {-1000}));
           testCommitContainer(container.getId(), false);
         } else {
           testReInitializeContainer(container.getId(), clc, true);
@@ -449,24 +451,21 @@ public class TestNMClient {
       ContainerState state, String diagnostics, List<Integer> exitStatuses)
           throws YarnException, IOException {
     while (true) {
-      try {
-        ContainerStatus status = nmClient.getContainerStatus(
-            container.getId(), container.getNodeId());
-        // NodeManager may still need some time to get the stable
-        // container status
-        if (status.getState() == state) {
-          assertEquals(container.getId(), status.getContainerId());
-          assertTrue("" + index + ": " + status.getDiagnostics(),
-              status.getDiagnostics().contains(diagnostics));
-          
-          assertTrue("Exit Statuses are supposed to be in: " + exitStatuses +
-              ", but the actual exit status code is: " + status.getExitStatus(),
-              exitStatuses.contains(status.getExitStatus()));
-          break;
-        }
-        Thread.sleep(100);
-      } catch (InterruptedException e) {
-        e.printStackTrace();
+      sleep(250);
+      ContainerStatus status = nmClient.getContainerStatus(
+          container.getId(), container.getNodeId());
+      // NodeManager may still need some time to get the stable
+      // container status
+      if (status.getState() == state) {
+        assertEquals(container.getId(), status.getContainerId());
+        assertTrue("" + index + ": " + status.getDiagnostics(),
+            status.getDiagnostics().contains(diagnostics));
+
+        assertTrue("Exit Statuses are supposed to be in: " + exitStatuses +
+                ", but the actual exit status code is: " +
+                status.getExitStatus(),
+            exitStatuses.contains(status.getExitStatus()));
+        break;
       }
     }
   }
@@ -559,9 +558,7 @@ public class TestNMClient {
       ContainerLaunchContext clc, boolean autoCommit)
       throws YarnException, IOException {
     try {
-      sleep(250);
       nmClient.reInitializeContainer(containerId, clc, autoCommit);
-      sleep(250);
     } catch (YarnException e) {
       // NM container will only be in SCHEDULED state, so expect the increase
       // action to fail.

+ 4 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java

@@ -1397,6 +1397,10 @@ public class ContainerImpl implements Container {
       container.resourceSet =
           container.reInitContext.mergedResourceSet(container.resourceSet);
       container.isMarkeForKilling = false;
+      // Ensure Resources are decremented.
+      container.dispatcher.getEventHandler().handle(
+          new ContainerSchedulerEvent(container,
+          ContainerSchedulerEventType.CONTAINER_COMPLETED));
       container.sendScheduleEvent();
     }
   }

+ 4 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/scheduler/ContainerScheduler.java

@@ -466,4 +466,8 @@ public class ContainerScheduler extends AbstractService implements
     return this.context.getContainerManager().getContainersMonitor();
   }
 
+  @VisibleForTesting
+  public ResourceUtilization getCurrentUtilization() {
+    return this.utilizationTracker.getCurrentUtilization();
+  }
 }

+ 9 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/TestContainerManager.java

@@ -74,6 +74,7 @@ import org.apache.hadoop.yarn.api.records.LocalResource;
 import org.apache.hadoop.yarn.api.records.LocalResourceType;
 import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
 import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.api.records.ResourceUtilization;
 import org.apache.hadoop.yarn.api.records.SerializedException;
 import org.apache.hadoop.yarn.api.records.SignalContainerCommand;
 import org.apache.hadoop.yarn.api.records.Token;
@@ -437,7 +438,15 @@ public class TestContainerManager extends BaseContainerManagerTest {
 
     File newStartFile = new File(tmpDir, "start_file_n.txt").getAbsoluteFile();
 
+    ResourceUtilization beforeUpgrade =
+        ResourceUtilization.newInstance(
+            containerManager.getContainerScheduler().getCurrentUtilization());
     prepareContainerUpgrade(autoCommit, false, false, cId, newStartFile);
+    ResourceUtilization afterUpgrade =
+        ResourceUtilization.newInstance(
+            containerManager.getContainerScheduler().getCurrentUtilization());
+    Assert.assertEquals("Possible resource leak detected !!",
+        beforeUpgrade, afterUpgrade);
 
     // Assert that the First process is not alive anymore
     Assert.assertFalse("Process is still alive!",