浏览代码

YARN-9877 - Intermittent TIME_OUT of LogAggregationReport (#5784)

K0K0V0K 1 年之前
父节点
当前提交
82c8070e93

+ 7 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppImpl.java

@@ -1088,8 +1088,13 @@ public class RMAppImpl implements RMApp, Recoverable {
       // otherwise, add it to ranNodes for further process
       app.ranNodes.add(nodeAddedEvent.getNodeId());
 
-      app.logAggregation.addReportIfNecessary(
-          nodeAddedEvent.getNodeId(), app.getApplicationId());
+      if (!nodeAddedEvent.isCreatedFromAcquiredState()) {
+        app.logAggregation.addReportIfNecessary(
+            nodeAddedEvent.getNodeId(), app.getApplicationId());
+      } else {
+        LOG.debug("Not considering node for log aggregation yet. nodeId: {}, appId: {}",
+            nodeAddedEvent.getNodeId(), app.getApplicationId());
+      }
     }
   }
 

+ 14 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/RMAppRunningOnNodeEvent.java

@@ -23,13 +23,27 @@ import org.apache.hadoop.yarn.api.records.NodeId;
 
 public class RMAppRunningOnNodeEvent extends RMAppEvent {
   private final NodeId node;
+  private final boolean createdFromAcquiredState;
 
   public RMAppRunningOnNodeEvent(ApplicationId appId, NodeId node) {
+    this(appId, node, false);
+  }
+
+  public RMAppRunningOnNodeEvent(
+      ApplicationId appId,
+      NodeId node,
+      boolean createdFromAcquiredState
+  ) {
     super(appId, RMAppEventType.APP_RUNNING_ON_NODE);
     this.node = node;
+    this.createdFromAcquiredState = createdFromAcquiredState;
   }
   
   public NodeId getNodeId() {
     return node;
   }
+
+  public boolean isCreatedFromAcquiredState() {
+    return createdFromAcquiredState;
+  }
 }

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/rmcontainer/RMContainerImpl.java

@@ -606,7 +606,7 @@ public class RMContainerImpl implements RMContainer {
 
       // Tell the app
       container.eventHandler.handle(new RMAppRunningOnNodeEvent(container
-          .getApplicationAttemptId().getApplicationId(), container.nodeId));
+          .getApplicationAttemptId().getApplicationId(), container.nodeId, true));
 
       // Opportunistic containers move directly from NEW to ACQUIRED
       if (container.getState() == RMContainerState.NEW) {

+ 12 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/rmapp/TestRMAppTransitions.java

@@ -51,6 +51,7 @@ import org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPB
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.event.DrainDispatcher;
 import org.apache.hadoop.yarn.event.EventHandler;
+import org.apache.hadoop.yarn.server.api.protocolrecords.LogAggregationReport;
 import org.apache.hadoop.yarn.server.resourcemanager.ApplicationMasterService;
 import org.apache.hadoop.yarn.server.resourcemanager.RMAppManagerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.RMAppManagerEventType;
@@ -962,6 +963,17 @@ public class TestRMAppTransitions {
     assertAppStateLaunchTimeSaved(1234L);
   }
 
+  @Test
+  public void testAcquiredReleased() throws IOException {
+    RMApp application = testCreateAppSubmittedNoRecovery(null);
+    NodeId nodeId = NodeId.newInstance("host", 1234);
+    application.handle(
+        new RMAppRunningOnNodeEvent(application.getApplicationId(), nodeId, true));
+    Map<NodeId, LogAggregationReport> logAggregationReportsForApp =
+        application.getLogAggregationReportsForApp();
+    assertEquals(0, logAggregationReportsForApp.size());
+  }
+
   @Test
   public void testAppAcceptedAttemptKilled() throws IOException,
       InterruptedException {