Browse Source

YARN-3792. Test case failures in TestDistributedShell and some issue fixes related to ATSV2 (Naganarasimha G R via sjlee)

(cherry picked from commit 84f37f1c7eefec6d139cbf091c50d6c06f734323)
Sangjin Lee, cách đây 10 năm
mục cha
commit
3947422efb
11 tập tin đã thay đổi với 107 bổ sung và 76 xóa
  1. 18 15
      hadoop-yarn-project/CHANGES.txt
  2. 1 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/Client.java
  3. 59 32
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShell.java
  4. 7 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShellWithNodeLabels.java
  5. 8 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/api/impl/TimelineClientImpl.java
  6. 2 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java
  7. 9 6
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java
  8. 1 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/timelineservice/RMTimelineCollectorManager.java
  9. 0 14
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/NodeTimelineCollectorManager.java
  10. 1 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/PerNodeTimelineCollectorsAuxService.java
  11. 1 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/TimelineCollectorManager.java

+ 18 - 15
hadoop-yarn-project/CHANGES.txt

@@ -35,9 +35,6 @@ Branch YARN-2928: Timeline Server Next Generation: Phase 1
     YARN-3333. Rename TimelineAggregator etc. to TimelineCollector. (Sangjin Lee
     via junping_du)
 
-    YARN-3377. Fixed test failure in TestTimelineServiceClientIntegration.
-    (Sangjin Lee via zjshen)
-
     YARN-3034. Implement RM starting its timeline collector. (Naganarasimha G R
     via junping_du)
 
@@ -61,27 +58,15 @@ Branch YARN-2928: Timeline Server Next Generation: Phase 1
     YARN-3551. Consolidate data model change according to the backend
     implementation (Zhijie Shen via sjlee)
 
-    YARN-3562. unit tests failures and issues found from findbug from earlier
-    ATS checkins (Naganarasimha G R via sjlee)
-
     YARN-3134. Implemented Phoenix timeline writer to access HBase backend. (Li
     Lu via zjshen)
 
     YARN-3529. Added mini HBase cluster and Phoenix support to timeline service
     v2 unit tests. (Li Lu via zjshen)
 
-    YARN-3634. TestMRTimelineEventHandling and TestApplication are broken. (
-    Sangjin Lee via junping_du)
-
     YARN-3411. [Storage implementation] explore the native HBase write schema
     for storage (Vrushali C via sjlee)
 
-    YARN-3726. Fix TestHBaseTimelineWriterImpl unit test failure by fixing its
-    test data (Vrushali C via sjlee)
-
-    YARN-3721. build is broken on YARN-2928 branch due to possible dependency
-    cycle (Li Lu via sjlee)
-
     YARN-3044. Made RM write app, attempt and optional container lifecycle
     events to timeline service v2. (Naganarasimha G R via zjshen)
 
@@ -100,6 +85,24 @@ Branch YARN-2928: Timeline Server Next Generation: Phase 1
 
   BUG FIXES
 
+    YARN-3377. Fixed test failure in TestTimelineServiceClientIntegration.
+    (Sangjin Lee via zjshen)
+
+    YARN-3562. unit tests failures and issues found from findbug from earlier
+    ATS checkins (Naganarasimha G R via sjlee)
+
+    YARN-3634. TestMRTimelineEventHandling and TestApplication are broken. (
+    Sangjin Lee via junping_du)
+
+    YARN-3726. Fix TestHBaseTimelineWriterImpl unit test failure by fixing its
+    test data (Vrushali C via sjlee)
+
+    YARN-3721. build is broken on YARN-2928 branch due to possible dependency
+    cycle (Li Lu via sjlee)
+
+    YARN-3792. Test case failures in TestDistributedShell and some issue fixes
+    related to ATSV2 (Naganarasimha G R via sjlee)
+
 Trunk - Unreleased
 
   INCOMPATIBLE CHANGES

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/Client.java

@@ -463,7 +463,7 @@ public class Client {
     }
     if (cliParser.hasOption("flow_run_id")) {
       try {
-        flowRunId = Long.valueOf(cliParser.getOptionValue("flow_run_id"));
+        flowRunId = Long.parseLong(cliParser.getOptionValue("flow_run_id"));
       } catch (NumberFormatException e) {
         throw new IllegalArgumentException(
             "Flow run is not a valid long value", e);

+ 59 - 32
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShell.java

@@ -46,6 +46,7 @@ import org.apache.hadoop.util.JarFinder;
 import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ApplicationReport;
+import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
 import org.apache.hadoop.yarn.api.records.YarnApplicationState;
 import org.apache.hadoop.yarn.api.records.timeline.TimelineDomain;
 import org.apache.hadoop.yarn.api.records.timeline.TimelineEntities;
@@ -86,28 +87,33 @@ public class TestDistributedShell {
 
   @Before
   public void setup() throws Exception {
-    setupInternal(NUM_NMS);
+    setupInternal(NUM_NMS, currTestName);
   }
 
-  protected void setupInternal(int numNodeManager) throws Exception {
-
+  protected void setupInternal(int numNodeManager, TestName testName)
+      throws Exception {
     LOG.info("Starting up YARN cluster");
 
     conf = new YarnConfiguration();
     conf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, 128);
     conf.set("yarn.log.dir", "target");
     conf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
-    // mark if we need to launch the v1 timeline server
-    boolean enableATSV1 = false;
-    if (!currTestName.getMethodName().toLowerCase().contains("v2")) {
+
+    if (!testName.getMethodName().toLowerCase().contains("v2")) {
       // disable aux-service based timeline collectors
       conf.set(YarnConfiguration.NM_AUX_SERVICES, "");
-      enableATSV1 = true;
+      conf.setBoolean(YarnConfiguration.RM_SYSTEM_METRICS_PUBLISHER_ENABLED,
+          true);
+      conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, false);
     } else {
       // enable aux-service based timeline collectors
       conf.set(YarnConfiguration.NM_AUX_SERVICES, TIMELINE_AUX_SERVICE_NAME);
-      conf.set(YarnConfiguration.NM_AUX_SERVICES + "." + TIMELINE_AUX_SERVICE_NAME
-        + ".class", PerNodeTimelineCollectorsAuxService.class.getName());
+      conf.set(YarnConfiguration.NM_AUX_SERVICES + "."
+          + TIMELINE_AUX_SERVICE_NAME + ".class",
+          PerNodeTimelineCollectorsAuxService.class.getName());
+      conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, true);
+      conf.setBoolean(YarnConfiguration.RM_SYSTEM_METRICS_PUBLISHER_ENABLED,
+          false);
     }
     conf.set(YarnConfiguration.NM_VMEM_PMEM_RATIO, "8");
     conf.set(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class.getName());
@@ -123,12 +129,11 @@ public class TestDistributedShell {
     conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, true);
     conf.setBoolean(YarnConfiguration.YARN_MINICLUSTER_CONTROL_RESOURCE_MONITORING,
         true);
-    conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, true);
 
     if (yarnCluster == null) {
       yarnCluster =
           new MiniYARNCluster(TestDistributedShell.class.getSimpleName(), 1,
-              numNodeManager, 1, 1, enableATSV1);
+              numNodeManager, 1, 1);
       yarnCluster.init(conf);
       
       yarnCluster.start();
@@ -303,13 +308,15 @@ public class TestDistributedShell {
       if (checkHostname(appReport.getHost()) && appReport.getRpcPort() == -1) {
         verified = true;
       }
-      if (appReport.getYarnApplicationState() == YarnApplicationState.FINISHED) {
+
+      if (appReport.getYarnApplicationState() == YarnApplicationState.FINISHED
+          && appReport.getFinalApplicationStatus() != FinalApplicationStatus.UNDEFINED) {
         break;
       }
     }
     Assert.assertTrue(errorMessage, verified);
     t.join();
-    LOG.info("Client run completed. Result=" + result);
+    LOG.info("Client run completed for testDSShell. Result=" + result);
     Assert.assertTrue(result.get());
 
     if (!isTestingTimelineV2) {
@@ -364,9 +371,9 @@ public class TestDistributedShell {
     }
   }
 
-  private void checkTimelineV2(
-      boolean haveDomain, ApplicationId appId, boolean defaultFlow)
-      throws Exception {
+  private void checkTimelineV2(boolean haveDomain, ApplicationId appId,
+      boolean defaultFlow) throws Exception {
+    LOG.info("Started checkTimelineV2 ");
     // For PoC check in /tmp/timeline_service_data YARN-3264
     String tmpRoot =
         FileSystemTimelineWriterImpl.DEFAULT_TIMELINE_SERVICE_STORAGE_DIR_ROOT
@@ -417,12 +424,29 @@ public class TestDistributedShell {
           verifyEntityTypeFileExists(basePath,
               TimelineEntityType.YARN_APPLICATION.toString(),
               appMetricsTimestampFileName);
-      verifyStringExistsSpecifiedTimes(appEntityFile,
-          ApplicationMetricsConstants.CREATED_EVENT_TYPE, 1,
-          "Application created event should be published atleast once");
-      verifyStringExistsSpecifiedTimes(appEntityFile,
-          ApplicationMetricsConstants.FINISHED_EVENT_TYPE, 1,
-          "Application finished event should be published atleast once");
+      Assert.assertEquals(
+          "Application created event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appEntityFile,
+              ApplicationMetricsConstants.CREATED_EVENT_TYPE));
+
+      // to avoid race condition of testcase, atleast check 4 times with sleep
+      // of 500ms
+      long numOfStringOccurences = 0;
+      for (int i = 0; i < 4; i++) {
+        numOfStringOccurences =
+            getNumOfStringOccurences(appEntityFile,
+                ApplicationMetricsConstants.FINISHED_EVENT_TYPE);
+        if (numOfStringOccurences > 0) {
+          break;
+        } else {
+          Thread.sleep(500l);
+        }
+      }
+      Assert.assertEquals(
+          "Application finished event should be published atleast once",
+          1,
+          numOfStringOccurences);
 
       // Verify RM posting AppAttempt life cycle Events are getting published
       String appAttemptMetricsTimestampFileName =
@@ -433,12 +457,17 @@ public class TestDistributedShell {
           verifyEntityTypeFileExists(basePath,
               TimelineEntityType.YARN_APPLICATION_ATTEMPT.toString(),
               appAttemptMetricsTimestampFileName);
-      verifyStringExistsSpecifiedTimes(appAttemptEntityFile,
-          AppAttemptMetricsConstants.REGISTERED_EVENT_TYPE, 1,
-          "AppAttempt register event should be published atleast once");
-      verifyStringExistsSpecifiedTimes(appAttemptEntityFile,
-          AppAttemptMetricsConstants.FINISHED_EVENT_TYPE, 1,
-          "AppAttempt finished event should be published atleast once");
+      Assert.assertEquals(
+          "AppAttempt register event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appAttemptEntityFile,
+              AppAttemptMetricsConstants.REGISTERED_EVENT_TYPE));
+
+      Assert.assertEquals(
+          "AppAttempt finished event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appAttemptEntityFile,
+              AppAttemptMetricsConstants.FINISHED_EVENT_TYPE));
     } finally {
       FileUtils.deleteDirectory(tmpRootFolder.getParentFile());
     }
@@ -457,8 +486,7 @@ public class TestDistributedShell {
     return entityFile;
   }
 
-  private void verifyStringExistsSpecifiedTimes(File entityFile,
-      String searchString, long expectedNumOfTimes, String errorMsg)
+  private long getNumOfStringOccurences(File entityFile, String searchString)
       throws IOException {
     BufferedReader reader = null;
     String strLine;
@@ -472,7 +500,7 @@ public class TestDistributedShell {
     } finally {
       reader.close();
     }
-    Assert.assertEquals(errorMsg, expectedNumOfTimes, actualCount);
+    return actualCount;
   }
 
   /**
@@ -1117,4 +1145,3 @@ public class TestDistributedShell {
     return numOfWords;
   }
 }
-

+ 7 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShellWithNodeLabels.java

@@ -30,7 +30,9 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
 import org.junit.Assert;
 import org.junit.Before;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TestName;
 
 import com.google.common.collect.ImmutableMap;
 
@@ -40,11 +42,14 @@ public class TestDistributedShellWithNodeLabels {
   
   static final int NUM_NMS = 2;
   TestDistributedShell distShellTest;
- 
+
+  @Rule
+  public TestName currTestName = new TestName();
+
   @Before
   public void setup() throws Exception {
     distShellTest = new TestDistributedShell();
-    distShellTest.setupInternal(NUM_NMS);
+    distShellTest.setupInternal(NUM_NMS,currTestName);
   }
   
   private void initializeNodeLabels() throws IOException {

+ 8 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/api/impl/TimelineClientImpl.java

@@ -408,6 +408,14 @@ public class TimelineClientImpl extends TimelineClient {
     // timelineServiceAddress could haven't be initialized yet
     // or stale (only for new timeline service)
     int retries = pollTimelineServiceAddress(this.maxServiceRetries);
+    if (timelineServiceAddress == null) {
+      String errMessage = "TimelineClient has reached to max retry times : "
+          + this.maxServiceRetries
+          + ", but failed to fetch timeline service address. Please verify"
+          + " Timeline Auxillary Service is configured in all the NMs";
+      LOG.error(errMessage);
+      throw new YarnException(errMessage);
+    }
 
     // timelineServiceAddress could be stale, add retry logic here.
     boolean needRetry = true;

+ 2 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java

@@ -104,11 +104,11 @@ public class ApplicationImpl implements Application {
     stateMachine = stateMachineFactory.make(this);
     Configuration conf = context.getConf();
     if (YarnConfiguration.systemMetricsPublisherEnabled(conf)) {
-      createAndStartTimelienClient(conf);
+      createAndStartTimelineClient(conf);
     }
   }
   
-  private void createAndStartTimelienClient(Configuration conf) {
+  private void createAndStartTimelineClient(Configuration conf) {
     // create and start timeline client
     this.timelineClient = TimelineClient.createTimelineClient(appId);
     timelineClient.init(conf);

+ 9 - 6
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java

@@ -96,11 +96,8 @@ public class ContainersMonitorImpl extends AbstractService implements
   
   // For posting entities in new timeline service in a non-blocking way
   // TODO replace with event loop in TimelineClient.
-  private static ExecutorService threadPool =
-      Executors.newCachedThreadPool(
-          new ThreadFactoryBuilder().setNameFormat("TimelineService #%d")
-          .build());
-  
+  private static ExecutorService threadPool;
+
   @Private
   public static enum ContainerMetric {
     CPU, MEMORY
@@ -220,6 +217,10 @@ public class ContainersMonitorImpl extends AbstractService implements
     if (publishContainerMetricsToTimelineService) {
       LOG.info("NodeManager has been configured to publish container " +
           "metrics to Timeline Service V2.");
+      threadPool =
+          Executors.newCachedThreadPool(
+              new ThreadFactoryBuilder().setNameFormat("TimelineService #%d")
+              .build());
     } else {
       LOG.warn("NodeManager has not been configured to publish container " +
           "metrics to Timeline Service V2.");
@@ -275,6 +276,9 @@ public class ContainersMonitorImpl extends AbstractService implements
   
   // TODO remove threadPool after adding non-blocking call in TimelineClient
   private static void shutdownAndAwaitTermination() {
+    if (threadPool == null) {
+      return;
+    }
     threadPool.shutdown();
     try {
       if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
@@ -676,7 +680,6 @@ public class ContainersMonitorImpl extends AbstractService implements
             timelineClient.putEntities(entity);
           } catch (IOException|YarnException e) {
             LOG.error("putEntityNonBlocking get failed: " + e);
-            throw new RuntimeException(e.toString());
           }
         }
       };

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/timelineservice/RMTimelineCollectorManager.java

@@ -56,7 +56,7 @@ public class RMTimelineCollectorManager extends TimelineCollectorManager {
       if (parts.length != 2 || parts[1].isEmpty()) {
         continue;
       }
-      switch (parts[0]) {
+      switch (parts[0].toUpperCase()) {
         case TimelineUtils.FLOW_NAME_TAG_PREFIX:
           collector.getTimelineEntityContext().setFlowName(parts[1]);
           break;

+ 0 - 14
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/NodeTimelineCollectorManager.java

@@ -48,21 +48,11 @@ import org.apache.hadoop.yarn.webapp.util.WebAppUtils;
 
 import com.google.common.annotations.VisibleForTesting;
 
-
-/**
- *
- * It is a singleton, and instances should be obtained via
- * {@link #getInstance()}.
- *
- */
 @Private
 @Unstable
 public class NodeTimelineCollectorManager extends TimelineCollectorManager {
   private static final Log LOG =
       LogFactory.getLog(NodeTimelineCollectorManager.class);
-  private static final NodeTimelineCollectorManager INSTANCE =
-      new NodeTimelineCollectorManager();
-
 
   // REST server for this collector manager
   private HttpServer2 timelineRestServer;
@@ -73,10 +63,6 @@ public class NodeTimelineCollectorManager extends TimelineCollectorManager {
 
   static final String COLLECTOR_MANAGER_ATTR_KEY = "collector.manager";
 
-  static NodeTimelineCollectorManager getInstance() {
-    return INSTANCE;
-  }
-
   @VisibleForTesting
   protected NodeTimelineCollectorManager() {
     super(NodeTimelineCollectorManager.class.getName());

+ 1 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/PerNodeTimelineCollectorsAuxService.java

@@ -56,8 +56,7 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService {
   private final NodeTimelineCollectorManager collectorManager;
 
   public PerNodeTimelineCollectorsAuxService() {
-    // use the same singleton
-    this(NodeTimelineCollectorManager.getInstance());
+    this(new NodeTimelineCollectorManager());
   }
 
   @VisibleForTesting PerNodeTimelineCollectorsAuxService(

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/TimelineCollectorManager.java

@@ -128,7 +128,7 @@ public abstract class TimelineCollectorManager extends AbstractService {
       postRemove(appId, collector);
       // stop the service to do clean up
       collector.stop();
-      LOG.info("the collector service for " + appId + " was removed");
+      LOG.info("The collector service for " + appId + " was removed");
     }
     return collector != null;
   }