Ver código fonte

YARN-3792. Test case failures in TestDistributedShell and some issue fixes related to ATSV2 (Naganarasimha G R via sjlee)

(cherry picked from commit 84f37f1c7eefec6d139cbf091c50d6c06f734323)
Sangjin Lee 10 anos atrás
pai
commit
3947422efb
11 arquivos alterados com 107 adições e 76 exclusões
  1. 18 15
      hadoop-yarn-project/CHANGES.txt
  2. 1 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/Client.java
  3. 59 32
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShell.java
  4. 7 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShellWithNodeLabels.java
  5. 8 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/api/impl/TimelineClientImpl.java
  6. 2 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java
  7. 9 6
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java
  8. 1 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/timelineservice/RMTimelineCollectorManager.java
  9. 0 14
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/NodeTimelineCollectorManager.java
  10. 1 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/PerNodeTimelineCollectorsAuxService.java
  11. 1 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/TimelineCollectorManager.java

+ 18 - 15
hadoop-yarn-project/CHANGES.txt

@@ -35,9 +35,6 @@ Branch YARN-2928: Timeline Server Next Generation: Phase 1
     YARN-3333. Rename TimelineAggregator etc. to TimelineCollector. (Sangjin Lee
     via junping_du)
 
-    YARN-3377. Fixed test failure in TestTimelineServiceClientIntegration.
-    (Sangjin Lee via zjshen)
-
     YARN-3034. Implement RM starting its timeline collector. (Naganarasimha G R
     via junping_du)
 
@@ -61,27 +58,15 @@ Branch YARN-2928: Timeline Server Next Generation: Phase 1
     YARN-3551. Consolidate data model change according to the backend
     implementation (Zhijie Shen via sjlee)
 
-    YARN-3562. unit tests failures and issues found from findbug from earlier
-    ATS checkins (Naganarasimha G R via sjlee)
-
     YARN-3134. Implemented Phoenix timeline writer to access HBase backend. (Li
     Lu via zjshen)
 
     YARN-3529. Added mini HBase cluster and Phoenix support to timeline service
     v2 unit tests. (Li Lu via zjshen)
 
-    YARN-3634. TestMRTimelineEventHandling and TestApplication are broken. (
-    Sangjin Lee via junping_du)
-
     YARN-3411. [Storage implementation] explore the native HBase write schema
     for storage (Vrushali C via sjlee)
 
-    YARN-3726. Fix TestHBaseTimelineWriterImpl unit test failure by fixing its
-    test data (Vrushali C via sjlee)
-
-    YARN-3721. build is broken on YARN-2928 branch due to possible dependency
-    cycle (Li Lu via sjlee)
-
     YARN-3044. Made RM write app, attempt and optional container lifecycle
     events to timeline service v2. (Naganarasimha G R via zjshen)
 
@@ -100,6 +85,24 @@ Branch YARN-2928: Timeline Server Next Generation: Phase 1
 
   BUG FIXES
 
+    YARN-3377. Fixed test failure in TestTimelineServiceClientIntegration.
+    (Sangjin Lee via zjshen)
+
+    YARN-3562. unit tests failures and issues found from findbug from earlier
+    ATS checkins (Naganarasimha G R via sjlee)
+
+    YARN-3634. TestMRTimelineEventHandling and TestApplication are broken. (
+    Sangjin Lee via junping_du)
+
+    YARN-3726. Fix TestHBaseTimelineWriterImpl unit test failure by fixing its
+    test data (Vrushali C via sjlee)
+
+    YARN-3721. build is broken on YARN-2928 branch due to possible dependency
+    cycle (Li Lu via sjlee)
+
+    YARN-3792. Test case failures in TestDistributedShell and some issue fixes
+    related to ATSV2 (Naganarasimha G R via sjlee)
+
 Trunk - Unreleased
 
   INCOMPATIBLE CHANGES

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/Client.java

@@ -463,7 +463,7 @@ public class Client {
     }
     if (cliParser.hasOption("flow_run_id")) {
       try {
-        flowRunId = Long.valueOf(cliParser.getOptionValue("flow_run_id"));
+        flowRunId = Long.parseLong(cliParser.getOptionValue("flow_run_id"));
       } catch (NumberFormatException e) {
         throw new IllegalArgumentException(
             "Flow run is not a valid long value", e);

+ 59 - 32
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShell.java

@@ -46,6 +46,7 @@ import org.apache.hadoop.util.JarFinder;
 import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.ApplicationReport;
+import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
 import org.apache.hadoop.yarn.api.records.YarnApplicationState;
 import org.apache.hadoop.yarn.api.records.timeline.TimelineDomain;
 import org.apache.hadoop.yarn.api.records.timeline.TimelineEntities;
@@ -86,28 +87,33 @@ public class TestDistributedShell {
 
   @Before
   public void setup() throws Exception {
-    setupInternal(NUM_NMS);
+    setupInternal(NUM_NMS, currTestName);
   }
 
-  protected void setupInternal(int numNodeManager) throws Exception {
-
+  protected void setupInternal(int numNodeManager, TestName testName)
+      throws Exception {
     LOG.info("Starting up YARN cluster");
 
     conf = new YarnConfiguration();
     conf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, 128);
     conf.set("yarn.log.dir", "target");
     conf.setBoolean(YarnConfiguration.TIMELINE_SERVICE_ENABLED, true);
-    // mark if we need to launch the v1 timeline server
-    boolean enableATSV1 = false;
-    if (!currTestName.getMethodName().toLowerCase().contains("v2")) {
+
+    if (!testName.getMethodName().toLowerCase().contains("v2")) {
       // disable aux-service based timeline collectors
       conf.set(YarnConfiguration.NM_AUX_SERVICES, "");
-      enableATSV1 = true;
+      conf.setBoolean(YarnConfiguration.RM_SYSTEM_METRICS_PUBLISHER_ENABLED,
+          true);
+      conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, false);
     } else {
       // enable aux-service based timeline collectors
       conf.set(YarnConfiguration.NM_AUX_SERVICES, TIMELINE_AUX_SERVICE_NAME);
-      conf.set(YarnConfiguration.NM_AUX_SERVICES + "." + TIMELINE_AUX_SERVICE_NAME
-        + ".class", PerNodeTimelineCollectorsAuxService.class.getName());
+      conf.set(YarnConfiguration.NM_AUX_SERVICES + "."
+          + TIMELINE_AUX_SERVICE_NAME + ".class",
+          PerNodeTimelineCollectorsAuxService.class.getName());
+      conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, true);
+      conf.setBoolean(YarnConfiguration.RM_SYSTEM_METRICS_PUBLISHER_ENABLED,
+          false);
     }
     conf.set(YarnConfiguration.NM_VMEM_PMEM_RATIO, "8");
     conf.set(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class.getName());
@@ -123,12 +129,11 @@ public class TestDistributedShell {
     conf.setBoolean(YarnConfiguration.NM_VMEM_CHECK_ENABLED, true);
     conf.setBoolean(YarnConfiguration.YARN_MINICLUSTER_CONTROL_RESOURCE_MONITORING,
         true);
-    conf.setBoolean(YarnConfiguration.SYSTEM_METRICS_PUBLISHER_ENABLED, true);
 
     if (yarnCluster == null) {
       yarnCluster =
           new MiniYARNCluster(TestDistributedShell.class.getSimpleName(), 1,
-              numNodeManager, 1, 1, enableATSV1);
+              numNodeManager, 1, 1);
       yarnCluster.init(conf);
       
       yarnCluster.start();
@@ -303,13 +308,15 @@ public class TestDistributedShell {
       if (checkHostname(appReport.getHost()) && appReport.getRpcPort() == -1) {
         verified = true;
       }
-      if (appReport.getYarnApplicationState() == YarnApplicationState.FINISHED) {
+
+      if (appReport.getYarnApplicationState() == YarnApplicationState.FINISHED
+          && appReport.getFinalApplicationStatus() != FinalApplicationStatus.UNDEFINED) {
         break;
       }
     }
     Assert.assertTrue(errorMessage, verified);
     t.join();
-    LOG.info("Client run completed. Result=" + result);
+    LOG.info("Client run completed for testDSShell. Result=" + result);
     Assert.assertTrue(result.get());
 
     if (!isTestingTimelineV2) {
@@ -364,9 +371,9 @@ public class TestDistributedShell {
     }
   }
 
-  private void checkTimelineV2(
-      boolean haveDomain, ApplicationId appId, boolean defaultFlow)
-      throws Exception {
+  private void checkTimelineV2(boolean haveDomain, ApplicationId appId,
+      boolean defaultFlow) throws Exception {
+    LOG.info("Started checkTimelineV2 ");
     // For PoC check in /tmp/timeline_service_data YARN-3264
     String tmpRoot =
         FileSystemTimelineWriterImpl.DEFAULT_TIMELINE_SERVICE_STORAGE_DIR_ROOT
@@ -417,12 +424,29 @@ public class TestDistributedShell {
           verifyEntityTypeFileExists(basePath,
               TimelineEntityType.YARN_APPLICATION.toString(),
               appMetricsTimestampFileName);
-      verifyStringExistsSpecifiedTimes(appEntityFile,
-          ApplicationMetricsConstants.CREATED_EVENT_TYPE, 1,
-          "Application created event should be published atleast once");
-      verifyStringExistsSpecifiedTimes(appEntityFile,
-          ApplicationMetricsConstants.FINISHED_EVENT_TYPE, 1,
-          "Application finished event should be published atleast once");
+      Assert.assertEquals(
+          "Application created event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appEntityFile,
+              ApplicationMetricsConstants.CREATED_EVENT_TYPE));
+
+      // to avoid race condition of testcase, atleast check 4 times with sleep
+      // of 500ms
+      long numOfStringOccurences = 0;
+      for (int i = 0; i < 4; i++) {
+        numOfStringOccurences =
+            getNumOfStringOccurences(appEntityFile,
+                ApplicationMetricsConstants.FINISHED_EVENT_TYPE);
+        if (numOfStringOccurences > 0) {
+          break;
+        } else {
+          Thread.sleep(500l);
+        }
+      }
+      Assert.assertEquals(
+          "Application finished event should be published atleast once",
+          1,
+          numOfStringOccurences);
 
       // Verify RM posting AppAttempt life cycle Events are getting published
       String appAttemptMetricsTimestampFileName =
@@ -433,12 +457,17 @@ public class TestDistributedShell {
           verifyEntityTypeFileExists(basePath,
               TimelineEntityType.YARN_APPLICATION_ATTEMPT.toString(),
               appAttemptMetricsTimestampFileName);
-      verifyStringExistsSpecifiedTimes(appAttemptEntityFile,
-          AppAttemptMetricsConstants.REGISTERED_EVENT_TYPE, 1,
-          "AppAttempt register event should be published atleast once");
-      verifyStringExistsSpecifiedTimes(appAttemptEntityFile,
-          AppAttemptMetricsConstants.FINISHED_EVENT_TYPE, 1,
-          "AppAttempt finished event should be published atleast once");
+      Assert.assertEquals(
+          "AppAttempt register event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appAttemptEntityFile,
+              AppAttemptMetricsConstants.REGISTERED_EVENT_TYPE));
+
+      Assert.assertEquals(
+          "AppAttempt finished event should be published atleast once",
+          1,
+          getNumOfStringOccurences(appAttemptEntityFile,
+              AppAttemptMetricsConstants.FINISHED_EVENT_TYPE));
     } finally {
       FileUtils.deleteDirectory(tmpRootFolder.getParentFile());
     }
@@ -457,8 +486,7 @@ public class TestDistributedShell {
     return entityFile;
   }
 
-  private void verifyStringExistsSpecifiedTimes(File entityFile,
-      String searchString, long expectedNumOfTimes, String errorMsg)
+  private long getNumOfStringOccurences(File entityFile, String searchString)
       throws IOException {
     BufferedReader reader = null;
     String strLine;
@@ -472,7 +500,7 @@ public class TestDistributedShell {
     } finally {
       reader.close();
     }
-    Assert.assertEquals(errorMsg, expectedNumOfTimes, actualCount);
+    return actualCount;
   }
 
   /**
@@ -1117,4 +1145,3 @@ public class TestDistributedShell {
     return numOfWords;
   }
 }
-

+ 7 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/test/java/org/apache/hadoop/yarn/applications/distributedshell/TestDistributedShellWithNodeLabels.java

@@ -30,7 +30,9 @@ import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
 import org.junit.Assert;
 import org.junit.Before;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.TestName;
 
 import com.google.common.collect.ImmutableMap;
 
@@ -40,11 +42,14 @@ public class TestDistributedShellWithNodeLabels {
   
   static final int NUM_NMS = 2;
   TestDistributedShell distShellTest;
- 
+
+  @Rule
+  public TestName currTestName = new TestName();
+
   @Before
   public void setup() throws Exception {
     distShellTest = new TestDistributedShell();
-    distShellTest.setupInternal(NUM_NMS);
+    distShellTest.setupInternal(NUM_NMS,currTestName);
   }
   
   private void initializeNodeLabels() throws IOException {

+ 8 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/client/api/impl/TimelineClientImpl.java

@@ -408,6 +408,14 @@ public class TimelineClientImpl extends TimelineClient {
     // timelineServiceAddress could haven't be initialized yet
     // or stale (only for new timeline service)
     int retries = pollTimelineServiceAddress(this.maxServiceRetries);
+    if (timelineServiceAddress == null) {
+      String errMessage = "TimelineClient has reached to max retry times : "
+          + this.maxServiceRetries
+          + ", but failed to fetch timeline service address. Please verify"
+          + " Timeline Auxillary Service is configured in all the NMs";
+      LOG.error(errMessage);
+      throw new YarnException(errMessage);
+    }
 
     // timelineServiceAddress could be stale, add retry logic here.
     boolean needRetry = true;

+ 2 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/application/ApplicationImpl.java

@@ -104,11 +104,11 @@ public class ApplicationImpl implements Application {
     stateMachine = stateMachineFactory.make(this);
     Configuration conf = context.getConf();
     if (YarnConfiguration.systemMetricsPublisherEnabled(conf)) {
-      createAndStartTimelienClient(conf);
+      createAndStartTimelineClient(conf);
     }
   }
   
-  private void createAndStartTimelienClient(Configuration conf) {
+  private void createAndStartTimelineClient(Configuration conf) {
     // create and start timeline client
     this.timelineClient = TimelineClient.createTimelineClient(appId);
     timelineClient.init(conf);

+ 9 - 6
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/monitor/ContainersMonitorImpl.java

@@ -96,11 +96,8 @@ public class ContainersMonitorImpl extends AbstractService implements
   
   // For posting entities in new timeline service in a non-blocking way
   // TODO replace with event loop in TimelineClient.
-  private static ExecutorService threadPool =
-      Executors.newCachedThreadPool(
-          new ThreadFactoryBuilder().setNameFormat("TimelineService #%d")
-          .build());
-  
+  private static ExecutorService threadPool;
+
   @Private
   public static enum ContainerMetric {
     CPU, MEMORY
@@ -220,6 +217,10 @@ public class ContainersMonitorImpl extends AbstractService implements
     if (publishContainerMetricsToTimelineService) {
       LOG.info("NodeManager has been configured to publish container " +
           "metrics to Timeline Service V2.");
+      threadPool =
+          Executors.newCachedThreadPool(
+              new ThreadFactoryBuilder().setNameFormat("TimelineService #%d")
+              .build());
     } else {
       LOG.warn("NodeManager has not been configured to publish container " +
           "metrics to Timeline Service V2.");
@@ -275,6 +276,9 @@ public class ContainersMonitorImpl extends AbstractService implements
   
   // TODO remove threadPool after adding non-blocking call in TimelineClient
   private static void shutdownAndAwaitTermination() {
+    if (threadPool == null) {
+      return;
+    }
     threadPool.shutdown();
     try {
       if (!threadPool.awaitTermination(60, TimeUnit.SECONDS)) {
@@ -676,7 +680,6 @@ public class ContainersMonitorImpl extends AbstractService implements
             timelineClient.putEntities(entity);
           } catch (IOException|YarnException e) {
             LOG.error("putEntityNonBlocking get failed: " + e);
-            throw new RuntimeException(e.toString());
           }
         }
       };

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/timelineservice/RMTimelineCollectorManager.java

@@ -56,7 +56,7 @@ public class RMTimelineCollectorManager extends TimelineCollectorManager {
       if (parts.length != 2 || parts[1].isEmpty()) {
         continue;
       }
-      switch (parts[0]) {
+      switch (parts[0].toUpperCase()) {
         case TimelineUtils.FLOW_NAME_TAG_PREFIX:
           collector.getTimelineEntityContext().setFlowName(parts[1]);
           break;

+ 0 - 14
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/NodeTimelineCollectorManager.java

@@ -48,21 +48,11 @@ import org.apache.hadoop.yarn.webapp.util.WebAppUtils;
 
 import com.google.common.annotations.VisibleForTesting;
 
-
-/**
- *
- * It is a singleton, and instances should be obtained via
- * {@link #getInstance()}.
- *
- */
 @Private
 @Unstable
 public class NodeTimelineCollectorManager extends TimelineCollectorManager {
   private static final Log LOG =
       LogFactory.getLog(NodeTimelineCollectorManager.class);
-  private static final NodeTimelineCollectorManager INSTANCE =
-      new NodeTimelineCollectorManager();
-
 
   // REST server for this collector manager
   private HttpServer2 timelineRestServer;
@@ -73,10 +63,6 @@ public class NodeTimelineCollectorManager extends TimelineCollectorManager {
 
   static final String COLLECTOR_MANAGER_ATTR_KEY = "collector.manager";
 
-  static NodeTimelineCollectorManager getInstance() {
-    return INSTANCE;
-  }
-
   @VisibleForTesting
   protected NodeTimelineCollectorManager() {
     super(NodeTimelineCollectorManager.class.getName());

+ 1 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/PerNodeTimelineCollectorsAuxService.java

@@ -56,8 +56,7 @@ public class PerNodeTimelineCollectorsAuxService extends AuxiliaryService {
   private final NodeTimelineCollectorManager collectorManager;
 
   public PerNodeTimelineCollectorsAuxService() {
-    // use the same singleton
-    this(NodeTimelineCollectorManager.getInstance());
+    this(new NodeTimelineCollectorManager());
   }
 
   @VisibleForTesting PerNodeTimelineCollectorsAuxService(

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-timelineservice/src/main/java/org/apache/hadoop/yarn/server/timelineservice/collector/TimelineCollectorManager.java

@@ -128,7 +128,7 @@ public abstract class TimelineCollectorManager extends AbstractService {
       postRemove(appId, collector);
       // stop the service to do clean up
       collector.stop();
-      LOG.info("the collector service for " + appId + " was removed");
+      LOG.info("The collector service for " + appId + " was removed");
     }
     return collector != null;
   }