Browse Source

YARN-4983. JVM and UGI metrics disappear after RM transitioned to standby mode

Jian He 9 năm trước cách đây
mục cha
commit
4beff01354

+ 4 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/source/JvmMetrics.java

@@ -86,6 +86,10 @@ public class JvmMetrics implements MetricsSource {
                        new JvmMetrics(processName, sessionId));
   }
 
+  public static void reattach(MetricsSystem ms, JvmMetrics jvmMetrics) {
+    ms.register(JvmMetrics.name(), JvmMetrics.description(), jvmMetrics);
+  }
+
   public static JvmMetrics initSingleton(String processName, String sessionId) {
     return Singleton.INSTANCE.init(processName, sessionId);
   }

+ 11 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java

@@ -126,6 +126,10 @@ public class UserGroupInformation {
       return DefaultMetricsSystem.instance().register(new UgiMetrics());
     }
 
+    static void reattach() {
+      metrics = UgiMetrics.create();
+    }
+
     void addGetGroups(long latency) {
       getGroups.add(latency);
       if (getGroupsQuantiles != null) {
@@ -238,6 +242,13 @@ public class UserGroupInformation {
     }
   }
 
+  /**
+   * Reattach the class's metrics to a new metric system.
+   */
+  public static void reattachMetrics() {
+    UgiMetrics.reattach();
+  }
+
   /** Metrics to track UGI activity */
   static UgiMetrics metrics = UgiMetrics.create();
   /** The auth method to use */

+ 20 - 8
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java

@@ -30,6 +30,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.ha.HAServiceProtocol;
 import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
 import org.apache.hadoop.http.lib.StaticUserWebFilter;
+import org.apache.hadoop.metrics2.MetricsSystem;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.metrics2.source.JvmMetrics;
 import org.apache.hadoop.security.AuthenticationFilterInitializer;
@@ -173,7 +174,7 @@ public class ResourceManager extends CompositeService implements Recoverable {
   private WebApp webApp;
   private AppReportFetcher fetcher = null;
   protected ResourceTrackerService resourceTracker;
-  private JvmPauseMonitor pauseMonitor;
+  private JvmMetrics jvmMetrics;
   private boolean curatorEnabled = false;
   private CuratorFramework curator;
   private final String zkRootNodePassword =
@@ -286,7 +287,7 @@ public class ResourceManager extends CompositeService implements Recoverable {
 
     rmContext.setYarnConfiguration(conf);
     
-    createAndInitActiveServices();
+    createAndInitActiveServices(false);
 
     webAppAddress = WebAppUtils.getWebAppBindURL(this.conf,
                       YarnConfiguration.RM_BIND_HOST,
@@ -491,6 +492,7 @@ public class ResourceManager extends CompositeService implements Recoverable {
     private ContainerAllocationExpirer containerAllocationExpirer;
     private ResourceManager rm;
     private RMActiveServiceContext activeServiceContext;
+    private boolean fromActive = false;
 
     RMActiveServices(ResourceManager rm) {
       super("RMActiveServices");
@@ -598,11 +600,17 @@ public class ResourceManager extends CompositeService implements Recoverable {
       addService(resourceTracker);
       rmContext.setResourceTrackerService(resourceTracker);
 
-      DefaultMetricsSystem.initialize("ResourceManager");
-      JvmMetrics jm = JvmMetrics.initSingleton("ResourceManager", null);
-      pauseMonitor = new JvmPauseMonitor();
+      MetricsSystem ms = DefaultMetricsSystem.initialize("ResourceManager");
+      if (fromActive) {
+        JvmMetrics.reattach(ms, jvmMetrics);
+        UserGroupInformation.reattachMetrics();
+      } else {
+        jvmMetrics = JvmMetrics.initSingleton("ResourceManager", null);
+      }
+
+      JvmPauseMonitor pauseMonitor = new JvmPauseMonitor();
       addService(pauseMonitor);
-      jm.setPauseMonitor(pauseMonitor);
+      jvmMetrics.setPauseMonitor(pauseMonitor);
 
       // Initialize the Reservation system
       if (conf.getBoolean(YarnConfiguration.RM_RESERVATION_SYSTEM_ENABLE,
@@ -982,9 +990,13 @@ public class ResourceManager extends CompositeService implements Recoverable {
   /**
    * Helper method to create and init {@link #activeServices}. This creates an
    * instance of {@link RMActiveServices} and initializes it.
+   *
+   * @param fromActive Indicates if the call is from the active state transition
+   *                   or the RM initialization.
    */
-  protected void createAndInitActiveServices() {
+  protected void createAndInitActiveServices(boolean fromActive) {
     activeServices = new RMActiveServices(this);
+    activeServices.fromActive = fromActive;
     activeServices.init(conf);
   }
 
@@ -1015,7 +1027,7 @@ public class ResourceManager extends CompositeService implements Recoverable {
     QueueMetrics.clearQueueMetrics();
     if (initialize) {
       resetDispatcher();
-      createAndInitActiveServices();
+      createAndInitActiveServices(true);
     }
   }
 

+ 76 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHAMetrics.java

@@ -0,0 +1,76 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.yarn.conf.HAUtil;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
+import org.junit.Before;
+import org.junit.Test;
+
+import static junit.framework.TestCase.assertNotNull;
+
+/**
+ * Metrics related RM HA testing. Metrics are mostly static singletons. To
+ * avoid interference with other RM HA tests, separating metric tests for RM HA
+ * into a separate file temporarily.
+ */
+public class TestRMHAMetrics {
+  private Configuration configuration;
+
+  private static final String RM1_ADDRESS = "1.1.1.1:1";
+  private static final String RM1_NODE_ID = "rm1";
+
+  private static final String RM2_ADDRESS = "0.0.0.0:0";
+  private static final String RM2_NODE_ID = "rm2";
+
+  @Before
+  public void setUp() throws Exception {
+    configuration = new Configuration();
+    UserGroupInformation.setConfiguration(configuration);
+    configuration.setBoolean(YarnConfiguration.RM_HA_ENABLED, true);
+    configuration.set(YarnConfiguration.RM_HA_IDS, RM1_NODE_ID + ","
+        + RM2_NODE_ID);
+    for (String confKey : YarnConfiguration
+        .getServiceAddressConfKeys(configuration)) {
+      configuration.set(HAUtil.addSuffix(confKey, RM1_NODE_ID), RM1_ADDRESS);
+      configuration.set(HAUtil.addSuffix(confKey, RM2_NODE_ID), RM2_ADDRESS);
+    }
+
+    ClusterMetrics.destroy();
+    QueueMetrics.clearQueueMetrics();
+    DefaultMetricsSystem.shutdown();
+  }
+
+  @Test(timeout = 300000)
+  public void testMetricsAfterTransitionToStandby() throws Exception {
+    configuration.setBoolean(YarnConfiguration.AUTO_FAILOVER_ENABLED, false);
+    Configuration conf = new YarnConfiguration(configuration);
+    MockRM rm = new MockRM(conf);
+    rm.init(conf);
+    rm.start();
+    rm.transitionToActive();
+    rm.transitionToStandby(true);
+    assertNotNull(DefaultMetricsSystem.instance().getSource("JvmMetrics"));
+    assertNotNull(DefaultMetricsSystem.instance().getSource("UgiMetrics"));
+    rm.stop();
+  }
+}