Browse Source

YARN-1705. Reset cluster-metrics on transition to standby. (Rohith via kasha)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1579014 13f79535-47bb-0310-9956-ffa450edef68
Karthik Kambatla 11 years ago
parent
commit
bfafc1d0ce

+ 2 - 0
hadoop-yarn-project/CHANGES.txt

@@ -173,6 +173,8 @@ Release 2.4.0 - UNRELEASED
     YARN-1846. TestRM#testNMTokenSentForNormalContainer assumes CapacityScheduler.
     (Robert Kanter via kasha)
 
+    YARN-1705. Reset cluster-metrics on transition to standby. (Rohith via kasha)
+
   IMPROVEMENTS
 
     YARN-1007. Enhance History Reader interface for Containers. (Mayank Bansal via

+ 3 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/ResourceManager.java

@@ -81,6 +81,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeEventType;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ContainerPreemptEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ContainerPreemptEventType;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEvent;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.SchedulerEventType;
@@ -840,6 +841,8 @@ public class ResourceManager extends CompositeService implements Recoverable {
       rmContext.getRMNodes().clear();
       rmContext.getInactiveRMNodes().clear();
       rmContext.getRMApps().clear();
+      ClusterMetrics.destroy();
+      QueueMetrics.clearQueueMetrics();
     }
   }
 

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/QueueMetrics.java

@@ -127,7 +127,7 @@ public class QueueMetrics implements MetricsSource {
   }
 
   /**
-   * Helper method to clear cache - used only for unit tests.
+   * Helper method to clear cache.
    */
   @Private
   public synchronized static void clearQueueMetrics() {

+ 34 - 6
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMHA.java

@@ -32,6 +32,7 @@ import org.apache.hadoop.yarn.conf.HAUtil;
 import org.apache.hadoop.yarn.event.Dispatcher;
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
 import org.junit.Before;
 import org.junit.Test;
 
@@ -138,32 +139,38 @@ public class TestRMHA {
     rm.start();
     checkMonitorHealth();
     checkStandbyRMFunctionality();
-
+    verifyClusterMetrics(0, 0, 0, 0, 0, 0);
+    
     // 1. Transition to Standby - must be a no-op
     rm.adminService.transitionToStandby(requestInfo);
     checkMonitorHealth();
     checkStandbyRMFunctionality();
-
+    verifyClusterMetrics(0, 0, 0, 0, 0, 0);
+    
     // 2. Transition to active
     rm.adminService.transitionToActive(requestInfo);
     checkMonitorHealth();
     checkActiveRMFunctionality();
-
+    verifyClusterMetrics(1, 1, 1, 1, 2048, 1);
+    
     // 3. Transition to active - no-op
     rm.adminService.transitionToActive(requestInfo);
     checkMonitorHealth();
     checkActiveRMFunctionality();
-
+    verifyClusterMetrics(1, 2, 2, 2, 2048, 2);
+    
     // 4. Transition to standby
     rm.adminService.transitionToStandby(requestInfo);
     checkMonitorHealth();
     checkStandbyRMFunctionality();
-
+    verifyClusterMetrics(0, 0, 0, 0, 0, 0);
+   
     // 5. Transition to active to check Active->Standby->Active works
     rm.adminService.transitionToActive(requestInfo);
     checkMonitorHealth();
     checkActiveRMFunctionality();
-
+    verifyClusterMetrics(1, 1, 1, 1, 2048, 1);
+    
     // 6. Stop the RM. All services should stop and RM should not be ready to
     // become active
     rm.stop();
@@ -367,6 +374,27 @@ public class TestRMHA {
       fail("Should not throw any exceptions.");
     }
   }
+  
+  private void verifyClusterMetrics(int activeNodes, int appsSubmitted,
+      int appsPending, int containersPending, int availableMB,
+      int activeApplications) {
+    QueueMetrics metrics = rm.getResourceScheduler().getRootQueueMetrics();
+    // verify queue metrics
+    assertMetric("appsSubmitted", appsSubmitted, metrics.getAppsSubmitted());
+    assertMetric("appsPending", appsPending, metrics.getAppsPending());
+    assertMetric("containersPending", containersPending,
+        metrics.getPendingContainers());
+    assertMetric("availableMB", availableMB, metrics.getAvailableMB());
+    assertMetric("activeApplications", activeApplications,
+        metrics.getActiveApps());
+    // verify node metric
+    ClusterMetrics clusterMetrics = ClusterMetrics.getMetrics();
+    assertMetric("activeNodes", activeNodes, clusterMetrics.getNumActiveNMs());
+  }
+
+  private void assertMetric(String metricName, int expected, int actual) {
+    assertEquals("Incorrect value for metric " + metricName, expected, actual);
+  }
 
   @SuppressWarnings("rawtypes")
   class MyCountingDispatcher extends AbstractService implements Dispatcher {