Pārlūkot izejas kodu

HDFS-17055 Export HAState as a metric from Namenode for monitoring (#5764)

Xing Lin 1 gadu atpakaļ
vecāks
revīzija
03902f5ef0

+ 2 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/BackupNode.java

@@ -48,6 +48,7 @@ import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
 import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
 import org.apache.hadoop.ipc.StandbyException;
 import org.apache.hadoop.ipc.RPC;
+import org.apache.hadoop.metrics2.annotation.Metrics;
 import org.apache.hadoop.net.NetUtils;
 import org.apache.hadoop.security.UserGroupInformation;
 
@@ -68,6 +69,7 @@ import org.apache.hadoop.thirdparty.protobuf.BlockingService;
  * </ol>
  */
 @InterfaceAudience.Private
+@Metrics(context="dfs")
 public class BackupNode extends NameNode {
   private static final String BN_ADDRESS_NAME_KEY = DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_KEY;
   private static final String BN_ADDRESS_DEFAULT = DFSConfigKeys.DFS_NAMENODE_BACKUP_ADDRESS_DEFAULT;

+ 25 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNode.java

@@ -78,6 +78,8 @@ import org.apache.hadoop.ipc.RefreshCallQueueProtocol;
 import org.apache.hadoop.ipc.RetriableException;
 import org.apache.hadoop.ipc.Server;
 import org.apache.hadoop.ipc.StandbyException;
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.annotation.Metrics;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.metrics2.util.MBeans;
 import org.apache.hadoop.net.NetUtils;
@@ -252,6 +254,7 @@ import static org.apache.hadoop.fs.CommonConfigurationKeys.IPC_BACKOFF_ENABLE_DE
  * NameNode state, for example partial blocksMap etc.
  **********************************************************/
 @InterfaceAudience.Private
+@Metrics(context="dfs")
 public class NameNode extends ReconfigurableBase implements
     NameNodeStatusMXBean, TokenVerifier<DelegationTokenIdentifier> {
   static{
@@ -1146,6 +1149,7 @@ public class NameNode extends ReconfigurableBase implements
         DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE,
         DFS_HA_NN_NOT_BECOME_ACTIVE_IN_SAFEMODE_DEFAULT);
     this.started.set(true);
+    DefaultMetricsSystem.instance().register(this);
   }
 
   private void stopAtException(Exception e){
@@ -1216,6 +1220,7 @@ public class NameNode extends ReconfigurableBase implements
         levelDBAliasMapServer.close();
       }
     }
+    started.set(false);
     tracer.close();
   }
 
@@ -2051,6 +2056,26 @@ public class NameNode extends ReconfigurableBase implements
     return state.getServiceState();
   }
 
+  /**
+   * Emit the NameNode HA service state as an integer metric so that
+   * external systems can monitor NN HA transitions.
+   *
+   * <p>The value mirrors the ordinal of {@code HAServiceState}:
+   * 0 = INITIALIZING (not fully started, or already stopped),
+   * 1 = ACTIVE (also reported by a standalone, non-HA NN),
+   * 2 = STANDBY,
+   * 3 = OBSERVER.
+   *
+   * @return the ordinal of the current {@code HAServiceState}, or the
+   *         INITIALIZING ordinal (0) when the NameNode has not fully
+   *         started or its state handler is unavailable.
+   */
+  @Metric({"NameNodeState", "Namenode HA service state"})
+  public int getNameNodeState() {
+    // started is flipped on in initialize() and off in stop(); state may be
+    // null before the HA machinery is wired up, so guard both.
+    if (!isStarted() || state == null) {
+      return HAServiceState.INITIALIZING.ordinal();
+    }
+
+    return state.getServiceState().ordinal();
+  }
+
   /**
    * Register NameNodeStatusMXBean
    */

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSFinalize.java

@@ -153,7 +153,7 @@ public class TestDFSFinalize {
       UpgradeUtilities.createEmptyDirs(dataNodeDirs);
 
       log("Finalize NN & BP with existing previous dir", numDirs);
-      String bpid = UpgradeUtilities.getCurrentBlockPoolID(cluster);
+      String bpid = UpgradeUtilities.getCurrentBlockPoolID(null);
       UpgradeUtilities.createNameNodeStorageDirs(nameNodeDirs, "current");
       UpgradeUtilities.createNameNodeStorageDirs(nameNodeDirs, "previous");
       UpgradeUtilities.createDataNodeStorageDirs(dataNodeDirs, "current");

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSRollback.java

@@ -328,7 +328,7 @@ public class TestDFSRollback {
           UpgradeUtilities.getCurrentFsscTime(null), NodeType.NAME_NODE);
       
       UpgradeUtilities.createNameNodeVersionFile(conf, baseDirs,
-          storageInfo, UpgradeUtilities.getCurrentBlockPoolID(cluster));
+          storageInfo, UpgradeUtilities.getCurrentBlockPoolID(null));
       startNameNodeShouldFail("Cannot rollback to storage version 1 using this version");
       UpgradeUtilities.createEmptyDirs(nameNodeDirs);
     } // end numDir loop

+ 2 - 2
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestDFSUpgrade.java

@@ -349,7 +349,7 @@ public class TestDFSUpgrade {
           UpgradeUtilities.getCurrentFsscTime(null), NodeType.NAME_NODE);
       
       UpgradeUtilities.createNameNodeVersionFile(conf, baseDirs, storageInfo,
-          UpgradeUtilities.getCurrentBlockPoolID(cluster));
+          UpgradeUtilities.getCurrentBlockPoolID(null));
       
       startNameNodeShouldFail(StartupOption.UPGRADE);
       UpgradeUtilities.createEmptyDirs(nameNodeDirs);
@@ -362,7 +362,7 @@ public class TestDFSUpgrade {
           UpgradeUtilities.getCurrentFsscTime(null), NodeType.NAME_NODE);
       
       UpgradeUtilities.createNameNodeVersionFile(conf, baseDirs, storageInfo,
-          UpgradeUtilities.getCurrentBlockPoolID(cluster));
+          UpgradeUtilities.getCurrentBlockPoolID(null));
       
       startNameNodeShouldFail(StartupOption.UPGRADE);
       UpgradeUtilities.createEmptyDirs(nameNodeDirs);

+ 2 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeMetricsLogger.java

@@ -19,6 +19,7 @@
 package org.apache.hadoop.hdfs.server.namenode;
 
 import java.util.function.Supplier;
+import org.apache.hadoop.metrics2.annotation.Metrics;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -116,6 +117,7 @@ public class TestNameNodeMetricsLogger {
   /**
    * A NameNode that stubs out the NameSystem for testing.
    */
+  @Metrics(context="dfs")
   private static class TestNameNode extends NameNode {
     @Override
     protected void loadNamesystem(Configuration conf) throws IOException {

+ 55 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestHAMetrics.java

@@ -17,6 +17,8 @@
  */
 package org.apache.hadoop.hdfs.server.namenode.ha;
 
+import java.io.IOException;
+import org.apache.hadoop.ha.HAServiceProtocol;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -29,6 +31,7 @@ import org.apache.hadoop.hdfs.DistributedFileSystem;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.hdfs.MiniDFSNNTopology;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.io.IOUtils;
 import org.junit.Test;
 
@@ -176,4 +179,56 @@ public class TestHAMetrics {
     }
 
   }
+
+  /**
+   * Verify the NameNodeState metric ({@code NameNode#getNameNodeState()})
+   * tracks HA state transitions: all NNs report STANDBY after startup, then
+   * ACTIVE/STANDBY/OBSERVER as transitions are applied, and INITIALIZING
+   * once a NameNode has been shut down.
+   *
+   * @throws IOException if cluster setup or a state transition fails
+   */
+  @Test
+  public void testGetNameNodeState() throws IOException {
+    Configuration conf = new Configuration();
+    conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+    conf.setInt(DFSConfigKeys.DFS_HA_LOGROLL_PERIOD_KEY, Integer.MAX_VALUE);
+
+    MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).nnTopology(
+        MiniDFSNNTopology.simpleHATopology(3)).numDataNodes(1).build();
+
+    try {
+      cluster.waitActive();
+
+      NameNode nn0 = cluster.getNameNode(0);
+      NameNode nn1 = cluster.getNameNode(1);
+      NameNode nn2 = cluster.getNameNode(2);
+
+      // All namenodes are in standby by default
+      assertEquals(HAServiceProtocol.HAServiceState.STANDBY.ordinal(),
+          nn0.getNameNodeState());
+      assertEquals(HAServiceProtocol.HAServiceState.STANDBY.ordinal(),
+          nn1.getNameNodeState());
+      assertEquals(HAServiceProtocol.HAServiceState.STANDBY.ordinal(),
+          nn2.getNameNodeState());
+
+      // Transition nn0 to be active
+      cluster.transitionToActive(0);
+      assertEquals(HAServiceProtocol.HAServiceState.ACTIVE.ordinal(),
+          nn0.getNameNodeState());
+
+      // Fail over: nn0 back to standby, nn1 to active
+      cluster.transitionToStandby(0);
+      cluster.transitionToActive(1);
+      assertEquals(HAServiceProtocol.HAServiceState.STANDBY.ordinal(),
+          nn0.getNameNodeState());
+      assertEquals(HAServiceProtocol.HAServiceState.ACTIVE.ordinal(),
+          nn1.getNameNodeState());
+
+      // Transition nn2 to observer
+      cluster.transitionToObserver(2);
+      assertEquals(HAServiceProtocol.HAServiceState.OBSERVER.ordinal(),
+          nn2.getNameNodeState());
+
+      // Shutdown nn2. Now getNameNodeState should return the INITIALIZING
+      // state.
+      cluster.shutdownNameNode(2);
+      assertEquals(HAServiceProtocol.HAServiceState.INITIALIZING.ordinal(),
+          nn2.getNameNodeState());
+    } finally {
+      // Always release the mini cluster so a failed assertion cannot leak
+      // it into subsequent tests (matches the cleanup style of the other
+      // tests in this class).
+      cluster.shutdown();
+    }
+  }
 }