@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hdfs.server.namenode.metrics;
 
+import java.util.concurrent.TimeUnit;
 import org.apache.hadoop.crypto.key.JavaKeyStoreProvider;
 import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.fs.FileSystemTestHelper;
@@ -41,7 +42,9 @@ import java.io.DataInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.security.NoSuchAlgorithmException;
+import java.util.ArrayList;
 import java.util.EnumSet;
+import java.util.List;
 import java.util.Random;
 
 import com.google.common.collect.ImmutableList;
@@ -69,12 +72,15 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
 import org.apache.hadoop.hdfs.server.datanode.DataNode;
 import org.apache.hadoop.hdfs.server.datanode.DataNodeTestUtils;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsVolumeImpl;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.namenode.MockNameNodeResourceChecker;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
 import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
 import org.apache.hadoop.hdfs.tools.NNHAServiceTarget;
+import org.apache.hadoop.hdfs.util.HostsFileWriter;
 import org.apache.hadoop.metrics2.MetricsRecordBuilder;
 import org.apache.hadoop.metrics2.MetricsSource;
 import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
@@ -115,6 +121,15 @@ public class TestNameNodeMetrics {
     CONF.setInt(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, 1);
     CONF.setLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
         DFS_REDUNDANCY_INTERVAL);
+    // Set the decommission interval long enough that it is effectively
+    // disabled unless triggered manually (for decommissioning DN metrics)
+    CONF.setTimeDuration(
+        MiniDFSCluster.DFS_NAMENODE_DECOMMISSION_INTERVAL_TESTING_KEY, 999,
+        TimeUnit.DAYS);
+    // The next two configs are used for checking failed volume metrics
+    CONF.setTimeDuration(DFSConfigKeys.DFS_DATANODE_DISK_CHECK_MIN_GAP_KEY,
+        10, TimeUnit.MILLISECONDS);
+    CONF.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);
     CONF.setInt(DFSConfigKeys.DFS_NAMENODE_REDUNDANCY_INTERVAL_SECONDS_KEY,
         DFS_REDUNDANCY_INTERVAL);
     CONF.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
@@ -133,6 +148,7 @@ public class TestNameNodeMetrics {
   private DistributedFileSystem fs;
   private final Random rand = new Random();
   private FSNamesystem namesystem;
+  private HostsFileWriter hostsFileWriter;
   private BlockManager bm;
   private Path ecDir;
 
@@ -142,6 +158,8 @@ public class TestNameNodeMetrics {
 
   @Before
   public void setUp() throws Exception {
+    hostsFileWriter = new HostsFileWriter();
+    hostsFileWriter.initialize(CONF, "temp/decommission");
     cluster = new MiniDFSCluster.Builder(CONF).numDataNodes(DATANODE_COUNT)
         .build();
     cluster.waitActive();
@@ -161,6 +179,10 @@ public class TestNameNodeMetrics {
       MetricsRecordBuilder rb = getMetrics(source);
       assertQuantileGauges("GetGroups1s", rb);
     }
+    if (hostsFileWriter != null) {
+      hostsFileWriter.cleanup();
+      hostsFileWriter = null;
+    }
     if (cluster != null) {
       cluster.shutdown();
       cluster = null;
@@ -235,6 +257,108 @@ public class TestNameNodeMetrics {
         .getBlockManager());
     assertGauge("StaleDataNodes", 0, getMetrics(NS_METRICS));
   }
+
+  /**
+   * Test metrics associated with volume failures.
+   */
+  @Test
+  public void testVolumeFailures() throws Exception {
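+    // No volume has failed yet, so both metrics should start at zero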
+    assertGauge("VolumeFailuresTotal", 0, getMetrics(NS_METRICS));
+    assertGauge("EstimatedCapacityLostTotal", 0L, getMetrics(NS_METRICS));
+    DataNode dn = cluster.getDataNodes().get(0);
+    FsDatasetSpi.FsVolumeReferences volumeReferences =
+        DataNodeTestUtils.getFSDataset(dn).getFsVolumeReferences();
+    FsVolumeImpl fsVolume = (FsVolumeImpl) volumeReferences.get(0);
+    File dataDir = new File(fsVolume.getBaseURI());
+    long capacity = fsVolume.getCapacity();
+    volumeReferences.close();
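+    // Fail the volume's data directory, wait for the DataNode's disk
+    // checker to notice, then report the failure via a heartbeat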
+    DataNodeTestUtils.injectDataDirFailure(dataDir);
+    DataNodeTestUtils.waitForDiskError(dn, fsVolume);
+    DataNodeTestUtils.triggerHeartbeat(dn);
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("VolumeFailuresTotal", 1, getMetrics(NS_METRICS));
+    assertGauge("EstimatedCapacityLostTotal", capacity, getMetrics(NS_METRICS));
+  }
+
+  /**
+   * Test metrics associated with liveness and decommission status of
+   * DataNodes.
+   */
+  @Test
+  public void testDataNodeLivenessAndDecom() throws Exception {
+    List<DataNode> dataNodes = cluster.getDataNodes();
+    DatanodeDescriptor[] dnDescriptors = new DatanodeDescriptor[DATANODE_COUNT];
+    String[] dnAddresses = new String[DATANODE_COUNT];
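+    // Capture each DataNode's descriptor and transfer address so the test
+    // can manipulate heartbeats and the include/exclude host files below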
+    for (int i = 0; i < DATANODE_COUNT; i++) {
+      dnDescriptors[i] = bm.getDatanodeManager()
+          .getDatanode(dataNodes.get(i).getDatanodeId());
+      dnAddresses[i] = dnDescriptors[i].getXferAddr();
+    }
+    // First put all DNs into the include list
+    hostsFileWriter.initIncludeHosts(dnAddresses);
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now decommission one DN
+    hostsFileWriter.initExcludeHost(dnAddresses[0]);
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecommissioningDataNodes", 1, getMetrics(NS_METRICS));
+    BlockManagerTestUtil.recheckDecommissionState(bm.getDatanodeManager());
+    assertGauge("NumDecommissioningDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomLiveDataNodes", 1, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now kill all DNs by expiring their heartbeats
+    for (int i = 0; i < DATANODE_COUNT; i++) {
+      DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), true);
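+      // Backdate the last contact time past the NameNode's heartbeat
+      // expiry threshold (2 recheck intervals + 10 heartbeat intervals)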
+      long expireInterval = CONF.getLong(
+          DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_KEY,
+          DFSConfigKeys.DFS_NAMENODE_HEARTBEAT_RECHECK_INTERVAL_DEFAULT) * 2L
+          + CONF.getLong(DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
+              DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT) * 10 * 1000L;
+      DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i],
+          -(expireInterval + 1));
+    }
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomDeadDataNodes", 1, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", DATANODE_COUNT, getMetrics(NS_METRICS));
+
+    // Now remove the decommissioned DN altogether
+    String[] includeHosts = new String[dnAddresses.length - 1];
+    for (int i = 0; i < includeHosts.length; i++) {
+      includeHosts[i] = dnAddresses[i + 1];
+    }
+    hostsFileWriter.initIncludeHosts(includeHosts);
+    hostsFileWriter.initExcludeHosts(new ArrayList<>());
+    bm.getDatanodeManager().refreshNodes(CONF);
+    assertGauge("NumDecomLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDecomDeadDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumLiveDataNodes", 0, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
+
+    // Finally mark the remaining DNs as live again
+    for (int i = 1; i < dataNodes.size(); i++) {
+      DataNodeTestUtils.setHeartbeatsDisabledForTests(dataNodes.get(i), false);
+      DFSTestUtil.resetLastUpdatesWithOffset(dnDescriptors[i], 0);
+    }
+    BlockManagerTestUtil.checkHeartbeat(bm);
+    assertGauge("NumLiveDataNodes", DATANODE_COUNT - 1, getMetrics(NS_METRICS));
+    assertGauge("NumDeadDataNodes", 0, getMetrics(NS_METRICS));
+  }
 
   /** Test metrics associated with addition of a file */
   @Test