|
@@ -0,0 +1,391 @@
|
|
|
+/**
|
|
|
+ * Licensed to the Apache Software Foundation (ASF) under one
|
|
|
+ * or more contributor license agreements. See the NOTICE file
|
|
|
+ * distributed with this work for additional information
|
|
|
+ * regarding copyright ownership. The ASF licenses this file
|
|
|
+ * to you under the Apache License, Version 2.0 (the
|
|
|
+ * "License"); you may not use this file except in compliance
|
|
|
+ * with the License. You may obtain a copy of the License at
|
|
|
+ *
|
|
|
+ * http://www.apache.org/licenses/LICENSE-2.0
|
|
|
+ *
|
|
|
+ * Unless required by applicable law or agreed to in writing, software
|
|
|
+ * distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
+ * See the License for the specific language governing permissions and
|
|
|
+ * limitations under the License.
|
|
|
+ */
|
|
|
+package org.apache.hadoop.hdfs.server.blockmanagement;
|
|
|
+
|
|
|
+import static org.junit.Assert.*;
|
|
|
+
|
|
|
+import java.io.IOException;
|
|
|
+import java.util.List;
|
|
|
+import java.util.Map.Entry;
|
|
|
+
|
|
|
+import org.apache.hadoop.conf.Configuration;
|
|
|
+import org.apache.hadoop.hdfs.DFSConfigKeys;
|
|
|
+import org.apache.hadoop.hdfs.HdfsConfiguration;
|
|
|
+import org.apache.hadoop.hdfs.protocol.Block;
|
|
|
+import org.apache.hadoop.hdfs.protocol.DatanodeID;
|
|
|
+import org.apache.hadoop.hdfs.protocol.FSConstants;
|
|
|
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
|
|
|
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockManager;
|
|
|
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor;
|
|
|
+import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor.BlockTargetPair;
|
|
|
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
|
|
|
+import org.apache.hadoop.hdfs.server.namenode.INodeFile;
|
|
|
+import org.apache.hadoop.net.NetworkTopology;
|
|
|
+import org.junit.Before;
|
|
|
+import org.junit.Test;
|
|
|
+import org.mockito.Mockito;
|
|
|
+
|
|
|
+import com.google.common.base.Joiner;
|
|
|
+import com.google.common.collect.ImmutableList;
|
|
|
+import com.google.common.collect.LinkedListMultimap;
|
|
|
+import com.google.common.collect.Lists;
|
|
|
+
|
|
|
+public class TestBlockManager {
|
|
|
+ private final List<DatanodeDescriptor> nodes = ImmutableList.of(
|
|
|
+ new DatanodeDescriptor(new DatanodeID("h1:5020"), "/rackA"),
|
|
|
+ new DatanodeDescriptor(new DatanodeID("h2:5020"), "/rackA"),
|
|
|
+ new DatanodeDescriptor(new DatanodeID("h3:5020"), "/rackA"),
|
|
|
+ new DatanodeDescriptor(new DatanodeID("h4:5020"), "/rackB"),
|
|
|
+ new DatanodeDescriptor(new DatanodeID("h5:5020"), "/rackB"),
|
|
|
+ new DatanodeDescriptor(new DatanodeID("h6:5020"), "/rackB")
|
|
|
+ );
|
|
|
+ private final List<DatanodeDescriptor> rackA = nodes.subList(0, 3);
|
|
|
+ private final List<DatanodeDescriptor> rackB = nodes.subList(3, 6);
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Some of these tests exercise code which has some randomness involved -
|
|
|
+ * ie even if there's a bug, they may pass because the random node selection
|
|
|
+ * chooses the correct result.
|
|
|
+ *
|
|
|
+ * Since they're true unit tests and run quickly, we loop them a number
|
|
|
+ * of times trying to trigger the incorrect behavior.
|
|
|
+ */
|
|
|
+ private static final int NUM_TEST_ITERS = 30;
|
|
|
+
|
|
|
+ private static final int BLOCK_SIZE = 64*1024;
|
|
|
+
|
|
|
+ private Configuration conf;
|
|
|
+ private FSNamesystem fsn;
|
|
|
+ private BlockManager bm;
|
|
|
+
|
|
|
+ @Before
|
|
|
+ public void setupMockCluster() throws IOException {
|
|
|
+ conf = new HdfsConfiguration();
|
|
|
+ conf.set(DFSConfigKeys.NET_TOPOLOGY_SCRIPT_FILE_NAME_KEY,
|
|
|
+ "need to set a dummy value here so it assumes a multi-rack cluster");
|
|
|
+ fsn = Mockito.mock(FSNamesystem.class);
|
|
|
+ Mockito.doReturn(true).when(fsn).hasWriteLock();
|
|
|
+ bm = new BlockManager(fsn, conf);
|
|
|
+ }
|
|
|
+
|
|
|
+ private void addNodes(Iterable<DatanodeDescriptor> nodesToAdd) {
|
|
|
+ NetworkTopology cluster = bm.getDatanodeManager().getNetworkTopology();
|
|
|
+ // construct network topology
|
|
|
+ for (DatanodeDescriptor dn : nodesToAdd) {
|
|
|
+ cluster.add(dn);
|
|
|
+ dn.updateHeartbeat(
|
|
|
+ 2*FSConstants.MIN_BLOCKS_FOR_WRITE*BLOCK_SIZE, 0L,
|
|
|
+ 2*FSConstants.MIN_BLOCKS_FOR_WRITE*BLOCK_SIZE, 0L, 0, 0);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private void removeNode(DatanodeDescriptor deadNode) {
|
|
|
+ NetworkTopology cluster = bm.getDatanodeManager().getNetworkTopology();
|
|
|
+ cluster.remove(deadNode);
|
|
|
+ bm.removeBlocksAssociatedTo(deadNode);
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Test that replication of under-replicated blocks is detected
|
|
|
+ * and basically works
|
|
|
+ */
|
|
|
+ @Test
|
|
|
+ public void testBasicReplication() throws Exception {
|
|
|
+ addNodes(nodes);
|
|
|
+ for (int i = 0; i < NUM_TEST_ITERS; i++) {
|
|
|
+ doBasicTest(i);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private void doBasicTest(int testIndex) {
|
|
|
+ List<DatanodeDescriptor> origNodes = nodes(0, 1);
|
|
|
+ BlockInfo blockInfo = addBlockOnNodes((long)testIndex, origNodes);
|
|
|
+
|
|
|
+ DatanodeDescriptor[] pipeline = scheduleSingleReplication(blockInfo);
|
|
|
+ assertEquals(2, pipeline.length);
|
|
|
+ assertTrue("Source of replication should be one of the nodes the block " +
|
|
|
+ "was on. Was: " + pipeline[0],
|
|
|
+ origNodes.contains(pipeline[0]));
|
|
|
+ assertTrue("Destination of replication should be on the other rack. " +
|
|
|
+ "Was: " + pipeline[1],
|
|
|
+ rackB.contains(pipeline[1]));
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Regression test for HDFS-1480
|
|
|
+ * - Cluster has 2 racks, A and B, each with three nodes.
|
|
|
+ * - Block initially written on A1, A2, B1
|
|
|
+ * - Admin decommissions two of these nodes (let's say A1 and A2 but it doesn't matter)
|
|
|
+ * - Re-replication should respect rack policy
|
|
|
+ */
|
|
|
+ @Test
|
|
|
+ public void testTwoOfThreeNodesDecommissioned() throws Exception {
|
|
|
+ addNodes(nodes);
|
|
|
+ for (int i = 0; i < NUM_TEST_ITERS; i++) {
|
|
|
+ doTestTwoOfThreeNodesDecommissioned(i);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private void doTestTwoOfThreeNodesDecommissioned(int testIndex) throws Exception {
|
|
|
+ // Block originally on A1, A2, B1
|
|
|
+ List<DatanodeDescriptor> origNodes = nodes(0, 1, 3);
|
|
|
+ BlockInfo blockInfo = addBlockOnNodes(testIndex, origNodes);
|
|
|
+
|
|
|
+ // Decommission two of the nodes (A1, A2)
|
|
|
+ List<DatanodeDescriptor> decomNodes = startDecommission(0, 1);
|
|
|
+
|
|
|
+ DatanodeDescriptor[] pipeline = scheduleSingleReplication(blockInfo);
|
|
|
+ assertTrue("Source of replication should be one of the nodes the block " +
|
|
|
+ "was on. Was: " + pipeline[0],
|
|
|
+ origNodes.contains(pipeline[0]));
|
|
|
+ assertEquals("Should have two targets", 3, pipeline.length);
|
|
|
+
|
|
|
+ boolean foundOneOnRackA = false;
|
|
|
+ for (int i = 1; i < pipeline.length; i++) {
|
|
|
+ DatanodeDescriptor target = pipeline[i];
|
|
|
+ if (rackA.contains(target)) {
|
|
|
+ foundOneOnRackA = true;
|
|
|
+ }
|
|
|
+ assertFalse(decomNodes.contains(target));
|
|
|
+ assertFalse(origNodes.contains(target));
|
|
|
+ }
|
|
|
+
|
|
|
+ assertTrue("Should have at least one target on rack A. Pipeline: " +
|
|
|
+ Joiner.on(",").join(pipeline),
|
|
|
+ foundOneOnRackA);
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Test what happens when a block is on three nodes, and all three of those
|
|
|
+ * nodes are decommissioned. It should properly re-replicate to three new
|
|
|
+ * nodes.
|
|
|
+ */
|
|
|
+ @Test
|
|
|
+ public void testAllNodesHoldingReplicasDecommissioned() throws Exception {
|
|
|
+ addNodes(nodes);
|
|
|
+ for (int i = 0; i < NUM_TEST_ITERS; i++) {
|
|
|
+ doTestAllNodesHoldingReplicasDecommissioned(i);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private void doTestAllNodesHoldingReplicasDecommissioned(int testIndex) throws Exception {
|
|
|
+ // Block originally on A1, A2, B1
|
|
|
+ List<DatanodeDescriptor> origNodes = nodes(0, 1, 3);
|
|
|
+ BlockInfo blockInfo = addBlockOnNodes(testIndex, origNodes);
|
|
|
+
|
|
|
+ // Decommission all of the nodes
|
|
|
+ List<DatanodeDescriptor> decomNodes = startDecommission(0, 1, 3);
|
|
|
+
|
|
|
+ DatanodeDescriptor[] pipeline = scheduleSingleReplication(blockInfo);
|
|
|
+ assertTrue("Source of replication should be one of the nodes the block " +
|
|
|
+ "was on. Was: " + pipeline[0],
|
|
|
+ origNodes.contains(pipeline[0]));
|
|
|
+ assertEquals("Should have three targets", 4, pipeline.length);
|
|
|
+
|
|
|
+ boolean foundOneOnRackA = false;
|
|
|
+ boolean foundOneOnRackB = false;
|
|
|
+ for (int i = 1; i < pipeline.length; i++) {
|
|
|
+ DatanodeDescriptor target = pipeline[i];
|
|
|
+ if (rackA.contains(target)) {
|
|
|
+ foundOneOnRackA = true;
|
|
|
+ } else if (rackB.contains(target)) {
|
|
|
+ foundOneOnRackB = true;
|
|
|
+ }
|
|
|
+ assertFalse(decomNodes.contains(target));
|
|
|
+ assertFalse(origNodes.contains(target));
|
|
|
+ }
|
|
|
+
|
|
|
+ assertTrue("Should have at least one target on rack A. Pipeline: " +
|
|
|
+ Joiner.on(",").join(pipeline),
|
|
|
+ foundOneOnRackA);
|
|
|
+ assertTrue("Should have at least one target on rack B. Pipeline: " +
|
|
|
+ Joiner.on(",").join(pipeline),
|
|
|
+ foundOneOnRackB);
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Test what happens when there are two racks, and an entire rack is
|
|
|
+ * decommissioned.
|
|
|
+ *
|
|
|
+ * Since the cluster is multi-rack, it will consider the block
|
|
|
+ * under-replicated rather than create a third replica on the
|
|
|
+ * same rack. Adding a new node on a third rack should cause re-replication
|
|
|
+ * to that node.
|
|
|
+ */
|
|
|
+ @Test
|
|
|
+ public void testOneOfTwoRacksDecommissioned() throws Exception {
|
|
|
+ addNodes(nodes);
|
|
|
+ for (int i = 0; i < NUM_TEST_ITERS; i++) {
|
|
|
+ doTestOneOfTwoRacksDecommissioned(i);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private void doTestOneOfTwoRacksDecommissioned(int testIndex) throws Exception {
|
|
|
+ // Block originally on A1, A2, B1
|
|
|
+ List<DatanodeDescriptor> origNodes = nodes(0, 1, 3);
|
|
|
+ BlockInfo blockInfo = addBlockOnNodes(testIndex, origNodes);
|
|
|
+
|
|
|
+ // Decommission all of the nodes in rack A
|
|
|
+ List<DatanodeDescriptor> decomNodes = startDecommission(0, 1, 2);
|
|
|
+
|
|
|
+ DatanodeDescriptor[] pipeline = scheduleSingleReplication(blockInfo);
|
|
|
+ assertTrue("Source of replication should be one of the nodes the block " +
|
|
|
+ "was on. Was: " + pipeline[0],
|
|
|
+ origNodes.contains(pipeline[0]));
|
|
|
+ assertEquals("Should have 2 targets", 3, pipeline.length);
|
|
|
+
|
|
|
+ boolean foundOneOnRackB = false;
|
|
|
+ for (int i = 1; i < pipeline.length; i++) {
|
|
|
+ DatanodeDescriptor target = pipeline[i];
|
|
|
+ if (rackB.contains(target)) {
|
|
|
+ foundOneOnRackB = true;
|
|
|
+ }
|
|
|
+ assertFalse(decomNodes.contains(target));
|
|
|
+ assertFalse(origNodes.contains(target));
|
|
|
+ }
|
|
|
+
|
|
|
+ assertTrue("Should have at least one target on rack B. Pipeline: " +
|
|
|
+ Joiner.on(",").join(pipeline),
|
|
|
+ foundOneOnRackB);
|
|
|
+
|
|
|
+ // Mark the block as received on the target nodes in the pipeline
|
|
|
+ fulfillPipeline(blockInfo, pipeline);
|
|
|
+
|
|
|
+ // the block is still under-replicated. Add a new node. This should allow
|
|
|
+ // the third off-rack replica.
|
|
|
+ DatanodeDescriptor rackCNode = new DatanodeDescriptor(new DatanodeID("h7:5020"), "/rackC");
|
|
|
+ addNodes(ImmutableList.of(rackCNode));
|
|
|
+ try {
|
|
|
+ DatanodeDescriptor[] pipeline2 = scheduleSingleReplication(blockInfo);
|
|
|
+ assertEquals(2, pipeline2.length);
|
|
|
+ assertEquals(rackCNode, pipeline2[1]);
|
|
|
+ } finally {
|
|
|
+ removeNode(rackCNode);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Unit test version of testSufficientlyReplBlocksUsesNewRack from
|
|
|
+ * {@link TestBlocksWithNotEnoughRacks}.
|
|
|
+ **/
|
|
|
+ @Test
|
|
|
+ public void testSufficientlyReplBlocksUsesNewRack() throws Exception {
|
|
|
+ addNodes(nodes);
|
|
|
+ for (int i = 0; i < NUM_TEST_ITERS; i++) {
|
|
|
+ doTestSufficientlyReplBlocksUsesNewRack(i);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private void doTestSufficientlyReplBlocksUsesNewRack(int testIndex) {
|
|
|
+ // Originally on only nodes in rack A.
|
|
|
+ List<DatanodeDescriptor> origNodes = rackA;
|
|
|
+ BlockInfo blockInfo = addBlockOnNodes((long)testIndex, origNodes);
|
|
|
+ DatanodeDescriptor pipeline[] = scheduleSingleReplication(blockInfo);
|
|
|
+
|
|
|
+ assertEquals(2, pipeline.length); // single new copy
|
|
|
+ assertTrue("Source of replication should be one of the nodes the block " +
|
|
|
+ "was on. Was: " + pipeline[0],
|
|
|
+ origNodes.contains(pipeline[0]));
|
|
|
+ assertTrue("Destination of replication should be on the other rack. " +
|
|
|
+ "Was: " + pipeline[1],
|
|
|
+ rackB.contains(pipeline[1]));
|
|
|
+ }
|
|
|
+
|
|
|
+
|
|
|
+ /**
|
|
|
+ * Tell the block manager that replication is completed for the given
|
|
|
+ * pipeline.
|
|
|
+ */
|
|
|
+ private void fulfillPipeline(BlockInfo blockInfo,
|
|
|
+ DatanodeDescriptor[] pipeline) throws IOException {
|
|
|
+ for (int i = 1; i < pipeline.length; i++) {
|
|
|
+ bm.addBlock(pipeline[i], blockInfo, null);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ private BlockInfo blockOnNodes(long blkId, List<DatanodeDescriptor> nodes) {
|
|
|
+ Block block = new Block(blkId);
|
|
|
+ BlockInfo blockInfo = new BlockInfo(block, 3);
|
|
|
+
|
|
|
+ for (DatanodeDescriptor dn : nodes) {
|
|
|
+ blockInfo.addNode(dn);
|
|
|
+ }
|
|
|
+ return blockInfo;
|
|
|
+ }
|
|
|
+
|
|
|
+ private List<DatanodeDescriptor> nodes(int ... indexes) {
|
|
|
+ List<DatanodeDescriptor> ret = Lists.newArrayList();
|
|
|
+ for (int idx : indexes) {
|
|
|
+ ret.add(nodes.get(idx));
|
|
|
+ }
|
|
|
+ return ret;
|
|
|
+ }
|
|
|
+
|
|
|
+ private List<DatanodeDescriptor> startDecommission(int ... indexes) {
|
|
|
+ List<DatanodeDescriptor> nodes = nodes(indexes);
|
|
|
+ for (DatanodeDescriptor node : nodes) {
|
|
|
+ node.startDecommission();
|
|
|
+ }
|
|
|
+ return nodes;
|
|
|
+ }
|
|
|
+
|
|
|
+ private BlockInfo addBlockOnNodes(long blockId, List<DatanodeDescriptor> nodes) {
|
|
|
+ INodeFile iNode = Mockito.mock(INodeFile.class);
|
|
|
+ Mockito.doReturn((short)3).when(iNode).getReplication();
|
|
|
+ BlockInfo blockInfo = blockOnNodes(blockId, nodes);
|
|
|
+
|
|
|
+ bm.blocksMap.addINode(blockInfo, iNode);
|
|
|
+ return blockInfo;
|
|
|
+ }
|
|
|
+
|
|
|
+ private DatanodeDescriptor[] scheduleSingleReplication(Block block) {
|
|
|
+ assertEquals("Block not initially pending replication",
|
|
|
+ 0, bm.pendingReplications.getNumReplicas(block));
|
|
|
+ assertTrue("computeReplicationWork should indicate replication is needed",
|
|
|
+ bm.computeReplicationWorkForBlock(block, 1));
|
|
|
+ assertTrue("replication is pending after work is computed",
|
|
|
+ bm.pendingReplications.getNumReplicas(block) > 0);
|
|
|
+
|
|
|
+ LinkedListMultimap<DatanodeDescriptor, BlockTargetPair> repls =
|
|
|
+ getAllPendingReplications();
|
|
|
+ assertEquals(1, repls.size());
|
|
|
+ Entry<DatanodeDescriptor, BlockTargetPair> repl = repls.entries().iterator().next();
|
|
|
+ DatanodeDescriptor[] targets = repl.getValue().targets;
|
|
|
+
|
|
|
+ DatanodeDescriptor[] pipeline = new DatanodeDescriptor[1 + targets.length];
|
|
|
+ pipeline[0] = repl.getKey();
|
|
|
+ System.arraycopy(targets, 0, pipeline, 1, targets.length);
|
|
|
+
|
|
|
+ return pipeline;
|
|
|
+ }
|
|
|
+
|
|
|
+ private LinkedListMultimap<DatanodeDescriptor, BlockTargetPair> getAllPendingReplications() {
|
|
|
+ LinkedListMultimap<DatanodeDescriptor, BlockTargetPair> repls =
|
|
|
+ LinkedListMultimap.create();
|
|
|
+ for (DatanodeDescriptor dn : nodes) {
|
|
|
+ List<BlockTargetPair> thisRepls = dn.getReplicationCommand(10);
|
|
|
+ if (thisRepls != null) {
|
|
|
+ repls.putAll(dn, thisRepls);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return repls;
|
|
|
+ }
|
|
|
+}
|