@@ -24,6 +24,7 @@ import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 import java.io.File;
 import java.io.IOException;
@@ -31,12 +32,16 @@ import java.net.InetSocketAddress;
 import java.net.Socket;
 import java.util.Collection;
 import java.util.HashMap;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
 
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.filefilter.TrueFileFilter;
+import org.apache.commons.lang3.ArrayUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.FileUtil;
@@ -64,6 +69,7 @@ import org.apache.hadoop.hdfs.server.blockmanagement.BlockManagerTestUtil;
 import org.apache.hadoop.hdfs.server.common.Storage;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
+import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.AddBlockPoolException;
 import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetTestUtil;
 import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocols;
@@ -219,6 +225,51 @@ public class TestDataNodeVolumeFailure {
         " is created and replicated");
   }
 
+  /*
+   * If one of the sub-folders under the finalized directory is unreadable,
+   * either due to permissions or a filesystem corruption, the DN will fail
+   * to read it when scanning it for blocks to load into the replica map. This
+   * test ensures the DN does not exit and reports the failed volume to the
+   * NN (HDFS-14333). This is done by using a simulated FsDataset that throws
+   * an exception for a failed volume when the block pool is initialized.
+   */
+  @Test(timeout=15000)
+  public void testDnStartsAfterDiskErrorScanningBlockPool() throws Exception {
+    // Don't use the cluster configured in the setup() method for this test.
+    cluster.shutdown(true);
+    cluster.close();
+
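+    // use the simulated dataset below, which fails one volume on startup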
+    conf.set(DFSConfigKeys.DFS_DATANODE_FSDATASET_FACTORY_KEY,
+        BadDiskFSDataset.Factory.class.getName());
+
+    final MiniDFSCluster localCluster = new MiniDFSCluster
+        .Builder(conf).numDataNodes(1).build();
+
+    try {
+      localCluster.waitActive();
+      DataNode dn = localCluster.getDataNodes().get(0);
+
+      try {
+        localCluster.waitDatanodeFullyStarted(dn, 3000);
+      } catch (TimeoutException e) {
+        fail("Datanode did not get fully started");
+      }
+      assertTrue(dn.isDatanodeUp());
+
+      // trigger DN to send heartbeat
+      DataNodeTestUtils.triggerHeartbeat(dn);
+      final BlockManager bm = localCluster.getNamesystem().getBlockManager();
+      // trigger NN handle heartbeat
+      BlockManagerTestUtil.checkHeartbeat(bm);
+
+      // NN now should have the failed volume
+      assertEquals(1, localCluster.getNamesystem().getVolumeFailuresTotal());
+    } finally {
+      localCluster.close();
+    }
+  }
+
   /**
    * Test that DataStorage and BlockPoolSliceStorage remove the failed volume
    * after failure.
@@ -758,4 +809,68 @@
     }
     return total;
   }
+
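+  // Simulated dataset whose addBlockPool() reports its first volume as
+  // failed, mimicking a disk error discovered while scanning a block pool.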
+  private static class BadDiskFSDataset extends SimulatedFSDataset {
+
+    BadDiskFSDataset(DataStorage storage, Configuration conf) {
+      super(storage, conf);
+    }
+
+    private String[] failedStorageLocations = null;
+
+    @Override
+    public void addBlockPool(String bpid, Configuration conf) {
+      super.addBlockPool(bpid, conf);
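+      // mark the first volume as unhealthy to simulate a disk error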
+      Map<FsVolumeSpi, IOException>
+          unhealthyDataDirs = new HashMap<>();
+      unhealthyDataDirs.put(this.getStorages().get(0).getVolume(),
+          new IOException());
+      throw new AddBlockPoolException(unhealthyDataDirs);
+    }
+
+    @Override
+    public synchronized void removeVolumes(Collection<StorageLocation> volumes,
+        boolean clearFailure) {
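+      // record the failed locations for getVolumeFailureSummary() to report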
+      Iterator<StorageLocation> itr = volumes.iterator();
+      String[] failedLocations = new String[volumes.size()];
+      int index = 0;
+      while(itr.hasNext()) {
+        StorageLocation s = itr.next();
+        failedLocations[index] = s.getUri().getPath();
+        index += 1;
+      }
+      failedStorageLocations = failedLocations;
+    }
+
+    @Override
+    public void handleVolumeFailures(Set<FsVolumeSpi> failedVolumes) {
+      // do nothing
+    }
+
+    @Override
+    public VolumeFailureSummary getVolumeFailureSummary() {
+      if (failedStorageLocations != null) {
+        return new VolumeFailureSummary(failedStorageLocations, 0, 0);
+      } else {
+        return new VolumeFailureSummary(ArrayUtils.EMPTY_STRING_ARRAY, 0, 0);
+      }
+    }
+
+    static class Factory extends FsDatasetSpi.Factory<BadDiskFSDataset> {
+      @Override
+      public BadDiskFSDataset newInstance(DataNode datanode,
+          DataStorage storage, Configuration conf) throws IOException {
+        return new BadDiskFSDataset(storage, conf);
+      }
+
+      @Override
+      public boolean isSimulated() {
+        return true;
+      }
+    }
+  }
 }