|
@@ -34,16 +34,15 @@ import java.util.HashMap;
|
|
|
import java.util.List;
|
|
|
import java.util.Map;
|
|
|
import java.util.concurrent.TimeUnit;
|
|
|
-import java.util.concurrent.TimeoutException;
|
|
|
|
|
|
+import org.apache.commons.io.FileUtils;
|
|
|
+import org.apache.commons.io.filefilter.TrueFileFilter;
|
|
|
import org.apache.hadoop.conf.Configuration;
|
|
|
-import org.apache.hadoop.conf.ReconfigurationException;
|
|
|
import org.apache.hadoop.fs.FileSystem;
|
|
|
import org.apache.hadoop.fs.FileUtil;
|
|
|
import org.apache.hadoop.fs.FsTracer;
|
|
|
import org.apache.hadoop.fs.Path;
|
|
|
import org.apache.hadoop.hdfs.BlockReader;
|
|
|
-import org.apache.hadoop.hdfs.client.impl.BlockReaderFactory;
|
|
|
import org.apache.hadoop.hdfs.ClientContext;
|
|
|
import org.apache.hadoop.hdfs.DFSConfigKeys;
|
|
|
import org.apache.hadoop.hdfs.DFSTestUtil;
|
|
@@ -51,6 +50,7 @@ import org.apache.hadoop.hdfs.DFSUtilClient;
|
|
|
import org.apache.hadoop.hdfs.HdfsConfiguration;
|
|
|
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
|
|
import org.apache.hadoop.hdfs.RemotePeerFactory;
|
|
|
+import org.apache.hadoop.hdfs.client.impl.BlockReaderFactory;
|
|
|
import org.apache.hadoop.hdfs.client.impl.DfsClientConf;
|
|
|
import org.apache.hadoop.hdfs.net.Peer;
|
|
|
import org.apache.hadoop.hdfs.protocol.Block;
|
|
@@ -73,19 +73,16 @@ import org.apache.hadoop.io.IOUtils;
|
|
|
import org.apache.hadoop.net.NetUtils;
|
|
|
import org.apache.hadoop.security.token.Token;
|
|
|
import org.apache.hadoop.test.GenericTestUtils;
|
|
|
-
|
|
|
-import org.apache.commons.io.FileUtils;
|
|
|
-import org.apache.commons.io.filefilter.TrueFileFilter;
|
|
|
-
|
|
|
-import com.google.common.base.Supplier;
|
|
|
-
|
|
|
import org.junit.After;
|
|
|
import org.junit.Before;
|
|
|
+import org.junit.Rule;
|
|
|
import org.junit.Test;
|
|
|
-
|
|
|
+import org.junit.rules.Timeout;
|
|
|
import org.slf4j.Logger;
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
|
|
+import com.google.common.base.Supplier;
|
|
|
+
|
|
|
/**
|
|
|
* Fine-grain testing of block files and locations after volume failure.
|
|
|
*/
|
|
@@ -111,6 +108,10 @@ public class TestDataNodeVolumeFailure {
|
|
|
// block id to BlockLocs
|
|
|
final Map<String, BlockLocs> block_map = new HashMap<String, BlockLocs> ();
|
|
|
|
|
|
+ // Specify the timeout for the entire test class
|
|
|
+ @Rule
|
|
|
+ public Timeout timeout = new Timeout(120 * 1000);
|
|
|
+
|
|
|
@Before
|
|
|
public void setUp() throws Exception {
|
|
|
// bring up a cluster of 2
|
|
@@ -225,7 +226,7 @@ public class TestDataNodeVolumeFailure {
|
|
|
*/
|
|
|
@Test(timeout=150000)
|
|
|
public void testFailedVolumeBeingRemovedFromDataNode()
|
|
|
- throws InterruptedException, IOException, TimeoutException {
|
|
|
+ throws Exception {
|
|
|
// The test uses DataNodeTestUtils#injectDataDirFailure() to simulate
|
|
|
// volume failures which is currently not supported on Windows.
|
|
|
assumeNotWindows();
|
|
@@ -237,7 +238,8 @@ public class TestDataNodeVolumeFailure {
|
|
|
File dn0Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
|
|
|
DataNodeTestUtils.injectDataDirFailure(dn0Vol1);
|
|
|
DataNode dn0 = cluster.getDataNodes().get(0);
|
|
|
- checkDiskErrorSync(dn0, DataNodeTestUtils.getVolume(dn0, dn0Vol1));
|
|
|
+ DataNodeTestUtils.waitForDiskError(dn0,
|
|
|
+ DataNodeTestUtils.getVolume(dn0, dn0Vol1));
|
|
|
|
|
|
// Verify dn0Vol1 has been completely removed from DN0.
|
|
|
// 1. dn0Vol1 is removed from DataStorage.
|
|
@@ -284,35 +286,22 @@ public class TestDataNodeVolumeFailure {
|
|
|
assertFalse(dataDirStrs[0].contains(dn0Vol1.getAbsolutePath()));
|
|
|
}
|
|
|
|
|
|
- private static void checkDiskErrorSync(DataNode dn, FsVolumeSpi volume)
|
|
|
- throws InterruptedException {
|
|
|
- final long lastDiskErrorCheck = dn.getLastDiskErrorCheck();
|
|
|
- dn.checkDiskErrorAsync(volume);
|
|
|
- // Wait 10 seconds for checkDiskError thread to finish and discover volume
|
|
|
- // failures.
|
|
|
- int count = 100;
|
|
|
- while (count > 0 && dn.getLastDiskErrorCheck() == lastDiskErrorCheck) {
|
|
|
- Thread.sleep(100);
|
|
|
- count--;
|
|
|
- }
|
|
|
- assertTrue("Disk checking thread does not finish in 10 seconds",
|
|
|
- count > 0);
|
|
|
- }
|
|
|
-
|
|
|
/**
|
|
|
* Test DataNode stops when the number of failed volumes exceeds
|
|
|
* dfs.datanode.failed.volumes.tolerated .
|
|
|
*/
|
|
|
@Test(timeout=10000)
|
|
|
public void testDataNodeShutdownAfterNumFailedVolumeExceedsTolerated()
|
|
|
- throws InterruptedException, IOException {
|
|
|
+ throws Exception {
|
|
|
// make both data directories to fail on dn0
|
|
|
final File dn0Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
|
|
|
final File dn0Vol2 = new File(dataDir, "data" + (2 * 0 + 2));
|
|
|
DataNodeTestUtils.injectDataDirFailure(dn0Vol1, dn0Vol2);
|
|
|
DataNode dn0 = cluster.getDataNodes().get(0);
|
|
|
- checkDiskErrorSync(dn0, DataNodeTestUtils.getVolume(dn0, dn0Vol1));
|
|
|
- checkDiskErrorSync(dn0, DataNodeTestUtils.getVolume(dn0, dn0Vol2));
|
|
|
+ DataNodeTestUtils.waitForDiskError(dn0,
|
|
|
+ DataNodeTestUtils.getVolume(dn0, dn0Vol1));
|
|
|
+ DataNodeTestUtils.waitForDiskError(dn0,
|
|
|
+ DataNodeTestUtils.getVolume(dn0, dn0Vol2));
|
|
|
|
|
|
// DN0 should stop after the number of failure disks exceed tolerated
|
|
|
// value (1).
|
|
@@ -324,7 +313,7 @@ public class TestDataNodeVolumeFailure {
|
|
|
*/
|
|
|
@Test
|
|
|
public void testVolumeFailureRecoveredByHotSwappingVolume()
|
|
|
- throws InterruptedException, ReconfigurationException, IOException {
|
|
|
+ throws Exception {
|
|
|
final File dn0Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
|
|
|
final File dn0Vol2 = new File(dataDir, "data" + (2 * 0 + 2));
|
|
|
final DataNode dn0 = cluster.getDataNodes().get(0);
|
|
@@ -333,7 +322,8 @@ public class TestDataNodeVolumeFailure {
|
|
|
|
|
|
// Fail dn0Vol1 first.
|
|
|
DataNodeTestUtils.injectDataDirFailure(dn0Vol1);
|
|
|
- checkDiskErrorSync(dn0, DataNodeTestUtils.getVolume(dn0, dn0Vol1));
|
|
|
+ DataNodeTestUtils.waitForDiskError(dn0,
|
|
|
+ DataNodeTestUtils.getVolume(dn0, dn0Vol1));
|
|
|
|
|
|
// Hot swap out the failure volume.
|
|
|
String dataDirs = dn0Vol2.getPath();
|
|
@@ -352,7 +342,8 @@ public class TestDataNodeVolumeFailure {
|
|
|
// Fail dn0Vol2. Now since dn0Vol1 has been fixed, DN0 has sufficient
|
|
|
// resources, thus it should keep running.
|
|
|
DataNodeTestUtils.injectDataDirFailure(dn0Vol2);
|
|
|
- checkDiskErrorSync(dn0, DataNodeTestUtils.getVolume(dn0, dn0Vol2));
|
|
|
+ DataNodeTestUtils.waitForDiskError(dn0,
|
|
|
+ DataNodeTestUtils.getVolume(dn0, dn0Vol2));
|
|
|
assertTrue(dn0.shouldRun());
|
|
|
}
|
|
|
|
|
@@ -362,7 +353,7 @@ public class TestDataNodeVolumeFailure {
|
|
|
*/
|
|
|
@Test
|
|
|
public void testTolerateVolumeFailuresAfterAddingMoreVolumes()
|
|
|
- throws InterruptedException, ReconfigurationException, IOException {
|
|
|
+ throws Exception {
|
|
|
final File dn0Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
|
|
|
final File dn0Vol2 = new File(dataDir, "data" + (2 * 0 + 2));
|
|
|
final File dn0VolNew = new File(dataDir, "data_new");
|
|
@@ -379,12 +370,14 @@ public class TestDataNodeVolumeFailure {
|
|
|
|
|
|
// Fail dn0Vol1 first and hot swap it.
|
|
|
DataNodeTestUtils.injectDataDirFailure(dn0Vol1);
|
|
|
- checkDiskErrorSync(dn0, DataNodeTestUtils.getVolume(dn0, dn0Vol1));
|
|
|
+ DataNodeTestUtils.waitForDiskError(dn0,
|
|
|
+ DataNodeTestUtils.getVolume(dn0, dn0Vol1));
|
|
|
assertTrue(dn0.shouldRun());
|
|
|
|
|
|
// Fail dn0Vol2, now dn0 should stop, because we only tolerate 1 disk failure.
|
|
|
DataNodeTestUtils.injectDataDirFailure(dn0Vol2);
|
|
|
- checkDiskErrorSync(dn0, DataNodeTestUtils.getVolume(dn0, dn0Vol2));
|
|
|
+ DataNodeTestUtils.waitForDiskError(dn0,
|
|
|
+ DataNodeTestUtils.getVolume(dn0, dn0Vol2));
|
|
|
assertFalse(dn0.shouldRun());
|
|
|
}
|
|
|
|