
HDFS-11031. Add additional unit test for DataNode startup behavior when volumes fail. Contributed by Mingliang Liu.

(cherry picked from commit 35e6081020d34be966724f44df678f1f5ee6f6d8)
Brahma Reddy Battula 8 years ago
parent commit 77142827e8

+ 110 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailure.java
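
For context, the new tests exercise dfs.datanode.failed.volumes.tolerated
(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY), which sets how many
volume failures a DataNode tolerates before refusing to start or shutting
down; the default is 0. A minimal sketch of toggling it in a test
configuration (illustrative only, not part of this commit):

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSConfigKeys;

    Configuration conf = new Configuration();
    // Tolerate one failed volume; with the default of 0, any volume
    // failure is fatal to the DataNode.
    conf.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, 1);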

@@ -74,17 +74,24 @@ import org.apache.hadoop.net.NetUtils;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.test.GenericTestUtils;
 
-import com.google.common.base.Supplier;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.filefilter.TrueFileFilter;
+
+import com.google.common.base.Supplier;
+
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 /**
  * Fine-grain testing of block files and locations after volume failure.
  */
 public class TestDataNodeVolumeFailure {
+  private final static Logger LOG = LoggerFactory.getLogger(
+      TestDataNodeVolumeFailure.class);
   final private int block_size = 512;
   MiniDFSCluster cluster = null;
   private Configuration conf;
@@ -414,6 +421,108 @@ public class TestDataNodeVolumeFailure {
         underReplicatedBlocks > 0);
   }
 
+  /**
+   * Test that the DataNode fails to start when a volume has failed and no
+   * failed volumes are tolerated.
+   *
+   * We fail a volume by setting its parent directory non-writable.
+   */
+  @Test (timeout = 120000)
+  public void testDataNodeFailToStartWithVolumeFailure() throws Exception {
+    // Method to simulate volume failures is currently not supported on Windows.
+    assumeTrue(!Path.WINDOWS);
+
+    failedDir = new File(dataDir, "failedDir");
+    assertTrue("Failed to fail a volume by setting it non-writable",
+        failedDir.mkdir() && failedDir.setReadOnly());
+
+    startNewDataNodeWithDiskFailure(new File(failedDir, "newDir1"), false);
+  }
+
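// Editor's sketch, not part of this diff: the failure simulation above relies
// on java.io.File#setReadOnly. On POSIX, creating a directory under a
// read-only parent fails, which the DataNode counts as a volume failure; on
// Windows the read-only attribute does not block child creation, hence the
// assumeTrue(!Path.WINDOWS) guard. Standalone illustration (hypothetical path):
File parent = new File("/tmp/failedDir");          // hypothetical path
assertTrue(parent.mkdir() && parent.setReadOnly());
assertFalse(new File(parent, "newDir1").mkdir());  // fails: parent read-only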
+  /**
+   * Test that the DataNode starts and tolerates one failing disk when
+   * configured to do so.
+   *
+   * We fail a volume by setting its parent directory non-writable.
+   */
+  @Test (timeout = 120000)
+  public void testDNStartAndTolerateOneVolumeFailure() throws Exception {
+    // Method to simulate volume failures is currently not supported on Windows.
+    assumeTrue(!Path.WINDOWS);
+
+    failedDir = new File(dataDir, "failedDir");
+    assertTrue("Failed to fail a volume by setting it non-writable",
+        failedDir.mkdir() && failedDir.setReadOnly());
+
+    startNewDataNodeWithDiskFailure(new File(failedDir, "newDir1"), true);
+  }
+
+  /**
+   * Test that the DataNode won't start if a data directory is not
+   * readable/writable and no failed volumes are tolerated.
+   */
+  @Test (timeout = 120000)
+  public void testDNFailToStartWithDataDirNonWritable() throws Exception {
+    // Method to simulate volume failures is currently not supported on Windows.
+    assumeTrue(!Path.WINDOWS);
+
+    final File readOnlyDir = new File(dataDir, "nonWritable");
+    assertTrue("Set the data dir permission non-writable",
+        readOnlyDir.mkdir() && readOnlyDir.setReadOnly());
+
+    startNewDataNodeWithDiskFailure(new File(readOnlyDir, "newDir1"), false);
+  }
+
+  /**
+   * Test that the DataNode starts and tolerates one non-writable data
+   * directory when configured to do so.
+   */
+  @Test (timeout = 120000)
+  public void testDNStartAndTolerateOneDataDirNonWritable() throws Exception {
+    // Method to simulate volume failures is currently not supported on Windows.
+    assumeTrue(!Path.WINDOWS);
+
+    final File readOnlyDir = new File(dataDir, "nonWritable");
+    assertTrue("Set the data dir permission non-writable",
+        readOnlyDir.mkdir() && readOnlyDir.setReadOnly());
+    startNewDataNodeWithDiskFailure(new File(readOnlyDir, "newDir1"), true);
+  }
+
+  /**
+   * Start one new DataNode whose data dirs include {@code badDataDir} and
+   * verify the expected startup behavior.
+   *
+   * @param badDataDir bad data dir, either a failed disk or a non-writable dir
+   * @param tolerated true if one volume failure is tolerated, false otherwise
+   */
+  private void startNewDataNodeWithDiskFailure(File badDataDir,
+      boolean tolerated) throws Exception {
+    final File data5 = new File(dataDir, "data5");
+    final String newDirs = badDataDir.toString() + "," + data5.toString();
+    final Configuration newConf = new Configuration(conf);
+    newConf.set(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY, newDirs);
+    LOG.info("Setting dfs.datanode.data.dir for new DataNode as {}", newDirs);
+    newConf.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY,
+        tolerated ? 1 : 0);
+
+    // bring up one more DataNode
+    assertEquals(repl, cluster.getDataNodes().size());
+    cluster.startDataNodes(newConf, 1, false, null, null);
+    assertEquals(repl + 1, cluster.getDataNodes().size());
+
+    if (tolerated) {
+      // create new file and it should be able to replicate to 3 nodes
+      final Path p = new Path("/test1.txt");
+      DFSTestUtil.createFile(fs, p, block_size * blocks_num, (short) 3, 1L);
+      DFSTestUtil.waitReplication(fs, p, (short) (repl + 1));
+    } else {
+      // DataNode should stop soon if it does not tolerate disk failure
+      GenericTestUtils.waitFor(new Supplier<Boolean>() {
+        @Override
+        public Boolean get() {
+          final String bpid = cluster.getNamesystem().getBlockPoolId();
+          final BPOfferService bpos = cluster.getDataNodes().get(2)
+              .getBPOfferService(bpid);
+          return !bpos.isAlive();
+        }
+      }, 100, 30 * 1000);
+    }
+  }
+
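// Editor's sketch, not part of this diff: GenericTestUtils.waitFor polls the
// supplier about every 100 ms and raises a timeout error after 30 s. On
// Java 8, Guava's Supplier has a single abstract method, so the anonymous
// class above can be written as a lambda:
GenericTestUtils.waitFor(() -> {
  final String bpid = cluster.getNamesystem().getBlockPoolId();
  return !cluster.getDataNodes().get(2).getBPOfferService(bpid).isAlive();
}, 100, 30 * 1000);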
   /**
    * verifies two things:
    *  1. number of locations of each block in the name node