HDFS-2186. DN volume failures on startup are not counted. Contributed by Eli Collins

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1156974 13f79535-47bb-0310-9956-ffa450edef68
Eli Collins · 14 years ago
commit 1c2ab728f5
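
Before this patch, FSVolumeSet.numFailedVolumes started at 0 and was only incremented when a volume failed at runtime, so disks that were already bad when the DataNode booted never showed up in the count reported to the NameNode. The fix counts one startup failure for every configured data dir that did not come up as a storage dir, validates that count against dfs.datanode.failed.volumes.tolerated, and seeds FSVolumeSet with it. A minimal self-contained sketch of that accounting (hypothetical class, method, and exception types; only the arithmetic mirrors the patch below):

    import java.util.Arrays;
    import java.util.List;

    public class StartupVolumeAccounting {

      // Hypothetical stand-in for the startup check FSDataset now performs.
      static int countStartupFailures(List<String> configuredDirs,
                                      List<String> attachedDirs,
                                      int failuresTolerated) {
        int volsConfigured = configuredDirs.size();
        // Any configured dir that did not come up as a storage dir
        // failed during startup.
        int volsFailed = volsConfigured - attachedDirs.size();
        if (failuresTolerated < 0 || failuresTolerated >= volsConfigured) {
          throw new IllegalArgumentException(
              "Invalid volume failure config value: " + failuresTolerated);
        }
        if (volsFailed > failuresTolerated) {
          throw new IllegalStateException("Too many failed volumes: "
              + volsFailed + " failed, " + failuresTolerated + " tolerated");
        }
        return volsFailed; // this value seeds FSVolumeSet.numFailedVolumes
      }

      public static void main(String[] args) {
        // Three dirs configured, /data/2 failed to attach, one failure tolerated.
        int failed = countStartupFailures(
            Arrays.asList("/data/1", "/data/2", "/data/3"),
            Arrays.asList("/data/1", "/data/3"),
            1);
        System.out.println("failed volumes at startup: " + failed); // prints 1
      }
    }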

+ 2 - 0
hdfs/CHANGES.txt

@@ -959,6 +959,8 @@ Trunk (unreleased changes)
 
     HDFS-2235. Encode servlet paths. (eli)
 
+    HDFS-2186. DN volume failures on startup are not counted. (eli)
+
   BREAKDOWN OF HDFS-1073 SUBTASKS
 
     HDFS-1521. Persist transaction ID on disk between NN restarts.

+ 12 - 7
hdfs/src/java/org/apache/hadoop/hdfs/server/datanode/FSDataset.java

@@ -775,12 +775,13 @@ public class FSDataset implements FSDatasetInterface {
      */
     private volatile List<FSVolume> volumes = null;
     BlockVolumeChoosingPolicy blockChooser;
-    int numFailedVolumes = 0;
+    int numFailedVolumes;
 
-    FSVolumeSet(FSVolume[] volumes, BlockVolumeChoosingPolicy blockChooser) {
+    FSVolumeSet(FSVolume[] volumes, int failedVols, BlockVolumeChoosingPolicy blockChooser) {
       List<FSVolume> list = Arrays.asList(volumes);
       this.volumes = Collections.unmodifiableList(list);
       this.blockChooser = blockChooser;
+      this.numFailedVolumes = failedVols;
     }
     
     private int numberOfVolumes() {
@@ -1144,15 +1145,19 @@ public class FSDataset implements FSDatasetInterface {
     String[] dataDirs = conf.getTrimmedStrings(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY);
 
     int volsConfigured = (dataDirs == null) ? 0 : dataDirs.length;
-
+    int volsFailed = volsConfigured - storage.getNumStorageDirs();
     this.validVolsRequired = volsConfigured - volFailuresTolerated;
 
-    if (validVolsRequired < 1
-        || validVolsRequired > storage.getNumStorageDirs()) {
+    if (volFailuresTolerated < 0 || volFailuresTolerated >= volsConfigured) {
+      throw new DiskErrorException("Invalid volume failure "
+          + " config value: " + volFailuresTolerated);
+    }
+    if (volsFailed > volFailuresTolerated) {
       throw new DiskErrorException("Too many failed volumes - "
           + "current valid volumes: " + storage.getNumStorageDirs() 
           + ", volumes configured: " + volsConfigured 
-          + ", volume failures tolerated: " + volFailuresTolerated );
+          + ", volumes failed: " + volsFailed
+          + ", volume failures tolerated: " + volFailuresTolerated);
     }
 
     FSVolume[] volArray = new FSVolume[storage.getNumStorageDirs()];
@@ -1170,7 +1175,7 @@ public class FSDataset implements FSDatasetInterface {
             RoundRobinVolumesPolicy.class,
             BlockVolumeChoosingPolicy.class),
         conf);
-    volumes = new FSVolumeSet(volArray, blockChooserImpl);
+    volumes = new FSVolumeSet(volArray, volsFailed, blockChooserImpl);
     volumes.getVolumeMap(volumeMap);
 
     File[] roots = new File[storage.getNumStorageDirs()];
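
Note the constructor change above: FSVolumeSet now takes the initial failure count as an argument instead of assuming zero, so volumes lost before the set was even built are reflected in numFailedVolumes. The old single validVolsRequired range check is also split into two clearer ones: the tolerated-failures setting must be sane (non-negative and smaller than the number of configured volumes), and the observed startup failures must not exceed it.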

+ 30 - 2
hdfs/src/test/hdfs/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeFailureToleration.java

@@ -24,6 +24,7 @@ import static org.junit.Assume.assumeTrue;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.concurrent.TimeoutException;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -37,6 +38,7 @@ import org.apache.hadoop.hdfs.DFSTestUtil;
 import org.apache.hadoop.hdfs.HdfsConfiguration;
 import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager;
+import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.log4j.Level;
 import org.junit.After;
 import org.junit.Before;
@@ -189,7 +191,7 @@ public class TestDataNodeVolumeFailureToleration {
    */
   private void restartDatanodes(int volTolerated, boolean manageDfsDirs)
       throws IOException {
-    //Make sure no datanode is running
+    // Make sure no datanode is running
     cluster.shutdownDataNodes();
     conf.setInt(DFSConfigKeys.DFS_DATANODE_FAILED_VOLUMES_TOLERATED_KEY, volTolerated);
     cluster.startDataNodes(conf, 1, manageDfsDirs, null, null);
@@ -226,7 +228,7 @@ public class TestDataNodeVolumeFailureToleration {
    */
   private void testVolumeConfig(int volumesTolerated, int volumesFailed,
       boolean expectedBPServiceState, boolean manageDfsDirs)
-      throws IOException, InterruptedException {
+      throws IOException, InterruptedException, TimeoutException {
     assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
     final int dnIndex = 0;
     // Fail the current directory since invalid storage directory perms
@@ -261,4 +263,30 @@ public class TestDataNodeVolumeFailureToleration {
     assertEquals("Couldn't chmod local vol", 0,
         FileUtil.chmod(dir.toString(), "000"));
   }
+
+  /**
+   * Test that a volume that fails on DataNode startup is counted as
+   * a failed volume by the NN.
+   */
+  @Test
+  public void testFailedVolumeOnStartupIsCounted() throws Exception {
+    assumeTrue(!System.getProperty("os.name").startsWith("Windows"));
+    final DatanodeManager dm = cluster.getNamesystem()
+        .getBlockManager().getDatanodeManager();
+    long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
+    File dir = new File(MiniDFSCluster.getStorageDir(0, 0), "current");
+
+    try {
+      prepareDirToFail(dir);
+      restartDatanodes(1, false);
+      // The cluster is up...
+      assertEquals(true, cluster.getDataNodes().get(0)
+          .isBPServiceAlive(cluster.getNamesystem().getBlockPoolId()));
+      // but there has been a single volume failure
+      DFSTestUtil.waitForDatanodeStatus(dm, 1, 0, 1,
+          origCapacity / 2, WAIT_FOR_HEARTBEATS);
+    } finally {
+      FileUtil.chmod(dir.toString(), "755");
+    }
+  }
 }
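
The new test exercises exactly the startup path: it makes one of the DataNode's volumes unusable before restart (the finally block restores permissions with chmod 755, so prepareDirToFail evidently revokes them), restarts the DataNode with one tolerated failure, and then checks both halves of the fix — the block-pool service stays alive because the failure is within tolerance, and waitForDatanodeStatus sees the NN report one live node with a single failed volume and roughly half the original capacity.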