Browse Source

HDFS-2853. HA: NN fails to start if the shared edits dir is marked required. Contributed by Aaron T. Myers.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/HDFS-1623@1238134 13f79535-47bb-0310-9956-ffa450edef68
Eli Collins 13 years ago
parent
commit
6be13332db

+ 2 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.HDFS-1623.txt

@@ -141,3 +141,5 @@ HDFS-2841. HAAdmin does not work if security is enabled. (atm)
 HDFS-2691. Fixes for pipeline recovery in an HA cluster: report RBW replicas immediately upon pipeline creation. (todd)
 
 HDFS-2824. Fix failover when prior NN died just after creating an edit log segment. (atm via todd)
+
+HDFS-2853. HA: NN fails to start if the shared edits dir is marked required. (atm via eli)

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java

@@ -865,7 +865,7 @@ public class FSEditLog  {
       editLogStream = journalSet.startLogSegment(segmentTxId);
     } catch (IOException ex) {
       throw new IOException("Unable to start log segment " +
-          segmentTxId + ": no journals successfully started.");
+          segmentTxId + ": too few journals successfully started.", ex);
     }
     
     curSegmentTxId = segmentTxId;

+ 0 - 9
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/NameNodeResourcePolicy.java

@@ -37,9 +37,6 @@ final class NameNodeResourcePolicy {
    *        required to continue operation.
    * @return true if and only if there are sufficient NN resources to
    *         continue logging edits.
-   * @throws RuntimeException if the number of <bold>configured</bold>
-   *         redundant resources is fewer than the minimum number of available
-   *         redundant resources.
    */
   static boolean areResourcesAvailable(
       Collection<? extends CheckableNameNodeResource> resources,
@@ -63,12 +60,6 @@ final class NameNodeResourcePolicy {
       }
     }
     
-    if (redundantResourceCount < minimumRedundantResources) {
-      throw new RuntimeException("Need a minimum of " + minimumRedundantResources
-          + " for NN to operate but only " + redundantResourceCount
-          + " are configured.");
-    }
-    
     if (redundantResourceCount == 0) {
       // If there are no redundant resources, return true if there are any
       // required resources available.

+ 6 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/MiniDFSCluster.java

@@ -664,7 +664,12 @@ public class MiniDFSCluster {
   }
   
   public URI getSharedEditsDir(int minNN, int maxNN) throws IOException {
-    return fileAsURI(new File(base_dir, "shared-edits-" +
+    return formatSharedEditsDir(base_dir, minNN, maxNN);
+  }
+  
+  public static URI formatSharedEditsDir(File baseDir, int minNN, int maxNN)
+      throws IOException {
+    return fileAsURI(new File(baseDir, "shared-edits-" +
         minNN + "-through-" + maxNN));
   }
 

+ 1 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestEditLog.java

@@ -807,7 +807,7 @@ public class TestEditLog extends TestCase {
       fail("Did no throw exception on only having a bad dir");
     } catch (IOException ioe) {
       GenericTestUtils.assertExceptionContains(
-          "no journals successfully started", ioe);
+          "too few journals successfully started", ioe);
     } finally {
       logDir.setWritable(true);
       log.close();

+ 1 - 7
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestNameNodeResourcePolicy.java

@@ -50,13 +50,7 @@ public class TestNameNodeResourcePolicy {
     assertFalse(testResourceScenario(4, 0, 3, 0, 2));
     assertTrue(testResourceScenario(4, 0, 3, 0, 1));
     assertFalse(testResourceScenario(4, 0, 4, 0, 1));
-    try {
-      testResourceScenario(1, 0, 0, 0, 2);
-      fail("Should fail if there are more minimum redundant resources than " +
-          "total redundant resources");
-    } catch (RuntimeException rte) {
-      assertTrue(rte.getMessage().startsWith("Need a minimum"));
-    }
+    assertFalse(testResourceScenario(1, 0, 0, 0, 2));
   }
   
   @Test

+ 93 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestFailureOfSharedDir.java

@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.namenode.ha;
+
+import static org.junit.Assert.*;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URISyntaxException;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.FileUtil;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.MiniDFSCluster;
+import org.apache.hadoop.hdfs.MiniDFSNNTopology;
+import org.apache.hadoop.hdfs.server.namenode.NameNode;
+import org.apache.hadoop.test.GenericTestUtils;
+import org.junit.Test;
+
+public class TestFailureOfSharedDir {
+  // HA test: a required shared edits dir is deleted under an active NN.
+  private static final Log LOG = LogFactory.getLog(TestFailureOfSharedDir.class);
+
+  /**
+   * Marks the shared edits dir "required", deletes it while NN 0 is active, and
+   * checks that rolling the edit log then fails with an IOException.
+   */
+  @Test
+  public void testFailureOfSharedDir() throws Exception {
+    Configuration conf = new Configuration();
+    URI sharedEditsUri = MiniDFSCluster.formatSharedEditsDir(
+        new File(MiniDFSCluster.getBaseDirectory()), 0, 1);
+    // Mark the shared edits dir required before the cluster is built.
+    conf.set(DFSConfigKeys.DFS_NAMENODE_EDITS_DIR_REQUIRED_KEY,
+        sharedEditsUri.toString());
+    
+    MiniDFSCluster cluster = null;
+    try {
+      cluster = new MiniDFSCluster.Builder(conf)
+        .nnTopology(MiniDFSNNTopology.simpleHATopology())
+        .numDataNodes(0)
+        .build();
+      // The cluster's shared edits dir must be the URI marked required above.
+      assertEquals(sharedEditsUri, cluster.getSharedEditsDir(0, 1));
+      
+      cluster.waitActive();
+      cluster.transitionToActive(0);
+      
+      FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
+      // Create a directory while the shared edits dir still exists.
+      assertTrue(fs.mkdirs(new Path("/test1")));
+      
+      // Blow away the shared edits dir.
+      FileUtil.fullyDelete(new File(sharedEditsUri));
+      
+      NameNode nn0 = cluster.getNameNode(0);
+      try {
+        // Must fail now. NOTE(review): segment txid 4 below is hard-coded.
+        nn0.getRpcServer().rollEditLog();
+        fail("Succeeded in rolling edit log despite shared dir being deleted");
+      } catch (IOException ioe) {
+        GenericTestUtils.assertExceptionContains(
+            "Unable to start log segment 4: too few journals successfully started",
+            ioe);
+        LOG.info("Got expected exception", ioe);
+      }
+    } finally {
+      if (cluster != null) {
+        cluster.shutdown();
+      }
+    }
+  }
+}