Browse Source

HDFS-15590. namenode fails to start when ordered snapshot deletion feature is disabled (#2326)

bshashikant 4 years ago
parent
commit
368f2f637e

+ 18 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/DirectorySnapshottableFeature.java

@@ -241,6 +241,24 @@ public class DirectorySnapshottableFeature extends DirectoryWithSnapshotFeature
       throws SnapshotException {
     final int i = searchSnapshot(DFSUtil.string2Bytes(snapshotName));
     if (i < 0) {
+      // Consider a sequence like this with snapshots S1 and S2:
+      // 1. The ordered snapshot deletion feature is turned on.
+      // 2. Delete S2, creating an edit log entry for S2's deletion.
+      // 3. Delete S1.
+      // 4. S2 gets deleted by the snapshot GC thread, creating an edit
+      //    log record for S2's deletion again.
+      // 5. Disable the ordered snapshot deletion feature.
+      // 6. Restart the NameNode.
+      // In this case, the actual deletion of S2 happens when the first
+      // edit log record for S2's deletion is replayed; replaying the
+      // second record then fails because the snapshot no longer exists,
+      // thereby failing the NameNode start.
+      // The idea here is that if, during edit log replay, a snapshot is
+      // not found and ordered snapshot deletion is off, ignore the error.
+      if (!snapshotManager.isSnapshotDeletionOrdered() &&
+          !snapshotManager.isImageLoaded()) {
+        return null;
+      }
       throw new SnapshotException("Cannot delete snapshot " + snapshotName
           + " from path " + snapshotRoot.getFullPathName()
           + ": the snapshot does not exist.");

+ 6 - 3
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/snapshot/SnapshotManager.java

@@ -479,10 +479,10 @@ public class SnapshotManager implements SnapshotStatsMXBean {
   void checkSnapshotLimit(int limit, int snapshotCount, String type)
       throws SnapshotException {
     if (snapshotCount >= limit) {
-      String msg = "there are already " + (snapshotCount + 1)
+      String msg = "there are already " + snapshotCount
           + " snapshot(s) and the "  + type + " snapshot limit is "
           + limit;
-      if (fsdir.isImageLoaded()) {
+      if (isImageLoaded()) {
         // We have reached the maximum snapshot limit
         throw new SnapshotException(
             "Failed to create snapshot: " + msg);
@@ -492,7 +492,10 @@ public class SnapshotManager implements SnapshotStatsMXBean {
       }
     }
   }
-  
+
+  boolean isImageLoaded() {
+    return fsdir.isImageLoaded();
+  }
   /**
    * Delete a snapshot for a snapshottable directory
    * @param snapshotName Name of the snapshot to be deleted

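Two things change here. First, the off-by-one in the message: since the guard is snapshotCount >= limit, the count already includes every existing snapshot, so the old snapshotCount + 1 overstated it by one (with limit = 3 and three snapshots it reported "already 4"); the corrected text matches the assertion updated in TestSnapshotCommands below. Second, the new package-private isImageLoaded() delegate lets DirectorySnapshottableFeature consult image-load state through SnapshotManager instead of reaching into FSDirectory. A minimal self-contained sketch of the corrected check, assuming (the hunk elides this) that the else branch only logs a warning, since a replayed snapshot was legal when originally recorded:

  // Sketch under stated assumptions, not the Hadoop implementation.
  static void checkSnapshotLimit(int limit, int snapshotCount, String type,
      boolean imageLoaded) throws Exception {
    if (snapshotCount >= limit) {
      String msg = "there are already " + snapshotCount
          + " snapshot(s) and the " + type + " snapshot limit is " + limit;
      if (imageLoaded) {
        // Live request: reject the new snapshot outright.
        throw new Exception("Failed to create snapshot: " + msg);
      }
      // Edit log replay: warn only, so startup is never blocked here.
      System.err.println("Snapshot limit reached during replay: " + msg);
    }
  }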
+ 2 - 2
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestSnapshotCommands.java

@@ -128,8 +128,8 @@ public class TestSnapshotCommands {
     DFSTestUtil.FsShellRun("-createSnapshot /sub3 sn2", 0,
         "Created snapshot /sub3/.snapshot/sn2", conf);
     DFSTestUtil.FsShellRun("-createSnapshot /sub3 sn3", 1,
-        "Failed to add snapshot: there are already 3 snapshot(s) and "
-            + "the max snapshot limit is 3", conf);
+        "Failed to create snapshot: there are already 3 snapshot(s) and "
+            + "the per directory snapshot limit is 3", conf);
   }
 
   @Test

+ 20 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/snapshot/TestOrderedSnapshotDeletion.java

@@ -195,6 +195,26 @@ public class TestOrderedSnapshotDeletion {
     assertXAttrSet("s1", hdfs, null);
   }
 
+  @Test(timeout = 60000)
+  public void testOrderedDeletionWithRestart() throws Exception {
+    DistributedFileSystem hdfs = cluster.getFileSystem();
+    hdfs.mkdirs(snapshottableDir);
+    hdfs.allowSnapshot(snapshottableDir);
+
+    final Path sub0 = new Path(snapshottableDir, "sub0");
+    hdfs.mkdirs(sub0);
+    hdfs.createSnapshot(snapshottableDir, "s0");
+
+    final Path sub1 = new Path(snapshottableDir, "sub1");
+    hdfs.mkdirs(sub1);
+    hdfs.createSnapshot(snapshottableDir, "s1");
+    assertXAttrSet("s1", hdfs, null);
+    assertXAttrSet("s1", hdfs, null);
+    cluster.getNameNode().getConf().
+        setBoolean(DFS_NAMENODE_SNAPSHOT_DELETION_ORDERED, false);
+    cluster.restartNameNodes();
+  }
+
   @Test(timeout = 60000)
   public void testSnapshotXattrWithDisablingXattr() throws Exception {
     DistributedFileSystem hdfs = cluster.getFileSystem();
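
The double assertXAttrSet("s1", hdfs, null) call in the new test is deliberate: with ordered deletion on and s0 still present, each call issues a deleteSnapshot for s1 that only marks the snapshot deleted (the helper's name suggests it verifies the marker xattr), yet each still writes its own deletion record to the edit log. Disabling the feature and restarting then forces replay of both records, which is exactly the startup failure the DirectorySnapshottableFeature guard above tolerates. An illustrative replay timeline (not literal log output; OP_DELETE_SNAPSHOT is the edit log opcode for snapshot deletion, and the path is a placeholder):

  OP_DELETE_SNAPSHOT /dir s1   <- first record: replay actually removes s1
  OP_DELETE_SNAPSHOT /dir s1   <- second record: s1 no longer exists; before
                                  this patch, replay threw SnapshotException
                                  and the NameNode failed to start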