
HDFS-4596. Shutting down namenode during checkpointing can lead to md5sum error. Contributed by Andrew Wang.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-2@1456633 13f79535-47bb-0310-9956-ffa450edef68
Aaron Myers 12 years ago
commit 7a02bc2bac

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -85,6 +85,9 @@ Release 2.0.5-beta - UNRELEASED
     HDFS-3277. fail over to loading a different FSImage if the first one we
     try to load is corrupt. (Colin Patrick McCabe and Andrew Wang via atm)
 
+    HDFS-4596. Shutting down namenode during checkpointing can lead to md5sum
+    error. (Andrew Wang via atm)
+
 Release 2.0.4-alpha - UNRELEASED
 
   INCOMPATIBLE CHANGES

+ 1 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/CheckpointFaultInjector.java

@@ -44,4 +44,5 @@ class CheckpointFaultInjector {
     return false;
   }
   
+  public void afterMD5Rename() throws IOException {}
 }
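
The new afterMD5Rename() hook follows the fault-injection pattern used throughout the checkpointing code: production code calls a no-op method on a swappable singleton, and a test swaps in a mock that throws at exactly that point. A minimal sketch of the pattern, with simplified singleton plumbing that is illustrative rather than the verbatim Hadoop class:

import java.io.IOException;

// Simplified fault injector; illustrative only, not the exact Hadoop source.
class CheckpointFaultInjector {
  // Tests replace this instance (e.g. with a Mockito mock) to inject failures.
  static CheckpointFaultInjector instance = new CheckpointFaultInjector();

  static CheckpointFaultInjector getInstance() {
    return instance;
  }

  // No-op in production. FSImage calls it between writing the MD5 side file
  // and renaming the checkpoint image, so a test can simulate a crash there.
  public void afterMD5Rename() throws IOException {}
}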

+ 5 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java

@@ -1102,7 +1102,7 @@ public class FSImage implements Closeable {
    */
   public synchronized void saveDigestAndRenameCheckpointImage(
       long txid, MD5Hash digest) throws IOException {
-    renameCheckpoint(txid);
+    // Write and rename MD5 file
     List<StorageDirectory> badSds = Lists.newArrayList();
     
     for (StorageDirectory sd : storage.dirIterable(NameNodeDirType.IMAGE)) {
@@ -1115,6 +1115,10 @@ public class FSImage implements Closeable {
     }
     storage.reportErrorsOnDirectories(badSds);
     
+    CheckpointFaultInjector.getInstance().afterMD5Rename();
+    
+    // Rename image from tmp file
+    renameCheckpoint(txid);
     // So long as this is the newest image available,
     // advertise it as such to other checkpointers
     // from now on
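
The net effect of this change is to reorder the two steps in saveDigestAndRenameCheckpointImage(): the MD5 side file is written first, and only then is the checkpoint image renamed from its temporary name to its final name. If the NameNode shuts down in between, the final fsimage does not exist yet, so a restart can no longer find an image whose digest file is missing and fail with an md5sum error. A self-contained sketch of that ordering, using plain java.nio and hypothetical file names rather than the actual FSImage/NNStorage code:

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

// Illustrative only: "make the digest durable, then publish the image".
final class CheckpointPublishSketch {
  static void publish(Path storageDir, long txid, String md5Hex) throws IOException {
    Path tmpImage   = storageDir.resolve("fsimage.ckpt_" + txid);   // produced by the checkpoint
    Path finalImage = storageDir.resolve("fsimage_" + txid);
    Path md5File    = storageDir.resolve("fsimage_" + txid + ".md5");

    // Step 1: write the digest for the final image name first.
    Files.write(md5File, (md5Hex + "\n").getBytes(StandardCharsets.UTF_8));

    // Step 2: only then expose the image under its final name. A crash between
    // the two steps leaves no fsimage_<txid> at all, so a restarting NameNode
    // never loads an image with a missing or stale md5 file.
    Files.move(tmpImage, finalImage, StandardCopyOption.ATOMIC_MOVE);
  }
}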

+ 55 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java

@@ -245,7 +245,7 @@ public class TestCheckpoint {
   /*
    * Simulate exception during edit replay.
    */
-  @Test(timeout=5000)
+  @Test(timeout=30000)
   public void testReloadOnEditReplayFailure () throws IOException {
     Configuration conf = new HdfsConfiguration();
     FSDataOutputStream fos = null;
@@ -1418,6 +1418,60 @@ public class TestCheckpoint {
     }
   }
   
+  /**
+   * Test NN restart if a failure happens in between creating the fsimage
+   * MD5 file and renaming the fsimage.
+   */
+  @Test(timeout=30000)
+  public void testFailureBeforeRename () throws IOException {
+    Configuration conf = new HdfsConfiguration();
+    FSDataOutputStream fos = null;
+    SecondaryNameNode secondary = null;
+    MiniDFSCluster cluster = null;
+    FileSystem fs = null;
+    NameNode namenode = null;
+
+    try {
+      cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDatanodes)
+          .build();
+      cluster.waitActive();
+      namenode = cluster.getNameNode();
+      fs = cluster.getFileSystem();
+      secondary = startSecondaryNameNode(conf);
+      fos = fs.create(new Path("tmpfile0"));
+      fos.write(new byte[] { 0, 1, 2, 3 });
+      secondary.doCheckpoint();
+      fos.write(new byte[] { 0, 1, 2, 3 });
+      fos.hsync();
+
+      // Cause the next checkpoint to fail right after the MD5 file rename.
+      Mockito.doThrow(new IOException(
+          "Injecting failure after MD5Rename"))
+          .when(faultInjector).afterMD5Rename();
+
+      try {
+        secondary.doCheckpoint();
+        fail("Fault injection failed.");
+      } catch (IOException ioe) {
+        // This is expected.
+      }
+      Mockito.reset(faultInjector);
+      // Namenode should still restart successfully
+      cluster.restartNameNode();
+    } finally {
+      if (secondary != null) {
+        secondary.shutdown();
+      }
+      if (fs != null) {
+        fs.close();
+      }
+      if (cluster != null) {
+        cluster.shutdown();
+      }
+      Mockito.reset(faultInjector);
+    }
+  }
+
   /**
    * Test case where two secondary namenodes are checkpointing the same
    * NameNode. This differs from {@link #testMultipleSecondaryNamenodes()}
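
The faultInjector field used in the new test is presumably the Mockito mock that TestCheckpoint installs into the fault-injector singleton before each test; the doThrow(...).when(faultInjector).afterMD5Rename() stub then fires exactly between the MD5 rename and the image rename. A sketch of that wiring, with field and method names that are assumptions rather than the exact ones in TestCheckpoint:

import org.junit.Before;
import org.mockito.Mockito;

// Illustrative test wiring; names are assumed, not copied from TestCheckpoint.
public class CheckpointFaultWiringSketch {
  static CheckpointFaultInjector faultInjector;

  @Before
  public void setUpFaultInjector() {
    // A plain mock: every hook is a no-op until a test stubs it with doThrow.
    faultInjector = Mockito.mock(CheckpointFaultInjector.class);
    CheckpointFaultInjector.instance = faultInjector;
  }
}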