Преглед изворни кода

HDFS-2800. Fix cancellation of checkpoints in the standby node to be more reliable. Contributed by Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1339745 13f79535-47bb-0310-9956-ffa450edef68
Todd Lipcon пре 13 година
родитељ
комит
15ddb6634f

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -217,6 +217,9 @@ Release 2.0.1-alpha - UNRELEASED
     HDFS-3434. InvalidProtocolBufferException when visiting DN
     browseDirectory.jsp (eli)
 
+    HDFS-2800. Fix cancellation of checkpoints in the standby node to be more
+    reliable. (todd)
+
 Release 2.0.0-alpha - UNRELEASED
 
   INCOMPATIBLE CHANGES

+ 29 - 20
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImage.java

@@ -54,6 +54,7 @@ import org.apache.hadoop.hdfs.server.protocol.NamenodeCommand;
 import org.apache.hadoop.hdfs.server.protocol.NamenodeProtocol;
 import org.apache.hadoop.hdfs.server.protocol.NamenodeRegistration;
 import org.apache.hadoop.hdfs.server.protocol.NamespaceInfo;
+import org.apache.hadoop.hdfs.util.Canceler;
 import org.apache.hadoop.hdfs.util.MD5FileUtils;
 import org.apache.hadoop.io.MD5Hash;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
@@ -89,9 +90,6 @@ public class FSImage implements Closeable {
 
   private final NNStorageRetentionManager archivalManager;
 
-  private SaveNamespaceContext curSaveNamespaceContext = null; 
-
-
   /**
    * Construct an FSImage
    * @param conf Configuration
@@ -804,17 +802,28 @@ public class FSImage implements Closeable {
         try {
           thread.join();
         } catch (InterruptedException iex) {
-          LOG.error("Caught exception while waiting for thread " +
+          LOG.error("Caught interrupted exception while waiting for thread " +
                     thread.getName() + " to finish. Retrying join");
         }        
       }
     }
   }
+  
+  /**
+   * @see #saveNamespace(FSNamesystem, Canceler)
+   */
+  public synchronized void saveNamespace(FSNamesystem source)
+      throws IOException {
+    saveNamespace(source, null);
+  }
+  
   /**
    * Save the contents of the FS image to a new image file in each of the
    * current storage directories.
+   * @param canceler 
    */
-  public synchronized void saveNamespace(FSNamesystem source) throws IOException {
+  public synchronized void saveNamespace(FSNamesystem source,
+      Canceler canceler) throws IOException {
     assert editLog != null : "editLog must be initialized";
     storage.attemptRestoreRemovedStorage();
 
@@ -825,7 +834,7 @@ public class FSImage implements Closeable {
     }
     long imageTxId = getLastAppliedOrWrittenTxId();
     try {
-      saveFSImageInAllDirs(source, imageTxId);
+      saveFSImageInAllDirs(source, imageTxId, canceler);
       storage.writeAll();
     } finally {
       if (editLogWasOpen) {
@@ -837,27 +846,27 @@ public class FSImage implements Closeable {
         storage.writeTransactionIdFileToStorage(imageTxId + 1);
       }
     }
-    
-  }
-  
-  public void cancelSaveNamespace(String reason)
-      throws InterruptedException {
-    SaveNamespaceContext ctx = curSaveNamespaceContext;
-    if (ctx != null) {
-      ctx.cancel(reason); // waits until complete
-    }
   }
 
-  
+  /**
+   * @see #saveFSImageInAllDirs(FSNamesystem, long, Canceler)
+   */
   protected synchronized void saveFSImageInAllDirs(FSNamesystem source, long txid)
+      throws IOException {
+    saveFSImageInAllDirs(source, txid, null);
+  }
+
+  protected synchronized void saveFSImageInAllDirs(FSNamesystem source, long txid,
+      Canceler canceler)
       throws IOException {    
     if (storage.getNumStorageDirs(NameNodeDirType.IMAGE) == 0) {
       throw new IOException("No image directories available!");
     }
-    
+    if (canceler == null) {
+      canceler = new Canceler();
+    }
     SaveNamespaceContext ctx = new SaveNamespaceContext(
-        source, txid);
-    curSaveNamespaceContext = ctx;
+        source, txid, canceler);
     
     try {
       List<Thread> saveThreads = new ArrayList<Thread>();
@@ -878,7 +887,7 @@ public class FSImage implements Closeable {
         throw new IOException(
           "Failed to save in any storage directories while saving namespace.");
       }
-      if (ctx.isCancelled()) {
+      if (canceler.isCancelled()) {
         deleteCancelledCheckpoint(txid);
         ctx.checkCancelled(); // throws
         assert false : "should have thrown above!";

+ 4 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSImageFormat.java

@@ -540,7 +540,6 @@ class FSImageFormat {
     private void saveImage(ByteBuffer currentDirName,
                                   INodeDirectory current,
                                   DataOutputStream out) throws IOException {
-      context.checkCancelled();
       List<INode> children = current.getChildrenRaw();
       if (children == null || children.isEmpty())
         return;
@@ -554,9 +553,13 @@ class FSImageFormat {
         out.write(currentDirName.array(), 0, prefixLen);
       }
       out.writeInt(children.size());
+      int i = 0;
       for(INode child : children) {
         // print all children first
         FSImageSerialization.saveINode2Image(child, out);
+        if (i++ % 50 == 0) {
+          context.checkCancelled();
+        }
       }
       for(INode child : children) {
         if(!child.isDirectory())

+ 2 - 22
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -702,7 +702,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
    */
   void prepareToStopStandbyServices() throws ServiceFailedException {
     if (standbyCheckpointer != null) {
-      standbyCheckpointer.cancelAndPreventCheckpoints();
+      standbyCheckpointer.cancelAndPreventCheckpoints(
+          "About to leave standby state");
     }
   }
 
@@ -3372,27 +3373,6 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     }
   }
   
-  /**
-   * Cancel an ongoing saveNamespace operation and wait for its
-   * threads to exit, if one is currently in progress.
-   *
-   * If no such operation is in progress, this call does nothing.
-   *
-   * @param reason a reason to be communicated to the caller saveNamespace 
-   * @throws IOException
-   */
-  void cancelSaveNamespace(String reason) throws IOException {
-    readLock();
-    try {
-      checkSuperuserPrivilege();
-      getFSImage().cancelSaveNamespace(reason);
-    } catch (InterruptedException e) {
-      throw new IOException(e);
-    } finally {
-      readUnlock();
-    }
-  }
-  
   /**
    * Enables/Disables/Checks restoring failed storage replicas if the storage becomes available again.
    * Requires superuser privilege.

+ 8 - 25
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/SaveNamespaceContext.java

@@ -23,6 +23,7 @@ import java.util.List;
 import java.util.concurrent.CountDownLatch;
 
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
+import org.apache.hadoop.hdfs.util.Canceler;
 
 import com.google.common.base.Preconditions;
 
@@ -36,20 +37,17 @@ class SaveNamespaceContext {
   private final long txid;
   private final List<StorageDirectory> errorSDs =
     Collections.synchronizedList(new ArrayList<StorageDirectory>());
-
-  /**
-   * If the operation has been canceled, set to the reason why
-   * it has been canceled (eg standby moving to active)
-   */
-  private volatile String cancelReason = null;
   
+  private final Canceler canceller;
   private CountDownLatch completionLatch = new CountDownLatch(1);
-  
+
   SaveNamespaceContext(
       FSNamesystem sourceNamesystem,
-      long txid) {
+      long txid,
+      Canceler canceller) {
     this.sourceNamesystem = sourceNamesystem;
     this.txid = txid;
+    this.canceller = canceller;
   }
 
   FSNamesystem getSourceNamesystem() {
@@ -68,17 +66,6 @@ class SaveNamespaceContext {
     return errorSDs;
   }
 
-  /**
-   * Requests that the current saveNamespace operation be
-   * canceled if it is still running.
-   * @param reason the reason why cancellation is requested
-   * @throws InterruptedException 
-   */
-  void cancel(String reason) throws InterruptedException {
-    this.cancelReason = reason;
-    completionLatch.await();
-  }
-  
   void markComplete() {
     Preconditions.checkState(completionLatch.getCount() == 1,
         "Context already completed!");
@@ -86,13 +73,9 @@ class SaveNamespaceContext {
   }
 
   void checkCancelled() throws SaveNamespaceCancelledException {
-    if (cancelReason != null) {
+    if (canceller.isCancelled()) {
       throw new SaveNamespaceCancelledException(
-          cancelReason);
+          canceller.getCancellationReason());
     }
   }
-
-  boolean isCancelled() {
-    return cancelReason != null;
-  }
 }

+ 36 - 16
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java

@@ -34,6 +34,7 @@ import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.namenode.SaveNamespaceCancelledException;
 import org.apache.hadoop.hdfs.server.namenode.TransferFsImage;
+import org.apache.hadoop.hdfs.util.Canceler;
 import org.apache.hadoop.net.NetUtils;
 import org.apache.hadoop.security.SecurityUtil;
 import org.apache.hadoop.security.UserGroupInformation;
@@ -58,6 +59,9 @@ public class StandbyCheckpointer {
   private final CheckpointerThread thread;
   private String activeNNAddress;
   private InetSocketAddress myNNAddress;
+
+  private Object cancelLock = new Object();
+  private Canceler canceler;
   
   // Keep track of how many checkpoints were canceled.
   // This is for use in tests.
@@ -123,6 +127,7 @@ public class StandbyCheckpointer {
   }
   
   public void stop() throws IOException {
+    cancelAndPreventCheckpoints("Stopping checkpointer");
     thread.setShouldRun(false);
     thread.interrupt();
     try {
@@ -134,6 +139,7 @@ public class StandbyCheckpointer {
   }
 
   private void doCheckpoint() throws InterruptedException, IOException {
+    assert canceler != null;
     long txid;
     
     namesystem.writeLockInterruptibly();
@@ -153,8 +159,8 @@ public class StandbyCheckpointer {
             thisCheckpointTxId + ". Skipping...");
         return;
       }
-      
-      img.saveNamespace(namesystem);
+
+      img.saveNamespace(namesystem, canceler);
       txid = img.getStorage().getMostRecentCheckpointTxId();
       assert txid == thisCheckpointTxId : "expected to save checkpoint at txid=" +
         thisCheckpointTxId + " but instead saved at txid=" + txid;
@@ -173,16 +179,18 @@ public class StandbyCheckpointer {
    * and prevent any new checkpoints from starting for the next
    * minute or so.
    */
-  public void cancelAndPreventCheckpoints() throws ServiceFailedException {
-    try {
-      thread.preventCheckpointsFor(PREVENT_AFTER_CANCEL_MS);
-      // TODO(HA): there is a really narrow race here if we are just
-      // about to start a checkpoint - this won't cancel it!
-      namesystem.getFSImage().cancelSaveNamespace(
-          "About to exit standby state");
-    } catch (InterruptedException e) {
-      throw new ServiceFailedException(
-          "Interrupted while trying to cancel checkpoint");
+  public void cancelAndPreventCheckpoints(String msg) throws ServiceFailedException {
+    thread.preventCheckpointsFor(PREVENT_AFTER_CANCEL_MS);
+    synchronized (cancelLock) {
+      // Before beginning a checkpoint, the checkpointer thread
+      // takes this lock, and creates a canceler object.
+      // If the canceler is non-null, then a checkpoint is in
+      // progress and we need to cancel it. If it's null, then
+      // the operation has not started, meaning that the above
+      // time-based prevention will take effect.
+      if (canceler != null) {
+        canceler.cancel(msg);
+      }
     }
   }
   
@@ -272,10 +280,18 @@ public class StandbyCheckpointer {
                 "exceeds the configured interval " + checkpointConf.getPeriod());
             needCheckpoint = true;
           }
-          if (needCheckpoint && now < preventCheckpointsUntil) {
-            LOG.info("But skipping this checkpoint since we are about to failover!");
-            canceledCount++;
-          } else if (needCheckpoint) {
+          
+          synchronized (cancelLock) {
+            if (now < preventCheckpointsUntil) {
+              LOG.info("But skipping this checkpoint since we are about to failover!");
+              canceledCount++;
+              continue;
+            }
+            assert canceler == null;
+            canceler = new Canceler();
+          }
+          
+          if (needCheckpoint) {
             doCheckpoint();
             lastCheckpointTime = now;
           }
@@ -287,6 +303,10 @@ public class StandbyCheckpointer {
           continue;
         } catch (Throwable t) {
           LOG.error("Exception in doCheckpoint", t);
+        } finally {
+          synchronized (cancelLock) {
+            canceler = null;
+          }
         }
       }
     }

+ 51 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/Canceler.java

@@ -0,0 +1,51 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.util;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+
+/**
+ * Provides a simple interface where one thread can mark an operation
+ * for cancellation, and another thread can poll for whether the
+ * cancellation has occurred.
+ */
+@InterfaceAudience.Private
+public class Canceler {
+  /**
+   * If the operation has been canceled, set to the reason why
+   * it has been canceled (eg standby moving to active)
+   */
+  private volatile String cancelReason = null;
+  
+  /**
+   * Requests that the current operation be canceled if it is still running.
+   * This does not block until the cancellation is successful.
+   * @param reason the reason why cancellation is requested
+   */
+  public void cancel(String reason) {
+    this.cancelReason = reason;
+  }
+
+  public boolean isCancelled() {
+    return cancelReason != null;
+  }
+  
+  public String getCancellationReason() {
+    return cancelReason;
+  }
+}

+ 5 - 3
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSaveNamespace.java

@@ -51,6 +51,7 @@ import org.apache.hadoop.hdfs.MiniDFSCluster;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants.SafeModeAction;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants.NamenodeRole;
 import org.apache.hadoop.hdfs.server.common.Storage.StorageDirectory;
+import org.apache.hadoop.hdfs.util.Canceler;
 import org.apache.hadoop.hdfs.util.MD5FileUtils;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.test.GenericTestUtils;
@@ -517,14 +518,15 @@ public class TestSaveNamespace {
     
     try {
       doAnEdit(fsn, 1);
-
+      final Canceler canceler = new Canceler();
+      
       // Save namespace
       fsn.setSafeMode(SafeModeAction.SAFEMODE_ENTER);
       try {
         Future<Void> saverFuture = pool.submit(new Callable<Void>() {
           @Override
           public Void call() throws Exception {
-            image.saveNamespace(finalFsn);
+            image.saveNamespace(finalFsn, canceler);
             return null;
           }
         });
@@ -534,7 +536,7 @@ public class TestSaveNamespace {
         // then cancel the saveNamespace
         Future<Void> cancelFuture = pool.submit(new Callable<Void>() {
           public Void call() throws Exception {
-            image.cancelSaveNamespace("cancelled");
+            canceler.cancel("cancelled");
             return null;
           }
         });

+ 36 - 4
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyCheckpoints.java

@@ -21,6 +21,7 @@ import static org.junit.Assert.*;
 
 import java.io.File;
 import java.io.IOException;
+import java.io.OutputStream;
 import java.net.URI;
 import java.util.List;
 
@@ -36,6 +37,11 @@ import org.apache.hadoop.hdfs.server.namenode.FSNamesystem;
 import org.apache.hadoop.hdfs.server.namenode.NNStorage;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter;
+import org.apache.hadoop.hdfs.util.Canceler;
+import org.apache.hadoop.io.compress.CompressionCodecFactory;
+import org.apache.hadoop.io.compress.CompressionOutputStream;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.test.GenericTestUtils;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -52,12 +58,18 @@ public class TestStandbyCheckpoints {
   private NameNode nn0, nn1;
   private FileSystem fs;
 
+  @SuppressWarnings("rawtypes")
   @Before
   public void setupCluster() throws Exception {
     Configuration conf = new Configuration();
     conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_CHECK_PERIOD_KEY, 1);
     conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 5);
     conf.setInt(DFSConfigKeys.DFS_HA_TAILEDITS_PERIOD_KEY, 1);
+    conf.setBoolean(DFSConfigKeys.DFS_IMAGE_COMPRESS_KEY, true);
+    conf.set(DFSConfigKeys.DFS_IMAGE_COMPRESSION_CODEC_KEY,
+        SlowCodec.class.getCanonicalName());
+    CompressionCodecFactory.setCodecClasses(conf,
+        ImmutableList.<Class>of(SlowCodec.class));
 
     MiniDFSNNTopology topology = new MiniDFSNNTopology()
       .addNameservice(new MiniDFSNNTopology.NSConf("ns1")
@@ -159,14 +171,15 @@ public class TestStandbyCheckpoints {
     
     // We should make exactly one checkpoint at this new txid. 
     Mockito.verify(spyImage1, Mockito.times(1))
-      .saveNamespace((FSNamesystem) Mockito.anyObject());       
+      .saveNamespace((FSNamesystem) Mockito.anyObject(),
+          (Canceler)Mockito.anyObject());       
   }
   
   /**
    * Test cancellation of ongoing checkpoints when failover happens
    * mid-checkpoint. 
    */
-  @Test
+  @Test(timeout=120000)
   public void testCheckpointCancellation() throws Exception {
     cluster.transitionToStandby(0);
     
@@ -191,16 +204,18 @@ public class TestStandbyCheckpoints {
 
     cluster.transitionToActive(0);    
     
-    for (int i = 0; i < 10; i++) {
+    boolean canceledOne = false;
+    for (int i = 0; i < 10 && !canceledOne; i++) {
       
       doEdits(i*10, i*10 + 10);
       cluster.transitionToStandby(0);
       cluster.transitionToActive(1);
       cluster.transitionToStandby(1);
       cluster.transitionToActive(0);
+      canceledOne = StandbyCheckpointer.getCanceledCount() > 0;
     }
     
-    assertTrue(StandbyCheckpointer.getCanceledCount() > 0);
+    assertTrue(canceledOne);
   }
 
   private void doEdits(int start, int stop) throws IOException {
@@ -209,5 +224,22 @@ public class TestStandbyCheckpoints {
       fs.mkdirs(p);
     }
   }
+  
+  /**
+   * A codec which just slows down the saving of the image significantly
+   * by sleeping a few milliseconds on every write. This makes it easy to
+   * catch the standby in the middle of saving a checkpoint.
+   */
+  public static class SlowCodec extends GzipCodec {
+    @Override
+    public CompressionOutputStream createOutputStream(OutputStream out)
+        throws IOException {
+      CompressionOutputStream ret = super.createOutputStream(out);
+      CompressionOutputStream spy = Mockito.spy(ret);
+      Mockito.doAnswer(new GenericTestUtils.SleepAnswer(2))
+        .when(spy).write(Mockito.<byte[]>any(), Mockito.anyInt(), Mockito.anyInt());
+      return spy;
+    }
+  }
 
 }