Forráskód Böngészése

HDFS-13112. Token expiration edits may cause log corruption or deadlock. Contributed by Daryn Sharp.

(cherry picked from commit 47473952e56b0380147d42f4110ad03c2276c961)
Kihwal Lee 7 éve
szülő
commit
50e9419ce2

+ 38 - 15
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/security/token/delegation/DelegationTokenSecretManager.java

@@ -21,7 +21,6 @@ package org.apache.hadoop.hdfs.security.token.delegation;
 import java.io.DataInput;
 import java.io.DataOutputStream;
 import java.io.IOException;
-import java.io.InterruptedIOException;
 import java.net.InetSocketAddress;
 import java.util.ArrayList;
 import java.util.Iterator;
@@ -366,34 +365,58 @@ public class DelegationTokenSecretManager
   @Override //AbstractDelegationTokenManager
   protected void logUpdateMasterKey(DelegationKey key)
       throws IOException {
-    synchronized (noInterruptsLock) {
+    try {
       // The edit logging code will fail catastrophically if it
       // is interrupted during a logSync, since the interrupt
       // closes the edit log files. Doing this inside the
-      // above lock and then checking interruption status
-      // prevents this bug.
-      if (Thread.interrupted()) {
-        throw new InterruptedIOException(
-            "Interrupted before updating master key");
+      // fsn lock will prevent being interrupted when stopping
+      // the secret manager.
+      namesystem.readLockInterruptibly();
+      try {
+        // this monitor isn't necessary if stopped while holding write lock
+        // but for safety, guard against a stop with read lock.
+        synchronized (noInterruptsLock) {
+          if (Thread.currentThread().isInterrupted()) {
+            return; // leave flag set so secret monitor exits.
+          }
+          namesystem.logUpdateMasterKey(key);
+        }
+      } finally {
+        namesystem.readUnlock();
       }
-      namesystem.logUpdateMasterKey(key);
+    } catch (InterruptedException ie) {
+      // AbstractDelegationTokenManager may crash if an exception is thrown.
+      // The interrupt flag will be detected when it attempts to sleep.
+      Thread.currentThread().interrupt();
     }
   }
   
   @Override //AbstractDelegationTokenManager
   protected void logExpireToken(final DelegationTokenIdentifier dtId)
       throws IOException {
-    synchronized (noInterruptsLock) {
+    try {
       // The edit logging code will fail catastrophically if it
       // is interrupted during a logSync, since the interrupt
       // closes the edit log files. Doing this inside the
-      // above lock and then checking interruption status
-      // prevents this bug.
-      if (Thread.interrupted()) {
-        throw new InterruptedIOException(
-            "Interrupted before expiring delegation token");
+      // fsn lock will prevent being interrupted when stopping
+      // the secret manager.
+      namesystem.readLockInterruptibly();
+      try {
+        // this monitor isn't necessary if stopped while holding write lock
+        // but for safety, guard against a stop with read lock.
+        synchronized (noInterruptsLock) {
+          if (Thread.currentThread().isInterrupted()) {
+            return; // leave flag set so secret monitor exits.
+          }
+          namesystem.logExpireDelegationToken(dtId);
+        }
+      } finally {
+        namesystem.readUnlock();
       }
-      namesystem.logExpireDelegationToken(dtId);
+    } catch (InterruptedException ie) {
+      // AbstractDelegationTokenManager may crash if an exception is thrown.
+      // The interrupt flag will be detected when it attempts to sleep.
+      Thread.currentThread().interrupt();
     }
   }
 

+ 11 - 6
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -1568,6 +1568,10 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
     this.fsLock.readLock();
   }
   @Override
+  public void readLockInterruptibly() throws InterruptedException {
+    this.fsLock.readLockInterruptibly();
+  }
+  @Override
   public void readUnlock() {
     this.fsLock.readUnlock();
   }
@@ -5618,9 +5622,9 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
     assert !isInSafeMode() :
       "this should never be called while in safemode, since we stop " +
       "the DT manager before entering safemode!";
-    // No need to hold FSN lock since we don't access any internal
-    // structures, and this is stopped before the FSN shuts itself
-    // down, etc.
+    // edit log rolling is not thread-safe and must be protected by the
+    // fsn lock.  not updating namespace so read lock is sufficient.
+    assert hasReadLock();
     getEditLog().logUpdateMasterKey(key);
     getEditLog().logSync();
   }
@@ -5634,9 +5638,10 @@ public class FSNamesystem implements Namesystem, FSNamesystemMBean,
     assert !isInSafeMode() :
       "this should never be called while in safemode, since we stop " +
       "the DT manager before entering safemode!";
-    // No need to hold FSN lock since we don't access any internal
-    // structures, and this is stopped before the FSN shuts itself
-    // down, etc.
+    // edit log rolling is not thread-safe and must be protected by the
+    // fsn lock.  not updating namespace so read lock is sufficient.
+    assert hasReadLock();
+    // do not logSync so expiration edits are batched
     getEditLog().logCancelDelegationToken(id);
   }  
   

+ 7 - 0
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystemLock.java

@@ -145,6 +145,13 @@ class FSNamesystemLock {
     }
   }
 
+  public void readLockInterruptibly() throws InterruptedException {
+    coarseLock.readLock().lockInterruptibly();
+    if (coarseLock.getReadHoldCount() == 1) {
+      readLockHeldTimeStampNanos.set(timer.monotonicNowNanos());
+    }
+  }
+
   public void readUnlock() {
     readUnlock(OP_NAME_OTHER);
   }

+ 4 - 1
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/util/RwLock.java

@@ -21,7 +21,10 @@ package org.apache.hadoop.hdfs.util;
 public interface RwLock {
   /** Acquire read lock. */
   public void readLock();
-  
+
+  /** Acquire read lock, unless interrupted while waiting  */
+  void readLockInterruptibly() throws InterruptedException;
+
   /** Release read lock. */
   public void readUnlock();
 

+ 23 - 1
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestSecurityTokenEditLog.java

@@ -24,6 +24,7 @@ import java.io.File;
 import java.io.IOException;
 import java.net.URI;
 import java.util.Iterator;
+import java.util.concurrent.atomic.AtomicReference;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
@@ -37,7 +38,11 @@ import org.apache.hadoop.hdfs.server.namenode.NNStorage.NameNodeDirType;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.token.Token;
+import org.junit.Assert;
 import org.junit.Test;
+import org.mockito.invocation.InvocationOnMock;
+import org.mockito.stubbing.Answer;
+
 import static org.mockito.Mockito.*;
 
 /**
@@ -180,8 +185,25 @@ public class TestSecurityTokenEditLog {
     Text renewer = new Text(UserGroupInformation.getCurrentUser().getUserName());
     FSImage fsImage = mock(FSImage.class);
     FSEditLog log = mock(FSEditLog.class);
-    doReturn(log).when(fsImage).getEditLog();   
+    doReturn(log).when(fsImage).getEditLog();
+    // verify that the namesystem read lock is held while logging token
+    // expirations.  the namesystem is not updated, so write lock is not
+    // necessary, but the lock is required because edit log rolling is not
+    // thread-safe.
+    final AtomicReference<FSNamesystem> fsnRef = new AtomicReference<>();
+    doAnswer(
+      new Answer<Void>() {
+        @Override
+        public Void answer(InvocationOnMock invocation) throws Throwable {
+          // fsn claims read lock if either read or write locked.
+          Assert.assertTrue(fsnRef.get().hasReadLock());
+          Assert.assertFalse(fsnRef.get().hasWriteLock());
+          return null;
+        }
+      }
+    ).when(log).logCancelDelegationToken(any(DelegationTokenIdentifier.class));
     FSNamesystem fsn = new FSNamesystem(conf, fsImage);
+    fsnRef.set(fsn);
     
     DelegationTokenSecretManager dtsm = fsn.getDelegationTokenSecretManager();
     try {