Browse Source

HDFS-4882. Prevent the Namenode's LeaseManager from looping forever in checkLeases (Ravi Prakash via Colin P. McCabe)
(cherry picked from commit daacbc18d739d030822df0b75205eeb067f89850)

Colin Patrick Mccabe 10 năm trước cách đây
mục cha
commit
575d7bd5b0

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -15,6 +15,9 @@ Release 2.6.1 - UNRELEASED
     HDFS-7425. NameNode block deletion logging uses incorrect appender.
     (cnauroth)
 
+    HDFS-4882. Prevent the Namenode's LeaseManager from looping forever in
+    checkLeases (Ravi Prakash via Colin P. McCabe)
+
 Release 2.6.0 - 2014-11-18
 
   INCOMPATIBLE CHANGES

+ 1 - 19
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/FSNamesystem.java

@@ -6231,26 +6231,8 @@ public class FSNamesystem implements Namesystem, FSClusterStats,
     // Calculate number of blocks under construction
     long numUCBlocks = 0;
     readLock();
+    numUCBlocks = leaseManager.getNumUnderConstructionBlocks();
     try {
-      for (Lease lease : leaseManager.getSortedLeases()) {
-        for (String path : lease.getPaths()) {
-          final INodeFile cons;
-          try {
-            cons = dir.getINode(path).asFile();
-            Preconditions.checkState(cons.isUnderConstruction());
-          } catch (UnresolvedLinkException e) {
-            throw new AssertionError("Lease files should reside on this FS");
-          }
-          BlockInfo[] blocks = cons.getBlocks();
-          if(blocks == null)
-            continue;
-          for(BlockInfo b : blocks) {
-            if(!b.isComplete())
-              numUCBlocks++;
-          }
-        }
-      }
-      LOG.info("Number of blocks under construction: " + numUCBlocks);
       return getBlocksTotal() - numUCBlocks;
     } finally {
       readUnlock();

+ 65 - 17
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/LeaseManager.java

@@ -25,8 +25,9 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.NavigableSet;
+import java.util.NoSuchElementException;
 import java.util.SortedMap;
-import java.util.SortedSet;
 import java.util.TreeMap;
 import java.util.TreeSet;
 
@@ -36,6 +37,7 @@ import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.UnresolvedLinkException;
 import org.apache.hadoop.hdfs.protocol.HdfsConstants;
+import org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo;
 import org.apache.hadoop.hdfs.server.common.HdfsServerConstants;
 import org.apache.hadoop.util.Daemon;
 
@@ -79,7 +81,7 @@ public class LeaseManager {
   //
   private final SortedMap<String, Lease> leases = new TreeMap<String, Lease>();
   // Set of: Lease
-  private final SortedSet<Lease> sortedLeases = new TreeSet<Lease>();
+  private final NavigableSet<Lease> sortedLeases = new TreeSet<Lease>();
 
   // 
   // Map path names to leases. It is protected by the sortedLeases lock.
@@ -95,8 +97,41 @@ public class LeaseManager {
   Lease getLease(String holder) {
     return leases.get(holder);
   }
-  
-  SortedSet<Lease> getSortedLeases() {return sortedLeases;}
+
+  @VisibleForTesting
+  int getNumSortedLeases() {return sortedLeases.size();}
+
+  /**
+   * This method iterates through all the leases and counts the number of blocks
+   * which are not COMPLETE. The FSNamesystem read lock MUST be held before
+   * calling this method.
+   * @return
+   */
+  synchronized long getNumUnderConstructionBlocks() {
+    assert this.fsnamesystem.hasReadLock() : "The FSNamesystem read lock wasn't"
+      + "acquired before counting under construction blocks";
+    long numUCBlocks = 0;
+    for (Lease lease : sortedLeases) {
+      for (String path : lease.getPaths()) {
+        final INodeFile cons;
+        try {
+          cons = this.fsnamesystem.getFSDirectory().getINode(path).asFile();
+            Preconditions.checkState(cons.isUnderConstruction());
+        } catch (UnresolvedLinkException e) {
+          throw new AssertionError("Lease files should reside on this FS");
+        }
+        BlockInfo[] blocks = cons.getBlocks();
+        if(blocks == null)
+          continue;
+        for(BlockInfo b : blocks) {
+          if(!b.isComplete())
+            numUCBlocks++;
+        }
+      }
+    }
+    LOG.info("Number of blocks under construction: " + numUCBlocks);
+    return numUCBlocks;
+  }
 
   /** @return the lease containing src */
   public Lease getLeaseByPath(String src) {return sortedLeasesByPath.get(src);}
@@ -421,33 +456,38 @@ public class LeaseManager {
   /** Check the leases beginning from the oldest.
    *  @return true is sync is needed.
    */
-  private synchronized boolean checkLeases() {
+  @VisibleForTesting
+  synchronized boolean checkLeases() {
     boolean needSync = false;
     assert fsnamesystem.hasWriteLock();
-    for(; sortedLeases.size() > 0; ) {
-      final Lease oldest = sortedLeases.first();
-      if (!oldest.expiredHardLimit()) {
-        return needSync;
+    Lease leaseToCheck = null;
+    try {
+      leaseToCheck = sortedLeases.first();
+    } catch(NoSuchElementException e) {}
+
+    while(leaseToCheck != null) {
+      if (!leaseToCheck.expiredHardLimit()) {
+        break;
       }
 
-      LOG.info(oldest + " has expired hard limit");
+      LOG.info(leaseToCheck + " has expired hard limit");
 
       final List<String> removing = new ArrayList<String>();
-      // need to create a copy of the oldest lease paths, becuase 
+      // need to create a copy of the oldest lease paths, because 
       // internalReleaseLease() removes paths corresponding to empty files,
       // i.e. it needs to modify the collection being iterated over
       // causing ConcurrentModificationException
-      String[] leasePaths = new String[oldest.getPaths().size()];
-      oldest.getPaths().toArray(leasePaths);
+      String[] leasePaths = new String[leaseToCheck.getPaths().size()];
+      leaseToCheck.getPaths().toArray(leasePaths);
       for(String p : leasePaths) {
         try {
-          boolean completed = fsnamesystem.internalReleaseLease(oldest, p,
+          boolean completed = fsnamesystem.internalReleaseLease(leaseToCheck, p,
               HdfsServerConstants.NAMENODE_LEASE_HOLDER);
           if (LOG.isDebugEnabled()) {
             if (completed) {
               LOG.debug("Lease recovery for " + p + " is complete. File closed.");
             } else {
-              LOG.debug("Started block recovery " + p + " lease " + oldest);
+              LOG.debug("Started block recovery " + p + " lease " + leaseToCheck);
             }
           }
           // If a lease recovery happened, we need to sync later.
@@ -456,15 +496,23 @@ public class LeaseManager {
           }
         } catch (IOException e) {
           LOG.error("Cannot release the path " + p + " in the lease "
-              + oldest, e);
+              + leaseToCheck, e);
           removing.add(p);
         }
       }
 
       for(String p : removing) {
-        removeLease(oldest, p);
+        removeLease(leaseToCheck, p);
       }
+      leaseToCheck = sortedLeases.higher(leaseToCheck);
     }
+
+    try {
+      if(leaseToCheck != sortedLeases.first()) {
+        LOG.warn("Unable to release hard-limit expired lease: "
+          + sortedLeases.first());
+      }
+    } catch(NoSuchElementException e) {}
     return needSync;
   }
 

+ 25 - 0
hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestLeaseManager.java

@@ -17,6 +17,7 @@
  */
 package org.apache.hadoop.hdfs.server.namenode;
 
+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 
@@ -54,4 +55,28 @@ public class TestLeaseManager {
     assertNull(lm.getLeaseByPath("/a/b"));
     assertNull(lm.getLeaseByPath("/a/c"));
   }
+
+  /** Check that even if LeaseManager.checkLease is not able to relinquish
+   * leases, the Namenode does't enter an infinite loop while holding the FSN
+   * write lock and thus become unresponsive
+   */
+  @Test (timeout=1000)
+  public void testCheckLeaseNotInfiniteLoop() {
+    FSNamesystem fsn = Mockito.mock(FSNamesystem.class);
+    Mockito.when(fsn.isRunning()).thenReturn(true);
+    Mockito.when(fsn.hasWriteLock()).thenReturn(true);
+    LeaseManager lm = new LeaseManager(fsn);
+
+    //Make sure the leases we are going to add exceed the hard limit
+    lm.setLeasePeriod(0,0);
+
+    //Add some leases to the LeaseManager
+    lm.addLease("holder1", "src1");
+    lm.addLease("holder2", "src2");
+    lm.addLease("holder3", "src3");
+    assertEquals(lm.getNumSortedLeases(), 3);
+
+    //Initiate a call to checkLease. This should exit within the test timeout
+    lm.checkLeases();
+  }
 }