Browse Source

HDFS-7489. Incorrect locking in FsVolumeList#checkDirs can hang datanodes (Noah Lorang via Colin P. McCabe)
(cherry picked from commit d8352b9b2b99aa46679c5880a724ba3f0ceb41ff)

Colin Patrick Mccabe 10 years ago
parent
commit
a037d6030b

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -317,6 +317,9 @@ Release 2.6.1 - UNRELEASED
     HDFS-4882. Prevent the Namenode's LeaseManager from looping forever in
     checkLeases (Ravi Prakash via Colin P. McCabe)
 
+    HDFS-7489. Incorrect locking in FsVolumeList#checkDirs can hang datanodes
+    (Noah Lorang via Colin P. McCabe)
+
 Release 2.6.0 - 2014-11-18
 
   INCOMPATIBLE CHANGES

+ 28 - 28
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeList.java

@@ -36,6 +36,7 @@ class FsVolumeList {
    * This list is replaced on modification holding "this" lock.
    */
   volatile List<FsVolumeImpl> volumes = null;
+  private Object checkDirsMutex = new Object();
 
   private final VolumeChoosingPolicy<FsVolumeImpl> blockChooser;
   private volatile int numFailedVolumes;
@@ -167,40 +168,39 @@ class FsVolumeList {
    * Calls {@link FsVolumeImpl#checkDirs()} on each volume, removing any
    * volumes from the active list that result in a DiskErrorException.
    * 
-   * This method is synchronized to allow only one instance of checkDirs() 
-   * call
+   * Use checkDirsMutext to allow only one instance of checkDirs() call
+   *
    * @return list of all the removed volumes.
    */
-  synchronized List<FsVolumeImpl> checkDirs() {
-    ArrayList<FsVolumeImpl> removedVols = null;
-    
-    // Make a copy of volumes for performing modification 
-    final List<FsVolumeImpl> volumeList = new ArrayList<FsVolumeImpl>(volumes);
+  List<FsVolumeImpl> checkDirs() {
+    synchronized(checkDirsMutex) {
+      ArrayList<FsVolumeImpl> removedVols = null;
+      
+      // Make a copy of volumes for performing modification 
+      final List<FsVolumeImpl> volumeList = new ArrayList<FsVolumeImpl>(volumes);
 
-    for(Iterator<FsVolumeImpl> i = volumeList.iterator(); i.hasNext(); ) {
-      final FsVolumeImpl fsv = i.next();
-      try {
-        fsv.checkDirs();
-      } catch (DiskErrorException e) {
-        FsDatasetImpl.LOG.warn("Removing failed volume " + fsv + ": ",e);
-        if (removedVols == null) {
-          removedVols = new ArrayList<FsVolumeImpl>(1);
+      for(Iterator<FsVolumeImpl> i = volumeList.iterator(); i.hasNext(); ) {
+        final FsVolumeImpl fsv = i.next();
+        try {
+          fsv.checkDirs();
+        } catch (DiskErrorException e) {
+          FsDatasetImpl.LOG.warn("Removing failed volume " + fsv + ": ",e);
+          if (removedVols == null) {
+            removedVols = new ArrayList<FsVolumeImpl>(1);
+          }
+          removedVols.add(fsv);
+          removeVolume(fsv.getBasePath());
+          numFailedVolumes++;
         }
-        removedVols.add(fsv);
-        fsv.shutdown(); 
-        i.remove(); // Remove the volume
-        numFailedVolumes++;
       }
-    }
-    
-    if (removedVols != null && removedVols.size() > 0) {
-      // Replace volume list
-      volumes = Collections.unmodifiableList(volumeList);
-      FsDatasetImpl.LOG.warn("Completed checkDirs. Removed " + removedVols.size()
-          + " volumes. Current volumes: " + this);
-    }
+      
+      if (removedVols != null && removedVols.size() > 0) {
+        FsDatasetImpl.LOG.warn("Completed checkDirs. Removed " + removedVols.size()
+            + " volumes. Current volumes: " + this);
+      }
 
-    return removedVols;
+      return removedVols;
+    }
   }
 
   @Override