Browse Source

HDFS-7489. Incorrect locking in FsVolumeList#checkDirs can hang datanodes (Noah Lorang via Colin P. McCabe)
(cherry picked from commit d8352b9b2b99aa46679c5880a724ba3f0ceb41ff)
(cherry picked from commit a037d6030b5ae9422fdb265f5e4880d515be9e37)

Colin Patrick Mccabe 10 năm trước cách đây
mục cha
commit
d93bfbace4

+ 3 - 0
hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt

@@ -18,6 +18,9 @@ Release 2.6.1 - UNRELEASED
     HDFS-4882. Prevent the Namenode's LeaseManager from looping forever in
     HDFS-4882. Prevent the Namenode's LeaseManager from looping forever in
     checkLeases (Ravi Prakash via Colin P. McCabe)
     checkLeases (Ravi Prakash via Colin P. McCabe)
 
 
+    HDFS-7489. Incorrect locking in FsVolumeList#checkDirs can hang datanodes
+    (Noah Lorang via Colin P. McCabe)
+
 Release 2.6.0 - 2014-11-18
 Release 2.6.0 - 2014-11-18
 
 
   INCOMPATIBLE CHANGES
   INCOMPATIBLE CHANGES

+ 28 - 28
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/FsVolumeList.java

@@ -36,6 +36,7 @@ class FsVolumeList {
    * This list is replaced on modification holding "this" lock.
    * This list is replaced on modification holding "this" lock.
    */
    */
   volatile List<FsVolumeImpl> volumes = null;
   volatile List<FsVolumeImpl> volumes = null;
+  private Object checkDirsMutex = new Object();
 
 
   private final VolumeChoosingPolicy<FsVolumeImpl> blockChooser;
   private final VolumeChoosingPolicy<FsVolumeImpl> blockChooser;
   private volatile int numFailedVolumes;
   private volatile int numFailedVolumes;
@@ -167,40 +168,39 @@ class FsVolumeList {
    * Calls {@link FsVolumeImpl#checkDirs()} on each volume, removing any
    * Calls {@link FsVolumeImpl#checkDirs()} on each volume, removing any
    * volumes from the active list that result in a DiskErrorException.
    * volumes from the active list that result in a DiskErrorException.
    * 
    * 
-   * This method is synchronized to allow only one instance of checkDirs() 
-   * call
+   * Use checkDirsMutext to allow only one instance of checkDirs() call
+   *
    * @return list of all the removed volumes.
    * @return list of all the removed volumes.
    */
    */
-  synchronized List<FsVolumeImpl> checkDirs() {
-    ArrayList<FsVolumeImpl> removedVols = null;
-    
-    // Make a copy of volumes for performing modification 
-    final List<FsVolumeImpl> volumeList = new ArrayList<FsVolumeImpl>(volumes);
+  List<FsVolumeImpl> checkDirs() {
+    synchronized(checkDirsMutex) {
+      ArrayList<FsVolumeImpl> removedVols = null;
+      
+      // Make a copy of volumes for performing modification 
+      final List<FsVolumeImpl> volumeList = new ArrayList<FsVolumeImpl>(volumes);
 
 
-    for(Iterator<FsVolumeImpl> i = volumeList.iterator(); i.hasNext(); ) {
-      final FsVolumeImpl fsv = i.next();
-      try {
-        fsv.checkDirs();
-      } catch (DiskErrorException e) {
-        FsDatasetImpl.LOG.warn("Removing failed volume " + fsv + ": ",e);
-        if (removedVols == null) {
-          removedVols = new ArrayList<FsVolumeImpl>(1);
+      for(Iterator<FsVolumeImpl> i = volumeList.iterator(); i.hasNext(); ) {
+        final FsVolumeImpl fsv = i.next();
+        try {
+          fsv.checkDirs();
+        } catch (DiskErrorException e) {
+          FsDatasetImpl.LOG.warn("Removing failed volume " + fsv + ": ",e);
+          if (removedVols == null) {
+            removedVols = new ArrayList<FsVolumeImpl>(1);
+          }
+          removedVols.add(fsv);
+          removeVolume(fsv.getBasePath());
+          numFailedVolumes++;
         }
         }
-        removedVols.add(fsv);
-        fsv.shutdown(); 
-        i.remove(); // Remove the volume
-        numFailedVolumes++;
       }
       }
-    }
-    
-    if (removedVols != null && removedVols.size() > 0) {
-      // Replace volume list
-      volumes = Collections.unmodifiableList(volumeList);
-      FsDatasetImpl.LOG.warn("Completed checkDirs. Removed " + removedVols.size()
-          + " volumes. Current volumes: " + this);
-    }
+      
+      if (removedVols != null && removedVols.size() > 0) {
+        FsDatasetImpl.LOG.warn("Completed checkDirs. Removed " + removedVols.size()
+            + " volumes. Current volumes: " + this);
+      }
 
 
-    return removedVols;
+      return removedVols;
+    }
   }
   }
 
 
   @Override
   @Override