Quellcode durchsuchen

YARN-2755. NM fails to clean up usercache_DEL_<timestamp> dirs after YARN-661. Contributed by Siqi Li

Jason Lowe vor 10 Jahren
Ursprung
Commit
73e626ad91

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -801,6 +801,9 @@ Release 2.6.0 - UNRELEASED
     YARN-2769. Fixed the problem that timeline domain is not set in distributed shell
     AM when using shell_command on Windows. (Varun Vasudev via zjshen)
 
+    YARN-2755. NM fails to clean up usercache_DEL_<timestamp> dirs after
+    YARN-661 (Siqi Li via jlowe)
+
 Release 2.5.1 - 2014-09-05
 
   INCOMPATIBLE CHANGES

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/localizer/ResourceLocalizationService.java

@@ -1324,7 +1324,7 @@ public class ResourceLocalizationService extends CompositeService
     RemoteIterator<FileStatus> userDirStatus = lfs.listStatus(userDirPath);
     FileDeletionTask dependentDeletionTask =
         del.createFileDeletionTask(null, userDirPath, new Path[] {});
-    if (userDirStatus != null) {
+    if (userDirStatus != null && userDirStatus.hasNext()) {
       List<FileDeletionTask> deletionTasks = new ArrayList<FileDeletionTask>();
       while (userDirStatus.hasNext()) {
         FileStatus status = userDirStatus.next();

+ 50 - 16
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestNodeManagerReboot.java

@@ -35,7 +35,9 @@ import java.util.Map;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.fs.FileContext;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.fs.UnsupportedFileSystemException;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.yarn.api.ContainerManagementProtocol;
@@ -190,17 +192,41 @@ public class TestNodeManagerReboot {
         ResourceLocalizationService.NM_PRIVATE_DIR) > 0);
 
     // restart the NodeManager
+    restartNM(MAX_TRIES);
+    checkNumOfLocalDirs();
+    
+    verify(delService, times(1)).delete(
+      (String) isNull(),
+      argThat(new PathInclude(ResourceLocalizationService.NM_PRIVATE_DIR
+          + "_DEL_")));
+    verify(delService, times(1)).delete((String) isNull(),
+      argThat(new PathInclude(ContainerLocalizer.FILECACHE + "_DEL_")));
+    verify(delService, times(1)).scheduleFileDeletionTask(
+      argThat(new FileDeletionInclude(user, null,
+        new String[] { destinationFile })));
+    verify(delService, times(1)).scheduleFileDeletionTask(
+      argThat(new FileDeletionInclude(null, ContainerLocalizer.USERCACHE
+          + "_DEL_", new String[] {})));
+    
+    // restart the NodeManager again
+    // this time usercache directory should be empty
+    restartNM(MAX_TRIES);
+    checkNumOfLocalDirs();
+    
+  }
+
+  private void restartNM(int maxTries) {
     nm.stop();
     nm = new MyNodeManager();
     nm.start();
 
-    numTries = 0;
+    int numTries = 0;
     while ((numOfLocalDirs(nmLocalDir.getAbsolutePath(),
       ContainerLocalizer.USERCACHE) > 0
         || numOfLocalDirs(nmLocalDir.getAbsolutePath(),
           ContainerLocalizer.FILECACHE) > 0 || numOfLocalDirs(
       nmLocalDir.getAbsolutePath(), ResourceLocalizationService.NM_PRIVATE_DIR) > 0)
-        && numTries < MAX_TRIES) {
+        && numTries < maxTries) {
       try {
         Thread.sleep(500);
       } catch (InterruptedException ex) {
@@ -208,7 +234,9 @@ public class TestNodeManagerReboot {
       }
       numTries++;
     }
-
+  }
+  
+  private void checkNumOfLocalDirs() throws IOException {
     Assert
       .assertTrue(
         "After NM reboots, all local files should be deleted",
@@ -218,20 +246,13 @@ public class TestNodeManagerReboot {
               ContainerLocalizer.FILECACHE) == 0
             && numOfLocalDirs(nmLocalDir.getAbsolutePath(),
               ResourceLocalizationService.NM_PRIVATE_DIR) == 0);
-    verify(delService, times(1)).delete(
-      (String) isNull(),
-      argThat(new PathInclude(ResourceLocalizationService.NM_PRIVATE_DIR
-          + "_DEL_")));
-    verify(delService, times(1)).delete((String) isNull(),
-      argThat(new PathInclude(ContainerLocalizer.FILECACHE + "_DEL_")));
-    verify(delService, times(1)).scheduleFileDeletionTask(
-      argThat(new FileDeletionInclude(user, null,
-        new String[] { destinationFile })));
-    verify(delService, times(1)).scheduleFileDeletionTask(
-      argThat(new FileDeletionInclude(null, ContainerLocalizer.USERCACHE
-          + "_DEL_", new String[] {})));
+    
+    Assert
+    .assertTrue(
+      "After NM reboots, usercache_DEL_* directory should be deleted",
+      numOfUsercacheDELDirs(nmLocalDir.getAbsolutePath()) == 0);
   }
-
+  
   private int numOfLocalDirs(String localDir, String localSubDir) {
     File[] listOfFiles = new File(localDir, localSubDir).listFiles();
     if (listOfFiles == null) {
@@ -240,6 +261,19 @@ public class TestNodeManagerReboot {
       return listOfFiles.length;
     }
   }
+  
+  private int numOfUsercacheDELDirs(String localDir) throws IOException {
+    int count = 0;
+    RemoteIterator<FileStatus> fileStatus = localFS.listStatus(new Path(localDir));
+    while (fileStatus.hasNext()) {
+      FileStatus status = fileStatus.next();
+      if (status.getPath().getName().matches(".*" +
+          ContainerLocalizer.USERCACHE + "_DEL_.*")) {
+        count++;
+      }
+    }
+    return count;
+  }
 
   private void createFiles(String dir, String subDir, int numOfFiles) {
     for (int i = 0; i < numOfFiles; i++) {