Browse Source

YARN-8775. TestDiskFailures.testLocalDirsFailures sometimes can fail on concurrent File modifications. (Contributed by Antal Bálint Steinbach)

Haibo Chen 6 years ago
parent
commit
f880ff418c

+ 4 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java

@@ -27,6 +27,8 @@ import java.util.List;
 import java.util.Set;
 import java.util.Timer;
 import java.util.TimerTask;
+
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.hadoop.util.DiskChecker.DiskErrorException;
 import org.apache.hadoop.util.DiskValidator;
 import org.apache.hadoop.util.DiskValidatorFactory;
@@ -493,7 +495,8 @@ public class LocalDirsHandlerService extends AbstractService {
 
   }
 
-  private void checkDirs() {
+  @VisibleForTesting
+  public void checkDirs() {
     boolean disksStatusChange = false;
     Set<String> failedLocalDirsPreCheck =
         new HashSet<String>(localDirs.getFailedDirs());

+ 15 - 23
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-tests/src/test/java/org/apache/hadoop/yarn/server/TestDiskFailures.java

@@ -27,7 +27,6 @@ import org.apache.hadoop.security.AccessControlException;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.yarn.api.records.NodeState;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
-import org.apache.hadoop.yarn.server.MiniYARNCluster;
 import org.apache.hadoop.yarn.server.nodemanager.LocalDirsHandlerService;
 import org.apache.hadoop.yarn.server.nodemanager.NodeManager;
 import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
@@ -56,7 +55,12 @@ public class TestDiskFailures {
 
   private static final Logger LOG = LoggerFactory.getLogger(TestDiskFailures.class);
 
-  private static final long DISK_HEALTH_CHECK_INTERVAL = 1000;//1 sec
+  /*
+   * Set disk check interval high enough so that it never runs during the test.
+   * Checks will be called manually if necessary.
+   */
+  private static final long TOO_HIGH_DISK_HEALTH_CHECK_INTERVAL =
+      1000 * 60 * 60 * 24;
 
   private static FileContext localFS = null;
   private static final File testDir = new File("target",
@@ -146,9 +150,10 @@ public class TestDiskFailures {
                                          : YarnConfiguration.NM_LOG_DIRS;
 
     Configuration conf = new Configuration();
-    // set disk health check interval to a small value (say 1 sec).
+    // set disk health check interval to a large value to effectively disable
+    // disk health check done internally in LocalDirsHandlerService.
     conf.setLong(YarnConfiguration.NM_DISK_HEALTH_CHECK_INTERVAL_MS,
-                 DISK_HEALTH_CHECK_INTERVAL);
+        TOO_HIGH_DISK_HEALTH_CHECK_INTERVAL);
 
     // If 2 out of the total 4 local-dirs fail OR if 2 Out of the total 4
     // log-dirs fail, then the node's health status should become unhealthy.
@@ -202,22 +207,6 @@ public class TestDiskFailures {
     verifyDisksHealth(localORLogDirs, expectedDirs, false);
   }
 
-  /**
-   * Wait for the NodeManger to go for the disk-health-check at least once.
-   */
-  private void waitForDiskHealthCheck() {
-    long lastDisksCheckTime = dirsHandler.getLastDisksCheckTime();
-    long time = lastDisksCheckTime;
-    for (int i = 0; i < 10 && (time <= lastDisksCheckTime); i++) {
-      try {
-        Thread.sleep(1000);
-      } catch(InterruptedException e) {
-        LOG.error(
-            "Interrupted while waiting for NodeManager's disk health check.");
-      }
-      time = dirsHandler.getLastDisksCheckTime();
-    }
-  }
 
   /**
    * Verify if the NodeManager could identify disk failures.
@@ -228,8 +217,8 @@ public class TestDiskFailures {
    */
   private void verifyDisksHealth(boolean localORLogDirs, String expectedDirs,
       boolean isHealthy) {
-    // Wait for the NodeManager to identify disk failures.
-    waitForDiskHealthCheck();
+    // identify disk failures
+    dirsHandler.checkDirs();
 
     List<String> list = localORLogDirs ? dirsHandler.getLocalDirs()
                                        : dirsHandler.getLogDirs();
@@ -272,7 +261,10 @@ public class TestDiskFailures {
    */
   private void prepareDirToFail(String dir) throws IOException {
     File file = new File(dir);
-    FileUtil.fullyDelete(file);
+    if(!FileUtil.fullyDelete(file)) {
+      throw new IOException("Delete of file was unsuccessful! Path: " +
+          file.getAbsolutePath());
+    }
     file.createNewFile();
     LOG.info("Prepared " + dir + " to fail.");
   }