Przeglądaj źródła

YARN-3503. Expose disk utilization percentage and bad local and log dir counts in NM metrics. Contributed by Varun Vasudev
(cherry picked from commit 674c7ef64916fabbe59c8d6cdd50ca19cf7ddb7c)

Conflicts:
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java

Jian He 10 lat temu
rodzic
commit
613a783380

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -106,6 +106,9 @@ Release 2.8.0 - UNRELEASED
     YARN-3494. Expose AM resource limit and usage in CS QueueMetrics. (Rohith
     Sharmaks via jianhe)
 
+    YARN-3503. Expose disk utilization percentage and bad local and log dir 
+    counts in NM metrics. (Varun Vasudev via jianhe)
+
   OPTIMIZATIONS
 
     YARN-3339. TestDockerContainerExecutor should pull a single image and not

+ 31 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DirectoryCollection.java

@@ -82,6 +82,8 @@ class DirectoryCollection {
   private float diskUtilizationPercentageCutoff;
   private long diskUtilizationSpaceCutoff;
 
+  private int goodDirsDiskUtilizationPercentage;
+
   /**
    * Create collection for the directories specified. No check for free space.
    * 
@@ -277,6 +279,7 @@ class DirectoryCollection {
             + dirsFailedCheck.get(dir).message);
       }
     }
+    setGoodDirsDiskUtilizationPercentage();
     return setChanged;
   }
 
@@ -390,4 +393,32 @@ class DirectoryCollection {
         diskUtilizationSpaceCutoff < 0 ? 0 : diskUtilizationSpaceCutoff;
     this.diskUtilizationSpaceCutoff = diskUtilizationSpaceCutoff;
   }
+
+  /**
+   * Recomputes the combined disk utilization percentage of the entries in
+   * {@code localDirs} and stores it in
+   * {@code goodDirsDiskUtilizationPercentage}.
+   *
+   * Entries that are not directories are ignored. When the summed total
+   * space is zero the percentage is reset to 0. A computed value that does
+   * not lie strictly between {@code Integer.MIN_VALUE} and
+   * {@code Integer.MAX_VALUE} is not stored, so the previous value is kept.
+   */
+  private void setGoodDirsDiskUtilizationPercentage() {
+    long capacityBytes = 0;
+    long freeBytes = 0;
+
+    for (String path : localDirs) {
+      File candidate = new File(path);
+      if (candidate.isDirectory()) {
+        capacityBytes += candidate.getTotalSpace();
+        freeBytes += candidate.getUsableSpace();
+      }
+    }
+
+    if (capacityBytes == 0) {
+      // got no good dirs
+      goodDirsDiskUtilizationPercentage = 0;
+      return;
+    }
+
+    long usedPercent = ((capacityBytes - freeBytes) * 100) / capacityBytes;
+    boolean fitsInInt =
+        usedPercent > Integer.MIN_VALUE && usedPercent < Integer.MAX_VALUE;
+    if (fitsInInt) {
+      goodDirsDiskUtilizationPercentage = (int) usedPercent;
+    }
+  }
+
+  /**
+   * Returns the utilization percentage most recently stored by
+   * {@code setGoodDirsDiskUtilizationPercentage()}.
+   *
+   * @return the stored disk utilization percentage
+   */
+  public int getGoodDirsDiskUtilizationPercentage() {
+    return this.goodDirsDiskUtilizationPercentage;
+  }
 }

+ 21 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LocalDirsHandlerService.java

@@ -38,6 +38,7 @@ import org.apache.hadoop.service.AbstractService;
 import org.apache.hadoop.util.StringUtils;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 
 /**
  * The class which provides functionality of checking the health of the local
@@ -84,6 +85,8 @@ public class LocalDirsHandlerService extends AbstractService {
   
   private static String FILE_SCHEME = "file";
 
+  private NodeManagerMetrics nodeManagerMetrics = null;
+
   /**
    * Class which is used by the {@link Timer} class to periodically execute the
    * disks' health checker code.
@@ -119,7 +122,12 @@ public class LocalDirsHandlerService extends AbstractService {
   }
 
   public LocalDirsHandlerService() {
+    // Delegate without a metrics object; updateMetrics() publishes nothing
+    // while nodeManagerMetrics is null.
+    this(null);
+  }
+
+  /**
+   * Creates the service and remembers the metrics object that
+   * {@code updateMetrics()} publishes directory statistics to.
+   *
+   * @param nodeManagerMetrics metrics object to update; may be
+   *          {@code null}, in which case no metrics are published
+   */
+  public LocalDirsHandlerService(NodeManagerMetrics nodeManagerMetrics) {
     super(LocalDirsHandlerService.class.getName());
+    this.nodeManagerMetrics = nodeManagerMetrics;
   }
 
   /**
@@ -389,6 +397,8 @@ public class LocalDirsHandlerService extends AbstractService {
       updateDirsAfterTest();
     }
 
+    updateMetrics();
+
     lastDisksCheckTime = System.currentTimeMillis();
   }
 
@@ -462,4 +472,15 @@ public class LocalDirsHandlerService extends AbstractService {
     validPaths.toArray(arrValidPaths);
     return arrValidPaths;
   }
+
+  /**
+   * Pushes the failed-directory counts and the disk utilization percentages
+   * of the local and log directory collections into
+   * {@code nodeManagerMetrics}. Does nothing when no metrics object was
+   * supplied.
+   */
+  protected void updateMetrics() {
+    if (nodeManagerMetrics == null) {
+      return;
+    }
+
+    int failedLocalDirs = localDirs.getFailedDirs().size();
+    nodeManagerMetrics.setBadLocalDirs(failedLocalDirs);
+
+    int failedLogDirs = logDirs.getFailedDirs().size();
+    nodeManagerMetrics.setBadLogDirs(failedLogDirs);
+
+    int localUtilization = localDirs.getGoodDirsDiskUtilizationPercentage();
+    nodeManagerMetrics.setGoodLocalDirsDiskUtilizationPerc(localUtilization);
+
+    int logUtilization = logDirs.getGoodDirsDiskUtilizationPercentage();
+    nodeManagerMetrics.setGoodLogDirsDiskUtilizationPerc(logUtilization);
+  }
 }

+ 2 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeManager.java

@@ -239,6 +239,8 @@ public class NodeManager extends CompositeService
     this.dispatcher = new AsyncDispatcher();
 
     nodeHealthChecker = new NodeHealthCheckerService();
+    dirsHandler = new LocalDirsHandlerService(metrics);
+
     addService(nodeHealthChecker);
     dirsHandler = nodeHealthChecker.getDiskHandler();
 

+ 48 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/metrics/NodeManagerMetrics.java

@@ -48,6 +48,15 @@ public class NodeManagerMetrics {
   @Metric MutableGaugeInt availableVCores;
   @Metric("Container launch duration")
       MutableRate containerLaunchDuration;
+  @Metric("# of bad local dirs")
+      MutableGaugeInt badLocalDirs;
+  @Metric("# of bad log dirs")
+      MutableGaugeInt badLogDirs;
+  @Metric("Disk utilization % on good local dirs")
+      MutableGaugeInt goodLocalDirsDiskUtilizationPerc;
+  @Metric("Disk utilization % on good log dirs")
+      MutableGaugeInt goodLogDirsDiskUtilizationPerc;
+
 
   private long allocatedMB;
   private long availableMB;
@@ -125,6 +134,24 @@ public class NodeManagerMetrics {
     containerLaunchDuration.add(value);
   }
 
+  /**
+   * Sets the "# of bad local dirs" gauge.
+   *
+   * @param badLocalDirs value to store in the gauge
+   */
+  public void setBadLocalDirs(int badLocalDirs) {
+    this.badLocalDirs.set(badLocalDirs);
+  }
+
+  /**
+   * Sets the "# of bad log dirs" gauge.
+   *
+   * @param badLogDirs value to store in the gauge
+   */
+  public void setBadLogDirs(int badLogDirs) {
+    this.badLogDirs.set(badLogDirs);
+  }
+
+  /**
+   * Sets the "Disk utilization % on good local dirs" gauge.
+   *
+   * @param goodLocalDirsDiskUtilizationPerc value to store in the gauge
+   */
+  public void setGoodLocalDirsDiskUtilizationPerc(
+      int goodLocalDirsDiskUtilizationPerc) {
+    this.goodLocalDirsDiskUtilizationPerc.set(goodLocalDirsDiskUtilizationPerc);
+  }
+
+  /**
+   * Sets the "Disk utilization % on good log dirs" gauge.
+   *
+   * @param goodLogDirsDiskUtilizationPerc value to store in the gauge
+   */
+  public void setGoodLogDirsDiskUtilizationPerc(
+      int goodLogDirsDiskUtilizationPerc) {
+    this.goodLogDirsDiskUtilizationPerc.set(goodLogDirsDiskUtilizationPerc);
+  }
+
   public int getRunningContainers() {
     return containersRunning.value();
   }
@@ -143,4 +170,25 @@ public class NodeManagerMetrics {
   public int getCompletedContainers() {
     return containersCompleted.value();
   }
+
+  /**
+   * Reads the "# of bad log dirs" gauge.
+   *
+   * @return the gauge's current value
+   */
+  @VisibleForTesting
+  public int getBadLogDirs() {
+    return this.badLogDirs.value();
+  }
+
+  /**
+   * Reads the "# of bad local dirs" gauge.
+   *
+   * @return the gauge's current value
+   */
+  @VisibleForTesting
+  public int getBadLocalDirs() {
+    return this.badLocalDirs.value();
+  }
+
+  /**
+   * Reads the "Disk utilization % on good log dirs" gauge.
+   *
+   * @return the gauge's current value
+   */
+  @VisibleForTesting
+  public int getGoodLogDirsDiskUtilizationPerc() {
+    return this.goodLogDirsDiskUtilizationPerc.value();
+  }
+
+  /**
+   * Reads the "Disk utilization % on good local dirs" gauge.
+   *
+   * @return the gauge's current value
+   */
+  @VisibleForTesting
+  public int getGoodLocalDirsDiskUtilizationPerc() {
+    return this.goodLocalDirsDiskUtilizationPerc.value();
+  }
+
 }

+ 14 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestDirectoryCollection.java

@@ -129,24 +129,38 @@ public class TestDirectoryCollection {
     Assert.assertEquals(0, dc.getGoodDirs().size());
     Assert.assertEquals(1, dc.getFailedDirs().size());
     Assert.assertEquals(1, dc.getFullDirs().size());
+    // no good dirs
+    Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
 
     dc = new DirectoryCollection(dirs, 100.0F);
+    int utilizedSpacePerc =
+        (int) ((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 /
+            testDir.getTotalSpace());
     dc.checkDirs();
     Assert.assertEquals(1, dc.getGoodDirs().size());
     Assert.assertEquals(0, dc.getFailedDirs().size());
     Assert.assertEquals(0, dc.getFullDirs().size());
+    Assert.assertEquals(utilizedSpacePerc,
+      dc.getGoodDirsDiskUtilizationPercentage());
 
     dc = new DirectoryCollection(dirs, testDir.getTotalSpace() / (1024 * 1024));
     dc.checkDirs();
     Assert.assertEquals(0, dc.getGoodDirs().size());
     Assert.assertEquals(1, dc.getFailedDirs().size());
     Assert.assertEquals(1, dc.getFullDirs().size());
+    // no good dirs
+    Assert.assertEquals(0, dc.getGoodDirsDiskUtilizationPercentage());
 
     dc = new DirectoryCollection(dirs, 100.0F, 0);
+    utilizedSpacePerc =
+        (int)((testDir.getTotalSpace() - testDir.getUsableSpace()) * 100 /
+            testDir.getTotalSpace());
     dc.checkDirs();
     Assert.assertEquals(1, dc.getGoodDirs().size());
     Assert.assertEquals(0, dc.getFailedDirs().size());
     Assert.assertEquals(0, dc.getFullDirs().size());
+    Assert.assertEquals(utilizedSpacePerc,
+      dc.getGoodDirsDiskUtilizationPercentage());
   }
 
   @Test

+ 30 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLocalDirsHandlerService.java

@@ -31,6 +31,7 @@ import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.service.Service.STATE;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 import org.junit.After;
 import org.junit.Assert;
 import org.junit.Before;
@@ -106,12 +107,40 @@ public class TestLocalDirsHandlerService {
     conf.set(YarnConfiguration.NM_LOG_DIRS, logDir1 + "," + logDir2);
     conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
       0.0f);
-    LocalDirsHandlerService dirSvc = new LocalDirsHandlerService();
+    NodeManagerMetrics nm = NodeManagerMetrics.create();
+    LocalDirsHandlerService dirSvc = new LocalDirsHandlerService(nm);
     dirSvc.init(conf);
     Assert.assertEquals(0, dirSvc.getLocalDirs().size());
     Assert.assertEquals(0, dirSvc.getLogDirs().size());
     Assert.assertEquals(1, dirSvc.getDiskFullLocalDirs().size());
     Assert.assertEquals(1, dirSvc.getDiskFullLogDirs().size());
+    // check the metrics
+    Assert.assertEquals(2, nm.getBadLocalDirs());
+    Assert.assertEquals(2, nm.getBadLogDirs());
+    Assert.assertEquals(0, nm.getGoodLocalDirsDiskUtilizationPerc());
+    Assert.assertEquals(0, nm.getGoodLogDirsDiskUtilizationPerc());
+
+    conf.setFloat(YarnConfiguration.NM_MAX_PER_DISK_UTILIZATION_PERCENTAGE,
+      100.0f);
+    nm = NodeManagerMetrics.create();
+    dirSvc = new LocalDirsHandlerService(nm);
+    dirSvc.init(conf);
+    Assert.assertEquals(1, dirSvc.getLocalDirs().size());
+    Assert.assertEquals(1, dirSvc.getLogDirs().size());
+    Assert.assertEquals(0, dirSvc.getDiskFullLocalDirs().size());
+    Assert.assertEquals(0, dirSvc.getDiskFullLogDirs().size());
+    // check the metrics
+    File dir = new File(localDir1);
+    int utilizationPerc =
+        (int) ((dir.getTotalSpace() - dir.getUsableSpace()) * 100 /
+            dir.getTotalSpace());
+    Assert.assertEquals(1, nm.getBadLocalDirs());
+    Assert.assertEquals(1, nm.getBadLogDirs());
+    Assert.assertEquals(utilizationPerc,
+      nm.getGoodLocalDirsDiskUtilizationPerc());
+    Assert
+      .assertEquals(utilizationPerc, nm.getGoodLogDirsDiskUtilizationPerc());
+
     FileUtils.deleteDirectory(new File(localDir1));
     FileUtils.deleteDirectory(new File(localDir2));
     FileUtils.deleteDirectory(new File(logDir1));