Browse Source

YARN-9337. GPU auto-discovery script runs even when the resource is given by hand. Contributed by Adam Antal

Szilard Nemeth 5 years ago
parent
commit
61b0c2bb7c

+ 35 - 25
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java

@@ -69,6 +69,8 @@ public class GpuDiscoverer {
   private int numOfErrorExecutionSinceLastSucceed = 0;
   private GpuDeviceInformation lastDiscoveredGpuInformation = null;
 
+  private List<GpuDevice> gpuDevicesFromUser;
+
   private void validateConfOrThrowException() throws YarnException {
     if (conf == null) {
       throw new YarnException("Please initialize (call initialize) before use "
@@ -141,6 +143,14 @@ public class GpuDiscoverer {
     }
   }
 
+  private boolean IsAutoDiscoveryEnabled() {
+    // The auto-discovery marker doubles as the default of the config lookup.
+    final String autoMarker =
+        YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES;
+    return conf.get(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, autoMarker)
+        .equals(autoMarker);
+  }
+
   /**
    * Get list of GPU devices usable by YARN.
    *
@@ -151,15 +161,13 @@ public class GpuDiscoverer {
       throws YarnException {
     validateConfOrThrowException();
 
-    String allowedDevicesStr = conf.get(
-        YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
-        YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
-
-    if (allowedDevicesStr.equals(
-        YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES)) {
+    if (IsAutoDiscoveryEnabled()) {
       return parseGpuDevicesFromAutoDiscoveredGpuInfo();
     } else {
-      return parseGpuDevicesFromUserDefinedValues(allowedDevicesStr);
+      if (gpuDevicesFromUser == null) {
+        gpuDevicesFromUser = parseGpuDevicesFromUserDefinedValues();
+      }
+      return gpuDevicesFromUser;
     }
   }
 
@@ -191,16 +199,16 @@ public class GpuDiscoverer {
   }
 
   /**
-   * @param devices allowed devices coming from the config.
-   *                          Individual devices should be separated by commas.
-   *                          <br>The format of individual devices should be:
-   *                           &lt;index:&gt;&lt;minorNumber&gt;
    * @return List of GpuDevices
    * @throws YarnException when a GPU device is defined as a duplicate.
    * The first duplicate GPU device will be added to the exception message.
    */
-  private List<GpuDevice> parseGpuDevicesFromUserDefinedValues(String devices)
+  private List<GpuDevice> parseGpuDevicesFromUserDefinedValues()
       throws YarnException {
+    String devices = conf.get(
+        YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
+        YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
+
     if (devices.trim().isEmpty()) {
       throw GpuDeviceSpecificationException.createWithEmptyValueSpecified();
     }
@@ -242,19 +250,21 @@ public class GpuDiscoverer {
   public synchronized void initialize(Configuration config)
       throws YarnException {
     this.conf = config;
-    numOfErrorExecutionSinceLastSucceed = 0;
-    lookUpAutoDiscoveryBinary(config);
-
-    // Try to discover GPU information once and print
-    try {
-      LOG.info("Trying to discover GPU information ...");
-      GpuDeviceInformation info = getGpuDeviceInformation();
-      LOG.info("Discovered GPU information: " + info.toString());
-    } catch (YarnException e) {
-      String msg =
-          "Failed to discover GPU information from system, exception message:"
-              + e.getMessage() + " continue...";
-      LOG.warn(msg);
+    if (IsAutoDiscoveryEnabled()) { // binary lookup and probe run only in auto-discovery mode
+      numOfErrorExecutionSinceLastSucceed = 0; // NOTE(review): reset only on this path now; gpuDevicesFromUser is not cleared here either - confirm initialize() runs once
+      lookUpAutoDiscoveryBinary(config);
+
+      // Try to discover GPU information once and log it; a failure is only warned about.
+      try {
+        LOG.info("Trying to discover GPU information ...");
+        GpuDeviceInformation info = getGpuDeviceInformation();
+        LOG.info("Discovered GPU information: " + info.toString());
+      } catch (YarnException e) { // swallowed on purpose: log a warning and continue
+        String msg =
+                "Failed to discover GPU information from system, exception message:"
+                        + e.getMessage() + " continue...";
+        LOG.warn(msg);
+      }
     }
   }
 

+ 18 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java

@@ -40,6 +40,7 @@ import java.util.List;
 import java.util.function.Consumer;
 
 import static org.apache.hadoop.test.PlatformAssumptions.assumeNotWindows;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_GPU_ALLOWED_DEVICES;
 import static org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer.DEFAULT_BINARY_NAME;
 import static org.hamcrest.CoreMatchers.containsString;
 import static org.hamcrest.CoreMatchers.not;
@@ -49,6 +50,9 @@ import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
+import static org.mockito.Mockito.never;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.verify;
 
 public class TestGpuDiscoverer {
   private static final Logger LOG = LoggerFactory.getLogger(
@@ -96,7 +100,7 @@ public class TestGpuDiscoverer {
 
   private Configuration createConfigWithAllowedDevices(String s) {
     Configuration conf = new Configuration(false);
-    conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, s);
+    conf.set(NM_GPU_ALLOWED_DEVICES, s); // s becomes the allowed-devices setting; key is statically imported
     setupFakeBinary(conf);
     return conf;
   }
@@ -495,4 +499,17 @@ public class TestGpuDiscoverer {
           "executable in the default directories:"));
     }
   }
+
+  @Test
+  public void testScriptNotCalled() throws YarnException {
+    // Devices are listed by hand, so the discovery call must never happen.
+    Configuration manualConf = new Configuration();
+    manualConf.set(NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
+    GpuDiscoverer discovererSpy = spy(GpuDiscoverer.class);
+
+    discovererSpy.initialize(manualConf);
+    discovererSpy.getGpusUsableByYarn();
+
+    verify(discovererSpy, never()).getGpuDeviceInformation();
+  }
 }