Browse Source

YARN-9217. Nodemanager will fail to start if GPU is misconfigured on the node or GPU drivers missing. Contributed by Peter Bacsko

Szilard Nemeth 5 năm trước cách đây
mục cha
commit
e8fa192f07
11 tập tin đã thay đổi với 256 bổ sung và 77 xóa
  1. 14 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
  2. 11 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
  3. 42 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java
  4. 4 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java
  5. 3 3
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java
  6. 46 37
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java
  7. 10 3
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java
  8. 30 5
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java
  9. 63 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java
  10. 10 6
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandlerImpl.java
  11. 23 22
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java

+ 14 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

@@ -1647,6 +1647,20 @@ public class YarnConfiguration extends Configuration {
   public static final String NM_RESOURCE_PLUGINS =
       NM_PREFIX + "resource-plugins";
 
+
+  /**
+   * Specifies whether the Node Manager should fail fast when a certain
+   * device (GPU, FPGA, etc) was not found in the system. If set to "true",
+   * an exception will be thrown if a device is missing or an error occurred
+   * during discovery; if set to "false", initialization continues instead.
+   */
+  @Private
+  public static final String NM_RESOURCE_PLUGINS_FAIL_FAST =
+      NM_RESOURCE_PLUGINS + ".fail-fast";
+
+  @Private
+  public static final boolean DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST = true;
+
   /**
    * This setting controls if pluggable device plugin framework is enabled.
    * */

+ 11 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

@@ -3918,6 +3918,17 @@
     <value></value>
   </property>
 
+  <property>
+    <description>
+      Specifies whether the Node Manager should fail fast when a certain
+      device (GPU, FPGA, etc) was not found in the system. If set to "true",
+      an exception will be thrown if a device is missing or an error occurred
+      during discovery; if set to "false", initialization continues instead.
+    </description>
+    <name>yarn.nodemanager.resource-plugins.fail-fast</name>
+    <value></value>
+  </property>
+
   <property>
     <description>
       Specify GPU devices which can be managed by YARN NodeManager, split by comma

+ 42 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourcesExceptionUtil.java

@@ -0,0 +1,42 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources;
+
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST;
+import static org.apache.hadoop.yarn.conf.YarnConfiguration.NM_RESOURCE_PLUGINS_FAIL_FAST;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+
+/**
+ * Small utility class whose {@code throwIfNecessary} re-throws the given
+ * YarnException only if the NM_RESOURCE_PLUGINS_FAIL_FAST property is true
+ * (default: DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST); otherwise it returns.
+ */
+public final class ResourcesExceptionUtil {
+  private ResourcesExceptionUtil() {}
+
+  public static void throwIfNecessary(YarnException e, Configuration conf)
+      throws YarnException {
+    if (conf.getBoolean(NM_RESOURCE_PLUGINS_FAIL_FAST,
+        DEFAULT_NM_RESOURCE_PLUGINS_FAIL_FAST)) {
+      throw e;
+    }
+  }
+}

+ 4 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/GpuResourceHandlerImpl.java

@@ -18,6 +18,8 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.gpu;
 
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -75,7 +77,8 @@ public class GpuResourceHandlerImpl implements ResourceHandler {
         String message = "GPU is enabled on the NodeManager, but couldn't find "
             + "any usable GPU devices, please double check configuration!";
         LOG.error(message);
-        throw new ResourceHandlerException(message);
+        throwIfNecessary(new ResourceHandlerException(message),
+            configuration);
       }
     } catch (YarnException e) {
       LOG.error("Exception when trying to get usable GPU device", e);

+ 3 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/ResourcePluginManager.java

@@ -72,7 +72,7 @@ public class ResourcePluginManager {
 
     Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
     if (plugins != null) {
-      pluginMap = initializePlugins(context, plugins);
+      pluginMap = initializePlugins(conf, context, plugins);
     }
 
     // Try to load pluggable device plugins
@@ -101,7 +101,7 @@ public class ResourcePluginManager {
     return plugins;
   }
 
-  private Map<String, ResourcePlugin> initializePlugins(
+  private Map<String, ResourcePlugin> initializePlugins(Configuration conf,
       Context context, String[] plugins) throws YarnException {
     Map<String, ResourcePlugin> pluginMap = Maps.newHashMap();
 
@@ -114,7 +114,7 @@ public class ResourcePluginManager {
         if (resourceName.equals(GPU_URI)) {
           final GpuDiscoverer gpuDiscoverer = new GpuDiscoverer();
           final GpuNodeResourceUpdateHandler updateHandler =
-              new GpuNodeResourceUpdateHandler(gpuDiscoverer);
+              new GpuNodeResourceUpdateHandler(gpuDiscoverer, conf);
           plugin = new GpuResourcePlugin(updateHandler, gpuDiscoverer);
         } else if (resourceName.equals(FPGA_URI)) {
           plugin = new FpgaResourcePlugin();

+ 46 - 37
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuDiscoverer.java

@@ -18,6 +18,8 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
 
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
+
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.Lists;
@@ -26,7 +28,6 @@ import org.apache.hadoop.classification.InterfaceAudience;
 import org.apache.hadoop.classification.InterfaceStability;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
-import org.apache.hadoop.util.Shell;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
@@ -58,10 +59,9 @@ public class GpuDiscoverer extends Configured {
   private static final Set<String> DEFAULT_BINARY_SEARCH_DIRS = ImmutableSet.of(
       "/usr/bin", "/bin", "/usr/local/nvidia/bin");
 
-  // command should not run more than 10 sec.
-  private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
   private static final int MAX_REPEATED_ERROR_ALLOWED = 10;
 
+  private NvidiaBinaryHelper nvidiaBinaryHelper;
   private String pathOfGpuBinary = null;
   private Map<String, String> environment = new HashMap<>();
 
@@ -110,24 +110,17 @@ public class GpuDiscoverer extends Configured {
    * @return GpuDeviceInformation
    * @throws YarnException when any error happens
    */
-  synchronized GpuDeviceInformation getGpuDeviceInformation()
+  public synchronized GpuDeviceInformation getGpuDeviceInformation()
       throws YarnException {
-    validateConfOrThrowException();
-
     if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
       String msg = getErrorMessageOfScriptExecutionThresholdReached();
       LOG.error(msg);
       throw new YarnException(msg);
     }
 
-    String output;
     try {
-      output = Shell.execCommand(environment,
-          new String[] { pathOfGpuBinary, "-x", "-q" }, MAX_EXEC_TIMEOUT_MS);
-      GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
-      lastDiscoveredGpuInformation = parser.parseXml(output);
-      numOfErrorExecutionSinceLastSucceed = 0;
-      return lastDiscoveredGpuInformation;
+      lastDiscoveredGpuInformation =
+          nvidiaBinaryHelper.getGpuDeviceInformation(pathOfGpuBinary);
     } catch (IOException e) {
       numOfErrorExecutionSinceLastSucceed++;
       String msg = getErrorMessageOfScriptExecution(e.getMessage());
@@ -136,14 +129,14 @@ public class GpuDiscoverer extends Configured {
     } catch (YarnException e) {
       numOfErrorExecutionSinceLastSucceed++;
       String msg = getFailedToParseErrorMessage(e.getMessage());
-      if (LOG.isDebugEnabled()) {
-        LOG.warn(msg, e);
-      }
+      LOG.debug(msg, e);
       throw e;
     }
+
+    return lastDiscoveredGpuInformation;
   }
 
-  private boolean IsAutoDiscoveryEnabled() {
+  private boolean isAutoDiscoveryEnabled() {
     String allowedDevicesStr = getConf().get(
         YarnConfiguration.NM_GPU_ALLOWED_DEVICES,
         YarnConfiguration.AUTOMATICALLY_DISCOVER_GPU_DEVICES);
@@ -155,13 +148,12 @@ public class GpuDiscoverer extends Configured {
    * Get list of GPU devices usable by YARN.
    *
    * @return List of GPU devices
-   * @throws YarnException when any issue happens
    */
   public synchronized List<GpuDevice> getGpusUsableByYarn()
       throws YarnException {
     validateConfOrThrowException();
 
-    if (IsAutoDiscoveryEnabled()) {
+    if (isAutoDiscoveryEnabled()) {
       return parseGpuDevicesFromAutoDiscoveredGpuInfo();
     } else {
       if (gpuDevicesFromUser == null) {
@@ -217,16 +209,27 @@ public class GpuDiscoverer extends Configured {
       if (device.trim().length() > 0) {
         String[] splitByColon = device.trim().split(":");
         if (splitByColon.length != 2) {
-          throw GpuDeviceSpecificationException.
-              createWithWrongValueSpecified(device, devices);
+          throwIfNecessary(GpuDeviceSpecificationException
+              .createWithWrongValueSpecified(device, devices), getConf());
+          LOG.warn("Wrong GPU specification string {}, ignored", device);
+          continue; // fail-fast is off: skip the entry instead of parsing it
+        }
+        GpuDevice gpuDevice;
+        try {
+          gpuDevice = parseGpuDevice(splitByColon);
+        } catch (NumberFormatException e) {
+          throwIfNecessary(GpuDeviceSpecificationException
+              .createWithWrongValueSpecified(device, devices, e), getConf());
+          LOG.warn("Cannot parse GPU device numbers: {}", device);
+          continue;
         }
 
-        GpuDevice gpuDevice = parseGpuDevice(device, splitByColon, devices);
         if (!gpuDevices.contains(gpuDevice)) {
           gpuDevices.add(gpuDevice);
         } else {
-          throw GpuDeviceSpecificationException
-              .createWithDuplicateValueSpecified(device, devices);
+          throwIfNecessary(GpuDeviceSpecificationException
+              .createWithDuplicateValueSpecified(device, devices), getConf());
+          LOG.warn("GPU device is duplicated: {}", device);
         }
       }
     }
@@ -235,22 +238,17 @@ public class GpuDiscoverer extends Configured {
     return gpuDevices;
   }
 
-  private GpuDevice parseGpuDevice(String device, String[] splitByColon,
-      String allowedDevicesStr) throws YarnException {
-    try {
-      int index = Integer.parseInt(splitByColon[0]);
-      int minorNumber = Integer.parseInt(splitByColon[1]);
-      return new GpuDevice(index, minorNumber);
-    } catch (NumberFormatException e) {
-      throw GpuDeviceSpecificationException.
-          createWithWrongValueSpecified(device, allowedDevicesStr, e);
-    }
+  private GpuDevice parseGpuDevice(String[] splitByColon) {
+    int index = Integer.parseInt(splitByColon[0]);
+    int minorNumber = Integer.parseInt(splitByColon[1]);
+    return new GpuDevice(index, minorNumber);
   }
 
-  public synchronized void initialize(Configuration config)
-      throws YarnException {
+  public synchronized void initialize(Configuration config,
+      NvidiaBinaryHelper nvidiaHelper) throws YarnException {
     setConf(config);
-    if (IsAutoDiscoveryEnabled()) {
+    this.nvidiaBinaryHelper = nvidiaHelper;
+    if (isAutoDiscoveryEnabled()) {
       numOfErrorExecutionSinceLastSucceed = 0;
       lookUpAutoDiscoveryBinary(config);
 
@@ -284,7 +282,18 @@ public class GpuDiscoverer extends Configured {
       binaryPath = handleConfiguredBinaryPathIsDirectory(configuredBinaryFile);
     } else {
       binaryPath = configuredBinaryFile;
+      // Path exists but is not named DEFAULT_BINARY_NAME: fail fast or warn
+      String fileName = binaryPath.getName();
+      if (!DEFAULT_BINARY_NAME.equals(fileName)) {
+        String msg = String.format("Please check the configuration value of"
+             +" %s. It should point to an %s binary.",
+             YarnConfiguration.NM_GPU_PATH_TO_EXEC,
+             DEFAULT_BINARY_NAME);
+        throwIfNecessary(new YarnException(msg), config);
+        LOG.warn(msg);
+      }
     }
+
     pathOfGpuBinary = binaryPath.getAbsolutePath();
   }
 

+ 10 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuNodeResourceUpdateHandler.java

@@ -18,6 +18,9 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
 
+import static org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourcesExceptionUtil.throwIfNecessary;
+
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.yarn.api.records.Resource;
 import org.apache.hadoop.yarn.api.records.ResourceInformation;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
@@ -36,9 +39,12 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
   private static final Logger LOG =
       LoggerFactory.getLogger(GpuNodeResourceUpdateHandler.class);
   private final GpuDiscoverer gpuDiscoverer;
+  private Configuration conf;
 
-  public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer) {
+  public GpuNodeResourceUpdateHandler(GpuDiscoverer gpuDiscoverer,
+      Configuration conf) {
     this.gpuDiscoverer = gpuDiscoverer;
+    this.conf = conf;
   }
 
   @Override
@@ -51,7 +57,8 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
           "but could not find any usable GPUs on the NodeManager!";
       LOG.error(message);
       // No gpu can be used by YARN.
-      throw new YarnException(message);
+      throwIfNecessary(new YarnException(message), conf);
+      return;
     }
 
     long nUsableGpus = usableGpus.size();
@@ -59,7 +66,7 @@ public class GpuNodeResourceUpdateHandler extends NodeResourceUpdaterPlugin {
     Map<String, ResourceInformation> configuredResourceTypes =
         ResourceUtils.getResourceTypes();
     if (!configuredResourceTypes.containsKey(GPU_URI)) {
-      throw new YarnException("Found " + nUsableGpus + " usable GPUs, however "
+      LOG.warn("Found " + nUsableGpus + " usable GPUs, however "
           + GPU_URI
           + " resource-type is not configured inside"
           + " resource-types.xml, please configure it to enable GPU feature or"

+ 30 - 5
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/GpuResourcePlugin.java

@@ -18,6 +18,8 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
 
+import java.util.List;
+
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.server.nodemanager.Context;
@@ -32,8 +34,6 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin
 import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.NMResourceInfo;
 import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
 import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.NMGpuResourceInfo;
-
-import java.util.List;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -44,6 +44,10 @@ public class GpuResourcePlugin implements ResourcePlugin {
 
   private final GpuNodeResourceUpdateHandler resourceDiscoverHandler;
   private final GpuDiscoverer gpuDiscoverer;
+  public static final int MAX_REPEATED_ERROR_ALLOWED = 10;
+
+  private int numOfErrorExecutionSinceLastSucceed = 0;
+
   private GpuResourceHandlerImpl gpuResourceHandler = null;
   private DockerCommandPlugin dockerCommandPlugin = null;
 
@@ -55,7 +59,8 @@ public class GpuResourcePlugin implements ResourcePlugin {
 
   @Override
   public void initialize(Context context) throws YarnException {
-    this.gpuDiscoverer.initialize(context.getConf());
+    this.gpuDiscoverer.initialize(context.getConf(),
+        new NvidiaBinaryHelper());
     this.dockerCommandPlugin =
         GpuDockerCommandPluginFactory.createGpuDockerCommandPlugin(
             context.getConf());
@@ -89,12 +94,21 @@ public class GpuResourcePlugin implements ResourcePlugin {
 
   @Override
   public synchronized NMResourceInfo getNMResourceInfo() throws YarnException {
-    GpuDeviceInformation gpuDeviceInformation =
-        gpuDiscoverer.getGpuDeviceInformation();
+    GpuDeviceInformation gpuDeviceInformation;
 
     //At this point the gpu plugin is already enabled
     checkGpuResourceHandler();
 
+    checkErrorCount();
+    try{
+      gpuDeviceInformation = gpuDiscoverer.getGpuDeviceInformation();
+      numOfErrorExecutionSinceLastSucceed = 0;
+    } catch (YarnException e) {
+      LOG.error(e.getMessage(), e);
+      numOfErrorExecutionSinceLastSucceed++;
+      throw e;
+    }
+
     GpuResourceAllocator gpuResourceAllocator =
         gpuResourceHandler.getGpuAllocator();
     List<GpuDevice> totalGpus = gpuResourceAllocator.getAllowedGpus();
@@ -116,6 +130,17 @@ public class GpuResourcePlugin implements ResourcePlugin {
     }
   }
 
+  private void checkErrorCount() throws YarnException {
+    if (numOfErrorExecutionSinceLastSucceed == MAX_REPEATED_ERROR_ALLOWED) {
+      String msg =
+          "Failed to execute GPU device information detection script for "
+              + MAX_REPEATED_ERROR_ALLOWED
+              + " times, skip following executions.";
+      LOG.error(msg);
+      throw new YarnException(msg);
+    }
+  }
+
   @Override
   public String toString() {
     return GpuResourcePlugin.class.getName();

+ 63 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/NvidiaBinaryHelper.java

@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu;
+
+import java.io.IOException;
+import java.util.HashMap;
+
+import org.apache.hadoop.util.Shell;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformation;
+import org.apache.hadoop.yarn.server.nodemanager.webapp.dao.gpu.GpuDeviceInformationParser;
+
+/**
+ * Executes the "nvidia-smi" command ("-x -q") and returns the object parsed
+ * from its output. NOTE(review): the command runs with an empty environment
+ * map, not the one kept by GpuDiscoverer - confirm this is intended.
+ */
+public class NvidiaBinaryHelper {
+  /**
+   * Timeout passed to the command execution: 10 sec, in milliseconds.
+   */
+  private static final int MAX_EXEC_TIMEOUT_MS = 10 * 1000;
+
+  /**
+   * Runs the binary and parses its output with GpuDeviceInformationParser.
+   * @param pathOfGpuBinary The path of the binary, must not be null
+   * @return the GpuDeviceInformation parsed from the nvidia-smi output
+   * @throws IOException if executing the binary fails
+   * @throws YarnException if pathOfGpuBinary is null or the parse failed
+   */
+  synchronized GpuDeviceInformation getGpuDeviceInformation(
+      String pathOfGpuBinary) throws IOException, YarnException {
+    GpuDeviceInformationParser parser = new GpuDeviceInformationParser();
+
+    if (pathOfGpuBinary == null) {
+      throw new YarnException(
+          "Failed to find GPU discovery executable, please double check "
+              + YarnConfiguration.NM_GPU_PATH_TO_EXEC + " setting.");
+    }
+
+    String output = Shell.execCommand(new HashMap<>(),
+        new String[]{pathOfGpuBinary, "-x", "-q"}, MAX_EXEC_TIMEOUT_MS);
+    return parser.parseXml(output);
+  }
+}

+ 10 - 6
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/gpu/TestGpuResourceHandlerImpl.java

@@ -41,6 +41,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resource
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.resources.ResourceHandlerException;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDevice;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.GpuDiscoverer;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.resourceplugin.gpu.NvidiaBinaryHelper;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerRuntimeConstants;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMStateStoreService;
@@ -116,10 +117,12 @@ public class TestGpuResourceHandlerImpl {
   @Rule
   public ExpectedException expected = ExpectedException.none();
 
+  private NvidiaBinaryHelper nvidiaBinaryHelper;
+
   @Before
   public void setup() throws IOException {
     createTestDataDirectory();
-
+    nvidiaBinaryHelper = new NvidiaBinaryHelper();
     CustomResourceTypesConfigurationProvider.
         initResourceTypes(ResourceInformation.GPU_URI);
 
@@ -147,13 +150,14 @@ public class TestGpuResourceHandlerImpl {
   @After
   public void cleanupTestFiles() throws IOException {
     FileUtils.deleteDirectory(testDataDirectory);
+    nvidiaBinaryHelper = new NvidiaBinaryHelper();
   }
 
   @Test
   public void testBootstrapWithRealGpuDiscoverer() throws Exception {
     Configuration conf = createDefaultConfig();
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0");
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     gpuResourceHandler.bootstrap(conf);
 
@@ -171,7 +175,7 @@ public class TestGpuResourceHandlerImpl {
   public void testBootstrapWithMockGpuDiscoverer() throws Exception {
     GpuDiscoverer mockDiscoverer = mock(GpuDiscoverer.class);
     Configuration conf = new YarnConfiguration();
-    mockDiscoverer.initialize(conf);
+    mockDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     expected.expect(ResourceHandlerException.class);
     gpuResourceHandler.bootstrap(conf);
@@ -271,7 +275,7 @@ public class TestGpuResourceHandlerImpl {
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:0,1:1,2:3,3:4");
 
     gpuDiscoverer = new GpuDiscoverer();
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
     Context nmContext = createMockNmContext(conf);
     gpuResourceHandler = new GpuResourceHandlerImpl(nmContext,
         mockCGroupsHandler, mockPrivilegedExecutor, gpuDiscoverer);
@@ -380,7 +384,7 @@ public class TestGpuResourceHandlerImpl {
   public void testAllocationWithoutAllowedGpus() throws Exception {
     Configuration conf = createDefaultConfig();
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, " ");
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     try {
       gpuResourceHandler.bootstrap(conf);
@@ -461,7 +465,7 @@ public class TestGpuResourceHandlerImpl {
         new GpuResourceHandlerImpl(nmnctx, mockCGroupsHandler,
         mockPrivilegedExecutor, gpuDiscoverer);
 
-    gpuDiscoverer.initialize(conf);
+    gpuDiscoverer.initialize(conf, nvidiaBinaryHelper);
 
     gpuNULLStateResourceHandler.bootstrap(conf);
     verifyNumberOfAvailableGpus(4, gpuNULLStateResourceHandler);

+ 23 - 22
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/resourceplugin/gpu/TestGpuDiscoverer.java

@@ -64,6 +64,7 @@ public class TestGpuDiscoverer {
   private static final String BASH_SHEBANG = "#!/bin/bash\n\n";
   private static final String TEST_PARENT_DIR = new File("target/temp/" +
       TestGpuDiscoverer.class.getName()).getAbsolutePath();
+  private NvidiaBinaryHelper binaryHelper = new NvidiaBinaryHelper();
 
   @Rule
   public ExpectedException exception = ExpectedException.none();
@@ -150,7 +151,7 @@ public class TestGpuDiscoverer {
       Configuration conf) throws YarnException {
     conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, TEST_PARENT_DIR);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     return discoverer;
   }
 
@@ -163,14 +164,14 @@ public class TestGpuDiscoverer {
     // test case 1, check default setting.
     Configuration conf = new Configuration(false);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     assertEquals(DEFAULT_BINARY_NAME, discoverer.getPathOfGpuBinary());
     assertNvidiaIsOnPath(discoverer);
 
     // test case 2, check mandatory set path.
     File fakeBinary = setupFakeBinary(conf);
     discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     assertEquals(fakeBinary.getAbsolutePath(),
         discoverer.getPathOfGpuBinary());
     assertNull(discoverer.getEnvironmentToRunCommand().get(PATH));
@@ -179,7 +180,7 @@ public class TestGpuDiscoverer {
     // but binary doesn't exist so default path will be used.
     fakeBinary.delete();
     discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     assertEquals(DEFAULT_BINARY_NAME,
         discoverer.getPathOfGpuBinary());
     assertNvidiaIsOnPath(discoverer);
@@ -317,7 +318,7 @@ public class TestGpuDiscoverer {
         Boolean.valueOf(System.getProperty("runGpuDiscoverUnitTest")));
     Configuration conf = new Configuration(false);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     GpuDeviceInformation info = discoverer.getGpuDeviceInformation();
 
     assertTrue(info.getGpus().size() > 0);
@@ -331,7 +332,7 @@ public class TestGpuDiscoverer {
     Configuration conf = createConfigWithAllowedDevices("1:2");
 
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
     assertEquals(1, usableGpuDevices.size());
 
@@ -346,7 +347,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -354,7 +355,7 @@ public class TestGpuDiscoverer {
   public void testGetNumberOfUsableGpusFromConfig() throws YarnException {
     Configuration conf = createConfigWithAllowedDevices("0:0,1:1,2:2,3:4");
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
 
     List<GpuDevice> usableGpuDevices = discoverer.getGpusUsableByYarn();
     assertEquals(4, usableGpuDevices.size());
@@ -379,7 +380,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -390,7 +391,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -401,7 +402,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -412,7 +413,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -423,7 +424,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -434,7 +435,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -445,7 +446,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -456,7 +457,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -467,7 +468,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -478,7 +479,7 @@ public class TestGpuDiscoverer {
 
     exception.expect(GpuDeviceSpecificationException.class);
     GpuDiscoverer discoverer = new GpuDiscoverer();
-    discoverer.initialize(conf);
+    discoverer.initialize(conf, binaryHelper);
     discoverer.getGpusUsableByYarn();
   }
 
@@ -488,7 +489,7 @@ public class TestGpuDiscoverer {
     conf.set(YarnConfiguration.NM_GPU_PATH_TO_EXEC, "/blabla");
     GpuDiscoverer plugin = new GpuDiscoverer();
     try {
-      plugin.initialize(conf);
+      plugin.initialize(conf, binaryHelper);
       plugin.getGpusUsableByYarn();
       fail("Illegal format, should fail.");
     } catch (YarnException e) {
@@ -501,15 +502,15 @@ public class TestGpuDiscoverer {
   }
 
   @Test
-  public void testScriptNotCalled() throws YarnException {
+  public void testScriptNotCalled() throws YarnException, IOException {
     Configuration conf = new Configuration();
     conf.set(YarnConfiguration.NM_GPU_ALLOWED_DEVICES, "0:1,2:3");
 
     GpuDiscoverer gpuSpy = spy(GpuDiscoverer.class);
 
-    gpuSpy.initialize(conf);
+    gpuSpy.initialize(conf, binaryHelper);
     gpuSpy.getGpusUsableByYarn();
 
     verify(gpuSpy, never()).getGpuDeviceInformation();
   }
-}
+}