Browse Source

YARN-11813. Fix the fallback ordering between cgroup v2 and v1. (#7631)

Benjamin Teke 1 week ago
parent
commit
189c8b65e2

+ 15 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsV2HandlerImpl.java

@@ -95,10 +95,22 @@ class CGroupsV2HandlerImpl extends AbstractCGroupsHandler {
   }
   }
 
 
   @Override
   @Override
-  protected Map<String, Set<String>> parsePreConfiguredMountPath() throws IOException {
+  protected Map<String, Set<String>> parsePreConfiguredMountPath() {
     Map<String, Set<String>> controllerMappings = new HashMap<>();
     Map<String, Set<String>> controllerMappings = new HashMap<>();
-    controllerMappings.put(this.cGroupsMountConfig.getV2MountPath(),
-        readControllersFile(this.cGroupsMountConfig.getV2MountPath()));
+    try {
+      controllerMappings.put(this.cGroupsMountConfig.getV2MountPath(),
+          readControllersFile(this.cGroupsMountConfig.getV2MountPath()));
+    } catch (IOException e) {
+      // Failing to read the cgroup.controllers file in the preconfigured directory might mean
+      // that the node is not using cgroup v2, or no cgroup v2 hierarchy is mounted
+      // under the specified path. If the node is using v1 we will fall back to cgroup v1
+      // in ResourceHandlerModule.initializeCGroupHandlers. If the cgroup v2 hierarchy is
+      // not mounted and no cgroup v1 hierarchy is mounted, we will fail to start the NM.
+      LOG.info("Failed to read the cgroup controllers file in the preconfigured directory: {}. " +
+          "The cgroup v2 hierarchy may not be mounted under the specified path, or the node" +
+          " might be using cgroup v1.", this.cGroupsMountConfig.getV2MountPath());
+      LOG.debug("Exception while reading the cgroup.controllers file: ", e);
+    }
     return controllerMappings;
     return controllerMappings;
   }
   }
 
 

+ 10 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/ResourceHandlerModule.java

@@ -77,10 +77,17 @@ public class ResourceHandlerModule {
       cGroupsCpuResourceHandler;
       cGroupsCpuResourceHandler;
 
 
   private static void initializeCGroupHandlers(Configuration conf,
   private static void initializeCGroupHandlers(Configuration conf,
-      CGroupsHandler.CGroupController controller) throws ResourceHandlerException {
-    initializeCGroupV1Handler(conf);
-    if (cgroupsV2Enabled && !isMountedInCGroupsV1(controller)) {
+                                               CGroupsHandler.CGroupController controller)
+      throws ResourceHandlerException {
+    if (cgroupsV2Enabled) {
       initializeCGroupV2Handler(conf);
       initializeCGroupV2Handler(conf);
+      if (!isMountedInCGroupsV2(controller)) {
+        LOG.info("Cgroup v2 is enabled but {} is not mounted in cgroups v2, falling back to v1",
+            controller);
+        initializeCGroupV1Handler(conf);
+      }
+    } else {
+      initializeCGroupV1Handler(conf);
     }
     }
   }
   }