Browse Source

YARN-11689. Update the cgroup v2 init error handling (#6810)

Benjamin Teke 11 months ago
parent
commit
ce7d01fac8

+ 4 - 12
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/AbstractCGroupsHandler.java

@@ -358,14 +358,14 @@ public abstract class AbstractCGroupsHandler implements CGroupsHandler {
         } else {
           // Unexpected: we just checked that it was missing
           throw new ResourceHandlerException(getErrorWithDetails(
-              "Unexpected: Cannot create yarn cgroup",
+              "Unexpected: Cannot create yarn cgroup hierarchy",
               subsystemName,
               yarnHierarchy.getAbsolutePath()
           ));
         }
       } catch (SecurityException e) {
         throw new ResourceHandlerException(getErrorWithDetails(
-            "No permissions to create yarn cgroup",
+            "No permissions to create yarn cgroup hierarchy",
             subsystemName,
             yarnHierarchy.getAbsolutePath()
         ), e);
@@ -378,15 +378,7 @@ public abstract class AbstractCGroupsHandler implements CGroupsHandler {
       ));
     }
 
-    try {
-      updateEnabledControllersInHierarchy(yarnHierarchy, controller);
-    } catch (ResourceHandlerException e) {
-      throw new ResourceHandlerException(getErrorWithDetails(
-          "Failed to update cgroup.subtree_control in yarn hierarchy",
-          subsystemName,
-          yarnHierarchy.getAbsolutePath()
-      ));
-    }
+    updateEnabledControllersInHierarchy(yarnHierarchy, controller);
   }
 
   protected abstract void updateEnabledControllersInHierarchy(
@@ -401,7 +393,7 @@ public abstract class AbstractCGroupsHandler implements CGroupsHandler {
    * @param yarnCgroupPath cgroup path that failed
    * @return a string builder that can be appended by the caller
    */
-  private String getErrorWithDetails(
+  protected String getErrorWithDetails(
       String errorMessage,
       String subsystemName,
       String yarnCgroupPath) {

+ 29 - 14
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/CGroupsV2HandlerImpl.java

@@ -97,10 +97,8 @@ class CGroupsV2HandlerImpl extends AbstractCGroupsHandler {
   @Override
   protected Map<String, Set<String>> parsePreConfiguredMountPath() throws IOException {
     Map<String, Set<String>> controllerMappings = new HashMap<>();
-    String controllerPath = this.cGroupsMountConfig.getMountPath() +
-        Path.SEPARATOR + this.cGroupPrefix;
     controllerMappings.put(this.cGroupsMountConfig.getMountPath(),
-        readControllersFile(controllerPath));
+        readControllersFile(this.cGroupsMountConfig.getMountPath()));
     return controllerMappings;
   }
 
@@ -171,19 +169,32 @@ class CGroupsV2HandlerImpl extends AbstractCGroupsHandler {
     try {
       Set<String> enabledControllers = readControllersFile(yarnHierarchy.getAbsolutePath());
       if (!enabledControllers.contains(controller.getName())) {
-        throw new ResourceHandlerException(String.format(
+        String errorMsg = String.format(
             "The controller %s is not enabled in the cgroup hierarchy: %s. Please enable it in " +
                 "in the %s/cgroup.subtree_control file.",
             controller.getName(), yarnHierarchy.getAbsolutePath(),
-            yarnHierarchy.getParentFile().getAbsolutePath()));
+            yarnHierarchy.getParentFile().getAbsolutePath());
+
+        throw new ResourceHandlerException(getErrorWithDetails(
+            errorMsg, controller.getName(),
+            yarnHierarchy.getAbsolutePath()));
       }
 
       File subtreeControlFile = new File(yarnHierarchy.getAbsolutePath()
           + Path.SEPARATOR + CGROUP_SUBTREE_CONTROL_FILE);
       if (!subtreeControlFile.exists()) {
-        throw new ResourceHandlerException(
-            "No subtree control file found in the cgroup hierarchy: " +
-                yarnHierarchy.getAbsolutePath());
+        String errorMsg = "No subtree control file found in the cgroup hierarchy: " +
+            yarnHierarchy.getAbsolutePath();
+        throw new ResourceHandlerException(getErrorWithDetails(
+            errorMsg, controller.getName(),
+            yarnHierarchy.getAbsolutePath()));
+      }
+      if (!subtreeControlFile.canWrite()) {
+        String errorMsg = "Cannot write the cgroup.subtree_control file in the " +
+            "cgroup hierarchy: " + yarnHierarchy.getAbsolutePath();
+        throw new ResourceHandlerException(getErrorWithDetails(
+            errorMsg, controller.getName(),
+            yarnHierarchy.getAbsolutePath()));
       }
 
       Writer w = new OutputStreamWriter(Files.newOutputStream(subtreeControlFile.toPath(),
@@ -194,16 +205,20 @@ class CGroupsV2HandlerImpl extends AbstractCGroupsHandler {
             yarnHierarchy.getAbsolutePath());
         pw.write("+" + controller.getName());
         if (pw.checkError()) {
-          throw new ResourceHandlerException("Failed to add the controller to the " +
+          String errorMsg = "Failed to add the controller to the " +
               "cgroup.subtree_control file in the cgroup hierarchy: " +
-              yarnHierarchy.getAbsolutePath());
+              yarnHierarchy.getAbsolutePath();
+          throw new ResourceHandlerException(getErrorWithDetails(
+              errorMsg, controller.getName(),
+              yarnHierarchy.getAbsolutePath()));
         }
       }
     } catch (IOException e) {
-      throw new ResourceHandlerException(
-          "Failed to update the cgroup.subtree_control file in the cgroup hierarchy: " +
-              yarnHierarchy.getAbsolutePath(), e);
+      String errorMsg = "Failed to update the cgroup.subtree_control file in the " +
+          "cgroup hierarchy: " + yarnHierarchy.getAbsolutePath();
+      throw new ResourceHandlerException(getErrorWithDetails(
+          errorMsg, controller.getName(),
+          yarnHierarchy.getAbsolutePath()), e);
     }
   }
-
 }

+ 2 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/linux/resources/TestCGroupsV2HandlerImpl.java

@@ -217,11 +217,13 @@ public class TestCGroupsV2HandlerImpl extends TestCGroupsHandlerBase {
     conf.set(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_HIERARCHY,
         "/hadoop-yarn");
 
+    File baseCgroup = new File(tmpPath);
     File subCgroup = new File(tmpPath, "/hadoop-yarn");
     Assert.assertTrue("temp dir should be created", subCgroup.mkdirs());
     subCgroup.deleteOnExit();
 
     String enabledControllers = "cpuset cpu io memory hugetlb pids rdma misc\n";
+    createFileWithContent(baseCgroup, CGroupsHandler.CGROUP_CONTROLLERS_FILE, enabledControllers);
     createFileWithContent(subCgroup, CGroupsHandler.CGROUP_CONTROLLERS_FILE, enabledControllers);
 
     File subtreeControlFile = new File(subCgroup.getAbsolutePath(),