Browse Source

YARN-6872. Ensure apps could run given NodeLabels are disabled post RM switchover/restart. Contributed by Sunil G

(cherry picked from commit e84a3f43a1b114d85a3c60151c821b1ca753f81a)
Jian He 7 years ago
parent
commit
d1996152bd

+ 8 - 58
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/RMAppManager.java

@@ -17,11 +17,7 @@
  */
 package org.apache.hadoop.yarn.server.resourcemanager;
 
-import java.io.IOException;
-import java.nio.ByteBuffer;
-import java.util.LinkedList;
-import java.util.Map;
-
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
@@ -37,7 +33,6 @@ import org.apache.hadoop.yarn.api.records.ResourceRequest;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException;
-import org.apache.hadoop.yarn.exceptions.InvalidLabelResourceRequestException;
 import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.ipc.RPCUtil;
 import org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger.AuditConstants;
@@ -56,11 +51,13 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerUtils;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
-import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
 import org.apache.hadoop.yarn.server.security.ApplicationACLsManager;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 
-import com.google.common.annotations.VisibleForTesting;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.util.LinkedList;
+import java.util.Map;
 
 /**
  * This class manages the list of applications for the resource manager. 
@@ -324,34 +321,6 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
         createAndPopulateNewRMApp(appContext, appState.getSubmitTime(),
             appState.getUser(), true);
 
-    // If null amReq has been returned, check if it is the case that
-    // application has specified node label expression while node label
-    // has been disabled. Reject the recovery of this application if it
-    // is true and give clear message so that user can react properly.
-    if (!appContext.getUnmanagedAM() &&
-        application.getAMResourceRequest() == null &&
-        !YarnConfiguration.areNodeLabelsEnabled(this.conf)) {
-      // check application submission context and see if am resource request
-      // or application itself contains any node label expression.
-      ResourceRequest amReqFromAppContext =
-          appContext.getAMContainerResourceRequest();
-      String labelExp = (amReqFromAppContext != null) ?
-          amReqFromAppContext.getNodeLabelExpression() : null;
-      if (labelExp == null) {
-        labelExp = appContext.getNodeLabelExpression();
-      }
-      if (labelExp != null &&
-          !labelExp.equals(RMNodeLabelsManager.NO_LABEL)) {
-        String message = "Failed to recover application " + appId
-            + ". NodeLabel is not enabled in cluster, but AM resource request "
-            + "contains a label expression.";
-        LOG.warn(message);
-        application.handle(
-            new RMAppEvent(appId, RMAppEventType.APP_REJECTED, message));
-        return;
-      }
-    }
-
     application.handle(new RMAppRecoverEvent(appId, rmState));
   }
 
@@ -368,28 +337,9 @@ public class RMAppManager implements EventHandler<RMAppManagerEvent>,
     }
     
     ApplicationId applicationId = submissionContext.getApplicationId();
-    ResourceRequest amReq = null;
-    try {
-      amReq = validateAndCreateResourceRequest(submissionContext, isRecovery);
-    } catch (InvalidLabelResourceRequestException e) {
-      // This can happen if the application had been submitted and run
-      // with Node Label enabled but recover with Node Label disabled.
-      // Thus there might be node label expression in the application's
-      // resource requests. If this is the case, create RmAppImpl with
-      // null amReq and reject the application later with clear error
-      // message. So that the application can still be tracked by RM
-      // after recovery and user can see what's going on and react accordingly.
-      if (isRecovery &&
-          !YarnConfiguration.areNodeLabelsEnabled(this.conf)) {
-        if (LOG.isDebugEnabled()) {
-          LOG.debug("AMResourceRequest is not created for " + applicationId
-              + ". NodeLabel is not enabled in cluster, but AM resource "
-              + "request contains a label expression.");
-        }
-      } else {
-        throw e;
-      }
-    }
+
+    ResourceRequest amReq = validateAndCreateResourceRequest(
+        submissionContext, isRecovery);
 
     // Verify and get the update application priority and set back to
     // submissionContext

+ 4 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/AppSchedulingInfo.java

@@ -815,7 +815,8 @@ public class AppSchedulingInfo {
     this.placesBlacklistedByApp = appInfo.getBlackList();
   }
 
-  public synchronized void recoverContainer(RMContainer rmContainer) {
+  public synchronized void recoverContainer(RMContainer rmContainer,
+      String partition) {
     QueueMetrics metrics = queue.getMetrics();
     if (pending) {
       // If there was any container to recover, the application was
@@ -828,9 +829,8 @@ public class AppSchedulingInfo {
     if (rmContainer.getState().equals(RMContainerState.COMPLETED)) {
       return;
     }
-    metrics.allocateResources(rmContainer.getNodeLabelExpression(),
-        user, 1, rmContainer.getAllocatedResource(),
-      false);
+    metrics.allocateResources(partition, user, 1,
+        rmContainer.getAllocatedResource(), false);
   }
   
   public ResourceRequest cloneResourceRequest(ResourceRequest request) {

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerApplicationAttempt.java

@@ -779,7 +779,7 @@ public class SchedulerApplicationAttempt implements SchedulableEntity {
   public synchronized void recoverContainer(SchedulerNode node,
       RMContainer rmContainer) {
     // recover app scheduling info
-    appSchedulingInfo.recoverContainer(rmContainer);
+    appSchedulingInfo.recoverContainer(rmContainer,  node.getPartition());
 
     if (rmContainer.getState().equals(RMContainerState.COMPLETED)) {
       return;

+ 13 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/SchedulerUtils.java

@@ -22,6 +22,8 @@ import java.util.List;
 import java.util.Set;
 
 import org.apache.commons.lang.StringUtils;
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.classification.InterfaceAudience.Private;
 import org.apache.hadoop.classification.InterfaceStability.Unstable;
 import org.apache.hadoop.conf.Configuration;
@@ -51,7 +53,9 @@ import org.apache.hadoop.yarn.util.resource.Resources;
 @Private
 @Unstable
 public class SchedulerUtils {
-  
+
+  private static final Log LOG = LogFactory.getLog(SchedulerUtils.class);
+
   private static final RecordFactory recordFactory = 
       RecordFactoryProvider.getRecordFactory(null);
 
@@ -230,9 +234,14 @@ public class SchedulerUtils {
       String labelExp = resReq.getNodeLabelExpression();
       if (!(RMNodeLabelsManager.NO_LABEL.equals(labelExp)
           || null == labelExp)) {
-        throw new InvalidLabelResourceRequestException(
-            "Invalid resource request, node label not enabled "
-                + "but request contains label expression");
+        String message = "NodeLabel is not enabled in cluster, but resource"
+            + " request contains a label expression.";
+        LOG.warn(message);
+        if (!isRecovery) {
+          throw new InvalidLabelResourceRequestException(
+              "Invalid resource request, node label not enabled "
+                  + "but request contains label expression");
+        }
       }
     }
     if (null == queueInfo) {

+ 4 - 4
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/TestRMRestart.java

@@ -2397,14 +2397,14 @@ public class TestRMRestart extends ParameterizedSchedulerTestBase {
       }
     };
 
-    // rm should successfully start with app1 loaded back in FAILED state
-    // due to node label not enabled but am resource request contains
-    // node label expression.
+    // rm should successfully start with app1 loaded back in SUCCESS state:
+    // the am container is pushed to run on the default label and the other
+    // containers are left to run normally.
+
     try {
       rm2.start();
       Assert.assertTrue("RM start successfully", true);
       Assert.assertEquals(1, rm2.getRMContext().getRMApps().size());
-      rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
     } catch (Exception e) {
       LOG.debug("Exception on start", e);
       Assert.fail("RM should start without any issue");