Browse Source

YARN-5016. Add support for a minimum retry interval for container retries. Contributed by Jun Gong.

Varun Vasudev 9 năm trước cách đây
mục cha
commit
0287c49107

+ 5 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

@@ -893,6 +893,11 @@ public class YarnConfiguration extends Configuration {
       NM_PREFIX + "container-diagnostics-maximum-size";
       NM_PREFIX + "container-diagnostics-maximum-size";
   public static final int DEFAULT_NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE = 10000;
   public static final int DEFAULT_NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE = 10000;
 
 
+  /**
+   * Minimum container restart interval, in milliseconds. ContainerImpl raises
+   * a container's requested retry interval to this value when the container's
+   * retry policy is not NEVER_RETRY and the requested interval is smaller.
+   */
+  public static final String NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS =
+      NM_PREFIX + "container-retry-minimum-interval-ms";
+  public static final int DEFAULT_NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS = 1000;
+
   /** Interval at which the delayed token removal thread runs */
   /** Interval at which the delayed token removal thread runs */
   public static final String RM_DELAYED_DELEGATION_TOKEN_REMOVAL_INTERVAL_MS =
   public static final String RM_DELAYED_DELEGATION_TOKEN_REMOVAL_INTERVAL_MS =
       RM_PREFIX + "delayed.delegation-token.removal-interval-ms";
       RM_PREFIX + "delayed.delegation-token.removal-interval-ms";

+ 6 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

@@ -1588,6 +1588,12 @@
     <value>10000</value>
     <value>10000</value>
   </property>
   </property>
 
 
+  <property>
+    <description>Minimum container restart interval in milliseconds.</description>
+    <name>yarn.nodemanager.container-retry-minimum-interval-ms</name>
+    <value>1000</value>
+  </property>
+
   <property>
   <property>
     <description>Max number of threads in NMClientAsync to process container
     <description>Max number of threads in NMClientAsync to process container
     management events</description>
     management events</description>

+ 16 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java

@@ -33,6 +33,7 @@ import java.util.concurrent.locks.Lock;
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 
 
+import com.google.common.annotations.VisibleForTesting;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configuration;
@@ -155,6 +156,16 @@ public class ContainerImpl implements Container {
       this.containerRetryContext = ContainerRetryContext.NEVER_RETRY_CONTEXT;
       this.containerRetryContext = ContainerRetryContext.NEVER_RETRY_CONTEXT;
     }
     }
     this.remainingRetryAttempts = containerRetryContext.getMaxRetries();
     this.remainingRetryAttempts = containerRetryContext.getMaxRetries();
+    int minimumRestartInterval = conf.getInt(
+        YarnConfiguration.NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS,
+        YarnConfiguration.DEFAULT_NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS);
+    if (containerRetryContext.getRetryPolicy()
+        != ContainerRetryPolicy.NEVER_RETRY
+        && containerRetryContext.getRetryInterval() < minimumRestartInterval) {
+      LOG.info("Set restart interval to minimum value " + minimumRestartInterval
+          + "ms for container " + containerTokenIdentifier.getContainerID());
+      this.containerRetryContext.setRetryInterval(minimumRestartInterval);
+    }
     this.diagnosticsMaxSize = conf.getInt(
     this.diagnosticsMaxSize = conf.getInt(
         YarnConfiguration.NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE,
         YarnConfiguration.NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE,
         YarnConfiguration.DEFAULT_NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE);
         YarnConfiguration.DEFAULT_NM_CONTAINER_DIAGNOSTICS_MAXIMUM_SIZE);
@@ -1368,4 +1379,9 @@ public class ContainerImpl implements Container {
       LocalResourceRequest resource) {
       LocalResourceRequest resource) {
     return container.resourcesUploadPolicies.get(resource);
     return container.resourcesUploadPolicies.get(resource);
   }
   }
+
+  /**
+   * Exposes the retry context held by this container so that tests can
+   * inspect the retry interval in effect.
+   *
+   * @return the retry context stored on this container
+   */
+  @VisibleForTesting
+  ContainerRetryContext getContainerRetryContext() {
+    return this.containerRetryContext;
+  }
 }
 }

+ 34 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java

@@ -725,6 +725,40 @@ public class TestContainer {
     }
     }
   }
   }
 
 
+  /**
+   * Checks how the configured minimum retry interval is applied to a
+   * container's retry context for three cases: a NEVER_RETRY policy, a retry
+   * policy with an interval below the minimum, and a retry policy with an
+   * interval above the minimum.
+   */
+  @Test
+  public void testContainerRestartInterval() throws IOException {
+    final int minimumIntervalMs = 2000;
+    conf.setInt(YarnConfiguration.NM_CONTAINER_RETRY_MINIMUM_INTERVAL_MS,
+        minimumIntervalMs);
+
+    // NEVER_RETRY: the requested interval (0) is expected to stay as it is.
+    testContainerRestartInterval(
+        ContainerRetryContext.newInstance(
+            ContainerRetryPolicy.NEVER_RETRY, null, 3, 0),
+        0);
+
+    // Retrying with an interval below the minimum: expect the minimum.
+    testContainerRestartInterval(
+        ContainerRetryContext.newInstance(
+            ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null, 3, 0),
+        minimumIntervalMs);
+
+    // Retrying with an interval above the minimum: expect it unchanged.
+    testContainerRestartInterval(
+        ContainerRetryContext.newInstance(
+            ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null, 3, 4000),
+        4000);
+  }
+
+  /**
+   * Creates a wrapped container with the given retry context and asserts the
+   * retry interval that the resulting ContainerImpl reports.
+   *
+   * @param containerRetryContext retry context handed to the new container
+   * @param expectedRestartInterval retry interval the container is expected
+   *        to report once constructed
+   * @throws IOException declared by the WrappedContainer constructor
+   */
+  private void testContainerRestartInterval(
+      ContainerRetryContext containerRetryContext,
+      int expectedRestartInterval) throws IOException {
+    WrappedContainer wc = null;
+    try {
+      wc = new WrappedContainer(25, 314159265358980L, 4345,
+          "yak", containerRetryContext);
+      int actualRestartInterval =
+          ((ContainerImpl)wc.c).getContainerRetryContext().getRetryInterval();
+      // assertEquals takes (expected, actual); passing them in that order
+      // keeps the failure message from reporting the values reversed.
+      Assert.assertEquals("Unexpected container retry interval",
+          expectedRestartInterval, actualRestartInterval);
+    } finally {
+      // Always release the wrapped container, even if the assertion fails.
+      if (wc != null) {
+        wc.finished();
+      }
+    }
+  }
+
   private void verifyCleanupCall(WrappedContainer wc) throws Exception {
   private void verifyCleanupCall(WrappedContainer wc) throws Exception {
     ResourcesReleasedMatcher matchesReq =
     ResourcesReleasedMatcher matchesReq =
         new ResourcesReleasedMatcher(wc.localResources, EnumSet.of(
         new ResourcesReleasedMatcher(wc.localResources, EnumSet.of(