
YARN-5015. Support sliding window retry capability for container restart. (Chandni Singh via wangda)

Change-Id: I07addd3e4ba8d98456ee2ff1d5c540a38fe61dea
Wangda Tan 7 years ago
Parent
Commit
a5b27b3c67
14 changed files with 451 additions and 43 deletions
  1. + 20 - 1
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerRetryContext.java
  2. + 1 - 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto
  3. + 8 - 2
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java
  4. + 7 - 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/Client.java
  5. + 15 - 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ContainerRetryContextPBImpl.java
  6. + 46 - 40
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java
  7. + 165 - 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java
  8. + 24 - 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java
  9. + 5 - 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java
  10. + 21 - 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java
  11. + 37 - 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java
  12. + 77 - 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java
  13. + 9 - 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java
  14. + 16 - 0
      hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java

+ 20 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/api/records/ContainerRetryContext.java

@@ -49,6 +49,13 @@ import java.util.Set;
  *   </li>
  *   <li><em>retryInterval</em> specifies delaying some time before relaunch
  *   container, the unit is millisecond.</li>
+ *   <li>
+ *     <em>failuresValidityInterval</em>: default value is -1.
+ *     When failuresValidityInterval in milliseconds is set to {@literal >} 0,
+ *     the failure number will not take failures which happen out of the
+ *     failuresValidityInterval into failure count. If failure count
+ *     reaches to <em>maxRetries</em>, the container will be failed.
+ *   </li>
  * </ul>
  */
 @Public
@@ -63,16 +70,25 @@ public abstract class ContainerRetryContext {
   @Unstable
   public static ContainerRetryContext newInstance(
       ContainerRetryPolicy retryPolicy, Set<Integer> errorCodes,
-      int maxRetries, int retryInterval) {
+      int maxRetries, int retryInterval, long failuresValidityInterval) {
     ContainerRetryContext containerRetryContext =
         Records.newRecord(ContainerRetryContext.class);
     containerRetryContext.setRetryPolicy(retryPolicy);
     containerRetryContext.setErrorCodes(errorCodes);
     containerRetryContext.setMaxRetries(maxRetries);
     containerRetryContext.setRetryInterval(retryInterval);
+    containerRetryContext.setFailuresValidityInterval(failuresValidityInterval);
     return containerRetryContext;
   }
 
+  @Private
+  @Unstable
+  public static ContainerRetryContext newInstance(
+      ContainerRetryPolicy retryPolicy, Set<Integer> errorCodes,
+      int maxRetries, int retryInterval) {
+    return newInstance(retryPolicy, errorCodes, maxRetries, retryInterval, -1);
+  }
+
   public abstract ContainerRetryPolicy getRetryPolicy();
   public abstract void setRetryPolicy(ContainerRetryPolicy retryPolicy);
   public abstract Set<Integer> getErrorCodes();
@@ -81,4 +97,7 @@ public abstract class ContainerRetryContext {
   public abstract void setMaxRetries(int maxRetries);
   public abstract int getRetryInterval();
   public abstract void setRetryInterval(int retryInterval);
+  public abstract long getFailuresValidityInterval();
+  public abstract void setFailuresValidityInterval(
+      long failuresValidityInterval);
 }
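
A hedged illustration of the extended record above (only the new overload and the RETRY_ON_ALL_ERRORS policy come from this patch; the concrete numbers are illustrative): an ApplicationMaster could allow up to 3 relaunches while only counting failures from the last 10 minutes by passing the new fifth argument.

    // Sketch only; values are made up, not from the patch.
    ContainerRetryContext retryContext = ContainerRetryContext.newInstance(
        ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, // retry on any error exit code
        null,                                     // no specific error codes
        3,                                        // maxRetries
        0,                                        // retryInterval, in milliseconds
        10 * 60 * 1000L);                         // failuresValidityInterval, in milliseconds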

+ 1 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/proto/yarn_protos.proto

@@ -754,6 +754,7 @@ message ContainerRetryContextProto {
   repeated int32 error_codes = 2;
   optional int32 max_retries = 3 [default = 0];
   optional int32 retry_interval = 4 [default = 0];
+  optional int64 failures_validity_interval = 5 [default = -1];
 }
 
 enum ContainerRetryPolicyProto {

+ 8 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/ApplicationMaster.java

@@ -308,6 +308,7 @@ public class ApplicationMaster {
   private Set<Integer> containerRetryErrorCodes = null;
   private int containerMaxRetries = 0;
   private int containrRetryInterval = 0;
+  private long containerFailuresValidityInterval = -1;
 
   // Timeline domain ID
   private String domainId = null;
@@ -471,6 +472,9 @@ public class ApplicationMaster {
         "If container could retry, it specifies max retires");
     opts.addOption("container_retry_interval", true,
         "Interval between each retry, unit is milliseconds");
+    opts.addOption("container_failures_validity_interval", true,
+        "Failures which are out of the time window will not be added to"
+            + " the number of container retry attempts");
     opts.addOption("placement_spec", true, "Placement specification");
     opts.addOption("debug", false, "Dump out debug information");
 
@@ -661,7 +665,8 @@ public class ApplicationMaster {
         cliParser.getOptionValue("container_max_retries", "0"));
     containrRetryInterval = Integer.parseInt(cliParser.getOptionValue(
         "container_retry_interval", "0"));
-
+    containerFailuresValidityInterval = Long.parseLong(
+        cliParser.getOptionValue("container_failures_validity_interval", "-1"));
     if (!YarnConfiguration.timelineServiceEnabled(conf)) {
       timelineClient = null;
       timelineV2Client = null;
@@ -1385,7 +1390,8 @@ public class ApplicationMaster {
       ContainerRetryContext containerRetryContext =
           ContainerRetryContext.newInstance(
               containerRetryPolicy, containerRetryErrorCodes,
-              containerMaxRetries, containrRetryInterval);
+              containerMaxRetries, containrRetryInterval,
+              containerFailuresValidityInterval);
       ContainerLaunchContext ctx = ContainerLaunchContext.newInstance(
         localResources, myShellEnv, commands, null, allTokens.duplicate(),
           null, containerRetryContext);

+ 7 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-applications-distributedshell/src/main/java/org/apache/hadoop/yarn/applications/distributedshell/Client.java

@@ -373,6 +373,9 @@ public class Client {
         "If container could retry, it specifies max retires");
     opts.addOption("container_retry_interval", true,
         "Interval between each retry, unit is milliseconds");
+    opts.addOption("container_failures_validity_interval", true,
+        "Failures which are out of the time window will not be added to"
+            + " the number of container retry attempts");
     opts.addOption("docker_client_config", true,
         "The docker client configuration path. The scheme should be supplied"
             + " (i.e. file:// or hdfs://)."
@@ -579,6 +582,10 @@ public class Client {
       containerRetryOptions.add("--container_retry_interval "
           + cliParser.getOptionValue("container_retry_interval"));
     }
+    if (cliParser.hasOption("container_failures_validity_interval")) {
+      containerRetryOptions.add("--container_failures_validity_interval "
+          + cliParser.getOptionValue("container_failures_validity_interval"));
+    }
 
     if (cliParser.hasOption("flow_name")) {
       flowName = cliParser.getOptionValue("flow_name");
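
For context on the two distributed-shell changes above: the new flag is accepted by the Client, forwarded on the ApplicationMaster command line, and ends up in the ContainerRetryContext built for each task container. A hypothetical invocation would add, for example, -container_failures_validity_interval 600000 next to the existing -container_max_retries and -container_retry_interval options (the 600000 ms value is illustrative, not from the patch).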

+ 15 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/api/records/impl/pb/ContainerRetryContextPBImpl.java

@@ -165,6 +165,21 @@ public class ContainerRetryContextPBImpl extends ContainerRetryContext {
     builder.setRetryInterval(retryInterval);
   }
 
+  @Override
+  public long getFailuresValidityInterval() {
+    ContainerRetryContextProtoOrBuilder p = viaProto ? proto : builder;
+    if (!p.hasFailuresValidityInterval()) {
+      return -1;
+    }
+    return p.getFailuresValidityInterval();
+  }
+
+  @Override
+  public void setFailuresValidityInterval(long failuresValidityInterval) {
+    maybeInitBuilder();
+    builder.setFailuresValidityInterval(failuresValidityInterval);
+  }
+
   private ContainerRetryPolicyProto convertToProtoFormat(
       ContainerRetryPolicy containerRetryPolicy) {
     return ProtoUtils.convertToProtoFormat(containerRetryPolicy);

+ 46 - 40
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/ContainerImpl.java

@@ -167,9 +167,11 @@ public class ContainerImpl implements Container {
   private long containerLaunchStartTime;
   private ContainerMetrics containerMetrics;
   private static Clock clock = SystemClock.getInstance();
+
   private ContainerRetryContext containerRetryContext;
-  // remaining retries to relaunch container if needed
-  private int remainingRetryAttempts;
+  private SlidingWindowRetryPolicy.RetryContext windowRetryContext;
+  private SlidingWindowRetryPolicy retryPolicy;
+
   private String workDir;
   private String logDir;
   private String host;
@@ -246,7 +248,10 @@ public class ContainerImpl implements Container {
     // Configure the Retry Context
     this.containerRetryContext = configureRetryContext(
         conf, launchContext, this.containerId);
-    this.remainingRetryAttempts = this.containerRetryContext.getMaxRetries();
+    this.windowRetryContext = new SlidingWindowRetryPolicy
+        .RetryContext(containerRetryContext);
+    this.retryPolicy = new SlidingWindowRetryPolicy(clock);
+
     stateMachine = stateMachineFactory.make(this, ContainerState.NEW,
         context.getContainerStateTransitionListener());
     this.context = context;
@@ -289,7 +294,9 @@ public class ContainerImpl implements Container {
     this.recoveredAsKilled = rcs.getKilled();
     this.diagnostics.append(rcs.getDiagnostics());
     this.version = rcs.getVersion();
-    this.remainingRetryAttempts = rcs.getRemainingRetryAttempts();
+    this.windowRetryContext.setRemainingRetries(
+        rcs.getRemainingRetryAttempts());
+    this.windowRetryContext.setRestartTimes(rcs.getRestartTimes());
     this.workDir = rcs.getWorkDir();
     this.logDir = rcs.getLogDir();
     this.resourceMappings = rcs.getResourceMappings();
@@ -1591,27 +1598,15 @@ public class ContainerImpl implements Container {
       if (exitEvent.getDiagnosticInfo() != null) {
         if (container.containerRetryContext.getRetryPolicy()
             != ContainerRetryPolicy.NEVER_RETRY) {
-          int n = container.containerRetryContext.getMaxRetries()
-              - container.remainingRetryAttempts;
-          container.addDiagnostics("Diagnostic message from attempt "
-              + n + " : ", "\n");
+          container.addDiagnostics("Diagnostic message from attempt : \n");
         }
         container.addDiagnostics(exitEvent.getDiagnosticInfo() + "\n");
       }
 
       if (container.shouldRetry(container.exitCode)) {
-        if (container.remainingRetryAttempts > 0) {
-          container.remainingRetryAttempts--;
-          try {
-            container.stateStore.storeContainerRemainingRetryAttempts(
-                container.getContainerId(), container.remainingRetryAttempts);
-          } catch (IOException e) {
-            LOG.warn(
-                "Unable to update remainingRetryAttempts in state store for "
-                    + container.getContainerId(), e);
-          }
-        }
-        doRelaunch(container, container.remainingRetryAttempts,
+        container.storeRetryContext();
+        doRelaunch(container,
+            container.windowRetryContext.getRemainingRetries(),
             container.containerRetryContext.getRetryInterval());
         return ContainerState.RELAUNCHING;
       } else if (container.canRollback()) {
@@ -1671,29 +1666,14 @@ public class ContainerImpl implements Container {
 
   @Override
   public boolean shouldRetry(int errorCode) {
-    return shouldRetry(errorCode, containerRetryContext,
-        remainingRetryAttempts);
-  }
-
-  public static boolean shouldRetry(int errorCode,
-      ContainerRetryContext retryContext, int remainingRetryAttempts) {
     if (errorCode == ExitCode.SUCCESS.getExitCode()
         || errorCode == ExitCode.FORCE_KILLED.getExitCode()
         || errorCode == ExitCode.TERMINATED.getExitCode()) {
       return false;
     }
-
-    ContainerRetryPolicy retryPolicy = retryContext.getRetryPolicy();
-    if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS
-        || (retryPolicy == ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES
-        && retryContext.getErrorCodes() != null
-        && retryContext.getErrorCodes().contains(errorCode))) {
-      return remainingRetryAttempts > 0
-          || remainingRetryAttempts == ContainerRetryContext.RETRY_FOREVER;
-    }
-
-    return false;
+    return retryPolicy.shouldRetry(windowRetryContext, errorCode);
   }
+
   /**
    * Transition to EXITED_WITH_FAILURE
    */
@@ -1729,9 +1709,9 @@ public class ContainerImpl implements Container {
       container.containerRetryContext =
           configureRetryContext(container.context.getConf(),
               container.launchContext, container.containerId);
-      // Reset the retry attempts since its a fresh start
-      container.remainingRetryAttempts =
-          container.containerRetryContext.getMaxRetries();
+      container.windowRetryContext = new SlidingWindowRetryPolicy
+          .RetryContext(container.containerRetryContext);
+      container.retryPolicy = new SlidingWindowRetryPolicy(clock);
 
       container.resourceSet =
           container.reInitContext.mergedResourceSet(container.resourceSet);
@@ -2209,4 +2189,30 @@ public class ContainerImpl implements Container {
             container.getContainerId().toString());
     deletionService.delete(deletionTask);
   }
+
+  private void storeRetryContext() {
+    if (windowRetryContext.getRestartTimes() != null) {
+      try {
+        stateStore.storeContainerRestartTimes(containerId,
+            windowRetryContext.getRestartTimes());
+      } catch (IOException e) {
+        LOG.warn(
+            "Unable to update finishTimeForRetryAttempts in state store for "
+                + containerId, e);
+      }
+    }
+    try {
+      stateStore.storeContainerRemainingRetryAttempts(containerId,
+          windowRetryContext.getRemainingRetries());
+    } catch (IOException e) {
+      LOG.warn(
+          "Unable to update remainingRetryAttempts in state store for "
+              + containerId, e);
+    }
+  }
+
+  @VisibleForTesting
+  SlidingWindowRetryPolicy getRetryPolicy() {
+    return retryPolicy;
+  }
 }

+ 165 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/SlidingWindowRetryPolicy.java

@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.container;
+
+import com.google.common.base.Preconditions;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.yarn.api.records.ContainerRetryContext;
+import org.apache.hadoop.yarn.api.records.ContainerRetryPolicy;
+import org.apache.hadoop.yarn.util.Clock;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+/**
+ * <p>Sliding window retry policy for relaunching a
+ * <code>Container</code> in Yarn.</p>
+ */
+@InterfaceStability.Unstable
+public class SlidingWindowRetryPolicy {
+
+  private Clock clock;
+
+  public SlidingWindowRetryPolicy(Clock clock)  {
+    this.clock = Preconditions.checkNotNull(clock);
+  }
+
+  public boolean shouldRetry(RetryContext retryContext,
+      int errorCode) {
+    ContainerRetryContext containerRC = retryContext
+        .containerRetryContext;
+    Preconditions.checkNotNull(containerRC, "container retry context null");
+    ContainerRetryPolicy retryPolicy = containerRC.getRetryPolicy();
+    if (retryPolicy == ContainerRetryPolicy.RETRY_ON_ALL_ERRORS
+        || (retryPolicy == ContainerRetryPolicy.RETRY_ON_SPECIFIC_ERROR_CODES
+        && containerRC.getErrorCodes() != null
+        && containerRC.getErrorCodes().contains(errorCode))) {
+      if (containerRC.getMaxRetries() == ContainerRetryContext.RETRY_FOREVER) {
+        return true;
+      }
+      int pendingRetries = calculatePendingRetries(retryContext);
+      updateRetryContext(retryContext, pendingRetries);
+      return pendingRetries > 0;
+    }
+    return false;
+  }
+
+  /**
+   * Calculates the pending number of retries.
+   * <p>
+   * When failuresValidityInterval is > 0, it also removes time entries from
+   * <code>restartTimes</code> which are outside the validity interval.
+   *
+   * @return the pending retries.
+   */
+  private int calculatePendingRetries(RetryContext retryContext) {
+    ContainerRetryContext containerRC =
+        retryContext.containerRetryContext;
+    if (containerRC.getFailuresValidityInterval() > 0) {
+      Iterator<Long> iterator = retryContext.getRestartTimes().iterator();
+      long currentTime = clock.getTime();
+      while (iterator.hasNext()) {
+        long restartTime = iterator.next();
+        if (currentTime - restartTime
+            > containerRC.getFailuresValidityInterval()) {
+          iterator.remove();
+        } else {
+          break;
+        }
+      }
+      return containerRC.getMaxRetries() -
+          retryContext.getRestartTimes().size();
+    } else {
+      return retryContext.getRemainingRetries();
+    }
+  }
+
+  /**
+   * Updates remaining retries and the restart time when
+   * required in the retryContext.
+   */
+  private void updateRetryContext(RetryContext retryContext,
+      int pendingRetries) {
+    retryContext.setRemainingRetries(pendingRetries - 1);
+    if (retryContext.containerRetryContext.getFailuresValidityInterval()
+        > 0) {
+      retryContext.getRestartTimes().add(clock.getTime());
+    }
+  }
+
+  /**
+   * Sets the clock.
+   * @param clock clock
+   */
+  public void setClock(Clock clock) {
+    this.clock = Preconditions.checkNotNull(clock);
+  }
+
+  /**
+   * Sliding window container retry context.
+   * <p>
+   * Besides {@link ContainerRetryContext}, it also provide details such as:
+   * <ul>
+   * <li>
+   * <em>remainingRetries</em>: specifies the number of pending retries. It is
+   * initially set to <code>containerRetryContext.maxRetries</code>.
+   * </li>
+   * <li>
+   * <em>restartTimes</em>: when
+   * <code>containerRetryContext.failuresValidityInterval</code> is set,
+   * then this records the times when the container is set to restart.
+   * </li>
+   * </ul>
+   */
+  static class RetryContext {
+
+    private final ContainerRetryContext containerRetryContext;
+    private List<Long> restartTimes = new ArrayList<>();
+    private int remainingRetries;
+
+    RetryContext(ContainerRetryContext containerRetryContext) {
+      this.containerRetryContext = Preconditions
+          .checkNotNull(containerRetryContext);
+      this.remainingRetries = containerRetryContext.getMaxRetries();
+    }
+
+    ContainerRetryContext getContainerRetryContext() {
+      return containerRetryContext;
+    }
+
+    int getRemainingRetries() {
+      return remainingRetries;
+    }
+
+    void setRemainingRetries(int remainingRetries) {
+      this.remainingRetries = remainingRetries;
+    }
+
+    List<Long> getRestartTimes() {
+      return restartTimes;
+    }
+
+    void setRestartTimes(List<Long> restartTimes) {
+      if (restartTimes != null) {
+        this.restartTimes.clear();
+        this.restartTimes.addAll(restartTimes);
+      }
+    }
+  }
+}
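
To make the window semantics concrete, here is a hedged sketch of the policy used in isolation. It has to live in the same package because RetryContext is package-private; the timeline and the ControlledClock usage mirror the new tests below, and the clock is assumed to start at time 0, as those tests rely on.

    ControlledClock clock = new ControlledClock();
    SlidingWindowRetryPolicy policy = new SlidingWindowRetryPolicy(clock);
    SlidingWindowRetryPolicy.RetryContext window =
        new SlidingWindowRetryPolicy.RetryContext(
            ContainerRetryContext.newInstance(
                ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null,
                1,      // maxRetries: at most 1 relaunch counted within the window
                0,      // retryInterval
                10));   // failuresValidityInterval: 10 ms window

    policy.shouldRetry(window, 12);  // true: no earlier failure in the window (12 is an arbitrary error code)
    clock.setTime(20);
    policy.shouldRetry(window, 12);  // true: the failure at t=0 has aged out of the window
    clock.setTime(25);
    policy.shouldRetry(window, 12);  // false: the failure at t=20 is still inside the 10 ms window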

+ 24 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMLeveldbStateStoreService.java

@@ -127,6 +127,8 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
   private static final String CONTAINER_EXIT_CODE_KEY_SUFFIX = "/exitcode";
   private static final String CONTAINER_REMAIN_RETRIES_KEY_SUFFIX =
       "/remainingRetryAttempts";
+  private static final String CONTAINER_RESTART_TIMES_SUFFIX =
+      "/restartTimes";
   private static final String CONTAINER_WORK_DIR_KEY_SUFFIX = "/workdir";
   private static final String CONTAINER_LOG_DIR_KEY_SUFFIX = "/logdir";
 
@@ -338,6 +340,16 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
       } else if (suffix.equals(CONTAINER_REMAIN_RETRIES_KEY_SUFFIX)) {
         rcs.setRemainingRetryAttempts(
             Integer.parseInt(asString(entry.getValue())));
+      } else if (suffix.equals(CONTAINER_RESTART_TIMES_SUFFIX)) {
+        String value = asString(entry.getValue());
+        // parse the string format of List<Long>, e.g. [34, 21, 22]
+        String[] unparsedRestartTimes =
+            value.substring(1, value.length() - 1).split(", ");
+        List<Long> restartTimes = new ArrayList<>();
+        for (String restartTime : unparsedRestartTimes) {
+          restartTimes.add(Long.parseLong(restartTime));
+        }
+        rcs.setRestartTimes(restartTimes);
       } else if (suffix.equals(CONTAINER_WORK_DIR_KEY_SUFFIX)) {
         rcs.setWorkDir(asString(entry.getValue()));
       } else if (suffix.equals(CONTAINER_LOG_DIR_KEY_SUFFIX)) {
@@ -581,6 +593,18 @@ public class NMLeveldbStateStoreService extends NMStateStoreService {
     }
   }
 
+  @Override
+  public void storeContainerRestartTimes(ContainerId containerId,
+      List<Long> restartTimes) throws IOException {
+    String key = CONTAINERS_KEY_PREFIX + containerId.toString()
+        + CONTAINER_RESTART_TIMES_SUFFIX;
+    try {
+      db.put(bytes(key), bytes(restartTimes.toString()));
+    } catch (DBException e) {
+      throw new IOException(e);
+    }
+  }
+
   @Override
   public void storeContainerWorkDir(ContainerId containerId,
       String workDir) throws IOException {
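
One detail worth calling out from the LevelDB hunk above: the restart times are persisted simply as List.toString() (for example "[34, 21, 22]") and recovered by stripping the brackets and splitting on ", ". A hedged, self-contained sketch of that round-trip (java.util imports assumed; the values are arbitrary):

    List<Long> restartTimes = Arrays.asList(34L, 21L, 22L);
    String stored = restartTimes.toString();            // "[34, 21, 22]"

    List<Long> recovered = new ArrayList<>();
    for (String t : stored.substring(1, stored.length() - 1).split(", ")) {
      recovered.add(Long.parseLong(t));                 // 34, 21, 22
    }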

+ 5 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMNullStateStoreService.java

@@ -119,6 +119,11 @@ public class NMNullStateStoreService extends NMStateStoreService {
       int remainingRetryAttempts) throws IOException {
   }
 
+  @Override
+  public void storeContainerRestartTimes(ContainerId containerId,
+      List<Long> restartTimes) throws IOException {
+  }
+
   @Override
   public void storeContainerWorkDir(ContainerId containerId,
       String workDir) throws IOException {

+ 21 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMStateStoreService.java

@@ -98,6 +98,7 @@ public abstract class NMStateStoreService extends AbstractService {
     StartContainerRequest startRequest;
     Resource capability;
     private int remainingRetryAttempts = ContainerRetryContext.RETRY_INVALID;
+    private List<Long> restartTimes;
     private String workDir;
     private String logDir;
     int version;
@@ -150,6 +151,15 @@ public abstract class NMStateStoreService extends AbstractService {
       this.remainingRetryAttempts = retryAttempts;
     }
 
+    public List<Long> getRestartTimes() {
+      return restartTimes;
+    }
+
+    public void setRestartTimes(
+        List<Long> restartTimes) {
+      this.restartTimes = restartTimes;
+    }
+
     public String getWorkDir() {
       return workDir;
     }
@@ -177,6 +187,7 @@ public abstract class NMStateStoreService extends AbstractService {
           .append(", Capability: ").append(getCapability())
           .append(", StartRequest: ").append(getStartRequest())
           .append(", RemainingRetryAttempts: ").append(remainingRetryAttempts)
+          .append(", RestartTimes: ").append(restartTimes)
           .append(", WorkDir: ").append(workDir)
           .append(", LogDir: ").append(logDir)
           .toString();
@@ -486,6 +497,16 @@ public abstract class NMStateStoreService extends AbstractService {
   public abstract void storeContainerRemainingRetryAttempts(
       ContainerId containerId, int remainingRetryAttempts) throws IOException;
 
+  /**
+   * Record restart times for a container.
+   * @param containerId
+   * @param restartTimes
+   * @throws IOException
+   */
+  public abstract void storeContainerRestartTimes(
+      ContainerId containerId, List<Long> restartTimes)
+      throws IOException;
+
   /**
    * Record working directory for a container.
    * @param containerId the container ID

+ 37 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestContainer.java

@@ -105,6 +105,7 @@ import org.apache.hadoop.yarn.server.nodemanager.metrics.NodeManagerMetrics;
 import org.apache.hadoop.yarn.server.nodemanager.NodeStatusUpdater;
 import org.apache.hadoop.yarn.server.nodemanager.recovery.NMNullStateStoreService;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
+import org.apache.hadoop.yarn.util.ControlledClock;
 import org.junit.Assert;
 import org.junit.Test;
 import org.mockito.ArgumentMatcher;
@@ -1109,6 +1110,38 @@ public class TestContainer {
     }
   }
 
+  @Test
+  public void testContainerRetryFailureValidityInterval() throws Exception {
+    ContainerRetryContext containerRetryContext = ContainerRetryContext
+        .newInstance(ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null, 1, 0, 10);
+    WrappedContainer wc = null;
+    try {
+      wc = new WrappedContainer(25, 314159265358980L, 4200, "test",
+          containerRetryContext);
+      ControlledClock clock = new ControlledClock();
+      wc.getRetryPolicy().setClock(clock);
+      wc.initContainer();
+      wc.localizeResources();
+      wc.launchContainer();
+      wc.containerFailed(12);
+      assertEquals(ContainerState.RUNNING, wc.c.getContainerState());
+      clock.setTime(20);
+      wc.containerFailed(12);
+      assertEquals(ContainerState.RUNNING, wc.c.getContainerState());
+      clock.setTime(40);
+      wc.containerFailed(12);
+      assertEquals(ContainerState.RUNNING, wc.c.getContainerState());
+      clock.setTime(45);
+      wc.containerFailed(12);
+      assertEquals(ContainerState.EXITED_WITH_FAILURE,
+          wc.c.getContainerState());
+    } finally {
+      if (wc != null) {
+        wc.finished();
+      }
+    }
+  }
+
   private void verifyCleanupCall(WrappedContainer wc) throws Exception {
     ResourcesReleasedMatcher matchesReq =
         new ResourcesReleasedMatcher(wc.localResources, EnumSet.of(
@@ -1574,5 +1607,9 @@ public class TestContainer {
     public String getDiagnostics() {
       return c.cloneAndGetContainerStatus().getDiagnostics();
     }
+
+    public SlidingWindowRetryPolicy getRetryPolicy() {
+      return ((ContainerImpl)c).getRetryPolicy();
+    }
   }
 }

+ 77 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/container/TestSlidingWindowRetryPolicy.java

@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.container;
+
+import org.apache.hadoop.yarn.api.records.ContainerRetryContext;
+import org.apache.hadoop.yarn.api.records.ContainerRetryPolicy;
+import org.apache.hadoop.yarn.util.ControlledClock;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Tests for {@link SlidingWindowRetryPolicy}.
+ */
+public class TestSlidingWindowRetryPolicy {
+
+  private ControlledClock clock;
+  private SlidingWindowRetryPolicy retryPolicy;
+
+  @Before
+  public void setup() {
+    clock = new ControlledClock();
+    retryPolicy = new SlidingWindowRetryPolicy(clock);
+  }
+
+  @Test
+  public void testNeverRetry() {
+    ContainerRetryContext retryContext =
+        ContainerRetryContext.NEVER_RETRY_CONTEXT;
+    Assert.assertFalse("never retry", retryPolicy.shouldRetry(
+        new SlidingWindowRetryPolicy.RetryContext(retryContext), 12));
+  }
+
+  @Test
+  public void testAlwaysRetry() {
+    ContainerRetryContext retryContext =  ContainerRetryContext.newInstance(
+        ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null, -1,
+        0, 10);
+    Assert.assertTrue("always retry", retryPolicy.shouldRetry(
+        new SlidingWindowRetryPolicy.RetryContext(retryContext), 12));
+  }
+
+  @Test
+  public void testFailuresValidityInterval() {
+    ContainerRetryContext retryContext = ContainerRetryContext
+        .newInstance(ContainerRetryPolicy.RETRY_ON_ALL_ERRORS, null, 1, 0, 10);
+    SlidingWindowRetryPolicy.RetryContext windowRetryContext =
+        new SlidingWindowRetryPolicy.RetryContext(retryContext);
+    Assert.assertTrue("retry 1",
+        retryPolicy.shouldRetry(windowRetryContext, 12));
+    clock.setTime(20);
+    Assert.assertTrue("retry 2",
+        retryPolicy.shouldRetry(windowRetryContext, 12));
+    clock.setTime(40);
+    Assert.assertTrue("retry 3",
+        retryPolicy.shouldRetry(windowRetryContext, 12));
+    clock.setTime(45);
+    Assert.assertFalse("retry failed",
+        retryPolicy.shouldRetry(windowRetryContext, 12));
+  }
+}

+ 9 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/NMMemoryStateStoreService.java

@@ -121,6 +121,7 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
       rcsCopy.startRequest = rcs.startRequest;
       rcsCopy.capability = rcs.capability;
       rcsCopy.setRemainingRetryAttempts(rcs.getRemainingRetryAttempts());
+      rcsCopy.setRestartTimes(rcs.getRestartTimes());
       rcsCopy.setWorkDir(rcs.getWorkDir());
       rcsCopy.setLogDir(rcs.getLogDir());
       rcsCopy.setResourceMappings(rcs.getResourceMappings());
@@ -212,6 +213,14 @@ public class NMMemoryStateStoreService extends NMStateStoreService {
     rcs.setRemainingRetryAttempts(remainingRetryAttempts);
   }
 
+  @Override
+  public void storeContainerRestartTimes(
+      ContainerId containerId, List<Long> restartTimes)
+      throws IOException {
+    RecoveredContainerState rcs = getRecoveredContainerState(containerId);
+    rcs.setRestartTimes(restartTimes);
+  }
+
   @Override
   public void storeContainerWorkDir(ContainerId containerId,
       String workDir) throws IOException {

+ 16 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/recovery/TestNMLeveldbStateStoreService.java

@@ -371,6 +371,7 @@ public class TestNMLeveldbStateStoreService {
     assertEquals("/test/workdir", rcs.getWorkDir());
     assertEquals("/test/logdir", rcs.getLogDir());
 
+    validateRetryAttempts(containerId);
     // remove the container and verify not recovered
     stateStore.removeContainer(containerId);
     restartStateStore();
@@ -378,6 +379,21 @@ public class TestNMLeveldbStateStoreService {
     assertTrue(recoveredContainers.isEmpty());
   }
 
+  private void validateRetryAttempts(ContainerId containerId)
+      throws IOException {
+    // store finishTimeForRetryAttempts
+    List<Long> finishTimeForRetryAttempts = Arrays.asList(1462700529039L,
+        1462700529050L, 1462700529120L);
+    stateStore.storeContainerRestartTimes(containerId,
+        finishTimeForRetryAttempts);
+    restartStateStore();
+    RecoveredContainerState rcs = stateStore.loadContainersState().get(0);
+    List<Long> recoveredRestartTimes = rcs.getRestartTimes();
+    assertEquals(1462700529039L, (long)recoveredRestartTimes.get(0));
+    assertEquals(1462700529050L, (long)recoveredRestartTimes.get(1));
+    assertEquals(1462700529120L, (long)recoveredRestartTimes.get(2));
+  }
+
   private StartContainerRequest createContainerRequest(
       ContainerId containerId) {
     LocalResource lrsrc = LocalResource.newInstance(