Browse Source

YARN-2313. Livelock can occur in FairScheduler when there are lots of running apps (Tsuyoshi Ozawa via Sandy Ryza)

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1612769 13f79535-47bb-0310-9956-ffa450edef68
Sanford Ryza 10 years ago
parent
commit
c88402f36d

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -91,6 +91,9 @@ Release 2.6.0 - UNRELEASED
     YARN-2273. NPE in ContinuousScheduling thread when we lose a node. 
     YARN-2273. NPE in ContinuousScheduling thread when we lose a node. 
     (Wei Yan via kasha)
     (Wei Yan via kasha)
 
 
+    YARN-2313. Livelock can occur in FairScheduler when there are lots of
+    running apps (Tsuyoshi Ozawa via Sandy Ryza)
+
 Release 2.5.0 - UNRELEASED
 Release 2.5.0 - UNRELEASED
 
 
   INCOMPATIBLE CHANGES
   INCOMPATIBLE CHANGES

+ 6 - 0
hadoop-yarn-project/hadoop-yarn/dev-support/findbugs-exclude.xml

@@ -194,6 +194,12 @@
     <Field name="scheduleAsynchronously" />
     <Field name="scheduleAsynchronously" />
     <Bug pattern="IS2_INCONSISTENT_SYNC" />
     <Bug pattern="IS2_INCONSISTENT_SYNC" />
   </Match>
   </Match>
+  <!-- Inconsistent sync warning - updateInterval is only initialized once and never changed -->
+  <Match>
+    <Class name="org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler" />
+    <Field name="updateInterval" />
+    <Bug pattern="IS2_INCONSISTENT_SYNC" />
+  </Match>
   <!-- Inconsistent sync warning - numRetries is only initialized once and never changed -->
   <!-- Inconsistent sync warning - numRetries is only initialized once and never changed -->
   <Match>
   <Match>
     <Class name="org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore" />
     <Class name="org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore" />

+ 12 - 3
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairScheduler.java

@@ -135,7 +135,7 @@ public class FairScheduler extends
   public static final Resource CONTAINER_RESERVED = Resources.createResource(-1);
   public static final Resource CONTAINER_RESERVED = Resources.createResource(-1);
 
 
   // How often fair shares are re-calculated (ms)
   // How often fair shares are re-calculated (ms)
-  protected long UPDATE_INTERVAL = 500;
+  protected long updateInterval;
   private final int UPDATE_DEBUG_FREQUENCY = 5;
   private final int UPDATE_DEBUG_FREQUENCY = 5;
   private int updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY;
   private int updatesToSkipForDebug = UPDATE_DEBUG_FREQUENCY;
 
 
@@ -244,13 +244,13 @@ public class FairScheduler extends
 
 
   /**
   /**
    * A runnable which calls {@link FairScheduler#update()} every
    * A runnable which calls {@link FairScheduler#update()} every
-   * <code>UPDATE_INTERVAL</code> milliseconds.
+   * <code>updateInterval</code> milliseconds.
    */
    */
   private class UpdateThread implements Runnable {
   private class UpdateThread implements Runnable {
     public void run() {
     public void run() {
       while (true) {
       while (true) {
         try {
         try {
-          Thread.sleep(UPDATE_INTERVAL);
+          Thread.sleep(updateInterval);
           update();
           update();
           preemptTasksIfNecessary();
           preemptTasksIfNecessary();
         } catch (Exception e) {
         } catch (Exception e) {
@@ -1206,6 +1206,15 @@ public class FairScheduler extends
     waitTimeBeforeKill = this.conf.getWaitTimeBeforeKill();
     waitTimeBeforeKill = this.conf.getWaitTimeBeforeKill();
     usePortForNodeName = this.conf.getUsePortForNodeName();
     usePortForNodeName = this.conf.getUsePortForNodeName();
 
 
+    updateInterval = this.conf.getUpdateInterval();
+    if (updateInterval < 0) {
+      updateInterval = FairSchedulerConfiguration.DEFAULT_UPDATE_INTERVAL_MS;
+      LOG.warn(FairSchedulerConfiguration.UPDATE_INTERVAL_MS
+              + " is invalid, so using default value " +
+              + FairSchedulerConfiguration.DEFAULT_UPDATE_INTERVAL_MS
+              + " ms instead");
+    }
+
     rootMetrics = FSQueueMetrics.forQueue("root", null, true, conf);
     rootMetrics = FSQueueMetrics.forQueue("root", null, true, conf);
     // This stores per-application scheduling information
     // This stores per-application scheduling information
     this.applications =
     this.applications =

+ 9 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/FairSchedulerConfiguration.java

@@ -123,6 +123,11 @@ public class FairSchedulerConfiguration extends Configuration {
   protected static final String MAX_ASSIGN = CONF_PREFIX + "max.assign";
   protected static final String MAX_ASSIGN = CONF_PREFIX + "max.assign";
   protected static final int DEFAULT_MAX_ASSIGN = -1;
   protected static final int DEFAULT_MAX_ASSIGN = -1;
 
 
+  /** The update interval for calculating resources in FairScheduler .*/
+  public static final String UPDATE_INTERVAL_MS =
+      CONF_PREFIX + "update-interval-ms";
+  public static final int DEFAULT_UPDATE_INTERVAL_MS = 500;
+
   public FairSchedulerConfiguration() {
   public FairSchedulerConfiguration() {
     super();
     super();
   }
   }
@@ -246,6 +251,10 @@ public class FairSchedulerConfiguration extends Configuration {
           "Error reading resource config", ex);
           "Error reading resource config", ex);
     }
     }
   }
   }
+
+  public long getUpdateInterval() {
+    return getLong(UPDATE_INTERVAL_MS, DEFAULT_UPDATE_INTERVAL_MS);
+  }
   
   
   private static int findResource(String val, String units)
   private static int findResource(String val, String units)
     throws AllocationConfigurationException {
     throws AllocationConfigurationException {

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/fair/TestFairSchedulerPreemption.java

@@ -94,7 +94,7 @@ public class TestFairSchedulerPreemption extends FairSchedulerTestBase {
     scheduler = (FairScheduler)resourceManager.getResourceScheduler();
     scheduler = (FairScheduler)resourceManager.getResourceScheduler();
 
 
     scheduler.setClock(clock);
     scheduler.setClock(clock);
-    scheduler.UPDATE_INTERVAL = 60 * 1000;
+    scheduler.updateInterval = 60 * 1000;
   }
   }
 
 
   private void registerNodeAndSubmitApp(
   private void registerNodeAndSubmitApp(

+ 6 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/apt/FairScheduler.apt.vm

@@ -205,6 +205,12 @@ Properties that can be placed in yarn-site.xml
       instead. Defaults to true. If a queue placement policy is given in the
       instead. Defaults to true. If a queue placement policy is given in the
       allocations file, this property is ignored.
       allocations file, this property is ignored.
 
 
+ * <<<yarn.scheduler.fair.update-interval-ms>>>
+ 
+    * The interval at which to lock the scheduler and recalculate fair shares,
+      recalculate demand, and check whether anything is due for preemption.
+      Defaults to 500 ms. 
+
 Allocation file format
 Allocation file format
 
 
   The allocation file must be in XML format. The format contains five types of
   The allocation file must be in XML format. The format contains five types of