Browse Source

YARN-364. AggregatedLogDeletionService can take too long to delete logs. Contributed by Jason Lowe

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1441239 13f79535-47bb-0310-9956-ffa450edef68
Jason Darrell Lowe 12 years ago
parent
commit
f811198164

+ 7 - 0
hadoop-common-project/hadoop-common/src/site/apt/ClusterSetup.apt.vm

@@ -325,6 +325,13 @@ Hadoop MapReduce Next Generation - Cluster Setup
 | | | How long to keep aggregation logs before deleting them. -1 disables. |
 | | | Be careful, set this too small and you will spam the name node. |
 *-------------------------+-------------------------+------------------------+
+| <<<yarn.log-aggregation.retain-check-interval-seconds>>> | | |
+| | <-1> | |
+| | | Time between checks for aggregated log retention. If set to 0 or a |
+| | | negative value then the value is computed as one-tenth of the |
+| | | aggregated log retention time. |
+| | | Be careful, set this too small and you will spam the name node. |
+*-------------------------+-------------------------+------------------------+
 
 
 

+ 3 - 0
hadoop-yarn-project/CHANGES.txt

@@ -284,6 +284,9 @@ Release 0.23.7 - UNRELEASED
     YARN-343. Capacity Scheduler maximum-capacity value -1 is invalid (Xuan 
     Gong via tgraves)
 
+    YARN-364. AggregatedLogDeletionService can take too long to delete logs
+    (jlowe)
+
 Release 0.23.6 - UNRELEASED
 
   INCOMPATIBLE CHANGES

+ 9 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java

@@ -379,6 +379,15 @@ public class YarnConfiguration extends Configuration {
       + "log-aggregation.retain-seconds";
   public static final long DEFAULT_LOG_AGGREGATION_RETAIN_SECONDS = -1;
   
+  /**
+   * How long to wait between aggregated log retention checks. If set to
+   * a value <= 0 then the value is computed as one-tenth of the log retention
+   * setting. Be careful set this too small and you will spam the name node.
+   */
+  public static final String LOG_AGGREGATION_RETAIN_CHECK_INTERVAL_SECONDS =
+      YARN_PREFIX + "log-aggregation.retain-check-interval-seconds";
+  public static final long DEFAULT_LOG_AGGREGATION_RETAIN_CHECK_INTERVAL_SECONDS = -1;
+
   /**
    * Number of seconds to retain logs on the NodeManager. Only applicable if Log
    * aggregation is disabled

+ 8 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/AggregatedLogDeletionService.java

@@ -140,9 +140,16 @@ public class AggregatedLogDeletionService extends AbstractService {
       		" too small (" + retentionSecs + ")");
       return;
     }
+    long checkIntervalMsecs = 1000 * conf.getLong(
+        YarnConfiguration.LOG_AGGREGATION_RETAIN_CHECK_INTERVAL_SECONDS,
+        YarnConfiguration.DEFAULT_LOG_AGGREGATION_RETAIN_CHECK_INTERVAL_SECONDS);
+    if (checkIntervalMsecs <= 0) {
+      // when unspecified compute check interval as 1/10th of retention
+      checkIntervalMsecs = (retentionSecs * 1000) / 10;
+    }
     TimerTask task = new LogDeletionTask(conf, retentionSecs);
     timer = new Timer();
-    timer.scheduleAtFixedRate(task, 0, retentionSecs * 1000);
+    timer.scheduleAtFixedRate(task, 0, checkIntervalMsecs);
     super.start();
   }
 

+ 9 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml

@@ -410,6 +410,15 @@
     <value>-1</value>
   </property> 
   
+  <property>
+    <description>How long to wait between aggregated log retention checks.
+    If set to 0 or a negative value then the value is computed as one-tenth
+    of the aggregated log retention time. Be careful set this too small and
+    you will spam the name node.</description>
+    <name>yarn.log-aggregation.retain-check-interval-seconds</name>
+    <value>-1</value>
+  </property>
+
   <property>
     <description>Time in seconds to retain user logs. Only applicable if
     log aggregation is disabled

+ 71 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/logaggregation/TestAggregatedLogDeletionService.java

@@ -28,12 +28,19 @@ import org.apache.hadoop.fs.FilterFileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.security.AccessControlException;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.junit.Before;
 import org.junit.Test;
 
 import static org.mockito.Mockito.*;
 
 public class TestAggregatedLogDeletionService {
   
+  @Before
+  public void closeFilesystems() throws IOException {
+    // prevent the same mockfs instance from being reused due to FS cache
+    FileSystem.closeAll();
+  }
+
   @Test
   public void testDeletion() throws Exception {
     long now = System.currentTimeMillis();
@@ -121,6 +128,70 @@ public class TestAggregatedLogDeletionService {
     verify(mockFs).delete(app4Dir, true);
   }
 
+  @Test
+  public void testCheckInterval() throws Exception {
+    long RETENTION_SECS = 10 * 24 * 3600;
+    long now = System.currentTimeMillis();
+    long toDeleteTime = now - RETENTION_SECS*1000;
+
+    String root = "mockfs://foo/";
+    String remoteRootLogDir = root+"tmp/logs";
+    String suffix = "logs";
+    Configuration conf = new Configuration();
+    conf.setClass("fs.mockfs.impl", MockFileSystem.class, FileSystem.class);
+    conf.set(YarnConfiguration.LOG_AGGREGATION_ENABLED, "true");
+    conf.set(YarnConfiguration.LOG_AGGREGATION_RETAIN_SECONDS, "864000");
+    conf.set(YarnConfiguration.LOG_AGGREGATION_RETAIN_CHECK_INTERVAL_SECONDS, "1");
+    conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR, remoteRootLogDir);
+    conf.set(YarnConfiguration.NM_REMOTE_APP_LOG_DIR_SUFFIX, suffix);
+
+    // prevent us from picking up the same mockfs instance from another test
+    FileSystem.closeAll();
+    Path rootPath = new Path(root);
+    FileSystem rootFs = rootPath.getFileSystem(conf);
+    FileSystem mockFs = ((FilterFileSystem)rootFs).getRawFileSystem();
+
+    Path remoteRootLogPath = new Path(remoteRootLogDir);
+
+    Path userDir = new Path(remoteRootLogPath, "me");
+    FileStatus userDirStatus = new FileStatus(0, true, 0, 0, now, userDir);
+
+    when(mockFs.listStatus(remoteRootLogPath)).thenReturn(
+        new FileStatus[]{userDirStatus});
+
+    Path userLogDir = new Path(userDir, suffix);
+    Path app1Dir = new Path(userLogDir, "application_1_1");
+    FileStatus app1DirStatus = new FileStatus(0, true, 0, 0, now, app1Dir);
+
+    when(mockFs.listStatus(userLogDir)).thenReturn(
+        new FileStatus[]{app1DirStatus});
+
+    Path app1Log1 = new Path(app1Dir, "host1");
+    FileStatus app1Log1Status = new FileStatus(10, false, 1, 1, now, app1Log1);
+
+    when(mockFs.listStatus(app1Dir)).thenReturn(
+        new FileStatus[]{app1Log1Status});
+
+    AggregatedLogDeletionService deletionSvc =
+        new AggregatedLogDeletionService();
+    deletionSvc.init(conf);
+    deletionSvc.start();
+
+    verify(mockFs, timeout(10000).atLeast(4)).listStatus(any(Path.class));
+    verify(mockFs, never()).delete(app1Dir, true);
+
+    // modify the timestamp of the logs and verify it's picked up quickly
+    app1DirStatus = new FileStatus(0, true, 0, 0, toDeleteTime, app1Dir);
+    app1Log1Status = new FileStatus(10, false, 1, 1, toDeleteTime, app1Log1);
+    when(mockFs.listStatus(userLogDir)).thenReturn(
+        new FileStatus[]{app1DirStatus});
+    when(mockFs.listStatus(app1Dir)).thenReturn(
+        new FileStatus[]{app1Log1Status});
+
+    verify(mockFs, timeout(10000)).delete(app1Dir, true);
+
+    deletionSvc.stop();
+  }
   
   static class MockFileSystem extends FilterFileSystem {
     MockFileSystem() {