Browse code

YARN-7697. NM goes down with OOM due to leak in log-aggregation. (Xuan Gong via wangda)

Change-Id: Ie4fc7979d834e25f37a033c314f3efceeb8f4a9e
Wangda Tan, 7 years ago
Parent
Current commit
d4c98579e3

+ 5 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/filecontroller/LogAggregationFileController.java

@@ -226,10 +226,12 @@ public abstract class LogAggregationFileController {
    * Returns the owner of the application.
    *
    * @param aggregatedLogPath the aggregatedLog path
+   * @param appId the ApplicationId
    * @return the application owner
    * @throws IOException if we can not get the application owner
    */
-  public abstract String getApplicationOwner(Path aggregatedLogPath)
+  public abstract String getApplicationOwner(Path aggregatedLogPath,
+      ApplicationId appId)
       throws IOException;
 
   /**
@@ -237,11 +239,12 @@ public abstract class LogAggregationFileController {
    * found.
    *
    * @param aggregatedLogPath the aggregatedLog path.
+   * @param appId the ApplicationId
    * @return a map of the Application ACLs.
    * @throws IOException if we can not get the application acls
    */
   public abstract Map<ApplicationAccessType, String> getApplicationAcls(
-      Path aggregatedLogPath) throws IOException;
+      Path aggregatedLogPath, ApplicationId appId) throws IOException;
 
   /**
    * Verify and create the remote log directory.

+ 1 - 1
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/filecontroller/ifile/IndexedFileAggregatedLogsBlock.java

@@ -135,7 +135,7 @@ public class IndexedFileAggregatedLogsBlock extends LogAggregationHtmlBlock {
         IndexedLogsMeta indexedLogsMeta = null;
         try {
           indexedLogsMeta = fileController.loadIndexedLogsMeta(
-              thisNodeFile.getPath(), endIndex);
+              thisNodeFile.getPath(), endIndex, appId);
         } catch (Exception ex) {
           // DO NOTHING
           LOG.warn("Can not load log meta from the log file:"

+ 46 - 23
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/filecontroller/ifile/LogAggregationIndexedFileController.java

@@ -284,16 +284,8 @@ public class LogAggregationIndexedFileController
                 currentRemoteLogFile.getName())) {
               overwriteCheckSum = false;
               long endIndex = checksumFileInputStream.readLong();
-              IndexedLogsMeta recoveredLogsMeta = null;
-              try {
-                truncateFileWithRetries(fc, currentRemoteLogFile,
-                    endIndex);
-                recoveredLogsMeta = loadIndexedLogsMeta(
-                    currentRemoteLogFile);
-              } catch (Exception ex) {
-                recoveredLogsMeta = loadIndexedLogsMeta(
-                    currentRemoteLogFile, endIndex);
-              }
+              IndexedLogsMeta recoveredLogsMeta = loadIndexedLogsMeta(
+                  currentRemoteLogFile, endIndex, appId);
               if (recoveredLogsMeta != null) {
                 indexedLogsMeta = recoveredLogsMeta;
               }
@@ -524,11 +516,11 @@ public class LogAggregationIndexedFileController
       IndexedLogsMeta indexedLogsMeta = null;
       try {
         indexedLogsMeta = loadIndexedLogsMeta(thisNodeFile.getPath(),
-            endIndex);
+            endIndex, appId);
       } catch (Exception ex) {
         // DO NOTHING
         LOG.warn("Can not load log meta from the log file:"
-            + thisNodeFile.getPath());
+            + thisNodeFile.getPath() + "\n" + ex.getMessage());
         continue;
       }
       if (indexedLogsMeta == null) {
@@ -636,14 +628,14 @@ public class LogAggregationIndexedFileController
           endIndex = checkSumIndex.longValue();
         }
         IndexedLogsMeta current = loadIndexedLogsMeta(
-            thisNodeFile.getPath(), endIndex);
+            thisNodeFile.getPath(), endIndex, appId);
         if (current != null) {
           listOfLogsMeta.add(current);
         }
       } catch (IOException ex) {
         // DO NOTHING
         LOG.warn("Can not get log meta from the log file:"
-            + thisNodeFile.getPath());
+            + thisNodeFile.getPath() + "\n" + ex.getMessage());
       }
     }
     for (IndexedLogsMeta indexedLogMeta : listOfLogsMeta) {
@@ -721,6 +713,7 @@ public class LogAggregationIndexedFileController
           checkSumFiles.put(nodeName, Long.valueOf(index));
         }
       } catch (IOException ex) {
+        LOG.warn(ex.getMessage());
         continue;
       } finally {
         IOUtils.cleanupWithLogger(LOG, checksumFileInputStream);
@@ -773,25 +766,26 @@ public class LogAggregationIndexedFileController
   }
 
   @Override
-  public String getApplicationOwner(Path aggregatedLogPath)
+  public String getApplicationOwner(Path aggregatedLogPath,
+      ApplicationId appId)
       throws IOException {
     if (this.cachedIndexedLogsMeta == null
         || !this.cachedIndexedLogsMeta.getRemoteLogPath()
             .equals(aggregatedLogPath)) {
       this.cachedIndexedLogsMeta = new CachedIndexedLogsMeta(
-          loadIndexedLogsMeta(aggregatedLogPath), aggregatedLogPath);
+          loadIndexedLogsMeta(aggregatedLogPath, appId), aggregatedLogPath);
     }
     return this.cachedIndexedLogsMeta.getCachedIndexedLogsMeta().getUser();
   }
 
   @Override
   public Map<ApplicationAccessType, String> getApplicationAcls(
-      Path aggregatedLogPath) throws IOException {
+      Path aggregatedLogPath, ApplicationId appId) throws IOException {
     if (this.cachedIndexedLogsMeta == null
         || !this.cachedIndexedLogsMeta.getRemoteLogPath()
             .equals(aggregatedLogPath)) {
       this.cachedIndexedLogsMeta = new CachedIndexedLogsMeta(
-          loadIndexedLogsMeta(aggregatedLogPath), aggregatedLogPath);
+          loadIndexedLogsMeta(aggregatedLogPath, appId), aggregatedLogPath);
     }
     return this.cachedIndexedLogsMeta.getCachedIndexedLogsMeta().getAcls();
   }
@@ -804,8 +798,8 @@ public class LogAggregationIndexedFileController
   }
 
   @Private
-  public IndexedLogsMeta loadIndexedLogsMeta(Path remoteLogPath, long end)
-      throws IOException {
+  public IndexedLogsMeta loadIndexedLogsMeta(Path remoteLogPath, long end,
+      ApplicationId appId) throws IOException {
     FileContext fileContext =
         FileContext.getFileContext(remoteLogPath.toUri(), conf);
     FSDataInputStream fsDataIStream = null;
@@ -816,8 +810,36 @@ public class LogAggregationIndexedFileController
       }
       long fileLength = end < 0 ? fileContext.getFileStatus(
           remoteLogPath).getLen() : end;
+
       fsDataIStream.seek(fileLength - Integer.SIZE/ Byte.SIZE - UUID_LENGTH);
       int offset = fsDataIStream.readInt();
+      // If the offset/log meta size is larger than 64M,
+      // output a warn message for better debug.
+      if (offset > 64 * 1024 * 1024) {
+        LOG.warn("The log meta size read from " + remoteLogPath
+            + " is " + offset);
+      }
+
+      // Load UUID and make sure the UUID is correct.
+      byte[] uuidRead = new byte[UUID_LENGTH];
+      int uuidReadLen = fsDataIStream.read(uuidRead);
+      if (this.uuid == null) {
+        this.uuid = createUUID(appId);
+      }
+      if (uuidReadLen != UUID_LENGTH || !Arrays.equals(this.uuid, uuidRead)) {
+        if (LOG.isDebugEnabled()) {
+          LOG.debug("the length of loaded UUID:" + uuidReadLen);
+          LOG.debug("the loaded UUID:" + new String(uuidRead,
+              Charset.forName("UTF-8")));
+          LOG.debug("the expected UUID:" + new String(this.uuid,
+              Charset.forName("UTF-8")));
+        }
+        throw new IOException("The UUID from "
+            + remoteLogPath + " is not correct. The offset of loaded UUID is "
+            + (fileLength - UUID_LENGTH));
+      }
+
+      // Load Log Meta
       byte[] array = new byte[offset];
       fsDataIStream.seek(
           fileLength - offset - Integer.SIZE/ Byte.SIZE - UUID_LENGTH);
@@ -833,9 +855,9 @@ public class LogAggregationIndexedFileController
     }
   }
 
-  private IndexedLogsMeta loadIndexedLogsMeta(Path remoteLogPath)
-      throws IOException {
-    return loadIndexedLogsMeta(remoteLogPath, -1);
+  private IndexedLogsMeta loadIndexedLogsMeta(Path remoteLogPath,
+      ApplicationId appId) throws IOException {
+    return loadIndexedLogsMeta(remoteLogPath, -1, appId);
   }
 
   /**
@@ -1040,6 +1062,7 @@ public class LogAggregationIndexedFileController
         this.out = compressAlgo.createCompressionStream(
             fsBufferedOutput, compressor, 0);
       } catch (IOException e) {
+        LOG.warn(e.getMessage());
         compressAlgo.returnCompressor(compressor);
         throw e;
       }

+ 3 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/filecontroller/tfile/LogAggregationTFileController.java

@@ -335,14 +335,15 @@ public class LogAggregationTFileController
   }
 
   @Override
-  public String getApplicationOwner(Path aggregatedLog) throws IOException {
+  public String getApplicationOwner(Path aggregatedLog, ApplicationId appId)
+      throws IOException {
     createTFileLogReader(aggregatedLog);
     return this.tfReader.getLogReader().getApplicationOwner();
   }
 
   @Override
   public Map<ApplicationAccessType, String> getApplicationAcls(
-      Path aggregatedLog) throws IOException {
+      Path aggregatedLog, ApplicationId appId) throws IOException {
     createTFileLogReader(aggregatedLog);
     return this.tfReader.getLogReader().getApplicationAcls();
   }

+ 3 - 2
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/logaggregation/filecontroller/TestLogAggregationFileControllerFactory.java

@@ -194,14 +194,15 @@ public class TestLogAggregationFileControllerFactory {
     }
 
     @Override
-    public String getApplicationOwner(Path aggregatedLogPath)
+    public String getApplicationOwner(Path aggregatedLogPath,
+        ApplicationId appId)
         throws IOException {
       return null;
     }
 
     @Override
     public Map<ApplicationAccessType, String> getApplicationAcls(
-        Path aggregatedLogPath) throws IOException {
+        Path aggregatedLogPath, ApplicationId appId) throws IOException {
       return null;
     }
   }

+ 21 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/test/java/org/apache/hadoop/yarn/logaggregation/filecontroller/ifile/TestLogAggregationIndexFileController.java

@@ -55,7 +55,9 @@ import org.apache.hadoop.yarn.logaggregation.LogAggregationUtils;
 import org.apache.hadoop.yarn.logaggregation.AggregatedLogFormat.LogKey;
 import org.apache.hadoop.yarn.logaggregation.AggregatedLogFormat.LogValue;
 import org.apache.hadoop.yarn.logaggregation.ContainerLogFileInfo;
+import org.apache.hadoop.yarn.logaggregation.filecontroller.LogAggregationFileController;
 import org.apache.hadoop.yarn.logaggregation.filecontroller.LogAggregationFileControllerContext;
+import org.apache.hadoop.yarn.logaggregation.filecontroller.LogAggregationFileControllerFactory;
 import org.apache.hadoop.yarn.util.Clock;
 import org.apache.hadoop.yarn.util.ControlledClock;
 import org.junit.After;
@@ -219,6 +221,25 @@ public class TestLogAggregationIndexFileController {
     }
     sysOutStream.reset();
 
+    Configuration factoryConf = new Configuration(conf);
+    factoryConf.set("yarn.log-aggregation.file-formats", "Indexed");
+    factoryConf.set("yarn.log-aggregation.file-controller.Indexed.class",
+        "org.apache.hadoop.yarn.logaggregation.filecontroller.ifile"
+        + ".LogAggregationIndexedFileController");
+    LogAggregationFileControllerFactory factory =
+        new LogAggregationFileControllerFactory(factoryConf);
+    LogAggregationFileController fileController = factory
+        .getFileControllerForRead(appId, USER_UGI.getShortUserName());
+    Assert.assertTrue(fileController instanceof
+        LogAggregationIndexedFileController);
+    foundLogs = fileController.readAggregatedLogs(logRequest, System.out);
+    Assert.assertTrue(foundLogs);
+    for (String logType : logTypes) {
+      Assert.assertTrue(sysOutStream.toString().contains(logMessage(
+          containerId, logType)));
+    }
+    sysOutStream.reset();
+
     // create a checksum file
     Path checksumFile = new Path(fileFormat.getRemoteAppLogDir(
         appId, USER_UGI.getShortUserName()),