瀏覽代碼

HADOOP-1610. Add metrics for failed tasks. Contributed by Devaraj Das.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@563270 13f79535-47bb-0310-9956-ffa450edef68
Thomas White 18 年之前
父節點
當前提交
9a978712fc
共有 3 個文件被更改,包括 28 次插入 和 2 次删除
  1. 3 0
      CHANGES.txt
  2. 3 0
      src/java/org/apache/hadoop/mapred/TaskRunner.java
  3. 22 2
      src/java/org/apache/hadoop/mapred/TaskTracker.java

+ 3 - 0
CHANGES.txt

@@ -18,6 +18,9 @@ Trunk (unreleased changes)
     easier to read.  Also remove numbering, to make merging easier.
     (cutting)
 
+    HADOOP-1610.  Add metrics for failed tasks.
+    (Devaraj Das via tomwhite)
+
   OPTIMIZATIONS
 
     HADOOP-1565.  Reduce memory usage of NameNode by replacing 

+ 3 - 0
src/java/org/apache/hadoop/mapred/TaskRunner.java

@@ -421,6 +421,9 @@ abstract class TaskRunner extends Thread {
       int exit_code = process.waitFor();
      
       if (!killed && exit_code != 0) {
+        if (exit_code == 65) {
+          tracker.getTaskTrackerMetrics().taskFailedPing();
+        }
         throw new IOException("Task process exit with nonzero status of " +
                               exit_code + ".");
       }

+ 22 - 2
src/java/org/apache/hadoop/mapred/TaskTracker.java

@@ -213,9 +213,11 @@ public class TaskTracker
       shuffleMetricsRecord.update();
     }
   }
-  private class TaskTrackerMetrics implements Updater {
+  public class TaskTrackerMetrics implements Updater {
     private MetricsRecord metricsRecord = null;
     private int numCompletedTasks = 0;
+    private int timedoutTasks = 0;
+    private int tasksFailedPing = 0;
       
     TaskTrackerMetrics() {
       JobConf conf = getJobConf();
@@ -232,6 +234,15 @@ public class TaskTracker
     synchronized void completeTask() {
       ++numCompletedTasks;
     }
+    
+    synchronized void timedoutTask() {
+      ++timedoutTasks;
+    }
+    
+    synchronized void taskFailedPing() {
+      ++tasksFailedPing;
+    }
+    
     /**
      * Since this object is a registered updater, this method will be called
      * periodically, e.g. every 5 seconds.
@@ -243,15 +254,23 @@ public class TaskTracker
           metricsRecord.setMetric("reduces_running", reduceTotal);
           metricsRecord.setMetric("taskSlots", (short)maxCurrentTasks);
           metricsRecord.incrMetric("tasks_completed", numCompletedTasks);
-          metricsRecord.update();
+          metricsRecord.incrMetric("tasks_failed_timeout", timedoutTasks);
+          metricsRecord.incrMetric("tasks_failed_ping", tasksFailedPing);
         }
         numCompletedTasks = 0;
+        timedoutTasks = 0;
+        tasksFailedPing = 0;
       }
+      metricsRecord.update();
     }
   }
     
   private TaskTrackerMetrics myMetrics = null;
 
+  public TaskTrackerMetrics getTaskTrackerMetrics() {
+    return myMetrics;
+  }
+  
   /**
    * A list of tips that should be cleaned up.
    */
@@ -991,6 +1010,7 @@ public class TaskTracker
           LOG.info(tip.getTask().getTaskId() + ": " + msg);
           ReflectionUtils.logThreadInfo(LOG, "lost task", 30);
           tip.reportDiagnosticInfo(msg);
+          myMetrics.timedoutTask();
           purgeTask(tip, true);
         }
       }