Browse Source

HADOOP-654. Stop assigning tasks to a tasktracker if it has failed more than a specified number in the job. Contributed by Arun.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@510630 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 18 years ago
parent
commit
01b8b057bf

+ 4 - 0
CHANGES.txt

@@ -109,6 +109,10 @@ Trunk (unreleased changes)
     Also increase the value used from one to two seconds, in hopes of
     Also increase the value used from one to two seconds, in hopes of
     making tests complete more reliably.  (cutting)
     making tests complete more reliably.  (cutting)
 
 
+33. HADOOP-654.  Stop assigning tasks to a tasktracker if it has
+    failed more than a specified number in the job.
+    (Arun C Murthy via cutting)
+
 
 
 Release 0.11.2 - 2007-02-16
 Release 0.11.2 - 2007-02-16
 
 

+ 8 - 0
conf/hadoop-default.xml

@@ -747,6 +747,14 @@ creations/deletions), or "all".</description>
   hosts are excluded.</description>
   hosts are excluded.</description>
 </property> 
 </property> 
 
 
+<property>
+  <name>mapred.max.tracker.failures</name>
+  <value>4</value>
+  <description>The number of task-failures on a tasktracker of a given job 
+               after which new tasks of that job aren't assigned to it.
+  </description>
+</property>
+
 <property>
 <property>
   <name>jobclient.output.filter</name>
   <name>jobclient.output.filter</name>
   <value>FAILED</value>
   <value>FAILED</value>

+ 18 - 0
src/java/org/apache/hadoop/mapred/JobConf.java

@@ -532,6 +532,24 @@ public class JobConf extends Configuration {
     set("mapred.job.name", name);
     set("mapred.job.name", name);
   }
   }
   
   
+  /**
+   * Set the maximum no. of failures of a given job per tasktracker.
+   * 
+   * @param noFailures maximum no. of failures of a given job per tasktracker.
+   */
+  public void setMaxTaskFailuresPerTracker(int noFailures) {
+    setInt("mapred.max.tracker.failures", noFailures);
+  }
+  
+  /**
+   * Get the maximum no. of failures of a given job per tasktracker.
+   * 
+   * @return the maximum no. of failures of a given job per tasktracker.
+   */
+  public int getMaxTaskFailuresPerTracker() {
+    return getInt("mapred.max.tracker.failures", 4); 
+  }
+  
   /** Find a jar that contains a class of the same name, if any.
   /** Find a jar that contains a class of the same name, if any.
    * It will return a jar file, even if that is not the first thing
    * It will return a jar file, even if that is not the first thing
    * on the class path that has a class with the same name.
    * on the class path that has a class with the same name.

+ 90 - 3
src/java/org/apache/hadoop/mapred/JobInProgress.java

@@ -38,7 +38,7 @@ import java.util.*;
 ///////////////////////////////////////////////////////
 ///////////////////////////////////////////////////////
 class JobInProgress {
 class JobInProgress {
     private static final Log LOG = LogFactory.getLog("org.apache.hadoop.mapred.JobInProgress");
     private static final Log LOG = LogFactory.getLog("org.apache.hadoop.mapred.JobInProgress");
-
+    
     JobProfile profile;
     JobProfile profile;
     JobStatus status;
     JobStatus status;
     Path localJobFile = null;
     Path localJobFile = null;
@@ -57,8 +57,15 @@ class JobInProgress {
     JobTracker jobtracker = null;
     JobTracker jobtracker = null;
     Map<String,List<TaskInProgress>> hostToMaps = new HashMap();
     Map<String,List<TaskInProgress>> hostToMaps = new HashMap();
     private int taskCompletionEventTracker = 0 ; 
     private int taskCompletionEventTracker = 0 ; 
-    List<TaskCompletionEvent> taskCompletionEvents ; 
-
+    List<TaskCompletionEvent> taskCompletionEvents ;
+    
+    // The no. of tasktrackers where >= conf.getMaxTaskFailuresPerTracker()
+    // tasks have failed
+    private volatile int flakyTaskTrackers = 0;
+    // Map of trackerHostName -> no. of task failures
+    private Map<String, Integer> trackerToFailuresMap = 
+      new TreeMap<String, Integer>();
+    
     long startTime;
     long startTime;
     long finishTime;
     long finishTime;
 
 
@@ -102,6 +109,7 @@ class JobInProgress {
         this.numReduceTasks = conf.getNumReduceTasks();
         this.numReduceTasks = conf.getNumReduceTasks();
         this.taskCompletionEvents = new ArrayList(
         this.taskCompletionEvents = new ArrayList(
             numMapTasks + numReduceTasks + 10);
             numMapTasks + numReduceTasks + 10);
+        
         JobHistory.JobInfo.logSubmitted(jobid, conf.getJobName(), conf.getUser(), 
         JobHistory.JobInfo.logSubmitted(jobid, conf.getJobName(), conf.getUser(), 
             System.currentTimeMillis(), jobFile); 
             System.currentTimeMillis(), jobFile); 
         
         
@@ -373,6 +381,61 @@ class JobInProgress {
         return result;
         return result;
     }
     }
     
     
+    private String convertTrackerNameToHostName(String trackerName) {
+      // Ugly!
+      // Convert the trackerName to it's host name
+      int indexOfColon = trackerName.indexOf(":");
+      String trackerHostName = (indexOfColon == -1) ? 
+                                trackerName : 
+                                trackerName.substring(0, indexOfColon);
+      return trackerHostName;
+    }
+    
+    private void addTrackerTaskFailure(String trackerName) {
+      String trackerHostName = convertTrackerNameToHostName(trackerName);
+      
+      Integer trackerFailures = trackerToFailuresMap.get(trackerHostName);
+      if (trackerFailures == null) {
+        trackerFailures = new Integer(0);
+      }
+      trackerToFailuresMap.put(trackerHostName, ++trackerFailures);
+      
+      // Check if this tasktracker has turned 'flaky'
+      if (trackerFailures.intValue() == conf.getMaxTaskFailuresPerTracker()) {
+        ++flakyTaskTrackers;
+        LOG.info("TaskTracker at '" + trackerHostName + "' turned 'flaky'");
+      }
+    }
+    
+    private int getTrackerTaskFailures(String trackerName) {
+      String trackerHostName = convertTrackerNameToHostName(trackerName);
+      Integer failedTasks = trackerToFailuresMap.get(trackerHostName);
+      return (failedTasks != null) ? failedTasks.intValue() : 0; 
+    }
+    
+    /**
+     * Get the no. of 'flaky' tasktrackers for a given job.
+     * 
+     * @return the no. of 'flaky' tasktrackers for a given job.
+     */
+    int getNoOfBlackListedTrackers() {
+      return flakyTaskTrackers;
+    }
+    
+    /**
+     * Get the information on tasktrackers and no. of errors which occurred
+     * on them for a given job. 
+     * 
+     * @return the map of tasktrackers and no. of errors which occurred
+     *         on them for a given job. 
+     */
+    synchronized Map<String, Integer> getTaskTrackerErrors() {
+      // Clone the 'trackerToFailuresMap' and return the copy
+      Map<String, Integer> trackerErrors = 
+        new TreeMap<String, Integer>(trackerToFailuresMap);
+      return trackerErrors;
+    }
+    
     /**
     /**
      * Find a new task to run.
      * Find a new task to run.
      * @param tts The task tracker that is asking for a task
      * @param tts The task tracker that is asking for a task
@@ -389,6 +452,25 @@ class JobInProgress {
                             TaskInProgress[] tasks,
                             TaskInProgress[] tasks,
                             List cachedTasks) {
                             List cachedTasks) {
         String taskTracker = tts.getTrackerName();
         String taskTracker = tts.getTrackerName();
+
+        //
+        // Check if too many tasks of this job have failed on this
+        // tasktracker prior to assigning it a new one.
+        //
+        int taskTrackerFailedTasks = getTrackerTaskFailures(taskTracker);
+        if (taskTrackerFailedTasks >= conf.getMaxTaskFailuresPerTracker()) {
+          String flakyTracker = convertTrackerNameToHostName(taskTracker); 
+          if (flakyTaskTrackers < clusterSize) {
+            LOG.debug("Ignoring the black-listed tasktracker: '" + flakyTracker 
+                    + "' for assigning a new task");
+            return -1;
+          } else {
+            LOG.warn("Trying to assign a new task for black-listed tracker " + 
+                    flakyTracker + " since all task-trackers in the cluster are " +
+                    "'flaky' !");
+          }
+        }
+        
         //
         //
         // See if there is a split over a block that is stored on
         // See if there is a split over a block that is stored on
         // the TaskTracker checking in.  That means the block
         // the TaskTracker checking in.  That means the block
@@ -647,6 +729,11 @@ class JobInProgress {
           failedReduceTasks++; 
           failedReduceTasks++; 
         }
         }
             
             
+        //
+        // Note down that a task has failed on this tasktracker
+        //
+        addTrackerTaskFailure(trackerName);
+        
         //
         //
         // Let the JobTracker know that this task has failed
         // Let the JobTracker know that this task has failed
         //
         //

+ 60 - 0
src/webapps/job/jobblacklistedtrackers.jsp

@@ -0,0 +1,60 @@
+<%@ page
+  contentType="text/html; charset=UTF-8"
+  import="javax.servlet.*"
+  import="javax.servlet.http.*"
+  import="java.io.*"
+  import="java.util.*"
+  import="org.apache.hadoop.mapred.*"
+  import="org.apache.hadoop.util.*"
+%>
+
+<%!
+  JobTracker tracker = JobTracker.getTracker();
+  String trackerName = 
+           StringUtils.simpleHostname(tracker.getJobTrackerMachine());
+          
+  private void printBlackListedTrackers(JspWriter out, 
+                             JobInProgress job) throws IOException {
+    Map<String, Integer> trackerErrors = job.getTaskTrackerErrors();
+    out.print("<table border=2 cellpadding=\"5\" cellspacing=\"2\">");
+    out.print("<tr><th>TaskTracker</th><th>No. of Failures</th></tr>\n");
+    int maxErrorsPerTracker = job.getJobConf().getMaxTaskFailuresPerTracker();
+    for (Map.Entry<String,Integer> e : trackerErrors.entrySet()) {
+      if (e.getValue().intValue() >= maxErrorsPerTracker) {
+        out.print("<tr><td>" + e.getKey() + "</td><td>" + e.getValue() + 
+            "</td></tr>\n");
+      }
+    }
+    out.print("</table>\n");
+  }
+%>
+
+<%
+    String jobId = request.getParameter("jobid");
+    if (jobId == null) {
+  	  out.println("<h2>Missing 'jobid' for fetching black-listed tasktrackers!</h2>");
+  	  return;
+    }
+    
+    JobInProgress job = (JobInProgress) tracker.getJob(jobId);
+    if (job == null) {
+      out.print("<b>Job " + jobId + " not found.</b><br>\n");
+      return;
+    }
+%>
+
+<html>
+<title>Hadoop <%=jobId%>'s black-listed tasktrackers</title>
+<body>
+<h1>Hadoop <a href="/jobdetails.jsp?jobid=<%=jobId%>"><%=jobId%></a> - 
+Black-listed task-trackers</h1>
+
+<% 
+    printBlackListedTrackers(out, job); 
+%>
+
+<hr>
+<a href="/jobdetails.jsp?jobid=><%=jobId%>">Go back to <%=jobId%></a><br>
+<a href="http://lucene.apache.org/hadoop">Hadoop</a>, 2006.<br>
+</body>
+</html>

+ 6 - 0
src/webapps/job/jobdetails.jsp

@@ -60,6 +60,7 @@
     JobProfile profile = job.getProfile();
     JobProfile profile = job.getProfile();
     JobStatus status = job.getStatus();
     JobStatus status = job.getStatus();
     int runState = status.getRunState();
     int runState = status.getRunState();
+    int flakyTaskTrackers = job.getNoOfBlackListedTrackers();
     out.print("<b>User:</b> " + profile.getUser() + "<br>\n");
     out.print("<b>User:</b> " + profile.getUser() + "<br>\n");
     out.print("<b>Job Name:</b> " + profile.getJobName() + "<br>\n");
     out.print("<b>Job Name:</b> " + profile.getJobName() + "<br>\n");
     if (runState == JobStatus.RUNNING) {
     if (runState == JobStatus.RUNNING) {
@@ -80,6 +81,11 @@
       out.print("<b>Finished at:</b> " + new Date(job.getFinishTime()) +
       out.print("<b>Finished at:</b> " + new Date(job.getFinishTime()) +
                 "<br>\n");
                 "<br>\n");
     }
     }
+    if (flakyTaskTrackers > 0) {
+      out.print("<b>Black-listed TaskTrackers:</b> " + 
+          "<a href=\"/jobblacklistedtrackers.jsp?jobid=" + jobId + "\">" +
+          flakyTaskTrackers + "</a><br>\n");
+    }
     out.print("<hr>\n");
     out.print("<hr>\n");
     out.print("<table border=2 cellpadding=\"5\" cellspacing=\"2\">");
     out.print("<table border=2 cellpadding=\"5\" cellspacing=\"2\">");
     out.print("<tr><th>Kind</th><th>% Complete</th><th>Num Tasks</th>" +
     out.print("<tr><th>Kind</th><th>% Complete</th><th>Num Tasks</th>" +