1
0
Selaa lähdekoodia

HADOOP-654. Stop assigning tasks to a tasktracker if it has failed more than a specified number in the job. Contributed by Arun.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@510630 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 18 vuotta sitten
vanhempi
commit
01b8b057bf

+ 4 - 0
CHANGES.txt

@@ -109,6 +109,10 @@ Trunk (unreleased changes)
     Also increase the value used from one to two seconds, in hopes of
     making tests complete more reliably.  (cutting)
 
+33. HADOOP-654.  Stop assigning tasks to a tasktracker if it has
+    failed more than a specified number in the job.
+    (Arun C Murthy via cutting)
+
 
 Release 0.11.2 - 2007-02-16
 

+ 8 - 0
conf/hadoop-default.xml

@@ -747,6 +747,14 @@ creations/deletions), or "all".</description>
   hosts are excluded.</description>
 </property> 
 
+<property>
+  <name>mapred.max.tracker.failures</name>
+  <value>4</value>
+  <description>The number of task-failures on a tasktracker of a given job 
+               after which new tasks of that job aren't assigned to it.
+  </description>
+</property>
+
 <property>
   <name>jobclient.output.filter</name>
   <value>FAILED</value>

+ 18 - 0
src/java/org/apache/hadoop/mapred/JobConf.java

@@ -532,6 +532,24 @@ public class JobConf extends Configuration {
     set("mapred.job.name", name);
   }
   
+  /**
+   * Set the maximum no. of failures of a given job per tasktracker.
+   * 
+   * @param noFailures maximum no. of failures of a given job per tasktracker.
+   */
+  public void setMaxTaskFailuresPerTracker(int noFailures) {
+    setInt("mapred.max.tracker.failures", noFailures);
+  }
+  
+  /**
+   * Get the maximum no. of failures of a given job per tasktracker.
+   * 
+   * @return the maximum no. of failures of a given job per tasktracker.
+   */
+  public int getMaxTaskFailuresPerTracker() {
+    return getInt("mapred.max.tracker.failures", 4); 
+  }
+  
   /** Find a jar that contains a class of the same name, if any.
    * It will return a jar file, even if that is not the first thing
    * on the class path that has a class with the same name.

+ 90 - 3
src/java/org/apache/hadoop/mapred/JobInProgress.java

@@ -38,7 +38,7 @@ import java.util.*;
 ///////////////////////////////////////////////////////
 class JobInProgress {
     private static final Log LOG = LogFactory.getLog("org.apache.hadoop.mapred.JobInProgress");
-
+    
     JobProfile profile;
     JobStatus status;
     Path localJobFile = null;
@@ -57,8 +57,15 @@ class JobInProgress {
     JobTracker jobtracker = null;
     Map<String,List<TaskInProgress>> hostToMaps = new HashMap();
     private int taskCompletionEventTracker = 0 ; 
-    List<TaskCompletionEvent> taskCompletionEvents ; 
-
+    List<TaskCompletionEvent> taskCompletionEvents ;
+    
+    // The no. of tasktrackers where >= conf.getMaxTaskFailuresPerTracker()
+    // tasks have failed
+    private volatile int flakyTaskTrackers = 0;
+    // Map of trackerHostName -> no. of task failures
+    private Map<String, Integer> trackerToFailuresMap = 
+      new TreeMap<String, Integer>();
+    
     long startTime;
     long finishTime;
 
@@ -102,6 +109,7 @@ class JobInProgress {
         this.numReduceTasks = conf.getNumReduceTasks();
         this.taskCompletionEvents = new ArrayList(
             numMapTasks + numReduceTasks + 10);
+        
         JobHistory.JobInfo.logSubmitted(jobid, conf.getJobName(), conf.getUser(), 
             System.currentTimeMillis(), jobFile); 
         
@@ -373,6 +381,61 @@ class JobInProgress {
         return result;
     }
     
+    private String convertTrackerNameToHostName(String trackerName) {
+      // Ugly!
+      // Convert the trackerName to it's host name
+      int indexOfColon = trackerName.indexOf(":");
+      String trackerHostName = (indexOfColon == -1) ? 
+                                trackerName : 
+                                trackerName.substring(0, indexOfColon);
+      return trackerHostName;
+    }
+    
+    private void addTrackerTaskFailure(String trackerName) {
+      String trackerHostName = convertTrackerNameToHostName(trackerName);
+      
+      Integer trackerFailures = trackerToFailuresMap.get(trackerHostName);
+      if (trackerFailures == null) {
+        trackerFailures = new Integer(0);
+      }
+      trackerToFailuresMap.put(trackerHostName, ++trackerFailures);
+      
+      // Check if this tasktracker has turned 'flaky'
+      if (trackerFailures.intValue() == conf.getMaxTaskFailuresPerTracker()) {
+        ++flakyTaskTrackers;
+        LOG.info("TaskTracker at '" + trackerHostName + "' turned 'flaky'");
+      }
+    }
+    
+    private int getTrackerTaskFailures(String trackerName) {
+      String trackerHostName = convertTrackerNameToHostName(trackerName);
+      Integer failedTasks = trackerToFailuresMap.get(trackerHostName);
+      return (failedTasks != null) ? failedTasks.intValue() : 0; 
+    }
+    
+    /**
+     * Get the no. of 'flaky' tasktrackers for a given job.
+     * 
+     * @return the no. of 'flaky' tasktrackers for a given job.
+     */
+    int getNoOfBlackListedTrackers() {
+      return flakyTaskTrackers;
+    }
+    
+    /**
+     * Get the information on tasktrackers and no. of errors which occurred
+     * on them for a given job. 
+     * 
+     * @return the map of tasktrackers and no. of errors which occurred
+     *         on them for a given job. 
+     */
+    synchronized Map<String, Integer> getTaskTrackerErrors() {
+      // Clone the 'trackerToFailuresMap' and return the copy
+      Map<String, Integer> trackerErrors = 
+        new TreeMap<String, Integer>(trackerToFailuresMap);
+      return trackerErrors;
+    }
+    
     /**
      * Find a new task to run.
      * @param tts The task tracker that is asking for a task
@@ -389,6 +452,25 @@ class JobInProgress {
                             TaskInProgress[] tasks,
                             List cachedTasks) {
         String taskTracker = tts.getTrackerName();
+
+        //
+        // Check if too many tasks of this job have failed on this
+        // tasktracker prior to assigning it a new one.
+        //
+        int taskTrackerFailedTasks = getTrackerTaskFailures(taskTracker);
+        if (taskTrackerFailedTasks >= conf.getMaxTaskFailuresPerTracker()) {
+          String flakyTracker = convertTrackerNameToHostName(taskTracker); 
+          if (flakyTaskTrackers < clusterSize) {
+            LOG.debug("Ignoring the black-listed tasktracker: '" + flakyTracker 
+                    + "' for assigning a new task");
+            return -1;
+          } else {
+            LOG.warn("Trying to assign a new task for black-listed tracker " + 
+                    flakyTracker + " since all task-trackers in the cluster are " +
+                    "'flaky' !");
+          }
+        }
+        
         //
         // See if there is a split over a block that is stored on
         // the TaskTracker checking in.  That means the block
@@ -647,6 +729,11 @@ class JobInProgress {
           failedReduceTasks++; 
         }
             
+        //
+        // Note down that a task has failed on this tasktracker
+        //
+        addTrackerTaskFailure(trackerName);
+        
         //
         // Let the JobTracker know that this task has failed
         //

+ 60 - 0
src/webapps/job/jobblacklistedtrackers.jsp

@@ -0,0 +1,60 @@
+<%@ page
+  contentType="text/html; charset=UTF-8"
+  import="javax.servlet.*"
+  import="javax.servlet.http.*"
+  import="java.io.*"
+  import="java.util.*"
+  import="org.apache.hadoop.mapred.*"
+  import="org.apache.hadoop.util.*"
+%>
+
+<%!
+  JobTracker tracker = JobTracker.getTracker();
+  String trackerName = 
+           StringUtils.simpleHostname(tracker.getJobTrackerMachine());
+          
+  private void printBlackListedTrackers(JspWriter out, 
+                             JobInProgress job) throws IOException {
+    Map<String, Integer> trackerErrors = job.getTaskTrackerErrors();
+    out.print("<table border=2 cellpadding=\"5\" cellspacing=\"2\">");
+    out.print("<tr><th>TaskTracker</th><th>No. of Failures</th></tr>\n");
+    int maxErrorsPerTracker = job.getJobConf().getMaxTaskFailuresPerTracker();
+    for (Map.Entry<String,Integer> e : trackerErrors.entrySet()) {
+      if (e.getValue().intValue() >= maxErrorsPerTracker) {
+        out.print("<tr><td>" + e.getKey() + "</td><td>" + e.getValue() + 
+            "</td></tr>\n");
+      }
+    }
+    out.print("</table>\n");
+  }
+%>
+
+<%
+    String jobId = request.getParameter("jobid");
+    if (jobId == null) {
+  	  out.println("<h2>Missing 'jobid' for fetching black-listed tasktrackers!</h2>");
+  	  return;
+    }
+    
+    JobInProgress job = (JobInProgress) tracker.getJob(jobId);
+    if (job == null) {
+      out.print("<b>Job " + jobId + " not found.</b><br>\n");
+      return;
+    }
+%>
+
+<html>
+<title>Hadoop <%=jobId%>'s black-listed tasktrackers</title>
+<body>
+<h1>Hadoop <a href="/jobdetails.jsp?jobid=<%=jobId%>"><%=jobId%></a> - 
+Black-listed task-trackers</h1>
+
+<% 
+    printBlackListedTrackers(out, job); 
+%>
+
+<hr>
+<a href="/jobdetails.jsp?jobid=><%=jobId%>">Go back to <%=jobId%></a><br>
+<a href="http://lucene.apache.org/hadoop">Hadoop</a>, 2006.<br>
+</body>
+</html>

+ 6 - 0
src/webapps/job/jobdetails.jsp

@@ -60,6 +60,7 @@
     JobProfile profile = job.getProfile();
     JobStatus status = job.getStatus();
     int runState = status.getRunState();
+    int flakyTaskTrackers = job.getNoOfBlackListedTrackers();
     out.print("<b>User:</b> " + profile.getUser() + "<br>\n");
     out.print("<b>Job Name:</b> " + profile.getJobName() + "<br>\n");
     if (runState == JobStatus.RUNNING) {
@@ -80,6 +81,11 @@
       out.print("<b>Finished at:</b> " + new Date(job.getFinishTime()) +
                 "<br>\n");
     }
+    if (flakyTaskTrackers > 0) {
+      out.print("<b>Black-listed TaskTrackers:</b> " + 
+          "<a href=\"/jobblacklistedtrackers.jsp?jobid=" + jobId + "\">" +
+          flakyTaskTrackers + "</a><br>\n");
+    }
     out.print("<hr>\n");
     out.print("<table border=2 cellpadding=\"5\" cellspacing=\"2\">");
     out.print("<tr><th>Kind</th><th>% Complete</th><th>Num Tasks</th>" +