Browse Source

Fix for HADOOP-133. Retry pings from child to parent, in case of (local) communication problems. Also log exit status, so that one can distinguish patricide from other deaths. Contributed by Owen.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@395067 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 19 năm trước cách đây
mục cha
commit
4e31745424

+ 4 - 0
CHANGES.txt

@@ -54,6 +54,10 @@ Trunk (unreleased)
 
 15. Fix HADOOP-115.  Correct an error message.  (Stack via cutting)
 
+16. Fix HADOOP-133.  Retry pings from child to parent, in case of
+    (local) communication problems.  Also log exit status, so that one
+    can distinguish patricide from other deaths.  (omalley via cutting)
+
 
 Release 0.1.1 - 2006-04-08
 

+ 3 - 1
src/java/org/apache/hadoop/mapred/LocalJobRunner.java

@@ -154,7 +154,9 @@ class LocalJobRunner implements JobSubmissionProtocol {
       // Ignore for now
     }
 
-    public void ping(String taskid) throws IOException {}
+    public boolean ping(String taskid) throws IOException {
+      return true;
+    }
 
     public void done(String taskId) throws IOException {
       int taskIndex = mapIds.indexOf(taskId);

+ 4 - 3
src/java/org/apache/hadoop/mapred/TaskRunner.java

@@ -260,7 +260,6 @@ abstract class TaskRunner extends Thread {
   private void runChild(String[] args, File dir) throws IOException {
     this.process = Runtime.getRuntime().exec(args, null, dir);
     try {
-      StringBuffer errorBuf = new StringBuffer();
       new Thread() {
         public void run() {
           logStream(process.getErrorStream());    // copy log output
@@ -269,8 +268,10 @@ abstract class TaskRunner extends Thread {
         
       logStream(process.getInputStream());        // normally empty
       
-      if (this.process.waitFor() != 0) {
-        throw new IOException("Task process exit with nonzero status.");
+      int exit_code = process.waitFor();
+      if (exit_code != 0) {
+        throw new IOException("Task process exit with nonzero status of " +
+                              exit_code + ".");
       }
       
     } catch (InterruptedException e) {

+ 16 - 7
src/java/org/apache/hadoop/mapred/TaskTracker.java

@@ -653,10 +653,8 @@ public class TaskTracker implements MRConstants, TaskUmbilicalProtocol, MapOutpu
     }
 
     /** Child checking to see if we're alive.  Normally does nothing.*/
-    public synchronized void ping(String taskid) throws IOException {
-      if (tasks.get(taskid) == null) {
-        throw new IOException("No such task id."); // force child exit
-      }
+    public synchronized boolean ping(String taskid) throws IOException {
+      return tasks.get(taskid) != null;
     }
 
     /**
@@ -748,12 +746,23 @@ public class TaskTracker implements MRConstants, TaskUmbilicalProtocol, MapOutpu
                                          final String taskid) {
           Thread thread = new Thread(new Runnable() {
               public void run() {
+                final int MAX_RETRIES = 3;
+                int remainingRetries = MAX_RETRIES;
                 while (true) {
                   try {
-                    umbilical.ping(taskid);
+                    if (!umbilical.ping(taskid)) {
+                      LOG.log(Level.WARNING, "Parent died.  Exiting "+taskid);
+                      System.exit(66);
+                    }
+                    remainingRetries = MAX_RETRIES;
                   } catch (Throwable t) {
-                    LOG.log(Level.WARNING, "Parent died.  Exiting "+taskid, t);
-                    System.exit(1);
+                    String msg = StringUtils.stringifyException(t);
+                    LOG.info("Ping exception: " + msg);
+                    remainingRetries -=1;
+                    if (remainingRetries == 0) {
+                      LOG.log(Level.WARNING, "Last retry, killing "+taskid);
+                      System.exit(65);
+                    }
                   }
                   try {
                     Thread.sleep(1000);

+ 4 - 2
src/java/org/apache/hadoop/mapred/TaskUmbilicalProtocol.java

@@ -42,8 +42,10 @@ interface TaskUmbilicalProtocol {
    */
   void reportDiagnosticInfo(String taskid, String trace) throws IOException;
 
-  /** Periodically called by child to check if parent is still alive. */
-  void ping(String taskid) throws IOException;
+  /** Periodically called by child to check if parent is still alive. 
+   * @return True if the task is known
+   */
+  boolean ping(String taskid) throws IOException;
 
   /** Report that the task is successfully completed.  Failure is assumed if
    * the task process exits without calling this. */