فهرست منبع

Fix HADOOP-174. Make job client try up to five times to contact job tracker before aborting a job. Contributed by Owen.

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@398014 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 19 سال پیش
والد
کامیت
9eb0886585
3 فایلهای تغییر یافته به همراه 19 افزوده شده و 3 حذف شده
  1. 4 0
      CHANGES.txt
  2. 1 1
      src/java/org/apache/hadoop/ipc/Client.java
  3. 14 2
      src/java/org/apache/hadoop/mapred/JobClient.java

+ 4 - 0
CHANGES.txt

@@ -124,6 +124,10 @@ Trunk (unreleased)
 33. NUTCH-256.  Change FileSystem#createNewFile() to create a .crc
     file.  The lack of a .crc file was causing warnings.  (cutting)
 
+34. HADOOP-174.  Change JobClient to not abort job until it has failed
+    to contact the job tracker for five attempts, not just one as
+    before.  (omalley via cutting)
+
 
 Release 0.1.1 - 2006-04-08
 

+ 1 - 1
src/java/org/apache/hadoop/ipc/Client.java

@@ -302,7 +302,7 @@ public class Client {
       if (call.error != null) {
         throw call.error;
       } else if (!call.done) {
-        throw new IOException("timed out waiting for response");
+        throw new SocketTimeoutException("timed out waiting for rpc response");
       } else {
         return call.value;
       }

+ 14 - 2
src/java/org/apache/hadoop/mapred/JobClient.java

@@ -18,7 +18,7 @@ package org.apache.hadoop.mapred;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.ipc.*;
 import org.apache.hadoop.conf.*;
-import org.apache.hadoop.util.LogFormatter;
+import org.apache.hadoop.util.*;
 
 import java.io.*;
 import java.net.*;
@@ -302,6 +302,8 @@ public class JobClient implements MRConstants {
       boolean error = true;
       RunningJob running = null;
       String lastReport = null;
+      final int MAX_RETRIES = 5;
+      int retries = MAX_RETRIES;
       try {
         running = jc.submitJob(job);
         String jobId = running.getJobID();
@@ -310,7 +312,17 @@ public class JobClient implements MRConstants {
           try {
             Thread.sleep(1000);
           } catch (InterruptedException e) {}
-          running = jc.getJob(jobId);
+          try {
+            running = jc.getJob(jobId);
+            retries = MAX_RETRIES;
+          } catch (IOException ie) {
+            if (--retries == 0) {
+              LOG.info("Final attempt failed, killing job.");
+              throw ie;
+            }
+            LOG.info("Communication problem with server: " +
+                     StringUtils.stringifyException(ie));
+          }
           String report = null;
           report = " map "+Math.round(running.mapProgress()*100)+"%  reduce " + Math.round(running.reduceProgress()*100)+"%";
           if (!report.equals(lastReport)) {