Bladeren bron

merge -r 453780:462941, from trunk to branch-0.7, preparing for 0.7.1 release

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/branches/branch-0.7@462942 13f79535-47bb-0310-9956-ffa450edef68
Doug Cutting 18 jaren geleden
bovenliggende
commit
0945dd2ff1

+ 19 - 0
CHANGES.txt

@@ -1,6 +1,25 @@
 Hadoop Change Log
 
 
+Release 0.7.1 - 2006-10-11
+
+ 1. HADOOP-593.  Fix a NullPointerException in the JobTracker.
+    (omalley via cutting)
+
+ 2. HADOOP-592.  Fix a NullPointerException in the IPC Server.  Also
+    consistently log when stale calls are discarded.  (omalley via cutting)
+
+ 3. HADOOP-594.  Increase the DFS safe-mode threshold from .95 to
+    .999, so that nearly all blocks must be reported before filesystem
+    modifications are permitted.  (Konstantin Shvachko via cutting)
+
+ 4. HADOOP-598.  Fix tasks to retry when reporting completion, so that
+    a single RPC timeout won't fail a task.  (omalley via cutting)
+
+ 5. HADOOP-597.  Fix TaskTracker to not discard map outputs for errors
+    in transmitting them to reduce nodes.  (omalley via cutting)
+
+
 Release 0.7.0 - 2006-10-06
 
  1. HADOOP-243.  Fix rounding in the display of task and job progress

+ 1 - 1
build.xml

@@ -9,7 +9,7 @@
  
   <property name="Name" value="Hadoop"/>
   <property name="name" value="hadoop"/>
-  <property name="version" value="0.7.1-dev"/>
+  <property name="version" value="0.7.2-dev"/>
   <property name="final.name" value="${name}-${version}"/>
   <property name="year" value="2006"/>
   <property name="libhdfs.version" value="1"/>

+ 1 - 1
conf/hadoop-default.xml

@@ -249,7 +249,7 @@ creations/deletions), or "all".</description>
 
 <property>
   <name>dfs.safemode.threshold.pct</name>
-  <value>0.95f</value>
+  <value>0.999f</value>
   <description>
   	Specifies the percentage of blocks that should satisfy 
   	the minimal replication requirement defined by dfs.replication.min.

+ 25 - 16
site/index.html

@@ -122,6 +122,9 @@ document.write("<text>Last Published:</text> " + document.lastModified);
 <a href="#News">News</a>
 <ul class="minitoc">
 <li>
+<a href="#11+October%2C+2006%3A+release+0.7.1+available">11 October, 2006: release 0.7.1 available</a>
+</li>
+<li>
 <a href="#6+October%2C+2006%3A+release+0.7.0+available">6 October, 2006: release 0.7.0 available</a>
 </li>
 <li>
@@ -178,73 +181,79 @@ document.write("<text>Last Published:</text> " + document.lastModified);
 <a name="N1000C"></a><a name="News"></a>
 <h2 class="h3">News</h2>
 <div class="section">
-<a name="N10012"></a><a name="6+October%2C+2006%3A+release+0.7.0+available"></a>
+<a name="N10012"></a><a name="11+October%2C+2006%3A+release+0.7.1+available"></a>
+<h3 class="h4">11 October, 2006: release 0.7.1 available</h3>
+<p>This fixes critical bugs in 0.7.0.  For details see the <a href="http://tinyurl.com/p7qod">release notes</a>. The release can
+      be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
+      nearby mirror</a>.
+      </p>
+<a name="N10024"></a><a name="6+October%2C+2006%3A+release+0.7.0+available"></a>
 <h3 class="h4">6 October, 2006: release 0.7.0 available</h3>
 <p>For details see the <a href="http://tinyurl.com/kvd9m">release notes</a>. The release can
       be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
       nearby mirror</a>.
       </p>
-<a name="N10024"></a><a name="18+September%2C+2006%3A+release+0.6.2+available"></a>
+<a name="N10036"></a><a name="18+September%2C+2006%3A+release+0.6.2+available"></a>
 <h3 class="h4">18 September, 2006: release 0.6.2 available</h3>
 <p>This fixes critical bugs in 0.6.1.  For details see the <a href="http://tinyurl.com/gyb56">release notes</a>. The release can
       be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
       nearby mirror</a>.
       </p>
-<a name="N10036"></a><a name="13+September%2C+2006%3A+release+0.6.1+available"></a>
+<a name="N10048"></a><a name="13+September%2C+2006%3A+release+0.6.1+available"></a>
 <h3 class="h4">13 September, 2006: release 0.6.1 available</h3>
 <p>This fixes critical bugs in 0.6.0.  For details see the <a href="http://tinyurl.com/lykp4">release notes</a>. The release can
       be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
       nearby mirror</a>.
       </p>
-<a name="N10048"></a><a name="8+September%2C+2006%3A+release+0.6.0+available"></a>
+<a name="N1005A"></a><a name="8+September%2C+2006%3A+release+0.6.0+available"></a>
 <h3 class="h4">8 September, 2006: release 0.6.0 available</h3>
 <p>For details see the <a href="http://tinyurl.com/r3zoj">release notes</a>. The release can
       be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
       nearby mirror</a>.
       </p>
-<a name="N1005A"></a><a name="4+August%2C+2006%3A+release+0.5.0+available"></a>
+<a name="N1006C"></a><a name="4+August%2C+2006%3A+release+0.5.0+available"></a>
 <h3 class="h4">4 August, 2006: release 0.5.0 available</h3>
 <p>For details see the <a href="http://tinyurl.com/pnml2">release notes</a>. The release can
       be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
       nearby mirror</a>.
       </p>
-<a name="N1006C"></a><a name="28+June%2C+2006%3A+release+0.4.0+available"></a>
+<a name="N1007E"></a><a name="28+June%2C+2006%3A+release+0.4.0+available"></a>
 <h3 class="h4">28 June, 2006: release 0.4.0 available</h3>
 <p>For details see the <a href="http://tinyurl.com/o35b6">change log</a>. The release can
       be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
       nearby mirror</a>.
       </p>
-<a name="N1007E"></a><a name="9+June%2C+2006%3A+release+0.3.2+available"></a>
+<a name="N10090"></a><a name="9+June%2C+2006%3A+release+0.3.2+available"></a>
 <h3 class="h4">9 June, 2006: release 0.3.2 available</h3>
 <p>This is a bugfix release.  For details see the <a href="http://tinyurl.com/k9g5c">change log</a>. The release can
       be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
       nearby mirror</a>.
       </p>
-<a name="N10090"></a><a name="8+June%2C+2006%3A+FAQ+added+to+Wiki"></a>
+<a name="N100A2"></a><a name="8+June%2C+2006%3A+FAQ+added+to+Wiki"></a>
 <h3 class="h4">8 June, 2006: FAQ added to Wiki</h3>
 <p>Hadoop now has a <a href="http://wiki.apache.org/lucene-hadoop/FAQ">FAQ</a>.  Please
       help make this more complete!
       </p>
-<a name="N1009E"></a><a name="5+June%2C+2006%3A+release+0.3.1+available"></a>
+<a name="N100B0"></a><a name="5+June%2C+2006%3A+release+0.3.1+available"></a>
 <h3 class="h4">5 June, 2006: release 0.3.1 available</h3>
 <p>This is a bugfix release.  For details see the <a href="http://tinyurl.com/l6on4">change log</a>. The release can
       be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
       nearby mirror</a>.
       </p>
-<a name="N100B0"></a><a name="2+June%2C+2006%3A+release+0.3.0+available"></a>
+<a name="N100C2"></a><a name="2+June%2C+2006%3A+release+0.3.0+available"></a>
 <h3 class="h4">2 June, 2006: release 0.3.0 available</h3>
 <p>This includes many fixes, improving performance, scalability
       and reliability and adding new features.  For details see the <a href="http://tinyurl.com/rq3f7">change log</a>. The release can
       be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
       nearby mirror</a>.
       </p>
-<a name="N100C2"></a><a name="12+May%2C+2006%3A+release+0.2.1+available"></a>
+<a name="N100D4"></a><a name="12+May%2C+2006%3A+release+0.2.1+available"></a>
 <h3 class="h4">12 May, 2006: release 0.2.1 available</h3>
 <p>This fixes a few bugs in release 0.2.0, listed in the <a href="http://tinyurl.com/rnnvz">change log</a>. The
       release can be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
       nearby mirror</a>.
       </p>
-<a name="N100D4"></a><a name="5+May%2C+2006%3A+release+0.2.0+available"></a>
+<a name="N100E6"></a><a name="5+May%2C+2006%3A+release+0.2.0+available"></a>
 <h3 class="h4">5 May, 2006: release 0.2.0 available</h3>
 <p>We are now aiming for monthly releases.  There have been many
       bug fixes and improvements in the past month.  MapReduce and DFS
@@ -253,24 +262,24 @@ document.write("<text>Last Published:</text> " + document.lastModified);
       details. The release can be obtained from <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
       nearby mirror</a>.
       </p>
-<a name="N100E6"></a><a name="2+April%2C+2006%3A+release+0.1.0+available"></a>
+<a name="N100F8"></a><a name="2+April%2C+2006%3A+release+0.1.0+available"></a>
 <h3 class="h4">2 April, 2006: release 0.1.0 available</h3>
 <p>This is the first Hadoop release.  The release is available
       <a href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/">
       here</a>.</p>
-<a name="N100F4"></a><a name="6+February%2C+2006%3A+nightly+builds"></a>
+<a name="N10106"></a><a name="6+February%2C+2006%3A+nightly+builds"></a>
 <h3 class="h4">6 February, 2006: nightly builds</h3>
 <p>Hadoop now has nightly builds.  This automatically creates a
       <a href="http://cvs.apache.org/dist/lucene/hadoop/nightly/">downloadable version of Hadoop every
       night</a>.  All unit tests must pass, or a message is sent to
       the developers mailing list and no new version is created.  This
       also updates the <a href="docs/api/">javadoc</a>.</p>
-<a name="N10106"></a><a name="3+February%2C+2006%3A+Hadoop+code+moved+out+of+Nutch"></a>
+<a name="N10118"></a><a name="3+February%2C+2006%3A+Hadoop+code+moved+out+of+Nutch"></a>
 <h3 class="h4">3 February, 2006: Hadoop code moved out of Nutch</h3>
 <p>The Hadoop code has now been moved into its own Subversion
       tree, renamed into packages under <span class="codefrag">org.apache.hadoop</span>.
       All unit tests pass, but little else has yet been tested.</p>
-<a name="N10113"></a><a name="30+March%2C+2006%3A+Hadoop+project+approved"></a>
+<a name="N10125"></a><a name="30+March%2C+2006%3A+Hadoop+project+approved"></a>
 <h3 class="h4">30 March, 2006: Hadoop project approved</h3>
 <p>The Lucene PMC has elected to split the Nutch MapReduce and
       distributed filesytem code into a new project named Hadoop.</p>

File diff suppressed because it is too large
+ 28 - 17
site/index.pdf


+ 18 - 5
src/java/org/apache/hadoop/ipc/Server.java

@@ -30,6 +30,7 @@ import java.nio.channels.Selector;
 import java.nio.channels.ServerSocketChannel;
 import java.nio.channels.SocketChannel;
 
+import java.net.InetAddress;
 import java.net.InetSocketAddress;
 import java.net.Socket;
 
@@ -351,6 +352,10 @@ public abstract class Server {
     private long lastContact;
     private int dataLength;
     private Socket socket;
+    // Cache the remote host & port info so that even if the socket is 
+    // disconnected, we can say where it used to connect to.
+    private String hostAddress;
+    private int remotePort;
 
     public Connection(SelectionKey key, SocketChannel channel, 
     long lastContact) {
@@ -363,14 +368,21 @@ public abstract class Server {
       this.out = new DataOutputStream
         (new BufferedOutputStream(
          this.channelOut = new SocketChannelOutputStream(channel, 4096)));
+      InetAddress addr = socket.getInetAddress();
+      if (addr == null) {
+        this.hostAddress = "*Unknown*";
+      } else {
+        this.hostAddress = addr.getHostAddress();
+      }
+      this.remotePort = socket.getPort();
     }   
 
     public String toString() {
-      return getHostAddress() + ":" + socket.getPort(); 
+      return getHostAddress() + ":" + remotePort; 
     }
     
     public String getHostAddress() {
-      return socket.getInetAddress().getHostAddress();
+      return hostAddress;
     }
 
     public void setLastContact(long lastContact) {
@@ -431,7 +443,8 @@ public abstract class Server {
       Call call = new Call(id, param, this);
       synchronized (callQueue) {
         if (callQueue.size() >= maxQueueSize) {
-          callQueue.removeFirst();
+          Call oldCall = (Call) callQueue.removeFirst();
+          LOG.warn("Call queue overflow discarding oldest call " + oldCall);
         }
         callQueue.addLast(call);              // queue the call
         callQueue.notify();                   // wake up a waiting handler
@@ -484,7 +497,7 @@ public abstract class Server {
           // throw the message away if it is too old
           if (System.currentTimeMillis() - call.receivedTime > 
               maxCallStartAge) {
-            LOG.info("Call " + call.toString() + 
+            LOG.warn("Call " + call.toString() + 
                      " discarded for being too old (" +
                      (System.currentTimeMillis() - call.receivedTime) + ")");
             continue;
@@ -492,7 +505,7 @@ public abstract class Server {
           
           if (LOG.isDebugEnabled())
             LOG.debug(getName() + ": has #" + call.id + " from " +
-                     call.connection.socket.getInetAddress().getHostAddress());
+                     call.connection);
           
           String errorClass = null;
           String error = null;

+ 11 - 14
src/java/org/apache/hadoop/mapred/JobInProgress.java

@@ -20,7 +20,6 @@ import org.apache.commons.logging.*;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.conf.*;
 import org.apache.hadoop.mapred.JobTracker.JobTrackerMetrics;
-import org.apache.hadoop.mapred.JobHistory.Keys ; 
 import org.apache.hadoop.mapred.JobHistory.Values ; 
 import java.io.*;
 import java.net.*;
@@ -33,7 +32,7 @@ import java.util.*;
 // doing bookkeeping of its Tasks.
 ///////////////////////////////////////////////////////
 class JobInProgress {
-    public static final Log LOG = LogFactory.getLog("org.apache.hadoop.mapred.JobInProgress");
+    private static final Log LOG = LogFactory.getLog("org.apache.hadoop.mapred.JobInProgress");
 
     JobProfile profile;
     JobStatus status;
@@ -473,25 +472,24 @@ class JobInProgress {
                    " successfully.");          
 
           String taskTrackerName = status.getTaskTracker();
-          TaskTrackerStatus taskTracker = this.jobtracker.getTaskTracker(taskTrackerName);
           
           if(status.getIsMap()){
             JobHistory.MapAttempt.logStarted(profile.getJobId(), 
                 tip.getTIPId(), status.getTaskId(), status.getStartTime(), 
-                taskTracker.getHost()); 
+                taskTrackerName); 
             JobHistory.MapAttempt.logFinished(profile.getJobId(), 
                 tip.getTIPId(), status.getTaskId(), status.getFinishTime(), 
-                taskTracker.getHost()); 
+                taskTrackerName); 
             JobHistory.Task.logFinished(profile.getJobId(), tip.getTIPId(), 
                 Values.MAP.name(), status.getFinishTime()); 
           }else{
               JobHistory.ReduceAttempt.logStarted(profile.getJobId(), 
                   tip.getTIPId(), status.getTaskId(), status.getStartTime(), 
-                  taskTracker.getHost()); 
+                  taskTrackerName); 
               JobHistory.ReduceAttempt.logFinished(profile.getJobId(), 
                   tip.getTIPId(), status.getTaskId(), status.getShuffleFinishTime(),
                   status.getSortFinishTime(), status.getFinishTime(), 
-                  taskTracker.getHost()); 
+                  taskTrackerName); 
               JobHistory.Task.logFinished(profile.getJobId(), tip.getTIPId(), 
                   Values.REDUCE.name(), status.getFinishTime()); 
           }
@@ -609,21 +607,20 @@ class JobInProgress {
         
         // update job history
         String taskTrackerName = status.getTaskTracker();
-        TaskTrackerStatus taskTracker = this.jobtracker.getTaskTracker(taskTrackerName);
-        if(status.getIsMap()){
+        if (status.getIsMap()) {
           JobHistory.MapAttempt.logStarted(profile.getJobId(), 
               tip.getTIPId(), status.getTaskId(), status.getStartTime(), 
-              taskTracker.getHost()); 
+              taskTrackerName); 
           JobHistory.MapAttempt.logFailed(profile.getJobId(), 
               tip.getTIPId(), status.getTaskId(), System.currentTimeMillis(),
-              taskTracker.getHost(), status.getDiagnosticInfo()); 
-        }else{
+              taskTrackerName, status.getDiagnosticInfo()); 
+        } else {
           JobHistory.ReduceAttempt.logStarted(profile.getJobId(), 
               tip.getTIPId(), status.getTaskId(), status.getStartTime(), 
-              taskTracker.getHost()); 
+              taskTrackerName); 
           JobHistory.ReduceAttempt.logFailed(profile.getJobId(), 
               tip.getTIPId(), status.getTaskId(), System.currentTimeMillis(),
-              taskTracker.getHost(), status.getDiagnosticInfo()); 
+              taskTrackerName, status.getDiagnosticInfo()); 
         }
         
         // After this, try to assign tasks with the one after this, so that

+ 15 - 8
src/java/org/apache/hadoop/mapred/JobTracker.java

@@ -21,6 +21,7 @@ import org.apache.commons.logging.*;
 import org.apache.hadoop.fs.*;
 import org.apache.hadoop.ipc.*;
 import org.apache.hadoop.conf.*;
+import org.apache.hadoop.util.StringUtils;
 
 import java.io.*;
 import java.net.*;
@@ -114,8 +115,8 @@ public class JobTracker implements MRConstants, InterTrackerProtocol, JobSubmiss
       private Map launchingTasks = new LinkedHashMap();
       
       public void run() {
-        try {
-          while (shouldRun) {
+        while (shouldRun) {
+          try {
             // Every 3 minutes check for any tasks that are overdue
             Thread.sleep(TASKTRACKER_EXPIRY_INTERVAL/3);
             long now = System.currentTimeMillis();
@@ -151,9 +152,13 @@ public class JobTracker implements MRConstants, InterTrackerProtocol, JobSubmiss
                 }
               }
             }
+          } catch (InterruptedException ie) {
+            // all done
+            return;
+          } catch (Exception e) {
+            LOG.error("Expire Launching Task Thread got exception: " +
+                      StringUtils.stringifyException(e));
           }
-        } catch (InterruptedException ie) {
-          // all done
         }
       }
       
@@ -188,15 +193,13 @@ public class JobTracker implements MRConstants, InterTrackerProtocol, JobSubmiss
          */
         public void run() {
             while (shouldRun) {
+              try {
                 //
                 // Thread runs periodically to check whether trackers should be expired.
                 // The sleep interval must be no more than half the maximum expiry time
                 // for a task tracker.
                 //
-                try {
-                    Thread.sleep(TASKTRACKER_EXPIRY_INTERVAL / 3);
-                } catch (InterruptedException ie) {
-                }
+                Thread.sleep(TASKTRACKER_EXPIRY_INTERVAL / 3);
 
                 //
                 // Loop through all expired items in the queue
@@ -232,6 +235,10 @@ public class JobTracker implements MRConstants, InterTrackerProtocol, JobSubmiss
                         }
                     }
                 }
+              } catch (Exception t) {
+                LOG.error("Tracker Expiry Thread got exception: " +
+                          StringUtils.stringifyException(t));
+              }
             }
         }
         

+ 21 - 5
src/java/org/apache/hadoop/mapred/Task.java

@@ -176,10 +176,26 @@ abstract class Task implements Writable, Configurable {
     }
   }
 
-  public void done(TaskUmbilicalProtocol umbilical)
-    throws IOException {
-    umbilical.progress(getTaskId(),               // send a final status report
-                       taskProgress.get(), taskProgress.toString(), phase);
-    umbilical.done(getTaskId());
+  public void done(TaskUmbilicalProtocol umbilical) throws IOException {
+    int retries = 10;
+    boolean needProgress = true;
+    while (true) {
+      try {
+        if (needProgress) {
+          // send a final status report
+          umbilical.progress(getTaskId(), taskProgress.get(), 
+                             taskProgress.toString(), phase);
+          needProgress = false;
+        }
+        umbilical.done(getTaskId());
+        return;
+      } catch (IOException ie) {
+        LOG.warn("Failure signalling completion: " + 
+                 StringUtils.stringifyException(ie));
+        if (--retries == 0) {
+          throw ie;
+        }
+      }
+    }
   }
 }

+ 15 - 6
src/java/org/apache/hadoop/mapred/TaskTracker.java

@@ -1365,30 +1365,39 @@ public class TaskTracker
         Path filename = conf.getLocalPath(mapId+"/part-"+reduce+".out");
         response.setContentLength((int) fileSys.getLength(filename));
         InputStream inStream = null;
+        // true iff IOException was caused by attempt to access input
+        boolean isInputException = true;
         try {
           inStream = fileSys.open(filename);
           try {
             int len = inStream.read(buffer);
             while (len > 0) {
-              outStream.write(buffer, 0, len);
+              try {
+                outStream.write(buffer, 0, len);
+              } catch (IOException ie) {
+                isInputException = false;
+                throw ie;
+              }
               len = inStream.read(buffer);
             }
           } finally {
             inStream.close();
-            outStream.close();
           }
         } catch (IOException ie) {
           TaskTracker tracker = 
             (TaskTracker) context.getAttribute("task.tracker");
           Log log = (Log) context.getAttribute("log");
-          String errorMsg = "getMapOutput(" + mapId + "," + reduceId + 
-          ") failed :\n"+
-          StringUtils.stringifyException(ie);
+          String errorMsg = ("getMapOutput(" + mapId + "," + reduceId + 
+                             ") failed :\n"+
+                             StringUtils.stringifyException(ie));
           log.warn(errorMsg);
-          tracker.mapOutputLost(mapId, errorMsg);
+          if (isInputException) {
+            tracker.mapOutputLost(mapId, errorMsg);
+          }
           response.sendError(HttpServletResponse.SC_GONE, errorMsg);
           throw ie;
         } 
+        outStream.close();
       }
     }
 }

+ 1 - 1
src/java/overview.html

@@ -16,7 +16,7 @@ href="org/apache/hadoop/mapred/package-summary.html">org.apache.hadoop.mapred
 
 <ol>
   
-<li>Java 1.4.x, preferably from <a
+<li>Java 1.5.x, preferably from <a
  href="http://java.sun.com/j2se/downloads.html">Sun</a> Set
  <tt>JAVA_HOME</tt> to the root of your Java installation.</li>
   

+ 9 - 0
src/site/src/documentation/content/xdocs/index.xml

@@ -14,6 +14,15 @@
     <section>
       <title>News</title>
 
+      <section>
+      <title>11 October, 2006: release 0.7.1 available</title>
+      <p>This fixes critical bugs in 0.7.0.  For details see the <a
+      href="http://tinyurl.com/p7qod">release notes</a>. The release can
+      be obtained from <a
+      href="http://www.apache.org/dyn/closer.cgi/lucene/hadoop/"> a
+      nearby mirror</a>.
+      </p> </section>
+
       <section>
       <title>6 October, 2006: release 0.7.0 available</title>
       <p>For details see the <a

Some files were not shown because too many files changed in this diff