Browse Source

HDFS-119. Fix a bug in logSync() that causes the NameNode to block forever. Konstantin Shvachko, via suresh. Merge to 1.0.3.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1.0@1330029 13f79535-47bb-0310-9956-ffa450edef68
Matthew Foley 13 years ago
parent
commit
d18016069d
2 changed files with 74 additions and 55 deletions
  1. 2 0
      CHANGES.txt
  2. 72 55
      src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java

+ 2 - 0
CHANGES.txt

@@ -39,6 +39,8 @@ Release 1.0.3 - unreleased
     MAPREDUCE-4154. streaming MR job succeeds even if the streaming command 
     fails. (Devaraj Das via tgraves)
 
+    HDFS-119. Fix a bug in logSync(), which causes NameNode block forever. (shv)
+
 Release 1.0.2 - 2012.03.24
 
   NEW FEATURES

+ 72 - 55
src/hdfs/org/apache/hadoop/hdfs/server/namenode/FSEditLog.java

@@ -957,69 +957,86 @@ public class FSEditLog {
     // Fetch the transactionId of this thread. 
     long mytxid = myTransactionId.get().txid;
 
-    final int numEditStreams;
-    synchronized (this) {
-      numEditStreams = editStreams.size();
-      assert numEditStreams > 0 : "no editlog streams";
-      printStatistics(false);
-
-      // if somebody is already syncing, then wait
-      while (mytxid > synctxid && isSyncRunning) {
-        try {
-          wait(1000);
-        } catch (InterruptedException ie) { 
+    ArrayList<EditLogOutputStream> streams = new ArrayList<EditLogOutputStream>();
+    boolean sync = false;
+    try {
+      synchronized (this) {
+        printStatistics(false);
+
+        // if somebody is already syncing, then wait
+        while (mytxid > synctxid && isSyncRunning) {
+          try {
+            wait(1000);
+          } catch (InterruptedException ie) { 
+          }
+        }
+
+        //
+        // If this transaction was already flushed, then nothing to do
+        //
+        if (mytxid <= synctxid) {
+          numTransactionsBatchedInSync++;
+          if (metrics != null) // Metrics is non-null only when used inside name node
+            metrics.incrTransactionsBatchedInSync();
+          return;
         }
-      }
 
-      //
-      // If this transaction was already flushed, then nothing to do
-      //
-      if (mytxid <= synctxid) {
-        numTransactionsBatchedInSync++;
-        if (metrics != null) // Metrics is non-null only when used inside name node
-          metrics.incrTransactionsBatchedInSync();
-        return;
+        // now, this thread will do the sync
+        syncStart = txid;
+        isSyncRunning = true;
+        sync = true;
+
+        // swap buffers
+        assert editStreams.size() > 0 : "no editlog streams";
+        for(EditLogOutputStream eStream : editStreams) {
+          try {
+            eStream.setReadyToFlush();
+            streams.add(eStream);
+          } catch (IOException ie) {
+            FSNamesystem.LOG.error("Unable to get ready to flush.", ie);
+            //
+            // remember the streams that encountered an error.
+            //
+            if (errorStreams == null) {
+              errorStreams = new ArrayList<EditLogOutputStream>(1);
+            }
+            errorStreams.add(eStream);
+          }
+        }
       }
-   
-      // now, this thread will do the sync
-      syncStart = txid;
-      isSyncRunning = true;   
-
-      // swap buffers
-      for (int idx = 0; idx < numEditStreams; idx++) {
-        editStreams.get(idx).setReadyToFlush();
+
+      // do the sync
+      long start = FSNamesystem.now();
+      for (EditLogOutputStream eStream : streams) {
+        try {
+          eStream.flush();
+        } catch (IOException ie) {
+          FSNamesystem.LOG.error("Unable to sync edit log.", ie);
+          //
+          // remember the streams that encountered an error.
+          //
+          if (errorStreams == null) {
+            errorStreams = new ArrayList<EditLogOutputStream>(1);
+          }
+          errorStreams.add(eStream);
+        }
       }
-    }
+      long elapsed = FSNamesystem.now() - start;
+      removeEditsStreamsAndStorageDirs(errorStreams);
+      exitIfNoStreams();
 
-    // do the sync
-    long start = FSNamesystem.now();
-    for (int idx = 0; idx < numEditStreams; idx++) {
-      EditLogOutputStream eStream = editStreams.get(idx);
-      try {
-        eStream.flush();
-      } catch (IOException ioe) {
-        //
-        // remember the streams that encountered an error.
-        //
-        if (errorStreams == null) {
-          errorStreams = new ArrayList<EditLogOutputStream>(1);
+      if (metrics != null) // Metrics is non-null only when used inside name node
+        metrics.addSync(elapsed);
+
+    } finally {
+      synchronized (this) {
+        if(sync) {
+          synctxid = syncStart;
+          isSyncRunning = false;
         }
-        errorStreams.add(eStream);
-        FSNamesystem.LOG.error("Unable to sync "+eStream.getName());
+        this.notifyAll();
       }
     }
-    long elapsed = FSNamesystem.now() - start;
-
-    synchronized (this) {
-       removeEditsStreamsAndStorageDirs(errorStreams);
-       exitIfNoStreams();
-       synctxid = syncStart;
-       isSyncRunning = false;
-       this.notifyAll();
-    }
-
-    if (metrics != null) // Metrics is non-null only when used inside name node
-      metrics.addSync(elapsed);
   }
 
   //