Browse Source

HADOOP-1960 If a region server cannot talk to the master before its lease times out, it should shut itself down

git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk@581396 13f79535-47bb-0310-9956-ffa450edef68
Jim Kellerman 18 years ago
parent
commit
c5448b1eb7

+ 2 - 0
src/contrib/hbase/CHANGES.txt

@@ -85,6 +85,8 @@ Trunk (unreleased changes)
                 (Edward Yoon via Stack)
     HADOOP-1928 Have master pass the regionserver the filesystem to use
     HADOOP-1789 Output formatting
+    HADOOP-1960 If a region server cannot talk to the master before its lease
+                times out, it should shut itself down
 
 
 Below are the list of changes before 2007-08-18

+ 19 - 5
src/contrib/hbase/src/java/org/apache/hadoop/hbase/HRegionServer.java

@@ -99,6 +99,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
   int numRetries;
   protected final int threadWakeFrequency;
   private final int msgInterval;
+  private final int serverLeaseTimeout;
 
   // Remote HMaster
   private final HMasterRegionInterface hbaseMaster;
@@ -384,6 +385,8 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
     this.numRetries =  conf.getInt("hbase.client.retries.number", 2);
     this.threadWakeFrequency = conf.getInt(THREAD_WAKE_FREQUENCY, 10 * 1000);
     this.msgInterval = conf.getInt("hbase.regionserver.msginterval", 3 * 1000);
+    this.serverLeaseTimeout =
+      conf.getInt("hbase.master.lease.period", 30 * 1000);
 
     // Cache flushing chore thread.
     this.cacheFlusherThread =
@@ -427,11 +430,20 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
     
     try {
       init(reportForDuty());
+      long lastMsg = 0;
       while(!stopRequested.get()) {
-        long lastMsg = 0;
         // Now ask master what it wants us to do and tell it what we have done
         for (int tries = 0; !stopRequested.get();) {
-          if ((System.currentTimeMillis() - lastMsg) >= msgInterval) {
+          long now = System.currentTimeMillis();
+          if (lastMsg != 0 && (now - lastMsg) >= serverLeaseTimeout) {
+            // It has been way too long since we last reported to the master.
+            // Commit suicide.
+            LOG.fatal("unable to report to master for " + (now - lastMsg) +
+                " milliseconds - aborting server");
+            abort();
+            break;
+          }
+          if ((now - lastMsg) >= msgInterval) {
             HMsg outboundArray[] = null;
             synchronized(outboundMsgs) {
               outboundArray =
@@ -514,9 +526,10 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
                 stop();
               }
             }
-          } // while (!stopRequested.get())
+          }
+          
           this.sleeper.sleep(lastMsg);
-        }
+        } // while (!stopRequested.get())
       }
     } catch (Throwable t) {
       LOG.fatal("Unhandled exception. Aborting...", t);
@@ -743,12 +756,13 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
       LOG.debug("Telling master we are up");
     }
     MapWritable result = null;
+    long lastMsg = 0;
     while(!stopRequested.get()) {
-      long lastMsg = 0;
       try {
         this.requestCount.set(0);
         this.serverInfo.setLoad(new HServerLoad(0, onlineRegions.size()));
         result = this.hbaseMaster.regionServerStartup(serverInfo);
+        lastMsg = System.currentTimeMillis();
         if (LOG.isDebugEnabled()) {
           LOG.debug("Done telling master we are up");
         }

+ 4 - 7
src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestRegionServerAbort.java

@@ -25,8 +25,6 @@ import java.util.TreeMap;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.io.Text;
-import org.apache.log4j.Level;
-import org.apache.log4j.Logger;
 
 /**
  * Tests region server failover when a region server exits.
@@ -41,9 +39,6 @@ public class TestRegionServerAbort extends HBaseClusterTestCase {
     conf.setInt("ipc.client.timeout", 5000);            // reduce client timeout
     conf.setInt("ipc.client.connect.max.retries", 5);   // and number of retries
     conf.setInt("hbase.client.retries.number", 5);      // reduce HBase retries
-    Logger.getRootLogger().setLevel(Level.WARN);
-    Logger.getLogger(this.getClass().getPackage().getName()).
-      setLevel(Level.DEBUG);
   }
   
   /**
@@ -92,8 +87,10 @@ public class TestRegionServerAbort extends HBaseClusterTestCase {
       }
       LOG.info("Success!");
     } finally {
-      LOG.info("Closing scanner " + scanner);
-      scanner.close();
+      if (scanner != null) {
+        LOG.info("Closing scanner " + scanner);
+        scanner.close();
+      }
     }
   }
 }

+ 18 - 13
src/contrib/hbase/src/test/org/apache/hadoop/hbase/TestSplit.java

@@ -153,21 +153,26 @@ public class TestSplit extends MultiRegionTable {
   /**
    * Test that a region is cleaned up after its daughter splits release all
    * references.
-   * @throws IOException
+   * @throws Exception
    */
-  public void testSplitRegionIsDeleted() throws IOException {
-    // Start up a hbase cluster
-    MiniHBaseCluster cluster = new MiniHBaseCluster(conf, 1, true);
+  public void testSplitRegionIsDeleted() throws Exception {
     try {
-      // Create a table.
-      HBaseAdmin admin = new HBaseAdmin(this.conf);
-      admin.createTable(createTableDescriptor(getName()));
-      // This builds a multi-region table by splitting.  It will assert
-      // the parent region gets cleaned-up.
-      MultiRegionTable.makeMultiRegionTable(conf, cluster,
-        this.localFs, getName(), COLFAMILY_NAME3);
-    } finally {
-      cluster.shutdown();
+      // Start up a hbase cluster
+      MiniHBaseCluster cluster = new MiniHBaseCluster(conf, 1, true);
+      try {
+        // Create a table.
+        HBaseAdmin admin = new HBaseAdmin(this.conf);
+        admin.createTable(createTableDescriptor(getName()));
+        // This builds a multi-region table by splitting.  It will assert
+        // the parent region gets cleaned-up.
+        MultiRegionTable.makeMultiRegionTable(conf, cluster,
+            this.localFs, getName(), COLFAMILY_NAME3);
+      } finally {
+        cluster.shutdown();
+      }
+    } catch (Exception e) {
+      LOG.error("test failed", e);
+      throw e;
     }
   }