Browse Source

HADOOP-9220. Unnecessary transition to standby in ActiveStandbyElector. Contributed by Tom White and Todd Lipcon.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1482401 13f79535-47bb-0310-9956-ffa450edef68
Todd Lipcon 12 years ago
parent
commit
d5a6e764dc

+ 3 - 0
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -720,6 +720,9 @@ Release 2.0.5-beta - UNRELEASED
     HADOOP-9307. BufferedFSInputStream.read returns wrong results
     after certain seeks. (todd)
 
+    HADOOP-9220. Unnecessary transition to standby in ActiveStandbyElector.
+    (tom and todd via todd)
+
 Release 2.0.4-alpha - 2013-04-25 
 
   INCOMPATIBLE CHANGES

+ 11 - 9
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/ha/ActiveStandbyElector.java

@@ -159,6 +159,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
   private int createRetryCount = 0;
   private int statRetryCount = 0;
   private ZooKeeper zkClient;
+  private WatcherWithClientRef watcher;
   private ConnectionState zkConnectionState = ConnectionState.TERMINATED;
 
   private final ActiveStandbyElectorCallback appClient;
@@ -246,6 +247,11 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
     if (data == null) {
       throw new HadoopIllegalArgumentException("data cannot be null");
     }
+    
+    if (wantToBeInElection) {
+      LOG.info("Already in election. Not re-connecting.");
+      return;
+    }
 
     appData = new byte[data.length];
     System.arraycopy(data, 0, appData, 0, data.length);
@@ -615,7 +621,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
     // watcher after constructing ZooKeeper, we may miss that event. Instead,
     // we construct the watcher first, and have it block any events it receives
     // before we can set its ZooKeeper reference.
-    WatcherWithClientRef watcher = new WatcherWithClientRef();
+    watcher = new WatcherWithClientRef();
     ZooKeeper zk = new ZooKeeper(zkHostPort, zkSessionTimeout, watcher);
     watcher.setZooKeeperRef(zk);
 
@@ -753,6 +759,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
             e);
       }
       zkClient = null;
+      watcher = null;
     }
     zkClient = getNewZooKeeper();
     LOG.debug("Created new connection for " + this);
@@ -765,12 +772,14 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
     LOG.debug("Terminating ZK connection for " + this);
     ZooKeeper tempZk = zkClient;
     zkClient = null;
+    watcher = null;
     try {
       tempZk.close();
     } catch(InterruptedException e) {
       LOG.warn(e);
     }
     zkConnectionState = ConnectionState.TERMINATED;
+    wantToBeInElection = false;
   }
 
   private void reset() {
@@ -914,7 +923,7 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
 
   private void monitorLockNodeAsync() {
     zkClient.exists(zkLockFilePath, 
-        new WatcherWithClientRef(zkClient), this,
+        watcher, this,
         zkClient);
   }
 
@@ -1015,13 +1024,6 @@ public class ActiveStandbyElector implements StatCallback, StringCallback {
      * Latch used to wait until the reference to ZooKeeper is set.
      */
     private CountDownLatch hasSetZooKeeper = new CountDownLatch(1);
-    
-    private WatcherWithClientRef() {
-    }
-
-    private WatcherWithClientRef(ZooKeeper zk) {
-      setZooKeeperRef(zk);
-    }
 
     /**
      * Waits for the next event from ZooKeeper to arrive.

+ 2 - 0
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/DummyHAService.java

@@ -49,6 +49,7 @@ class DummyHAService extends HAServiceTarget {
   
   DummySharedResource sharedResource;
   public int fenceCount = 0;
+  public int activeTransitionCount = 0;
   
   static ArrayList<DummyHAService> instances = Lists.newArrayList();
   int index;
@@ -139,6 +140,7 @@ class DummyHAService extends HAServiceTarget {
     @Override
     public void transitionToActive(StateChangeRequestInfo req) throws ServiceFailedException,
         AccessControlException, IOException {
+      activeTransitionCount++;
       checkUnreachable();
       if (failToBecomeActive) {
         throw new ServiceFailedException("injected failure");

+ 7 - 2
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/ha/TestZKFailoverController.java

@@ -422,7 +422,7 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
     }
   }
   
-  @Test(timeout=15000)
+  @Test(timeout=25000)
   public void testGracefulFailover() throws Exception {
     try {
       cluster.start();
@@ -430,11 +430,16 @@ public class TestZKFailoverController extends ClientBaseWithFixes {
       cluster.waitForActiveLockHolder(0);
       cluster.getService(1).getZKFCProxy(conf, 5000).gracefulFailover();
       cluster.waitForActiveLockHolder(1);
+
       cluster.getService(0).getZKFCProxy(conf, 5000).gracefulFailover();
       cluster.waitForActiveLockHolder(0);
-      
+
+      Thread.sleep(10000); // allow to quiesce
+
       assertEquals(0, cluster.getService(0).fenceCount);
       assertEquals(0, cluster.getService(1).fenceCount);
+      assertEquals(2, cluster.getService(0).activeTransitionCount);
+      assertEquals(1, cluster.getService(1).activeTransitionCount);
     } finally {
       cluster.stop();
     }