Browse Source

ZOOKEEPER-710. permanent ZSESSIONMOVED error after client app reconnects to zookeeper cluster

git-svn-id: https://svn.apache.org/repos/asf/hadoop/zookeeper/trunk@925104 13f79535-47bb-0310-9956-ffa450edef68
Benjamin Reed 15 years ago
parent
commit
b1c614b2ca

+ 2 - 0
CHANGES.txt

@@ -306,6 +306,8 @@ BUGFIXES:
 
 
   ZOOKEEPER-436. Bookies should auto register to ZooKeeper (erwin tam & fpj via breed)
   ZOOKEEPER-436. Bookies should auto register to ZooKeeper (erwin tam & fpj via breed)
 
 
+  ZOOKEEPER-710. permanent ZSESSIONMOVED error after client app reconnects to zookeeper cluster (phunt via breed)
+
 IMPROVEMENTS:
 IMPROVEMENTS:
   ZOOKEEPER-473. cleanup junit tests to eliminate false positives due to
   ZOOKEEPER-473. cleanup junit tests to eliminate false positives due to
   "socket reuse" and failure to close client (phunt via mahadev)
   "socket reuse" and failure to close client (phunt via mahadev)

+ 5 - 4
src/docs/src/documentation/content/xdocs/zookeeperProgrammers.xml

@@ -533,12 +533,13 @@
       which has be reestablished on a different server. The normal cause of this error is
       which has be reestablished on a different server. The normal cause of this error is
       a client that sends a request to a server, but the network packet gets delayed, so
       a client that sends a request to a server, but the network packet gets delayed, so
       the client times out and connects to a new server. When the delayed packet arrives at
       the client times out and connects to a new server. When the delayed packet arrives at
-      the first server, the old server detects that the session has moved and sends back
-      the SESSION_MOVED error. Clients normally do not see this error since they do not read
+      the first server, the old server detects that the session has moved, and closes the
+      client connection. Clients normally do not see this error since they do not read
       from those old connections. (Old connections are usually closed.) One situation in which this
       from those old connections. (Old connections are usually closed.) One situation in which this
-      error can be seen is when two clients try to reestablish the same connection using
+      condition can be seen is when two clients try to reestablish the same connection using
       a saved session id and password. One of the clients will reestablish the connection
       a saved session id and password. One of the clients will reestablish the connection
-      and the second client will get the SessionMovedException.</para>
+      and the second client will be disconnected (causing the pair to attempt to re-establish
+      it's connection/session indefinitely).</para>
 
 
   </section>
   </section>
 
 

+ 12 - 0
src/java/main/org/apache/zookeeper/server/FinalRequestProcessor.java

@@ -27,6 +27,7 @@ import org.apache.log4j.Logger;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.ZooDefs;
 import org.apache.zookeeper.ZooDefs;
 import org.apache.zookeeper.KeeperException.Code;
 import org.apache.zookeeper.KeeperException.Code;
+import org.apache.zookeeper.KeeperException.SessionMovedException;
 import org.apache.zookeeper.ZooDefs.OpCode;
 import org.apache.zookeeper.ZooDefs.OpCode;
 import org.apache.zookeeper.data.ACL;
 import org.apache.zookeeper.data.ACL;
 import org.apache.zookeeper.data.Stat;
 import org.apache.zookeeper.data.Stat;
@@ -327,6 +328,17 @@ public class FinalRequestProcessor implements RequestProcessor {
                 break;
                 break;
             }
             }
             }
             }
+        } catch (SessionMovedException e) {
+            // session moved is a connection level error, we need to tear
+            // down the connection otw ZOOKEEPER-710 might happen
+            // ie client on slow follower starts to renew session, fails
+            // before this completes, then tries the fast follower (leader)
+            // and is successful, however the initial renew is then 
+            // successfully fwd/processed by the leader and as a result
+            // the client and leader disagree on where the client is most
+            // recently attached (and therefore invalid SESSION MOVED generated)
+            cnxn.sendCloseSession();
+            return;
         } catch (KeeperException e) {
         } catch (KeeperException e) {
             err = e.code();
             err = e.code();
         } catch (Exception e) {
         } catch (Exception e) {

+ 35 - 16
src/java/test/org/apache/zookeeper/test/QuorumTest.java

@@ -28,6 +28,7 @@ import org.apache.zookeeper.WatchedEvent;
 import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.ZooDefs;
 import org.apache.zookeeper.ZooDefs;
 import org.apache.zookeeper.ZooKeeper;
 import org.apache.zookeeper.ZooKeeper;
+import org.apache.zookeeper.Watcher.Event.KeeperState;
 import org.apache.zookeeper.ZooDefs.Ids;
 import org.apache.zookeeper.ZooDefs.Ids;
 import org.apache.zookeeper.data.Stat;
 import org.apache.zookeeper.data.Stat;
 import org.apache.zookeeper.server.quorum.Leader;
 import org.apache.zookeeper.server.quorum.Leader;
@@ -180,14 +181,14 @@ public class QuorumTest extends QuorumBase {
     @Test
     @Test
     public void testSessionMoved() throws IOException, InterruptedException, KeeperException {
     public void testSessionMoved() throws IOException, InterruptedException, KeeperException {
         String hostPorts[] = qb.hostPort.split(",");
         String hostPorts[] = qb.hostPort.split(",");
-        ZooKeeper zk = new DisconnectableZooKeeper(hostPorts[0], ClientBase.CONNECTION_TIMEOUT, new Watcher() {
+        DisconnectableZooKeeper zk = new DisconnectableZooKeeper(hostPorts[0], ClientBase.CONNECTION_TIMEOUT, new Watcher() {
             public void process(WatchedEvent event) {
             public void process(WatchedEvent event) {
             }});
             }});
         zk.create("/sessionMoveTest", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
         zk.create("/sessionMoveTest", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
         // we want to loop through the list twice
         // we want to loop through the list twice
         for(int i = 0; i < hostPorts.length*2; i++) {
         for(int i = 0; i < hostPorts.length*2; i++) {
             // This should stomp the zk handle
             // This should stomp the zk handle
-            ZooKeeper zknew = new DisconnectableZooKeeper(hostPorts[(i+1)%hostPorts.length], ClientBase.CONNECTION_TIMEOUT, 
+            DisconnectableZooKeeper zknew = new DisconnectableZooKeeper(hostPorts[(i+1)%hostPorts.length], ClientBase.CONNECTION_TIMEOUT, 
                     new Watcher() {public void process(WatchedEvent event) {
                     new Watcher() {public void process(WatchedEvent event) {
                     }},
                     }},
                     zk.getSessionId(),
                     zk.getSessionId(),
@@ -196,14 +197,23 @@ public class QuorumTest extends QuorumBase {
             try {
             try {
                 zk.setData("/", new byte[1], -1);
                 zk.setData("/", new byte[1], -1);
                 fail("Should have lost the connection");
                 fail("Should have lost the connection");
-            } catch(KeeperException.SessionMovedException e) {
+            } catch(KeeperException.ConnectionLossException e) {
             }
             }
-            //zk.close();
+            zk.disconnect(); // close w/o closing session
             zk = zknew;
             zk = zknew;
         }
         }
         zk.close();
         zk.close();
     }
     }
-    
+
+    private static class DiscoWatcher implements Watcher {
+        volatile boolean zkDisco = false;
+        public void process(WatchedEvent event) {
+            if (event.getState() == KeeperState.Disconnected) {
+                zkDisco = true;
+            }
+        }
+    }
+
     @Test
     @Test
     /**
     /**
      * Connect to two different servers with two different handles using the same session and
      * Connect to two different servers with two different handles using the same session and
@@ -211,32 +221,41 @@ public class QuorumTest extends QuorumBase {
      */
      */
     public void testSessionMove() throws IOException, InterruptedException, KeeperException {
     public void testSessionMove() throws IOException, InterruptedException, KeeperException {
         String hps[] = qb.hostPort.split(",");
         String hps[] = qb.hostPort.split(",");
-        ZooKeeper zk = new DisconnectableZooKeeper(hps[0], ClientBase.CONNECTION_TIMEOUT, new Watcher() {
-            public void process(WatchedEvent event) {
-        }});
+        DiscoWatcher oldWatcher = new DiscoWatcher();
+        ZooKeeper zk = new DisconnectableZooKeeper(hps[0],
+                ClientBase.CONNECTION_TIMEOUT, oldWatcher);
         zk.create("/t1", new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
         zk.create("/t1", new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
         // This should stomp the zk handle
         // This should stomp the zk handle
-        ZooKeeper zknew = new DisconnectableZooKeeper(hps[1], ClientBase.CONNECTION_TIMEOUT, new Watcher() {
-            public void process(WatchedEvent event) {
-            }}, zk.getSessionId(), zk.getSessionPasswd());
+        DiscoWatcher watcher = new DiscoWatcher();
+        ZooKeeper zknew = new DisconnectableZooKeeper(hps[1],
+                ClientBase.CONNECTION_TIMEOUT, watcher, zk.getSessionId(),
+                zk.getSessionPasswd());
         zknew.create("/t2", new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
         zknew.create("/t2", new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
         try {
         try {
             zk.create("/t3", new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
             zk.create("/t3", new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
             fail("Should have lost the connection");
             fail("Should have lost the connection");
-        } catch(KeeperException.SessionMovedException e) {
+        } catch(KeeperException.ConnectionLossException e) {
+            // wait up to 30 seconds for the disco to be delivered
+            for (int i = 0; i < 30; i++) {
+                if (oldWatcher.zkDisco) {
+                    break;
+                }
+                Thread.sleep(1000);
+            }
+            assertTrue(oldWatcher.zkDisco);
         }
         }
 
 
         ArrayList<ZooKeeper> toClose = new ArrayList<ZooKeeper>();
         ArrayList<ZooKeeper> toClose = new ArrayList<ZooKeeper>();
         toClose.add(zknew);
         toClose.add(zknew);
         // Let's just make sure it can still move
         // Let's just make sure it can still move
         for(int i = 0; i < 10; i++) {
         for(int i = 0; i < 10; i++) {
-            zknew = new DisconnectableZooKeeper(hps[1], ClientBase.CONNECTION_TIMEOUT, new Watcher() {
-                public void process(WatchedEvent event) {
-                }}, zk.getSessionId(), zk.getSessionPasswd());
+            zknew = new DisconnectableZooKeeper(hps[1],
+                    ClientBase.CONNECTION_TIMEOUT, new DiscoWatcher(),
+                    zk.getSessionId(), zk.getSessionPasswd());
             toClose.add(zknew);
             toClose.add(zknew);
             zknew.create("/t-"+i, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
             zknew.create("/t-"+i, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
         }
         }
-    for(ZooKeeper z: toClose) {
+        for (ZooKeeper z: toClose) {
             z.close();
             z.close();
         }
         }
         zk.close();
         zk.close();