ソースを参照

ZOOKEEPER-1732. ZooKeeper server unable to join established ensemble (German Blanco via fpj)


git-svn-id: https://svn.apache.org/repos/asf/zookeeper/trunk@1534390 13f79535-47bb-0310-9956-ffa450edef68
Flavio Paiva Junqueira 11 年 前
コミット
c6ee0139d5

+ 3 - 0
CHANGES.txt

@@ -450,6 +450,9 @@ BUGFIXES:
 
   ZOOKEEPER-1646. mt c client tests fail on Ubuntu Raring (phunt)
 
+  ZOOKEEPER-1732. ZooKeeper server unable to join established
+  ensemble (German Blanco via fpj)
+
 IMPROVEMENTS:
 
   ZOOKEEPER-1170. Fix compiler (eclipse) warnings: unused imports,

+ 24 - 6
src/java/main/org/apache/zookeeper/server/quorum/FastLeaderElection.java

@@ -67,6 +67,14 @@ public class FastLeaderElection implements Election {
      */
 
     final static int maxNotificationInterval = 60000;
+    
+    /**
+     * This value is passed to the methods that check the quorum
+     * majority of an established ensemble for those values that
+     * should not be taken into account in the comparison 
+     * (electionEpoch and zxid). 
+     */
+    final static int IGNOREVALUE = -1;
 
     /**
      * Connection manager. Fast leader election uses TCP for
@@ -382,7 +390,7 @@ public class FastLeaderElection implements Election {
                                             ToSend.mType.notification,
                                             current.getId(),
                                             current.getZxid(),
-                                            logicalclock,
+                                            current.getElectionEpoch(),
                                             self.getPeerState(),
                                             response.sid,
                                             current.getPeerEpoch(), 
@@ -919,15 +927,25 @@ public class FastLeaderElection implements Election {
                             }
                         }
 
-                        /**
+                        /*
                          * Before joining an established ensemble, verify that
                          * a majority are following the same leader.
+                         * Only peer epoch is used to check that the votes come
+                         * from the same ensemble. This is because there is at
+                         * least one corner case in which the ensemble can be
+                         * created with inconsistent zxid and election epoch
+                         * info. However, given that only one ensemble can be
+                         * running at a single point in time and that each 
+                         * epoch is used only once, using only the epoch to 
+                         * compare the votes is sufficient.
+                         * 
+                         * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
                          */
-                        outofelection.put(n.sid, new Vote(n.leader, n.zxid,
-                                n.electionEpoch, n.peerEpoch, n.state));
+                        outofelection.put(n.sid, new Vote(n.leader, 
+                                IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state));
                         if (termPredicate(outofelection, new Vote(n.leader,
-                                n.zxid, n.electionEpoch, n.peerEpoch, n.state))
-                                && checkLeader(outofelection, n.leader, n.electionEpoch)) {
+                                IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state))
+                                && checkLeader(outofelection, n.leader, IGNOREVALUE)) {
                             synchronized(this){
                                 logicalclock = n.electionEpoch;
                                 self.setPeerState((n.leader == self.getId()) ?

+ 9 - 0
src/java/main/org/apache/zookeeper/server/quorum/Leader.java

@@ -1242,6 +1242,15 @@ public class Leader {
         }
         
         zk.startup();
+        /*
+         * Update the election vote here to ensure that all members of the
+         * ensemble report the same vote to new servers that start up and
+         * send leader election notifications to the ensemble.
+         * 
+         * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
+         */
+        self.updateElectionVote(getEpoch());
+
         zk.getZKDatabase().setlastProcessedZxid(zk.getZxid());
     }
 

+ 9 - 0
src/java/main/org/apache/zookeeper/server/quorum/Learner.java

@@ -495,6 +495,15 @@ public class Learner {
         writePacket(ack, true);
         sock.setSoTimeout(self.tickTime * self.syncLimit);
         zk.startup();
+        /*
+         * Update the election vote here to ensure that all members of the
+         * ensemble report the same vote to new servers that start up and
+         * send leader election notifications to the ensemble.
+         * 
+         * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
+         */
+        self.updateElectionVote(newEpoch);
+
         // We need to log the stuff that came in between the snapshot and the uptodate
         if (zk instanceof FollowerZooKeeperServer) {
             FollowerZooKeeperServer fzk = (FollowerZooKeeperServer)zk;

+ 18 - 0
src/java/main/org/apache/zookeeper/server/quorum/QuorumPeer.java

@@ -1605,4 +1605,22 @@ public class QuorumPeer extends Thread implements QuorumStats.Provider {
        }
        return false;
    }
+ 
+    /**
+     * Updates leader election info to avoid inconsistencies when
+     * a new server tries to join the ensemble.
+     * 
+     * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
+     */
+    protected void updateElectionVote(long newEpoch) {
+        Vote currentVote = getCurrentVote();
+        if (currentVote != null) {
+            setCurrentVote(new Vote(currentVote.getId(),
+                currentVote.getZxid(),
+                currentVote.getElectionEpoch(),
+                newEpoch,
+                currentVote.getState()));
+        }
+    }
+
 }

+ 63 - 0
src/java/test/org/apache/zookeeper/test/FLETest.java

@@ -454,4 +454,67 @@ public class FLETest extends ZKTestCase {
             }
         }
     }
+
+    /*
+     * For ZOOKEEPER-1732 verify that it is possible to join an ensemble with
+     * inconsistent election round information.
+     */
+    @Test
+    public void testJoinInconsistentEnsemble() throws Exception {
+        int sid;
+        QuorumPeer peer;
+        int waitTime = 10 * 1000;
+        ArrayList<QuorumPeer> peerList = new ArrayList<QuorumPeer>();
+        for(sid = 0; sid < 3; sid++) {
+            peers.put(Long.valueOf(sid),
+                    new QuorumServer(sid,
+                            new InetSocketAddress(PortAssignment.unique()),
+                    new InetSocketAddress(PortAssignment.unique())));
+            tmpdir[sid] = ClientBase.createTmpDir();
+            port[sid] = PortAssignment.unique();
+        }
+        // start 2 peers and verify if they form the cluster
+        for (sid = 0; sid < 2; sid++) {
+            peer = new QuorumPeer(peers, tmpdir[sid], tmpdir[sid],
+                                             port[sid], 3, sid, 2000, 2, 2);
+            LOG.info("Starting peer " + peer.getId());
+            peer.start();
+            peerList.add(sid, peer);
+        }
+        peer = peerList.get(0);
+        VerifyState v1 = new VerifyState(peerList.get(0));
+        v1.start();
+        v1.join(waitTime);
+        Assert.assertFalse("Unable to form cluster in " +
+            waitTime + " ms",
+            !v1.isSuccess());
+        // Change the election round for one of the members of the ensemble
+        long leaderSid = peer.getCurrentVote().getId();
+        long zxid = peer.getCurrentVote().getZxid();
+        long electionEpoch = peer.getCurrentVote().getElectionEpoch();
+        ServerState state = peer.getCurrentVote().getState();
+        long peerEpoch = peer.getCurrentVote().getPeerEpoch();
+        Vote newVote = new Vote(leaderSid, zxid+100, electionEpoch+100, peerEpoch, state);
+        peer.setCurrentVote(newVote);
+        // Start 3rd peer and check if it joins the quorum
+        peer = new QuorumPeer(peers, tmpdir[2], tmpdir[2],
+                 port[2], 3, 2, 2000, 2, 2);
+        LOG.info("Starting peer " + peer.getId());
+        peer.start();
+        peerList.add(sid, peer);
+        v1 = new VerifyState(peer);
+        v1.start();
+        v1.join(waitTime);
+        if (v1.isAlive()) {
+               Assert.fail("Peer " + peer.getId() + " failed to join the cluster " +
+                "within " + waitTime + " ms");
+        }
+        // cleanup
+        for (int id = 0; id < 3; id++) {
+            peer = peerList.get(id);
+            if (peer != null) {
+                peer.shutdown();
+            }
+        }
+    }
 }