Explorar o código

ZOOKEEPER-1818: Correctly handle potential inconsistent zxid/electionEpoch and peerEpoch during leader election

This is similar to the 3.4 implementation in Jira ZOOKEEPER-1817, main differences with that:

1. removed bcVote, in Vote.equals it will skip compare peerEpoch when one of the version is 0x0.
2. added detailed scenarios which we tried to solve with peerEpoch update and the change in Vote.equals.
3. removed ooePredicate with inlined one, since master code is tracking voteSet..
4. improved the tests to make it cleaner.

Author: Fangmin Lyu <fangmin@apache.org>

Reviewers: hanm@apache.org, andor@apache.org

Closes #703 from lvfangmin/ZOOKEEPER-1818 and squashes the following commits:

022fcbb78 [Fangmin Lyu] update the comment in the test
b51e124c4 [Fangmin Lyu] update comment about the JIRA references
ccccac2aa [Fangmin Lyu] Correctly handle potential inconsitent zxid/electionEpoch and peerEpoch during leader election
Fangmin Lyu %!s(int64=6) %!d(string=hai) anos
pai
achega
b7403b790f

+ 20 - 37
zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/FastLeaderElection.java

@@ -70,14 +70,6 @@ public class FastLeaderElection implements Election {
 
     final static int maxNotificationInterval = 60000;
 
-    /**
-     * This value is passed to the methods that check the quorum
-     * majority of an established ensemble for those values that
-     * should not be taken into account in the comparison
-     * (electionEpoch and zxid).
-     */
-    final static int IGNOREVALUE = -1;
-
     /**
      * Connection manager. Fast leader election uses TCP for
      * communication between peers, and QuorumCnxManager manages
@@ -742,7 +734,7 @@ public class FastLeaderElection implements Election {
      *            Identifier of the vote received last
      * @return the SyncedLearnerTracker with vote details
      */
-    private SyncedLearnerTracker getVoteTracker(Map<Long, Vote> votes, Vote vote) {
+    protected SyncedLearnerTracker getVoteTracker(Map<Long, Vote> votes, Vote vote) {
         SyncedLearnerTracker voteSet = new SyncedLearnerTracker();
         voteSet.addQuorumVerifier(self.getQuorumVerifier());
         if (self.getLastSeenQuorumVerifier() != null
@@ -775,7 +767,7 @@ public class FastLeaderElection implements Election {
      * @param   leader  leader id
      * @param   electionEpoch   epoch id
      */
-    private boolean checkLeader(
+    protected boolean checkLeader(
             Map<Long, Vote> votes,
             long leader,
             long electionEpoch){
@@ -999,6 +991,7 @@ public class FastLeaderElection implements Election {
                                     ", proposed election epoch=0x" + Long.toHexString(n.electionEpoch));
                         }
 
+                        // don't care about the version if it's in LOOKING state
                         recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
 
                         voteSet = getVoteTracker(
@@ -1023,9 +1016,9 @@ public class FastLeaderElection implements Election {
                              */
                             if (n == null) {
                                 setPeerState(proposedLeader, voteSet);
-
                                 Vote endVote = new Vote(proposedLeader,
-                                        proposedZxid, proposedEpoch);
+                                        proposedZxid, logicalclock.get(), 
+                                        proposedEpoch);
                                 leaveInstance(endVote);
                                 return endVote;
                             }
@@ -1042,14 +1035,13 @@ public class FastLeaderElection implements Election {
                          */
                         if(n.electionEpoch == logicalclock.get()){
                             recvset.put(n.sid, new Vote(n.leader, n.zxid, n.electionEpoch, n.peerEpoch));
-                            voteSet = getVoteTracker(
-                                    recvset, new Vote(n.leader, n.zxid,
-                                    n.electionEpoch, n.peerEpoch, n.state));
-                            if(voteSet.hasAllQuorums()
-                                            && checkLeader(outofelection, n.leader, n.electionEpoch)) {
+                            voteSet = getVoteTracker(recvset, new Vote(n.version, 
+                                      n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state));
+                            if (voteSet.hasAllQuorums() && 
+                                    checkLeader(outofelection, n.leader, n.electionEpoch)) {
                                 setPeerState(n.leader, voteSet);
-
-                                Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
+                                Vote endVote = new Vote(n.leader, 
+                                        n.zxid, n.electionEpoch, n.peerEpoch);
                                 leaveInstance(endVote);
                                 return endVote;
                             }
@@ -1058,29 +1050,20 @@ public class FastLeaderElection implements Election {
                         /*
                          * Before joining an established ensemble, verify that
                          * a majority are following the same leader.
-                         * Only peer epoch is used to check that the votes come
-                         * from the same ensemble. This is because there is at
-                         * least one corner case in which the ensemble can be
-                         * created with inconsistent zxid and election epoch
-                         * info. However, given that only one ensemble can be
-                         * running at a single point in time and that each
-                         * epoch is used only once, using only the epoch to
-                         * compare the votes is sufficient.
-                         *
-                         * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
                          */
-                        outofelection.put(n.sid, new Vote(n.leader,
-                                IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state));
-                        voteSet = getVoteTracker(
-                                outofelection, new Vote(n.leader,
-                                IGNOREVALUE, IGNOREVALUE, n.peerEpoch, n.state));
-                        if (voteSet.hasAllQuorums()
-                                && checkLeader(outofelection, n.leader, IGNOREVALUE)) {
+                        outofelection.put(n.sid, new Vote(n.version, n.leader,
+                                n.zxid, n.electionEpoch, n.peerEpoch, n.state));
+                        voteSet = getVoteTracker(outofelection, new Vote(n.version, 
+                                n.leader, n.zxid, n.electionEpoch, n.peerEpoch, n.state));
+
+                        if (voteSet.hasAllQuorums() &&
+                                checkLeader(outofelection, n.leader, n.electionEpoch)) {
                             synchronized(this){
                                 logicalclock.set(n.electionEpoch);
                                 setPeerState(n.leader, voteSet);
                             }
-                            Vote endVote = new Vote(n.leader, n.zxid, n.peerEpoch);
+                            Vote endVote = new Vote(n.leader, n.zxid, 
+                                    n.electionEpoch, n.peerEpoch);
                             leaveInstance(endVote);
                             return endVote;
                         }

+ 33 - 1
zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/QuorumPeer.java

@@ -1997,8 +1997,40 @@ public class QuorumPeer extends ZooKeeperThread implements QuorumStats.Provider
     /**
      * Updates leader election info to avoid inconsistencies when
      * a new server tries to join the ensemble.
+     *
+     * Here is the inconsistency scenario we try to solve by updating the peer 
+     * epoch after following leader:
+     *
+     * Let's say we have an ensemble with 3 servers z1, z2 and z3.
+     *
+     * 1. z1, z2 were following z3 with peerEpoch to be 0xb8, the new epoch is 
+     *    0xb9, aka current accepted epoch on disk.
+     * 2. z2 get restarted, which will use 0xb9 as it's peer epoch when loading
+     *    the current accept epoch from disk.
+     * 3. z2 received notification from z1 and z3, which is following z3 with 
+     *    epoch 0xb8, so it started following z3 again with peer epoch 0xb8.
+     * 4. before z2 successfully connected to z3, z3 get restarted with new 
+     *    epoch 0xb9.
+     * 5. z2 will retry around a few round (default 5s) before giving up, 
+     *    meanwhile it will report z3 as leader.
+     * 6. z1 restarted, and looking with peer epoch 0xb9.
+     * 7. z1 voted z3, and z3 was elected as leader again with peer epoch 0xb9.
+     * 8. z2 successfully connected to z3 before giving up, but with peer 
+     *    epoch 0xb8.
+     * 9. z1 get restarted, looking for leader with peer epoch 0xba, but cannot 
+     *    join, because z2 is reporting peer epoch 0xb8, while z3 is reporting 
+     *    0xb9.
+     *
+     * By updating the election vote after actually following leader, we can 
+     * avoid this kind of stuck happened.
+     *
+     * Btw, the zxid and electionEpoch could be inconsistent because of the same 
+     * reason, it's better to update these as well after syncing with leader, but 
+     * that required protocol change which is non trivial. This problem is worked 
+     * around by skipping comparing the zxid and electionEpoch when counting for 
+     * votes for out of election servers during looking for leader.
      * 
-     * @see https://issues.apache.org/jira/browse/ZOOKEEPER-1732
+     * {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1732}
      */
     protected void updateElectionVote(long newEpoch) {
         Vote currentVote = getCurrentVote();

+ 39 - 2
zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Vote.java

@@ -125,11 +125,48 @@ public class Vote {
             return false;
         }
         Vote other = (Vote) o;
-        return (id == other.id
+
+        if ((state == ServerState.LOOKING) ||
+                (other.state == ServerState.LOOKING)) {
+            return (id == other.id
                     && zxid == other.zxid
                     && electionEpoch == other.electionEpoch
                     && peerEpoch == other.peerEpoch);
-
+        } else {
+            /*
+             * There are two things going on in the logic below:
+             * 
+             * 1. skip comparing the zxid and electionEpoch for votes for servers 
+             *    out of election. 
+             *    
+             *    Need to skip those because they can be inconsistent due to  
+             *    scenarios described in QuorumPeer.updateElectionVote. 
+             *
+             *    And given that only one ensemble can be running at a single point 
+             *    in time and that each epoch is used only once, using only id and 
+             *    epoch to compare the votes is sufficient.
+             *
+             *    {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1805}
+             *
+             * 2. skip comparing peerEpoch if if we're running with mixed ensemble 
+             *    with (version > 0x0) and without the change (version = 0x0) 
+             *    introduced in ZOOKEEPER-1732.
+             *
+             *    {@see https://issues.apache.org/jira/browse/ZOOKEEPER-1732}
+             *
+             *    The server running with and without ZOOKEEPER-1732 will return 
+             *    different peerEpoch. During rolling upgrades, it's possible
+             *    that 2/5 servers are returning epoch 1, while the other 2/5
+             *    are returning epoch 2, the other server need to ignore the 
+             *    peerEpoch to be able to join it.
+             */
+            if ((version > 0x0) ^ (other.version > 0x0)) {
+                return id == other.id;
+            } else {
+                return (id == other.id
+                        && peerEpoch == other.peerEpoch);
+            }
+        }
     }
 
     @Override

+ 136 - 0
zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/FLEOutOfElectionTest.java

@@ -0,0 +1,136 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.zookeeper.server.quorum;
+
+import java.io.File;
+import java.net.InetSocketAddress;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.zookeeper.PortAssignment;
+import org.apache.zookeeper.server.quorum.QuorumPeer;
+import org.apache.zookeeper.server.quorum.FastLeaderElection.Notification;
+import org.apache.zookeeper.server.quorum.Vote;
+import org.apache.zookeeper.server.quorum.QuorumPeer.QuorumServer;
+import org.apache.zookeeper.server.quorum.QuorumPeer.ServerState;
+import org.apache.zookeeper.server.util.ZxidUtils;
+import org.apache.zookeeper.test.ClientBase;
+import org.apache.zookeeper.test.FLETest;
+import org.junit.After;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Test FastLeaderElection with out of election servers.
+ */
+public class FLEOutOfElectionTest {
+
+    private FastLeaderElection fle;
+
+    @Before
+    public void setUp() throws Exception {
+        File tmpdir = ClientBase.createTmpDir();
+        Map<Long, QuorumServer> peers = new HashMap<Long,QuorumServer>();
+        for(int i = 0; i < 5; i++) {
+            peers.put(Long.valueOf(i), new QuorumServer(Long.valueOf(i), 
+                    new InetSocketAddress("127.0.0.1", PortAssignment.unique())));
+        }
+        QuorumPeer peer = new QuorumPeer(peers, tmpdir, tmpdir, 
+                PortAssignment.unique(), 3, 3, 1000, 2, 2);
+        fle = new FastLeaderElection(peer, peer.createCnxnManager());
+    }
+
+    @Test
+    public void testIgnoringZxidElectionEpoch() {
+        Map<Long, Vote> votes = new HashMap<Long, Vote>();
+        votes.put(0L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 2, ServerState.FOLLOWING));
+        votes.put(1L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 2), 1, 2, ServerState.FOLLOWING));
+        votes.put(3L, new Vote(0x1, 4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING));
+        votes.put(4L, new Vote(0x1, 4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LEADING));
+
+        Assert.assertTrue(fle.getVoteTracker(votes, 
+                new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING)).hasAllQuorums());
+    }
+
+    @Test
+    public void testElectionWIthDifferentVersion() {
+        Map<Long, Vote> votes = new HashMap<Long, Vote>();
+        votes.put(0L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.FOLLOWING));
+        votes.put(1L, new Vote(0x1, 4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.FOLLOWING));
+        votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING));
+        votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LEADING));
+
+        Assert.assertTrue(fle.getVoteTracker(votes, 
+                new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.FOLLOWING)).hasAllQuorums());
+    }
+
+    @Test
+    public void testLookingNormal() {
+        Map<Long, Vote> votes = new HashMap<Long, Vote>();
+        votes.put(0L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING));
+        votes.put(1L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING));
+        votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING));
+        votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LEADING));
+
+        Assert.assertTrue(fle.getVoteTracker(votes, 
+                new Vote(4L, ZxidUtils.makeZxid(2, 1), 1, 1, ServerState.LOOKING)).hasAllQuorums());
+    }
+
+    @Test
+    public void testLookingDiffRounds() {
+        HashMap<Long, Vote> votes = new HashMap<Long, Vote>();
+        votes.put(0L, new Vote(4L, ZxidUtils.makeZxid(1, 1), 1, 1, ServerState.LOOKING));
+        votes.put(1L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LOOKING));
+        votes.put(3L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 3, 2, ServerState.LOOKING));
+        votes.put(4L, new Vote(4L, ZxidUtils.makeZxid(2, 1), 3, 2, ServerState.LEADING));
+
+        Assert.assertFalse(fle.getVoteTracker(votes,
+                new Vote(4L, ZxidUtils.makeZxid(2, 1), 2, 2, ServerState.LOOKING)).hasAllQuorums());
+    }
+
+    @Test
+    public void testOutofElection() {
+        HashMap<Long,Vote> outofelection = new HashMap<Long,Vote>();
+
+        outofelection.put(1L, new Vote(0x0, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x17, ServerState.FOLLOWING));
+        outofelection.put(2L, new Vote(0x0, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x17, ServerState.FOLLOWING));
+        outofelection.put(4L, new Vote(0x1, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x18, ServerState.FOLLOWING));
+        Vote vote = new Vote(0x1, 5, ZxidUtils.makeZxid(15, 0), 0xa, 0x18, ServerState.LEADING);
+        outofelection.put(5L, vote);
+
+        Notification n = new Notification();
+        n.version = vote.getVersion();
+        n.leader = vote.getId();
+        n.zxid = vote.getZxid();
+        n.electionEpoch = vote.getElectionEpoch();
+        n.state = vote.getState();
+        n.peerEpoch = vote.getPeerEpoch();
+        n.sid = 5L;
+        
+        // Set the logical clock to 1 on fle instance of server 3.
+        fle.logicalclock.set(0x1);
+
+        Assert.assertTrue("Quorum check failed", 
+                fle.getVoteTracker(outofelection, new Vote(n.version, n.leader, 
+                        n.zxid, n.electionEpoch, n.peerEpoch, n.state)).hasAllQuorums());
+
+        Assert.assertTrue("Leader check failed", fle.checkLeader(outofelection, 
+                n.leader, n.electionEpoch)); 
+    }
+}