Browse Source

ZOOKEEPER-3470: fix flaky LearnerMetricsTest

This PR is about fixing the flaky unit test: `LearnerMetricsTest`.
The code changes are:
- Replacing the `Thread.sleep(200)` with polling the metrics until the assertion succeed or timeout (this seems help in some flaky scenario)
- Fixing the `ZKTestCase.waitFor` method
- We are also logging the 0 millisec latency values now on the server side. We calculate these latency values between different hosts (as the difference between the server wall time and the wall time in the request header), so 0 values are possible. And in case of unittests (where all the hosts are using the same clock) it can happen relatively frequently. This situation also caused some of the flakiness. In the previous implementation the 0 latency values were not counted, so the assertion on these counters values were failing in the end of the testcase.

Author: Mate Szalay-Beko <szalay.beko.mate@gmail.com>

Reviewers: eolivelli@apache.org, andor@apache.org

Closes #1040 from symat/ZOOKEEPER-3470 and squashes the following commits:

65a8b5795 [Mate Szalay-Beko] ZOOKEEPER-3470: implement PR comments
7fdf78041 [Mate Szalay-Beko] ZOOKEEPER-3470: fix flaky LearnerMetricsTest
Mate Szalay-Beko 5 years ago
parent
commit
9fa18ab0e1

+ 1 - 1
zookeeper-server/src/main/java/org/apache/zookeeper/server/FinalRequestProcessor.java

@@ -170,7 +170,7 @@ public class FinalRequestProcessor implements RequestProcessor {
              * CLOCK_MONOTONIC.
              */
             long propagationLatency = Time.currentWallTime() - request.getHdr().getTime();
-            if (propagationLatency > 0) {
+            if (propagationLatency >= 0) {
                 ServerMetrics.getMetrics().PROPAGATION_LATENCY.add(propagationLatency);
             }
         }

+ 2 - 2
zookeeper-server/src/main/java/org/apache/zookeeper/server/Request.java

@@ -402,7 +402,7 @@ public class Request {
              * CLOCK_MONOTONIC.
              */
             long latency = currentTime - hdr.getTime();
-            if (latency > 0) {
+            if (latency >= 0) {
                 metric.add(latency);
             }
         }
@@ -415,7 +415,7 @@ public class Request {
              * CLOCK_MONOTONIC.
              */
             long latency = currentTime - hdr.getTime();
-            if (latency > 0) {
+            if (latency >= 0) {
                 metric.add(key, latency);
             }
         }

+ 1 - 1
zookeeper-server/src/main/java/org/apache/zookeeper/server/quorum/Follower.java

@@ -169,7 +169,7 @@ public class Follower extends Learner{
                  */
                 long now = Time.currentWallTime();
                 long latency = now - hdr.getTime();
-                if (latency > 0) {
+                if (latency >= 0) {
                     ServerMetrics.getMetrics().PROPOSAL_LATENCY.add(latency);
                 }
             }

+ 5 - 4
zookeeper-server/src/test/java/org/apache/zookeeper/ZKTestCase.java

@@ -25,7 +25,8 @@ import org.junit.Rule;
 import org.junit.rules.TestWatcher;
 import org.junit.runner.Description;
 import org.junit.runner.RunWith;
-import org.junit.runners.model.FrameworkMethod;
+
+import java.time.LocalDateTime;
 
 /**
  * Base class for a non-parameterized ZK test.
@@ -92,9 +93,9 @@ public class ZKTestCase {
      * @param timeout   timeout in seconds
      * @throws InterruptedException
      */
-    public void waitFor(String msg, WaitForCondition condition, int timeout)
-            throws InterruptedException {
-        for (int i = 0; i < timeout; ++i) {
+    public void waitFor(String msg, WaitForCondition condition, int timeout) throws InterruptedException {
+        final LocalDateTime deadline = LocalDateTime.now().plusSeconds(timeout);
+        while (LocalDateTime.now().isBefore(deadline)) {
             if (condition.evaluate()) {
                 return;
             }

+ 45 - 23
zookeeper-server/src/test/java/org/apache/zookeeper/server/quorum/LearnerMetricsTest.java

@@ -8,9 +8,9 @@
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
- *uuuuu
+ *
  * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "/RequuuAS IS" BASIS,
+ * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
@@ -25,25 +25,31 @@ import org.apache.zookeeper.ZooKeeper;
 import org.apache.zookeeper.metrics.MetricsUtils;
 import org.apache.zookeeper.server.ServerMetrics;
 import org.apache.zookeeper.test.ClientBase;
-import org.junit.Assert;
+import org.hamcrest.Matcher;
+import org.junit.After;
 import org.junit.Test;
 
 import java.util.HashMap;
 import java.util.Map;
 
-import static org.hamcrest.number.OrderingComparison.greaterThan;
+import static org.hamcrest.core.Is.is;
+import static org.hamcrest.number.OrderingComparison.greaterThanOrEqualTo;
 
 public class LearnerMetricsTest extends QuorumPeerTestBase {
 
+    private static final int TIMEOUT_SECONDS = 30;
+    private static final int SERVER_COUNT = 6; // 5 participants, 1 observer
+    private final QuorumPeerTestBase.MainThread[] mt = new QuorumPeerTestBase.MainThread[SERVER_COUNT];
+    private ZooKeeper zk_client;
+
     @Test
     public void testLearnerMetricsTest() throws Exception {
         ServerMetrics.getMetrics().resetAll();
         ClientBase.setupTestEnv();
 
-        final int SERVER_COUNT = 6; // 5 participants, 1 observer
         final String path = "/zk-testLeanerMetrics";
         final byte[] data = new byte[512];
-        final int clientPorts[] = new int[SERVER_COUNT];
+        final int[] clientPorts = new int[SERVER_COUNT];
         StringBuilder sb = new StringBuilder();
         int observer = 0 ;
         clientPorts[observer] = PortAssignment.unique();
@@ -55,7 +61,6 @@ public class LearnerMetricsTest extends QuorumPeerTestBase {
 
         // start the participants
         String quorumCfgSection = sb.toString();
-        QuorumPeerTestBase.MainThread mt[] = new QuorumPeerTestBase.MainThread[SERVER_COUNT];
         for(int i = 1; i < SERVER_COUNT; i++) {
             mt[i] = new QuorumPeerTestBase.MainThread(i, clientPorts[i], quorumCfgSection);
             mt[i].start();
@@ -67,26 +72,43 @@ public class LearnerMetricsTest extends QuorumPeerTestBase {
         mt[observer] = new QuorumPeerTestBase.MainThread(observer, clientPorts[observer], quorumCfgSection, observerConfig);
         mt[observer].start();
 
-        ZooKeeper zk = new ZooKeeper("127.0.0.1:" + clientPorts[1], ClientBase.CONNECTION_TIMEOUT, this);
-
-        waitForOne(zk, ZooKeeper.States.CONNECTED);
-
         // send one create request
-        zk.create(path, data, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
-
-        Thread.sleep(200);
+        zk_client = new ZooKeeper("127.0.0.1:" + clientPorts[1], ClientBase.CONNECTION_TIMEOUT, this);
+        waitForOne(zk_client, ZooKeeper.States.CONNECTED);
+        zk_client.create(path, data, ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
 
-        Map<String, Object> values = MetricsUtils.currentServerMetrics();
         // there are 4 followers, each received two proposals, one for leader election, one for the create request
-        Assert.assertEquals(8L, values.get("learner_proposal_received_count"));
-        Assert.assertEquals(8L, values.get("cnt_proposal_latency"));
-        Assert.assertThat((long)values.get("min_proposal_latency"), greaterThan(0L));
-        Assert.assertEquals(10L, values.get("cnt_proposal_ack_creation_latency"));
-        Assert.assertThat((long)values.get("min_proposal_ack_creation_latency"), greaterThan(0L));
+        waitForMetric("learner_proposal_received_count", is(8L));
+        waitForMetric("cnt_proposal_latency", is(8L));
+        waitForMetric("min_proposal_latency", greaterThanOrEqualTo(0L));
+        waitForMetric("cnt_proposal_ack_creation_latency", is(10L));
+        waitForMetric("min_proposal_ack_creation_latency", greaterThanOrEqualTo(0L));
 
         // there are five learners, each received two commits, one for leader election, one for the create request
-        Assert.assertEquals(10L, values.get("learner_commit_received_count"));
-        Assert.assertEquals(10L, values.get("cnt_commit_propagation_latency"));
-        Assert.assertThat((long)values.get("min_commit_propagation_latency"), greaterThan(0L));
+        waitForMetric("learner_commit_received_count", is(10L));
+        waitForMetric("cnt_commit_propagation_latency", is(10L));
+        waitForMetric("min_commit_propagation_latency", greaterThanOrEqualTo(0L));
+    }
+
+    private void waitForMetric(final String metricKey, final Matcher<Long> matcher) throws InterruptedException {
+        final String errorMessage = String.format("unable to match on metric: %s", metricKey);
+        waitFor(errorMessage,
+                () -> {
+                    long actual = (long) MetricsUtils.currentServerMetrics().get(metricKey);
+                    if(!matcher.matches(actual)) {
+                        LOG.info(String.format("match failed on %s, actual value: %d", metricKey, actual));
+                        return false;
+                    }
+                    return true;
+                },
+                TIMEOUT_SECONDS);
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        zk_client.close();
+        for(int i = 0; i < SERVER_COUNT; i++) {
+            mt[i].shutdown();
+        }
     }
 }