Parcourir la source

HDDS-1682. TestEventWatcher.testMetrics is flaky

Closes #962.
Márton Elek il y a 5 ans
Parent
commit
b039f7591f

+ 6 - 5
hadoop-hdds/framework/src/main/java/org/apache/hadoop/hdds/server/events/EventWatcher.java

@@ -143,14 +143,15 @@ public abstract class EventWatcher<TIMEOUT_PAYLOAD extends
   protected synchronized void handleCompletion(COMPLETION_PAYLOAD
       completionPayload, EventPublisher publisher) throws
       LeaseNotFoundException {
-    metrics.incrementCompletedEvents();
     long id = completionPayload.getId();
     leaseManager.release(id);
     TIMEOUT_PAYLOAD payload = trackedEventsByID.remove(id);
-    trackedEvents.remove(payload);
-    long originalTime = startTrackingTimes.remove(id);
-    metrics.updateFinishingTime(System.currentTimeMillis() - originalTime);
-    onFinished(publisher, payload);
+    if (trackedEvents.remove(payload)) {
+      metrics.incrementCompletedEvents();
+      long originalTime = startTrackingTimes.remove(id);
+      metrics.updateFinishingTime(System.currentTimeMillis() - originalTime);
+      onFinished(publisher, payload);
+    }
   }
 
   private synchronized void handleTimeout(EventPublisher publisher,

+ 17 - 7
hadoop-hdds/framework/src/test/java/org/apache/hadoop/hdds/server/events/TestEventWatcher.java

@@ -179,22 +179,32 @@ public class TestEventWatcher {
 
     queue.fireEvent(REPLICATION_COMPLETED, event1Completed);
 
-    Thread.sleep(2200L);
+    //lease manager timeout = 2000L
+    Thread.sleep(3 * 2000L);
+
+    queue.processAll(2000L);
 
     //until now: 3 in-progress activities are tracked with three
     // UnderreplicatedEvents. The first one is completed, the remaining two
-    // are timed out (as the timeout -- defined in the leasmanager -- is 2000ms.
+    // are timed out (as the timeout -- defined in the lease manager -- is
+    // 2000ms).
 
     EventWatcherMetrics metrics = replicationWatcher.getMetrics();
 
     //3 events are received
     Assert.assertEquals(3, metrics.getTrackedEvents().value());
 
-    //one is finished. doesn't need to be resent
-    Assert.assertEquals(1, metrics.getCompletedEvents().value());
-
-    //Other two are timed out and resent
-    Assert.assertEquals(2, metrics.getTimedOutEvents().value());
+    //completed + timed out = all messages
+    Assert.assertEquals(
+        "number of timed out and completed messages should be the same as the"
+            + " all messages",
+        metrics.getTrackedEvents().value(),
+        metrics.getCompletedEvents().value() + metrics.getTimedOutEvents()
+            .value());
+
+    //_at least_ two are timed out.
+    Assert.assertTrue("At least two events should be timed out.",
+        metrics.getTimedOutEvents().value() >= 2);
 
     DefaultMetricsSystem.shutdown();
   }