Przeglądaj źródła

HADOOP-9090. Support on-demand publish of metrics. Contributed by Mostafa Elhemali.

git-svn-id: https://svn.apache.org/repos/asf/hadoop/common/trunk@1416538 13f79535-47bb-0310-9956-ffa450edef68
Suresh Srinivas 12 lat temu
rodzic
commit
e5d54ac89a

+ 4 - 0
hadoop-common-project/hadoop-common/CHANGES.txt

@@ -306,6 +306,10 @@ Release 2.0.3-alpha - Unreleased
 
     HADOOP-9020. Add a SASL PLAIN server (daryn via bobby)
 
+    HADOOP-9090. Support on-demand publish of metrics. (Mostafa Elhemali via
+    suresh)
+
+
   IMPROVEMENTS
 
     HADOOP-8789. Tests setLevel(Level.OFF) should be Level.ERROR.

+ 11 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/MetricsSystem.java

@@ -90,6 +90,17 @@ public abstract class MetricsSystem implements MetricsSystemMXBean {
    */
   public abstract void register(Callback callback);
 
+  /**
+   * Requests an immediate publish of all metrics from sources to sinks.
+   * 
+   * This is a "soft" request: the expectation is that a best effort will be
+   * done to synchronously snapshot the metrics from all the sources and put
+   * them in all the sinks (including flushing the sinks) before returning to
+   * the caller. If this can't be accomplished in reasonable time it's OK to
+   * return to the caller before everything is done. 
+   */
+  public abstract void publishMetricsNow();
+
   /**
    * Shutdown the metrics system completely (usually during server shutdown.)
    * The MetricsSystemMXBean will be unregistered.

+ 46 - 0
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/impl/MetricsSinkAdapter.java

@@ -19,6 +19,7 @@
 package org.apache.hadoop.metrics2.impl;
 
 import java.util.Random;
+import java.util.concurrent.*;
 
 import static com.google.common.base.Preconditions.*;
 
@@ -48,6 +49,7 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
   private volatile boolean stopping = false;
   private volatile boolean inError = false;
   private final int period, firstRetryDelay, retryCount;
+  private final long oobPutTimeout;
   private final float retryBackoff;
   private final MetricsRegistry registry = new MetricsRegistry("sinkadapter");
   private final MutableStat latency;
@@ -69,6 +71,8 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
     this.period = checkArg(period, period > 0, "period");
     firstRetryDelay = checkArg(retryDelay, retryDelay > 0, "retry delay");
     this.retryBackoff = checkArg(retryBackoff, retryBackoff>1, "retry backoff");
+    oobPutTimeout = (long)
+        (firstRetryDelay * Math.pow(retryBackoff, retryCount) * 1000);
     this.retryCount = retryCount;
     this.queue = new SinkQueue<MetricsBuffer>(checkArg(queueCapacity,
         queueCapacity > 0, "queue capacity"));
@@ -95,6 +99,23 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
     }
     return true; // OK
   }
+  
+  public boolean putMetricsImmediate(MetricsBuffer buffer) {
+    WaitableMetricsBuffer waitableBuffer =
+        new WaitableMetricsBuffer(buffer);
+    if (!queue.enqueue(waitableBuffer)) {
+      LOG.warn(name + " has a full queue and can't consume the given metrics.");
+      dropped.incr();
+      return false;
+    }
+    if (!waitableBuffer.waitTillNotified(oobPutTimeout)) {
+      LOG.warn(name +
+          " couldn't fulfill an immediate putMetrics request in time." +
+          " Abandoning.");
+      return false;
+    }
+    return true;
+  }
 
   void publishMetricsFromQueue() {
     int retryDelay = firstRetryDelay;
@@ -158,6 +179,9 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
       sink.flush();
       latency.add(Time.now() - ts);
     }
+    if (buffer instanceof WaitableMetricsBuffer) {
+      ((WaitableMetricsBuffer)buffer).notifyAnyWaiters();
+    }
     LOG.debug("Done");
   }
 
@@ -191,4 +215,26 @@ class MetricsSinkAdapter implements SinkQueue.Consumer<MetricsBuffer> {
   MetricsSink sink() {
     return sink;
   }
+
+  static class WaitableMetricsBuffer extends MetricsBuffer {
+    private final Semaphore notificationSemaphore =
+        new Semaphore(0);
+
+    public WaitableMetricsBuffer(MetricsBuffer metricsBuffer) {
+      super(metricsBuffer);
+    }
+
+    public boolean waitTillNotified(long millisecondsToWait) {
+      try {
+        return notificationSemaphore.tryAcquire(millisecondsToWait,
+            TimeUnit.MILLISECONDS);
+      } catch (InterruptedException e) {
+        return false;
+      }
+    }
+
+    public void notifyAnyWaiters() {
+      notificationSemaphore.release();
+    }
+  }
 }

+ 21 - 3
hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/metrics2/impl/MetricsSystemImpl.java

@@ -344,9 +344,19 @@ public class MetricsSystemImpl extends MetricsSystem implements MetricsSource {
   synchronized void onTimerEvent() {
     logicalTime += period;
     if (sinks.size() > 0) {
-      publishMetrics(sampleMetrics());
+      publishMetrics(sampleMetrics(), false);
     }
   }
+  
+  /**
+   * Requests an immediate publish of all metrics from sources to sinks.
+   */
+  @Override
+  public void publishMetricsNow() {
+    if (sinks.size() > 0) {
+      publishMetrics(sampleMetrics(), true);
+    }    
+  }
 
   /**
    * Sample all the sources for a snapshot of metrics/tags
@@ -380,12 +390,20 @@ public class MetricsSystemImpl extends MetricsSystem implements MetricsSource {
   /**
    * Publish a metrics snapshot to all the sinks
    * @param buffer  the metrics snapshot to publish
+   * @param immediate  indicates that we should publish metrics immediately
+   *                   instead of using a separate thread.
    */
-  synchronized void publishMetrics(MetricsBuffer buffer) {
+  synchronized void publishMetrics(MetricsBuffer buffer, boolean immediate) {
     int dropped = 0;
     for (MetricsSinkAdapter sa : sinks.values()) {
       long startTime = Time.now();
-      dropped += sa.putMetrics(buffer, logicalTime) ? 0 : 1;
+      boolean result;
+      if (immediate) {
+        result = sa.putMetricsImmediate(buffer); 
+      } else {
+        result = sa.putMetrics(buffer, logicalTime);
+      }
+      dropped += result ? 0 : 1;
       publishStat.add(Time.now() - startTime);
     }
     droppedPubAll.incr(dropped);

+ 3 - 26
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/metrics2/impl/TestGangliaMetrics.java

@@ -29,8 +29,6 @@ import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
-import java.util.concurrent.CountDownLatch;
-import java.util.concurrent.TimeUnit;
 
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -115,31 +113,23 @@ public class TestGangliaMetrics {
     final int expectedCountFromGanglia30 = expectedMetrics.length;
     final int expectedCountFromGanglia31 = 2 * expectedMetrics.length;
 
-    // use latch to make sure we received required records before shutting
-    // down the MetricSystem
-    CountDownLatch latch = new CountDownLatch(
-        expectedCountFromGanglia30 + expectedCountFromGanglia31);
-
     // Setup test for GangliaSink30
     AbstractGangliaSink gsink30 = new GangliaSink30();
     gsink30.init(cb.subset("test"));
-    MockDatagramSocket mockds30 = new MockDatagramSocket(latch);
+    MockDatagramSocket mockds30 = new MockDatagramSocket();
     GangliaMetricsTestHelper.setDatagramSocket(gsink30, mockds30);
 
     // Setup test for GangliaSink31
     AbstractGangliaSink gsink31 = new GangliaSink31();
     gsink31.init(cb.subset("test"));
-    MockDatagramSocket mockds31 = new MockDatagramSocket(latch);
+    MockDatagramSocket mockds31 = new MockDatagramSocket();
     GangliaMetricsTestHelper.setDatagramSocket(gsink31, mockds31);
 
     // register the sinks
     ms.register("gsink30", "gsink30 desc", gsink30);
     ms.register("gsink31", "gsink31 desc", gsink31);
-    ms.onTimerEvent();  // trigger something interesting
+    ms.publishMetricsNow(); // publish the metrics
 
-    // wait for all records and the stop MetricSystem.  Without this
-    // sometime the ms gets shutdown before all the sinks have consumed
-    latch.await(200, TimeUnit.MILLISECONDS);
     ms.stop();
 
     // check GanfliaSink30 data
@@ -198,7 +188,6 @@ public class TestGangliaMetrics {
    */
   private class MockDatagramSocket extends DatagramSocket {
     private ArrayList<byte[]> capture;
-    private CountDownLatch latch;
 
     /**
      * @throws SocketException
@@ -207,15 +196,6 @@ public class TestGangliaMetrics {
       capture = new  ArrayList<byte[]>();
     }
 
-    /**
-     * @param latch
-     * @throws SocketException
-     */
-    public MockDatagramSocket(CountDownLatch latch) throws SocketException {
-      this();
-      this.latch = latch;
-    }
-
     /* (non-Javadoc)
      * @see java.net.DatagramSocket#send(java.net.DatagramPacket)
      */
@@ -225,9 +205,6 @@ public class TestGangliaMetrics {
       byte[] bytes = new byte[p.getLength()];
       System.arraycopy(p.getData(), p.getOffset(), bytes, 0, p.getLength());
       capture.add(bytes);
-
-      // decrement the latch
-      latch.countDown();
     }
 
     /**

+ 185 - 3
hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/metrics2/impl/TestMetricsSystemImpl.java

@@ -18,7 +18,11 @@
 
 package org.apache.hadoop.metrics2.impl;
 
-import java.util.List;
+import java.util.*;
+import java.util.concurrent.*;
+import java.util.concurrent.atomic.*;
+
+import javax.annotation.Nullable;
 
 import org.junit.Test;
 import org.junit.runner.RunWith;
@@ -26,9 +30,11 @@ import org.junit.runner.RunWith;
 import org.mockito.ArgumentCaptor;
 import org.mockito.Captor;
 import org.mockito.runners.MockitoJUnitRunner;
+
 import static org.junit.Assert.*;
 import static org.mockito.Mockito.*;
 
+import com.google.common.base.Predicate;
 import com.google.common.collect.Iterables;
 
 import org.apache.commons.configuration.SubsetConfiguration;
@@ -36,6 +42,8 @@ import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.metrics2.MetricsException;
 import static org.apache.hadoop.test.MoreAsserts.*;
+
+import org.apache.hadoop.metrics2.AbstractMetric;
 import org.apache.hadoop.metrics2.MetricsRecord;
 import org.apache.hadoop.metrics2.MetricsSink;
 import org.apache.hadoop.metrics2.MetricsSource;
@@ -47,6 +55,7 @@ import org.apache.hadoop.metrics2.lib.MetricsRegistry;
 import org.apache.hadoop.metrics2.lib.MutableCounterLong;
 import org.apache.hadoop.metrics2.lib.MutableRate;
 import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
+import org.apache.hadoop.util.StringUtils;
 
 /**
  * Test the MetricsSystemImpl class
@@ -72,7 +81,7 @@ public class TestMetricsSystemImpl {
   }
 
   @Test public void testInitFirst() throws Exception {
-    ConfigBuilder cb = new ConfigBuilder().add("*.period", 8)
+    new ConfigBuilder().add("*.period", 8)
         //.add("test.sink.plugin.urls", getPluginUrlsAsString())
         .add("test.sink.test.class", TestSink.class.getName())
         .add("test.*.source.filter.exclude", "s0")
@@ -93,8 +102,9 @@ public class TestMetricsSystemImpl {
     MetricsSink sink2 = mock(MetricsSink.class);
     ms.registerSink("sink1", "sink1 desc", sink1);
     ms.registerSink("sink2", "sink2 desc", sink2);
-    ms.onTimerEvent();  // trigger something interesting
+    ms.publishMetricsNow(); // publish the metrics
     ms.stop();
+    ms.shutdown();
 
     verify(sink1, times(2)).putMetrics(r1.capture());
     List<MetricsRecord> mr1 = r1.getAllValues();
@@ -104,6 +114,177 @@ public class TestMetricsSystemImpl {
     assertEquals("output", mr1, mr2);
   }
 
+  @Test public void testMultiThreadedPublish() throws Exception {
+    new ConfigBuilder().add("*.period", 80)
+      .add("test.sink.Collector.queue.capacity", "20")
+      .save(TestMetricsConfig.getTestFilename("hadoop-metrics2-test"));
+    final MetricsSystemImpl ms = new MetricsSystemImpl("Test");
+    ms.start();
+    final int numThreads = 10;
+    final CollectingSink sink = new CollectingSink(numThreads);
+    ms.registerSink("Collector",
+        "Collector of values from all threads.", sink);
+    final TestSource[] sources = new TestSource[numThreads];
+    final Thread[] threads = new Thread[numThreads];
+    final String[] results = new String[numThreads];
+    final CyclicBarrier barrier1 = new CyclicBarrier(numThreads),
+        barrier2 = new CyclicBarrier(numThreads);
+    for (int i = 0; i < numThreads; i++) {
+      sources[i] = ms.register("threadSource" + i,
+          "A source of my threaded goodness.",
+          new TestSource("threadSourceRec" + i));
+      threads[i] = new Thread(new Runnable() {
+        private boolean safeAwait(int mySource, CyclicBarrier barrier) {
+          try {
+            barrier1.await(2, TimeUnit.SECONDS);
+          } catch (InterruptedException e) {
+            results[mySource] = "Interrupted";
+            return false;
+          } catch (BrokenBarrierException e) {
+            results[mySource] = "Broken Barrier";
+            return false;
+          } catch (TimeoutException e) {
+            results[mySource] = "Timed out on barrier";
+            return false;
+          }
+          return true;
+        }
+        
+        @Override
+        public void run() {
+          int mySource = Integer.parseInt(Thread.currentThread().getName());
+          if (sink.collected[mySource].get() != 0L) {
+            results[mySource] = "Someone else collected my metric!";
+            return;
+          }
+          // Wait for all the threads to come here so we can hammer
+          // the system at the same time
+          if (!safeAwait(mySource, barrier1)) return;
+          sources[mySource].g1.set(230);
+          ms.publishMetricsNow();
+          // Since some other thread may have snatched my metric,
+          // I need to wait for the threads to finish before checking.
+          if (!safeAwait(mySource, barrier2)) return;
+          if (sink.collected[mySource].get() != 230L) {
+            results[mySource] = "Metric not collected!";
+            return;
+          }
+          results[mySource] = "Passed";
+        }
+      }, "" + i);
+    }
+    for (Thread t : threads)
+      t.start();
+    for (Thread t : threads)
+      t.join();
+    assertEquals(0L, ms.droppedPubAll.value());
+    assertTrue(StringUtils.join("\n", Arrays.asList(results)),
+      Iterables.all(Arrays.asList(results), new Predicate<String>() {
+        @Override
+        public boolean apply(@Nullable String input) {
+          return input.equalsIgnoreCase("Passed");
+        }
+      }));
+    ms.stop();
+    ms.shutdown();
+  }
+
+  private static class CollectingSink implements MetricsSink {
+    private final AtomicLong[] collected;
+    
+    public CollectingSink(int capacity) {
+      collected = new AtomicLong[capacity];
+      for (int i = 0; i < capacity; i++) {
+        collected[i] = new AtomicLong();
+      }
+    }
+    
+    @Override
+    public void init(SubsetConfiguration conf) {
+    }
+
+    @Override
+    public void putMetrics(MetricsRecord record) {
+      final String prefix = "threadSourceRec";
+      if (record.name().startsWith(prefix)) {
+        final int recordNumber = Integer.parseInt(
+            record.name().substring(prefix.length()));
+        ArrayList<String> names = new ArrayList<String>();
+        for (AbstractMetric m : record.metrics()) {
+          if (m.name().equalsIgnoreCase("g1")) {
+            collected[recordNumber].set(m.value().longValue());
+            return;
+          }
+          names.add(m.name());
+        }
+      }
+    }
+
+    @Override
+    public void flush() {
+    }
+  }
+
+  @Test public void testHangingSink() {
+    new ConfigBuilder().add("*.period", 8)
+      .add("test.sink.test.class", TestSink.class.getName())
+      .add("test.sink.hanging.retry.delay", "1")
+      .add("test.sink.hanging.retry.backoff", "1.01")
+      .add("test.sink.hanging.retry.count", "0")
+      .save(TestMetricsConfig.getTestFilename("hadoop-metrics2-test"));
+    MetricsSystemImpl ms = new MetricsSystemImpl("Test");
+    ms.start();
+    TestSource s = ms.register("s3", "s3 desc", new TestSource("s3rec"));
+    s.c1.incr();
+    HangingSink hanging = new HangingSink();
+    ms.registerSink("hanging", "Hang the sink!", hanging);
+    ms.publishMetricsNow();
+    assertEquals(1L, ms.droppedPubAll.value());
+    assertFalse(hanging.getInterrupted());
+    ms.stop();
+    ms.shutdown();
+    assertTrue(hanging.getInterrupted());
+    assertTrue("The sink didn't get called after its first hang " +
+               "for subsequent records.", hanging.getGotCalledSecondTime());
+  }
+
+  private static class HangingSink implements MetricsSink {
+    private volatile boolean interrupted;
+    private boolean gotCalledSecondTime;
+    private boolean firstTime = true;
+
+    public boolean getGotCalledSecondTime() {
+      return gotCalledSecondTime;
+    }
+
+    public boolean getInterrupted() {
+      return interrupted;
+    }
+
+    @Override
+    public void init(SubsetConfiguration conf) {
+    }
+
+    @Override
+    public void putMetrics(MetricsRecord record) {
+      // No need to hang every time, just the first record.
+      if (!firstTime) {
+        gotCalledSecondTime = true;
+        return;
+      }
+      firstTime = false;
+      try {
+        Thread.sleep(10 * 1000);
+      } catch (InterruptedException ex) {
+        interrupted = true;
+      }
+    }
+
+    @Override
+    public void flush() {
+    }
+  }
+
   @Test public void testRegisterDups() {
     MetricsSystem ms = new MetricsSystemImpl();
     TestSource ts1 = new TestSource("ts1");
@@ -116,6 +297,7 @@ public class TestMetricsSystemImpl {
     MetricsSource s2 = ms.getSource("ts1");
     assertNotNull(s2);
     assertNotSame(s1, s2);
+    ms.shutdown();
   }
 
   @Test(expected=MetricsException.class) public void testRegisterDupError() {