Browse Source

YARN-6451. Add RM monitor validating metrics invariants. Contributed by Carlo Curino

Chris Douglas 8 năm trước cách đây
mục cha
commit
af8e9842d2

+ 1 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/pom.xml

@@ -347,6 +347,7 @@
             <exclude>src/test/resources/submit-reservation.json</exclude>
             <exclude>src/test/resources/delete-reservation.json</exclude>
             <exclude>src/test/resources/update-reservation.json</exclude>
+            <exclude>src/test/resources/invariants.txt</exclude>
           </excludes>
         </configuration>
       </plugin>

+ 35 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantViolationException.java

@@ -0,0 +1,35 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+
+import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
+
+/**
+ * This exception represents the violation of an internal invariant.
+ */
+public class InvariantViolationException extends YarnRuntimeException {
+
+  public InvariantViolationException(String s) {
+    super(s);
+  }
+
+  public InvariantViolationException(String s, Exception e) {
+    super(s, e);
+  }
+}

+ 96 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/InvariantsChecker.java

@@ -0,0 +1,96 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
+import org.apache.hadoop.yarn.server.resourcemanager.monitor.SchedulingEditPolicy;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Abstract invariant checker, that setup common context for invariants
+ * checkers.
+ */
+public abstract class InvariantsChecker implements SchedulingEditPolicy {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(InvariantsChecker.class);
+  public static final String THROW_ON_VIOLATION =
+      "yarn.resourcemanager.invariant-checker.throw-on-violation";
+  public static final String INVARIANT_MONITOR_INTERVAL =
+      "yarn.resourcemanager.invariant-checker.monitor-interval";
+
+  private Configuration conf;
+  private RMContext context;
+  private PreemptableResourceScheduler scheduler;
+  private boolean throwOnInvariantViolation;
+  private long monitoringInterval;
+
+  @Override
+  public void init(Configuration config, RMContext rmContext,
+      PreemptableResourceScheduler preemptableResourceScheduler) {
+    this.conf = config;
+    this.context = rmContext;
+    this.scheduler = preemptableResourceScheduler;
+    this.throwOnInvariantViolation =
+        conf.getBoolean(InvariantsChecker.THROW_ON_VIOLATION, false);
+    this.monitoringInterval =
+        conf.getLong(InvariantsChecker.INVARIANT_MONITOR_INTERVAL, 1000L);
+
+    LOG.info("Invariant checker " + this.getPolicyName()
+        + " enabled. Monitoring every " + monitoringInterval
+        + "ms, throwOnViolation=" + throwOnInvariantViolation);
+  }
+
+  @Override
+  public long getMonitoringInterval() {
+    return monitoringInterval;
+  }
+
+  @Override
+  public String getPolicyName() {
+    return this.getClass().getSimpleName();
+  }
+
+  public void logOrThrow(String message) throws InvariantViolationException {
+    if (getThrowOnInvariantViolation()) {
+      throw new InvariantViolationException(message);
+    } else {
+      LOG.warn(message);
+    }
+  }
+
+  public boolean getThrowOnInvariantViolation() {
+    return throwOnInvariantViolation;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public RMContext getContext() {
+    return context;
+  }
+
+  public PreemptableResourceScheduler getScheduler() {
+    return scheduler;
+  }
+
+}

+ 195 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/MetricsInvariantChecker.java

@@ -0,0 +1,195 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+import com.google.common.base.Charsets;
+import com.google.common.io.Files;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.AbstractMetric;
+import org.apache.hadoop.metrics2.MetricsRecord;
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.impl.MetricsCollectorImpl;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.source.JvmMetrics;
+import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.PreemptableResourceScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.script.Compilable;
+import javax.script.CompiledScript;
+import javax.script.ScriptEngineManager;
+import javax.script.ScriptException;
+import javax.script.SimpleBindings;
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+/**
+ * This policy checks at every invocation that a given set of invariants
+ * (specified in a file) are respected over QueueMetrics and JvmMetrics. The
+ * file may contain arbitrary (Javascrip) boolean expression over the metrics
+ * variables.
+ *
+ * The right set of invariants depends on the deployment environment, a large
+ * number of complex invariant can make this check expensive.
+ *
+ * The MetricsInvariantChecker can be configured to throw a RuntimeException or
+ * simlpy warn in the logs if an invariant is not respected.
+ */
+public class MetricsInvariantChecker extends InvariantsChecker {
+
+  private static final Logger LOG =
+      LoggerFactory.getLogger(MetricsInvariantChecker.class);
+  public static final String INVARIANTS_FILE =
+      "yarn.resourcemanager.invariant-checker.file";
+
+  private MetricsSystem metricsSystem;
+  private MetricsCollectorImpl collector;
+  private SimpleBindings bindings;
+  private ScriptEngineManager manager;
+  private Compilable scriptEngine;
+  private String invariantFile;
+  private Map<String, CompiledScript> invariants;
+  private CompiledScript combinedInvariants;
+
+  // set of metrics we monitor
+  private QueueMetrics queueMetrics;
+  private JvmMetrics jvmMetrics;
+
+  @Override
+  public void init(Configuration config, RMContext rmContext,
+      PreemptableResourceScheduler preemptableResourceScheduler) {
+
+    super.init(config, rmContext, preemptableResourceScheduler);
+
+    this.metricsSystem = DefaultMetricsSystem.instance();
+    this.queueMetrics =
+        QueueMetrics.forQueue(metricsSystem, "root", null, false, getConf());
+    this.jvmMetrics = (JvmMetrics) metricsSystem.getSource("JvmMetrics");
+
+    // at first collect all metrics
+    collector = new MetricsCollectorImpl();
+    queueMetrics.getMetrics(collector, true);
+    jvmMetrics.getMetrics(collector, true);
+
+    // prepare bindings and evaluation engine
+    this.bindings = new SimpleBindings();
+    this.manager = new ScriptEngineManager();
+    this.scriptEngine = (Compilable) manager.getEngineByName("JavaScript");
+
+    // load metrics invariant from file
+    this.invariantFile = getConf().get(MetricsInvariantChecker.INVARIANTS_FILE);
+
+    this.invariants = new HashMap<>();
+
+    // preload all bindings
+    queueMetrics.getMetrics(collector, true);
+    jvmMetrics.getMetrics(collector, true);
+    for (MetricsRecord record : collector.getRecords()) {
+      for (AbstractMetric am : record.metrics()) {
+        bindings.put(am.name().replace(' ', '_'), am.value());
+      }
+    }
+
+    StringBuilder sb = new StringBuilder();
+    try {
+      List<String> tempInv =
+          Files.readLines(new File(invariantFile), Charsets.UTF_8);
+
+
+      boolean first = true;
+      // precompile individual invariants
+      for (String inv : tempInv) {
+
+        if(first) {
+          first = false;
+        } else {
+          sb.append("&&");
+        }
+
+        invariants.put(inv, scriptEngine.compile(inv));
+        sb.append(" (");
+        sb.append(inv);
+        sb.append(") ");
+      }
+
+      // create a single large combined invariant for speed of checking
+      combinedInvariants = scriptEngine.compile(sb.toString());
+
+    } catch (IOException e) {
+      throw new RuntimeException(
+          "Error loading invariant file: " + e.getMessage());
+    } catch (ScriptException e) {
+      throw new RuntimeException("Error compiling invariant " + e.getMessage());
+    }
+
+  }
+
+  @Override
+  public void editSchedule() {
+    // grab all changed metrics and update bindings
+    collector.clear();
+    queueMetrics.getMetrics(collector, false);
+    jvmMetrics.getMetrics(collector, false);
+
+    for (MetricsRecord record : collector.getRecords()) {
+      for (AbstractMetric am : record.metrics()) {
+        bindings.put(am.name().replace(' ', '_'), am.value());
+      }
+    }
+
+    // evaluate all invariants with new bindings
+    try {
+
+      // fastpath check all invariants at once (much faster)
+      boolean allInvHold = (boolean) combinedInvariants.eval(bindings);
+
+      // if any fails, check individually to produce more insightful log
+      if (!allInvHold) {
+        for (Map.Entry<String, CompiledScript> e : invariants.entrySet()) {
+          boolean invariantsHold = (boolean) e.getValue().eval(bindings);
+          if (!invariantsHold) {
+            // filter bindings to produce minimal set
+            Map<String, Object> matchingBindings =
+                extractMatchingBindings(e.getKey(), bindings);
+            logOrThrow("Invariant \"" + e.getKey()
+                + "\" is NOT holding, with bindings: " + matchingBindings);
+          }
+        }
+      }
+    } catch (ScriptException e) {
+      logOrThrow(e.getMessage());
+    }
+  }
+
+  private static Map<String, Object> extractMatchingBindings(String inv,
+      SimpleBindings allBindings) {
+    Map<String, Object> matchingBindings = new HashMap<>();
+    for (Map.Entry<String, Object> s : allBindings.entrySet()) {
+      if (inv.contains(s.getKey())) {
+        matchingBindings.put(s.getKey(), s.getValue());
+      }
+    }
+    return matchingBindings;
+  }
+}

+ 22 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/package-info.java

@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Monitoring policies, used to check invariants.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;

+ 99 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/monitor/invariants/TestMetricsInvariantChecker.java

@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.resourcemanager.monitor.invariants;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.metrics2.MetricsSystem;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.source.JvmMetrics;
+import org.apache.hadoop.yarn.api.records.Resource;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics;
+import org.apache.log4j.Logger;
+import org.junit.Before;
+import org.junit.Test;
+
+import static junit.framework.TestCase.fail;
+
+/**
+ * This class tests the {@code MetricsInvariantChecker} by running it multiple
+ * time and reporting the time it takes to execute, as well as verifying that
+ * the invariant throws in case the invariants are not respected.
+ */
+public class TestMetricsInvariantChecker {
+  public final static Logger LOG =
+      Logger.getLogger(TestMetricsInvariantChecker.class);
+
+  private MetricsSystem metricsSystem;
+  private MetricsInvariantChecker ic;
+  private Configuration conf;
+
+  @Before
+  public void setup() {
+    this.metricsSystem = DefaultMetricsSystem.instance();
+    JvmMetrics.initSingleton("ResourceManager", null);
+    this.ic = new MetricsInvariantChecker();
+    this.conf = new Configuration();
+    conf.set(MetricsInvariantChecker.INVARIANTS_FILE,
+        "src/test/resources/invariants.txt");
+    conf.setBoolean(MetricsInvariantChecker.THROW_ON_VIOLATION, true);
+    ic.init(conf, null, null);
+  }
+
+  @Test(timeout = 5000)
+  public void testManyRuns() {
+
+    QueueMetrics qm =
+        QueueMetrics.forQueue(metricsSystem, "root", null, false, conf);
+    qm.setAvailableResourcesToQueue(Resource.newInstance(1, 1));
+
+    int numIterations = 1000;
+    long start = System.currentTimeMillis();
+    for (int i = 0; i < numIterations; i++) {
+      ic.editSchedule();
+    }
+    long end = System.currentTimeMillis();
+
+    System.out.println("Runtime per iteration (avg of " + numIterations
+        + " iterations): " + (end - start) + " tot time");
+
+  }
+
+  @Test
+  public void testViolation() {
+
+    // create a "wrong" condition in which the invariants are not respected
+    QueueMetrics qm =
+        QueueMetrics.forQueue(metricsSystem, "root", null, false, conf);
+    qm.setAvailableResourcesToQueue(Resource.newInstance(-1, -1));
+
+    // test with throwing exception turned on
+    try {
+      ic.editSchedule();
+      fail();
+    } catch (InvariantViolationException i) {
+      // expected
+    }
+
+    // test log-only mode
+    conf.setBoolean(MetricsInvariantChecker.THROW_ON_VIOLATION, false);
+    ic.init(conf, null, null);
+    ic.editSchedule();
+
+  }
+
+}

+ 54 - 0
hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/resources/invariants.txt

@@ -0,0 +1,54 @@
+running_0 >= 0
+running_60 >= 0
+running_300 >= 0
+running_1440 >= 0
+AppsSubmitted >= 0
+AppsRunning >= 0
+AppsPending >= 0
+AppsCompleted >= 0
+AppsKilled >= 0
+AppsFailed >= 0
+AllocatedMB >= 0
+AllocatedVCores >= 0
+AllocatedContainers >= 0
+AggregateContainersAllocated >= 0
+AggregateNodeLocalContainersAllocated >= 0
+AggregateRackLocalContainersAllocated >= 0
+AggregateOffSwitchContainersAllocated >= 0
+AggregateContainersReleased >= 0
+AggregateContainersPreempted >= 0
+AvailableMB >= 0
+AvailableVCores >= 0
+PendingMB >= 0
+PendingVCores >= 0
+PendingContainers >= 0
+ReservedMB >= 0
+ReservedVCores >= 0
+ReservedContainers >= 0
+ActiveUsers >= 0
+ActiveApplications >= 0
+AppAttemptFirstContainerAllocationDelayNumOps >= 0
+AppAttemptFirstContainerAllocationDelayAvgTime >= 0
+MemNonHeapUsedM >= 0
+MemNonHeapCommittedM >= 0
+MemNonHeapMaxM >= 0 || MemNonHeapMaxM == -1
+MemHeapUsedM >= 0
+MemHeapCommittedM >= 0
+MemHeapMaxM >= 0
+MemMaxM >= 0
+GcCountPS_Scavenge >= 0
+GcTimeMillisPS_Scavenge >= 0
+GcCountPS_MarkSweep >= 0
+GcTimeMillisPS_MarkSweep >= 0
+GcCount >= 0
+GcTimeMillis >= 0
+ThreadsNew >= 0
+ThreadsRunnable >= 0
+ThreadsBlocked >= 0
+ThreadsWaiting >= 0
+ThreadsTimedWaiting >= 0
+ThreadsTerminated >= 0
+LogFatal >= 0
+LogError >= 0
+LogWarn >= 0
+LogInfo >= 0