|
@@ -0,0 +1,458 @@
|
|
|
|
+/**
|
|
|
|
+ * Licensed to the Apache Software Foundation (ASF) under one
|
|
|
|
+ * or more contributor license agreements. See the NOTICE file
|
|
|
|
+ * distributed with this work for additional information
|
|
|
|
+ * regarding copyright ownership. The ASF licenses this file
|
|
|
|
+ * to you under the Apache License, Version 2.0 (the
|
|
|
|
+ * "License"); you may not use this file except in compliance
|
|
|
|
+ * with the License. You may obtain a copy of the License at
|
|
|
|
+ *
|
|
|
|
+ * http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
+ *
|
|
|
|
+ * Unless required by applicable law or agreed to in writing, software
|
|
|
|
+ * distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
+ * See the License for the specific language governing permissions and
|
|
|
|
+ * limitations under the License.
|
|
|
|
+ */
|
|
|
|
+
|
|
|
|
+package org.apache.hadoop.contrib.failmon;
|
|
|
|
+
|
|
|
|
+import java.io.File;
|
|
|
|
+import java.io.FileInputStream;
|
|
|
|
+import java.io.FileNotFoundException;
|
|
|
|
+import java.io.IOException;
|
|
|
|
+import java.io.InputStream;
|
|
|
|
+import java.util.ArrayList;
|
|
|
|
+import java.util.Properties;
|
|
|
|
+import java.util.regex.Matcher;
|
|
|
|
+import java.util.regex.Pattern;
|
|
|
|
+
|
|
|
|
+import org.apache.commons.logging.*;
|
|
|
|
+import org.apache.log4j.PropertyConfigurator;
|
|
|
|
+
|
|
|
|
+/**********************************************************
|
|
|
|
+ * This class provides various methods for interaction with
|
|
|
|
+ * the configuration and the operating system environment. Also
|
|
|
|
+ * provides some helper methods for use by other classes in
|
|
|
|
+ * the package.
|
|
|
|
+ **********************************************************/
|
|
|
|
+
|
|
|
|
+public class Environment {
|
|
|
|
+
|
|
|
|
+ public static final int DEFAULT_LOG_INTERVAL = 3600;
|
|
|
|
+
|
|
|
|
+ public static final int DEFAULT_POLL_INTERVAL = 360;
|
|
|
|
+
|
|
|
|
+ public static int MIN_INTERVAL = 5;
|
|
|
|
+
|
|
|
|
+ public static final int MAX_OUTPUT_LENGTH = 51200;
|
|
|
|
+
|
|
|
|
+ public static Log LOG;
|
|
|
|
+
|
|
|
|
+ static Properties fmProperties = new Properties();
|
|
|
|
+
|
|
|
|
+ static boolean superuser = false;
|
|
|
|
+
|
|
|
|
+ static boolean ready = false;
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Initializes structures needed by other methods. Also determines
|
|
|
|
+ * whether the executing user has superuser privileges.
|
|
|
|
+ *
|
|
|
|
+ */
|
|
|
|
+ public static void prepare(String fname) {
|
|
|
|
+
|
|
|
|
+ if (!"Linux".equalsIgnoreCase(System.getProperty("os.name"))) {
|
|
|
|
+ System.err.println("Linux system required for FailMon. Exiting...");
|
|
|
|
+ System.exit(0);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ System.setProperty("log4j.configuration", "conf/log4j.properties");
|
|
|
|
+ PropertyConfigurator.configure("conf/log4j.properties");
|
|
|
|
+ LOG = LogFactory.getLog("org.apache.hadoop.contrib.failmon");
|
|
|
|
+ logInfo("********** FailMon started ***********");
|
|
|
|
+
|
|
|
|
+ // read parseState file
|
|
|
|
+ PersistentState.readState("conf/parsing.state");
|
|
|
|
+
|
|
|
|
+ try {
|
|
|
|
+ FileInputStream propFile = new FileInputStream(fname);
|
|
|
|
+ fmProperties.load(propFile);
|
|
|
|
+ propFile.close();
|
|
|
|
+ } catch (FileNotFoundException e1) {
|
|
|
|
+ e1.printStackTrace();
|
|
|
|
+ } catch (IOException e) {
|
|
|
|
+ e.printStackTrace();
|
|
|
|
+ }
|
|
|
|
+ ready = true;
|
|
|
|
+
|
|
|
|
+ try {
|
|
|
|
+ String sudo_prompt = "passwd_needed:";
|
|
|
|
+ String echo_txt = "access_ok";
|
|
|
|
+
|
|
|
|
+ Process p = Runtime.getRuntime().exec("sudo -S -p " + sudo_prompt + " echo " + echo_txt );
|
|
|
|
+ InputStream inps = p.getInputStream();
|
|
|
|
+ InputStream errs = p.getErrorStream();
|
|
|
|
+
|
|
|
|
+ while (inps.available() < echo_txt.length() && errs.available() < sudo_prompt.length())
|
|
|
|
+ Thread.sleep(100);
|
|
|
|
+
|
|
|
|
+ byte [] buf;
|
|
|
|
+ String s;
|
|
|
|
+
|
|
|
|
+ if (inps.available() >= echo_txt.length()) {
|
|
|
|
+ buf = new byte[inps.available()];
|
|
|
|
+ inps.read(buf);
|
|
|
|
+ s = new String(buf);
|
|
|
|
+ if (s.startsWith(echo_txt)) {
|
|
|
|
+ superuser = true;
|
|
|
|
+ logInfo("Superuser privileges found!");
|
|
|
|
+ } else {
|
|
|
|
+ // no need to read errs
|
|
|
|
+ superuser = false;
|
|
|
|
+ logInfo("Superuser privileges not found.");
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ } catch (IOException e) {
|
|
|
|
+ e.printStackTrace();
|
|
|
|
+ } catch (InterruptedException e) {
|
|
|
|
+ e.printStackTrace();
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Fetches the value of a property from the configuration file.
|
|
|
|
+ *
|
|
|
|
+ * @param key the name of the property
|
|
|
|
+ *
|
|
|
|
+ * @return the value of the property, if it exists and
|
|
|
|
+ * null otherwise
|
|
|
|
+ */
|
|
|
|
+ public static String getProperty(String key) {
|
|
|
|
+ if (!ready)
|
|
|
|
+ prepare("conf/failmon.properties");
|
|
|
|
+ return fmProperties.getProperty(key);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Sets the value of a property inthe configuration file.
|
|
|
|
+ *
|
|
|
|
+ * @param key the name of the property
|
|
|
|
+ * @param value the new value for the property
|
|
|
|
+ *
|
|
|
|
+ */
|
|
|
|
+
|
|
|
|
+ public static void setProperty(String key, String value) {
|
|
|
|
+ fmProperties.setProperty(key, value);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Scans the configuration file to determine which monitoring
|
|
|
|
+ * utilities are available in the system. For each one of them, a
|
|
|
|
+ * job is created. All such jobs are scheduled and executed by
|
|
|
|
+ * Executor.
|
|
|
|
+ *
|
|
|
|
+ * @return an ArrayList that contains jobs to be executed by theExecutor.
|
|
|
|
+ */
|
|
|
|
+ public static ArrayList<MonitorJob> getJobs() {
|
|
|
|
+
|
|
|
|
+ ArrayList<MonitorJob> monitors = new ArrayList<MonitorJob>();
|
|
|
|
+ int timeInt = 0;
|
|
|
|
+
|
|
|
|
+ // for Hadoop Log parsing
|
|
|
|
+ String [] fnames_r = getProperty("log.hadoop.filenames").split(",\\s*");
|
|
|
|
+ String tmp = getProperty("log.hadoop.enabled");
|
|
|
|
+
|
|
|
|
+ String [] fnames = expandDirs(fnames_r, ".*(.log).*");
|
|
|
|
+
|
|
|
|
+ timeInt = setValue("log.hadoop.interval", DEFAULT_LOG_INTERVAL);
|
|
|
|
+
|
|
|
|
+ if ("true".equalsIgnoreCase(tmp) && fnames[0] != null)
|
|
|
|
+ for (String fname : fnames) {
|
|
|
|
+ File f = new File(fname);
|
|
|
|
+ if (f.exists() && f.canRead()) {
|
|
|
|
+ monitors.add(new MonitorJob(new HadoopLogParser(fname), "hadoopLog", timeInt));
|
|
|
|
+ logInfo("Created Monitor for Hadoop log file: " + f.getAbsolutePath());
|
|
|
|
+ } else if (!f.exists())
|
|
|
|
+ logInfo("Skipping Hadoop log file " + fname + " (file not found)");
|
|
|
|
+ else
|
|
|
|
+ logInfo("Skipping Hadoop log file " + fname + " (permission denied)");
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ // for System Log parsing
|
|
|
|
+ fnames_r = getProperty("log.system.filenames").split(",\\s*");
|
|
|
|
+ tmp = getProperty("log.system.enabled");
|
|
|
|
+
|
|
|
|
+ fnames = expandDirs(fnames_r, ".*(messages).*");
|
|
|
|
+
|
|
|
|
+ timeInt = setValue("log.system.interval", DEFAULT_LOG_INTERVAL);
|
|
|
|
+
|
|
|
|
+ if ("true".equalsIgnoreCase(tmp))
|
|
|
|
+ for (String fname : fnames) {
|
|
|
|
+ File f = new File(fname);
|
|
|
|
+ if (f.exists() && f.canRead()) {
|
|
|
|
+ monitors.add(new MonitorJob(new SystemLogParser(fname), "systemLog", timeInt));
|
|
|
|
+ logInfo("Created Monitor for System log file: " + f.getAbsolutePath());
|
|
|
|
+ } else if (!f.exists())
|
|
|
|
+ logInfo("Skipping system log file " + fname + " (file not found)");
|
|
|
|
+ else
|
|
|
|
+ logInfo("Skipping system log file " + fname + " (permission denied)");
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ // for network interfaces
|
|
|
|
+ tmp = getProperty("nic.enabled");
|
|
|
|
+
|
|
|
|
+ timeInt = setValue("nics.interval", DEFAULT_POLL_INTERVAL);
|
|
|
|
+
|
|
|
|
+ if ("true".equalsIgnoreCase(tmp)) {
|
|
|
|
+ monitors.add(new MonitorJob(new NICParser(), "nics", timeInt));
|
|
|
|
+ logInfo("Created Monitor for NICs");
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // for cpu
|
|
|
|
+ tmp = getProperty("cpu.enabled");
|
|
|
|
+
|
|
|
|
+ timeInt = setValue("cpu.interval", DEFAULT_POLL_INTERVAL);
|
|
|
|
+
|
|
|
|
+ if ("true".equalsIgnoreCase(tmp)) {
|
|
|
|
+ monitors.add(new MonitorJob(new CPUParser(), "cpu", timeInt));
|
|
|
|
+ logInfo("Created Monitor for CPUs");
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // for disks
|
|
|
|
+ tmp = getProperty("disks.enabled");
|
|
|
|
+
|
|
|
|
+ timeInt = setValue("disks.interval", DEFAULT_POLL_INTERVAL);
|
|
|
|
+
|
|
|
|
+ if ("true".equalsIgnoreCase(tmp)) {
|
|
|
|
+ // check privileges if a disk with no disks./dev/xxx/.source is found
|
|
|
|
+ boolean smart_present = checkExistence("smartctl");
|
|
|
|
+ int disks_ok = 0;
|
|
|
|
+ String devicesStr = getProperty("disks.list");
|
|
|
|
+ String[] devices = null;
|
|
|
|
+
|
|
|
|
+ if (devicesStr != null)
|
|
|
|
+ devices = devicesStr.split(",\\s*");
|
|
|
|
+
|
|
|
|
+ for (int i = 0; i< devices.length; i++) {
|
|
|
|
+ boolean file_present = false;
|
|
|
|
+ boolean disk_present = false;
|
|
|
|
+
|
|
|
|
+ String fileloc = getProperty("disks." + devices[i] + ".source");
|
|
|
|
+ if (fileloc != null && fileloc.equalsIgnoreCase("true"))
|
|
|
|
+ file_present = true;
|
|
|
|
+
|
|
|
|
+ if (!file_present)
|
|
|
|
+ if (superuser) {
|
|
|
|
+ StringBuffer sb = runCommand("sudo smartctl -i " + devices[i]);
|
|
|
|
+ String patternStr = "[(failed)(device not supported)]";
|
|
|
|
+ Pattern pattern = Pattern.compile(patternStr);
|
|
|
|
+ Matcher matcher = pattern.matcher(sb.toString());
|
|
|
|
+ if (matcher.find(0))
|
|
|
|
+ disk_present = false;
|
|
|
|
+ else
|
|
|
|
+ disk_present = true;
|
|
|
|
+ }
|
|
|
|
+ if (file_present || (disk_present && smart_present)) {
|
|
|
|
+ disks_ok++;
|
|
|
|
+ } else
|
|
|
|
+ devices[i] = null;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // now remove disks that dont exist
|
|
|
|
+ StringBuffer resetSB = new StringBuffer();
|
|
|
|
+ for (int j = 0; j < devices.length; j++) {
|
|
|
|
+ resetSB.append(devices[j] == null ? "" : devices[j] + ", ");
|
|
|
|
+ if (devices[j] != null)
|
|
|
|
+ logInfo("Found S.M.A.R.T. attributes for disk " + devices[j]);
|
|
|
|
+ }
|
|
|
|
+ // fix the property
|
|
|
|
+ if (resetSB.length() >= 2)
|
|
|
|
+ setProperty("disks.list", resetSB.substring(0, resetSB.length() - 2));
|
|
|
|
+
|
|
|
|
+ if (disks_ok > 0) {
|
|
|
|
+ monitors.add(new MonitorJob(new SMARTParser(), "disks", timeInt));
|
|
|
|
+ logInfo("Created Monitor for S.M.A.R.T disk attributes");
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // for lm-sensors
|
|
|
|
+ tmp = getProperty("sensors.enabled");
|
|
|
|
+
|
|
|
|
+ timeInt = setValue("sensors.interval", DEFAULT_POLL_INTERVAL);
|
|
|
|
+
|
|
|
|
+ if ("true".equalsIgnoreCase(tmp) && checkExistence("sensors")) {
|
|
|
|
+ monitors.add(new MonitorJob(new SensorsParser(), "sensors", timeInt));
|
|
|
|
+ logInfo("Created Monitor for lm-sensors output");
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ return monitors;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Determines the minimum interval at which the executor thread
|
|
|
|
+ * needs to wake upto execute jobs. Essentially, this is interval
|
|
|
|
+ * equals the GCD of intervals of all scheduled jobs.
|
|
|
|
+ *
|
|
|
|
+ * @param monitors the list of scheduled jobs
|
|
|
|
+ *
|
|
|
|
+ * @return the minimum interval between two scheduled jobs
|
|
|
|
+ */
|
|
|
|
+ public static int getInterval(ArrayList<MonitorJob> monitors) {
|
|
|
|
+ String tmp = getProperty("executor.interval.min");
|
|
|
|
+ if (tmp != null)
|
|
|
|
+ MIN_INTERVAL = Integer.parseInt(tmp);
|
|
|
|
+
|
|
|
|
+ int[] monIntervals = new int[monitors.size()];
|
|
|
|
+
|
|
|
|
+ for (int i = 0; i < monitors.size(); i++)
|
|
|
|
+ monIntervals[i] = monitors.get(i).interval;
|
|
|
|
+
|
|
|
|
+ return Math.max(MIN_INTERVAL, gcd(monIntervals));
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Checks whether a specific shell command is available
|
|
|
|
+ * in the system.
|
|
|
|
+ *
|
|
|
|
+ * @param cmd the command to check against
|
|
|
|
+ *
|
|
|
|
+ * @return true, if the command is availble, false otherwise
|
|
|
|
+ */
|
|
|
|
+ public static boolean checkExistence(String cmd) {
|
|
|
|
+ StringBuffer sb = runCommand("which " + cmd);
|
|
|
|
+ if (sb.length() > 1)
|
|
|
|
+ return true;
|
|
|
|
+
|
|
|
|
+ return false;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Runs a shell command in the system and provides a StringBuffer
|
|
|
|
+ * with the output of the command.
|
|
|
|
+ *
|
|
|
|
+ * @param cmd an array of string that form the command to run
|
|
|
|
+ *
|
|
|
|
+ * @return a StringBuffer that contains the output of the command
|
|
|
|
+ */
|
|
|
|
+ public static StringBuffer runCommand(String[] cmd) {
|
|
|
|
+ StringBuffer retval = new StringBuffer(MAX_OUTPUT_LENGTH);
|
|
|
|
+ Process p;
|
|
|
|
+ try {
|
|
|
|
+ p = Runtime.getRuntime().exec(cmd);
|
|
|
|
+ InputStream tmp = p.getInputStream();
|
|
|
|
+ p.waitFor();
|
|
|
|
+ int c;
|
|
|
|
+ while ((c = tmp.read()) != -1)
|
|
|
|
+ retval.append((char) c);
|
|
|
|
+ } catch (IOException e) {
|
|
|
|
+ e.printStackTrace();
|
|
|
|
+ } catch (InterruptedException e) {
|
|
|
|
+ e.printStackTrace();
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ return retval;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Runs a shell command in the system and provides a StringBuffer
|
|
|
|
+ * with the output of the command.
|
|
|
|
+ *
|
|
|
|
+ * @param cmd the command to run
|
|
|
|
+ *
|
|
|
|
+ * @return a StringBuffer that contains the output of the command
|
|
|
|
+ */
|
|
|
|
+ public static StringBuffer runCommand(String cmd) {
|
|
|
|
+ return runCommand(cmd.split("\\s+"));
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Determines the greatest common divisor (GCD) of two integers.
|
|
|
|
+ *
|
|
|
|
+ * @param m the first integer
|
|
|
|
+ * @param n the second integer
|
|
|
|
+ *
|
|
|
|
+ * @return the greatest common divisor of m and n
|
|
|
|
+ */
|
|
|
|
+ public static int gcd(int m, int n) {
|
|
|
|
+ if (m == 0 && n == 0)
|
|
|
|
+ return 0;
|
|
|
|
+ if (m < n) {
|
|
|
|
+ int t = m;
|
|
|
|
+ m = n;
|
|
|
|
+ n = t;
|
|
|
|
+ }
|
|
|
|
+ int r = m % n;
|
|
|
|
+ if (r == 0) {
|
|
|
|
+ return n;
|
|
|
|
+ } else {
|
|
|
|
+ return gcd(n, r);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * Determines the greatest common divisor (GCD) of a list
|
|
|
|
+ * of integers.
|
|
|
|
+ *
|
|
|
|
+ * @param numbers the list of integers to process
|
|
|
|
+ *
|
|
|
|
+ * @return the greatest common divisor of all numbers
|
|
|
|
+ */
|
|
|
|
+ public static int gcd(int[] numbers) {
|
|
|
|
+
|
|
|
|
+ if (numbers.length == 1)
|
|
|
|
+ return numbers[0];
|
|
|
|
+
|
|
|
|
+ int g = gcd(numbers[0], numbers[1]);
|
|
|
|
+
|
|
|
|
+ for (int i = 2; i < numbers.length; i++)
|
|
|
|
+ g = gcd(g, numbers[i]);
|
|
|
|
+
|
|
|
|
+ return g;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ private static String [] expandDirs(String [] input, String patternStr) {
|
|
|
|
+
|
|
|
|
+ ArrayList<String> fnames = new ArrayList<String>();
|
|
|
|
+ Pattern pattern = Pattern.compile(patternStr);
|
|
|
|
+ Matcher matcher;
|
|
|
|
+ File f;
|
|
|
|
+
|
|
|
|
+ for (String fname : input) {
|
|
|
|
+ f = new File(fname);
|
|
|
|
+ if (f.exists()) {
|
|
|
|
+ if (f.isDirectory()) {
|
|
|
|
+ // add all matching files
|
|
|
|
+ File [] fcs = f.listFiles();
|
|
|
|
+ for (File fc : fcs) {
|
|
|
|
+ matcher = pattern.matcher(fc.getName());
|
|
|
|
+ if (matcher.find() && fc.isFile())
|
|
|
|
+ fnames.add(fc.getAbsolutePath());
|
|
|
|
+ }
|
|
|
|
+ } else {
|
|
|
|
+ // normal file, just add to output
|
|
|
|
+ fnames.add(f.getAbsolutePath());
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ return fnames.toArray(input);
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ private static int setValue(String propname, int defaultValue) {
|
|
|
|
+
|
|
|
|
+ String v = getProperty(propname);
|
|
|
|
+
|
|
|
|
+ if (v != null)
|
|
|
|
+ return Integer.parseInt(v);
|
|
|
|
+ else
|
|
|
|
+ return defaultValue;
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+ public static void logInfo(String str) {
|
|
|
|
+ LOG.info(str);
|
|
|
|
+ }
|
|
|
|
+}
|