|
@@ -0,0 +1,545 @@
|
|
|
#!/usr/bin/env bash
|
|
|
+
|
|
|
+#/*
|
|
|
+# * Licensed to the Apache Software Foundation (ASF) under one
|
|
|
+# * or more contributor license agreements. See the NOTICE file
|
|
|
+# * distributed with this work for additional information
|
|
|
+# * regarding copyright ownership. The ASF licenses this file
|
|
|
+# * to you under the Apache License, Version 2.0 (the
|
|
|
+# * "License"); you may not use this file except in compliance
|
|
|
+# * with the License. You may obtain a copy of the License at
|
|
|
+# *
|
|
|
+# * http://www.apache.org/licenses/LICENSE-2.0
|
|
|
+# *
|
|
|
+# * Unless required by applicable law or agreed to in writing, software
|
|
|
+# * distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
+# * See the License for the specific language governing permissions and
|
|
|
+# * limitations under the License.
|
|
|
+# */
|
|
|
+
|
|
|
# Work relative to the directory of the invoking script so that the relative
# `source` paths below resolve. Quoted (and guarded with `--`) so that a path
# containing spaces or a leading dash is handled correctly.
cd -- "$(dirname -- "${0}")";

# Slurp in all our user-customizable settings.
source ./gangliaEnv.sh;

# Get access to Ganglia-wide constants etc.
source ./gangliaLib.sh;

# Location of the gmond executable.
GMOND_BIN=/usr/sbin/gmond;
# Base names of the generated configuration files and of the PID file; the
# get*FileName helpers below turn these into full paths.
GMOND_CORE_CONF_FILE=gmond.core.conf;
GMOND_MASTER_CONF_FILE=gmond.master.conf;
GMOND_SLAVE_CONF_FILE=gmond.slave.conf;
GMOND_PID_FILE=gmond.pid;
|
|
|
+
|
|
|
+# Functions.
|
|
|
#######################################
# Print the path of the gmond core configuration file.
# Globals:   GANGLIA_CONF_DIR (read), GMOND_CORE_CONF_FILE (read)
# Arguments: $1 - cluster name (optional). When non-empty the path is placed
#                 in a sub-directory named after the cluster.
# Outputs:   the path, on stdout.
#######################################
getGmondCoreConfFileName()
{
  # Local, so that a caller's own ${clusterName} is never overwritten.
  local clusterName="${1:-}"

  if [ -n "${clusterName}" ]; then
    printf '%s\n' "${GANGLIA_CONF_DIR}/${clusterName}/${GMOND_CORE_CONF_FILE}"
  else
    printf '%s\n' "${GANGLIA_CONF_DIR}/${GMOND_CORE_CONF_FILE}"
  fi
}
|
|
|
+
|
|
|
#######################################
# Print the path of the gmond master configuration file.
# Globals:   GANGLIA_CONF_DIR (read), GMOND_MASTER_CONF_FILE (read)
# Arguments: $1 - cluster name (optional). When non-empty the conf.d
#                 directory is looked up under a per-cluster sub-directory.
# Outputs:   the path, on stdout.
#######################################
getGmondMasterConfFileName()
{
  # Local, so that a caller's own ${clusterName} is never overwritten.
  local clusterName="${1:-}"

  if [ -n "${clusterName}" ]; then
    printf '%s\n' "${GANGLIA_CONF_DIR}/${clusterName}/conf.d/${GMOND_MASTER_CONF_FILE}"
  else
    printf '%s\n' "${GANGLIA_CONF_DIR}/conf.d/${GMOND_MASTER_CONF_FILE}"
  fi
}
|
|
|
+
|
|
|
#######################################
# Print the path of the gmond slave configuration file.
# Globals:   GANGLIA_CONF_DIR (read), GMOND_SLAVE_CONF_FILE (read)
# Arguments: $1 - cluster name (optional). When non-empty the conf.d
#                 directory is looked up under a per-cluster sub-directory.
# Outputs:   the path, on stdout.
#######################################
getGmondSlaveConfFileName()
{
  # Local, so that a caller's own ${clusterName} is never overwritten.
  local clusterName="${1:-}"

  if [ -n "${clusterName}" ]; then
    printf '%s\n' "${GANGLIA_CONF_DIR}/${clusterName}/conf.d/${GMOND_SLAVE_CONF_FILE}"
  else
    printf '%s\n' "${GANGLIA_CONF_DIR}/conf.d/${GMOND_SLAVE_CONF_FILE}"
  fi
}
|
|
|
+
|
|
|
#######################################
# Print the path of the gmond PID file.
# Globals:   GANGLIA_RUNTIME_DIR (read), GMOND_PID_FILE (read)
# Arguments: $1 - cluster name (optional). When non-empty the PID file is
#                 looked up under a per-cluster sub-directory.
# Outputs:   the path, on stdout.
#######################################
getGmondPidFileName()
{
  # Local, so that a caller's own ${clusterName} is never overwritten.
  local clusterName="${1:-}"

  if [ -n "${clusterName}" ]; then
    printf '%s\n' "${GANGLIA_RUNTIME_DIR}/${clusterName}/${GMOND_PID_FILE}"
  else
    printf '%s\n' "${GANGLIA_RUNTIME_DIR}/${GMOND_PID_FILE}"
  fi
}
|
|
|
+
|
|
|
#######################################
# Print the PID recorded in the gmond PID file, if that file exists.
# Arguments: $1 - cluster name (optional), forwarded to getGmondPidFileName.
# Outputs:   the PID file's content on stdout, with runs of whitespace
#            collapsed to single spaces; nothing when the file is absent.
#######################################
getGmondLoggedPid()
{
  local pidFile
  local -a pidTokens=()
  # Pin IFS so tokenising and re-joining below do not depend on the caller.
  local IFS=$' \t\n'

  pidFile=$(getGmondPidFileName "${1:-}")

  if [ -e "${pidFile}" ]; then
    # Read the whole file as whitespace-separated tokens and join them with
    # single spaces. This yields the same normalised text as echoing an
    # unquoted `cat`, but without glob expansion and with a safely quoted
    # path. `read -d ''` reports failure on reaching end-of-file even though
    # the tokens were stored, hence the `|| true`.
    read -r -d '' -a pidTokens < "${pidFile}" || true
    printf '%s\n' "${pidTokens[*]}"
  fi
}
|
|
|
+
|
|
|
#######################################
# Print the PID recorded for gmond, but only if `ps` reports a process with
# that PID.
# Arguments: $1 - cluster name (optional), forwarded to getGmondLoggedPid.
# Outputs:   the PID on stdout when `ps` lists it; an empty line when a PID
#            is recorded but `ps` does not list it; nothing when no PID is
#            recorded.
#######################################
getGmondRunningPid()
{
  local loggedPid runningPid

  loggedPid=$(getGmondLoggedPid "${1:-}")

  if [ -n "${loggedPid}" ]; then
    # The column header is renamed to MYPID so that, when no such process
    # exists and ps prints only the header, the header can be filtered out.
    runningPid=$(ps -o pid=MYPID -p "${loggedPid}" \
      | tail -1 \
      | awk '{print $1}' \
      | grep -v MYPID)
    printf '%s\n' "${runningPid}"
  fi
}
|
|
|
+
|
|
|
#######################################
# Write the core gmond configuration for a cluster to stdout.
# Globals:   GMOND_USER (read), GANGLIA_CONF_DIR (read)
# Arguments: $1 - cluster name; looked up through getGangliaClusterInfo,
#                 whose output is read as "<name> <masterIP> <port>".
# Outputs:   the generated configuration, on stdout.
# Returns:   1 when no cluster name is given; 2 when the cluster's name,
#            master IP or port could not be determined; otherwise the exit
#            status of the final `cat`.
#######################################
generateGmondCoreConf()
{
  # Everything is local so that callers' variables are not clobbered.
  local clusterName="${1:-}"
  local gmondClusterName gmondMasterIP gmondPort now

  if [ -z "${clusterName}" ]; then
    return 1
  fi

  read -r gmondClusterName gmondMasterIP gmondPort \
    <<<"$(getGangliaClusterInfo "${clusterName}")"

  # Check that all of ${gmondClusterName} and ${gmondMasterIP} and ${gmondPort} are populated.
  if [ -z "${gmondClusterName}" ] || [ -z "${gmondMasterIP}" ] || [ -z "${gmondPort}" ]; then
    return 2
  fi

  now=$(date)

  cat << END_OF_GMOND_CORE_CONF
#################### Generated by ${0} on ${now} ####################
#
/* This configuration is as close to 2.5.x default behavior as possible
   The values closely match ./gmond/metric.h definitions in 2.5.x */
globals {
  daemonize = yes
  setuid = yes
  user = ${GMOND_USER}
  debug_level = 0
  max_udp_msg_len = 1472
  mute = no
  deaf = no
  allow_extra_data = yes
  host_dmax = 0 /*secs */
  host_tmax = 20 /*secs */
  cleanup_threshold = 300 /*secs */
  gexec = no
  send_metadata_interval = 30 /*secs */
}

/*
 * The cluster attributes specified will be used as part of the <CLUSTER>
 * tag that will wrap all hosts collected by this instance.
 */
cluster {
  name = "${gmondClusterName}"
  owner = "unspecified"
  latlong = "unspecified"
  url = "unspecified"
}

/* The host section describes attributes of the host, like the location */
host {
  location = "unspecified"
}

/* You can specify as many tcp_accept_channels as you like to share
 * an XML description of the state of the cluster.
 *
 * At the very least, every gmond must expose its XML state to
 * queriers from localhost.
 */
tcp_accept_channel {
  bind = localhost
  port = ${gmondPort}
}

/* Each metrics module that is referenced by gmond must be specified and
   loaded. If the module has been statically linked with gmond, it does
   not require a load path. However all dynamically loadable modules must
   include a load path. */
modules {
  module {
    name = "core_metrics"
  }
  module {
    name = "cpu_module"
    path = "modcpu.so"
  }
  module {
    name = "disk_module"
    path = "moddisk.so"
  }
  module {
    name = "load_module"
    path = "modload.so"
  }
  module {
    name = "mem_module"
    path = "modmem.so"
  }
  module {
    name = "net_module"
    path = "modnet.so"
  }
  module {
    name = "proc_module"
    path = "modproc.so"
  }
  module {
    name = "sys_module"
    path = "modsys.so"
  }
}

/* The old internal 2.5.x metric array has been replaced by the following
   collection_group directives. What follows is the default behavior for
   collecting and sending metrics that is as close to 2.5.x behavior as
   possible. */

/* This collection group will cause a heartbeat (or beacon) to be sent every
   20 seconds. In the heartbeat is the GMOND_STARTED data which expresses
   the age of the running gmond. */
collection_group {
  collect_once = yes
  time_threshold = 20
  metric {
    name = "heartbeat"
  }
}

/* This collection group will send general info about this host total memory every
   180 secs.
   This information doesn't change between reboots and is only collected
   once. This information needed for heatmap showing */
collection_group {
  collect_once = yes
  time_threshold = 180
  metric {
    name = "mem_total"
    title = "Memory Total"
  }
}

/* This collection group will send general info about this host every
   1200 secs.
   This information doesn't change between reboots and is only collected
   once. */
collection_group {
  collect_once = yes
  time_threshold = 1200
  metric {
    name = "cpu_num"
    title = "CPU Count"
  }
  metric {
    name = "cpu_speed"
    title = "CPU Speed"
  }
  /* Should this be here? Swap can be added/removed between reboots. */
  metric {
    name = "swap_total"
    title = "Swap Space Total"
  }
  metric {
    name = "boottime"
    title = "Last Boot Time"
  }
  metric {
    name = "machine_type"
    title = "Machine Type"
  }
  metric {
    name = "os_name"
    title = "Operating System"
  }
  metric {
    name = "os_release"
    title = "Operating System Release"
  }
  metric {
    name = "location"
    title = "Location"
  }
}

/* This collection group will send the status of gexecd for this host
   every 300 secs.*/
/* Unlike 2.5.x the default behavior is to report gexecd OFF. */
collection_group {
  collect_once = yes
  time_threshold = 300
  metric {
    name = "gexec"
    title = "Gexec Status"
  }
}

/* This collection group will collect the CPU status info every 20 secs.
   The time threshold is set to 90 seconds. In honesty, this
   time_threshold could be set significantly higher to reduce
   unneccessary network chatter. */
collection_group {
  collect_every = 20
  time_threshold = 90
  /* CPU status */
  metric {
    name = "cpu_user"
    value_threshold = "1.0"
    title = "CPU User"
  }
  metric {
    name = "cpu_system"
    value_threshold = "1.0"
    title = "CPU System"
  }
  metric {
    name = "cpu_idle"
    value_threshold = "5.0"
    title = "CPU Idle"
  }
  metric {
    name = "cpu_nice"
    value_threshold = "1.0"
    title = "CPU Nice"
  }
  metric {
    name = "cpu_aidle"
    value_threshold = "5.0"
    title = "CPU aidle"
  }
  metric {
    name = "cpu_wio"
    value_threshold = "1.0"
    title = "CPU wio"
  }
  /* The next two metrics are optional if you want more detail...
     ... since they are accounted for in cpu_system.
  metric {
    name = "cpu_intr"
    value_threshold = "1.0"
    title = "CPU intr"
  }
  metric {
    name = "cpu_sintr"
    value_threshold = "1.0"
    title = "CPU sintr"
  }
  */
}

collection_group {
  collect_every = 20
  time_threshold = 90
  /* Load Averages */
  metric {
    name = "load_one"
    value_threshold = "1.0"
    title = "One Minute Load Average"
  }
  metric {
    name = "load_five"
    value_threshold = "1.0"
    title = "Five Minute Load Average"
  }
  metric {
    name = "load_fifteen"
    value_threshold = "1.0"
    title = "Fifteen Minute Load Average"
  }
}

/* This group collects the number of running and total processes */
collection_group {
  collect_every = 80
  time_threshold = 950
  metric {
    name = "proc_run"
    value_threshold = "1.0"
    title = "Total Running Processes"
  }
  metric {
    name = "proc_total"
    value_threshold = "1.0"
    title = "Total Processes"
  }
}

/* This collection group grabs the volatile memory metrics every 40 secs and
   sends them at least every 180 secs. This time_threshold can be increased
   significantly to reduce unneeded network traffic. */
collection_group {
  collect_every = 40
  time_threshold = 180
  metric {
    name = "mem_free"
    value_threshold = "1024.0"
    title = "Free Memory"
  }
  metric {
    name = "mem_shared"
    value_threshold = "1024.0"
    title = "Shared Memory"
  }
  metric {
    name = "mem_buffers"
    value_threshold = "1024.0"
    title = "Memory Buffers"
  }
  metric {
    name = "mem_cached"
    value_threshold = "1024.0"
    title = "Cached Memory"
  }
  metric {
    name = "swap_free"
    value_threshold = "1024.0"
    title = "Free Swap Space"
  }
}

collection_group {
  collect_every = 40
  time_threshold = 300
  metric {
    name = "bytes_out"
    value_threshold = 4096
    title = "Bytes Sent"
  }
  metric {
    name = "bytes_in"
    value_threshold = 4096
    title = "Bytes Received"
  }
  metric {
    name = "pkts_in"
    value_threshold = 256
    title = "Packets Received"
  }
  metric {
    name = "pkts_out"
    value_threshold = 256
    title = "Packets Sent"
  }
}


collection_group {
  collect_every = 40
  time_threshold = 180
  metric {
    name = "disk_free"
    value_threshold = 1.0
    title = "Disk Space Available"
  }
  metric {
    name = "part_max_used"
    value_threshold = 1.0
    title = "Maximum Disk Space Used"
  }
  metric {
    name = "disk_total"
    value_threshold = 1.0
    title = "Total Disk Space"
  }
}

udp_recv_channel {
  port = 0
}


include ("${GANGLIA_CONF_DIR}/${gmondClusterName}/conf.d/*.conf")
END_OF_GMOND_CORE_CONF
}
|
|
|
+
|
|
|
#######################################
# Write the master-side gmond configuration for a cluster to stdout.
# Arguments: $1 - cluster name; looked up through getGangliaClusterInfo,
#                 whose output is read as "<name> <masterIP> <port>".
# Outputs:   the generated configuration, on stdout.
# Returns:   1 when no cluster name is given; 2 when the cluster's name,
#            master IP or port could not be determined; otherwise the exit
#            status of the final `cat`.
#######################################
generateGmondMasterConf()
{
  # Everything is local so that callers' variables are not clobbered.
  local clusterName="${1:-}"
  local gmondClusterName gmondMasterIP gmondPort now

  if [ -z "${clusterName}" ]; then
    return 1
  fi

  read -r gmondClusterName gmondMasterIP gmondPort \
    <<<"$(getGangliaClusterInfo "${clusterName}")"

  # Check that all of ${gmondClusterName} and ${gmondMasterIP} and ${gmondPort} are populated.
  if [ -z "${gmondClusterName}" ] || [ -z "${gmondMasterIP}" ] || [ -z "${gmondPort}" ]; then
    return 2
  fi

  now=$(date)

  cat << END_OF_GMOND_MASTER_CONF
#################### Generated by ${0} on ${now} ####################
/* Masters only receive; they never send. */
udp_recv_channel {
  bind = ${gmondMasterIP}
  port = ${gmondPort}
}

/* The gmond cluster master must additionally provide an XML
 * description of the cluster to the gmetad that will query it.
 */
tcp_accept_channel {
  bind = ${gmondMasterIP}
  port = ${gmondPort}
}
END_OF_GMOND_MASTER_CONF
}
|
|
|
+
|
|
|
#######################################
# Write the slave-side gmond configuration for a cluster to stdout.
# Arguments: $1 - cluster name; looked up through getGangliaClusterInfo,
#                 whose output is read as "<name> <masterIP> <port>".
# Outputs:   the generated configuration, on stdout.
# Returns:   1 when no cluster name is given; 2 when the cluster's name,
#            master IP or port could not be determined; otherwise the exit
#            status of the final `cat`.
#######################################
generateGmondSlaveConf()
{
  # Everything is local so that callers' variables are not clobbered.
  local clusterName="${1:-}"
  local gmondClusterName gmondMasterIP gmondPort now

  if [ -z "${clusterName}" ]; then
    return 1
  fi

  read -r gmondClusterName gmondMasterIP gmondPort \
    <<<"$(getGangliaClusterInfo "${clusterName}")"

  # Check that all of ${gmondClusterName} and ${gmondMasterIP} and ${gmondPort} are populated.
  if [ -z "${gmondClusterName}" ] || [ -z "${gmondMasterIP}" ] || [ -z "${gmondPort}" ]; then
    return 2
  fi

  now=$(date)

  cat << END_OF_GMOND_SLAVE_CONF
#################### Generated by ${0} on ${now} ####################
/* Slaves only send; they never receive. */
udp_send_channel {
  #bind_hostname = yes # Highly recommended, soon to be default.
                       # This option tells gmond to use a source address
                       # that resolves to the machine's hostname. Without
                       # this, the metrics may appear to come from any
                       # interface and the DNS names associated with
                       # those IPs will be used to create the RRDs.
  host = ${gmondMasterIP}
  port = ${gmondPort}
  ttl = 1
}
END_OF_GMOND_SLAVE_CONF
}
|