2015-05-28 11:32:57 +02:00
|
|
|
/* This configuration is as close to 2.5.x default behavior as possible
|
|
|
|
The values closely match ./gmond/metric.h definitions in 2.5.x */
|
|
|
|
globals {
|
|
|
|
daemonize = yes
|
|
|
|
setuid = yes
|
|
|
|
user = ganglia
|
|
|
|
debug_level = 0
|
|
|
|
# max_udp_msg_len = 1472
|
|
|
|
mute = no
|
|
|
|
deaf = no
|
|
|
|
host_dmax = 3600 /*secs */
|
|
|
|
cleanup_threshold = 300 /*secs */
|
|
|
|
gexec = no
|
|
|
|
allow_extra_data = yes
|
|
|
|
send_metadata_interval = {{ ganglia_gmond_send_metadata_interval }}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* If a cluster attribute is specified, then all gmond hosts are wrapped inside
|
|
|
|
* of a <CLUSTER> tag. If you do not specify a cluster tag, then all <HOSTS> will
|
|
|
|
* NOT be wrapped inside of a <CLUSTER> tag. */
|
|
|
|
cluster {
|
|
|
|
name = "{{ ganglia_gmond_cluster }}"
|
|
|
|
owner = "{{ ganglia_gmond_cluster_owner }}"
|
|
|
|
latlong = "unspecified"
|
|
|
|
url = "unspecified"
|
|
|
|
}
|
|
|
|
|
|
|
|
/* The host section describes attributes of the host, like the location */
|
|
|
|
host {
|
|
|
|
location = "{{ ganglia_gmond_location }}"
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Feel free to specify as many udp_send_channels as you like. Gmond
|
|
|
|
used to only support having a single channel */
|
2016-07-12 19:15:00 +02:00
|
|
|
{% if not ganglia_unicast_mode %}
|
2015-05-28 11:32:57 +02:00
|
|
|
udp_send_channel {
|
2016-07-12 19:15:00 +02:00
|
|
|
#bind_hostname = yes
|
2015-05-28 11:32:57 +02:00
|
|
|
mcast_join = {{ ganglia_gmond_mcast_addr }}
|
|
|
|
port = {{ ganglia_gmond_cluster_port }}
|
|
|
|
ttl = 1
|
|
|
|
}
|
|
|
|
|
|
|
|
/* You can specify as many udp_recv_channels as you like as well. */
|
|
|
|
udp_recv_channel {
|
|
|
|
mcast_join = {{ ganglia_gmond_mcast_addr }}
|
|
|
|
port = {{ ganglia_gmond_cluster_port }}
|
|
|
|
}
|
|
|
|
|
2016-07-12 19:15:00 +02:00
|
|
|
{% else %}
|
|
|
|
{% for host in ganglia_gmetad_sources %}
|
|
|
|
udp_send_channel {
|
|
|
|
host = {{ host }}
|
2015-05-28 11:32:57 +02:00
|
|
|
port = {{ ganglia_gmond_cluster_port }}
|
2016-07-12 19:15:00 +02:00
|
|
|
ttl = 1
|
2015-05-28 11:32:57 +02:00
|
|
|
}
|
2016-07-12 19:15:00 +02:00
|
|
|
{% endfor %}
|
|
|
|
|
|
|
|
{% endif %}
|
|
|
|
/* Plain UDP receive channel on the cluster port. This stanza is placed after
   the Jinja endif, so it is rendered in both unicast and multicast mode; in
   multicast mode it is emitted in addition to the mcast_join receive channel
   defined earlier on the same port.
   NOTE(review): confirm that a second receive channel on the same port is
   intended when multicast mode is active. */
udp_recv_channel {
|
|
|
|
port = {{ ganglia_gmond_cluster_port }}
|
|
|
|
}
|
2015-05-28 11:32:57 +02:00
|
|
|
|
|
|
|
/* You can specify as many tcp_accept_channels as you like to share
|
|
|
|
an xml description of the state of the cluster */
|
|
|
|
tcp_accept_channel {
|
|
|
|
port = {{ ganglia_gmond_cluster_port }}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Each metrics module that is referenced by gmond must be specified and
|
|
|
|
loaded. If the module has been statically linked with gmond, it does not
|
|
|
|
require a load path. However all dynamically loadable modules must include
|
|
|
|
a load path. */
|
|
|
|
modules {
|
|
|
|
module {
|
|
|
|
name = "core_metrics"
|
|
|
|
}
|
|
|
|
module {
|
|
|
|
name = "cpu_module"
|
|
|
|
path = "/usr/lib/ganglia/modcpu.so"
|
|
|
|
}
|
|
|
|
module {
|
|
|
|
name = "disk_module"
|
|
|
|
path = "/usr/lib/ganglia/moddisk.so"
|
|
|
|
}
|
|
|
|
module {
|
|
|
|
name = "load_module"
|
|
|
|
path = "/usr/lib/ganglia/modload.so"
|
|
|
|
}
|
|
|
|
module {
|
|
|
|
name = "mem_module"
|
|
|
|
path = "/usr/lib/ganglia/modmem.so"
|
|
|
|
}
|
|
|
|
module {
|
|
|
|
name = "net_module"
|
|
|
|
path = "/usr/lib/ganglia/modnet.so"
|
|
|
|
}
|
|
|
|
module {
|
|
|
|
name = "proc_module"
|
|
|
|
path = "/usr/lib/ganglia/modproc.so"
|
|
|
|
}
|
|
|
|
module {
|
|
|
|
name = "sys_module"
|
|
|
|
path = "/usr/lib/ganglia/modsys.so"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
include ('/etc/ganglia/conf.d/*.conf')
|
|
|
|
|
|
|
|
|
|
|
|
/* The old internal 2.5.x metric array has been replaced by the following
|
|
|
|
collection_group directives. What follows is the default behavior for
|
|
|
|
collecting and sending metrics that is as close to 2.5.x behavior as
|
|
|
|
possible. */
|
|
|
|
|
|
|
|
/* This collection group will cause a heartbeat (or beacon) to be sent every
|
|
|
|
20 seconds. In the heartbeat is the GMOND_STARTED data which expresses
|
|
|
|
the age of the running gmond. */
|
|
|
|
collection_group {
|
|
|
|
collect_once = yes
|
|
|
|
time_threshold = 20
|
|
|
|
metric {
|
|
|
|
name = "heartbeat"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This collection group will send general info about this host every 1200 secs.
|
|
|
|
This information doesn't change between reboots and is only collected once. */
|
|
|
|
collection_group {
|
|
|
|
collect_once = yes
|
|
|
|
time_threshold = 1200
|
|
|
|
metric {
|
|
|
|
name = "cpu_num"
|
|
|
|
title = "CPU Count"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "cpu_speed"
|
|
|
|
title = "CPU Speed"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "mem_total"
|
|
|
|
title = "Memory Total"
|
|
|
|
}
|
|
|
|
/* Should this be here? Swap can be added/removed between reboots. */
|
|
|
|
metric {
|
|
|
|
name = "swap_total"
|
|
|
|
title = "Swap Space Total"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "boottime"
|
|
|
|
title = "Last Boot Time"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "machine_type"
|
|
|
|
title = "Machine Type"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "os_name"
|
|
|
|
title = "Operating System"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "os_release"
|
|
|
|
title = "Operating System Release"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "location"
|
|
|
|
title = "Location"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This collection group will send the status of gexecd for this host every 300 secs */
|
|
|
|
/* Unlike 2.5.x the default behavior is to report gexecd OFF. */
|
|
|
|
collection_group {
|
|
|
|
collect_once = yes
|
|
|
|
time_threshold = 300
|
|
|
|
metric {
|
|
|
|
name = "gexec"
|
|
|
|
title = "Gexec Status"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This collection group will collect the CPU status info every 20 secs.
|
|
|
|
The time threshold is set to 180 seconds. In honesty, this time_threshold could be
|
|
|
|
set significantly higher to reduce unnecessary network chatter. */
|
|
|
|
collection_group {
|
|
|
|
collect_every = 20
|
|
|
|
time_threshold = 180
|
|
|
|
/* CPU status */
|
|
|
|
metric {
|
|
|
|
name = "cpu_user"
|
|
|
|
value_threshold = "1.0"
|
|
|
|
title = "CPU User"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "cpu_system"
|
|
|
|
value_threshold = "1.0"
|
|
|
|
title = "CPU System"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "cpu_idle"
|
|
|
|
value_threshold = "5.0"
|
|
|
|
title = "CPU Idle"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "cpu_nice"
|
|
|
|
value_threshold = "1.0"
|
|
|
|
title = "CPU Nice"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "cpu_aidle"
|
|
|
|
value_threshold = "5.0"
|
|
|
|
title = "CPU aidle"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "cpu_wio"
|
|
|
|
value_threshold = "1.0"
|
|
|
|
title = "CPU wio"
|
|
|
|
}
|
|
|
|
/* The next two metrics are optional if you want more detail...
|
|
|
|
... since they are accounted for in cpu_system.
|
|
|
|
metric {
|
|
|
|
name = "cpu_intr"
|
|
|
|
value_threshold = "1.0"
|
|
|
|
title = "CPU intr"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "cpu_sintr"
|
|
|
|
value_threshold = "1.0"
|
|
|
|
title = "CPU sintr"
|
|
|
|
}
|
|
|
|
*/
|
|
|
|
}
|
|
|
|
|
|
|
|
collection_group {
|
|
|
|
collect_every = 20
|
|
|
|
time_threshold = 90
|
|
|
|
/* Load Averages */
|
|
|
|
metric {
|
|
|
|
name = "load_one"
|
|
|
|
value_threshold = "1.0"
|
|
|
|
title = "One Minute Load Average"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "load_five"
|
|
|
|
value_threshold = "1.0"
|
|
|
|
title = "Five Minute Load Average"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "load_fifteen"
|
|
|
|
value_threshold = "1.0"
|
|
|
|
title = "Fifteen Minute Load Average"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This group collects the number of running and total processes */
|
|
|
|
collection_group {
|
|
|
|
collect_every = 80
|
|
|
|
time_threshold = 950
|
|
|
|
metric {
|
|
|
|
name = "proc_run"
|
|
|
|
value_threshold = "1.0"
|
|
|
|
title = "Total Running Processes"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "proc_total"
|
|
|
|
value_threshold = "1.0"
|
|
|
|
title = "Total Processes"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This collection group grabs the volatile memory metrics every 40 secs and
|
|
|
|
sends them at least every 180 secs. This time_threshold can be increased
|
|
|
|
significantly to reduce unneeded network traffic. */
|
|
|
|
collection_group {
|
|
|
|
collect_every = 40
|
|
|
|
time_threshold = 180
|
|
|
|
metric {
|
|
|
|
name = "mem_free"
|
|
|
|
value_threshold = "1024.0"
|
|
|
|
title = "Free Memory"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "mem_shared"
|
|
|
|
value_threshold = "1024.0"
|
|
|
|
title = "Shared Memory"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "mem_buffers"
|
|
|
|
value_threshold = "1024.0"
|
|
|
|
title = "Memory Buffers"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "mem_cached"
|
|
|
|
value_threshold = "1024.0"
|
|
|
|
title = "Cached Memory"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "swap_free"
|
|
|
|
value_threshold = "1024.0"
|
|
|
|
title = "Free Swap Space"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This collection group grabs the network traffic metrics (bytes and packets,
   in and out) every 40 secs and sends them at least every 300 secs. */
collection_group {
|
|
|
|
collect_every = 40
|
|
|
|
time_threshold = 300
|
|
|
|
metric {
|
|
|
|
name = "bytes_out"
|
|
|
|
value_threshold = 4096
|
|
|
|
title = "Bytes Sent"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "bytes_in"
|
|
|
|
value_threshold = 4096
|
|
|
|
title = "Bytes Received"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "pkts_in"
|
|
|
|
value_threshold = 256
|
|
|
|
title = "Packets Received"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "pkts_out"
|
|
|
|
value_threshold = 256
|
|
|
|
title = "Packets Sent"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* Different than 2.5.x default since the old config made no sense */
|
|
|
|
collection_group {
|
|
|
|
collect_every = 1800
|
|
|
|
time_threshold = 3600
|
|
|
|
metric {
|
|
|
|
name = "disk_total"
|
|
|
|
value_threshold = 1.0
|
|
|
|
title = "Total Disk Space"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This collection group grabs the disk usage metrics (disk_free and
   part_max_used) every 40 secs and sends them at least every 180 secs. */
collection_group {
|
|
|
|
collect_every = 40
|
|
|
|
time_threshold = 180
|
|
|
|
metric {
|
|
|
|
name = "disk_free"
|
|
|
|
value_threshold = 1.0
|
|
|
|
title = "Disk Space Available"
|
|
|
|
}
|
|
|
|
metric {
|
|
|
|
name = "part_max_used"
|
|
|
|
value_threshold = 1.0
|
|
|
|
title = "Maximum Disk Space Used"
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|