mirror of
https://github.com/netdata/netdata.git
synced 2025-04-17 03:02:41 +00:00
Provide new attributes in health conf files (#10961)
* read and store new attributes (class, component, type) from health conf files. Replace family variable in info strings * provide the attributes to jsons * remove extra semicolon * populate conf files with new attributes * added newline * remove extra defines from health.h * remove empty line * remove realloc * use helper variables for find_and_replace. Adjust position for next strstr * remove comments * Add type to mysql.conf and vcsa.conf * fix formatting * add parenthesis * remove extra assignment * changes to mysql_galera_cluster_state from master * add type Errors to unbound_request_list_overwritten * fix indentation for info strings spanning more than one line * check for null, replace with empty string if true * add class, component, type to systemdunits.conf
This commit is contained in:
parent
0a6a14e323
commit
f5bd20e60a
93 changed files with 4230 additions and 3101 deletions
database
health
health.c
health.d
adaptec_raid.conf am2320.conf anomalies.conf apache.conf apcupsd.conf backend.conf bcache.conf beanstalkd.conf bind_rndc.conf boinc.conf btrfs.conf ceph.conf cgroups.conf cockroachdb.conf couchdb.conf cpu.conf dbengine.conf disks.conf dns_query.conf dnsmasq_dhcp.conf dockerd.conf elasticsearch.conf entropy.conf exporting.conf fping.conf fronius.conf gearman.conf haproxy.conf hdfs.conf httpcheck.conf ioping.conf ipc.conf ipfs.conf ipmi.conf kubelet.conf lighttpd.conf linux_power_supply.conf load.conf mdstat.conf megacli.conf memcached.conf memory.conf mongodb.conf mysql.conf named.conf net.conf netfilter.conf nginx.conf nginx_plus.conf phpfpm.conf pihole.conf portcheck.conf postgres.conf processes.conf pulsar.conf ram.conf redis.conf retroshare.conf riakkv.conf scaleio.conf softnet.conf squid.conf stiebeleltron.conf swap.conf systemdunits.conf tcp_conn.conf tcp_listen.conf tcp_mem.conf tcp_orphans.conf tcp_resets.conf udp_errors.conf unbound.conf varnish.conf vcsa.conf vernemq.conf vsphere.conf web_log.conf whoisquery.conf wmi.conf x509check.conf zfs.conf zookeeper.conf
health.h health_config.c health_json.c health_log.c libnetdata
|
@ -666,6 +666,10 @@ struct alarm_entry {
|
|||
|
||||
char *family;
|
||||
|
||||
char *class;
|
||||
char *component;
|
||||
char *type;
|
||||
|
||||
char *exec;
|
||||
char *recipient;
|
||||
time_t exec_run_timestamp;
|
||||
|
|
|
@ -91,6 +91,9 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
|
|||
rc->name,
|
||||
rc->rrdset->id,
|
||||
rc->rrdset->family,
|
||||
rc->class,
|
||||
rc->component,
|
||||
rc->type,
|
||||
rc->exec,
|
||||
rc->recipient,
|
||||
now - rc->last_status_change,
|
||||
|
@ -165,6 +168,9 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) {
|
|||
rc->name,
|
||||
rc->rrdset->id,
|
||||
rc->rrdset->family,
|
||||
rc->class,
|
||||
rc->component,
|
||||
rc->type,
|
||||
rc->exec,
|
||||
rc->recipient,
|
||||
now - rc->last_status_change,
|
||||
|
@ -428,6 +434,10 @@ inline RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt,
|
|||
if(rt->units) rc->units = strdupz(rt->units);
|
||||
if(rt->info) rc->info = strdupz(rt->info);
|
||||
|
||||
if (rt->class) rc->class = strdupz(rt->class);
|
||||
if (rt->component) rc->component = strdupz(rt->component);
|
||||
if (rt->type) rc->type = strdupz(rt->type);
|
||||
|
||||
if(rt->calculation) {
|
||||
rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
|
||||
if(!rc->calculation)
|
||||
|
@ -535,6 +545,10 @@ inline RRDCALC *rrdcalc_create_from_rrdcalc(RRDCALC *rc, RRDHOST *host, const ch
|
|||
if(rc->units) newrc->units = strdupz(rc->units);
|
||||
if(rc->info) newrc->info = strdupz(rc->info);
|
||||
|
||||
if (rc->class) newrc->class = strdupz(rc->class);
|
||||
if (rc->component) newrc->component = strdupz(rc->component);
|
||||
if (rc->type) newrc->type = strdupz(rc->type);
|
||||
|
||||
if(rc->calculation) {
|
||||
newrc->calculation = expression_parse(rc->calculation->source, NULL, NULL);
|
||||
if(!newrc->calculation)
|
||||
|
@ -573,6 +587,9 @@ void rrdcalc_free(RRDCALC *rc) {
|
|||
freez(rc->source);
|
||||
freez(rc->units);
|
||||
freez(rc->info);
|
||||
freez(rc->class);
|
||||
freez(rc->component);
|
||||
freez(rc->type);
|
||||
simple_pattern_free(rc->spdim);
|
||||
freez(rc->labels);
|
||||
simple_pattern_free(rc->splabels);
|
||||
|
|
|
@ -42,10 +42,13 @@ struct rrdcalc {
|
|||
char *exec; // the command to execute when this alarm switches state
|
||||
char *recipient; // the recipient of the alarm (the first parameter to exec)
|
||||
|
||||
char *class; // the class that this alarm belongs
|
||||
char *component; // the component that this alarm refers to
|
||||
char *type; // type of the alarm
|
||||
|
||||
char *chart; // the chart id this should be linked to
|
||||
uint32_t hash_chart;
|
||||
|
||||
|
||||
char *plugin_match; //the plugin name that should be linked to
|
||||
SIMPLE_PATTERN *plugin_pattern;
|
||||
|
||||
|
|
|
@ -15,6 +15,10 @@ struct rrdcalctemplate {
|
|||
char *exec;
|
||||
char *recipient;
|
||||
|
||||
char *class;
|
||||
char *component;
|
||||
char *type;
|
||||
|
||||
char *context;
|
||||
uint32_t hash_context;
|
||||
|
||||
|
|
|
@ -930,7 +930,7 @@ void *health_main(void *ptr) {
|
|||
if(likely(!rrdcalc_isrepeating(rc))) {
|
||||
ALARM_ENTRY *ae = health_create_alarm_entry(
|
||||
host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
|
||||
rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
|
||||
rc->rrdset->family, rc->class, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
|
||||
rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
|
||||
rc->delay_last,
|
||||
(
|
||||
|
@ -980,7 +980,7 @@ void *health_main(void *ptr) {
|
|||
rc->last_repeat = now;
|
||||
ALARM_ENTRY *ae = health_create_alarm_entry(
|
||||
host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
|
||||
rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
|
||||
rc->rrdset->family, rc->class, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
|
||||
rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
|
||||
rc->delay_last,
|
||||
(
|
||||
|
|
|
@ -1,24 +1,30 @@
|
|||
|
||||
# logical device status check
|
||||
|
||||
template: adaptec_raid_ld_status
|
||||
on: adaptec_raid.ld_status
|
||||
lookup: max -10s foreach *
|
||||
units: bool
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: logical device status is failed or degraded
|
||||
to: sysadmin
|
||||
template: adaptec_raid_ld_status
|
||||
on: adaptec_raid.ld_status
|
||||
class: System
|
||||
component: RAID
|
||||
type: Errors
|
||||
lookup: max -10s foreach *
|
||||
units: bool
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: logical device status is failed or degraded
|
||||
to: sysadmin
|
||||
|
||||
# physical device state check
|
||||
|
||||
template: adaptec_raid_pd_state
|
||||
on: adaptec_raid.pd_state
|
||||
lookup: max -10s foreach *
|
||||
units: bool
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: physical device state is not online
|
||||
to: sysadmin
|
||||
template: adaptec_raid_pd_state
|
||||
on: adaptec_raid.pd_state
|
||||
class: System
|
||||
component: RAID
|
||||
type: Errors
|
||||
lookup: max -10s foreach *
|
||||
units: bool
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: physical device state is not online
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
# make sure am2320 is sending stats
|
||||
|
||||
template: am2320_last_collected_secs
|
||||
on: am2320.temperature
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: am2320_last_collected_secs
|
||||
on: am2320.temperature
|
||||
class: Other
|
||||
component: Sensors
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
|
|
@ -1,17 +1,23 @@
|
|||
# raise a warning alarm if an anomaly probability is consistently above 50%
|
||||
|
||||
template: anomalies_anomaly_probabilities
|
||||
on: anomalies.probability
|
||||
lookup: average -2m foreach *
|
||||
every: 1m
|
||||
warn: $this > 50
|
||||
info: average anomaly probability over the last 2 minutes
|
||||
template: anomalies_anomaly_probabilities
|
||||
on: anomalies.probability
|
||||
class: Netdata
|
||||
component: ML
|
||||
type: Errors
|
||||
lookup: average -2m foreach *
|
||||
every: 1m
|
||||
warn: $this > 50
|
||||
info: average anomaly probability over the last 2 minutes
|
||||
|
||||
# raise a warning alarm if an anomaly flag is consistently firing
|
||||
|
||||
template: anomalies_anomaly_flags
|
||||
on: anomalies.anomaly
|
||||
lookup: sum -2m foreach *
|
||||
every: 1m
|
||||
warn: $this > 10
|
||||
info: number of anomalies in the last 2 minutes
|
||||
template: anomalies_anomaly_flags
|
||||
on: anomalies.anomaly
|
||||
class: Netdata
|
||||
component: ML
|
||||
type: Errors
|
||||
lookup: sum -2m foreach *
|
||||
every: 1m
|
||||
warn: $this > 10
|
||||
info: number of anomalies in the last 2 minutes
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
|
||||
# make sure apache is running
|
||||
|
||||
template: apache_last_collected_secs
|
||||
on: apache.requests
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: apache_last_collected_secs
|
||||
on: apache.requests
|
||||
class: Web Server
|
||||
component: Apache
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
|
|
|
@ -1,40 +1,49 @@
|
|||
# you can disable an alarm notification by setting the 'to' line to: silent
|
||||
|
||||
template: apcupsd_10min_ups_load
|
||||
on: apcupsd.load
|
||||
os: *
|
||||
hosts: *
|
||||
lookup: average -10m unaligned of percentage
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 10m multiplier 1.5 max 1h
|
||||
info: average UPS load over the last 10 minutes
|
||||
to: sitemgr
|
||||
template: apcupsd_10min_ups_load
|
||||
on: apcupsd.load
|
||||
class: Power Supply
|
||||
component: UPS
|
||||
type: Utilization
|
||||
os: *
|
||||
hosts: *
|
||||
lookup: average -10m unaligned of percentage
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 10m multiplier 1.5 max 1h
|
||||
info: average UPS load over the last 10 minutes
|
||||
to: sitemgr
|
||||
|
||||
# Discussion in https://github.com/netdata/netdata/pull/3928:
|
||||
# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
|
||||
template: apcupsd_ups_charge
|
||||
on: apcupsd.charge
|
||||
os: *
|
||||
hosts: *
|
||||
lookup: average -60s unaligned of charge
|
||||
units: %
|
||||
every: 60s
|
||||
warn: $this < 100
|
||||
crit: $this < (($status == $CRITICAL) ? (60) : (50))
|
||||
delay: down 10m multiplier 1.5 max 1h
|
||||
info: average UPS charge over the last minute
|
||||
to: sitemgr
|
||||
template: apcupsd_ups_charge
|
||||
on: apcupsd.charge
|
||||
class: Power Supply
|
||||
component: UPS
|
||||
type: Errors
|
||||
os: *
|
||||
hosts: *
|
||||
lookup: average -60s unaligned of charge
|
||||
units: %
|
||||
every: 60s
|
||||
warn: $this < 100
|
||||
crit: $this < (($status == $CRITICAL) ? (60) : (50))
|
||||
delay: down 10m multiplier 1.5 max 1h
|
||||
info: average UPS charge over the last minute
|
||||
to: sitemgr
|
||||
|
||||
template: apcupsd_last_collected_secs
|
||||
on: apcupsd.load
|
||||
calc: $now - $last_collected_t
|
||||
every: 10s
|
||||
units: seconds ago
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sitemgr
|
||||
template: apcupsd_last_collected_secs
|
||||
on: apcupsd.load
|
||||
class: Power Supply
|
||||
component: UPS device
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
every: 10s
|
||||
units: seconds ago
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sitemgr
|
||||
|
|
|
@ -1,33 +1,42 @@
|
|||
# Alert that backends subsystem will be disabled soon
|
||||
alarm: backend_metrics_eol
|
||||
on: netdata.backend_metrics
|
||||
units: boolean
|
||||
calc: $now - $last_collected_t
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
|
||||
to: sysadmin
|
||||
alarm: backend_metrics_eol
|
||||
on: netdata.backend_metrics
|
||||
class: Netdata
|
||||
component: Exporting engine
|
||||
type: Errors
|
||||
units: boolean
|
||||
calc: $now - $last_collected_t
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
|
||||
to: sysadmin
|
||||
|
||||
# make sure we are sending data to backend
|
||||
|
||||
alarm: backend_last_buffering
|
||||
on: netdata.backend_metrics
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful buffering of backend data
|
||||
to: dba
|
||||
alarm: backend_last_buffering
|
||||
on: netdata.backend_metrics
|
||||
class: Netdata
|
||||
component: Exporting engine
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful buffering of backend data
|
||||
to: dba
|
||||
|
||||
alarm: backend_metrics_sent
|
||||
on: netdata.backend_metrics
|
||||
units: %
|
||||
calc: abs($sent) * 100 / abs($buffered)
|
||||
every: 10s
|
||||
warn: $this != 100
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: percentage of metrics sent to the backend server
|
||||
to: dba
|
||||
alarm: backend_metrics_sent
|
||||
on: netdata.backend_metrics
|
||||
class: Netdata
|
||||
component: Exporting engine
|
||||
type: Workload
|
||||
units: %
|
||||
calc: abs($sent) * 100 / abs($buffered)
|
||||
every: 10s
|
||||
warn: $this != 100
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: percentage of metrics sent to the backend server
|
||||
to: dba
|
||||
|
|
|
@ -1,24 +1,30 @@
|
|||
|
||||
template: bcache_cache_errors
|
||||
on: disk.bcache_cache_read_races
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: up 2m down 1h multiplier 1.5 max 2h
|
||||
info: number of times data was read from the cache, \
|
||||
the bucket was reused and invalidated in the last 10 minutes \
|
||||
(when this occurs the data is reread from the backing device)
|
||||
to: sysadmin
|
||||
template: bcache_cache_errors
|
||||
on: disk.bcache_cache_read_races
|
||||
class: System
|
||||
component: Disk
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: up 2m down 1h multiplier 1.5 max 2h
|
||||
info: number of times data was read from the cache, \
|
||||
the bucket was reused and invalidated in the last 10 minutes \
|
||||
(when this occurs the data is reread from the backing device)
|
||||
to: sysadmin
|
||||
|
||||
template: bcache_cache_dirty
|
||||
on: disk.bcache_cache_alloc
|
||||
calc: $dirty + $metadata + $undefined
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
|
||||
crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: percentage of cache space used for dirty data and metadata \
|
||||
(this usually means your SSD cache is too small)
|
||||
to: sysadmin
|
||||
template: bcache_cache_dirty
|
||||
on: disk.bcache_cache_alloc
|
||||
class: System
|
||||
component: Disk
|
||||
type: Utilization
|
||||
calc: $dirty + $metadata + $undefined
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
|
||||
crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: percentage of cache space used for dirty data and metadata \
|
||||
(this usually means your SSD cache is too small)
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,17 +1,20 @@
|
|||
# get the number of buried jobs in all queues
|
||||
|
||||
template: beanstalk_server_buried_jobs
|
||||
on: beanstalk.current_jobs
|
||||
calc: $buried
|
||||
units: jobs
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
crit: $this > 10
|
||||
delay: up 0 down 5m multiplier 1.2 max 1h
|
||||
info: number of buried jobs across all tubes. \
|
||||
You need to manually kick them so they can be processed. \
|
||||
Presence of buried jobs in a tube does not affect new jobs.
|
||||
to: sysadmin
|
||||
template: beanstalk_server_buried_jobs
|
||||
on: beanstalk.current_jobs
|
||||
class: Messaging
|
||||
component: Beanstalk
|
||||
type: Workload
|
||||
calc: $buried
|
||||
units: jobs
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
crit: $this > 10
|
||||
delay: up 0 down 5m multiplier 1.2 max 1h
|
||||
info: number of buried jobs across all tubes. \
|
||||
You need to manually kick them so they can be processed. \
|
||||
Presence of buried jobs in a tube does not affect new jobs.
|
||||
to: sysadmin
|
||||
|
||||
# get the number of buried jobs per queue
|
||||
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
template: bind_rndc_stats_file_size
|
||||
on: bind_rndc.stats_size
|
||||
units: megabytes
|
||||
every: 60
|
||||
calc: $stats_size
|
||||
warn: $this > 512
|
||||
crit: $this > 1024
|
||||
info: BIND statistics-file size
|
||||
to: sysadmin
|
||||
template: bind_rndc_stats_file_size
|
||||
on: bind_rndc.stats_size
|
||||
class: DNS
|
||||
component: BIND
|
||||
type: Utilization
|
||||
units: megabytes
|
||||
every: 60
|
||||
calc: $stats_size
|
||||
warn: $this > 512
|
||||
crit: $this > 1024
|
||||
info: BIND statistics-file size
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,62 +1,74 @@
|
|||
# Alarms for various BOINC issues.
|
||||
|
||||
# Warn on any compute errors encountered.
|
||||
template: boinc_compute_errors
|
||||
on: boinc.states
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned of comperror
|
||||
units: tasks
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
crit: $this > 1
|
||||
delay: up 1m down 5m multiplier 1.5 max 1h
|
||||
info: average number of compute errors over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: boinc_compute_errors
|
||||
on: boinc.states
|
||||
class: Computing
|
||||
component: BOINC
|
||||
type: Errors
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned of comperror
|
||||
units: tasks
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
crit: $this > 1
|
||||
delay: up 1m down 5m multiplier 1.5 max 1h
|
||||
info: average number of compute errors over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
# Warn on lots of upload errors
|
||||
template: boinc_upload_errors
|
||||
on: boinc.states
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned of upload_failed
|
||||
units: tasks
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
crit: $this > 1
|
||||
delay: up 1m down 5m multiplier 1.5 max 1h
|
||||
info: average number of failed uploads over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: boinc_upload_errors
|
||||
on: boinc.states
|
||||
class: Computing
|
||||
component: BOINC
|
||||
type: Errors
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned of upload_failed
|
||||
units: tasks
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
crit: $this > 1
|
||||
delay: up 1m down 5m multiplier 1.5 max 1h
|
||||
info: average number of failed uploads over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
# Warn on the task queue being empty
|
||||
template: boinc_total_tasks
|
||||
on: boinc.tasks
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned of total
|
||||
units: tasks
|
||||
every: 1m
|
||||
warn: $this < 1
|
||||
crit: $this < 0.1
|
||||
delay: up 5m down 10m multiplier 1.5 max 1h
|
||||
info: average number of total tasks over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: boinc_total_tasks
|
||||
on: boinc.tasks
|
||||
class: Computing
|
||||
component: BOINC
|
||||
type: Utilization
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned of total
|
||||
units: tasks
|
||||
every: 1m
|
||||
warn: $this < 1
|
||||
crit: $this < 0.1
|
||||
delay: up 5m down 10m multiplier 1.5 max 1h
|
||||
info: average number of total tasks over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
# Warn on no active tasks with a non-empty queue
|
||||
template: boinc_active_tasks
|
||||
on: boinc.tasks
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned of active
|
||||
calc: ($boinc_total_tasks >= 1) ? ($this) : (inf)
|
||||
units: tasks
|
||||
every: 1m
|
||||
warn: $this < 1
|
||||
crit: $this < 0.1
|
||||
delay: up 5m down 10m multiplier 1.5 max 1h
|
||||
info: average number of active tasks over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: boinc_active_tasks
|
||||
on: boinc.tasks
|
||||
class: Computing
|
||||
component: BOINC
|
||||
type: Utilization
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned of active
|
||||
calc: ($boinc_total_tasks >= 1) ? ($this) : (inf)
|
||||
units: tasks
|
||||
every: 1m
|
||||
warn: $this < 1
|
||||
crit: $this < 0.1
|
||||
delay: up 5m down 10m multiplier 1.5 max 1h
|
||||
info: average number of active tasks over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,56 +1,68 @@
|
|||
|
||||
template: btrfs_allocated
|
||||
on: btrfs.disk
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (90) : (95))
|
||||
crit: $this > (($status == $CRITICAL) ? (95) : (98))
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: percentage of allocated BTRFS physical disk space
|
||||
to: sysadmin
|
||||
template: btrfs_allocated
|
||||
on: btrfs.disk
|
||||
class: System
|
||||
component: File system
|
||||
type: Utilization
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (90) : (95))
|
||||
crit: $this > (($status == $CRITICAL) ? (95) : (98))
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: percentage of allocated BTRFS physical disk space
|
||||
to: sysadmin
|
||||
|
||||
template: btrfs_data
|
||||
on: btrfs.data
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
calc: $used * 100 / ($used + $free)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
|
||||
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: utilization of BTRFS data space
|
||||
to: sysadmin
|
||||
template: btrfs_data
|
||||
on: btrfs.data
|
||||
class: System
|
||||
component: File system
|
||||
type: Utilization
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
calc: $used * 100 / ($used + $free)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
|
||||
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: utilization of BTRFS data space
|
||||
to: sysadmin
|
||||
|
||||
template: btrfs_metadata
|
||||
on: btrfs.metadata
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
calc: ($used + $reserved) * 100 / ($used + $free + $reserved)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
|
||||
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: utilization of BTRFS metadata space
|
||||
to: sysadmin
|
||||
template: btrfs_metadata
|
||||
on: btrfs.metadata
|
||||
class: System
|
||||
component: File system
|
||||
type: Utilization
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
calc: ($used + $reserved) * 100 / ($used + $free + $reserved)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
|
||||
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: utilization of BTRFS metadata space
|
||||
to: sysadmin
|
||||
|
||||
template: btrfs_system
|
||||
on: btrfs.system
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
calc: $used * 100 / ($used + $free)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
|
||||
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: utilization of BTRFS system space
|
||||
to: sysadmin
|
||||
template: btrfs_system
|
||||
on: btrfs.system
|
||||
class: System
|
||||
component: File system
|
||||
type: Utilization
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
calc: $used * 100 / ($used + $free)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
|
||||
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: utilization of BTRFS system space
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
# low ceph disk available
|
||||
|
||||
template: ceph_cluster_space_usage
|
||||
on: ceph.general_usage
|
||||
calc: $used * 100 / ($used + $avail)
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING ) ? (85) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 5m multiplier 1.2 max 1h
|
||||
info: cluster disk space utilization
|
||||
to: sysadmin
|
||||
template: ceph_cluster_space_usage
|
||||
on: ceph.general_usage
|
||||
class: Storage
|
||||
component: Ceph
|
||||
type: Utilization
|
||||
calc: $used * 100 / ($used + $avail)
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING ) ? (85) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 5m multiplier 1.2 max 1h
|
||||
info: cluster disk space utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,28 +1,34 @@
|
|||
|
||||
# you can disable an alarm notification by setting the 'to' line to: silent
|
||||
|
||||
template: cgroup_10min_cpu_usage
|
||||
on: cgroup.cpu_limit
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -10m unaligned
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average cgroup CPU utilization over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: cgroup_10min_cpu_usage
|
||||
on: cgroup.cpu_limit
|
||||
class: Cgroups
|
||||
component: CPU
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -10m unaligned
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average cgroup CPU utilization over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: cgroup_ram_in_use
|
||||
on: cgroup.mem_usage
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($ram) * 100 / $memory_limit
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: cgroup memory utilization
|
||||
to: sysadmin
|
||||
template: cgroup_ram_in_use
|
||||
on: cgroup.mem_usage
|
||||
class: Cgroups
|
||||
component: Memory
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($ram) * 100 / $memory_limit
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: cgroup memory utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,91 +1,115 @@
|
|||
|
||||
# Availability
|
||||
|
||||
template: cockroachdb_last_collected_secs
|
||||
on: cockroachdb.live_nodes
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
template: cockroachdb_last_collected_secs
|
||||
on: cockroachdb.live_nodes
|
||||
class: Database
|
||||
component: CockroachDB
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
|
||||
# Capacity
|
||||
|
||||
template: cockroachdb_used_storage_capacity
|
||||
on: cockroachdb.storage_used_capacity_percentage
|
||||
calc: $capacity_used_percent
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: storage capacity utilization
|
||||
to: dba
|
||||
template: cockroachdb_used_storage_capacity
|
||||
on: cockroachdb.storage_used_capacity_percentage
|
||||
class: Database
|
||||
component: CockroachDB
|
||||
type: Utilization
|
||||
calc: $capacity_used_percent
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: storage capacity utilization
|
||||
to: dba
|
||||
|
||||
template: cockroachdb_used_usable_storage_capacity
|
||||
on: cockroachdb.storage_used_capacity_percentage
|
||||
calc: $capacity_usable_used_percent
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: storage usable space utilization
|
||||
to: dba
|
||||
template: cockroachdb_used_usable_storage_capacity
|
||||
on: cockroachdb.storage_used_capacity_percentage
|
||||
class: Database
|
||||
component: CockroachDB
|
||||
type: Utilization
|
||||
calc: $capacity_usable_used_percent
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: storage usable space utilization
|
||||
to: dba
|
||||
|
||||
# Replication
|
||||
|
||||
template: cockroachdb_unavailable_ranges
|
||||
on: cockroachdb.ranges_replication_problem
|
||||
calc: $ranges_unavailable
|
||||
units: num
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of ranges with fewer live replicas than the replication target
|
||||
to: dba
|
||||
template: cockroachdb_unavailable_ranges
|
||||
on: cockroachdb.ranges_replication_problem
|
||||
class: Database
|
||||
component: CockroachDB
|
||||
type: Utilization
|
||||
calc: $ranges_unavailable
|
||||
units: num
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of ranges with fewer live replicas than the replication target
|
||||
to: dba
|
||||
|
||||
template: cockroachdb_replicas_leaders_not_leaseholders
|
||||
on: cockroachdb.replicas_leaders
|
||||
calc: $replicas_leaders_not_leaseholders
|
||||
units: num
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of replicas that are Raft leaders whose range lease is held by another store
|
||||
to: dba
|
||||
template: cockroachdb_replicas_leaders_not_leaseholders
|
||||
on: cockroachdb.replicas_leaders
|
||||
class: Database
|
||||
component: CockroachDB
|
||||
type: Utilization
|
||||
calc: $replicas_leaders_not_leaseholders
|
||||
units: num
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of replicas that are Raft leaders whose range lease is held by another store
|
||||
to: dba
|
||||
|
||||
# FD
|
||||
|
||||
template: cockroachdb_open_file_descriptors_limit
|
||||
on: cockroachdb.process_file_descriptors
|
||||
calc: $sys_fd_open/$sys_fd_softlimit * 100
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > 80
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: open file descriptors utilization (against softlimit)
|
||||
to: dba
|
||||
template: cockroachdb_open_file_descriptors_limit
|
||||
on: cockroachdb.process_file_descriptors
|
||||
class: Database
|
||||
component: CockroachDB
|
||||
type: Utilization
|
||||
calc: $sys_fd_open/$sys_fd_softlimit * 100
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > 80
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: open file descriptors utilization (against softlimit)
|
||||
to: dba
|
||||
|
||||
# SQL
|
||||
|
||||
template: cockroachdb_sql_active_connections
|
||||
on: cockroachdb.sql_connections
|
||||
calc: $sql_conns
|
||||
units: active connections
|
||||
every: 10s
|
||||
info: number of active SQL connections
|
||||
to: dba
|
||||
template: cockroachdb_sql_active_connections
|
||||
on: cockroachdb.sql_connections
|
||||
class: Database
|
||||
component: CockroachDB
|
||||
type: Utilization
|
||||
calc: $sql_conns
|
||||
units: active connections
|
||||
every: 10s
|
||||
info: number of active SQL connections
|
||||
to: dba
|
||||
|
||||
template: cockroachdb_sql_executed_statements_total_last_5m
|
||||
on: cockroachdb.sql_statements_total
|
||||
lookup: sum -5m absolute of sql_query_count
|
||||
units: statements
|
||||
every: 10s
|
||||
warn: $this == 0 AND $cockroachdb_sql_active_connections != 0
|
||||
delay: down 15m up 30s multiplier 1.5 max 1h
|
||||
info: number of executed SQL statements in the last 5 minutes
|
||||
to: dba
|
||||
template: cockroachdb_sql_executed_statements_total_last_5m
|
||||
on: cockroachdb.sql_statements_total
|
||||
class: Database
|
||||
component: CockroachDB
|
||||
type: Workload
|
||||
lookup: sum -5m absolute of sql_query_count
|
||||
units: statements
|
||||
every: 10s
|
||||
warn: $this == 0 AND $cockroachdb_sql_active_connections != 0
|
||||
delay: down 15m up 30s multiplier 1.5 max 1h
|
||||
info: number of executed SQL statements in the last 5 minutes
|
||||
to: dba
|
||||
|
|
|
@ -1,13 +1,16 @@
|
|||
|
||||
# make sure couchdb is running
|
||||
|
||||
template: couchdb_last_collected_secs
|
||||
on: couchdb.request_methods
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
template: couchdb_last_collected_secs
|
||||
on: couchdb.request_methods
|
||||
class: Database
|
||||
component: CouchDB
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
|
|
|
@ -1,55 +1,67 @@
|
|||
|
||||
# you can disable an alarm notification by setting the 'to' line to: silent
|
||||
|
||||
template: 10min_cpu_usage
|
||||
on: system.cpu
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -10m unaligned of user,system,softirq,irq,guest
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
|
||||
to: sysadmin
|
||||
template: 10min_cpu_usage
|
||||
on: system.cpu
|
||||
class: System
|
||||
component: CPU
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -10m unaligned of user,system,softirq,irq,guest
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
|
||||
to: sysadmin
|
||||
|
||||
template: 10min_cpu_iowait
|
||||
on: system.cpu
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -10m unaligned of iowait
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (20) : (40))
|
||||
crit: $this > (($status == $CRITICAL) ? (40) : (50))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average CPU iowait time over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: 10min_cpu_iowait
|
||||
on: system.cpu
|
||||
class: System
|
||||
component: CPU
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -10m unaligned of iowait
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (20) : (40))
|
||||
crit: $this > (($status == $CRITICAL) ? (40) : (50))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average CPU iowait time over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: 20min_steal_cpu
|
||||
on: system.cpu
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -20m unaligned of steal
|
||||
units: %
|
||||
every: 5m
|
||||
warn: $this > (($status >= $WARNING) ? (5) : (10))
|
||||
crit: $this > (($status == $CRITICAL) ? (20) : (30))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: average CPU steal time over the last 20 minutes
|
||||
to: sysadmin
|
||||
template: 20min_steal_cpu
|
||||
on: system.cpu
|
||||
class: System
|
||||
component: CPU
|
||||
type: Latency
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -20m unaligned of steal
|
||||
units: %
|
||||
every: 5m
|
||||
warn: $this > (($status >= $WARNING) ? (5) : (10))
|
||||
crit: $this > (($status == $CRITICAL) ? (20) : (30))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: average CPU steal time over the last 20 minutes
|
||||
to: sysadmin
|
||||
|
||||
## FreeBSD
|
||||
template: 10min_cpu_usage
|
||||
on: system.cpu
|
||||
os: freebsd
|
||||
hosts: *
|
||||
lookup: average -10m unaligned of user,system,interrupt
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average CPU utilization over the last 10 minutes (excluding nice)
|
||||
to: sysadmin
|
||||
template: 10min_cpu_usage
|
||||
on: system.cpu
|
||||
class: System
|
||||
component: CPU
|
||||
type: Utilization
|
||||
os: freebsd
|
||||
hosts: *
|
||||
lookup: average -10m unaligned of user,system,interrupt
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average CPU utilization over the last 10 minutes (excluding nice)
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,52 +1,64 @@
|
|||
|
||||
# you can disable an alarm notification by setting the 'to' line to: silent
|
||||
|
||||
alarm: 10min_dbengine_global_fs_errors
|
||||
on: netdata.dbengine_global_errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of fs_errors
|
||||
units: errors
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
|
||||
to: sysadmin
|
||||
alarm: 10min_dbengine_global_fs_errors
|
||||
on: netdata.dbengine_global_errors
|
||||
class: Netdata
|
||||
component: DB engine
|
||||
type: Errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of fs_errors
|
||||
units: errors
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
|
||||
to: sysadmin
|
||||
|
||||
alarm: 10min_dbengine_global_io_errors
|
||||
on: netdata.dbengine_global_errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of io_errors
|
||||
units: errors
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
|
||||
to: sysadmin
|
||||
alarm: 10min_dbengine_global_io_errors
|
||||
on: netdata.dbengine_global_errors
|
||||
class: Netdata
|
||||
component: DB engine
|
||||
type: Errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of io_errors
|
||||
units: errors
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
|
||||
to: sysadmin
|
||||
|
||||
alarm: 10min_dbengine_global_flushing_warnings
|
||||
on: netdata.dbengine_global_errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
|
||||
units: errors
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
|
||||
Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
|
||||
to: sysadmin
|
||||
alarm: 10min_dbengine_global_flushing_warnings
|
||||
on: netdata.dbengine_global_errors
|
||||
class: Netdata
|
||||
component: DB engine
|
||||
type: Errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
|
||||
units: errors
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
|
||||
Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
|
||||
to: sysadmin
|
||||
|
||||
alarm: 10min_dbengine_global_flushing_errors
|
||||
on: netdata.dbengine_long_term_page_stats
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of flushing_pressure_deletions
|
||||
units: pages
|
||||
every: 10s
|
||||
crit: $this != 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
|
||||
Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
|
||||
to: sysadmin
|
||||
alarm: 10min_dbengine_global_flushing_errors
|
||||
on: netdata.dbengine_long_term_page_stats
|
||||
class: Netdata
|
||||
component: DB engine
|
||||
type: Errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of flushing_pressure_deletions
|
||||
units: pages
|
||||
every: 10s
|
||||
crit: $this != 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
|
||||
Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
|
||||
to: sysadmin
|
||||
|
|
|
@ -9,33 +9,39 @@
|
|||
# raise an alarm if the disk is low on
|
||||
# available disk space
|
||||
|
||||
template: disk_space_usage
|
||||
on: disk.space
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
families: !/dev !/dev/* !/run !/run/* *
|
||||
calc: $used * 100 / ($avail + $used)
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING ) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: disk space utilization
|
||||
to: sysadmin
|
||||
template: disk_space_usage
|
||||
on: disk.space
|
||||
class: System
|
||||
component: Disk
|
||||
type: Utilization
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
families: !/dev !/dev/* !/run !/run/* *
|
||||
calc: $used * 100 / ($avail + $used)
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING ) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: disk $family space utilization
|
||||
to: sysadmin
|
||||
|
||||
template: disk_inode_usage
|
||||
on: disk.inodes
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
families: !/dev !/dev/* !/run !/run/* *
|
||||
calc: $used * 100 / ($avail + $used)
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: disk inode utilization
|
||||
to: sysadmin
|
||||
template: disk_inode_usage
|
||||
on: disk.inodes
|
||||
class: System
|
||||
component: Disk
|
||||
type: Utilization
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
families: !/dev !/dev/* !/run !/run/* *
|
||||
calc: $used * 100 / ($avail + $used)
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
info: disk $family inode utilization
|
||||
to: sysadmin
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
@ -128,21 +134,24 @@ families: !/dev !/dev/* !/run !/run/* *
|
|||
# by calculating the average disk utilization
|
||||
# for the last 10 minutes
|
||||
|
||||
template: 10min_disk_utilization
|
||||
on: disk.util
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned
|
||||
units: %
|
||||
every: 1m
|
||||
green: 90
|
||||
red: 98
|
||||
warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
|
||||
crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
|
||||
delay: down 15m multiplier 1.2 max 1h
|
||||
info: average percentage of time the disk was busy over the last 10 minutes
|
||||
to: silent
|
||||
template: 10min_disk_utilization
|
||||
on: disk.util
|
||||
class: System
|
||||
component: Disk
|
||||
type: Utilization
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned
|
||||
units: %
|
||||
every: 1m
|
||||
green: 90
|
||||
red: 98
|
||||
warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
|
||||
crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
|
||||
delay: down 15m multiplier 1.2 max 1h
|
||||
info: average percentage of time $family disk was busy over the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
||||
# raise an alarm if the disk backlog
|
||||
|
@ -150,18 +159,21 @@ families: *
|
|||
# for 10 minutes
|
||||
# (i.e. the disk cannot catch up)
|
||||
|
||||
template: 10min_disk_backlog
|
||||
on: disk.backlog
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned
|
||||
units: ms
|
||||
every: 1m
|
||||
green: 2000
|
||||
red: 5000
|
||||
warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
|
||||
crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
|
||||
delay: down 15m multiplier 1.2 max 1h
|
||||
info: average disk backlog size over the last 10 minutes
|
||||
to: silent
|
||||
template: 10min_disk_backlog
|
||||
on: disk.backlog
|
||||
class: System
|
||||
component: Disk
|
||||
type: Latency
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10m unaligned
|
||||
units: ms
|
||||
every: 1m
|
||||
green: 2000
|
||||
red: 5000
|
||||
warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
|
||||
crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
|
||||
delay: down 15m multiplier 1.2 max 1h
|
||||
info: average backlog size of the $family disk over the last 10 minutes
|
||||
to: silent
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
|
||||
# detect dns query failure
|
||||
|
||||
template: dns_query_time_query_time
|
||||
on: dns_query_time.query_time
|
||||
lookup: average -10s unaligned foreach *
|
||||
units: ms
|
||||
every: 10s
|
||||
warn: $this == nan
|
||||
delay: up 20s down 5m multiplier 1.5 max 1h
|
||||
info: average DNS query round trip time over the last 10 seconds
|
||||
to: sysadmin
|
||||
template: dns_query_time_query_time
|
||||
on: dns_query_time.query_time
|
||||
class: DNS
|
||||
component: DNS
|
||||
type: Latency
|
||||
lookup: average -10s unaligned foreach *
|
||||
units: ms
|
||||
every: 10s
|
||||
warn: $this == nan
|
||||
delay: up 20s down 5m multiplier 1.5 max 1h
|
||||
info: average DNS query round trip time over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
# dhcp-range utilization
|
||||
|
||||
template: dnsmasq_dhcp_dhcp_range_utilization
|
||||
on: dnsmasq_dhcp.dhcp_range_utilization
|
||||
every: 10s
|
||||
units: %
|
||||
calc: $used
|
||||
warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
|
||||
crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
|
||||
delay: down 5m
|
||||
info: DHCP range utilization
|
||||
to: sysadmin
|
||||
template: dnsmasq_dhcp_dhcp_range_utilization
|
||||
on: dnsmasq_dhcp.dhcp_range_utilization
|
||||
class: DHCP
|
||||
component: Dnsmasq
|
||||
type: Utilization
|
||||
every: 10s
|
||||
units: %
|
||||
calc: $used
|
||||
warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
|
||||
crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
|
||||
delay: down 5m
|
||||
info: DHCP range utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,8 +1,11 @@
|
|||
template: docker_unhealthy_containers
|
||||
on: docker.unhealthy_containers
|
||||
units: unhealthy containers
|
||||
every: 10s
|
||||
lookup: average -10s
|
||||
crit: $this > 0
|
||||
info: average number of unhealthy docker containers over the last 10 seconds
|
||||
to: sysadmin
|
||||
template: docker_unhealthy_containers
|
||||
on: docker.unhealthy_containers
|
||||
class: Containers
|
||||
component: Docker
|
||||
type: Errors
|
||||
units: unhealthy containers
|
||||
every: 10s
|
||||
lookup: average -10s
|
||||
crit: $this > 0
|
||||
info: average number of unhealthy docker containers over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
|
||||
# make sure elasticsearch is running
|
||||
|
||||
template: elasticsearch_last_collected
|
||||
on: elasticsearch.cluster_health_status
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: elasticsearch_last_collected
|
||||
on: elasticsearch.cluster_health_status
|
||||
class: Search engine
|
||||
component: Elasticsearch
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
|
|
@ -3,14 +3,17 @@
|
|||
# the alarm is checked every 5 minutes
|
||||
# and examines the last 5 minutes of data
|
||||
|
||||
alarm: lowest_entropy
|
||||
on: system.entropy
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: min -5m unaligned
|
||||
units: entries
|
||||
every: 5m
|
||||
warn: $this < (($status >= $WARNING) ? (200) : (100))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: minimum number of entries in the random numbers pool in the last 5 minutes
|
||||
to: silent
|
||||
alarm: lowest_entropy
|
||||
on: system.entropy
|
||||
class: System
|
||||
component: Cryptography
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: min -5m unaligned
|
||||
units: entries
|
||||
every: 5m
|
||||
warn: $this < (($status >= $WARNING) ? (200) : (100))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: minimum number of entries in the random numbers pool in the last 5 minutes
|
||||
to: silent
|
||||
|
|
|
@ -11,13 +11,16 @@ families: *
|
|||
info: number of seconds since the last successful buffering of exporting data
|
||||
to: dba
|
||||
|
||||
template: exporting_metrics_sent
|
||||
families: *
|
||||
on: exporting_data_size
|
||||
units: %
|
||||
calc: abs($sent) * 100 / abs($buffered)
|
||||
every: 10s
|
||||
warn: $this != 100
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: percentage of metrics sent to the external database server
|
||||
to: dba
|
||||
template: exporting_metrics_sent
|
||||
families: *
|
||||
on: exporting_data_size
|
||||
class: Netdata
|
||||
component: Exporting engine
|
||||
type: Workload
|
||||
units: %
|
||||
calc: abs($sent) * 100 / abs($buffered)
|
||||
every: 10s
|
||||
warn: $this != 100
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: percentage of metrics sent to the external database server
|
||||
to: dba
|
||||
|
|
|
@ -1,52 +1,64 @@
|
|||
|
||||
template: fping_last_collected_secs
|
||||
families: *
|
||||
on: fping.latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: fping_last_collected_secs
|
||||
families: *
|
||||
on: fping.latency
|
||||
class: Other
|
||||
component: Network
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
||||
template: fping_host_reachable
|
||||
families: *
|
||||
on: fping.latency
|
||||
calc: $average != nan
|
||||
units: up/down
|
||||
every: 10s
|
||||
crit: $this == 0
|
||||
delay: down 30m multiplier 1.5 max 2h
|
||||
info: reachability status of the network host (0: unreachable, 1: reachable)
|
||||
to: sysadmin
|
||||
template: fping_host_reachable
|
||||
families: *
|
||||
on: fping.latency
|
||||
class: Other
|
||||
component: Network
|
||||
type: Errors
|
||||
calc: $average != nan
|
||||
units: up/down
|
||||
every: 10s
|
||||
crit: $this == 0
|
||||
delay: down 30m multiplier 1.5 max 2h
|
||||
info: reachability status of the network host (0: unreachable, 1: reachable)
|
||||
to: sysadmin
|
||||
|
||||
template: fping_host_latency
|
||||
families: *
|
||||
on: fping.latency
|
||||
lookup: average -10s unaligned of average
|
||||
units: ms
|
||||
every: 10s
|
||||
green: 500
|
||||
red: 1000
|
||||
warn: $this > $green OR $max > $red
|
||||
crit: $this > $red
|
||||
delay: down 30m multiplier 1.5 max 2h
|
||||
info: average latency to the network host over the last 10 seconds
|
||||
to: sysadmin
|
||||
template: fping_host_latency
|
||||
families: *
|
||||
on: fping.latency
|
||||
class: Other
|
||||
component: Network
|
||||
type: Latency
|
||||
lookup: average -10s unaligned of average
|
||||
units: ms
|
||||
every: 10s
|
||||
green: 500
|
||||
red: 1000
|
||||
warn: $this > $green OR $max > $red
|
||||
crit: $this > $red
|
||||
delay: down 30m multiplier 1.5 max 2h
|
||||
info: average latency to the network host over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
||||
template: fping_packet_loss
|
||||
families: *
|
||||
on: fping.quality
|
||||
lookup: average -10m unaligned of returned
|
||||
calc: 100 - $this
|
||||
green: 1
|
||||
red: 10
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > $green
|
||||
crit: $this > $red
|
||||
delay: down 30m multiplier 1.5 max 2h
|
||||
info: packet loss ratio to the network host over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: fping_packet_loss
|
||||
families: *
|
||||
on: fping.quality
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
lookup: average -10m unaligned of returned
|
||||
calc: 100 - $this
|
||||
green: 1
|
||||
red: 10
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > $green
|
||||
crit: $this > $red
|
||||
delay: down 30m multiplier 1.5 max 2h
|
||||
info: packet loss ratio to the network host over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
template: fronius_last_collected_secs
|
||||
families: *
|
||||
on: fronius.power
|
||||
calc: $now - $last_collected_t
|
||||
every: 10s
|
||||
units: seconds ago
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sitemgr
|
||||
template: fronius_last_collected_secs
|
||||
families: *
|
||||
on: fronius.power
|
||||
class: Power Supply
|
||||
component: Solar
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
every: 10s
|
||||
units: seconds ago
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sitemgr
|
||||
|
|
|
@ -1,22 +1,28 @@
|
|||
# make sure Gearman is running
|
||||
template: gearman_last_collected_secs
|
||||
on: gearman.total_jobs
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: gearman_last_collected_secs
|
||||
on: gearman.total_jobs
|
||||
class: Computing
|
||||
component: Gearman
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
||||
template: gearman_workers_queued
|
||||
on: gearman.single_job
|
||||
lookup: average -10m unaligned match-names of Queued
|
||||
units: workers
|
||||
every: 10s
|
||||
warn: $this > 30000
|
||||
crit: $this > 100000
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average number of queued jobs over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: gearman_workers_queued
|
||||
on: gearman.single_job
|
||||
class: Computing
|
||||
component: Gearman
|
||||
type: Latency
|
||||
lookup: average -10m unaligned match-names of Queued
|
||||
units: workers
|
||||
every: 10s
|
||||
warn: $this > 30000
|
||||
crit: $this > 100000
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average number of queued jobs over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,27 +1,36 @@
|
|||
template: haproxy_backend_server_status
|
||||
on: haproxy_hs.down
|
||||
units: failed servers
|
||||
every: 10s
|
||||
lookup: average -10s
|
||||
crit: $this > 0
|
||||
info: average number of failed haproxy backend servers over the last 10 seconds
|
||||
to: sysadmin
|
||||
template: haproxy_backend_server_status
|
||||
on: haproxy_hs.down
|
||||
class: Web Proxy
|
||||
component: HAProxy
|
||||
type: Errors
|
||||
units: failed servers
|
||||
every: 10s
|
||||
lookup: average -10s
|
||||
crit: $this > 0
|
||||
info: average number of failed haproxy backend servers over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
||||
template: haproxy_backend_status
|
||||
on: haproxy_hb.down
|
||||
units: failed backend
|
||||
every: 10s
|
||||
lookup: average -10s
|
||||
crit: $this > 0
|
||||
info: average number of failed haproxy backends over the last 10 seconds
|
||||
to: sysadmin
|
||||
template: haproxy_backend_status
|
||||
on: haproxy_hb.down
|
||||
class: Web Proxy
|
||||
component: HAProxy
|
||||
type: Errors
|
||||
units: failed backend
|
||||
every: 10s
|
||||
lookup: average -10s
|
||||
crit: $this > 0
|
||||
info: average number of failed haproxy backends over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
||||
template: haproxy_last_collected
|
||||
on: haproxy_hb.down
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: haproxy_last_collected
|
||||
on: haproxy_hb.down
|
||||
class: Web Proxy
|
||||
component: HAProxy
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,75 +1,93 @@
|
|||
|
||||
# make sure hdfs is running
|
||||
|
||||
template: hdfs_last_collected_secs
|
||||
on: hdfs.heap_memory
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: hdfs_last_collected_secs
|
||||
on: hdfs.heap_memory
|
||||
class: Storage
|
||||
component: HDFS
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
|
||||
# Common
|
||||
|
||||
template: hdfs_capacity_usage
|
||||
on: hdfs.capacity
|
||||
calc: ($used) * 100 / ($used + $remaining)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (80) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: summary datanodes space capacity utilization
|
||||
to: sysadmin
|
||||
template: hdfs_capacity_usage
|
||||
on: hdfs.capacity
|
||||
class: Storage
|
||||
component: HDFS
|
||||
type: Utilization
|
||||
calc: ($used) * 100 / ($used + $remaining)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (80) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: summary datanodes space capacity utilization
|
||||
to: sysadmin
|
||||
|
||||
|
||||
# NameNode
|
||||
|
||||
template: hdfs_missing_blocks
|
||||
on: hdfs.blocks
|
||||
calc: $missing
|
||||
units: missing blocks
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of missing blocks
|
||||
to: sysadmin
|
||||
template: hdfs_missing_blocks
|
||||
on: hdfs.blocks
|
||||
class: Storage
|
||||
component: HDFS
|
||||
type: Errors
|
||||
calc: $missing
|
||||
units: missing blocks
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of missing blocks
|
||||
to: sysadmin
|
||||
|
||||
|
||||
template: hdfs_stale_nodes
|
||||
on: hdfs.data_nodes
|
||||
calc: $stale
|
||||
units: dead nodes
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of datanodes marked stale due to delayed heartbeat
|
||||
to: sysadmin
|
||||
template: hdfs_stale_nodes
|
||||
on: hdfs.data_nodes
|
||||
class: Storage
|
||||
component: HDFS
|
||||
type: Errors
|
||||
calc: $stale
|
||||
units: stale nodes
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of datanodes marked stale due to delayed heartbeat
|
||||
to: sysadmin
|
||||
|
||||
|
||||
template: hdfs_dead_nodes
|
||||
on: hdfs.data_nodes
|
||||
calc: $dead
|
||||
units: dead nodes
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of datanodes which are currently dead
|
||||
to: sysadmin
|
||||
template: hdfs_dead_nodes
|
||||
on: hdfs.data_nodes
|
||||
class: Storage
|
||||
component: HDFS
|
||||
type: Errors
|
||||
calc: $dead
|
||||
units: dead nodes
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of datanodes which are currently dead
|
||||
to: sysadmin
|
||||
|
||||
|
||||
# DataNode
|
||||
|
||||
template: hdfs_num_failed_volumes
|
||||
on: hdfs.num_failed_volumes
|
||||
calc: $fsds_num_failed_volumes
|
||||
units: failed volumes
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of failed volumes
|
||||
to: sysadmin
|
||||
template: hdfs_num_failed_volumes
|
||||
on: hdfs.num_failed_volumes
|
||||
class: Storage
|
||||
component: HDFS
|
||||
type: Errors
|
||||
calc: $fsds_num_failed_volumes
|
||||
units: failed volumes
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of failed volumes
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,99 +1,126 @@
|
|||
template: httpcheck_last_collected_secs
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
calc: $now - $last_collected_t
|
||||
every: 10s
|
||||
units: seconds ago
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: httpcheck_last_collected_secs
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
class: Other
|
||||
component: HTTP endpoint
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
every: 10s
|
||||
units: seconds ago
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
||||
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
|
||||
template: httpcheck_web_service_up
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
lookup: average -1m unaligned percentage of success
|
||||
calc: ($this < 75) ? (0) : ($this)
|
||||
every: 5s
|
||||
units: up/down
|
||||
info: average ratio of successful HTTP requests over the last minute (at least 75%)
|
||||
to: silent
|
||||
template: httpcheck_web_service_up
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
class: Web Server
|
||||
component: HTTP endpoint
|
||||
type: Utilization
|
||||
lookup: average -1m unaligned percentage of success
|
||||
calc: ($this < 75) ? (0) : ($this)
|
||||
every: 5s
|
||||
units: up/down
|
||||
info: average ratio of successful HTTP requests over the last minute (at least 75%)
|
||||
to: silent
|
||||
|
||||
template: httpcheck_web_service_bad_content
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
lookup: average -5m unaligned percentage of bad_content
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this >= 10 AND $this < 40
|
||||
crit: $this >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average ratio of HTTP responses with unexpected content over the last 5 minutes
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
template: httpcheck_web_service_bad_content
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
class: Web Server
|
||||
component: HTTP endpoint
|
||||
type: Workload
|
||||
lookup: average -5m unaligned percentage of bad_content
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this >= 10 AND $this < 40
|
||||
crit: $this >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average ratio of HTTP responses with unexpected content over the last 5 minutes
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
|
||||
template: httpcheck_web_service_bad_status
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
lookup: average -5m unaligned percentage of bad_status
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this >= 10 AND $this < 40
|
||||
crit: $this >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average ratio of HTTP responses with unexpected status over the last 5 minutes
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
template: httpcheck_web_service_bad_status
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
class: Web Server
|
||||
component: HTTP endpoint
|
||||
type: Workload
|
||||
lookup: average -5m unaligned percentage of bad_status
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this >= 10 AND $this < 40
|
||||
crit: $this >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average ratio of HTTP responses with unexpected status over the last 5 minutes
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
|
||||
template: httpcheck_web_service_timeouts
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
lookup: average -5m unaligned percentage of timeout
|
||||
every: 10s
|
||||
units: %
|
||||
info: average ratio of HTTP request timeouts over the last 5 minutes
|
||||
template: httpcheck_web_service_timeouts
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
class: Web Server
|
||||
component: HTTP endpoint
|
||||
type: Latency
|
||||
lookup: average -5m unaligned percentage of timeout
|
||||
every: 10s
|
||||
units: %
|
||||
info: average ratio of HTTP request timeouts over the last 5 minutes
|
||||
|
||||
template: httpcheck_no_web_service_connections
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
lookup: average -5m unaligned percentage of no_connection
|
||||
every: 10s
|
||||
units: %
|
||||
info: average ratio of failed requests during the last 5 minutes
|
||||
template: httpcheck_no_web_service_connections
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
class: Other
|
||||
component: HTTP endpoint
|
||||
type: Errors
|
||||
lookup: average -5m unaligned percentage of no_connection
|
||||
every: 10s
|
||||
units: %
|
||||
info: average ratio of failed requests during the last 5 minutes
|
||||
|
||||
# combined timeout & no connection alarm
|
||||
template: httpcheck_web_service_unreachable
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40)
|
||||
crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
template: httpcheck_web_service_unreachable
|
||||
families: *
|
||||
on: httpcheck.status
|
||||
class: Web Server
|
||||
component: HTTP endpoint
|
||||
type: Errors
|
||||
calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40)
|
||||
crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
|
||||
template: httpcheck_1h_web_service_response_time
|
||||
families: *
|
||||
on: httpcheck.responsetime
|
||||
lookup: average -1h unaligned of time
|
||||
every: 30s
|
||||
units: ms
|
||||
info: average HTTP response time over the last hour
|
||||
template: httpcheck_1h_web_service_response_time
|
||||
families: *
|
||||
on: httpcheck.responsetime
|
||||
class: Other
|
||||
component: HTTP endpoint
|
||||
type: Latency
|
||||
lookup: average -1h unaligned of time
|
||||
every: 30s
|
||||
units: ms
|
||||
info: average HTTP response time over the last hour
|
||||
|
||||
template: httpcheck_web_service_slow
|
||||
families: *
|
||||
on: httpcheck.responsetime
|
||||
lookup: average -3m unaligned of time
|
||||
units: ms
|
||||
every: 10s
|
||||
warn: ($this > ($httpcheck_1h_web_service_response_time * 2) )
|
||||
crit: ($this > ($httpcheck_1h_web_service_response_time * 3) )
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average HTTP response time over the last 3 minutes, compared to the average over the last hour
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
template: httpcheck_web_service_slow
|
||||
families: *
|
||||
on: httpcheck.responsetime
|
||||
class: Web Server
|
||||
component: HTTP endpoint
|
||||
type: Latency
|
||||
lookup: average -3m unaligned of time
|
||||
units: ms
|
||||
every: 10s
|
||||
warn: ($this > ($httpcheck_1h_web_service_response_time * 2) )
|
||||
crit: ($this > ($httpcheck_1h_web_service_response_time * 3) )
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average HTTP response time over the last 3 minutes, compared to the average over the last hour
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
|
|
|
@ -1,13 +1,16 @@
|
|||
template: ioping_disk_latency
|
||||
families: *
|
||||
on: ioping.latency
|
||||
lookup: average -10s unaligned of average
|
||||
units: ms
|
||||
every: 10s
|
||||
green: 500
|
||||
red: 1000
|
||||
warn: $this > $green OR $max > $red
|
||||
crit: $this > $red
|
||||
delay: down 30m multiplier 1.5 max 2h
|
||||
info: average I/O latency over the last 10 seconds
|
||||
to: sysadmin
|
||||
template: ioping_disk_latency
|
||||
families: *
|
||||
on: ioping.latency
|
||||
class: System
|
||||
component: Disk
|
||||
type: Latency
|
||||
lookup: average -10s unaligned of average
|
||||
units: ms
|
||||
every: 10s
|
||||
green: 500
|
||||
red: 1000
|
||||
warn: $this > $green OR $max > $red
|
||||
crit: $this > $red
|
||||
delay: down 30m multiplier 1.5 max 2h
|
||||
info: average I/O latency over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,28 +1,34 @@
|
|||
|
||||
# you can disable an alarm notification by setting the 'to' line to: silent
|
||||
|
||||
alarm: semaphores_used
|
||||
on: system.ipc_semaphores
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: $semaphores * 100 / $ipc_semaphores_max
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (70) : (90))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: IPC semaphore utilization
|
||||
to: sysadmin
|
||||
alarm: semaphores_used
|
||||
on: system.ipc_semaphores
|
||||
class: System
|
||||
component: IPC
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: $semaphores * 100 / $ipc_semaphores_max
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (70) : (90))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: IPC semaphore utilization
|
||||
to: sysadmin
|
||||
|
||||
alarm: semaphore_arrays_used
|
||||
on: system.ipc_semaphore_arrays
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: $arrays * 100 / $ipc_semaphores_arrays_max
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (70) : (90))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: IPC semaphore arrays utilization
|
||||
to: sysadmin
|
||||
alarm: semaphore_arrays_used
|
||||
on: system.ipc_semaphore_arrays
|
||||
class: System
|
||||
component: IPC
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: $arrays * 100 / $ipc_semaphores_arrays_max
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (70) : (90))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: IPC semaphore arrays utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
|
||||
template: ipfs_datastore_usage
|
||||
on: ipfs.repo_size
|
||||
calc: $size * 100 / $avail
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: IPFS datastore utilization
|
||||
to: sysadmin
|
||||
template: ipfs_datastore_usage
|
||||
on: ipfs.repo_size
|
||||
class: Data Sharing
|
||||
component: IPFS
|
||||
type: Utilization
|
||||
calc: $size * 100 / $avail
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: IPFS datastore utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,20 +1,26 @@
|
|||
alarm: ipmi_sensors_states
|
||||
on: ipmi.sensors_states
|
||||
calc: $warning + $critical
|
||||
units: sensors
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
crit: $critical > 0
|
||||
delay: up 5m down 15m multiplier 1.5 max 1h
|
||||
info: number of IPMI sensors in non-nominal state
|
||||
to: sysadmin
|
||||
alarm: ipmi_sensors_states
|
||||
on: ipmi.sensors_states
|
||||
class: System
|
||||
component: IPMI
|
||||
type: Errors
|
||||
calc: $warning + $critical
|
||||
units: sensors
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
crit: $critical > 0
|
||||
delay: up 5m down 15m multiplier 1.5 max 1h
|
||||
info: number of IPMI sensors in non-nominal state
|
||||
to: sysadmin
|
||||
|
||||
alarm: ipmi_events
|
||||
on: ipmi.events
|
||||
calc: $events
|
||||
units: events
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 5m down 15m multiplier 1.5 max 1h
|
||||
info: number of events in the IPMI System Event Log (SEL)
|
||||
to: sysadmin
|
||||
alarm: ipmi_events
|
||||
on: ipmi.events
|
||||
class: System
|
||||
component: IPMI
|
||||
type: Utilization
|
||||
calc: $events
|
||||
units: events
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 5m down 15m multiplier 1.5 max 1h
|
||||
info: number of events in the IPMI System Event Log (SEL)
|
||||
to: sysadmin
|
||||
|
|
|
@ -4,39 +4,48 @@
|
|||
|
||||
# True (1) if the node is experiencing a configuration-related error, false (0) otherwise.
|
||||
|
||||
template: kubelet_node_config_error
|
||||
on: k8s_kubelet.kubelet_node_config_error
|
||||
calc: $kubelet_node_config_error
|
||||
units: bool
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: the node is experiencing a configuration-related error (0: false, 1: true)
|
||||
to: sysadmin
|
||||
template: kubelet_node_config_error
|
||||
on: k8s_kubelet.kubelet_node_config_error
|
||||
class: Kubernetes
|
||||
component: Kubelet
|
||||
type: Errors
|
||||
calc: $kubelet_node_config_error
|
||||
units: bool
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: the node is experiencing a configuration-related error (0: false, 1: true)
|
||||
to: sysadmin
|
||||
|
||||
# Failed Token() requests to the alternate token source
|
||||
|
||||
template: kubelet_token_requests
|
||||
lookup: sum -10s of token_fail_count
|
||||
on: k8s_kubelet.kubelet_token_requests
|
||||
units: failed requests
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: number of failed Token() requests to the alternate token source
|
||||
to: sysadmin
|
||||
template: kubelet_token_requests
|
||||
lookup: sum -10s of token_fail_count
|
||||
on: k8s_kubelet.kubelet_token_requests
|
||||
class: Kubernetes
|
||||
component: Kubelet
|
||||
type: Errors
|
||||
units: failed requests
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: number of failed Token() requests to the alternate token source
|
||||
to: sysadmin
|
||||
|
||||
# Docker and runtime operation errors
|
||||
|
||||
template: kubelet_operations_error
|
||||
lookup: sum -1m
|
||||
on: k8s_kubelet.kubelet_operations_errors
|
||||
units: errors
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (20))
|
||||
delay: up 30s down 1m multiplier 1.5 max 2h
|
||||
info: number of Docker or runtime operation errors
|
||||
to: sysadmin
|
||||
template: kubelet_operations_error
|
||||
lookup: sum -1m
|
||||
on: k8s_kubelet.kubelet_operations_errors
|
||||
class: Kubernetes
|
||||
component: Kubelet
|
||||
type: Errors
|
||||
units: errors
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (20))
|
||||
delay: up 30s down 1m multiplier 1.5 max 2h
|
||||
info: number of Docker or runtime operation errors
|
||||
to: sysadmin
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
@ -53,66 +62,84 @@
|
|||
|
||||
# quantile 0.5
|
||||
|
||||
template: kubelet_1m_pleg_relist_latency_quantile_05
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
|
||||
units: microseconds
|
||||
every: 10s
|
||||
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
|
||||
template: kubelet_1m_pleg_relist_latency_quantile_05
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
class: Kubernetes
|
||||
component: Kubelet
|
||||
type: Latency
|
||||
lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
|
||||
units: microseconds
|
||||
every: 10s
|
||||
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
|
||||
|
||||
template: kubelet_10s_pleg_relist_latency_quantile_05
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
|
||||
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this > (($status >= $WARNING)?(100):(200))
|
||||
crit: $this > (($status >= $WARNING)?(200):(400))
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
compared to the last minute (quantile 0.5)
|
||||
to: sysadmin
|
||||
template: kubelet_10s_pleg_relist_latency_quantile_05
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
class: Kubernetes
|
||||
component: Kubelet
|
||||
type: Latency
|
||||
lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
|
||||
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this > (($status >= $WARNING)?(100):(200))
|
||||
crit: $this > (($status >= $WARNING)?(200):(400))
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
compared to the last minute (quantile 0.5)
|
||||
to: sysadmin
|
||||
|
||||
# quantile 0.9
|
||||
|
||||
template: kubelet_1m_pleg_relist_latency_quantile_09
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
|
||||
units: microseconds
|
||||
every: 10s
|
||||
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
|
||||
template: kubelet_1m_pleg_relist_latency_quantile_09
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
class: Kubernetes
|
||||
component: Kubelet
|
||||
type: Latency
|
||||
lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
|
||||
units: microseconds
|
||||
every: 10s
|
||||
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
|
||||
|
||||
template: kubelet_10s_pleg_relist_latency_quantile_09
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
|
||||
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this > (($status >= $WARNING)?(200):(400))
|
||||
crit: $this > (($status >= $WARNING)?(400):(800))
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
compared to the last minute (quantile 0.9)
|
||||
to: sysadmin
|
||||
template: kubelet_10s_pleg_relist_latency_quantile_09
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
class: Kubernetes
|
||||
component: Kubelet
|
||||
type: Latency
|
||||
lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
|
||||
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this > (($status >= $WARNING)?(200):(400))
|
||||
crit: $this > (($status >= $WARNING)?(400):(800))
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
compared to the last minute (quantile 0.9)
|
||||
to: sysadmin
|
||||
|
||||
# quantile 0.99
|
||||
|
||||
template: kubelet_1m_pleg_relist_latency_quantile_099
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
|
||||
units: microseconds
|
||||
every: 10s
|
||||
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
|
||||
template: kubelet_1m_pleg_relist_latency_quantile_099
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
class: Kubernetes
|
||||
component: Kubelet
|
||||
type: Latency
|
||||
lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
|
||||
units: microseconds
|
||||
every: 10s
|
||||
info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
|
||||
|
||||
template: kubelet_10s_pleg_relist_latency_quantile_099
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
|
||||
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this > (($status >= $WARNING)?(400):(800))
|
||||
crit: $this > (($status >= $WARNING)?(800):(1200))
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
compared to the last minute (quantile 0.99)
|
||||
to: sysadmin
|
||||
template: kubelet_10s_pleg_relist_latency_quantile_099
|
||||
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
|
||||
class: Kubernetes
|
||||
component: Kubelet
|
||||
type: Latency
|
||||
lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
|
||||
calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this > (($status >= $WARNING)?(400):(800))
|
||||
crit: $this > (($status >= $WARNING)?(800):(1200))
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
compared to the last minute (quantile 0.99)
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
|
||||
# make sure lighttpd is running
|
||||
|
||||
template: lighttpd_last_collected_secs
|
||||
on: lighttpd.requests
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: lighttpd_last_collected_secs
|
||||
on: lighttpd.requests
|
||||
class: Web Server
|
||||
component: Lighttpd
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
|
|
|
@ -1,12 +1,15 @@
|
|||
# Alert on low battery capacity.
|
||||
|
||||
template: linux_power_supply_capacity
|
||||
on: powersupply.capacity
|
||||
calc: $capacity
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this < 10
|
||||
crit: $this < 5
|
||||
delay: up 30s down 5m multiplier 1.2 max 1h
|
||||
info: percentage of remaining power supply capacity
|
||||
to: sysadmin
|
||||
template: linux_power_supply_capacity
|
||||
on: powersupply.capacity
|
||||
class: Power Supply
|
||||
component: Battery
|
||||
type: Utilization
|
||||
calc: $capacity
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this < 10
|
||||
crit: $this < 5
|
||||
delay: up 30s down 5m multiplier 1.2 max 1h
|
||||
info: percentage of remaining power supply capacity
|
||||
to: sysadmin
|
||||
|
|
|
@ -4,51 +4,63 @@
|
|||
# Calculate the base trigger point for the load average alarms.
|
||||
# This is the maximum number of CPUs in the system over the past 1
|
||||
# minute, with a special case for a single CPU of setting the trigger at 2.
|
||||
alarm: load_cpu_number
|
||||
on: system.load
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
|
||||
units: cpus
|
||||
every: 1m
|
||||
info: number of active CPU cores in the system
|
||||
alarm: load_cpu_number
|
||||
on: system.load
|
||||
class: System
|
||||
component: Load
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
|
||||
units: cpus
|
||||
every: 1m
|
||||
info: number of active CPU cores in the system
|
||||
|
||||
# Send alarms if the load average is unusually high.
|
||||
# These intentionally _do not_ calculate the average over the sampled
|
||||
# time period because the values being checked already are averages.
|
||||
|
||||
alarm: load_average_15
|
||||
on: system.load
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: max -1m unaligned of load15
|
||||
units: load
|
||||
every: 1m
|
||||
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: system fifteen-minute load average
|
||||
to: sysadmin
|
||||
alarm: load_average_15
|
||||
on: system.load
|
||||
class: System
|
||||
component: Load
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: max -1m unaligned of load15
|
||||
units: load
|
||||
every: 1m
|
||||
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: system fifteen-minute load average
|
||||
to: sysadmin
|
||||
|
||||
alarm: load_average_5
|
||||
on: system.load
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: max -1m unaligned of load5
|
||||
units: load
|
||||
every: 1m
|
||||
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: system five-minute load average
|
||||
to: sysadmin
|
||||
alarm: load_average_5
|
||||
on: system.load
|
||||
class: System
|
||||
component: Load
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: max -1m unaligned of load5
|
||||
units: load
|
||||
every: 1m
|
||||
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: system five-minute load average
|
||||
to: sysadmin
|
||||
|
||||
alarm: load_average_1
|
||||
on: system.load
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: max -1m unaligned of load1
|
||||
units: load
|
||||
every: 1m
|
||||
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: system one-minute load average
|
||||
to: sysadmin
|
||||
alarm: load_average_1
|
||||
on: system.load
|
||||
class: System
|
||||
component: Load
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: max -1m unaligned of load1
|
||||
units: load
|
||||
every: 1m
|
||||
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: system one-minute load average
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,39 +1,51 @@
|
|||
template: mdstat_last_collected
|
||||
on: md.disks
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: mdstat_last_collected
|
||||
on: md.disks
|
||||
class: System
|
||||
component: RAID
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
||||
template: mdstat_disks
|
||||
on: md.disks
|
||||
units: failed devices
|
||||
every: 10s
|
||||
calc: $down
|
||||
crit: $this > 0
|
||||
info: number of devices in the down state. \
|
||||
Any number > 0 indicates that the array is degraded.
|
||||
to: sysadmin
|
||||
template: mdstat_disks
|
||||
on: md.disks
|
||||
class: System
|
||||
component: RAID
|
||||
type: Errors
|
||||
units: failed devices
|
||||
every: 10s
|
||||
calc: $down
|
||||
crit: $this > 0
|
||||
info: number of devices in the down state for the $family array. \
|
||||
Any number > 0 indicates that the array is degraded.
|
||||
to: sysadmin
|
||||
|
||||
template: mdstat_mismatch_cnt
|
||||
on: md.mismatch_cnt
|
||||
units: unsynchronized blocks
|
||||
calc: $count
|
||||
every: 60s
|
||||
warn: $this > 1024
|
||||
delay: up 30m
|
||||
info: number of unsynchronized blocks
|
||||
to: sysadmin
|
||||
template: mdstat_mismatch_cnt
|
||||
on: md.mismatch_cnt
|
||||
class: System
|
||||
component: RAID
|
||||
type: Errors
|
||||
units: unsynchronized blocks
|
||||
calc: $count
|
||||
every: 60s
|
||||
warn: $this > 1024
|
||||
delay: up 30m
|
||||
info: number of unsynchronized blocks for the $family array
|
||||
to: sysadmin
|
||||
|
||||
template: mdstat_nonredundant_last_collected
|
||||
on: md.nonredundant
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: mdstat_nonredundant_last_collected
|
||||
on: md.nonredundant
|
||||
class: System
|
||||
component: RAID
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,56 +1,71 @@
|
|||
|
||||
## Adapters (controllers)
|
||||
|
||||
template: megacli_adapter_state
|
||||
on: megacli.adapter_degraded
|
||||
lookup: max -10s foreach *
|
||||
units: boolean
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 5m multiplier 2 max 10m
|
||||
info: adapter is in the degraded state (0: false, 1: true)
|
||||
to: sysadmin
|
||||
template: megacli_adapter_state
|
||||
on: megacli.adapter_degraded
|
||||
class: System
|
||||
component: RAID
|
||||
type: Errors
|
||||
lookup: max -10s foreach *
|
||||
units: boolean
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 5m multiplier 2 max 10m
|
||||
info: adapter is in the degraded state (0: false, 1: true)
|
||||
to: sysadmin
|
||||
|
||||
## Physical Disks
|
||||
|
||||
template: megacli_pd_predictive_failures
|
||||
on: megacli.pd_predictive_failure
|
||||
lookup: sum -10s foreach *
|
||||
units: predictive failures
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 1m down 5m multiplier 2 max 10m
|
||||
info: number of physical drive predictive failures
|
||||
to: sysadmin
|
||||
template: megacli_pd_predictive_failures
|
||||
on: megacli.pd_predictive_failure
|
||||
class: System
|
||||
component: RAID
|
||||
type: Errors
|
||||
lookup: sum -10s foreach *
|
||||
units: predictive failures
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 1m down 5m multiplier 2 max 10m
|
||||
info: number of physical drive predictive failures
|
||||
to: sysadmin
|
||||
|
||||
template: megacli_pd_media_errors
|
||||
on: megacli.pd_media_error
|
||||
lookup: sum -10s foreach *
|
||||
units: media errors
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 1m down 5m multiplier 2 max 10m
|
||||
info: number of physical drive media errors
|
||||
to: sysadmin
|
||||
template: megacli_pd_media_errors
|
||||
on: megacli.pd_media_error
|
||||
class: System
|
||||
component: RAID
|
||||
type: Errors
|
||||
lookup: sum -10s foreach *
|
||||
units: media errors
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 1m down 5m multiplier 2 max 10m
|
||||
info: number of physical drive media errors
|
||||
to: sysadmin
|
||||
|
||||
## Battery Backup Units (BBU)
|
||||
|
||||
template: megacli_bbu_relative_charge
|
||||
on: megacli.bbu_relative_charge
|
||||
lookup: average -10s
|
||||
units: percent
|
||||
every: 10s
|
||||
warn: $this <= (($status >= $WARNING) ? (85) : (80))
|
||||
crit: $this <= (($status == $CRITICAL) ? (50) : (40))
|
||||
info: average battery backup unit (BBU) relative state of charge over the last 10 seconds
|
||||
to: sysadmin
|
||||
template: megacli_bbu_relative_charge
|
||||
on: megacli.bbu_relative_charge
|
||||
class: System
|
||||
component: RAID
|
||||
type: Workload
|
||||
lookup: average -10s
|
||||
units: percent
|
||||
every: 10s
|
||||
warn: $this <= (($status >= $WARNING) ? (85) : (80))
|
||||
crit: $this <= (($status == $CRITICAL) ? (50) : (40))
|
||||
info: average battery backup unit (BBU) relative state of charge over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
||||
template: megacli_bbu_cycle_count
|
||||
on: megacli.bbu_cycle_count
|
||||
lookup: average -10s
|
||||
units: cycles
|
||||
every: 10s
|
||||
warn: $this >= 100
|
||||
crit: $this >= 500
|
||||
info: average battery backup unit (BBU) charge cycles count over the last 10 seconds
|
||||
to: sysadmin
|
||||
template: megacli_bbu_cycle_count
|
||||
on: megacli.bbu_cycle_count
|
||||
class: System
|
||||
component: RAID
|
||||
type: Workload
|
||||
lookup: average -10s
|
||||
units: cycles
|
||||
every: 10s
|
||||
warn: $this >= 100
|
||||
crit: $this >= 500
|
||||
info: average battery backup unit (BBU) charge cycles count over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,53 +1,65 @@
|
|||
|
||||
# make sure memcached is running
|
||||
|
||||
template: memcached_last_collected_secs
|
||||
on: memcached.cache
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
template: memcached_last_collected_secs
|
||||
on: memcached.cache
|
||||
class: KV Storage
|
||||
component: Memcached
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
|
||||
|
||||
# detect if memcached cache is full
|
||||
|
||||
template: memcached_cache_memory_usage
|
||||
on: memcached.cache
|
||||
calc: $used * 100 / ($used + $available)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (80) : (90))
|
||||
delay: up 0 down 15m multiplier 1.5 max 1h
|
||||
info: cache memory utilization
|
||||
to: dba
|
||||
template: memcached_cache_memory_usage
|
||||
on: memcached.cache
|
||||
class: KV Storage
|
||||
component: Memcached
|
||||
type: Utilization
|
||||
calc: $used * 100 / ($used + $available)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (80) : (90))
|
||||
delay: up 0 down 15m multiplier 1.5 max 1h
|
||||
info: cache memory utilization
|
||||
to: dba
|
||||
|
||||
|
||||
# find the rate memcached cache is filling
|
||||
|
||||
template: memcached_cache_fill_rate
|
||||
on: memcached.cache
|
||||
lookup: min -10m at -50m unaligned of available
|
||||
calc: ($this - $available) / (($now - $after) / 3600)
|
||||
units: KB/hour
|
||||
every: 1m
|
||||
info: average rate the cache fills up (positive), or frees up (negative) space over the last hour
|
||||
template: memcached_cache_fill_rate
|
||||
on: memcached.cache
|
||||
class: KV Storage
|
||||
component: Memcached
|
||||
type: Utilization
|
||||
lookup: min -10m at -50m unaligned of available
|
||||
calc: ($this - $available) / (($now - $after) / 3600)
|
||||
units: KB/hour
|
||||
every: 1m
|
||||
info: average rate the cache fills up (positive), or frees up (negative) space over the last hour
|
||||
|
||||
|
||||
# find the hours remaining until memcached cache is full
|
||||
|
||||
template: memcached_out_of_cache_space_time
|
||||
on: memcached.cache
|
||||
calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
|
||||
units: hours
|
||||
every: 10s
|
||||
warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
|
||||
crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: estimated time the cache will run out of space \
|
||||
if the system continues to add data at the same rate as the past hour
|
||||
to: dba
|
||||
template: memcached_out_of_cache_space_time
|
||||
on: memcached.cache
|
||||
class: KV Storage
|
||||
component: Memcached
|
||||
type: Utilization
|
||||
calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
|
||||
units: hours
|
||||
every: 10s
|
||||
warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
|
||||
crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: estimated time the cache will run out of space \
|
||||
if the system continues to add data at the same rate as the past hour
|
||||
to: dba
|
||||
|
|
|
@ -1,38 +1,47 @@
|
|||
|
||||
# you can disable an alarm notification by setting the 'to' line to: silent
|
||||
|
||||
alarm: 1hour_ecc_memory_correctable
|
||||
on: mem.ecc_ce
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 1h
|
||||
info: number of ECC correctable errors in the last 10 minutes
|
||||
to: sysadmin
|
||||
alarm: 1hour_ecc_memory_correctable
|
||||
on: mem.ecc_ce
|
||||
class: System
|
||||
component: Memory
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 1h
|
||||
info: number of ECC correctable errors in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
alarm: 1hour_ecc_memory_uncorrectable
|
||||
on: mem.ecc_ue
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned
|
||||
units: errors
|
||||
every: 1m
|
||||
crit: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 1h
|
||||
info: number of ECC uncorrectable errors in the last 10 minutes
|
||||
to: sysadmin
|
||||
alarm: 1hour_ecc_memory_uncorrectable
|
||||
on: mem.ecc_ue
|
||||
class: System
|
||||
component: Memory
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned
|
||||
units: errors
|
||||
every: 1m
|
||||
crit: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 1h
|
||||
info: number of ECC uncorrectable errors in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
alarm: 1hour_memory_hw_corrupted
|
||||
on: mem.hwcorrupt
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: $HardwareCorrupted
|
||||
units: MB
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 1h
|
||||
info: amount of memory corrupted due to a hardware failure
|
||||
to: sysadmin
|
||||
alarm: 1hour_memory_hw_corrupted
|
||||
on: mem.hwcorrupt
|
||||
class: System
|
||||
component: Memory
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: $HardwareCorrupted
|
||||
units: MB
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 1h
|
||||
info: amount of memory corrupted due to a hardware failure
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,13 +1,16 @@
|
|||
|
||||
# make sure mongodb is running
|
||||
|
||||
template: mongodb_last_collected_secs
|
||||
on: mongodb.read_operations
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
template: mongodb_last_collected_secs
|
||||
on: mongodb.read_operations
|
||||
class: Database
|
||||
component: MongoDB
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
|
|
|
@ -1,150 +1,186 @@
|
|||
|
||||
# make sure mysql is running
|
||||
|
||||
template: mysql_last_collected_secs
|
||||
on: mysql.queries
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
template: mysql_last_collected_secs
|
||||
on: mysql.queries
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# slow queries
|
||||
|
||||
template: mysql_10s_slow_queries
|
||||
on: mysql.queries
|
||||
lookup: sum -10s of slow_queries
|
||||
units: slow queries
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (5) : (10))
|
||||
crit: $this > (($status == $CRITICAL) ? (10) : (20))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of slow queries in the last 10 seconds
|
||||
to: dba
|
||||
template: mysql_10s_slow_queries
|
||||
on: mysql.queries
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Latency
|
||||
lookup: sum -10s of slow_queries
|
||||
units: slow queries
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (5) : (10))
|
||||
crit: $this > (($status == $CRITICAL) ? (10) : (20))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of slow queries in the last 10 seconds
|
||||
to: dba
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# lock waits
|
||||
|
||||
template: mysql_10s_table_locks_immediate
|
||||
on: mysql.table_locks
|
||||
lookup: sum -10s absolute of immediate
|
||||
units: immediate locks
|
||||
every: 10s
|
||||
info: number of table immediate locks in the last 10 seconds
|
||||
to: dba
|
||||
template: mysql_10s_table_locks_immediate
|
||||
on: mysql.table_locks
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Utilization
|
||||
lookup: sum -10s absolute of immediate
|
||||
units: immediate locks
|
||||
every: 10s
|
||||
info: number of table immediate locks in the last 10 seconds
|
||||
to: dba
|
||||
|
||||
template: mysql_10s_table_locks_waited
|
||||
on: mysql.table_locks
|
||||
lookup: sum -10s absolute of waited
|
||||
units: waited locks
|
||||
every: 10s
|
||||
info: number of table waited locks in the last 10 seconds
|
||||
to: dba
|
||||
template: mysql_10s_table_locks_waited
|
||||
on: mysql.table_locks
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Latency
|
||||
lookup: sum -10s absolute of waited
|
||||
units: waited locks
|
||||
every: 10s
|
||||
info: number of table waited locks in the last 10 seconds
|
||||
to: dba
|
||||
|
||||
template: mysql_10s_waited_locks_ratio
|
||||
on: mysql.table_locks
|
||||
calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (10) : (25))
|
||||
crit: $this > (($status == $CRITICAL) ? (25) : (50))
|
||||
delay: down 30m multiplier 1.5 max 1h
|
||||
info: ratio of waited table locks over the last 10 seconds
|
||||
to: dba
|
||||
template: mysql_10s_waited_locks_ratio
|
||||
on: mysql.table_locks
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Latency
|
||||
calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (10) : (25))
|
||||
crit: $this > (($status == $CRITICAL) ? (25) : (50))
|
||||
delay: down 30m multiplier 1.5 max 1h
|
||||
info: ratio of waited table locks over the last 10 seconds
|
||||
to: dba
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# connections
|
||||
|
||||
template: mysql_connections
|
||||
on: mysql.connections_active
|
||||
calc: $active * 100 / $limit
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (60) : (70))
|
||||
crit: $this > (($status == $CRITICAL) ? (80) : (90))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: client connections utilization
|
||||
to: dba
|
||||
template: mysql_connections
|
||||
on: mysql.connections_active
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Utilization
|
||||
calc: $active * 100 / $limit
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (60) : (70))
|
||||
crit: $this > (($status == $CRITICAL) ? (80) : (90))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: client connections utilization
|
||||
to: dba
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# replication
|
||||
|
||||
template: mysql_replication
|
||||
on: mysql.slave_status
|
||||
calc: ($sql_running <= 0 OR $io_running <= 0)?0:1
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
crit: $this == 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: replication status (0: stopped, 1: working)
|
||||
to: dba
|
||||
template: mysql_replication
|
||||
on: mysql.slave_status
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Errors
|
||||
calc: ($sql_running <= 0 OR $io_running <= 0)?0:1
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
crit: $this == 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: replication status (0: stopped, 1: working)
|
||||
to: dba
|
||||
|
||||
template: mysql_replication_lag
|
||||
on: mysql.slave_behind
|
||||
calc: $seconds
|
||||
units: seconds
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (5) : (10))
|
||||
crit: $this > (($status == $CRITICAL) ? (10) : (30))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: difference between the timestamp of the latest transaction processed by the SQL thread and \
|
||||
the timestamp of the same transaction when it was processed on the master
|
||||
to: dba
|
||||
template: mysql_replication_lag
|
||||
on: mysql.slave_behind
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Errors
|
||||
calc: $seconds
|
||||
units: seconds
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (5) : (10))
|
||||
crit: $this > (($status == $CRITICAL) ? (10) : (30))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: difference between the timestamp of the latest transaction processed by the SQL thread and \
|
||||
the timestamp of the same transaction when it was processed on the master
|
||||
to: dba
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# galera cluster size
|
||||
|
||||
template: mysql_galera_cluster_size_max_2m
|
||||
on: mysql.galera_cluster_size
|
||||
lookup: max -2m absolute
|
||||
units: nodes
|
||||
every: 10s
|
||||
info: maximum galera cluster size in the last 2 minutes
|
||||
to: dba
|
||||
template: mysql_galera_cluster_size_max_2m
|
||||
on: mysql.galera_cluster_size
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Utilization
|
||||
lookup: max -2m absolute
|
||||
units: nodes
|
||||
every: 10s
|
||||
info: maximum galera cluster size in the last 2 minutes
|
||||
to: dba
|
||||
|
||||
template: mysql_galera_cluster_size
|
||||
on: mysql.galera_cluster_size
|
||||
calc: $nodes
|
||||
units: nodes
|
||||
every: 10s
|
||||
warn: $this > $mysql_galera_cluster_size_max_2m
|
||||
crit: $this < $mysql_galera_cluster_size_max_2m
|
||||
delay: up 20s down 5m multiplier 1.5 max 1h
|
||||
info: current galera cluster size, compared to the maximum size in the last 2 minutes
|
||||
to: dba
|
||||
template: mysql_galera_cluster_size
|
||||
on: mysql.galera_cluster_size
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Utilization
|
||||
calc: $nodes
|
||||
units: nodes
|
||||
every: 10s
|
||||
warn: $this > $mysql_galera_cluster_size_max_2m
|
||||
crit: $this < $mysql_galera_cluster_size_max_2m
|
||||
delay: up 20s down 5m multiplier 1.5 max 1h
|
||||
info: current galera cluster size, compared to the maximum size in the last 2 minutes
|
||||
to: dba
|
||||
|
||||
# galera node state
|
||||
|
||||
template: mysql_galera_cluster_state
|
||||
on: mysql.galera_cluster_state
|
||||
calc: $state
|
||||
every: 10s
|
||||
warn: $this == 2 OR $this == 3
|
||||
crit: $this == 0 OR $this == 1 OR $this >= 5
|
||||
delay: up 30s down 5m multiplier 1.5 max 1h
|
||||
info: galera node state \
|
||||
(0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent)
|
||||
to: dba
|
||||
template: mysql_galera_cluster_state
|
||||
on: mysql.galera_cluster_state
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Errors
|
||||
calc: $state
|
||||
every: 10s
|
||||
warn: $this == 2 OR $this == 3
|
||||
crit: $this == 0 OR $this == 1 OR $this >= 5
|
||||
delay: up 30s down 5m multiplier 1.5 max 1h
|
||||
info: galera node state \
|
||||
(0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent)
|
||||
to: dba
|
||||
|
||||
|
||||
# galera node status
|
||||
|
||||
template: mysql_galera_cluster_status
|
||||
on: mysql.galera_cluster_status
|
||||
calc: $wsrep_cluster_status
|
||||
every: 10s
|
||||
crit: $mysql_galera_cluster_state != nan AND $this != 0
|
||||
delay: up 30s down 5m multiplier 1.5 max 1h
|
||||
info: galera node cluster component status \
|
||||
(-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \
|
||||
Any other value than primary indicates that the node is part of a nonoperational component.
|
||||
to: dba
|
||||
template: mysql_galera_cluster_status
|
||||
on: mysql.galera_cluster_status
|
||||
class: Database
|
||||
component: MySQL
|
||||
type: Errors
|
||||
calc: $wsrep_cluster_status
|
||||
every: 10s
|
||||
crit: $mysql_galera_cluster_state != nan AND $this != 0
|
||||
delay: up 30s down 5m multiplier 1.5 max 1h
|
||||
info: galera node cluster component status \
|
||||
(-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \
|
||||
Any other value than primary indicates that the node is part of a nonoperational component.
|
||||
to: dba
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
|
||||
# make sure named is running
|
||||
|
||||
template: named_last_collected_secs
|
||||
on: named.global_queries
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: domainadmin
|
||||
template: named_last_collected_secs
|
||||
on: named.global_queries
|
||||
class: DNS
|
||||
component: BIND
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: domainadmin
|
||||
|
||||
|
|
|
@ -6,16 +6,22 @@
|
|||
|
||||
template: interface_speed
|
||||
on: net.net
|
||||
class: System
|
||||
component: Network
|
||||
type: Latency
|
||||
os: *
|
||||
hosts: *
|
||||
families: *
|
||||
calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max) : ( nan )
|
||||
units: Mbit
|
||||
every: 10s
|
||||
info: network interface current speed
|
||||
info: network interface $family current speed
|
||||
|
||||
template: 1m_received_traffic_overflow
|
||||
on: net.net
|
||||
class: System
|
||||
component: Network
|
||||
type: Workload
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
|
@ -25,11 +31,14 @@
|
|||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (85) : (90))
|
||||
delay: up 1m down 1m multiplier 1.5 max 1h
|
||||
info: average inbound utilization for the network interface over the last minute
|
||||
info: average inbound utilization for the network interface $family over the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: 1m_sent_traffic_overflow
|
||||
on: net.net
|
||||
class: System
|
||||
component: Network
|
||||
type: Workload
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
|
@ -39,7 +48,7 @@
|
|||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (85) : (90))
|
||||
delay: up 1m down 1m multiplier 1.5 max 1h
|
||||
info: average outbound utilization for the network interface over the last minute
|
||||
info: average outbound utilization for the network interface $family over the last minute
|
||||
to: sysadmin
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
@ -52,110 +61,134 @@
|
|||
# it is possible to have expected packet drops on an interface for some network configurations
|
||||
# look at the Monitoring Network Interfaces section in the proc.plugin documentation for more information
|
||||
|
||||
template: inbound_packets_dropped
|
||||
on: net.drops
|
||||
os: linux
|
||||
hosts: *
|
||||
families: !net* *
|
||||
lookup: sum -10m unaligned absolute of inbound
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of inbound dropped packets for the network interface in the last 10 minutes
|
||||
template: inbound_packets_dropped
|
||||
on: net.drops
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: !net* *
|
||||
lookup: sum -10m unaligned absolute of inbound
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of inbound dropped packets for the network interface $family in the last 10 minutes
|
||||
|
||||
template: outbound_packets_dropped
|
||||
on: net.drops
|
||||
os: linux
|
||||
hosts: *
|
||||
families: !net* *
|
||||
lookup: sum -10m unaligned absolute of outbound
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of outbound dropped packets for the network interface in the last 10 minutes
|
||||
template: outbound_packets_dropped
|
||||
on: net.drops
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: !net* *
|
||||
lookup: sum -10m unaligned absolute of outbound
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of outbound dropped packets for the network interface $family in the last 10 minutes
|
||||
|
||||
template: inbound_packets_dropped_ratio
|
||||
on: net.packets
|
||||
os: linux
|
||||
hosts: *
|
||||
families: !net* !wl* *
|
||||
lookup: sum -10m unaligned absolute of received
|
||||
calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of inbound dropped packets for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: inbound_packets_dropped_ratio
|
||||
on: net.packets
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: !net* !wl* *
|
||||
lookup: sum -10m unaligned absolute of received
|
||||
calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of inbound dropped packets for the network interface $family over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: outbound_packets_dropped_ratio
|
||||
on: net.packets
|
||||
os: linux
|
||||
hosts: *
|
||||
families: !net* !wl* *
|
||||
lookup: sum -10m unaligned absolute of sent
|
||||
calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of outbound dropped packets for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: outbound_packets_dropped_ratio
|
||||
on: net.packets
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: !net* !wl* *
|
||||
lookup: sum -10m unaligned absolute of sent
|
||||
calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: wifi_inbound_packets_dropped_ratio
|
||||
on: net.packets
|
||||
os: linux
|
||||
hosts: *
|
||||
families: wl*
|
||||
lookup: sum -10m unaligned absolute of received
|
||||
calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 10
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of inbound dropped packets for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: wifi_inbound_packets_dropped_ratio
|
||||
on: net.packets
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: wl*
|
||||
lookup: sum -10m unaligned absolute of received
|
||||
calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 10
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of inbound dropped packets for the network interface $family over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: wifi_outbound_packets_dropped_ratio
|
||||
on: net.packets
|
||||
os: linux
|
||||
hosts: *
|
||||
families: wl*
|
||||
lookup: sum -10m unaligned absolute of sent
|
||||
calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 10
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of outbound dropped packets for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: wifi_outbound_packets_dropped_ratio
|
||||
on: net.packets
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: wl*
|
||||
lookup: sum -10m unaligned absolute of sent
|
||||
calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 10
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# interface errors
|
||||
|
||||
template: interface_inbound_errors
|
||||
on: net.errors
|
||||
os: freebsd
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute of inbound
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of inbound errors for the network interface in the last 10 minutes
|
||||
to: sysadmin
|
||||
template: interface_inbound_errors
|
||||
on: net.errors
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: freebsd
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute of inbound
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of inbound errors for the network interface $family in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: interface_outbound_errors
|
||||
on: net.errors
|
||||
os: freebsd
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute of outbound
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of outbound errors for the network interface in the last 10 minutes
|
||||
to: sysadmin
|
||||
template: interface_outbound_errors
|
||||
on: net.errors
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: freebsd
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute of outbound
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of outbound errors for the network interface $family in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# FIFO errors
|
||||
|
@ -165,18 +198,21 @@ families: *
|
|||
# the alarm is checked every 1 minute
|
||||
# and examines the last 10 minutes of data
|
||||
|
||||
template: 10min_fifo_errors
|
||||
on: net.fifo
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of FIFO errors for the network interface in the last 10 minutes
|
||||
to: sysadmin
|
||||
template: 10min_fifo_errors
|
||||
on: net.fifo
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of FIFO errors for the network interface $family in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# check for packet storms
|
||||
|
@ -187,28 +223,34 @@ families: *
|
|||
# we assume the minimum packet storm should at least have
|
||||
# 10000 packets/s, average of the last 10 seconds
|
||||
|
||||
template: 1m_received_packets_rate
|
||||
on: net.packets
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -1m unaligned of received
|
||||
units: packets
|
||||
every: 10s
|
||||
info: average number of packets received by the network interface over the last minute
|
||||
template: 1m_received_packets_rate
|
||||
on: net.packets
|
||||
class: System
|
||||
component: Network
|
||||
type: Workload
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -1m unaligned of received
|
||||
units: packets
|
||||
every: 10s
|
||||
info: average number of packets received by the network interface $family over the last minute
|
||||
|
||||
template: 10s_received_packets_storm
|
||||
on: net.packets
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10s unaligned of received
|
||||
calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this > (($status >= $WARNING)?(200):(5000))
|
||||
crit: $this > (($status == $CRITICAL)?(5000):(6000))
|
||||
options: no-clear-notification
|
||||
info: ratio of average number of received packets for the network interface over the last 10 seconds, \
|
||||
compared to the rate over the last minute
|
||||
to: sysadmin
|
||||
template: 10s_received_packets_storm
|
||||
on: net.packets
|
||||
class: System
|
||||
component: Network
|
||||
type: Workload
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: average -10s unaligned of received
|
||||
calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this > (($status >= $WARNING)?(200):(5000))
|
||||
crit: $this > (($status == $CRITICAL)?(5000):(6000))
|
||||
options: no-clear-notification
|
||||
info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
|
||||
compared to the rate over the last minute
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,16 +1,19 @@
|
|||
|
||||
# you can disable an alarm notification by setting the 'to' line to: silent
|
||||
|
||||
alarm: netfilter_conntrack_full
|
||||
on: netfilter.conntrack_sockets
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: max -10s unaligned of connections
|
||||
calc: $this * 100 / $netfilter_conntrack_max
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (85) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (95))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: netfilter connection tracker table size utilization
|
||||
to: sysadmin
|
||||
alarm: netfilter_conntrack_full
|
||||
on: netfilter.conntrack_sockets
|
||||
class: System
|
||||
component: Network
|
||||
type: Workload
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: max -10s unaligned of connections
|
||||
calc: $this * 100 / $netfilter_conntrack_max
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (85) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (95))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: netfilter connection tracker table size utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
|
||||
# make sure nginx is running
|
||||
|
||||
template: nginx_last_collected_secs
|
||||
on: nginx.requests
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: nginx_last_collected_secs
|
||||
on: nginx.requests
|
||||
class: Web Server
|
||||
component: NGINX
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
|
||||
# make sure nginx_plus is running
|
||||
|
||||
template: nginx_plus_last_collected_secs
|
||||
on: nginx_plus.requests_total
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: nginx_plus_last_collected_secs
|
||||
on: nginx_plus.requests_total
|
||||
class: Web Server
|
||||
component: NGINX Plus
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
|
||||
# make sure phpfpm is running
|
||||
|
||||
template: phpfpm_last_collected_secs
|
||||
on: phpfpm.requests
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: phpfpm_last_collected_secs
|
||||
on: phpfpm.requests
|
||||
class: Web Server
|
||||
component: PHP-FPM
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
|
|
|
@ -1,65 +1,80 @@
|
|||
|
||||
# Make sure Pi-hole is responding.
|
||||
|
||||
template: pihole_last_collected_secs
|
||||
on: pihole.dns_queries_total
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: pihole_last_collected_secs
|
||||
on: pihole.dns_queries_total
|
||||
class: Ad Filtering
|
||||
component: Pi-hole
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
# Blocked DNS queries.
|
||||
|
||||
template: pihole_blocked_queries
|
||||
on: pihole.dns_queries_percentage
|
||||
every: 10s
|
||||
units: %
|
||||
calc: $blocked
|
||||
warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
|
||||
crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) )
|
||||
delay: up 2m down 5m
|
||||
info: percentage of blocked DNS queries over the last 24 hours
|
||||
to: sysadmin
|
||||
template: pihole_blocked_queries
|
||||
on: pihole.dns_queries_percentage
|
||||
class: Ad Filtering
|
||||
component: Pi-hole
|
||||
type: Errors
|
||||
every: 10s
|
||||
units: %
|
||||
calc: $blocked
|
||||
warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
|
||||
crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) )
|
||||
delay: up 2m down 5m
|
||||
info: percentage of blocked DNS queries over the last 24 hours
|
||||
to: sysadmin
|
||||
|
||||
|
||||
# Blocklist last update time.
|
||||
# Default update interval is a week.
|
||||
|
||||
template: pihole_blocklist_last_update
|
||||
on: pihole.blocklist_last_update
|
||||
every: 10s
|
||||
units: seconds
|
||||
calc: $ago
|
||||
warn: $this > 60 * 60 * 24 * 8
|
||||
crit: $this > 60 * 60 * 24 * 8 * 2
|
||||
info: gravity.list (blocklist) file last update time
|
||||
to: sysadmin
|
||||
template: pihole_blocklist_last_update
|
||||
on: pihole.blocklist_last_update
|
||||
class: Ad Filtering
|
||||
component: Pi-hole
|
||||
type: Errors
|
||||
every: 10s
|
||||
units: seconds
|
||||
calc: $ago
|
||||
warn: $this > 60 * 60 * 24 * 8
|
||||
crit: $this > 60 * 60 * 24 * 8 * 2
|
||||
info: gravity.list (blocklist) file last update time
|
||||
to: sysadmin
|
||||
|
||||
# Gravity file check (gravity.list).
|
||||
|
||||
template: pihole_blocklist_gravity_file
|
||||
on: pihole.blocklist_last_update
|
||||
every: 10s
|
||||
units: boolean
|
||||
calc: $file_exists
|
||||
crit: $this != 1
|
||||
delay: up 2m down 5m
|
||||
info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists)
|
||||
to: sysadmin
|
||||
template: pihole_blocklist_gravity_file
|
||||
on: pihole.blocklist_last_update
|
||||
class: Ad Filtering
|
||||
component: Pi-hole
|
||||
type: Errors
|
||||
every: 10s
|
||||
units: boolean
|
||||
calc: $file_exists
|
||||
crit: $this != 1
|
||||
delay: up 2m down 5m
|
||||
info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists)
|
||||
to: sysadmin
|
||||
|
||||
# Pi-hole's ability to block unwanted domains.
|
||||
# Should be enabled. The whole point of Pi-hole!
|
||||
|
||||
template: pihole_status
|
||||
on: pihole.unwanted_domains_blocking_status
|
||||
every: 10s
|
||||
units: boolean
|
||||
calc: $enabled
|
||||
warn: $this != 1
|
||||
delay: up 2m down 5m
|
||||
info: unwanted domains blocking status (0: disabled, 1: enabled)
|
||||
to: sysadmin
|
||||
template: pihole_status
|
||||
on: pihole.unwanted_domains_blocking_status
|
||||
class: Ad Filtering
|
||||
component: Pi-hole
|
||||
type: Errors
|
||||
every: 10s
|
||||
units: boolean
|
||||
calc: $enabled
|
||||
warn: $this != 1
|
||||
delay: up 2m down 5m
|
||||
info: unwanted domains blocking status (0: disabled, 1: enabled)
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,46 +1,58 @@
|
|||
template: portcheck_last_collected_secs
|
||||
families: *
|
||||
on: portcheck.status
|
||||
calc: $now - $last_collected_t
|
||||
every: 10s
|
||||
units: seconds ago
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: portcheck_last_collected_secs
|
||||
families: *
|
||||
on: portcheck.status
|
||||
class: Other
|
||||
component: TCP endpoint
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
every: 10s
|
||||
units: seconds ago
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
||||
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
|
||||
template: portcheck_service_reachable
|
||||
families: *
|
||||
on: portcheck.status
|
||||
lookup: average -1m unaligned percentage of success
|
||||
calc: ($this < 75) ? (0) : ($this)
|
||||
every: 5s
|
||||
units: up/down
|
||||
info: average ratio of successful connections over the last minute (at least 75%)
|
||||
to: silent
|
||||
template: portcheck_service_reachable
|
||||
families: *
|
||||
on: portcheck.status
|
||||
class: Other
|
||||
component: TCP endpoint
|
||||
type: Workload
|
||||
lookup: average -1m unaligned percentage of success
|
||||
calc: ($this < 75) ? (0) : ($this)
|
||||
every: 5s
|
||||
units: up/down
|
||||
info: average ratio of successful connections over the last minute (at least 75%)
|
||||
to: silent
|
||||
|
||||
template: portcheck_connection_timeouts
|
||||
families: *
|
||||
on: portcheck.status
|
||||
lookup: average -5m unaligned percentage of timeout
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this >= 10 AND $this < 40
|
||||
crit: $this >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average ratio of timeouts over the last 5 minutes
|
||||
to: sysadmin
|
||||
template: portcheck_connection_timeouts
|
||||
families: *
|
||||
on: portcheck.status
|
||||
class: Other
|
||||
component: TCP endpoint
|
||||
type: Errors
|
||||
lookup: average -5m unaligned percentage of timeout
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this >= 10 AND $this < 40
|
||||
crit: $this >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average ratio of timeouts over the last 5 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: portcheck_connection_fails
|
||||
families: *
|
||||
on: portcheck.status
|
||||
lookup: average -5m unaligned percentage of no_connection,failed
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this >= 10 AND $this < 40
|
||||
crit: $this >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average ratio of failed connections over the last 5 minutes
|
||||
to: sysadmin
|
||||
template: portcheck_connection_fails
|
||||
families: *
|
||||
on: portcheck.status
|
||||
class: Other
|
||||
component: TCP endpoint
|
||||
type: Errors
|
||||
lookup: average -5m unaligned percentage of no_connection,failed
|
||||
every: 10s
|
||||
units: %
|
||||
warn: $this >= 10 AND $this < 40
|
||||
crit: $this >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average ratio of failed connections over the last 5 minutes
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,13 +1,16 @@
|
|||
|
||||
# make sure postgres is running
|
||||
|
||||
template: postgres_last_collected_secs
|
||||
on: postgres.db_stat_transactions
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
template: postgres_last_collected_secs
|
||||
on: postgres.db_stat_transactions
|
||||
class: Database
|
||||
component: PostgreSQL
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
|
|
|
@ -1,13 +1,16 @@
|
|||
# you can disable an alarm notification by setting the 'to' line to: silent
|
||||
|
||||
alarm: active_processes
|
||||
on: system.active_processes
|
||||
hosts: *
|
||||
calc: $active * 100 / $pidmax
|
||||
units: %
|
||||
every: 5s
|
||||
warn: $this > (($status >= $WARNING) ? (85) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (95))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: system process IDs (PID) space utilization
|
||||
to: sysadmin
|
||||
alarm: active_processes
|
||||
on: system.active_processes
|
||||
class: System
|
||||
component: Processes
|
||||
type: Workload
|
||||
hosts: *
|
||||
calc: $active * 100 / $pidmax
|
||||
units: %
|
||||
every: 5s
|
||||
warn: $this > (($status >= $WARNING) ? (85) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (95))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: system process IDs (PID) space utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,13 +1,16 @@
|
|||
|
||||
# Availability
|
||||
|
||||
template: pulsar_last_collected_secs
|
||||
on: pulsar.broker_components
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: pulsar_last_collected_secs
|
||||
on: pulsar.broker_components
|
||||
class: Messaging
|
||||
component: Pulsar
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,41 +1,50 @@
|
|||
|
||||
# you can disable an alarm notification by setting the 'to' line to: silent
|
||||
|
||||
alarm: used_ram_to_ignore
|
||||
on: system.ram
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min)
|
||||
every: 10s
|
||||
info: amount of memory reported as used, \
|
||||
but it is actually capable of resizing itself based on the system needs (e.g. ZFS ARC)
|
||||
alarm: used_ram_to_ignore
|
||||
on: system.ram
|
||||
class: System
|
||||
component: Memory
|
||||
type: Utilization
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min)
|
||||
every: 10s
|
||||
info: amount of memory reported as used, \
|
||||
but it is actually capable of resizing itself based on the system needs (e.g. ZFS ARC)
|
||||
|
||||
alarm: ram_in_use
|
||||
on: system.ram
|
||||
os: linux
|
||||
hosts: *
|
||||
# calc: $used * 100 / ($used + $cached + $free)
|
||||
calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: system memory utilization
|
||||
to: sysadmin
|
||||
alarm: ram_in_use
|
||||
on: system.ram
|
||||
class: System
|
||||
component: Memory
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
# calc: $used * 100 / ($used + $cached + $free)
|
||||
calc: ($used - $used_ram_to_ignore) * 100 / ($used + $cached + $free)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: system memory utilization
|
||||
to: sysadmin
|
||||
|
||||
alarm: ram_available
|
||||
on: mem.available
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this < (($status >= $WARNING) ? (15) : (10))
|
||||
crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
|
||||
to: sysadmin
|
||||
alarm: ram_available
|
||||
on: mem.available
|
||||
class: System
|
||||
component: Memory
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this < (($status >= $WARNING) ? (15) : (10))
|
||||
crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
|
||||
to: sysadmin
|
||||
|
||||
alarm: oom_kill
|
||||
on: mem.oom_kill
|
||||
|
@ -50,28 +59,34 @@
|
|||
to: sysadmin
|
||||
|
||||
## FreeBSD
|
||||
alarm: ram_in_use
|
||||
on: system.ram
|
||||
os: freebsd
|
||||
hosts: *
|
||||
calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: system memory utilization
|
||||
to: sysadmin
|
||||
alarm: ram_in_use
|
||||
on: system.ram
|
||||
class: System
|
||||
component: Memory
|
||||
type: Utilization
|
||||
os: freebsd
|
||||
hosts: *
|
||||
calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: system memory utilization
|
||||
to: sysadmin
|
||||
|
||||
alarm: ram_available
|
||||
on: system.ram
|
||||
os: freebsd
|
||||
hosts: *
|
||||
calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this < (($status >= $WARNING) ? (15) : (10))
|
||||
crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
|
||||
to: sysadmin
|
||||
alarm: ram_available
|
||||
on: system.ram
|
||||
class: System
|
||||
component: Memory
|
||||
type: Utilization
|
||||
os: freebsd
|
||||
hosts: *
|
||||
calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this < (($status >= $WARNING) ? (15) : (10))
|
||||
crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,34 +1,43 @@
|
|||
|
||||
# make sure redis is running
|
||||
|
||||
template: redis_last_collected_secs
|
||||
on: redis.operations
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
template: redis_last_collected_secs
|
||||
on: redis.operations
|
||||
class: KV Storage
|
||||
component: Redis
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
|
||||
template: redis_bgsave_broken
|
||||
families: *
|
||||
on: redis.bgsave_health
|
||||
every: 10s
|
||||
crit: $rdb_last_bgsave_status != 0
|
||||
units: ok/failed
|
||||
info: status of the last RDB save operation (0: ok, 1: error)
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
to: dba
|
||||
template: redis_bgsave_broken
|
||||
families: *
|
||||
on: redis.bgsave_health
|
||||
class: KV Storage
|
||||
component: Redis
|
||||
type: Errors
|
||||
every: 10s
|
||||
crit: $rdb_last_bgsave_status != 0
|
||||
units: ok/failed
|
||||
info: status of the last RDB save operation (0: ok, 1: error)
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
to: dba
|
||||
|
||||
template: redis_bgsave_slow
|
||||
families: *
|
||||
on: redis.bgsave_now
|
||||
every: 10s
|
||||
warn: $rdb_bgsave_in_progress > 600
|
||||
crit: $rdb_bgsave_in_progress > 1200
|
||||
units: seconds
|
||||
info: duration of the on-going RDB save operation
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
to: dba
|
||||
template: redis_bgsave_slow
|
||||
families: *
|
||||
on: redis.bgsave_now
|
||||
class: KV Storage
|
||||
component: Redis
|
||||
type: Latency
|
||||
every: 10s
|
||||
warn: $rdb_bgsave_in_progress > 600
|
||||
crit: $rdb_bgsave_in_progress > 1200
|
||||
units: seconds
|
||||
info: duration of the on-going RDB save operation
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
to: dba
|
||||
|
|
|
@ -1,25 +1,31 @@
|
|||
# make sure RetroShare is running
|
||||
|
||||
template: retroshare_last_collected_secs
|
||||
on: retroshare.peers
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: retroshare_last_collected_secs
|
||||
on: retroshare.peers
|
||||
class: Data Sharing
|
||||
component: Retroshare
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
||||
# make sure the DHT is fine when active
|
||||
|
||||
template: retroshare_dht_working
|
||||
on: retroshare.dht
|
||||
calc: $dht_size_all
|
||||
units: peers
|
||||
every: 1m
|
||||
warn: $this < (($status >= $WARNING) ? (120) : (100))
|
||||
crit: $this < (($status == $CRITICAL) ? (10) : (1))
|
||||
delay: up 0 down 15m multiplier 1.5 max 1h
|
||||
info: number of DHT peers
|
||||
to: sysadmin
|
||||
template: retroshare_dht_working
|
||||
on: retroshare.dht
|
||||
class: Data Sharing
|
||||
component: Retroshare
|
||||
type: Utilization
|
||||
calc: $dht_size_all
|
||||
units: peers
|
||||
every: 1m
|
||||
warn: $this < (($status >= $WARNING) ? (120) : (100))
|
||||
crit: $this < (($status == $CRITICAL) ? (10) : (1))
|
||||
delay: up 0 down 15m multiplier 1.5 max 1h
|
||||
info: number of DHT peers
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,86 +1,107 @@
|
|||
# Ensure that Riak is running (template: riakkv_last_collected_secs).
|
||||
template: riakkv_last_collected_secs
|
||||
on: riak.kv.throughput
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
template: riakkv_last_collected_secs
|
||||
on: riak.kv.throughput
|
||||
class: Database
|
||||
component: Riak KV
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: dba
|
||||
|
||||
# Warn if a list keys operation is running.
|
||||
template: riakkv_list_keys_active
|
||||
on: riak.core.fsm_active
|
||||
calc: $list_fsm_active
|
||||
units: state machines
|
||||
every: 10s
|
||||
warn: $list_fsm_active > 0
|
||||
info: number of currently running list keys finite state machines
|
||||
to: dba
|
||||
template: riakkv_list_keys_active
|
||||
on: riak.core.fsm_active
|
||||
class: Database
|
||||
component: Riak KV
|
||||
type: Utilization
|
||||
calc: $list_fsm_active
|
||||
units: state machines
|
||||
every: 10s
|
||||
warn: $list_fsm_active > 0
|
||||
info: number of currently running list keys finite state machines
|
||||
to: dba
|
||||
|
||||
|
||||
## Timing healthchecks
|
||||
# KV GET
|
||||
template: riakkv_1h_kv_get_mean_latency
|
||||
on: riak.kv.latency.get
|
||||
calc: $node_get_fsm_time_mean
|
||||
lookup: average -1h unaligned of time
|
||||
every: 30s
|
||||
units: ms
|
||||
info: average time between reception of client GET request and \
|
||||
subsequent response to client over the last hour
|
||||
template: riakkv_1h_kv_get_mean_latency
|
||||
on: riak.kv.latency.get
|
||||
class: Database
|
||||
component: Riak KV
|
||||
type: Latency
|
||||
calc: $node_get_fsm_time_mean
|
||||
lookup: average -1h unaligned of time
|
||||
every: 30s
|
||||
units: ms
|
||||
info: average time between reception of client GET request and \
|
||||
subsequent response to client over the last hour
|
||||
|
||||
template: riakkv_kv_get_slow
|
||||
on: riak.kv.latency.get
|
||||
calc: $mean
|
||||
lookup: average -3m unaligned of time
|
||||
units: ms
|
||||
every: 10s
|
||||
warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
|
||||
crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
|
||||
info: average time between reception of client GET request and \
|
||||
subsequent response to the client over the last 3 minutes, \
|
||||
compared to the average over the last hour
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
to: dba
|
||||
template: riakkv_kv_get_slow
|
||||
on: riak.kv.latency.get
|
||||
class: Database
|
||||
component: Riak KV
|
||||
type: Latency
|
||||
calc: $mean
|
||||
lookup: average -3m unaligned of time
|
||||
units: ms
|
||||
every: 10s
|
||||
warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
|
||||
crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
|
||||
info: average time between reception of client GET request and \
|
||||
subsequent response to the client over the last 3 minutes, \
|
||||
compared to the average over the last hour
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
to: dba
|
||||
|
||||
# KV PUT
|
||||
template: riakkv_1h_kv_put_mean_latency
|
||||
on: riak.kv.latency.put
|
||||
calc: $node_put_fsm_time_mean
|
||||
lookup: average -1h unaligned of time
|
||||
every: 30s
|
||||
units: ms
|
||||
info: average time between reception of client PUT request and \
|
||||
subsequent response to the client over the last hour
|
||||
template: riakkv_1h_kv_put_mean_latency
|
||||
on: riak.kv.latency.put
|
||||
class: Database
|
||||
component: Riak KV
|
||||
type: Latency
|
||||
calc: $node_put_fsm_time_mean
|
||||
lookup: average -1h unaligned of time
|
||||
every: 30s
|
||||
units: ms
|
||||
info: average time between reception of client PUT request and \
|
||||
subsequent response to the client over the last hour
|
||||
|
||||
template: riakkv_kv_put_slow
|
||||
on: riak.kv.latency.put
|
||||
calc: $mean
|
||||
lookup: average -3m unaligned of time
|
||||
units: ms
|
||||
every: 10s
|
||||
warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
|
||||
crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
|
||||
info: average time between reception of client PUT request and \
|
||||
subsequent response to the client over the last 3 minutes, \
|
||||
compared to the average over the last hour
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
to: dba
|
||||
template: riakkv_kv_put_slow
|
||||
on: riak.kv.latency.put
|
||||
class: Database
|
||||
component: Riak KV
|
||||
type: Latency
|
||||
calc: $mean
|
||||
lookup: average -3m unaligned of time
|
||||
units: ms
|
||||
every: 10s
|
||||
warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
|
||||
crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
|
||||
info: average time between reception of client PUT request and \
|
||||
subsequent response to the client over the last 3 minutes, \
|
||||
compared to the average over the last hour
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
to: dba
|
||||
|
||||
|
||||
## VM healthchecks
|
||||
|
||||
# Default Erlang VM process limit: 262144
|
||||
# On systems observed, this is < 2000, but may grow depending on load.
|
||||
template: riakkv_vm_high_process_count
|
||||
on: riak.vm
|
||||
calc: $sys_process_count
|
||||
units: processes
|
||||
every: 10s
|
||||
warn: $this > 10000
|
||||
crit: $this > 100000
|
||||
info: number of processes running in the Erlang VM
|
||||
to: dba
|
||||
template: riakkv_vm_high_process_count
|
||||
on: riak.vm
|
||||
class: Database
|
||||
component: Riak KV
|
||||
type: Utilization
|
||||
calc: $sys_process_count
|
||||
units: processes
|
||||
every: 10s
|
||||
warn: $this > 10000
|
||||
crit: $this > 100000
|
||||
info: number of processes running in the Erlang VM
|
||||
to: dba
|
||||
|
|
|
@ -1,38 +1,47 @@
|
|||
|
||||
# make sure scaleio is running
|
||||
|
||||
template: scaleio_last_collected_secs
|
||||
on: scaleio.system_capacity_total
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: scaleio_last_collected_secs
|
||||
on: scaleio.system_capacity_total
|
||||
class: Storage
|
||||
component: ScaleIO
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
||||
# make sure Storage Pool capacity utilization is under limit
|
||||
|
||||
template: scaleio_storage_pool_capacity_utilization
|
||||
on: scaleio.storage_pool_capacity_utilization
|
||||
calc: $used
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: storage pool capacity utilization
|
||||
to: sysadmin
|
||||
template: scaleio_storage_pool_capacity_utilization
|
||||
on: scaleio.storage_pool_capacity_utilization
|
||||
class: Storage
|
||||
component: ScaleIO
|
||||
type: Utilization
|
||||
calc: $used
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: storage pool capacity utilization
|
||||
to: sysadmin
|
||||
|
||||
|
||||
# make sure Sdc is connected to MDM
|
||||
|
||||
template: scaleio_sdc_mdm_connection_state
|
||||
on: scaleio.sdc_mdm_connection_state
|
||||
calc: $connected
|
||||
every: 10s
|
||||
warn: $this != 1
|
||||
delay: up 30s down 5m multiplier 1.5 max 1h
|
||||
info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected)
|
||||
to: sysadmin
|
||||
template: scaleio_sdc_mdm_connection_state
|
||||
on: scaleio.sdc_mdm_connection_state
|
||||
class: Storage
|
||||
component: ScaleIO
|
||||
type: Utilization
|
||||
calc: $connected
|
||||
every: 10s
|
||||
warn: $this != 1
|
||||
delay: up 30s down 5m multiplier 1.5 max 1h
|
||||
info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected)
|
||||
to: sysadmin
|
||||
|
|
|
@ -3,43 +3,52 @@
|
|||
|
||||
# check for common /proc/net/softnet_stat errors
|
||||
|
||||
alarm: 1min_netdev_backlog_exceeded
|
||||
on: system.softnet_stat
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -1m unaligned absolute of dropped
|
||||
units: packets
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: average number of dropped packets in the last minute \
|
||||
due to exceeded net.core.netdev_max_backlog
|
||||
to: sysadmin
|
||||
alarm: 1min_netdev_backlog_exceeded
|
||||
on: system.softnet_stat
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -1m unaligned absolute of dropped
|
||||
units: packets
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: average number of dropped packets in the last minute \
|
||||
due to exceeded net.core.netdev_max_backlog
|
||||
to: sysadmin
|
||||
|
||||
alarm: 1min_netdev_budget_ran_outs
|
||||
on: system.softnet_stat
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -1m unaligned absolute of squeezed
|
||||
units: events
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
|
||||
net.core.netdev_budget_usecs with work remaining over the last minute \
|
||||
(this can be a cause for dropped packets)
|
||||
to: silent
|
||||
alarm: 1min_netdev_budget_ran_outs
|
||||
on: system.softnet_stat
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -1m unaligned absolute of squeezed
|
||||
units: events
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
|
||||
net.core.netdev_budget_usecs with work remaining over the last minute \
|
||||
(this can be a cause for dropped packets)
|
||||
to: silent
|
||||
|
||||
alarm: 10min_netisr_backlog_exceeded
|
||||
on: system.softnet_stat
|
||||
os: freebsd
|
||||
hosts: *
|
||||
lookup: average -1m unaligned absolute of qdrops
|
||||
units: packets
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: average number of drops in the last minute \
|
||||
due to exceeded sysctl net.route.netisr_maxqlen \
|
||||
(this can be a cause for dropped packets)
|
||||
to: sysadmin
|
||||
alarm: 10min_netisr_backlog_exceeded
|
||||
on: system.softnet_stat
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: freebsd
|
||||
hosts: *
|
||||
lookup: average -1m unaligned absolute of qdrops
|
||||
units: packets
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: average number of drops in the last minute \
|
||||
due to exceeded sysctl net.route.netisr_maxqlen \
|
||||
(this can be a cause for dropped packets)
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
|
||||
# make sure squid is running
|
||||
|
||||
template: squid_last_collected_secs
|
||||
on: squid.clients_requests
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: proxyadmin
|
||||
template: squid_last_collected_secs
|
||||
on: squid.clients_requests
|
||||
class: Web Proxy
|
||||
component: Squid
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: proxyadmin
|
||||
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
template: stiebeleltron_last_collected_secs
|
||||
families: *
|
||||
on: stiebeleltron.heating.hc1
|
||||
calc: $now - $last_collected_t
|
||||
every: 10s
|
||||
units: seconds ago
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sitemgr
|
||||
template: stiebeleltron_last_collected_secs
|
||||
families: *
|
||||
on: stiebeleltron.heating.hc1
|
||||
class: Other
|
||||
component: Sensors
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
every: 10s
|
||||
units: seconds ago
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sitemgr
|
||||
|
|
|
@ -1,29 +1,35 @@
|
|||
|
||||
# you can disable an alarm notification by setting the 'to' line to: silent
|
||||
|
||||
alarm: 30min_ram_swapped_out
|
||||
on: system.swapio
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
lookup: sum -30m unaligned absolute of out
|
||||
# we have to convert KB to MB by dividing $this (i.e. the result of the lookup) by 1024
|
||||
calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
|
||||
units: % of RAM
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (20) : (30))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: percentage of the system RAM swapped in the last 30 minutes
|
||||
to: sysadmin
|
||||
alarm: 30min_ram_swapped_out
|
||||
on: system.swapio
|
||||
class: System
|
||||
component: Memory
|
||||
type: Workload
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
lookup: sum -30m unaligned absolute of out
|
||||
# we have to convert KB to MB by dividing $this (i.e. the result of the lookup) by 1024
|
||||
calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
|
||||
units: % of RAM
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (20) : (30))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: percentage of the system RAM swapped in the last 30 minutes
|
||||
to: sysadmin
|
||||
|
||||
alarm: used_swap
|
||||
on: system.swap
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
calc: $used * 100 / ( $used + $free )
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: up 30s down 15m multiplier 1.5 max 1h
|
||||
info: swap memory utilization
|
||||
to: sysadmin
|
||||
alarm: used_swap
|
||||
on: system.swap
|
||||
class: System
|
||||
component: Memory
|
||||
type: Utilization
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
calc: $used * 100 / ( $used + $free )
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: up 30s down 15m multiplier 1.5 max 1h
|
||||
info: swap memory utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -2,111 +2,141 @@
|
|||
## States: 1 - active, 2 - inactive, 3 - activating, 4 - deactivating, 5 - failed.
|
||||
|
||||
## Service units
|
||||
template: systemd_service_units_state
|
||||
on: systemd.service_units_state
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd service units are in the failed state
|
||||
to: sysadmin
|
||||
template: systemd_service_units_state
|
||||
on: systemd.service_units_state
|
||||
class: Linux
|
||||
component: Systemd units
|
||||
type: Errors
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd service units are in the failed state
|
||||
to: sysadmin
|
||||
|
||||
## Socket units
|
||||
template: systemd_socket_units_state
|
||||
on: systemd.socket_unit_state
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd socket units are in the failed state
|
||||
to: sysadmin
|
||||
template: systemd_socket_units_state
|
||||
on: systemd.socket_unit_state
|
||||
class: Linux
|
||||
component: Systemd units
|
||||
type: Errors
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd socket units are in the failed state
|
||||
to: sysadmin
|
||||
|
||||
## Target units
|
||||
template: systemd_target_units_state
|
||||
on: systemd.target_unit_state
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd target units are in the failed state
|
||||
to: sysadmin
|
||||
template: systemd_target_units_state
|
||||
on: systemd.target_unit_state
|
||||
class: Linux
|
||||
component: Systemd units
|
||||
type: Errors
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd target units are in the failed state
|
||||
to: sysadmin
|
||||
|
||||
## Path units
|
||||
template: systemd_path_units_state
|
||||
on: systemd.path_unit_state
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd path units are in the failed state
|
||||
to: sysadmin
|
||||
template: systemd_path_units_state
|
||||
on: systemd.path_unit_state
|
||||
class: Linux
|
||||
component: Systemd units
|
||||
type: Errors
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd path units are in the failed state
|
||||
to: sysadmin
|
||||
|
||||
## Device units
|
||||
template: systemd_device_units_state
|
||||
on: systemd.device_unit_state
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more the systemd device units are in the failed state
|
||||
to: sysadmin
|
||||
template: systemd_device_units_state
|
||||
on: systemd.device_unit_state
|
||||
class: Linux
|
||||
component: Systemd units
|
||||
type: Errors
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more the systemd device units are in the failed state
|
||||
to: sysadmin
|
||||
|
||||
## Mount units
|
||||
template: systemd_mount_units_state
|
||||
on: systemd.mount_unit_state
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more the systemd mount units are in the failed state
|
||||
to: sysadmin
|
||||
template: systemd_mount_units_state
|
||||
on: systemd.mount_unit_state
|
||||
class: Linux
|
||||
component: Systemd units
|
||||
type: Errors
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more the systemd mount units are in the failed state
|
||||
to: sysadmin
|
||||
|
||||
## Automount units
|
||||
template: systemd_automount_units_state
|
||||
on: systemd.automount_unit_state
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd automount units are in the failed state
|
||||
to: sysadmin
|
||||
template: systemd_automount_units_state
|
||||
on: systemd.automount_unit_state
|
||||
class: Linux
|
||||
component: Systemd units
|
||||
type: Errors
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd automount units are in the failed state
|
||||
to: sysadmin
|
||||
|
||||
## Swap units
|
||||
template: systemd_swap_units_state
|
||||
on: systemd.swap_unit_state
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd swap units are in the failed state
|
||||
to: sysadmin
|
||||
template: systemd_swap_units_state
|
||||
on: systemd.swap_unit_state
|
||||
class: Linux
|
||||
component: Systemd units
|
||||
type: Errors
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd swap units are in the failed state
|
||||
to: sysadmin
|
||||
|
||||
## Scope units
|
||||
template: systemd_scope_units_state
|
||||
on: systemd.scope_unit_state
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd scope units are in the failed state
|
||||
to: sysadmin
|
||||
template: systemd_scope_units_state
|
||||
on: systemd.scope_unit_state
|
||||
class: Linux
|
||||
component: Systemd units
|
||||
type: Errors
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd scope units are in the failed state
|
||||
to: sysadmin
|
||||
|
||||
## Slice units
|
||||
template: systemd_slice_units_state
|
||||
on: systemd.slice_unit_state
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd slice units are in the failed state
|
||||
to: sysadmin
|
||||
template: systemd_slice_units_state
|
||||
on: systemd.slice_unit_state
|
||||
class: Linux
|
||||
component: Systemd units
|
||||
type: Errors
|
||||
lookup: max -1s min2max
|
||||
units: ok/failed
|
||||
every: 10s
|
||||
warn: $this != nan AND $this == 5
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: one or more systemd slice units are in the failed state
|
||||
to: sysadmin
|
||||
|
|
|
@ -5,15 +5,18 @@
|
|||
# In this case, the alarm will always be zero.
|
||||
#
|
||||
|
||||
alarm: tcp_connections
|
||||
on: ipv4.tcpsock
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
|
||||
crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: IPv4 TCP connections utilization
|
||||
to: sysadmin
|
||||
alarm: tcp_connections
|
||||
on: ipv4.tcpsock
|
||||
class: System
|
||||
component: Network
|
||||
type: Workload
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
|
||||
crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: IPv4 TCP connections utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -18,33 +18,39 @@
|
|||
# -----------------------------------------------------------------------------
|
||||
# tcp accept queue (at the kernel)
|
||||
|
||||
alarm: 1m_tcp_accept_queue_overflows
|
||||
on: ip.tcp_accept_queue
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -60s unaligned absolute of ListenOverflows
|
||||
units: overflows
|
||||
every: 10s
|
||||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (1) : (5))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: average number of overflows in the TCP accept queue over the last minute
|
||||
to: sysadmin
|
||||
alarm: 1m_tcp_accept_queue_overflows
|
||||
on: ip.tcp_accept_queue
|
||||
class: System
|
||||
component: Network
|
||||
type: Workload
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -60s unaligned absolute of ListenOverflows
|
||||
units: overflows
|
||||
every: 10s
|
||||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (1) : (5))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: average number of overflows in the TCP accept queue over the last minute
|
||||
to: sysadmin
|
||||
|
||||
# THIS IS TOO GENERIC
|
||||
# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
|
||||
alarm: 1m_tcp_accept_queue_drops
|
||||
on: ip.tcp_accept_queue
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -60s unaligned absolute of ListenDrops
|
||||
units: drops
|
||||
every: 10s
|
||||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (1) : (5))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: average number of dropped packets in the TCP accept queue over the last minute
|
||||
to: sysadmin
|
||||
alarm: 1m_tcp_accept_queue_drops
|
||||
on: ip.tcp_accept_queue
|
||||
class: System
|
||||
component: Network
|
||||
type: Workload
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -60s unaligned absolute of ListenDrops
|
||||
units: drops
|
||||
every: 10s
|
||||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (1) : (5))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: average number of dropped packets in the TCP accept queue over the last minute
|
||||
to: sysadmin
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
@ -55,30 +61,36 @@
|
|||
# enabled or not. In both cases this probably indicates a SYN flood attack,
|
||||
# so I guess a notification should be sent.
|
||||
|
||||
alarm: 1m_tcp_syn_queue_drops
|
||||
on: ip.tcp_syn_queue
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -60s unaligned absolute of TCPReqQFullDrop
|
||||
units: drops
|
||||
every: 10s
|
||||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (0) : (5))
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: average number of SYN requests dropped due to the full TCP SYN queue over the last minute \
|
||||
(SYN cookies were not enabled)
|
||||
to: sysadmin
|
||||
alarm: 1m_tcp_syn_queue_drops
|
||||
on: ip.tcp_syn_queue
|
||||
class: System
|
||||
component: Network
|
||||
type: Workload
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -60s unaligned absolute of TCPReqQFullDrop
|
||||
units: drops
|
||||
every: 10s
|
||||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (0) : (5))
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: average number of SYN requests dropped due to the full TCP SYN queue over the last minute \
|
||||
(SYN cookies were not enabled)
|
||||
to: sysadmin
|
||||
|
||||
alarm: 1m_tcp_syn_queue_cookies
|
||||
on: ip.tcp_syn_queue
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -60s unaligned absolute of TCPReqQFullDoCookies
|
||||
units: cookies
|
||||
every: 10s
|
||||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (0) : (5))
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
|
||||
to: sysadmin
|
||||
alarm: 1m_tcp_syn_queue_cookies
|
||||
on: ip.tcp_syn_queue
|
||||
class: System
|
||||
component: Network
|
||||
type: Workload
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -60s unaligned absolute of TCPReqQFullDoCookies
|
||||
units: cookies
|
||||
every: 10s
|
||||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (0) : (5))
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
|
||||
to: sysadmin
|
||||
|
||||
|
|
|
@ -6,15 +6,18 @@
|
|||
# and a critical when TCP is 90% of its upper memory limit
|
||||
#
|
||||
|
||||
alarm: tcp_memory
|
||||
on: ipv4.sockstat_tcp_mem
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ${mem} * 100 / ${tcp_mem_high}
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} ))
|
||||
crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: TCP memory utilization
|
||||
to: sysadmin
|
||||
alarm: tcp_memory
|
||||
on: ipv4.sockstat_tcp_mem
|
||||
class: System
|
||||
component: Network
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ${mem} * 100 / ${tcp_mem_high}
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} ))
|
||||
crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: TCP memory utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -7,15 +7,18 @@
|
|||
# so we alarm warning at 25% and critical at 50%
|
||||
#
|
||||
|
||||
alarm: tcp_orphans
|
||||
on: ipv4.sockstat_tcp_sockets
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ${orphan} * 100 / ${tcp_max_orphans}
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
|
||||
crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: orphan IPv4 TCP sockets utilization
|
||||
to: sysadmin
|
||||
alarm: tcp_orphans
|
||||
on: ipv4.sockstat_tcp_sockets
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ${orphan} * 100 / ${tcp_max_orphans}
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
|
||||
crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: orphan IPv4 TCP sockets utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -4,54 +4,66 @@
|
|||
# -----------------------------------------------------------------------------
|
||||
# tcp resets this host sends
|
||||
|
||||
alarm: 1m_ipv4_tcp_resets_sent
|
||||
on: ipv4.tcphandshake
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -1m at -10s unaligned absolute of OutRsts
|
||||
units: tcp resets/s
|
||||
every: 10s
|
||||
info: average number of sent TCP RESETS over the last minute
|
||||
alarm: 1m_ipv4_tcp_resets_sent
|
||||
on: ipv4.tcphandshake
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -1m at -10s unaligned absolute of OutRsts
|
||||
units: tcp resets/s
|
||||
every: 10s
|
||||
info: average number of sent TCP RESETS over the last minute
|
||||
|
||||
alarm: 10s_ipv4_tcp_resets_sent
|
||||
on: ipv4.tcphandshake
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -10s unaligned absolute of OutRsts
|
||||
units: tcp resets/s
|
||||
every: 10s
|
||||
warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20)))
|
||||
delay: up 20s down 60m multiplier 1.2 max 2h
|
||||
options: no-clear-notification
|
||||
info: average number of sent TCP RESETS over the last 10 seconds. \
|
||||
This can indicate a port scan, \
|
||||
or that a service running on this host has crashed. \
|
||||
Netdata will not send a clear notification for this alarm.
|
||||
to: sysadmin
|
||||
alarm: 10s_ipv4_tcp_resets_sent
|
||||
on: ipv4.tcphandshake
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -10s unaligned absolute of OutRsts
|
||||
units: tcp resets/s
|
||||
every: 10s
|
||||
warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20)))
|
||||
delay: up 20s down 60m multiplier 1.2 max 2h
|
||||
options: no-clear-notification
|
||||
info: average number of sent TCP RESETS over the last 10 seconds. \
|
||||
This can indicate a port scan, \
|
||||
or that a service running on this host has crashed. \
|
||||
Netdata will not send a clear notification for this alarm.
|
||||
to: sysadmin
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# tcp resets this host receives
|
||||
|
||||
alarm: 1m_ipv4_tcp_resets_received
|
||||
on: ipv4.tcphandshake
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
lookup: average -1m at -10s unaligned absolute of AttemptFails
|
||||
units: tcp resets/s
|
||||
every: 10s
|
||||
info: average number of received TCP RESETS over the last minute
|
||||
alarm: 1m_ipv4_tcp_resets_received
|
||||
on: ipv4.tcphandshake
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
lookup: average -1m at -10s unaligned absolute of AttemptFails
|
||||
units: tcp resets/s
|
||||
every: 10s
|
||||
info: average number of received TCP RESETS over the last minute
|
||||
|
||||
alarm: 10s_ipv4_tcp_resets_received
|
||||
on: ipv4.tcphandshake
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
lookup: average -10s unaligned absolute of AttemptFails
|
||||
units: tcp resets/s
|
||||
every: 10s
|
||||
warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
|
||||
delay: up 20s down 60m multiplier 1.2 max 2h
|
||||
options: no-clear-notification
|
||||
info: average number of received TCP RESETS over the last 10 seconds. \
|
||||
This can be an indication that a service this host needs has crashed. \
|
||||
Netdata will not send a clear notification for this alarm.
|
||||
to: sysadmin
|
||||
alarm: 10s_ipv4_tcp_resets_received
|
||||
on: ipv4.tcphandshake
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
lookup: average -10s unaligned absolute of AttemptFails
|
||||
units: tcp resets/s
|
||||
every: 10s
|
||||
warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
|
||||
delay: up 20s down 60m multiplier 1.2 max 2h
|
||||
options: no-clear-notification
|
||||
info: average number of received TCP RESETS over the last 10 seconds. \
|
||||
This can be an indication that a service this host needs has crashed. \
|
||||
Netdata will not send a clear notification for this alarm.
|
||||
to: sysadmin
|
||||
|
|
|
@ -4,29 +4,35 @@
|
|||
# -----------------------------------------------------------------------------
|
||||
# UDP receive buffer errors
|
||||
|
||||
alarm: 1m_ipv4_udp_receive_buffer_errors
|
||||
on: ipv4.udperrors
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
lookup: average -1m unaligned absolute of RcvbufErrors
|
||||
units: errors
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
info: average number of UDP receive buffer errors over the last minute
|
||||
delay: up 1m down 60m multiplier 1.2 max 2h
|
||||
to: sysadmin
|
||||
alarm: 1m_ipv4_udp_receive_buffer_errors
|
||||
on: ipv4.udperrors
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux freebsd
|
||||
hosts: *
|
||||
lookup: average -1m unaligned absolute of RcvbufErrors
|
||||
units: errors
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
info: average number of UDP receive buffer errors over the last minute
|
||||
delay: up 1m down 60m multiplier 1.2 max 2h
|
||||
to: sysadmin
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# UDP send buffer errors
|
||||
|
||||
alarm: 1m_ipv4_udp_send_buffer_errors
|
||||
on: ipv4.udperrors
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -1m unaligned absolute of SndbufErrors
|
||||
units: errors
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
info: average number of UDP send buffer errors over the last minute
|
||||
delay: up 1m down 60m multiplier 1.2 max 2h
|
||||
to: sysadmin
|
||||
alarm: 1m_ipv4_udp_send_buffer_errors
|
||||
on: ipv4.udperrors
|
||||
class: System
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -1m unaligned absolute of SndbufErrors
|
||||
units: errors
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
info: average number of UDP send buffer errors over the last minute
|
||||
delay: up 1m down 60m multiplier 1.2 max 2h
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,35 +1,44 @@
|
|||
|
||||
# make sure unbound is running
|
||||
|
||||
template: unbound_last_collected_secs
|
||||
on: unbound.queries
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: unbound_last_collected_secs
|
||||
on: unbound.queries
|
||||
class: DNS
|
||||
component: Unbound
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
||||
# make sure there are no overwritten/dropped queries in the request-list
|
||||
|
||||
template: unbound_request_list_overwritten
|
||||
on: unbound.request_list_jostle_list
|
||||
lookup: average -60s unaligned absolute match-names of overwritten
|
||||
units: queries
|
||||
every: 10s
|
||||
warn: $this > 5
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: number of overwritten queries in the request-list
|
||||
to: sysadmin
|
||||
template: unbound_request_list_overwritten
|
||||
on: unbound.request_list_jostle_list
|
||||
class: DNS
|
||||
component: Unbound
|
||||
type: Errors
|
||||
lookup: average -60s unaligned absolute match-names of overwritten
|
||||
units: queries
|
||||
every: 10s
|
||||
warn: $this > 5
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: number of overwritten queries in the request-list
|
||||
to: sysadmin
|
||||
|
||||
template: unbound_request_list_dropped
|
||||
on: unbound.request_list_jostle_list
|
||||
lookup: average -60s unaligned absolute match-names of dropped
|
||||
units: queries
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: number of dropped queries in the request-list
|
||||
to: sysadmin
|
||||
template: unbound_request_list_dropped
|
||||
on: unbound.request_list_jostle_list
|
||||
class: DNS
|
||||
component: Unbound
|
||||
type: Errors
|
||||
lookup: average -60s unaligned absolute match-names of dropped
|
||||
units: queries
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: number of dropped queries in the request-list
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
alarm: varnish_last_collected
|
||||
on: varnish.uptime
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
alarm: varnish_last_collected
|
||||
on: varnish.uptime
|
||||
class: Web Proxy
|
||||
component: Varnish
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,16 +1,19 @@
|
|||
|
||||
# make sure vcsa is running and responding
|
||||
|
||||
template: vcsa_last_collected_secs
|
||||
on: vcsa.system_health
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: vcsa_last_collected_secs
|
||||
on: vcsa.system_health
|
||||
class: Virtual Machine
|
||||
component: VMware vCenter
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
||||
# Overall system health:
|
||||
# - 0: all components are healthy.
|
||||
|
@ -19,17 +22,20 @@ template: vcsa_last_collected_secs
|
|||
# - 3: one or more components might be in an unusable status and the appliance might become unresponsive soon.
|
||||
# - 4: no health data is available.
|
||||
|
||||
template: vcsa_system_health
|
||||
on: vcsa.system_health
|
||||
lookup: max -10s unaligned of system
|
||||
units: status
|
||||
every: 10s
|
||||
warn: ($this == 1) || ($this == 2)
|
||||
crit: $this == 3
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: overall system health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
template: vcsa_system_health
|
||||
on: vcsa.system_health
|
||||
class: Virtual Machine
|
||||
component: VMware vCenter
|
||||
type: Errors
|
||||
lookup: max -10s unaligned of system
|
||||
units: status
|
||||
every: 10s
|
||||
warn: ($this == 1) || ($this == 2)
|
||||
crit: $this == 3
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: overall system health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
|
||||
# Components health:
|
||||
# - 0: healthy.
|
||||
|
@ -38,77 +44,95 @@ template: vcsa_system_health
|
|||
# - 3: unavailable, or will stop functioning soon.
|
||||
# - 4: no health data is available.
|
||||
|
||||
template: vcsa_swap_health
|
||||
on: vcsa.components_health
|
||||
lookup: max -10s unaligned of swap
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: swap health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
template: vcsa_swap_health
|
||||
on: vcsa.components_health
|
||||
class: Virtual Machine
|
||||
component: VMware vCenter
|
||||
type: Errors
|
||||
lookup: max -10s unaligned of swap
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: swap health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
|
||||
template: vcsa_storage_health
|
||||
on: vcsa.components_health
|
||||
lookup: max -10s unaligned of storage
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: storage health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
template: vcsa_storage_health
|
||||
on: vcsa.components_health
|
||||
class: Virtual Machine
|
||||
component: VMware vCenter
|
||||
type: Errors
|
||||
lookup: max -10s unaligned of storage
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: storage health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
|
||||
template: vcsa_mem_health
|
||||
on: vcsa.components_health
|
||||
lookup: max -10s unaligned of mem
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: memory health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
template: vcsa_mem_health
|
||||
on: vcsa.components_health
|
||||
class: Virtual Machine
|
||||
component: VMware vCenter
|
||||
type: Errors
|
||||
lookup: max -10s unaligned of mem
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: memory health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
|
||||
template: vcsa_load_health
|
||||
on: vcsa.components_health
|
||||
lookup: max -10s unaligned of load
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: load health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
template: vcsa_load_health
|
||||
on: vcsa.components_health
|
||||
class: Virtual Machine
|
||||
component: VMware vCenter
|
||||
type: Utilization
|
||||
lookup: max -10s unaligned of load
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: load health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
|
||||
template: vcsa_database_storage_health
|
||||
on: vcsa.components_health
|
||||
lookup: max -10s unaligned of database_storage
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: database storage health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
template: vcsa_database_storage_health
|
||||
on: vcsa.components_health
|
||||
class: Virtual Machine
|
||||
component: VMware vCenter
|
||||
type: Errors
|
||||
lookup: max -10s unaligned of database_storage
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: database storage health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
|
||||
template: vcsa_applmgmt_health
|
||||
on: vcsa.components_health
|
||||
lookup: max -10s unaligned of applmgmt
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: applmgmt health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
template: vcsa_applmgmt_health
|
||||
on: vcsa.components_health
|
||||
class: Virtual Machine
|
||||
component: VMware vCenter
|
||||
type: Errors
|
||||
lookup: max -10s unaligned of applmgmt
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 1
|
||||
crit: ($this == 2) || ($this == 3)
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: applmgmt health status \
|
||||
(-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
|
||||
|
||||
# Software updates health:
|
||||
|
@ -117,14 +141,17 @@ template: vcsa_applmgmt_health
|
|||
# - 3: security updates are available.
|
||||
# - 4: an error retrieving information on software updates.
|
||||
|
||||
template: vcsa_software_updates_health
|
||||
on: vcsa.software_updates_health
|
||||
lookup: max -10s unaligned of software_packages
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 4
|
||||
crit: $this == 3
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: software updates availability status \
|
||||
(-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
template: vcsa_software_updates_health
|
||||
on: vcsa.software_updates_health
|
||||
class: Virtual Machine
|
||||
component: VMware vCenter
|
||||
type: Errors
|
||||
lookup: max -10s unaligned of software_packages
|
||||
units: status
|
||||
every: 10s
|
||||
warn: $this == 4
|
||||
crit: $this == 3
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
info: software updates availability status \
|
||||
(-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,300 +1,381 @@
|
|||
|
||||
# Availability
|
||||
|
||||
template: vernemq_last_collected_secs
|
||||
on: vernemq.node_uptime
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: vernemq_last_collected_secs
|
||||
on: vernemq.node_uptime
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
||||
# Socket errors
|
||||
|
||||
template: vernemq_socket_errors
|
||||
on: vernemq.socket_errors
|
||||
lookup: sum -1m unaligned absolute of socket_error
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 2m down 5m multiplier 1.5 max 2h
|
||||
info: number of socket errors in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_socket_errors
|
||||
on: vernemq.socket_errors
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute of socket_error
|
||||
units: errors
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 2m down 5m multiplier 1.5 max 2h
|
||||
info: number of socket errors in the last minute
|
||||
to: sysadmin
|
||||
|
||||
# Queues dropped/expired/unhandled PUBLISH messages
|
||||
|
||||
template: vernemq_queue_message_drop
|
||||
on: vernemq.queue_undelivered_messages
|
||||
lookup: sum -1m unaligned absolute of queue_message_drop
|
||||
units: dropped messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of dropped messages due to full queues in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_queue_message_drop
|
||||
on: vernemq.queue_undelivered_messages
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute of queue_message_drop
|
||||
units: dropped messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of dropped messages due to full queues in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_queue_message_expired
|
||||
on: vernemq.queue_undelivered_messages
|
||||
lookup: sum -1m unaligned absolute of queue_message_expired
|
||||
units: expired messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (15))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of messages which expired before delivery in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_queue_message_expired
|
||||
on: vernemq.queue_undelivered_messages
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Latency
|
||||
lookup: sum -1m unaligned absolute of queue_message_expired
|
||||
units: expired messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (15))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of messages which expired before delivery in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_queue_message_unhandled
|
||||
on: vernemq.queue_undelivered_messages
|
||||
lookup: sum -1m unaligned absolute of queue_message_unhandled
|
||||
units: unhandled messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of unhandled messages (connections with clean session=true) in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_queue_message_unhandled
|
||||
on: vernemq.queue_undelivered_messages
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Latency
|
||||
lookup: sum -1m unaligned absolute of queue_message_unhandled
|
||||
units: unhandled messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of unhandled messages (connections with clean session=true) in the last minute
|
||||
to: sysadmin
|
||||
|
||||
# Erlang VM
|
||||
|
||||
template: vernemq_average_scheduler_utilization
|
||||
on: vernemq.average_scheduler_utilization
|
||||
lookup: average -10m unaligned
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average scheduler utilization over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: vernemq_average_scheduler_utilization
|
||||
on: vernemq.average_scheduler_utilization
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Utilization
|
||||
lookup: average -10m unaligned
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average scheduler utilization over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
# Cluster communication and netsplits
|
||||
|
||||
template: vernemq_cluster_dropped
|
||||
on: vernemq.cluster_dropped
|
||||
lookup: sum -1m unaligned
|
||||
units: KiB
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: up 5m down 5m multiplier 1.5 max 1h
|
||||
info: amount of traffic dropped during communication with the cluster nodes in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_cluster_dropped
|
||||
on: vernemq.cluster_dropped
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned
|
||||
units: KiB
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: up 5m down 5m multiplier 1.5 max 1h
|
||||
info: amount of traffic dropped during communication with the cluster nodes in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_netsplits
|
||||
on: vernemq.netsplits
|
||||
lookup: sum -1m unaligned absolute of netsplit_detected
|
||||
units: netsplits
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 5m multiplier 1.5 max 2h
|
||||
info: number of detected netsplits (split brain situation) in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_netsplits
|
||||
on: vernemq.netsplits
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Workload
|
||||
lookup: sum -1m unaligned absolute of netsplit_detected
|
||||
units: netsplits
|
||||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 5m multiplier 1.5 max 2h
|
||||
info: number of detected netsplits (split brain situation) in the last minute
|
||||
to: sysadmin
|
||||
|
||||
# Unsuccessful CONNACK
|
||||
|
||||
template: vernemq_mqtt_connack_sent_reason_unsuccessful
|
||||
on: vernemq.mqtt_connack_sent_reason
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent unsuccessful v3/v5 CONNACK packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_connack_sent_reason_unsuccessful
|
||||
on: vernemq.mqtt_connack_sent_reason
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent unsuccessful v3/v5 CONNACK packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
# Not normal DISCONNECT
|
||||
|
||||
template: vernemq_mqtt_disconnect_received_reason_not_normal
|
||||
on: vernemq.mqtt_disconnect_received_reason
|
||||
lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received not normal v5 DISCONNECT packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_disconnect_received_reason_not_normal
|
||||
on: vernemq.mqtt_disconnect_received_reason
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Workload
|
||||
lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received not normal v5 DISCONNECT packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_mqtt_disconnect_sent_reason_not_normal
|
||||
on: vernemq.mqtt_disconnect_sent_reason
|
||||
lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent not normal v5 DISCONNECT packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_disconnect_sent_reason_not_normal
|
||||
on: vernemq.mqtt_disconnect_sent_reason
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent not normal v5 DISCONNECT packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
# SUBSCRIBE errors and unauthorized attempts
|
||||
|
||||
template: vernemq_mqtt_subscribe_error
|
||||
on: vernemq.mqtt_subscribe_error
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: failed ops
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of failed v3/v5 SUBSCRIBE operations in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_subscribe_error
|
||||
on: vernemq.mqtt_subscribe_error
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: failed ops
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of failed v3/v5 SUBSCRIBE operations in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_mqtt_subscribe_auth_error
|
||||
on: vernemq.mqtt_subscribe_auth_error
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: attempts
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_subscribe_auth_error
|
||||
on: vernemq.mqtt_subscribe_auth_error
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Workload
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: attempts
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute
|
||||
to: sysadmin
|
||||
|
||||
# UNSUBSCRIBE errors
|
||||
|
||||
template: vernemq_mqtt_unsubscribe_error
|
||||
on: vernemq.mqtt_unsubscribe_error
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: failed ops
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of failed v3/v5 UNSUBSCRIBE operations in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_unsubscribe_error
|
||||
on: vernemq.mqtt_unsubscribe_error
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: failed ops
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of failed v3/v5 UNSUBSCRIBE operations in the last minute
|
||||
to: sysadmin
|
||||
|
||||
# PUBLISH errors and unauthorized attempts
|
||||
|
||||
template: vernemq_mqtt_publish_errors
|
||||
on: vernemq.mqtt_publish_errors
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: failed ops
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of failed v3/v5 PUBLISH operations in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_publish_errors
|
||||
on: vernemq.mqtt_publish_errors
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: failed ops
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of failed v3/v5 PUBLISH operations in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_mqtt_publish_auth_errors
|
||||
on: vernemq.mqtt_publish_auth_errors
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: attempts
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of unauthorized v3/v5 PUBLISH attempts in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_publish_auth_errors
|
||||
on: vernemq.mqtt_publish_auth_errors
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Workload
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: attempts
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of unauthorized v3/v5 PUBLISH attempts in the last minute
|
||||
to: sysadmin
|
||||
|
||||
# Unsuccessful and unexpected PUBACK
|
||||
|
||||
template: vernemq_mqtt_puback_received_reason_unsuccessful
|
||||
on: vernemq.mqtt_puback_received_reason
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unsuccessful v5 PUBACK packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_puback_received_reason_unsuccessful
|
||||
on: vernemq.mqtt_puback_received_reason
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unsuccessful v5 PUBACK packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_mqtt_puback_sent_reason_unsuccessful
|
||||
on: vernemq.mqtt_puback_sent_reason
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent unsuccessful v5 PUBACK packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_puback_sent_reason_unsuccessful
|
||||
on: vernemq.mqtt_puback_sent_reason
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent unsuccessful v5 PUBACK packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_mqtt_puback_unexpected
|
||||
on: vernemq.mqtt_puback_invalid_error
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unexpected v3/v5 PUBACK packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_puback_unexpected
|
||||
on: vernemq.mqtt_puback_invalid_error
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Workload
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unexpected v3/v5 PUBACK packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
# Unsuccessful and unexpected PUBREC
|
||||
|
||||
template: vernemq_mqtt_pubrec_received_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubrec_received_reason
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unsuccessful v5 PUBREC packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_pubrec_received_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubrec_received_reason
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unsuccessful v5 PUBREC packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubrec_sent_reason
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent unsuccessful v5 PUBREC packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubrec_sent_reason
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent unsuccessful v5 PUBREC packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_mqtt_pubrec_invalid_error
|
||||
on: vernemq.mqtt_pubrec_invalid_error
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unexpected v3 PUBREC packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_pubrec_invalid_error
|
||||
on: vernemq.mqtt_pubrec_invalid_error
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Workload
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unexpected v3 PUBREC packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
# Unsuccessful PUBREL
|
||||
|
||||
template: vernemq_mqtt_pubrel_received_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubrel_received_reason
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unsuccessful v5 PUBREL packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_pubrel_received_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubrel_received_reason
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unsuccessful v5 PUBREL packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubrel_sent_reason
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent unsuccessful v5 PUBREL packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubrel_sent_reason
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent unsuccessful v5 PUBREL packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
# Unsuccessful and unexpected PUBCOMP
|
||||
|
||||
template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubcomp_received_reason
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unsuccessful v5 PUBCOMP packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubcomp_received_reason
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unsuccessful v5 PUBCOMP packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubcomp_sent_reason
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent unsuccessful v5 PUBCOMP packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
|
||||
on: vernemq.mqtt_pubcomp_sent_reason
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Errors
|
||||
lookup: sum -1m unaligned absolute match-names of !success,*
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of sent unsuccessful v5 PUBCOMP packets in the last minute
|
||||
to: sysadmin
|
||||
|
||||
template: vernemq_mqtt_pubcomp_unexpected
|
||||
on: vernemq.mqtt_pubcomp_invalid_error
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unexpected v3/v5 PUBCOMP packets in the last minute
|
||||
to: sysadmin
|
||||
template: vernemq_mqtt_pubcomp_unexpected
|
||||
on: vernemq.mqtt_pubcomp_invalid_error
|
||||
class: Messaging
|
||||
component: VerneMQ
|
||||
type: Workload
|
||||
lookup: sum -1m unaligned absolute
|
||||
units: messages
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: up 5m down 5m multiplier 1.5 max 2h
|
||||
info: number of received unexpected v3/v5 PUBCOMP packets in the last minute
|
||||
to: sysadmin
|
||||
|
|
|
@ -4,138 +4,171 @@
|
|||
# -----------------------------------------------VM Specific------------------------------------------------------------
|
||||
# Memory
|
||||
|
||||
template: vsphere_vm_mem_usage
|
||||
on: vsphere.vm_mem_usage_percentage
|
||||
hosts: *
|
||||
calc: $used
|
||||
units: %
|
||||
every: 20s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: virtual machine memory utilization
|
||||
template: vsphere_vm_mem_usage
|
||||
on: vsphere.vm_mem_usage_percentage
|
||||
class: Virtual Machine
|
||||
component: Memory
|
||||
type: Utilization
|
||||
hosts: *
|
||||
calc: $used
|
||||
units: %
|
||||
every: 20s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: virtual machine memory utilization
|
||||
|
||||
# -----------------------------------------------HOST Specific----------------------------------------------------------
|
||||
# Memory
|
||||
|
||||
template: vsphere_host_mem_usage
|
||||
on: vsphere.host_mem_usage_percentage
|
||||
hosts: *
|
||||
calc: $used
|
||||
units: %
|
||||
every: 20s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: host memory utilization
|
||||
template: vsphere_host_mem_usage
|
||||
on: vsphere.host_mem_usage_percentage
|
||||
class: Virtual Machine
|
||||
component: Memory
|
||||
type: Utilization
|
||||
hosts: *
|
||||
calc: $used
|
||||
units: %
|
||||
every: 20s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: host memory utilization
|
||||
|
||||
# Network errors
|
||||
|
||||
template: vsphere_inbound_packets_errors
|
||||
on: vsphere.net_errors_total
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of rx
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of inbound errors for the network interface in the last 10 minutes
|
||||
template: vsphere_inbound_packets_errors
|
||||
on: vsphere.net_errors_total
|
||||
class: Virtual Machine
|
||||
component: Network
|
||||
type: Errors
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of rx
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of inbound errors for the network interface in the last 10 minutes
|
||||
|
||||
template: vsphere_outbound_packets_errors
|
||||
on: vsphere.net_errors_total
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of tx
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of outbound errors for the network interface in the last 10 minutes
|
||||
template: vsphere_outbound_packets_errors
|
||||
on: vsphere.net_errors_total
|
||||
class: Virtual Machine
|
||||
component: Network
|
||||
type: Errors
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of tx
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of outbound errors for the network interface in the last 10 minutes
|
||||
|
||||
# Network errors ratio
|
||||
|
||||
template: vsphere_inbound_packets_errors_ratio
|
||||
on: vsphere.net_packets_total
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of rx
|
||||
calc: (($vsphere_inbound_packets_errors != nan AND $this > 1000) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of inbound errors for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: vsphere_inbound_packets_errors_ratio
|
||||
on: vsphere.net_packets_total
|
||||
class: Virtual Machine
|
||||
component: Network
|
||||
type: Errors
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of rx
|
||||
calc: (($vsphere_inbound_packets_errors != nan AND $this > 1000) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of inbound errors for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: vsphere_outbound_packets_errors_ratio
|
||||
on: vsphere.net_packets_total
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of tx
|
||||
calc: (($vsphere_outbound_packets_errors != nan AND $this > 1000) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of outbound errors for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: vsphere_outbound_packets_errors_ratio
|
||||
on: vsphere.net_packets_total
|
||||
class: Virtual Machine
|
||||
component: Network
|
||||
type: Errors
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of tx
|
||||
calc: (($vsphere_outbound_packets_errors != nan AND $this > 1000) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of outbound errors for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
# -----------------------------------------------Common-------------------------------------------------------------------
|
||||
# CPU
|
||||
|
||||
template: vsphere_cpu_usage
|
||||
on: vsphere.cpu_usage_total
|
||||
hosts: *
|
||||
lookup: average -10m unaligned match-names of used
|
||||
units: %
|
||||
every: 20s
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average CPU utilization
|
||||
to: sysadmin
|
||||
template: vsphere_cpu_usage
|
||||
on: vsphere.cpu_usage_total
|
||||
class: Virtual Machine
|
||||
component: CPU
|
||||
type: Utilization
|
||||
hosts: *
|
||||
lookup: average -10m unaligned match-names of used
|
||||
units: %
|
||||
every: 20s
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average CPU utilization
|
||||
to: sysadmin
|
||||
|
||||
# Network drops
|
||||
|
||||
template: vsphere_inbound_packets_dropped
|
||||
on: vsphere.net_drops_total
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of rx
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of inbound dropped packets for the network interface in the last 10 minutes
|
||||
template: vsphere_inbound_packets_dropped
|
||||
on: vsphere.net_drops_total
|
||||
class: Virtual Machine
|
||||
component: Network
|
||||
type: Errors
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of rx
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of inbound dropped packets for the network interface in the last 10 minutes
|
||||
|
||||
template: vsphere_outbound_packets_dropped
|
||||
on: vsphere.net_drops_total
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of tx
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of outbound dropped packets for the network interface in the last 10 minutes
|
||||
template: vsphere_outbound_packets_dropped
|
||||
on: vsphere.net_drops_total
|
||||
class: Virtual Machine
|
||||
component: Network
|
||||
type: Errors
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of tx
|
||||
units: packets
|
||||
every: 1m
|
||||
info: number of outbound dropped packets for the network interface in the last 10 minutes
|
||||
|
||||
# Network drops ratio
|
||||
|
||||
template: vsphere_inbound_packets_dropped_ratio
|
||||
on: vsphere.net_packets_total
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of rx
|
||||
calc: (($vsphere_inbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of inbound dropped packets for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: vsphere_inbound_packets_dropped_ratio
|
||||
on: vsphere.net_packets_total
|
||||
class: Virtual Machine
|
||||
component: Network
|
||||
type: Errors
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of rx
|
||||
calc: (($vsphere_inbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of inbound dropped packets for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: vsphere_outbound_packets_dropped_ratio
|
||||
on: vsphere.net_packets_total
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of tx
|
||||
calc: (($vsphere_outbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of outbound dropped packets for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: vsphere_outbound_packets_dropped_ratio
|
||||
on: vsphere.net_packets_total
|
||||
class: Virtual Machine
|
||||
component: Network
|
||||
type: Errors
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of tx
|
||||
calc: (($vsphere_outbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0))
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: ratio of outbound dropped packets for the network interface over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,17 +1,20 @@
|
|||
|
||||
# make sure we can collect web log data
|
||||
|
||||
template: last_collected_secs
|
||||
on: web_log.response_codes
|
||||
families: *
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: last_collected_secs
|
||||
on: web_log.response_codes
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Latency
|
||||
families: *
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
@ -24,66 +27,81 @@ families: *
|
|||
#
|
||||
# i.e. when there are at least 120 requests during the last minute
|
||||
|
||||
template: 1m_requests
|
||||
on: web_log.response_statuses
|
||||
families: *
|
||||
lookup: sum -1m unaligned
|
||||
calc: ($this == 0)?(1):($this)
|
||||
units: requests
|
||||
every: 10s
|
||||
info: number of HTTP requests in the last minute
|
||||
template: 1m_requests
|
||||
on: web_log.response_statuses
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: sum -1m unaligned
|
||||
calc: ($this == 0)?(1):($this)
|
||||
units: requests
|
||||
every: 10s
|
||||
info: number of HTTP requests in the last minute
|
||||
|
||||
template: 1m_successful
|
||||
on: web_log.response_statuses
|
||||
families: *
|
||||
lookup: sum -1m unaligned of successful_requests
|
||||
calc: $this * 100 / $1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
|
||||
crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
|
||||
to: webmaster
|
||||
template: 1m_successful
|
||||
on: web_log.response_statuses
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: sum -1m unaligned of successful_requests
|
||||
calc: $this * 100 / $1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
|
||||
crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
|
||||
to: webmaster
|
||||
|
||||
template: 1m_redirects
|
||||
on: web_log.response_statuses
|
||||
families: *
|
||||
lookup: sum -1m unaligned of redirects
|
||||
calc: $this * 100 / $1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
|
||||
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of redirection HTTP requests over the last minute (3xx except 304)
|
||||
to: webmaster
|
||||
template: 1m_redirects
|
||||
on: web_log.response_statuses
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: sum -1m unaligned of redirects
|
||||
calc: $this * 100 / $1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
|
||||
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of redirection HTTP requests over the last minute (3xx except 304)
|
||||
to: webmaster
|
||||
|
||||
template: 1m_bad_requests
|
||||
on: web_log.response_statuses
|
||||
families: *
|
||||
lookup: sum -1m unaligned of bad_requests
|
||||
calc: $this * 100 / $1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
|
||||
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of client error HTTP requests over the last minute (4xx except 401)
|
||||
to: webmaster
|
||||
template: 1m_bad_requests
|
||||
on: web_log.response_statuses
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Errors
|
||||
families: *
|
||||
lookup: sum -1m unaligned of bad_requests
|
||||
calc: $this * 100 / $1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
|
||||
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of client error HTTP requests over the last minute (4xx except 401)
|
||||
to: webmaster
|
||||
|
||||
template: 1m_internal_errors
|
||||
on: web_log.response_statuses
|
||||
families: *
|
||||
lookup: sum -1m unaligned of server_errors
|
||||
calc: $this * 100 / $1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
|
||||
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of server error HTTP requests over the last minute (5xx)
|
||||
to: webmaster
|
||||
template: 1m_internal_errors
|
||||
on: web_log.response_statuses
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Errors
|
||||
families: *
|
||||
lookup: sum -1m unaligned of server_errors
|
||||
calc: $this * 100 / $1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
|
||||
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of server error HTTP requests over the last minute (5xx)
|
||||
to: webmaster
|
||||
|
||||
# unmatched lines
|
||||
|
||||
|
@ -94,26 +112,32 @@ families: *
|
|||
#
|
||||
# i.e. when there are at least 120 requests during the last minute
|
||||
|
||||
template: 1m_total_requests
|
||||
on: web_log.response_codes
|
||||
families: *
|
||||
lookup: sum -1m unaligned
|
||||
calc: ($this == 0)?(1):($this)
|
||||
units: requests
|
||||
every: 10s
|
||||
info: number of HTTP requests over the last minute
|
||||
template: 1m_total_requests
|
||||
on: web_log.response_codes
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: sum -1m unaligned
|
||||
calc: ($this == 0)?(1):($this)
|
||||
units: requests
|
||||
every: 10s
|
||||
info: number of HTTP requests over the last minute
|
||||
|
||||
template: 1m_unmatched
|
||||
on: web_log.response_codes
|
||||
families: *
|
||||
lookup: sum -1m unaligned of unmatched
|
||||
calc: $this * 100 / $1m_total_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
|
||||
delay: up 1m down 5m multiplier 1.5 max 1h
|
||||
info: percentage of unparsed log lines over the last minute
|
||||
to: webmaster
|
||||
template: 1m_unmatched
|
||||
on: web_log.response_codes
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Errors
|
||||
families: *
|
||||
lookup: sum -1m unaligned of unmatched
|
||||
calc: $this * 100 / $1m_total_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
|
||||
delay: up 1m down 5m multiplier 1.5 max 1h
|
||||
info: percentage of unparsed log lines over the last minute
|
||||
to: webmaster
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# web slow
|
||||
|
@ -125,28 +149,34 @@ families: *
|
|||
#
|
||||
# i.e. when there are at least 120 requests during the last minute
|
||||
|
||||
template: 10m_response_time
|
||||
on: web_log.response_time
|
||||
families: *
|
||||
lookup: average -10m unaligned of avg
|
||||
units: ms
|
||||
every: 30s
|
||||
info: average HTTP response time over the last 10 minutes
|
||||
template: 10m_response_time
|
||||
on: web_log.response_time
|
||||
class: System
|
||||
component: Web log
|
||||
type: Latency
|
||||
families: *
|
||||
lookup: average -10m unaligned of avg
|
||||
units: ms
|
||||
every: 30s
|
||||
info: average HTTP response time over the last 10 minutes
|
||||
|
||||
template: web_slow
|
||||
on: web_log.response_time
|
||||
families: *
|
||||
lookup: average -1m unaligned of avg
|
||||
units: ms
|
||||
every: 10s
|
||||
green: 500
|
||||
red: 1000
|
||||
warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
|
||||
crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 )
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average HTTP response time over the last minute
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
template: web_slow
|
||||
on: web_log.response_time
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Latency
|
||||
families: *
|
||||
lookup: average -1m unaligned of avg
|
||||
units: ms
|
||||
every: 10s
|
||||
green: 500
|
||||
red: 1000
|
||||
warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
|
||||
crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 )
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average HTTP response time over the last minute
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# web too many or too few requests
|
||||
|
@ -159,36 +189,45 @@ families: *
|
|||
# i.e. when there were at least 120 requests during the 5 minutes starting
|
||||
# at -10m and ending at -5m
|
||||
|
||||
template: 5m_successful_old
|
||||
on: web_log.response_statuses
|
||||
families: *
|
||||
lookup: average -5m at -5m unaligned of successful_requests
|
||||
units: requests/s
|
||||
every: 30s
|
||||
info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
|
||||
template: 5m_successful_old
|
||||
on: web_log.response_statuses
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: average -5m at -5m unaligned of successful_requests
|
||||
units: requests/s
|
||||
every: 30s
|
||||
info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
|
||||
|
||||
template: 5m_successful
|
||||
on: web_log.response_statuses
|
||||
families: *
|
||||
lookup: average -5m unaligned of successful_requests
|
||||
units: requests/s
|
||||
every: 30s
|
||||
info: average number of successful HTTP requests over the last 5 minutes
|
||||
template: 5m_successful
|
||||
on: web_log.response_statuses
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: average -5m unaligned of successful_requests
|
||||
units: requests/s
|
||||
every: 30s
|
||||
info: average number of successful HTTP requests over the last 5 minutes
|
||||
|
||||
template: 5m_requests_ratio
|
||||
on: web_log.response_codes
|
||||
families: *
|
||||
calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
|
||||
units: %
|
||||
every: 30s
|
||||
warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
|
||||
crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
options: no-clear-notification
|
||||
info: ratio of successful HTTP requests over the last 5 minutes, \
|
||||
compared with the previous 5 minutes \
|
||||
(clear notification for this alarm will not be sent)
|
||||
to: webmaster
|
||||
template: 5m_requests_ratio
|
||||
on: web_log.response_codes
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
|
||||
units: %
|
||||
every: 30s
|
||||
warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
|
||||
crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
options: no-clear-notification
|
||||
info: ratio of successful HTTP requests over the last 5 minutes, \
|
||||
compared with the previous 5 minutes \
|
||||
(clear notification for this alarm will not be sent)
|
||||
to: webmaster
|
||||
|
||||
|
||||
|
||||
|
@ -196,17 +235,20 @@ options: no-clear-notification
|
|||
|
||||
# make sure we can collect web log data
|
||||
|
||||
template: web_log_last_collected_secs
|
||||
on: web_log.requests
|
||||
families: *
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: web_log_last_collected_secs
|
||||
on: web_log.requests
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Latency
|
||||
families: *
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
# unmatched lines
|
||||
|
||||
|
@ -217,26 +259,32 @@ families: *
|
|||
#
|
||||
# i.e. when there are at least 120 requests during the last minute
|
||||
|
||||
template: web_log_1m_total_requests
|
||||
on: web_log.requests
|
||||
families: *
|
||||
lookup: sum -1m unaligned
|
||||
calc: ($this == 0)?(1):($this)
|
||||
units: requests
|
||||
every: 10s
|
||||
info: number of HTTP requests in the last minute
|
||||
template: web_log_1m_total_requests
|
||||
on: web_log.requests
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: sum -1m unaligned
|
||||
calc: ($this == 0)?(1):($this)
|
||||
units: requests
|
||||
every: 10s
|
||||
info: number of HTTP requests in the last minute
|
||||
|
||||
template: web_log_1m_unmatched
|
||||
on: web_log.excluded_requests
|
||||
families: *
|
||||
lookup: sum -1m unaligned of unmatched
|
||||
calc: $this * 100 / $web_log_1m_total_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
|
||||
delay: up 1m down 5m multiplier 1.5 max 1h
|
||||
info: percentage of unparsed log lines over the last minute
|
||||
to: webmaster
|
||||
template: web_log_1m_unmatched
|
||||
on: web_log.excluded_requests
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Errors
|
||||
families: *
|
||||
lookup: sum -1m unaligned of unmatched
|
||||
calc: $this * 100 / $web_log_1m_total_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
|
||||
delay: up 1m down 5m multiplier 1.5 max 1h
|
||||
info: percentage of unparsed log lines over the last minute
|
||||
to: webmaster
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# high level response code alarms
|
||||
|
@ -248,66 +296,81 @@ families: *
|
|||
#
|
||||
# i.e. when there are at least 120 requests during the last minute
|
||||
|
||||
template: web_log_1m_requests
|
||||
on: web_log.type_requests
|
||||
families: *
|
||||
lookup: sum -1m unaligned
|
||||
calc: ($this == 0)?(1):($this)
|
||||
units: requests
|
||||
every: 10s
|
||||
info: number of HTTP requests in the last minute
|
||||
template: web_log_1m_requests
|
||||
on: web_log.type_requests
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: sum -1m unaligned
|
||||
calc: ($this == 0)?(1):($this)
|
||||
units: requests
|
||||
every: 10s
|
||||
info: number of HTTP requests in the last minute
|
||||
|
||||
template: web_log_1m_successful
|
||||
on: web_log.type_requests
|
||||
families: *
|
||||
lookup: sum -1m unaligned of success
|
||||
calc: $this * 100 / $web_log_1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
|
||||
to: webmaster
|
||||
template: web_log_1m_successful
|
||||
on: web_log.type_requests
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: sum -1m unaligned of success
|
||||
calc: $this * 100 / $web_log_1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
|
||||
to: webmaster
|
||||
|
||||
template: web_log_1m_redirects
|
||||
on: web_log.type_requests
|
||||
families: *
|
||||
lookup: sum -1m unaligned of redirect
|
||||
calc: $this * 100 / $web_log_1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of redirection HTTP requests over the last minute (3xx except 304)
|
||||
to: webmaster
|
||||
template: web_log_1m_redirects
|
||||
on: web_log.type_requests
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: sum -1m unaligned of redirect
|
||||
calc: $this * 100 / $web_log_1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of redirection HTTP requests over the last minute (3xx except 304)
|
||||
to: webmaster
|
||||
|
||||
template: web_log_1m_bad_requests
|
||||
on: web_log.type_requests
|
||||
families: *
|
||||
lookup: sum -1m unaligned of bad
|
||||
calc: $this * 100 / $web_log_1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of client error HTTP requests over the last minute (4xx except 401)
|
||||
to: webmaster
|
||||
template: web_log_1m_bad_requests
|
||||
on: web_log.type_requests
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Errors
|
||||
families: *
|
||||
lookup: sum -1m unaligned of bad
|
||||
calc: $this * 100 / $web_log_1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of client error HTTP requests over the last minute (4xx except 401)
|
||||
to: webmaster
|
||||
|
||||
template: web_log_1m_internal_errors
|
||||
on: web_log.type_requests
|
||||
families: *
|
||||
lookup: sum -1m unaligned of error
|
||||
calc: $this * 100 / $web_log_1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of server error HTTP requests over the last minute (5xx)
|
||||
to: webmaster
|
||||
template: web_log_1m_internal_errors
|
||||
on: web_log.type_requests
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Errors
|
||||
families: *
|
||||
lookup: sum -1m unaligned of error
|
||||
calc: $this * 100 / $web_log_1m_requests
|
||||
units: %
|
||||
every: 10s
|
||||
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of server error HTTP requests over the last minute (5xx)
|
||||
to: webmaster
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# web slow
|
||||
|
@ -319,28 +382,34 @@ families: *
|
|||
#
|
||||
# i.e. when there are at least 120 requests during the last minute
|
||||
|
||||
template: web_log_10m_response_time
|
||||
on: web_log.request_processing_time
|
||||
families: *
|
||||
lookup: average -10m unaligned of avg
|
||||
units: ms
|
||||
every: 30s
|
||||
info: average HTTP response time over the last 10 minutes
|
||||
template: web_log_10m_response_time
|
||||
on: web_log.request_processing_time
|
||||
class: System
|
||||
component: Web log
|
||||
type: Latency
|
||||
families: *
|
||||
lookup: average -10m unaligned of avg
|
||||
units: ms
|
||||
every: 30s
|
||||
info: average HTTP response time over the last 10 minutes
|
||||
|
||||
template: web_log_web_slow
|
||||
on: web_log.request_processing_time
|
||||
families: *
|
||||
lookup: average -1m unaligned of avg
|
||||
units: ms
|
||||
every: 10s
|
||||
green: 500
|
||||
red: 1000
|
||||
warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average HTTP response time over the last 1 minute
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
template: web_log_web_slow
|
||||
on: web_log.request_processing_time
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Latency
|
||||
families: *
|
||||
lookup: average -1m unaligned of avg
|
||||
units: ms
|
||||
every: 10s
|
||||
green: 500
|
||||
red: 1000
|
||||
warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average HTTP response time over the last 1 minute
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# web too many or too few requests
|
||||
|
@ -353,33 +422,42 @@ families: *
|
|||
# i.e. when there were at least 120 requests during the 5 minutes starting
|
||||
# at -10m and ending at -5m
|
||||
|
||||
template: web_log_5m_successful_old
|
||||
on: web_log.type_requests
|
||||
families: *
|
||||
lookup: average -5m at -5m unaligned of success
|
||||
units: requests/s
|
||||
every: 30s
|
||||
info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
|
||||
template: web_log_5m_successful_old
|
||||
on: web_log.type_requests
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: average -5m at -5m unaligned of success
|
||||
units: requests/s
|
||||
every: 30s
|
||||
info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
|
||||
|
||||
template: web_log_5m_successful
|
||||
on: web_log.type_requests
|
||||
families: *
|
||||
lookup: average -5m unaligned of success
|
||||
units: requests/s
|
||||
every: 30s
|
||||
info: average number of successful HTTP requests over the last 5 minutes
|
||||
template: web_log_5m_successful
|
||||
on: web_log.type_requests
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
lookup: average -5m unaligned of success
|
||||
units: requests/s
|
||||
every: 30s
|
||||
info: average number of successful HTTP requests over the last 5 minutes
|
||||
|
||||
template: web_log_5m_requests_ratio
|
||||
on: web_log.type_requests
|
||||
families: *
|
||||
calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
|
||||
units: %
|
||||
every: 30s
|
||||
warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
|
||||
crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
options: no-clear-notification
|
||||
info: ratio of successful HTTP requests over over the last 5 minutes, \
|
||||
compared with the previous 5 minutes \
|
||||
(clear notification for this alarm will not be sent)
|
||||
to: webmaster
|
||||
template: web_log_5m_requests_ratio
|
||||
on: web_log.type_requests
|
||||
class: Web Server
|
||||
component: Web log
|
||||
type: Workload
|
||||
families: *
|
||||
calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
|
||||
units: %
|
||||
every: 30s
|
||||
warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
|
||||
crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
options: no-clear-notification
|
||||
info: ratio of successful HTTP requests over the last 5 minutes, \
|
||||
compared with the previous 5 minutes \
|
||||
(clear notification for this alarm will not be sent)
|
||||
to: webmaster
|
||||
|
|
|
@ -1,24 +1,30 @@
|
|||
|
||||
# make sure whoisquery is running
|
||||
|
||||
template: whoisquery_last_collected_secs
|
||||
on: whoisquery.time_until_expiration
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 60s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: whoisquery_last_collected_secs
|
||||
on: whoisquery.time_until_expiration
|
||||
class: Other
|
||||
component: WHOIS
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 60s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
|
||||
template: whoisquery_days_until_expiration
|
||||
on: whoisquery.time_until_expiration
|
||||
calc: $expiry
|
||||
units: seconds
|
||||
every: 60s
|
||||
warn: $this < $days_until_expiration_warning*24*60*60
|
||||
crit: $this < $days_until_expiration_critical*24*60*60
|
||||
info: time until the domain name registration expires
|
||||
to: webmaster
|
||||
template: whoisquery_days_until_expiration
|
||||
on: whoisquery.time_until_expiration
|
||||
class: Other
|
||||
component: WHOIS
|
||||
type: Utilization
|
||||
calc: $expiry
|
||||
units: seconds
|
||||
every: 60s
|
||||
warn: $this < $days_until_expiration_warning*24*60*60
|
||||
crit: $this < $days_until_expiration_critical*24*60*60
|
||||
info: time until the domain name registration expires
|
||||
to: webmaster
|
||||
|
|
|
@ -3,128 +3,155 @@
|
|||
|
||||
## Availability
|
||||
|
||||
template: wmi_last_collected_secs
|
||||
on: cpu.collector_duration
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
template: wmi_last_collected_secs
|
||||
on: cpu.collector_duration
|
||||
class: Windows
|
||||
component: Availability
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
||||
## CPU
|
||||
|
||||
template: wmi_10min_cpu_usage
|
||||
on: wmi.cpu_utilization_total
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average CPU utilization over the last 10 minutes
|
||||
to: sysadmin
|
||||
template: wmi_10min_cpu_usage
|
||||
on: wmi.cpu_utilization_total
|
||||
class: Windows
|
||||
component: CPU
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
|
||||
units: %
|
||||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average CPU utilization over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
|
||||
## Memory
|
||||
|
||||
template: wmi_ram_in_use
|
||||
on: wmi.memory_utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($used) * 100 / ($used + $available)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: memory utilization
|
||||
to: sysadmin
|
||||
template: wmi_ram_in_use
|
||||
on: wmi.memory_utilization
|
||||
class: Windows
|
||||
component: Memory
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($used) * 100 / ($used + $available)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: memory utilization
|
||||
to: sysadmin
|
||||
|
||||
template: wmi_swap_in_use
|
||||
on: wmi.memory_swap_utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($used) * 100 / ($used + $available)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: swap memory utilization
|
||||
to: sysadmin
|
||||
template: wmi_swap_in_use
|
||||
on: wmi.memory_swap_utilization
|
||||
class: Windows
|
||||
component: Memory
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($used) * 100 / ($used + $available)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: swap memory utilization
|
||||
to: sysadmin
|
||||
|
||||
|
||||
## Network
|
||||
|
||||
template: wmi_inbound_packets_discarded
|
||||
on: wmi.net_discarded
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of inbound
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of inbound discarded packets for the network interface in the last 10 minutes
|
||||
to: sysadmin
|
||||
template: wmi_inbound_packets_discarded
|
||||
on: wmi.net_discarded
|
||||
class: Windows
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of inbound
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of inbound discarded packets for the network interface in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: wmi_outbound_packets_discarded
|
||||
on: wmi.net_discarded
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of outbound
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of outbound discarded packets for the network interface in the last 10 minutes
|
||||
to: sysadmin
|
||||
template: wmi_outbound_packets_discarded
|
||||
on: wmi.net_discarded
|
||||
class: Windows
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of outbound
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of outbound discarded packets for the network interface in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: wmi_inbound_packets_errors
|
||||
on: wmi.net_errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of inbound
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of inbound errors for the network interface in the last 10 minutes
|
||||
to: sysadmin
|
||||
template: wmi_inbound_packets_errors
|
||||
on: wmi.net_errors
|
||||
class: Windows
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of inbound
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of inbound errors for the network interface in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: wmi_outbound_packets_errors
|
||||
on: wmi.net_errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of outbound
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of outbound errors for the network interface in the last 10 minutes
|
||||
to: sysadmin
|
||||
template: wmi_outbound_packets_errors
|
||||
on: wmi.net_errors
|
||||
class: Windows
|
||||
component: Network
|
||||
type: Errors
|
||||
os: linux
|
||||
hosts: *
|
||||
families: *
|
||||
lookup: sum -10m unaligned absolute match-names of outbound
|
||||
units: packets
|
||||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of outbound errors for the network interface in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
|
||||
## Disk
|
||||
|
||||
template: wmi_disk_in_use
|
||||
on: wmi.logical_disk_utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($used) * 100 / ($used + $free)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: disk space utilization
|
||||
to: sysadmin
|
||||
template: wmi_disk_in_use
|
||||
on: wmi.logical_disk_utilization
|
||||
class: Windows
|
||||
component: Disk
|
||||
type: Utilization
|
||||
os: linux
|
||||
hosts: *
|
||||
calc: ($used) * 100 / ($used + $free)
|
||||
units: %
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: disk space utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,32 +1,41 @@
|
|||
|
||||
# make sure x509check is running
|
||||
|
||||
template: x509check_last_collected_secs
|
||||
on: x509check.time_until_expiration
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 60s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: x509check_last_collected_secs
|
||||
on: x509check.time_until_expiration
|
||||
class: Certificates
|
||||
component: x509 certificates
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 60s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
|
||||
template: x509check_days_until_expiration
|
||||
on: x509check.time_until_expiration
|
||||
calc: $expiry
|
||||
units: seconds
|
||||
every: 60s
|
||||
warn: $this < $days_until_expiration_warning*24*60*60
|
||||
crit: $this < $days_until_expiration_critical*24*60*60
|
||||
info: time until x509 certificate expires
|
||||
to: webmaster
|
||||
template: x509check_days_until_expiration
|
||||
on: x509check.time_until_expiration
|
||||
class: Certificates
|
||||
component: x509 certificates
|
||||
type: Latency
|
||||
calc: $expiry
|
||||
units: seconds
|
||||
every: 60s
|
||||
warn: $this < $days_until_expiration_warning*24*60*60
|
||||
crit: $this < $days_until_expiration_critical*24*60*60
|
||||
info: time until x509 certificate expires
|
||||
to: webmaster
|
||||
|
||||
template: x509check_revocation_status
|
||||
on: x509check.revocation_status
|
||||
calc: $revoked
|
||||
every: 60s
|
||||
crit: $this != nan AND $this != 0
|
||||
info: x509 certificate revocation status (0: revoked, 1: valid)
|
||||
to: webmaster
|
||||
template: x509check_revocation_status
|
||||
on: x509check.revocation_status
|
||||
class: Certificates
|
||||
component: x509 certificates
|
||||
type: Errors
|
||||
calc: $revoked
|
||||
every: 60s
|
||||
crit: $this != nan AND $this != 0
|
||||
info: x509 certificate revocation status (0: revoked, 1: valid)
|
||||
to: webmaster
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
|
||||
alarm: zfs_memory_throttle
|
||||
on: zfs.memory_ops
|
||||
lookup: sum -10m unaligned absolute of throttled
|
||||
units: events
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of times ZFS had to limit the ARC growth in the last 10 minutes
|
||||
to: sysadmin
|
||||
alarm: zfs_memory_throttle
|
||||
on: zfs.memory_ops
|
||||
class: System
|
||||
component: File system
|
||||
type: Utilization
|
||||
lookup: sum -10m unaligned absolute of throttled
|
||||
units: events
|
||||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of times ZFS had to limit the ARC growth in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
|
|
@ -1,14 +1,17 @@
|
|||
|
||||
# make sure zookeeper is running
|
||||
|
||||
template: zookeeper_last_collected_secs
|
||||
on: zookeeper.requests
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
template: zookeeper_last_collected_secs
|
||||
on: zookeeper.requests
|
||||
class: KV Storage
|
||||
component: ZooKeeper
|
||||
type: Latency
|
||||
calc: $now - $last_collected_t
|
||||
units: seconds ago
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
||||
|
|
|
@ -37,20 +37,6 @@ extern unsigned int default_health_enabled;
|
|||
#define HEALTH_LISTEN_BACKLOG 4096
|
||||
#endif
|
||||
|
||||
#define HEALTH_ON_KEY "on"
|
||||
#define HEALTH_EVERY_KEY "every"
|
||||
#define HEALTH_GREEN_KEY "green"
|
||||
#define HEALTH_RED_KEY "red"
|
||||
#define HEALTH_WARN_KEY "warn"
|
||||
#define HEALTH_CRIT_KEY "crit"
|
||||
#define HEALTH_EXEC_KEY "exec"
|
||||
#define HEALTH_RECIPIENT_KEY "to"
|
||||
#define HEALTH_UNITS_KEY "units"
|
||||
#define HEALTH_INFO_KEY "info"
|
||||
#define HEALTH_DELAY_KEY "delay"
|
||||
#define HEALTH_OPTIONS_KEY "options"
|
||||
#define HEALTH_FOREACH_KEY "foreach"
|
||||
|
||||
#define HEALTH_SILENCERS_MAX_FILE_LEN 10000
|
||||
|
||||
extern char *silencers_filename;
|
||||
|
@ -81,6 +67,9 @@ extern ALARM_ENTRY* health_create_alarm_entry(
|
|||
const char *name,
|
||||
const char *chart,
|
||||
const char *family,
|
||||
const char *class,
|
||||
const char *component,
|
||||
const char *type,
|
||||
const char *exec,
|
||||
const char *recipient,
|
||||
time_t duration,
|
||||
|
|
|
@ -23,10 +23,14 @@
|
|||
#define HEALTH_RECIPIENT_KEY "to"
|
||||
#define HEALTH_UNITS_KEY "units"
|
||||
#define HEALTH_INFO_KEY "info"
|
||||
#define HEALTH_CLASS_KEY "class"
|
||||
#define HEALTH_COMPONENT_KEY "component"
|
||||
#define HEALTH_TYPE_KEY "type"
|
||||
#define HEALTH_DELAY_KEY "delay"
|
||||
#define HEALTH_OPTIONS_KEY "options"
|
||||
#define HEALTH_REPEAT_KEY "repeat"
|
||||
#define HEALTH_HOST_LABEL_KEY "host labels"
|
||||
#define HEALTH_FOREACH_KEY "foreach"
|
||||
|
||||
static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
|
||||
if(!rc->chart) {
|
||||
|
@ -499,6 +503,9 @@ static int health_readfile(const char *filename, void *data) {
|
|||
hash_lookup = 0,
|
||||
hash_units = 0,
|
||||
hash_info = 0,
|
||||
hash_class = 0,
|
||||
hash_component = 0,
|
||||
hash_type = 0,
|
||||
hash_recipient = 0,
|
||||
hash_delay = 0,
|
||||
hash_options = 0,
|
||||
|
@ -526,6 +533,9 @@ static int health_readfile(const char *filename, void *data) {
|
|||
hash_every = simple_uhash(HEALTH_EVERY_KEY);
|
||||
hash_units = simple_hash(HEALTH_UNITS_KEY);
|
||||
hash_info = simple_hash(HEALTH_INFO_KEY);
|
||||
hash_class = simple_uhash(HEALTH_CLASS_KEY);
|
||||
hash_component = simple_uhash(HEALTH_COMPONENT_KEY);
|
||||
hash_type = simple_uhash(HEALTH_TYPE_KEY);
|
||||
hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
|
||||
hash_delay = simple_uhash(HEALTH_DELAY_KEY);
|
||||
hash_options = simple_uhash(HEALTH_OPTIONS_KEY);
|
||||
|
@ -696,6 +706,39 @@ static int health_readfile(const char *filename, void *data) {
|
|||
rc->chart = strdupz(value);
|
||||
rc->hash_chart = simple_hash(rc->chart);
|
||||
}
|
||||
else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) {
|
||||
if(rc->class) {
|
||||
if(strcmp(rc->class, value) != 0)
|
||||
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
|
||||
line, filename, rc->name, key, rc->class, value, value);
|
||||
|
||||
freez(rc->class);
|
||||
}
|
||||
rc->class = strdupz(value);
|
||||
strip_quotes(rc->class);
|
||||
}
|
||||
else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) {
|
||||
if(rc->component) {
|
||||
if(strcmp(rc->component, value) != 0)
|
||||
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
|
||||
line, filename, rc->name, key, rc->component, value, value);
|
||||
|
||||
freez(rc->component);
|
||||
}
|
||||
rc->component = strdupz(value);
|
||||
strip_quotes(rc->component);
|
||||
}
|
||||
else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) {
|
||||
if(rc->type) {
|
||||
if(strcmp(rc->type, value) != 0)
|
||||
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
|
||||
line, filename, rc->name, key, rc->type, value, value);
|
||||
|
||||
freez(rc->type);
|
||||
}
|
||||
rc->type = strdupz(value);
|
||||
strip_quotes(rc->type);
|
||||
}
|
||||
else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
|
||||
health_parse_db_lookup(line, filename, value, &rc->group, &rc->after, &rc->before,
|
||||
&rc->update_every, &rc->options, &rc->dimensions, &rc->foreachdim);
|
||||
|
@ -848,6 +891,39 @@ static int health_readfile(const char *filename, void *data) {
|
|||
rt->context = strdupz(value);
|
||||
rt->hash_context = simple_hash(rt->context);
|
||||
}
|
||||
else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) {
|
||||
if(rt->class) {
|
||||
if(strcmp(rt->class, value) != 0)
|
||||
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
|
||||
line, filename, rt->name, key, rt->class, value, value);
|
||||
|
||||
freez(rt->class);
|
||||
}
|
||||
rt->class = strdupz(value);
|
||||
strip_quotes(rt->class);
|
||||
}
|
||||
else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) {
|
||||
if(rt->component) {
|
||||
if(strcmp(rt->component, value) != 0)
|
||||
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
|
||||
line, filename, rt->name, key, rt->component, value, value);
|
||||
|
||||
freez(rt->component);
|
||||
}
|
||||
rt->component = strdupz(value);
|
||||
strip_quotes(rt->component);
|
||||
}
|
||||
else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) {
|
||||
if(rt->type) {
|
||||
if(strcmp(rt->type, value) != 0)
|
||||
error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
|
||||
line, filename, rt->name, key, rt->type, value, value);
|
||||
|
||||
freez(rt->type);
|
||||
}
|
||||
rt->type = strdupz(value);
|
||||
strip_quotes(rt->type);
|
||||
}
|
||||
else if(hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) {
|
||||
freez(rt->family_match);
|
||||
simple_pattern_free(rt->family_pattern);
|
||||
|
|
|
@ -23,6 +23,9 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
|
|||
"\t\t\"name\": \"%s\",\n"
|
||||
"\t\t\"chart\": \"%s\",\n"
|
||||
"\t\t\"family\": \"%s\",\n"
|
||||
"\t\t\"class\": \"%s\",\n"
|
||||
"\t\t\"component\": \"%s\",\n"
|
||||
"\t\t\"type\": \"%s\",\n"
|
||||
"\t\t\"processed\": %s,\n"
|
||||
"\t\t\"updated\": %s,\n"
|
||||
"\t\t\"exec_run\": %lu,\n"
|
||||
|
@ -52,6 +55,9 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
|
|||
, ae->name
|
||||
, ae->chart
|
||||
, ae->family
|
||||
, ae->class?ae->class:"Unknown"
|
||||
, ae->component?ae->component:"Unknown"
|
||||
, ae->type?ae->type:"Unknown"
|
||||
, (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false"
|
||||
, (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false"
|
||||
, (unsigned long)ae->exec_run_timestamp
|
||||
|
@ -76,7 +82,22 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
|
|||
, (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"
|
||||
);
|
||||
|
||||
health_string2json(wb, "\t\t", "info", ae->info?ae->info:"", ",\n");
|
||||
char *replaced_info = NULL;
|
||||
if (likely(ae->info)) {
|
||||
char *m = NULL;
|
||||
replaced_info = strdupz(ae->info);
|
||||
size_t pos = 0;
|
||||
while ((m = strstr(replaced_info + pos, "$family"))) {
|
||||
char *buf = NULL;
|
||||
pos = m - replaced_info;
|
||||
buf = find_and_replace(replaced_info, "$family", ae->family ? ae->family : "", m);
|
||||
freez(replaced_info);
|
||||
replaced_info = strdupz(buf);
|
||||
freez(buf);
|
||||
}
|
||||
}
|
||||
|
||||
health_string2json(wb, "\t\t", "info", replaced_info?replaced_info:"", ",\n");
|
||||
|
||||
if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) {
|
||||
buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n");
|
||||
|
@ -91,6 +112,8 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
|
|||
buffer_strcat(wb, "\n");
|
||||
|
||||
buffer_strcat(wb, "\t}");
|
||||
|
||||
freez(replaced_info);
|
||||
}
|
||||
|
||||
void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) {
|
||||
|
@ -140,12 +163,30 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
|
|||
char value_string[100 + 1];
|
||||
format_value_and_unit(value_string, 100, rc->value, rc->units, -1);
|
||||
|
||||
char *replaced_info = NULL;
|
||||
if (likely(rc->info)) {
|
||||
char *m;
|
||||
replaced_info = strdupz(rc->info);
|
||||
size_t pos = 0;
|
||||
while ((m = strstr(replaced_info + pos, "$family"))) {
|
||||
char *buf = NULL;
|
||||
pos = m - replaced_info;
|
||||
buf = find_and_replace(replaced_info, "$family", (rc->rrdset && rc->rrdset->family) ? rc->rrdset->family : "", m);
|
||||
freez(replaced_info);
|
||||
replaced_info = strdupz(buf);
|
||||
freez(buf);
|
||||
}
|
||||
}
|
||||
|
||||
buffer_sprintf(wb,
|
||||
"\t\t\"%s.%s\": {\n"
|
||||
"\t\t\t\"id\": %lu,\n"
|
||||
"\t\t\t\"name\": \"%s\",\n"
|
||||
"\t\t\t\"chart\": \"%s\",\n"
|
||||
"\t\t\t\"family\": \"%s\",\n"
|
||||
"\t\t\t\"class\": \"%s\",\n"
|
||||
"\t\t\t\"component\": \"%s\",\n"
|
||||
"\t\t\t\"type\": \"%s\",\n"
|
||||
"\t\t\t\"active\": %s,\n"
|
||||
"\t\t\t\"disabled\": %s,\n"
|
||||
"\t\t\t\"silenced\": %s,\n"
|
||||
|
@ -174,6 +215,9 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
|
|||
, rc->name
|
||||
, rc->chart
|
||||
, (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
|
||||
, rc->class?rc->class:"Unknown"
|
||||
, rc->component?rc->component:"Unknown"
|
||||
, rc->type?rc->type:"Unknown"
|
||||
, (rc->rrdset)?"true":"false"
|
||||
, (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false"
|
||||
, (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
|
||||
|
@ -181,7 +225,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
|
|||
, rc->recipient?rc->recipient:host->health_default_recipient
|
||||
, rc->source
|
||||
, rc->units?rc->units:""
|
||||
, rc->info?rc->info:""
|
||||
, replaced_info?replaced_info:""
|
||||
, rrdcalc_status2string(rc->status)
|
||||
, (unsigned long)rc->last_status_change
|
||||
, (unsigned long)rc->last_updated
|
||||
|
@ -252,6 +296,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
|
|||
buffer_strcat(wb, "\n");
|
||||
|
||||
buffer_strcat(wb, "\t\t}");
|
||||
|
||||
freez(replaced_info);
|
||||
}
|
||||
|
||||
//void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
|
||||
|
|
|
@ -111,6 +111,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
|
|||
"\t%d\t%d\t%d\t%d"
|
||||
"\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO
|
||||
"\t%016lx"
|
||||
"\t%s\t%s\t%s"
|
||||
"\n"
|
||||
, (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
|
||||
, host->hostname
|
||||
|
@ -145,6 +146,9 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
|
|||
, ae->new_value
|
||||
, ae->old_value
|
||||
, (uint64_t)ae->last_repeat
|
||||
, (ae->class)?ae->class:"Unknown"
|
||||
, (ae->component)?ae->component:"Unknown"
|
||||
, (ae->type)?ae->type:"Unknown"
|
||||
) < 0))
|
||||
error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename);
|
||||
else {
|
||||
|
@ -191,7 +195,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
|
|||
host->health_log_entries_written++;
|
||||
line++;
|
||||
|
||||
int max_entries = 30, entries = 0;
|
||||
int max_entries = 33, entries = 0;
|
||||
char *pointers[max_entries];
|
||||
|
||||
pointers[entries++] = s++;
|
||||
|
@ -364,6 +368,20 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
|
|||
|
||||
ae->last_repeat = last_repeat;
|
||||
|
||||
if (likely(entries > 28)) {
|
||||
freez(ae->class);
|
||||
ae->class = strdupz(pointers[28]);
|
||||
if(!*ae->class) { freez(ae->class); ae->class = NULL; }
|
||||
|
||||
freez(ae->component);
|
||||
ae->component = strdupz(pointers[29]);
|
||||
if(!*ae->component) { freez(ae->component); ae->component = NULL; }
|
||||
|
||||
freez(ae->type);
|
||||
ae->type = strdupz(pointers[30]);
|
||||
if(!*ae->type) { freez(ae->type); ae->type = NULL; }
|
||||
}
|
||||
|
||||
char value_string[100 + 1];
|
||||
freez(ae->old_value_string);
|
||||
freez(ae->new_value_string);
|
||||
|
@ -442,6 +460,9 @@ inline ALARM_ENTRY* health_create_alarm_entry(
|
|||
const char *name,
|
||||
const char *chart,
|
||||
const char *family,
|
||||
const char *class,
|
||||
const char *component,
|
||||
const char *type,
|
||||
const char *exec,
|
||||
const char *recipient,
|
||||
time_t duration,
|
||||
|
@ -469,11 +490,19 @@ inline ALARM_ENTRY* health_create_alarm_entry(
|
|||
if(family)
|
||||
ae->family = strdupz(family);
|
||||
|
||||
if (class)
|
||||
ae->class = strdupz(class);
|
||||
|
||||
if (component)
|
||||
ae->component = strdupz(component);
|
||||
|
||||
if (type)
|
||||
ae->type = strdupz(type);
|
||||
|
||||
if(exec) ae->exec = strdupz(exec);
|
||||
if(recipient) ae->recipient = strdupz(recipient);
|
||||
if(source) ae->source = strdupz(source);
|
||||
if(units) ae->units = strdupz(units);
|
||||
if(info) ae->info = strdupz(info);
|
||||
|
||||
ae->unique_id = host->health_log.next_log_id++;
|
||||
ae->alarm_id = alarm_id;
|
||||
|
@ -486,6 +515,24 @@ inline ALARM_ENTRY* health_create_alarm_entry(
|
|||
ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
|
||||
ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));
|
||||
|
||||
char *replaced_info = NULL;
|
||||
if (likely(info)) {
|
||||
char *m;
|
||||
replaced_info = strdupz(info);
|
||||
size_t pos = 0;
|
||||
while ((m = strstr(replaced_info + pos, "$family"))) {
|
||||
char *buf = NULL;
|
||||
pos = m - replaced_info;
|
||||
buf = find_and_replace(replaced_info, "$family", (ae->family) ? ae->family : "", m);
|
||||
freez(replaced_info);
|
||||
replaced_info = strdupz(buf);
|
||||
freez(buf);
|
||||
}
|
||||
}
|
||||
|
||||
if(replaced_info) ae->info = strdupz(replaced_info);
|
||||
freez(replaced_info);
|
||||
|
||||
ae->old_status = old_status;
|
||||
ae->new_status = new_status;
|
||||
ae->duration = duration;
|
||||
|
@ -548,6 +595,9 @@ inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) {
|
|||
freez(ae->name);
|
||||
freez(ae->chart);
|
||||
freez(ae->family);
|
||||
freez(ae->class);
|
||||
freez(ae->component);
|
||||
freez(ae->type);
|
||||
freez(ae->exec);
|
||||
freez(ae->recipient);
|
||||
freez(ae->source);
|
||||
|
|
|
@ -1492,3 +1492,33 @@ char *read_by_filename(char *filename, long *file_size)
|
|||
*file_size = size;
|
||||
return contents;
|
||||
}
|
||||
|
||||
/*
 * Return a newly allocated copy of 'src' in which the strlen(find) bytes
 * starting at 'where' are replaced by 'replace'.
 *
 *   src     - NUL-terminated input string (must not be NULL)
 *   find    - token being replaced; only its length is used here
 *   replace - NUL-terminated replacement text (must not be NULL)
 *   where   - pointer into 'src' marking the start of the span to replace,
 *             typically the result of strstr(src, find); may be NULL
 *
 * When 'where' is NULL, or does not leave at least strlen(find) bytes of
 * 'src' after it, no replacement is made and a plain copy of 'src' is
 * returned instead of reading past the end of the string.
 *
 * The result is allocated with mallocz(); the callers in this file release
 * it with freez().
 */
char *find_and_replace(const char *src, const char *find, const char *replace, const char *where)
{
    size_t src_len = strlen(src);
    size_t find_len = strlen(find);
    size_t repl_len = strlen(replace);
    size_t prefix_len = 0;
    int replace_span = 0;

    if (likely(where)) {
        /* A 'where' before 'src' wraps to a huge offset and is rejected below. */
        prefix_len = (size_t)(where - src);
        replace_span = (prefix_len <= src_len && find_len <= src_len - prefix_len);
    }

    if (!replace_span) {
        /* Nothing (valid) to replace: hand back an exact duplicate. */
        char *copy = mallocz(src_len + 1);
        memcpy(copy, src, src_len + 1);
        return copy;
    }

    size_t tail_len = src_len - prefix_len - find_len;
    char *value = mallocz(prefix_len + repl_len + tail_len + 1);
    char *dst = value;

    /* Text before the replaced span. */
    memcpy(dst, src, prefix_len);
    dst += prefix_len;

    /* The replacement itself. */
    memcpy(dst, replace, repl_len);
    dst += repl_len;

    /* Remainder of the source, including its NUL terminator. */
    memcpy(dst, src + prefix_len + find_len, tail_len + 1);

    return value;
}
|
||||
|
|
|
@ -279,6 +279,7 @@ extern void recursive_config_double_dir_load(
|
|||
, size_t depth
|
||||
);
|
||||
extern char *read_by_filename(char *filename, long *file_size);
|
||||
extern char *find_and_replace(const char *src, const char *find, const char *replace, const char *where);
|
||||
|
||||
/* fix for alpine linux */
|
||||
#ifndef RUSAGE_THREAD
|
||||
|
|
Loading…
Add table
Reference in a new issue