mirror of
https://github.com/netdata/netdata.git
synced 2025-04-06 14:35:32 +00:00
Add summary to alerts configurations (#16129)
* add to web log alerts
* add more
* more adds
* more adds
* more
* more
* more
* more
* more
* more
* more
* more
* updates
---------
Co-authored-by: ilyam8 <ilya@netdata.cloud>
This commit is contained in:
parent
e06b469cd0
commit
c317e4d380
67 changed files with 339 additions and 247 deletions
health/health.d
adaptec_raid.conf apcupsd.conf bcache.conf beanstalkd.conf bind_rndc.conf boinc.conf btrfs.conf ceph.conf cgroups.conf cockroachdb.conf consul.conf cpu.conf dbengine.conf disks.conf dns_query.conf dnsmasq_dhcp.conf elasticsearch.conf entropy.conf exporting.conf file_descriptors.conf gearman.conf go.d.plugin.conf haproxy.conf hdfs.conf httpcheck.conf ioping.conf ipfs.conf ipmi.conf kubelet.conf load.conf mdstat.conf megacli.conf memcached.conf memory.conf ml.conf mysql.conf net.conf netfilter.conf nut.conf pihole.conf plugin.conf portcheck.conf postgres.conf processes.conf python.d.plugin.conf ram.conf retroshare.conf riakkv.conf scaleio.conf softnet.conf swap.conf synchronization.conf systemdunits.conf tcp_conn.conf tcp_listen.conf tcp_mem.conf tcp_orphans.conf tcp_resets.conf udp_errors.conf unbound.conf vcsa.conf vsphere.conf web_log.conf whoisquery.conf windows.conf x509check.conf zfs.conf
|
@ -11,7 +11,8 @@ component: RAID
|
|||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: logical device status is failed or degraded
|
||||
summary: Adaptec raid logical device status
|
||||
info: Logical device status is failed or degraded
|
||||
to: sysadmin
|
||||
|
||||
# physical device state check
|
||||
|
@ -26,5 +27,6 @@ component: RAID
|
|||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: physical device state is not online
|
||||
summary: Adaptec raid physical device state
|
||||
info: Physical device state is not online
|
||||
to: sysadmin
|
||||
|
|
|
@ -12,7 +12,8 @@ component: UPS
|
|||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
delay: down 10m multiplier 1.5 max 1h
|
||||
info: average UPS load over the last 10 minutes
|
||||
summary: APC UPS load
|
||||
info: APC UPS average load over the last 10 minutes
|
||||
to: sitemgr
|
||||
|
||||
# Discussion in https://github.com/netdata/netdata/pull/3928:
|
||||
|
@ -30,7 +31,8 @@ component: UPS
|
|||
warn: $this < 100
|
||||
crit: $this < 40
|
||||
delay: down 10m multiplier 1.5 max 1h
|
||||
info: average UPS charge over the last minute
|
||||
summary: APC UPS battery charge
|
||||
info: APC UPS average battery charge over the last minute
|
||||
to: sitemgr
|
||||
|
||||
template: apcupsd_last_collected_secs
|
||||
|
@ -43,5 +45,6 @@ component: UPS device
|
|||
units: seconds ago
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
summary: APC UPS last collection
|
||||
info: APC UPS number of seconds since the last successful data collection
|
||||
to: sitemgr
|
||||
|
|
|
@ -9,7 +9,8 @@ component: Disk
|
|||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: up 2m down 1h multiplier 1.5 max 2h
|
||||
info: number of times data was read from the cache, \
|
||||
summary: Bcache cache read race errors
|
||||
info: Number of times data was read from the cache, \
|
||||
the bucket was reused and invalidated in the last 10 minutes \
|
||||
(when this occurs the data is reread from the backing device)
|
||||
to: silent
|
||||
|
@ -24,6 +25,7 @@ component: Disk
|
|||
every: 1m
|
||||
warn: $this > 75
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
info: percentage of cache space used for dirty data and metadata \
|
||||
summary: Bcache cache used space
|
||||
info: Percentage of cache space used for dirty data and metadata \
|
||||
(this usually means your SSD cache is too small)
|
||||
to: silent
|
||||
|
|
|
@ -10,7 +10,8 @@ component: Beanstalk
|
|||
every: 10s
|
||||
warn: $this > 3
|
||||
delay: up 0 down 5m multiplier 1.2 max 1h
|
||||
info: number of buried jobs across all tubes. \
|
||||
summary: Beanstalk buried jobs
|
||||
info: Number of buried jobs across all tubes. \
|
||||
You need to manually kick them so they can be processed. \
|
||||
Presence of buried jobs in a tube does not affect new jobs.
|
||||
to: sysadmin
|
||||
|
|
|
@ -7,5 +7,6 @@ component: BIND
|
|||
every: 60
|
||||
calc: $stats_size
|
||||
warn: $this > 512
|
||||
summary: BIND statistics file size
|
||||
info: BIND statistics-file size
|
||||
to: sysadmin
|
||||
|
|
|
@ -13,7 +13,8 @@ component: BOINC
|
|||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: up 1m down 5m multiplier 1.5 max 1h
|
||||
info: average number of compute errors over the last 10 minutes
|
||||
summary: BOINC compute errors
|
||||
info: Average number of compute errors over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
# Warn on lots of upload errors
|
||||
|
@ -29,7 +30,8 @@ component: BOINC
|
|||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: up 1m down 5m multiplier 1.5 max 1h
|
||||
info: average number of failed uploads over the last 10 minutes
|
||||
summary: BOINC failed uploads
|
||||
info: Average number of failed uploads over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
# Warn on the task queue being empty
|
||||
|
@ -45,7 +47,8 @@ component: BOINC
|
|||
every: 1m
|
||||
warn: $this < 1
|
||||
delay: up 5m down 10m multiplier 1.5 max 1h
|
||||
info: average number of total tasks over the last 10 minutes
|
||||
summary: BOINC total tasks
|
||||
info: Average number of total tasks over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
# Warn on no active tasks with a non-empty queue
|
||||
|
@ -62,5 +65,6 @@ component: BOINC
|
|||
every: 1m
|
||||
warn: $this < 1
|
||||
delay: up 5m down 10m multiplier 1.5 max 1h
|
||||
info: average number of active tasks over the last 10 minutes
|
||||
summary: BOINC active tasks
|
||||
info: Average number of active tasks over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
|
|
@ -11,7 +11,7 @@ component: File system
|
|||
every: 10s
|
||||
warn: $this > (($status == $CRITICAL) ? (95) : (98))
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
summary: BTRFS space allocated
|
||||
summary: BTRFS allocated space utilization
|
||||
info: Percentage of allocated BTRFS physical disk space
|
||||
to: silent
|
||||
|
||||
|
@ -28,7 +28,7 @@ component: File system
|
|||
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
|
||||
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
|
||||
delay: up 1m down 15m multiplier 1.5 max 1h
|
||||
summary: BTRFS space utilization
|
||||
summary: BTRFS data space utilization
|
||||
info: Utilization of BTRFS data space
|
||||
to: sysadmin
|
||||
|
||||
|
|
|
@ -11,5 +11,6 @@ component: Ceph
|
|||
warn: $this > (($status >= $WARNING ) ? (85) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 5m multiplier 1.2 max 1h
|
||||
info: cluster disk space utilization
|
||||
summary: Ceph cluster disk space utilization
|
||||
info: Ceph cluster disk space utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -13,8 +13,8 @@ component: CPU
|
|||
every: 1m
|
||||
warn: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Cgroup CPU utilization
|
||||
info: Average cgroup CPU utilization over the last 10 minutes
|
||||
summary: Cgroup ${label:cgroup_name} CPU utilization
|
||||
info: Cgroup ${label:cgroup_name} average CPU utilization over the last 10 minutes
|
||||
to: silent
|
||||
|
||||
template: cgroup_ram_in_use
|
||||
|
@ -30,47 +30,10 @@ component: Memory
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Cgroup ram utilization
|
||||
info: Cgroup memory utilization
|
||||
summary: Cgroup ${label:cgroup_name} memory utilization
|
||||
info: Cgroup ${label:cgroup_name} memory utilization
|
||||
to: silent
|
||||
|
||||
# FIXME COMMENTED DUE TO A BUG IN NETDATA
|
||||
## -----------------------------------------------------------------------------
|
||||
## check for packet storms
|
||||
#
|
||||
## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
|
||||
## 2. do the same for the last 10s
|
||||
## 3. raise an alarm if the latter is 10x or 20x the first
|
||||
## we assume the minimum packet storm should at least have
|
||||
## 10000 packets/s, average of the last 10 seconds
|
||||
#
|
||||
# template: cgroup_1m_received_packets_rate
|
||||
# on: cgroup.net_packets
|
||||
# class: Workload
|
||||
# type: Cgroups
|
||||
#component: Network
|
||||
# hosts: *
|
||||
# lookup: average -1m unaligned of received
|
||||
# units: packets
|
||||
# every: 10s
|
||||
# info: average number of packets received by the network interface ${label:device} over the last minute
|
||||
#
|
||||
# template: cgroup_10s_received_packets_storm
|
||||
# on: cgroup.net_packets
|
||||
# class: Workload
|
||||
# type: Cgroups
|
||||
#component: Network
|
||||
# hosts: *
|
||||
# lookup: average -10s unaligned of received
|
||||
# calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
|
||||
# every: 10s
|
||||
# units: %
|
||||
# warn: $this > (($status >= $WARNING)?(200):(5000))
|
||||
# options: no-clear-notification
|
||||
# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
|
||||
# compared to the rate over the last minute
|
||||
# to: sysadmin
|
||||
#
|
||||
# ---------------------------------K8s containers--------------------------------------------
|
||||
|
||||
template: k8s_cgroup_10min_cpu_usage
|
||||
|
@ -85,7 +48,7 @@ component: CPU
|
|||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Container ${label:k8s_container_name} CPU utilization
|
||||
summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} CPU utilization
|
||||
info: Container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
|
||||
average CPU utilization over the last 10 minutes
|
||||
to: silent
|
||||
|
@ -103,43 +66,7 @@ component: Memory
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Container ${label:k8s_container_name} ram utilization
|
||||
summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} memory utilization
|
||||
info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
|
||||
memory utilization
|
||||
to: silent
|
||||
|
||||
# check for packet storms
|
||||
|
||||
# FIXME COMMENTED DUE TO A BUG IN NETDATA
|
||||
## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
|
||||
## 2. do the same for the last 10s
|
||||
## 3. raise an alarm if the latter is 10x or 20x the first
|
||||
## we assume the minimum packet storm should at least have
|
||||
## 10000 packets/s, average of the last 10 seconds
|
||||
#
|
||||
# template: k8s_cgroup_1m_received_packets_rate
|
||||
# on: k8s.cgroup.net_packets
|
||||
# class: Workload
|
||||
# type: Cgroups
|
||||
#component: Network
|
||||
# hosts: *
|
||||
# lookup: average -1m unaligned of received
|
||||
# units: packets
|
||||
# every: 10s
|
||||
# info: average number of packets received by the network interface ${label:device} over the last minute
|
||||
#
|
||||
# template: k8s_cgroup_10s_received_packets_storm
|
||||
# on: k8s.cgroup.net_packets
|
||||
# class: Workload
|
||||
# type: Cgroups
|
||||
#component: Network
|
||||
# hosts: *
|
||||
# lookup: average -10s unaligned of received
|
||||
# calc: $this * 100 / (($k8s_cgroup_10s_received_packets_storm < 1000)?(1000):($k8s_cgroup_10s_received_packets_storm))
|
||||
# every: 10s
|
||||
# units: %
|
||||
# warn: $this > (($status >= $WARNING)?(200):(5000))
|
||||
# options: no-clear-notification
|
||||
# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
|
||||
# compared to the rate over the last minute
|
||||
# to: sysadmin
|
||||
|
|
|
@ -12,7 +12,8 @@ component: CockroachDB
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: storage capacity utilization
|
||||
summary: CockroachDB storage space utilization
|
||||
info: Storage capacity utilization
|
||||
to: dba
|
||||
|
||||
template: cockroachdb_used_usable_storage_capacity
|
||||
|
@ -26,7 +27,8 @@ component: CockroachDB
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: storage usable space utilization
|
||||
summary: CockroachDB usable storage space utilization
|
||||
info: Storage usable space utilization
|
||||
to: dba
|
||||
|
||||
# Replication
|
||||
|
@ -41,7 +43,8 @@ component: CockroachDB
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of ranges with fewer live replicas than needed for quorum
|
||||
summary: CockroachDB unavailable replication
|
||||
info: Number of ranges with fewer live replicas than needed for quorum
|
||||
to: dba
|
||||
|
||||
template: cockroachdb_underreplicated_ranges
|
||||
|
@ -54,7 +57,8 @@ component: CockroachDB
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of ranges with fewer live replicas than the replication target
|
||||
summary: CockroachDB under-replicated
|
||||
info: Number of ranges with fewer live replicas than the replication target
|
||||
to: dba
|
||||
|
||||
# FD
|
||||
|
@ -69,5 +73,6 @@ component: CockroachDB
|
|||
every: 10s
|
||||
warn: $this > 80
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: open file descriptors utilization (against softlimit)
|
||||
summary: CockroachDB file descriptors utilization
|
||||
info: Open file descriptors utilization (against softlimit)
|
||||
to: dba
|
||||
|
|
|
@ -10,6 +10,7 @@ component: Consul
|
|||
units: seconds
|
||||
warn: $this < 14*24*60*60
|
||||
crit: $this < 7*24*60*60
|
||||
summary: Consul license expiration on ${label:node_name}
|
||||
info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
|
||||
to: sysadmin
|
||||
|
||||
|
@ -23,7 +24,8 @@ component: Consul
|
|||
units: status
|
||||
warn: $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
|
||||
summary: Consul datacenter ${label:datacenter} health
|
||||
info: Datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name}
|
||||
to: sysadmin
|
||||
|
||||
template: consul_autopilot_server_health_status
|
||||
|
@ -36,7 +38,8 @@ component: Consul
|
|||
units: status
|
||||
warn: $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
|
||||
summary: Consul server ${label:node_name} health
|
||||
info: Server ${label:node_name} from datacenter ${label:datacenter} is unhealthy
|
||||
to: sysadmin
|
||||
|
||||
template: consul_raft_leader_last_contact_time
|
||||
|
@ -50,7 +53,8 @@ component: Consul
|
|||
warn: $this > (($status >= $WARNING) ? (150) : (200))
|
||||
crit: $this > (($status == $CRITICAL) ? (200) : (500))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
|
||||
summary: Consul leader server ${label:node_name} last contact time
|
||||
info: Median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes
|
||||
to: sysadmin
|
||||
|
||||
template: consul_raft_leadership_transitions
|
||||
|
@ -63,7 +67,8 @@ component: Consul
|
|||
units: transitions
|
||||
warn: $this > 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: there has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
|
||||
summary: Consul server ${label:node_name} leadership transitions
|
||||
info: There has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader
|
||||
to: sysadmin
|
||||
|
||||
template: consul_raft_thread_main_saturation
|
||||
|
@ -76,7 +81,8 @@ component: Consul
|
|||
units: percentage
|
||||
warn: $this > (($status >= $WARNING) ? (40) : (50))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
|
||||
summary: Consul server ${label:node_name} main Raft saturation
|
||||
info: Average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
|
||||
to: sysadmin
|
||||
|
||||
template: consul_raft_thread_fsm_saturation
|
||||
|
@ -89,7 +95,8 @@ component: Consul
|
|||
units: milliseconds
|
||||
warn: $this > (($status >= $WARNING) ? (40) : (50))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
|
||||
summary: Consul server ${label:node_name} FSM Raft saturation
|
||||
info: Average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter}
|
||||
to: sysadmin
|
||||
|
||||
template: consul_client_rpc_requests_exceeded
|
||||
|
@ -102,7 +109,8 @@ component: Consul
|
|||
units: requests
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
|
||||
summary: Consul server ${label:node_name} RPC requests rate
|
||||
info: Number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
|
||||
to: sysadmin
|
||||
|
||||
template: consul_client_rpc_requests_failed
|
||||
|
@ -115,6 +123,7 @@ component: Consul
|
|||
units: requests
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (5))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: Consul server ${label:node_name} failed RPC requests
|
||||
info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter}
|
||||
to: sysadmin
|
||||
|
||||
|
@ -128,7 +137,8 @@ component: Consul
|
|||
units: status
|
||||
warn: $this != nan AND $this != 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
|
||||
summary: Consul node health check ${label:check_name} on ${label:node_name}
|
||||
info: Node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
|
||||
to: sysadmin
|
||||
|
||||
template: consul_service_health_check_status
|
||||
|
@ -141,7 +151,8 @@ component: Consul
|
|||
units: status
|
||||
warn: $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
|
||||
summary: Consul service health check ${label:check_name} service ${label:service_name} node ${label:node_name}
|
||||
info: Service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter}
|
||||
to: sysadmin
|
||||
|
||||
template: consul_gc_pause_time
|
||||
|
@ -155,5 +166,6 @@ component: Consul
|
|||
warn: $this > (($status >= $WARNING) ? (1) : (2))
|
||||
crit: $this > (($status >= $WARNING) ? (2) : (5))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
|
||||
summary: Consul server ${label:node_name} garbage collection pauses
|
||||
info: Time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter}
|
||||
to: sysadmin
|
||||
|
|
|
@ -14,7 +14,7 @@ component: CPU
|
|||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: CPU utilization
|
||||
summary: System CPU utilization
|
||||
info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
|
||||
to: silent
|
||||
|
||||
|
@ -30,7 +30,7 @@ component: CPU
|
|||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (20) : (40))
|
||||
delay: up 30m down 30m multiplier 1.5 max 2h
|
||||
summary: CPU iowait time
|
||||
summary: System CPU iowait time
|
||||
info: Average CPU iowait time over the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -46,7 +46,7 @@ component: CPU
|
|||
every: 5m
|
||||
warn: $this > (($status >= $WARNING) ? (5) : (10))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
summary: CPU steal time
|
||||
summary: System CPU steal time
|
||||
info: Average CPU steal time over the last 20 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -64,6 +64,6 @@ component: CPU
|
|||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: CPU utilization
|
||||
summary: System CPU utilization
|
||||
info: Average CPU utilization over the last 10 minutes (excluding nice)
|
||||
to: silent
|
||||
|
|
|
@ -13,7 +13,8 @@ component: DB engine
|
|||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
|
||||
summary: Netdata DBengine filesystem errors
|
||||
info: Number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
|
||||
to: sysadmin
|
||||
|
||||
alarm: 10min_dbengine_global_io_errors
|
||||
|
@ -28,7 +29,8 @@ component: DB engine
|
|||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
|
||||
summary: Netdata DBengine IO errors
|
||||
info: Number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
|
||||
to: sysadmin
|
||||
|
||||
alarm: 10min_dbengine_global_flushing_warnings
|
||||
|
@ -43,6 +45,7 @@ component: DB engine
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
summary: Netdata DBengine global flushing warnings
|
||||
info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
|
||||
Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
|
||||
to: sysadmin
|
||||
|
@ -59,6 +62,7 @@ component: DB engine
|
|||
every: 10s
|
||||
crit: $this != 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
|
||||
summary: Netdata DBengine global flushing errors
|
||||
info: Number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
|
||||
Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
|
||||
to: sysadmin
|
||||
|
|
|
@ -81,7 +81,7 @@ template: out_of_disk_space_time
|
|||
warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
|
||||
crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
|
||||
delay: down 15m multiplier 1.2 max 1h
|
||||
summary: Out of disk space time for ${label:mount_point}
|
||||
summary: Disk ${label:mount_point} estimation of lack of space
|
||||
info: Estimated time the disk ${label:mount_point} will run out of space, if the system continues to add data with the rate of the last hour
|
||||
to: silent
|
||||
|
||||
|
@ -121,7 +121,7 @@ template: out_of_disk_inodes_time
|
|||
warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
|
||||
crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
|
||||
delay: down 15m multiplier 1.2 max 1h
|
||||
summary: Out of disk inodes time for ${label:mount_point}
|
||||
summary: Disk ${label:mount_point} estimation of lack of inodes
|
||||
info: Estimated time the disk ${label:mount_point} will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
|
||||
to: silent
|
||||
|
||||
|
|
|
@ -10,5 +10,6 @@ component: DNS
|
|||
every: 10s
|
||||
warn: $this != nan && $this != 1
|
||||
delay: up 30s down 5m multiplier 1.5 max 1h
|
||||
summary: DNS query unsuccessful requests to ${label:server}
|
||||
info: DNS request type ${label:record_type} to server ${label:server} is unsuccessful
|
||||
to: sysadmin
|
||||
|
|
|
@ -10,5 +10,6 @@ component: Dnsmasq
|
|||
calc: $used
|
||||
warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
|
||||
delay: down 5m
|
||||
info: DHCP range utilization
|
||||
summary: Dnsmasq DHCP range ${label:dhcp_range} utilization
|
||||
info: DHCP range ${label:dhcp_range} utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -12,7 +12,8 @@ component: Elasticsearch
|
|||
units: status
|
||||
crit: $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: cluster health status is red.
|
||||
summary: Elasticsearch cluster ${label:cluster_name} status
|
||||
info: Elasticsearch cluster ${label:cluster_name} health status is red.
|
||||
to: sysadmin
|
||||
|
||||
# the idea of '-10m' is to handle yellow status after node restart,
|
||||
|
@ -27,7 +28,8 @@ component: Elasticsearch
|
|||
units: status
|
||||
warn: $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: cluster health status is yellow.
|
||||
summary: Elasticsearch cluster ${label:cluster_name} status
|
||||
info: Elasticsearch cluster ${label:cluster_name} health status is yellow.
|
||||
to: sysadmin
|
||||
|
||||
template: elasticsearch_node_index_health_red
|
||||
|
@ -40,7 +42,8 @@ component: Elasticsearch
|
|||
units: status
|
||||
warn: $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: node index $label:index health status is red.
|
||||
summary: Elasticsearch cluster ${label:cluster_name} index ${label:index} status
|
||||
info: Elasticsearch cluster ${label:cluster_name} index ${label:index} health status is red.
|
||||
to: sysadmin
|
||||
|
||||
# don't convert 'lookup' value to seconds in 'calc' due to UI showing seconds as hh:mm:ss (0 as now).
|
||||
|
@ -55,7 +58,8 @@ component: Elasticsearch
|
|||
units: milliseconds
|
||||
warn: $this > (($status >= $WARNING) ? (20 * 1000) : (30 * 1000))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: search performance is degraded, queries run slowly.
|
||||
summary: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} query performance
|
||||
info: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} search performance is degraded, queries run slowly.
|
||||
to: sysadmin
|
||||
|
||||
template: elasticsearch_node_indices_search_time_fetch
|
||||
|
@ -69,5 +73,6 @@ component: Elasticsearch
|
|||
warn: $this > (($status >= $WARNING) ? (3 * 1000) : (5 * 1000))
|
||||
crit: $this > (($status == $CRITICAL) ? (5 * 1000) : (30 * 1000))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: search performance is degraded, fetches run slowly.
|
||||
summary: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} fetch performance
|
||||
info: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} search performance is degraded, fetches run slowly.
|
||||
to: sysadmin
|
||||
|
|
|
@ -15,5 +15,6 @@ component: Cryptography
|
|||
every: 5m
|
||||
warn: $this < (($status >= $WARNING) ? (200) : (100))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: minimum number of entries in the random numbers pool in the last 5 minutes
|
||||
summary: System entropy pool number of entries
|
||||
info: Minimum number of entries in the random numbers pool in the last 5 minutes
|
||||
to: silent
|
||||
|
|
|
@ -10,7 +10,8 @@ component: Exporting engine
|
|||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful buffering of exporting data
|
||||
summary: Netdata exporting data last successful buffering
|
||||
info: Number of seconds since the last successful buffering of exporting data
|
||||
to: dba
|
||||
|
||||
template: exporting_metrics_sent
|
||||
|
@ -23,5 +24,6 @@ component: Exporting engine
|
|||
every: 10s
|
||||
warn: $this != 100
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: percentage of metrics sent to the external database server
|
||||
summary: Netdata exporting metrics sent
|
||||
info: Percentage of metrics sent to the external database server
|
||||
to: dba
|
||||
|
|
|
@ -11,12 +11,12 @@
|
|||
every: 1m
|
||||
crit: $this > 90
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: System open files utilization
|
||||
summary: System open file descriptors utilization
|
||||
info: System-wide utilization of open files
|
||||
to: sysadmin
|
||||
|
||||
template: apps_group_file_descriptors_utilization
|
||||
on: apps.fd_limit
|
||||
on: app.fds_open_limit
|
||||
class: Utilization
|
||||
type: System
|
||||
component: Process
|
||||
|
@ -28,6 +28,6 @@ component: Process
|
|||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Group open files utilization
|
||||
summary: App group ${label:app_group} file descriptors utilization
|
||||
info: Open files percentage against the processes limits, among all PIDs in application group
|
||||
to: sysadmin
|
||||
|
|
|
@ -9,5 +9,6 @@ component: Gearman
|
|||
every: 10s
|
||||
warn: $this > 30000
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: average number of queued jobs over the last 10 minutes
|
||||
summary: Gearman queued jobs
|
||||
info: Average number of queued jobs over the last 10 minutes
|
||||
to: sysadmin
|
||||
|
|
|
@ -13,5 +13,6 @@ component: go.d.plugin
|
|||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
summary: Go.d plugin last collection
|
||||
info: Number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
|
|
@ -7,7 +7,8 @@ component: HAProxy
|
|||
every: 10s
|
||||
lookup: average -10s
|
||||
crit: $this > 0
|
||||
info: average number of failed haproxy backend servers over the last 10 seconds
|
||||
summary: HAProxy server status
|
||||
info: Average number of failed haproxy backend servers over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
||||
template: haproxy_backend_status
|
||||
|
@ -19,5 +20,6 @@ component: HAProxy
|
|||
every: 10s
|
||||
lookup: average -10s
|
||||
crit: $this > 0
|
||||
info: average number of failed haproxy backends over the last 10 seconds
|
||||
summary: HAProxy backend status
|
||||
info: Average number of failed haproxy backends over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
|
|
@ -12,6 +12,7 @@ component: HDFS
|
|||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (80) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: HDFS datanodes space utilization
|
||||
info: summary datanodes space capacity utilization
|
||||
to: sysadmin
|
||||
|
||||
|
@ -28,6 +29,7 @@ component: HDFS
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: HDFS missing blocks
|
||||
info: number of missing blocks
|
||||
to: sysadmin
|
||||
|
||||
|
@ -42,6 +44,7 @@ component: HDFS
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: HDFS stale datanodes
|
||||
info: number of datanodes marked stale due to delayed heartbeat
|
||||
to: sysadmin
|
||||
|
||||
|
@ -56,6 +59,7 @@ component: HDFS
|
|||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: HDFS dead datanodes
|
||||
info: number of datanodes which are currently dead
|
||||
to: sysadmin
|
||||
|
||||
|
@ -72,5 +76,6 @@ component: HDFS
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: HDFS failed volumes
|
||||
info: number of failed volumes
|
||||
to: sysadmin
|
||||
|
|
|
@ -9,7 +9,7 @@ component: HTTP endpoint
|
|||
calc: ($this < 75) ? (0) : ($this)
|
||||
every: 5s
|
||||
units: up/down
|
||||
info: HTTP endpoint ${label:url} liveness status
|
||||
info: HTTP check endpoint ${label:url} liveness status
|
||||
to: silent
|
||||
|
||||
template: httpcheck_web_service_bad_content
|
||||
|
|
|
@ -9,5 +9,6 @@ component: Disk
|
|||
green: 10000
|
||||
warn: $this > $green
|
||||
delay: down 30m multiplier 1.5 max 2h
|
||||
info: average I/O latency over the last 10 seconds
|
||||
summary: IO ping latency
|
||||
info: Average I/O latency over the last 10 seconds
|
||||
to: silent
|
||||
|
|
|
@ -10,5 +10,6 @@ component: IPFS
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: IPFS datastore utilization
|
||||
info: IPFS datastore utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -23,6 +23,6 @@ component: IPMI
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 5m down 15m multiplier 1.5 max 1h
|
||||
summary: IPMI events
|
||||
summary: IPMI entries in System Event Log
|
||||
info: Number of events in the IPMI System Event Log (SEL)
|
||||
to: silent
|
||||
|
|
|
@ -14,7 +14,8 @@ component: Kubelet
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: the node is experiencing a configuration-related error (0: false, 1: true)
|
||||
summary: Kubelet node config error
|
||||
info: The node is experiencing a configuration-related error (0: false, 1: true)
|
||||
to: sysadmin
|
||||
|
||||
# Failed Token() requests to the alternate token source
|
||||
|
@ -29,7 +30,8 @@ component: Kubelet
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: number of failed Token() requests to the alternate token source
|
||||
summary: Kubelet failed token requests
|
||||
info: Number of failed Token() requests to the alternate token source
|
||||
to: sysadmin
|
||||
|
||||
# Docker and runtime operation errors
|
||||
|
@ -44,7 +46,8 @@ component: Kubelet
|
|||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (20))
|
||||
delay: up 30s down 1m multiplier 1.5 max 2h
|
||||
info: number of Docker or runtime operation errors
|
||||
summary: Kubelet runtime errors
|
||||
info: Number of Docker or runtime operation errors
|
||||
to: sysadmin
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
@ -84,7 +87,8 @@ component: Kubelet
|
|||
warn: $this > (($status >= $WARNING)?(100):(200))
|
||||
crit: $this > (($status >= $WARNING)?(200):(400))
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
summary: Kubelet relisting latency (quantile 0.5)
|
||||
info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
compared to the last minute (quantile 0.5)
|
||||
to: sysadmin
|
||||
|
||||
|
@ -112,7 +116,8 @@ component: Kubelet
|
|||
warn: $this > (($status >= $WARNING)?(200):(400))
|
||||
crit: $this > (($status >= $WARNING)?(400):(800))
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
summary: Kubelet relisting latency (quantile 0.9)
|
||||
info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
compared to the last minute (quantile 0.9)
|
||||
to: sysadmin
|
||||
|
||||
|
@ -140,6 +145,7 @@ component: Kubelet
|
|||
warn: $this > (($status >= $WARNING)?(400):(800))
|
||||
crit: $this > (($status >= $WARNING)?(800):(1200))
|
||||
delay: down 1m multiplier 1.5 max 2h
|
||||
info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
summary: Kubelet relisting latency (quantile 0.99)
|
||||
info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
|
||||
compared to the last minute (quantile 0.99)
|
||||
to: sysadmin
|
||||
|
|
|
@ -33,7 +33,7 @@ component: Load
|
|||
every: 1m
|
||||
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Load average (15 minutes)
|
||||
summary: Host load average (15 minutes)
|
||||
info: System load average for the past 15 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -50,7 +50,7 @@ component: Load
|
|||
every: 1m
|
||||
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Load average (5 minutes)
|
||||
summary: System load average (5 minutes)
|
||||
info: System load average for the past 5 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -67,6 +67,6 @@ component: Load
|
|||
every: 1m
|
||||
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Load average (1 minute)
|
||||
summary: System load average (1 minute)
|
||||
info: System load average for the past 1 minute
|
||||
to: silent
|
||||
|
|
|
@ -8,7 +8,7 @@ component: RAID
|
|||
every: 10s
|
||||
calc: $down
|
||||
warn: $this > 0
|
||||
summary: Mdtat device ${label:device} down
|
||||
summary: MD array device ${label:device} down
|
||||
info: Number of devices in the down state for the ${label:device} ${label:raid_level} array. \
|
||||
Any number > 0 indicates that the array is degraded.
|
||||
to: sysadmin
|
||||
|
@ -24,7 +24,7 @@ chart labels: raid_level=!raid1 !raid10 *
|
|||
every: 60s
|
||||
warn: $this > 1024
|
||||
delay: up 30m
|
||||
summary: Mdstat device ${label:device} unsynchronized blocks
|
||||
summary: MD array device ${label:device} unsynchronized blocks
|
||||
info: Number of unsynchronized blocks for the ${label:device} ${label:raid_level} array
|
||||
to: silent
|
||||
|
||||
|
@ -38,6 +38,6 @@ component: RAID
|
|||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
summary: Mdstat last collected
|
||||
summary: MD array last collected
|
||||
info: Number of seconds since the last successful data collection
|
||||
to: sysadmin
|
||||
|
|
|
@ -11,7 +11,8 @@ component: RAID
|
|||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 5m multiplier 2 max 10m
|
||||
info: adapter is in the degraded state (0: false, 1: true)
|
||||
summary: MegaCLI adapter state
|
||||
info: Adapter is in the degraded state (0: false, 1: true)
|
||||
to: sysadmin
|
||||
|
||||
## Physical Disks
|
||||
|
@ -26,7 +27,8 @@ component: RAID
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 1m down 5m multiplier 2 max 10m
|
||||
info: number of physical drive predictive failures
|
||||
summary: MegaCLI physical drive predictive failures
|
||||
info: Number of physical drive predictive failures
|
||||
to: sysadmin
|
||||
|
||||
template: megacli_pd_media_errors
|
||||
|
@ -39,7 +41,8 @@ component: RAID
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 1m down 5m multiplier 2 max 10m
|
||||
info: number of physical drive media errors
|
||||
summary: MegaCLI physical drive errors
|
||||
info: Number of physical drive media errors
|
||||
to: sysadmin
|
||||
|
||||
## Battery Backup Units (BBU)
|
||||
|
@ -54,7 +57,8 @@ component: RAID
|
|||
every: 10s
|
||||
warn: $this <= (($status >= $WARNING) ? (85) : (80))
|
||||
crit: $this <= (($status == $CRITICAL) ? (50) : (40))
|
||||
info: average battery backup unit (BBU) relative state of charge over the last 10 seconds
|
||||
summary: MegaCLI BBU charge state
|
||||
info: Average battery backup unit (BBU) relative state of charge over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
||||
template: megacli_bbu_cycle_count
|
||||
|
@ -67,5 +71,6 @@ component: RAID
|
|||
every: 10s
|
||||
warn: $this >= 100
|
||||
crit: $this >= 500
|
||||
info: average battery backup unit (BBU) charge cycles count over the last 10 seconds
|
||||
summary: MegaCLI BBU cycles count
|
||||
info: Average battery backup unit (BBU) charge cycles count over the last 10 seconds
|
||||
to: sysadmin
|
||||
|
|
|
@ -12,7 +12,8 @@ component: Memcached
|
|||
warn: $this > (($status >= $WARNING) ? (70) : (80))
|
||||
crit: $this > (($status == $CRITICAL) ? (80) : (90))
|
||||
delay: up 0 down 15m multiplier 1.5 max 1h
|
||||
info: cache memory utilization
|
||||
summary: Memcached memory utilization
|
||||
info: Cache memory utilization
|
||||
to: dba
|
||||
|
||||
|
||||
|
@ -27,7 +28,7 @@ component: Memcached
|
|||
calc: ($this - $available) / (($now - $after) / 3600)
|
||||
units: KB/hour
|
||||
every: 1m
|
||||
info: average rate the cache fills up (positive), or frees up (negative) space over the last hour
|
||||
info: Average rate the cache fills up (positive), or frees up (negative) space over the last hour
|
||||
|
||||
|
||||
# find the hours remaining until memcached cache is full
|
||||
|
@ -43,6 +44,7 @@ component: Memcached
|
|||
warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
|
||||
crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: estimated time the cache will run out of space \
|
||||
summary: Memcached estimation of lack of cache space
|
||||
info: Estimated time the cache will run out of space \
|
||||
if the system continues to add data at the same rate as the past hour
|
||||
to: dba
|
||||
|
|
|
@ -12,7 +12,8 @@ component: Memory
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 1h
|
||||
info: amount of memory corrupted due to a hardware failure
|
||||
summary: System corrupted memory
|
||||
info: Amount of memory corrupted due to a hardware failure
|
||||
to: sysadmin
|
||||
|
||||
## ECC Controller
|
||||
|
@ -29,7 +30,8 @@ component: Memory
|
|||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 1h
|
||||
info: memory controller ${label:controller} ECC correctable errors in the last 10 minutes
|
||||
summary: System ECC memory ${label:controller} correctable errors
|
||||
info: Memory controller ${label:controller} ECC correctable errors in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: ecc_memory_mc_uncorrectable
|
||||
|
@ -44,7 +46,8 @@ component: Memory
|
|||
every: 1m
|
||||
crit: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 1h
|
||||
info: memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes
|
||||
summary: System ECC memory ${label:controller} uncorrectable errors
|
||||
info: Memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
## ECC DIMM
|
||||
|
@ -61,6 +64,7 @@ component: Memory
|
|||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 1h
|
||||
summary: System ECC memory DIMM ${label:dimm} correctable errors
|
||||
info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
||||
|
@ -76,5 +80,6 @@ component: Memory
|
|||
every: 1m
|
||||
crit: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 1h
|
||||
summary: System ECC memory DIMM ${label:dimm} uncorrectable errors
|
||||
info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
|
||||
to: sysadmin
|
||||
|
|
|
@ -20,7 +20,8 @@ component: ML
|
|||
units: %
|
||||
every: 30s
|
||||
warn: $this > 1
|
||||
info: rolling 1min node level anomaly rate
|
||||
summary: ML node anomaly rate
|
||||
info: Rolling 1min node level anomaly rate
|
||||
to: silent
|
||||
|
||||
# alert per dimension example
|
||||
|
|
|
@ -12,7 +12,8 @@ component: MySQL
|
|||
warn: $this > (($status >= $WARNING) ? (5) : (10))
|
||||
crit: $this > (($status == $CRITICAL) ? (10) : (20))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of slow queries in the last 10 seconds
|
||||
summary: MySQL slow queries
|
||||
info: Number of slow queries in the last 10 seconds
|
||||
to: dba
|
||||
|
||||
|
||||
|
@ -27,7 +28,8 @@ component: MySQL
|
|||
lookup: sum -10s absolute of immediate
|
||||
units: immediate locks
|
||||
every: 10s
|
||||
info: number of table immediate locks in the last 10 seconds
|
||||
summary: MySQL table immediate locks
|
||||
info: Number of table immediate locks in the last 10 seconds
|
||||
to: dba
|
||||
|
||||
template: mysql_10s_table_locks_waited
|
||||
|
@ -38,7 +40,8 @@ component: MySQL
|
|||
lookup: sum -10s absolute of waited
|
||||
units: waited locks
|
||||
every: 10s
|
||||
info: number of table waited locks in the last 10 seconds
|
||||
summary: MySQL table waited locks
|
||||
info: Number of table waited locks in the last 10 seconds
|
||||
to: dba
|
||||
|
||||
template: mysql_10s_waited_locks_ratio
|
||||
|
@ -52,7 +55,8 @@ component: MySQL
|
|||
warn: $this > (($status >= $WARNING) ? (10) : (25))
|
||||
crit: $this > (($status == $CRITICAL) ? (25) : (50))
|
||||
delay: down 30m multiplier 1.5 max 1h
|
||||
info: ratio of waited table locks over the last 10 seconds
|
||||
summary: MySQL waited table locks ratio
|
||||
info: Ratio of waited table locks over the last 10 seconds
|
||||
to: dba
|
||||
|
||||
|
||||
|
@ -70,7 +74,8 @@ component: MySQL
|
|||
warn: $this > (($status >= $WARNING) ? (60) : (70))
|
||||
crit: $this > (($status == $CRITICAL) ? (80) : (90))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: client connections utilization
|
||||
summary: MySQL connections utilization
|
||||
info: Client connections utilization
|
||||
to: dba
|
||||
|
||||
|
||||
|
@ -87,7 +92,8 @@ component: MySQL
|
|||
every: 10s
|
||||
crit: $this == 0
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: replication status (0: stopped, 1: working)
|
||||
summary: MySQL replication status
|
||||
info: Replication status (0: stopped, 1: working)
|
||||
to: dba
|
||||
|
||||
template: mysql_replication_lag
|
||||
|
@ -101,7 +107,8 @@ component: MySQL
|
|||
warn: $this > (($status >= $WARNING) ? (5) : (10))
|
||||
crit: $this > (($status == $CRITICAL) ? (10) : (30))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: difference between the timestamp of the latest transaction processed by the SQL thread and \
|
||||
summary: MySQL replication lag
|
||||
info: Difference between the timestamp of the latest transaction processed by the SQL thread and \
|
||||
the timestamp of the same transaction when it was processed on the master
|
||||
to: dba
|
||||
|
||||
|
@ -131,7 +138,8 @@ component: MySQL
|
|||
warn: $this > $mysql_galera_cluster_size_max_2m
|
||||
crit: $this < $mysql_galera_cluster_size_max_2m
|
||||
delay: up 20s down 5m multiplier 1.5 max 1h
|
||||
info: current galera cluster size, compared to the maximum size in the last 2 minutes
|
||||
summary: MySQL galera cluster size
|
||||
info: Current galera cluster size, compared to the maximum size in the last 2 minutes
|
||||
to: dba
|
||||
|
||||
# galera node state
|
||||
|
@ -145,7 +153,8 @@ component: MySQL
|
|||
every: 10s
|
||||
warn: $this != nan AND $this != 0
|
||||
delay: up 30s down 5m multiplier 1.5 max 1h
|
||||
info: galera node state is either Donor/Desynced or Joined.
|
||||
summary: MySQL galera node state
|
||||
info: Galera node state is either Donor/Desynced or Joined.
|
||||
to: dba
|
||||
|
||||
template: mysql_galera_cluster_state_crit
|
||||
|
@ -157,7 +166,8 @@ component: MySQL
|
|||
every: 10s
|
||||
crit: $this != nan AND $this != 0
|
||||
delay: up 30s down 5m multiplier 1.5 max 1h
|
||||
info: galera node state is either Undefined or Joining or Error.
|
||||
summary: MySQL galera node state
|
||||
info: Galera node state is either Undefined or Joining or Error.
|
||||
to: dba
|
||||
|
||||
# galera node status
|
||||
|
@ -171,6 +181,7 @@ component: MySQL
|
|||
every: 10s
|
||||
crit: $this != nan AND $this != 1
|
||||
delay: up 30s down 5m multiplier 1.5 max 1h
|
||||
info: galera node is part of a nonoperational component. \
|
||||
summary: MySQL galera cluster status
|
||||
info: Galera node is part of a nonoperational component. \
|
||||
This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations.
|
||||
to: dba
|
||||
|
|
|
@ -29,7 +29,7 @@ component: Network
|
|||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (85) : (90))
|
||||
delay: up 1m down 1m multiplier 1.5 max 1h
|
||||
summary: 1 minute received traffic overflow for ${label:device}
|
||||
summary: System network interface ${label:device} inbound utilization
|
||||
info: Average inbound utilization for the network interface ${label:device} over the last minute
|
||||
to: silent
|
||||
|
||||
|
@ -46,7 +46,7 @@ component: Network
|
|||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (85) : (90))
|
||||
delay: up 1m down 1m multiplier 1.5 max 1h
|
||||
summary: 1 minute sent traffic overflow for ${label:device}
|
||||
summary: System network interface ${label:device} outbound utilization
|
||||
info: Average outbound utilization for the network interface ${label:device} over the last minute
|
||||
to: silent
|
||||
|
||||
|
@ -98,7 +98,7 @@ chart labels: device=!wl* *
|
|||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
summary: Inbound packets dropped ratio for ${label:device}
|
||||
summary: System network interface ${label:device} inbound drops
|
||||
info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -116,7 +116,7 @@ chart labels: device=!wl* *
|
|||
every: 1m
|
||||
warn: $this >= 2
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
summary: Outbound packets dropped ratio for ${label:device}
|
||||
summary: System network interface ${label:device} outbound drops
|
||||
info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -134,7 +134,7 @@ chart labels: device=wl*
|
|||
every: 1m
|
||||
warn: $this >= 10
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
summary: Inbound packets dropped ratio for ${label:device}
|
||||
summary: System network interface ${label:device} inbound drops ratio
|
||||
info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -152,7 +152,7 @@ chart labels: device=wl*
|
|||
every: 1m
|
||||
warn: $this >= 10
|
||||
delay: up 1m down 1h multiplier 1.5 max 2h
|
||||
summary: Outbound packets dropped ratio for ${label:device}
|
||||
summary: System network interface ${label:device} outbound drops ratio
|
||||
info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -171,7 +171,7 @@ component: Network
|
|||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
summary: Inbound interface errors for ${label:device}
|
||||
summary: System network interface ${label:device} inbound errors
|
||||
info: Number of inbound errors for the network interface ${label:device} in the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -187,7 +187,7 @@ component: Network
|
|||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
summary: Outbound interface errors for ${label:device}
|
||||
summary: System network interface ${label:device} outbound errors
|
||||
info: Number of outbound errors for the network interface ${label:device} in the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -211,7 +211,7 @@ component: Network
|
|||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
summary: Net FIFO errors for ${label:device}
|
||||
summary: System network interface ${label:device} FIFO errors
|
||||
info: Number of FIFO errors for the network interface ${label:device} in the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -250,7 +250,7 @@ component: Network
|
|||
warn: $this > (($status >= $WARNING)?(200):(5000))
|
||||
crit: $this > (($status == $CRITICAL)?(5000):(6000))
|
||||
options: no-clear-notification
|
||||
summary: Received packets storm for ${label:device}
|
||||
summary: System network interface ${label:device} inbound packet storm
|
||||
info: Ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
|
||||
compared to the rate over the last minute
|
||||
to: silent
|
||||
|
|
|
@ -15,5 +15,6 @@ component: Network
|
|||
warn: $this > (($status >= $WARNING) ? (85) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (95))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: netfilter connection tracker table size utilization
|
||||
summary: System Netfilter connection tracker utilization
|
||||
info: Netfilter connection tracker table size utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -14,7 +14,7 @@ component: UPS
|
|||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 10m multiplier 1.5 max 1h
|
||||
summary: UPS load
|
||||
info: Average UPS load over the last 10 minutes
|
||||
info: UPS average load over the last 10 minutes
|
||||
to: sitemgr
|
||||
|
||||
template: nut_ups_charge
|
||||
|
@ -30,8 +30,8 @@ component: UPS
|
|||
warn: $this < 75
|
||||
crit: $this < 40
|
||||
delay: down 10m multiplier 1.5 max 1h
|
||||
summary: UPS charge
|
||||
info: Average UPS charge over the last minute
|
||||
summary: UPS battery charge
|
||||
info: UPS average battery charge over the last minute
|
||||
to: sitemgr
|
||||
|
||||
template: nut_last_collected_secs
|
||||
|
@ -46,5 +46,5 @@ component: UPS device
|
|||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: NUT last collected
|
||||
info: number of seconds since the last successful data collection
|
||||
info: Number of seconds since the last successful data collection
|
||||
to: sitemgr
|
||||
|
|
|
@ -11,6 +11,7 @@ component: Pi-hole
|
|||
units: seconds
|
||||
calc: $ago
|
||||
warn: $this > 60 * 60 * 24 * 30
|
||||
summary: Pi-hole blocklist last update
|
||||
info: gravity.list (blocklist) file last update time
|
||||
to: sysadmin
|
||||
|
||||
|
@ -27,5 +28,6 @@ component: Pi-hole
|
|||
calc: $disabled
|
||||
warn: $this != nan AND $this == 1
|
||||
delay: up 2m down 5m
|
||||
info: unwanted domains blocking is disabled
|
||||
summary: Pi-hole domains blocking status
|
||||
info: Unwanted domains blocking is disabled
|
||||
to: sysadmin
|
||||
|
|
|
@ -7,5 +7,6 @@
|
|||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? ($update_every) : (20 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: Plugin ${label:_collect_plugin} availability status
|
||||
info: The amount of time that ${label:_collect_plugin} did not report its availability status
|
||||
to: sysadmin
|
||||
|
|
|
@ -9,6 +9,7 @@ component: TCP endpoint
|
|||
calc: ($this < 75) ? (0) : ($this)
|
||||
every: 5s
|
||||
units: up/down
|
||||
summary: Portcheck status for ${label:host}:${label:port}
|
||||
info: TCP host ${label:host} port ${label:port} liveness status
|
||||
to: silent
|
||||
|
||||
|
@ -23,7 +24,8 @@ component: TCP endpoint
|
|||
warn: $this >= 10 AND $this < 40
|
||||
crit: $this >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: percentage of timed-out TCP connections to host ${label:host} port ${label:port} in the last 5 minutes
|
||||
summary: Portcheck timeouts for ${label:host}:${label:port}
|
||||
info: Percentage of timed-out TCP connections to host ${label:host} port ${label:port} in the last 5 minutes
|
||||
to: sysadmin
|
||||
|
||||
template: portcheck_connection_fails
|
||||
|
@ -37,5 +39,6 @@ component: TCP endpoint
|
|||
warn: $this >= 10 AND $this < 40
|
||||
crit: $this >= 40
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: percentage of failed TCP connections to host ${label:host} port ${label:port} in the last 5 minutes
|
||||
summary: Portcheck fails for ${label:host}:${label:port}
|
||||
info: Percentage of failed TCP connections to host ${label:host} port ${label:port} in the last 5 minutes
|
||||
to: sysadmin
|
||||
|
|
|
@ -110,7 +110,7 @@ component: PostgreSQL
|
|||
warn: $this < (($status >= $WARNING) ? (70) : (60))
|
||||
crit: $this < (($status == $CRITICAL) ? (60) : (50))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: PostgreSQL table ${label:table} cache hit ratio
|
||||
summary: PostgreSQL table ${label:table} db ${label:database} cache hit ratio
|
||||
info: Average cache hit ratio in db ${label:database} table ${label:table} over the last minute
|
||||
to: dba
|
||||
|
||||
|
@ -127,7 +127,7 @@ component: PostgreSQL
|
|||
warn: $this < (($status >= $WARNING) ? (70) : (60))
|
||||
crit: $this < (($status == $CRITICAL) ? (60) : (50))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: PostgreSQL table ${label:table} index cache hit ratio
|
||||
summary: PostgreSQL table ${label:table} db ${label:database} index cache hit ratio
|
||||
info: Average index cache hit ratio in db ${label:database} table ${label:table} over the last minute
|
||||
to: dba
|
||||
|
||||
|
@ -144,7 +144,7 @@ component: PostgreSQL
|
|||
warn: $this < (($status >= $WARNING) ? (70) : (60))
|
||||
crit: $this < (($status == $CRITICAL) ? (60) : (50))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: PostgreSQL table ${label:table} toast cache hit ratio
|
||||
summary: PostgreSQL table ${label:table} db ${label:database} toast cache hit ratio
|
||||
info: Average TOAST hit ratio in db ${label:database} table ${label:table} over the last minute
|
||||
to: dba
|
||||
|
||||
|
@ -161,7 +161,7 @@ component: PostgreSQL
|
|||
warn: $this < (($status >= $WARNING) ? (70) : (60))
|
||||
crit: $this < (($status == $CRITICAL) ? (60) : (50))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: PostgreSQL table ${label:table} index toast hit ratio
|
||||
summary: PostgreSQL table ${label:table} db ${label:database} index toast hit ratio
|
||||
info: Average index TOAST hit ratio in db ${label:database} table ${label:table} over the last minute
|
||||
to: dba
|
||||
|
||||
|
@ -177,7 +177,7 @@ component: PostgreSQL
|
|||
warn: $this > (($status >= $WARNING) ? (60) : (70))
|
||||
crit: $this > (($status == $CRITICAL) ? (70) : (80))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: PostgreSQL table ${label:table} bloat size
|
||||
summary: PostgreSQL table ${label:table} db ${label:database} bloat size
|
||||
info: Bloat size percentage in db ${label:database} table ${label:table}
|
||||
to: dba
|
||||
|
||||
|
@ -191,7 +191,7 @@ component: PostgreSQL
|
|||
units: seconds
|
||||
every: 1m
|
||||
warn: $this != nan AND $this > (60 * 60 * 24 * 7)
|
||||
summary: PostgreSQL table ${label:table} last autovacuum
|
||||
summary: PostgreSQL table ${label:table} db ${label:database} last autovacuum
|
||||
info: Time elapsed since db ${label:database} table ${label:table} was vacuumed by the autovacuum daemon
|
||||
to: dba
|
||||
|
||||
|
@ -205,7 +205,7 @@ component: PostgreSQL
|
|||
units: seconds
|
||||
every: 1m
|
||||
warn: $this != nan AND $this > (60 * 60 * 24 * 7)
|
||||
summary: PostgreSQL table ${label:table} last autoanalyze
|
||||
summary: PostgreSQL table ${label:table} db ${label:database} last autoanalyze
|
||||
info: Time elapsed since db ${label:database} table ${label:table} was analyzed by the autovacuum daemon
|
||||
to: dba
|
||||
|
||||
|
@ -223,6 +223,6 @@ component: PostgreSQL
|
|||
warn: $this > (($status >= $WARNING) ? (60) : (70))
|
||||
crit: $this > (($status == $CRITICAL) ? (70) : (80))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: PostgreSQL table ${label:table} index bloat size
|
||||
summary: PostgreSQL table ${label:table} db ${label:database} index bloat size
|
||||
info: Bloat size percentage in db ${label:database} table ${label:table} index ${label:index}
|
||||
to: dba
|
||||
|
|
|
@ -12,5 +12,6 @@ component: Processes
|
|||
warn: $this > (($status >= $WARNING) ? (85) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (95))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: system process IDs (PID) space utilization
|
||||
summary: System PIDs utilization
|
||||
info: System process IDs (PID) space utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -13,5 +13,6 @@ component: python.d.plugin
|
|||
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
|
||||
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
info: number of seconds since the last successful data collection
|
||||
summary: Python.d plugin last collection
|
||||
info: Number of seconds since the last successful data collection
|
||||
to: webmaster
|
||||
|
|
|
@ -14,7 +14,7 @@ component: Memory
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Ram utilization
|
||||
summary: System memory utilization
|
||||
info: System memory utilization
|
||||
to: sysadmin
|
||||
|
||||
|
@ -30,7 +30,7 @@ component: Memory
|
|||
every: 10s
|
||||
warn: $this < (($status >= $WARNING) ? (15) : (10))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Available Ram
|
||||
summary: System available memory
|
||||
info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping
|
||||
to: silent
|
||||
|
||||
|
@ -43,7 +43,7 @@ component: Memory
|
|||
every: 5m
|
||||
warn: $this > 0
|
||||
delay: down 10m
|
||||
summary: OOM kills
|
||||
summary: System OOM kills
|
||||
info: Number of out of memory kills in the last 30 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -61,7 +61,7 @@ component: Memory
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Ram utilization
|
||||
summary: System memory utilization
|
||||
info: System memory utilization
|
||||
to: sysadmin
|
||||
|
||||
|
@ -77,6 +77,6 @@ component: Memory
|
|||
every: 10s
|
||||
warn: $this < (($status >= $WARNING) ? (15) : (10))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Available Ram
|
||||
summary: System available memory
|
||||
info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping
|
||||
to: silent
|
||||
|
|
|
@ -12,5 +12,6 @@ component: Retroshare
|
|||
warn: $this < (($status >= $WARNING) ? (120) : (100))
|
||||
crit: $this < (($status == $CRITICAL) ? (10) : (1))
|
||||
delay: up 0 down 15m multiplier 1.5 max 1h
|
||||
info: number of DHT peers
|
||||
summary: Retroshare DHT peers
|
||||
info: Number of DHT peers
|
||||
to: sysadmin
|
||||
|
|
|
@ -9,7 +9,8 @@ component: Riak KV
|
|||
units: state machines
|
||||
every: 10s
|
||||
warn: $list_fsm_active > 0
|
||||
info: number of currently running list keys finite state machines
|
||||
summary: Riak KV active list keys
|
||||
info: Number of currently running list keys finite state machines
|
||||
to: dba
|
||||
|
||||
|
||||
|
@ -38,7 +39,8 @@ component: Riak KV
|
|||
every: 10s
|
||||
warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
|
||||
crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
|
||||
info: average time between reception of client GET request and \
|
||||
summary: Riak KV GET latency
|
||||
info: Average time between reception of client GET request and \
|
||||
subsequent response to the client over the last 3 minutes, \
|
||||
compared to the average over the last hour
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
|
@ -54,7 +56,8 @@ component: Riak KV
|
|||
lookup: average -1h unaligned of time
|
||||
every: 30s
|
||||
units: ms
|
||||
info: average time between reception of client PUT request and \
|
||||
summary: Riak KV PUT mean latency
|
||||
info: Average time between reception of client PUT request and \
|
||||
subsequent response to the client over the last hour
|
||||
|
||||
template: riakkv_kv_put_slow
|
||||
|
@ -68,7 +71,8 @@ component: Riak KV
|
|||
every: 10s
|
||||
warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
|
||||
crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
|
||||
info: average time between reception of client PUT request and \
|
||||
summary: Riak KV PUT latency
|
||||
info: Average time between reception of client PUT request and \
|
||||
subsequent response to the client over the last 3 minutes, \
|
||||
compared to the average over the last hour
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
|
@ -89,5 +93,6 @@ component: Riak KV
|
|||
every: 10s
|
||||
warn: $this > 10000
|
||||
crit: $this > 100000
|
||||
info: number of processes running in the Erlang VM
|
||||
summary: Riak KV number of processes
|
||||
info: Number of processes running in the Erlang VM
|
||||
to: dba
|
||||
|
|
|
@ -12,7 +12,8 @@ component: ScaleIO
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (90))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: storage pool capacity utilization
|
||||
summary: ScaleIO storage pool capacity utilization
|
||||
info: Storage pool capacity utilization
|
||||
to: sysadmin
|
||||
|
||||
|
||||
|
@ -27,5 +28,6 @@ component: ScaleIO
|
|||
every: 10s
|
||||
warn: $this != 1
|
||||
delay: up 30s down 5m multiplier 1.5 max 1h
|
||||
summary: ScaleIO SDC-MDM connection state
|
||||
info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected)
|
||||
to: sysadmin
|
||||
|
|
|
@ -15,7 +15,8 @@ component: Network
|
|||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: average number of dropped packets in the last minute \
|
||||
summary: System netdev dropped packets
|
||||
info: Average number of dropped packets in the last minute \
|
||||
due to exceeded net.core.netdev_max_backlog
|
||||
to: silent
|
||||
|
||||
|
@ -31,7 +32,8 @@ component: Network
|
|||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
|
||||
summary: System netdev budget run outs
|
||||
info: Average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
|
||||
net.core.netdev_budget_usecs with work remaining over the last minute \
|
||||
(this can be a cause for dropped packets)
|
||||
to: silent
|
||||
|
@ -48,7 +50,8 @@ component: Network
|
|||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: average number of drops in the last minute \
|
||||
summary: System netisr drops
|
||||
info: Average number of drops in the last minute \
|
||||
due to exceeded sysctl net.route.netisr_maxqlen \
|
||||
(this can be a cause for dropped packets)
|
||||
to: silent
|
||||
|
|
|
@ -15,7 +15,7 @@ component: Memory
|
|||
every: 1m
|
||||
warn: $this > (($status >= $WARNING) ? (20) : (30))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: Ram swapped out
|
||||
summary: System memory swapped out
|
||||
info: Percentage of the system RAM swapped in the last 30 minutes
|
||||
to: silent
|
||||
|
||||
|
@ -32,6 +32,6 @@ component: Memory
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: up 30s down 15m multiplier 1.5 max 1h
|
||||
summary: Swap utilization
|
||||
summary: System swap memory utilization
|
||||
info: Swap memory utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -6,7 +6,8 @@
|
|||
every: 1m
|
||||
warn: $this > 6
|
||||
delay: up 1m down 10m multiplier 1.5 max 1h
|
||||
info: number of sync() system calls. \
|
||||
summary: Sync system call frequency
|
||||
info: Number of sync() system calls. \
|
||||
Every call causes all pending modifications to filesystem metadata and \
|
||||
cached file data to be written to the underlying filesystems.
|
||||
to: silent
|
||||
|
|
|
@ -12,6 +12,7 @@ component: Systemd units
|
|||
every: 10s
|
||||
warn: $this != nan AND $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: systemd unit ${label:unit_name} state
|
||||
info: systemd service unit in the failed state
|
||||
to: sysadmin
|
||||
|
||||
|
@ -27,6 +28,7 @@ component: Systemd units
|
|||
every: 10s
|
||||
warn: $this != nan AND $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: systemd unit ${label:unit_name} state
|
||||
info: systemd socket unit in the failed state
|
||||
to: sysadmin
|
||||
|
||||
|
@ -42,6 +44,7 @@ component: Systemd units
|
|||
every: 10s
|
||||
warn: $this != nan AND $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: systemd unit ${label:unit_name} state
|
||||
info: systemd target unit in the failed state
|
||||
to: sysadmin
|
||||
|
||||
|
@ -57,6 +60,7 @@ component: Systemd units
|
|||
every: 10s
|
||||
warn: $this != nan AND $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: systemd unit ${label:unit_name} state
|
||||
info: systemd path unit in the failed state
|
||||
to: sysadmin
|
||||
|
||||
|
@ -72,6 +76,7 @@ component: Systemd units
|
|||
every: 10s
|
||||
warn: $this != nan AND $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: systemd unit ${label:unit_name} state
|
||||
info: systemd device unit in the failed state
|
||||
to: sysadmin
|
||||
|
||||
|
@ -87,6 +92,7 @@ component: Systemd units
|
|||
every: 10s
|
||||
warn: $this != nan AND $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: systemd unit ${label:unit_name} state
|
||||
info: systemd mount units in the failed state
|
||||
to: sysadmin
|
||||
|
||||
|
@ -102,6 +108,7 @@ component: Systemd units
|
|||
every: 10s
|
||||
warn: $this != nan AND $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: systemd unit ${label:unit_name} state
|
||||
info: systemd automount unit in the failed state
|
||||
to: sysadmin
|
||||
|
||||
|
@ -117,6 +124,7 @@ component: Systemd units
|
|||
every: 10s
|
||||
warn: $this != nan AND $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: systemd unit ${label:unit_name} state
|
||||
info: systemd swap units in the failed state
|
||||
to: sysadmin
|
||||
|
||||
|
@ -132,6 +140,7 @@ component: Systemd units
|
|||
every: 10s
|
||||
warn: $this != nan AND $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: systemd unit ${label:unit_name} state
|
||||
info: systemd scope units in the failed state
|
||||
to: sysadmin
|
||||
|
||||
|
@ -147,5 +156,6 @@ component: Systemd units
|
|||
every: 10s
|
||||
warn: $this != nan AND $this == 1
|
||||
delay: down 5m multiplier 1.5 max 1h
|
||||
summary: systemd unit ${label:unit_name} state
|
||||
info: systemd slice units in the failed state
|
||||
to: sysadmin
|
||||
|
|
|
@ -18,5 +18,6 @@ component: Network
|
|||
warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
|
||||
crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
summary: System TCP connections utilization
|
||||
info: IPv4 TCP connections utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -31,7 +31,8 @@ component: Network
|
|||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (1) : (5))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: average number of overflows in the TCP accept queue over the last minute
|
||||
summary: System TCP accept queue overflows
|
||||
info: Average number of overflows in the TCP accept queue over the last minute
|
||||
to: silent
|
||||
|
||||
# THIS IS TOO GENERIC
|
||||
|
@ -49,7 +50,8 @@ component: Network
|
|||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (1) : (5))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: average number of dropped packets in the TCP accept queue over the last minute
|
||||
summary: System TCP accept queue dropped packets
|
||||
info: Average number of dropped packets in the TCP accept queue over the last minute
|
||||
to: silent
|
||||
|
||||
|
||||
|
@ -74,7 +76,8 @@ component: Network
|
|||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (0) : (5))
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \
|
||||
summary: System TCP SYN queue drops
|
||||
info: Average number of SYN requests dropped due to the full TCP SYN queue over the last minute \
|
||||
(SYN cookies were not enabled)
|
||||
to: silent
|
||||
|
||||
|
@ -91,6 +94,7 @@ component: Network
|
|||
warn: $this > 1
|
||||
crit: $this > (($status == $CRITICAL) ? (0) : (5))
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
|
||||
summary: System TCP SYN queue cookies
|
||||
info: Average number of sent SYN cookies due to the full TCP SYN queue over the last minute
|
||||
to: silent
|
||||
|
||||
|
|
|
@ -19,6 +19,6 @@ component: Network
|
|||
warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} ))
|
||||
crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
summary: TCP memory utilization
|
||||
summary: System TCP memory utilization
|
||||
info: TCP memory utilization
|
||||
to: silent
|
||||
|
|
|
@ -20,5 +20,6 @@ component: Network
|
|||
warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
|
||||
crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
|
||||
delay: up 0 down 5m multiplier 1.5 max 1h
|
||||
info: orphan IPv4 TCP sockets utilization
|
||||
summary: System TCP orphan sockets utilization
|
||||
info: Orphan IPv4 TCP sockets utilization
|
||||
to: silent
|
||||
|
|
|
@ -29,7 +29,8 @@ component: Network
|
|||
warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_sent < 5)?(5):($1m_ip_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10)))
|
||||
delay: up 20s down 60m multiplier 1.2 max 2h
|
||||
options: no-clear-notification
|
||||
info: average number of sent TCP RESETS over the last 10 seconds. \
|
||||
summary: System TCP outbound resets
|
||||
info: Average number of sent TCP RESETS over the last 10 seconds. \
|
||||
This can indicate a port scan, \
|
||||
or that a service running on this host has crashed. \
|
||||
Netdata will not send a clear notification for this alarm.
|
||||
|
@ -63,6 +64,7 @@ component: Network
|
|||
warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_received < 5)?(5):($1m_ip_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
|
||||
delay: up 20s down 60m multiplier 1.2 max 2h
|
||||
options: no-clear-notification
|
||||
summary: System TCP inbound resets
|
||||
info: average number of received TCP RESETS over the last 10 seconds. \
|
||||
This can be an indication that a service this host needs has crashed. \
|
||||
Netdata will not send a clear notification for this alarm.
|
||||
|
|
|
@ -15,7 +15,8 @@ component: Network
|
|||
units: errors
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
info: average number of UDP receive buffer errors over the last minute
|
||||
summary: System UDP receive buffer errors
|
||||
info: Average number of UDP receive buffer errors over the last minute
|
||||
delay: up 1m down 60m multiplier 1.2 max 2h
|
||||
to: silent
|
||||
|
||||
|
@ -33,6 +34,7 @@ component: Network
|
|||
units: errors
|
||||
every: 10s
|
||||
warn: $this > (($status >= $WARNING) ? (0) : (10))
|
||||
info: average number of UDP send buffer errors over the last minute
|
||||
summary: System UDP send buffer errors
|
||||
info: Average number of UDP send buffer errors over the last minute
|
||||
delay: up 1m down 60m multiplier 1.2 max 2h
|
||||
to: silent
|
||||
|
|
|
@ -11,7 +11,8 @@ component: Unbound
|
|||
every: 10s
|
||||
warn: $this > 5
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: number of overwritten queries in the request-list
|
||||
summary: Unbound overwritten queries
|
||||
info: Number of overwritten queries in the request-list
|
||||
to: sysadmin
|
||||
|
||||
template: unbound_request_list_dropped
|
||||
|
@ -24,5 +25,6 @@ component: Unbound
|
|||
every: 10s
|
||||
warn: $this > 0
|
||||
delay: up 10 down 5m multiplier 1.5 max 1h
|
||||
info: number of dropped queries in the request-list
|
||||
summary: Unbound dropped queries
|
||||
info: Number of dropped queries in the request-list
|
||||
to: sysadmin
|
||||
|
|
|
@ -16,6 +16,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA system status
|
||||
info: VCSA overall system status is orange. One or more components are degraded.
|
||||
to: sysadmin
|
||||
|
||||
|
@ -29,6 +30,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
crit: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA system status
|
||||
info: VCSA overall system status is red. One or more components are unavailable or will stop functioning soon.
|
||||
to: sysadmin
|
||||
|
||||
|
@ -49,6 +51,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA ApplMgmt service status
|
||||
info: VCSA ApplMgmt component status is orange. It is degraded, and may have serious problems.
|
||||
to: silent
|
||||
|
||||
|
@ -62,6 +65,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA ApplMgmt service status
|
||||
info: VCSA ApplMgmt component status is red. It is unavailable, or will stop functioning soon.
|
||||
to: sysadmin
|
||||
|
||||
|
@ -75,6 +79,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA Load status
|
||||
info: VCSA Load component status is orange. It is degraded, and may have serious problems.
|
||||
to: silent
|
||||
|
||||
|
@ -88,6 +93,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA Load status
|
||||
info: VCSA Load component status is red. It is unavailable, or will stop functioning soon.
|
||||
to: sysadmin
|
||||
|
||||
|
@ -101,6 +107,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA Memory status
|
||||
info: VCSA Memory component status is orange. It is degraded, and may have serious problems.
|
||||
to: silent
|
||||
|
||||
|
@ -114,6 +121,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA Memory status
|
||||
info: VCSA Memory component status is red. It is unavailable, or will stop functioning soon.
|
||||
to: sysadmin
|
||||
|
||||
|
@ -127,6 +135,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA Swap status
|
||||
info: VCSA Swap component status is orange. It is degraded, and may have serious problems.
|
||||
to: silent
|
||||
|
||||
|
@ -140,6 +149,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA Swap status
|
||||
info: VCSA Swap component status is red. It is unavailable, or will stop functioning soon.
|
||||
to: sysadmin
|
||||
|
||||
|
@ -153,6 +163,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA Database status
|
||||
info: VCSA Database Storage component status is orange. It is degraded, and may have serious problems.
|
||||
to: silent
|
||||
|
||||
|
@ -166,6 +177,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA Database status
|
||||
info: VCSA Database Storage component status is red. It is unavailable, or will stop functioning soon.
|
||||
to: sysadmin
|
||||
|
||||
|
@ -179,6 +191,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA Storage status
|
||||
info: VCSA Storage component status is orange. It is degraded, and may have serious problems.
|
||||
to: silent
|
||||
|
||||
|
@ -192,6 +205,7 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA Storage status
|
||||
info: VCSA Storage component status is red. It is unavailable, or will stop functioning soon.
|
||||
to: sysadmin
|
||||
|
||||
|
@ -211,5 +225,6 @@ component: VMware vCenter
|
|||
every: 10s
|
||||
warn: $this == 1
|
||||
delay: down 1m multiplier 1.5 max 1h
|
||||
summary: VCSA software status
|
||||
info: VCSA software packages security updates are available.
|
||||
to: silent
|
||||
|
|
|
@ -15,6 +15,7 @@ component: CPU
|
|||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: vSphere CPU utilization for VM ${label:vm}
|
||||
info: CPU utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
|
||||
to: silent
|
||||
|
||||
|
@ -30,6 +31,7 @@ component: Memory
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: vSphere memory utilization for VM ${label:vm}
|
||||
info: Memory utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
|
||||
to: silent
|
||||
|
||||
|
@ -47,6 +49,7 @@ component: CPU
|
|||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: vSphere ESXi CPU utilization for host ${label:host}
|
||||
info: CPU utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
|
||||
to: sysadmin
|
||||
|
||||
|
@ -62,5 +65,6 @@ component: Memory
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
summary: vSphere ESXi RAM utilization for host ${label:host}
|
||||
info: Memory utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter}
|
||||
to: sysadmin
|
||||
|
|
|
@ -30,7 +30,8 @@ component: Web log
|
|||
every: 10s
|
||||
warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
|
||||
delay: up 1m down 5m multiplier 1.5 max 1h
|
||||
info: percentage of unparsed log lines over the last minute
|
||||
summary: Web log unparsed
|
||||
info: Percentage of unparsed log lines over the last minute
|
||||
to: webmaster
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
@ -66,7 +67,8 @@ component: Web log
|
|||
warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
|
||||
summary: Web log successful
|
||||
info: Ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
|
||||
to: webmaster
|
||||
|
||||
template: web_log_1m_redirects
|
||||
|
@ -80,7 +82,8 @@ component: Web log
|
|||
every: 10s
|
||||
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of redirection HTTP requests over the last minute (3xx except 304)
|
||||
summary: Web log redirects
|
||||
info: Ratio of redirection HTTP requests over the last minute (3xx except 304)
|
||||
to: webmaster
|
||||
|
||||
template: web_log_1m_bad_requests
|
||||
|
@ -94,7 +97,8 @@ component: Web log
|
|||
every: 10s
|
||||
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of client error HTTP requests over the last minute (4xx except 401)
|
||||
summary: Web log bad requests
|
||||
info: Ratio of client error HTTP requests over the last minute (4xx except 401)
|
||||
to: webmaster
|
||||
|
||||
template: web_log_1m_internal_errors
|
||||
|
@ -109,7 +113,8 @@ component: Web log
|
|||
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
|
||||
delay: up 2m down 15m multiplier 1.5 max 1h
|
||||
info: ratio of server error HTTP requests over the last minute (5xx)
|
||||
summary: Web log server errors
|
||||
info: Ratio of server error HTTP requests over the last minute (5xx)
|
||||
to: webmaster
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
@ -145,7 +150,8 @@ component: Web log
|
|||
warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
|
||||
crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average HTTP response time over the last 1 minute
|
||||
summary: Web log processing time
|
||||
info: Average HTTP response time over the last 1 minute
|
||||
options: no-clear-notification
|
||||
to: webmaster
|
||||
|
||||
|
@ -192,7 +198,8 @@ component: Web log
|
|||
crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
options: no-clear-notification
|
||||
info: ratio of successful HTTP requests over over the last 5 minutes, \
|
||||
summary: Web log 5 minutes requests ratio
|
||||
info: Ratio of successful HTTP requests over the last 5 minutes, \
|
||||
compared with the previous 5 minutes \
|
||||
(clear notification for this alarm will not be sent)
|
||||
to: webmaster
|
||||
|
|
|
@ -9,5 +9,6 @@ component: WHOIS
|
|||
every: 60s
|
||||
warn: $this < $days_until_expiration_warning*24*60*60
|
||||
crit: $this < $days_until_expiration_critical*24*60*60
|
||||
info: time until the domain name registration expires
|
||||
summary: Whois expiration time for domain ${label:domain}
|
||||
info: Time until the domain name registration for ${label:domain} expires
|
||||
to: webmaster
|
||||
|
|
|
@ -14,7 +14,8 @@ component: CPU
|
|||
warn: $this > (($status >= $WARNING) ? (75) : (85))
|
||||
crit: $this > (($status == $CRITICAL) ? (85) : (95))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: average CPU utilization over the last 10 minutes
|
||||
summary: CPU utilization
|
||||
info: Average CPU utilization over the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
||||
|
@ -33,7 +34,8 @@ component: Memory
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: memory utilization
|
||||
summary: RAM utilization
|
||||
info: Memory utilization
|
||||
to: sysadmin
|
||||
|
||||
|
||||
|
@ -51,7 +53,8 @@ component: Network
|
|||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of inbound discarded packets for the network interface in the last 10 minutes
|
||||
summary: Inbound network packets discarded
|
||||
info: Number of inbound discarded packets for the network interface in the last 10 minutes
|
||||
to: silent
|
||||
|
||||
template: windows_outbound_packets_discarded
|
||||
|
@ -66,7 +69,8 @@ component: Network
|
|||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of outbound discarded packets for the network interface in the last 10 minutes
|
||||
summary: Outbound network packets discarded
|
||||
info: Number of outbound discarded packets for the network interface in the last 10 minutes
|
||||
to: silent
|
||||
|
||||
template: windows_inbound_packets_errors
|
||||
|
@ -81,7 +85,8 @@ component: Network
|
|||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of inbound errors for the network interface in the last 10 minutes
|
||||
summary: Inbound network errors
|
||||
info: Number of inbound errors for the network interface in the last 10 minutes
|
||||
to: silent
|
||||
|
||||
template: windows_outbound_packets_errors
|
||||
|
@ -96,7 +101,8 @@ component: Network
|
|||
every: 1m
|
||||
warn: $this >= 5
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
info: number of outbound errors for the network interface in the last 10 minutes
|
||||
summary: Outbound network errors
|
||||
info: Number of outbound errors for the network interface in the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
||||
|
@ -115,5 +121,6 @@ component: Disk
|
|||
warn: $this > (($status >= $WARNING) ? (80) : (90))
|
||||
crit: $this > (($status == $CRITICAL) ? (90) : (98))
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: disk space utilization
|
||||
summary: Disk space usage
|
||||
info: Disk space utilization
|
||||
to: sysadmin
|
||||
|
|
|
@ -9,7 +9,8 @@ component: x509 certificates
|
|||
every: 60s
|
||||
warn: $this < $days_until_expiration_warning*24*60*60
|
||||
crit: $this < $days_until_expiration_critical*24*60*60
|
||||
info: time until x509 certificate expires
|
||||
summary: x509 certificate expiration for ${label:source}
|
||||
info: Time until x509 certificate expires for ${label:source}
|
||||
to: webmaster
|
||||
|
||||
template: x509check_revocation_status
|
||||
|
@ -20,5 +21,6 @@ component: x509 certificates
|
|||
calc: $revoked
|
||||
every: 60s
|
||||
crit: $this != nan AND $this != 0
|
||||
info: x509 certificate revocation status (0: revoked, 1: valid)
|
||||
summary: x509 certificate revocation status for ${label:source}
|
||||
info: x509 certificate revocation status (0: revoked, 1: valid) for ${label:source}
|
||||
to: webmaster
|
||||
|
|
|
@ -9,7 +9,7 @@ component: File system
|
|||
every: 1m
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 2h
|
||||
summary: ZFS memory throttle
|
||||
summary: ZFS ARC growth throttling
|
||||
info: number of times ZFS had to limit the ARC growth in the last 10 minutes
|
||||
to: silent
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue