mirror of
https://github.com/netdata/netdata.git
synced 2025-04-06 22:38:55 +00:00
Move health/ under src/ (#16954)
* Move health/ under src/
* Update references to health paths.
  ```
  find . -type f -exec sed -i 's:master/exporting:master/src/exporting:g' {} \;
  ```
* Update .git{,hub} refs to health.
This commit is contained in:
parent
3133f03a10
commit
8c736bb135
567 changed files with 522 additions and 522 deletions
.github
.gitignore
CMakeLists.txt
HISTORICAL_CHANGELOG.md
collectors
cgroups.plugin
integrations
containers.md
kubernetes_containers.md
libvirt_containers.md
lxc_containers.md
ovirt_containers.md
proxmox_containers.md
virtual_machines.md
metadata.yaml
charts.d.plugin/apcupsd
diskspace.plugin
ebpf.plugin
freebsd.plugin
integrations
devstat.md
getifaddrs.md
getmntinfo.md
kern.cp_time.md
kern.ipc.sem.md
net.inet.tcp.states.md
net.inet.tcp.stats.md
net.inet.udp.stats.md
net.isr.md
system.ram.md
vm.loadavg.md
vm.stats.vm.v_swappgs.md
vm.swap_info.md
vm.vmtotal.md
zfs.md
metadata.yaml
freeipmi.plugin
ioping.plugin
macos.plugin
proc.plugin
README.md
integrations
btrfs.md
conntrack.md
disk_statistics.md
entropy.md
inter_process_communication.md
md_raid.md
memory_modules_dimms.md
memory_statistics.md
memory_usage.md
network_interfaces.md
network_statistics.md
power_supply.md
socket_statistics.md
softnet_statistics.md
system_load_average.md
system_statistics.md
zfs_adaptive_replacement_cache.md
zfs_pools.md
metadata.yaml
python.d.plugin
adaptec_raid
anomalies
beanstalk
bind_rndc
boinc
ceph
gearman
haproxy
ipfs
megacli
memcached
retroshare
riakkv
timex.plugin
diagrams
docs
6
.github/CODEOWNERS
vendored
6
.github/CODEOWNERS
vendored
|
@ -19,9 +19,9 @@ src/exporting/ @thiagoftsm
|
|||
src/daemon/ @thiagoftsm @vkalintiris
|
||||
src/database/ @thiagoftsm @vkalintiris
|
||||
docs/ @tkatsoulas @Ancairon
|
||||
health/ @thiagoftsm @vkalintiris
|
||||
health/health.d/ @thiagoftsm
|
||||
health/notifications/ @Ferroin @thiagoftsm
|
||||
src/health/ @thiagoftsm @vkalintiris
|
||||
src/health/health.d/ @thiagoftsm
|
||||
src/health/notifications/ @Ferroin @thiagoftsm
|
||||
src/ml/ @vkalintiris
|
||||
src/libnetdata/ @thiagoftsm @vkalintiris
|
||||
packaging/ @Ferroin @tkatsoulas
|
||||
|
|
2
.github/labeler.yml
vendored
2
.github/labeler.yml
vendored
|
@ -220,7 +220,7 @@ area/health:
|
|||
- any:
|
||||
- changed-files:
|
||||
- any-glob-to-any-file:
|
||||
- health/**
|
||||
- src/health/**
|
||||
|
||||
area/metadata:
|
||||
- any:
|
||||
|
|
2
.github/workflows/generate-integrations.yml
vendored
2
.github/workflows/generate-integrations.yml
vendored
|
@ -8,7 +8,7 @@ on:
|
|||
paths: # If any of these files change, we need to regenerate integrations.js.
|
||||
- 'collectors/**/metadata.yaml'
|
||||
- 'src/exporting/**/metadata.yaml'
|
||||
- 'health/notifications/**/metadata.yaml'
|
||||
- 'src/health/notifications/**/metadata.yaml'
|
||||
- 'integrations/templates/**'
|
||||
- 'integrations/categories.yaml'
|
||||
- 'integrations/deploy.yaml'
|
||||
|
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -86,7 +86,7 @@ system/systemd/netdata-updater.service
|
|||
!system/systemd/netdata.service.in
|
||||
!system/systemd/netdata.service.*.in
|
||||
|
||||
health/notifications/alarm-notify.sh
|
||||
src/health/notifications/alarm-notify.sh
|
||||
claim/netdata-claim.sh
|
||||
collectors/cgroups.plugin/cgroup-name.sh
|
||||
collectors/cgroups.plugin/cgroup-network-helper.sh
|
||||
|
|
|
@ -942,24 +942,24 @@ set(EXPORTING_ENGINE_FILES
|
|||
)
|
||||
|
||||
set(HEALTH_PLUGIN_FILES
|
||||
health/health.c
|
||||
health/health.h
|
||||
health/health_config.c
|
||||
health/health_json.c
|
||||
health/health_log.c
|
||||
health/health_prototypes.c
|
||||
health/health_prototypes.h
|
||||
health/health_silencers.c
|
||||
health/health_silencers.h
|
||||
health/health_internals.h
|
||||
health/health_notifications.c
|
||||
health/health_event_loop.c
|
||||
health/health_dyncfg.c
|
||||
health/health_variable.c
|
||||
health/rrdcalc.c
|
||||
health/rrdcalc.h
|
||||
health/rrdvar.c
|
||||
health/rrdvar.h
|
||||
src/health/health.c
|
||||
src/health/health.h
|
||||
src/health/health_config.c
|
||||
src/health/health_json.c
|
||||
src/health/health_log.c
|
||||
src/health/health_prototypes.c
|
||||
src/health/health_prototypes.h
|
||||
src/health/health_silencers.c
|
||||
src/health/health_silencers.h
|
||||
src/health/health_internals.h
|
||||
src/health/health_notifications.c
|
||||
src/health/health_event_loop.c
|
||||
src/health/health_dyncfg.c
|
||||
src/health/health_variable.c
|
||||
src/health/rrdcalc.c
|
||||
src/health/rrdcalc.h
|
||||
src/health/rrdvar.c
|
||||
src/health/rrdvar.h
|
||||
)
|
||||
|
||||
set(IDLEJITTER_PLUGIN_FILES collectors/idlejitter.plugin/plugin_idlejitter.c)
|
||||
|
@ -2450,24 +2450,24 @@ install(PROGRAMS
|
|||
# health files
|
||||
#
|
||||
|
||||
file(GLOB_RECURSE HEALTH_CONF_FILES "health/health.d/*.conf")
|
||||
file(GLOB_RECURSE HEALTH_CONF_FILES "src/health/health.d/*.conf")
|
||||
install(FILES
|
||||
${HEALTH_CONF_FILES}
|
||||
DESTINATION usr/lib/netdata/conf.d/health.d)
|
||||
|
||||
configure_file(health/notifications/alarm-notify.sh.in health/notifications/alarm-notify.sh @ONLY)
|
||||
configure_file(src/health/notifications/alarm-notify.sh.in src/health/notifications/alarm-notify.sh @ONLY)
|
||||
install(PROGRAMS
|
||||
${CMAKE_BINARY_DIR}/health/notifications/alarm-notify.sh
|
||||
${CMAKE_BINARY_DIR}/src/health/notifications/alarm-notify.sh
|
||||
DESTINATION usr/libexec/netdata/plugins.d)
|
||||
|
||||
install(PROGRAMS
|
||||
health/notifications/alarm-email.sh
|
||||
health/notifications/alarm-test.sh
|
||||
src/health/notifications/alarm-email.sh
|
||||
src/health/notifications/alarm-test.sh
|
||||
DESTINATION usr/libexec/netdata/plugins.d)
|
||||
|
||||
install(FILES
|
||||
health/notifications/health_alarm_notify.conf
|
||||
health/notifications/health_email_recipients.conf
|
||||
src/health/notifications/health_alarm_notify.conf
|
||||
src/health/notifications/health_email_recipients.conf
|
||||
DESTINATION usr/lib/netdata/conf.d)
|
||||
#
|
||||
# test/ files
|
||||
|
@ -2540,7 +2540,7 @@ install(FILES
|
|||
|
||||
install(FILES
|
||||
collectors/systemd-journal.plugin/schema.d/systemd-journal:monitored-directories.json
|
||||
health/schema.d/health:alert:prototype.json
|
||||
src/health/schema.d/health:alert:prototype.json
|
||||
DESTINATION usr/lib/netdata/conf.d/schema.d)
|
||||
|
||||
#
|
||||
|
|
|
@ -456,7 +456,7 @@ netdata (1.3.0) - 2016-08-28
|
|||
netdata now has a powerful health monitoring system embedded.
|
||||
Please check the wiki page:
|
||||
|
||||
<https://github.com/netdata/netdata/tree/master/health>
|
||||
<https://github.com/netdata/netdata/tree/master/src/health>
|
||||
|
||||
- netdata has badges!
|
||||
|
||||
|
|
|
@ -140,10 +140,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -154,10 +154,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ k8s_cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ k8s_cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ k8s_cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ k8s_cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
| [ k8s_cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ k8s_cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ k8s_cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ k8s_cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | k8s.cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -140,10 +140,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -140,10 +140,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -140,10 +140,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -140,10 +140,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -140,10 +140,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
| [ cgroup_10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.cpu_limit | average cgroup CPU utilization over the last 10 minutes |
|
||||
| [ cgroup_ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.mem_usage | cgroup memory utilization |
|
||||
| [ cgroup_1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ cgroup_10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf) | cgroup.net_packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -58,19 +58,19 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: cgroup_10min_cpu_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
|
||||
metric: cgroup.cpu_limit
|
||||
info: average cgroup CPU utilization over the last 10 minutes
|
||||
- name: cgroup_ram_in_use
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
|
||||
metric: cgroup.mem_usage
|
||||
info: cgroup memory utilization
|
||||
- name: cgroup_1m_received_packets_rate
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
|
||||
metric: cgroup.net_packets
|
||||
info: average number of packets received by the network interface ${label:device} over the last minute
|
||||
- name: cgroup_10s_received_packets_storm
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
|
||||
metric: cgroup.net_packets
|
||||
info:
|
||||
ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over
|
||||
|
@ -428,19 +428,19 @@ modules:
|
|||
metrics_description: Monitor Kubernetes Clusters for performance, resource usage, and health status.
|
||||
alerts:
|
||||
- name: k8s_cgroup_10min_cpu_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
|
||||
metric: k8s.cgroup.cpu_limit
|
||||
info: average cgroup CPU utilization over the last 10 minutes
|
||||
- name: k8s_cgroup_ram_in_use
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
|
||||
metric: k8s.cgroup.mem_usage
|
||||
info: cgroup memory utilization
|
||||
- name: k8s_cgroup_1m_received_packets_rate
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
|
||||
metric: k8s.cgroup.net_packets
|
||||
info: average number of packets received by the network interface ${label:device} over the last minute
|
||||
- name: k8s_cgroup_10s_received_packets_storm
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cgroups.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cgroups.conf
|
||||
metric: k8s.cgroup.net_packets
|
||||
info:
|
||||
ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over
|
||||
|
|
|
@ -85,16 +85,16 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ apcupsd_ups_charge ](https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf) | apcupsd.charge | average UPS charge over the last minute |
|
||||
| [ apcupsd_10min_ups_load ](https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf) | apcupsd.load | average UPS load over the last 10 minutes |
|
||||
| [ apcupsd_last_collected_secs ](https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf) | apcupsd.load | number of seconds since the last successful data collection |
|
||||
| [ apcupsd_selftest_warning ](https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf) | apcupsd.selftest | self-test failed due to insufficient battery capacity or due to overload. |
|
||||
| [ apcupsd_status_onbatt ](https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS has switched to battery power because the input power has failed |
|
||||
| [ apcupsd_status_overload ](https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS is overloaded and cannot supply enough power to the load |
|
||||
| [ apcupsd_status_lowbatt ](https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS battery is low and needs to be recharged |
|
||||
| [ apcupsd_status_replacebatt ](https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS battery has reached the end of its lifespan and needs to be replaced |
|
||||
| [ apcupsd_status_nobatt ](https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS has no battery |
|
||||
| [ apcupsd_status_commlost ](https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS communication link is lost |
|
||||
| [ apcupsd_ups_charge ](https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf) | apcupsd.charge | average UPS charge over the last minute |
|
||||
| [ apcupsd_10min_ups_load ](https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf) | apcupsd.load | average UPS load over the last 10 minutes |
|
||||
| [ apcupsd_last_collected_secs ](https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf) | apcupsd.load | number of seconds since the last successful data collection |
|
||||
| [ apcupsd_selftest_warning ](https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf) | apcupsd.selftest | self-test failed due to insufficient battery capacity or due to overload. |
|
||||
| [ apcupsd_status_onbatt ](https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS has switched to battery power because the input power has failed |
|
||||
| [ apcupsd_status_overload ](https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS is overloaded and cannot supply enough power to the load |
|
||||
| [ apcupsd_status_lowbatt ](https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS battery is low and needs to be recharged |
|
||||
| [ apcupsd_status_replacebatt ](https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS battery has reached the end of its lifespan and needs to be replaced |
|
||||
| [ apcupsd_status_nobatt ](https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS has no battery |
|
||||
| [ apcupsd_status_commlost ](https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf) | apcupsd.status | APC UPS communication link is lost |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -111,45 +111,45 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: apcupsd_ups_charge
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf
|
||||
metric: apcupsd.charge
|
||||
info: average UPS charge over the last minute
|
||||
os: "*"
|
||||
- name: apcupsd_10min_ups_load
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf
|
||||
metric: apcupsd.load
|
||||
info: average UPS load over the last 10 minutes
|
||||
os: "*"
|
||||
- name: apcupsd_last_collected_secs
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf
|
||||
metric: apcupsd.load
|
||||
info: number of seconds since the last successful data collection
|
||||
- name: apcupsd_selftest_warning
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf
|
||||
metric: apcupsd.selftest
|
||||
info: self-test failed due to insufficient battery capacity or due to overload.
|
||||
- name: apcupsd_status_onbatt
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf
|
||||
metric: apcupsd.status
|
||||
info: APC UPS has switched to battery power because the input power has failed
|
||||
- name: apcupsd_status_overload
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf
|
||||
metric: apcupsd.status
|
||||
info: APC UPS is overloaded and cannot supply enough power to the load
|
||||
- name: apcupsd_status_lowbatt
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf
|
||||
metric: apcupsd.status
|
||||
info: APC UPS battery is low and needs to be recharged
|
||||
- name: apcupsd_status_replacebatt
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf
|
||||
metric: apcupsd.status
|
||||
info: APC UPS battery has reached the end of its lifespan and needs to be replaced
|
||||
- name: apcupsd_status_nobatt
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf
|
||||
metric: apcupsd.status
|
||||
info: APC UPS has no battery
|
||||
- name: apcupsd_status_commlost
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/apcupsd.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/apcupsd.conf
|
||||
metric: apcupsd.status
|
||||
info: APC UPS communication link is lost
|
||||
metrics:
|
||||
|
|
|
@ -81,8 +81,8 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ disk_space_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf) | disk.space | disk ${label:mount_point} space utilization |
|
||||
| [ disk_inode_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf) | disk.inodes | disk ${label:mount_point} inode utilization |
|
||||
| [ disk_space_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf) | disk.space | disk ${label:mount_point} space utilization |
|
||||
| [ disk_inode_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf) | disk.inodes | disk ${label:mount_point} inode utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -95,12 +95,12 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: disk_space_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf
|
||||
metric: disk.space
|
||||
info: disk ${label:mount_point} space utilization
|
||||
os: "linux freebsd"
|
||||
- name: disk_inode_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf
|
||||
metric: disk.inodes
|
||||
info: disk ${label:mount_point} inode utilization
|
||||
os: "linux freebsd"
|
||||
|
|
|
@ -80,7 +80,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ sync_freq ](https://github.com/netdata/netdata/blob/master/health/health.d/synchronization.conf) | mem.sync | number of sync() system calls. Every call causes all pending modifications to filesystem metadata and cached file data to be written to the underlying filesystems. |
|
||||
| [ sync_freq ](https://github.com/netdata/netdata/blob/master/src/health/health.d/synchronization.conf) | mem.sync | number of sync() system calls. Every call causes all pending modifications to filesystem metadata and cached file data to be written to the underlying filesystems. |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -1065,7 +1065,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: sync_freq
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/synchronization.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/synchronization.conf
|
||||
metric: mem.sync
|
||||
info:
|
||||
number of sync() system calls. Every call causes all pending modifications to filesystem metadata and cached file data to be written to the
|
||||
|
|
|
@ -93,7 +93,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ 10min_disk_utilization ](https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf) | disk.util | average percentage of time ${label:device} disk was busy over the last 10 minutes |
|
||||
| [ 10min_disk_utilization ](https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf) | disk.util | average percentage of time ${label:device} disk was busy over the last 10 minutes |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -93,13 +93,13 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ interface_speed ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.net | network interface ${label:device} current speed |
|
||||
| [ inbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.drops | ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ outbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.drops | ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ 1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ 10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
| [ interface_inbound_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.errors | number of inbound errors for the network interface ${label:device} in the last 10 minutes |
|
||||
| [ interface_outbound_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.errors | number of outbound errors for the network interface ${label:device} in the last 10 minutes |
|
||||
| [ interface_speed ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.net | network interface ${label:device} current speed |
|
||||
| [ inbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.drops | ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ outbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.drops | ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ 1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ 10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
| [ interface_inbound_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.errors | number of inbound errors for the network interface ${label:device} in the last 10 minutes |
|
||||
| [ interface_outbound_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.errors | number of outbound errors for the network interface ${label:device} in the last 10 minutes |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -75,8 +75,8 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ disk_space_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf) | disk.space | disk ${label:mount_point} space utilization |
|
||||
| [ disk_inode_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf) | disk.inodes | disk ${label:mount_point} inode utilization |
|
||||
| [ disk_space_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf) | disk.space | disk ${label:mount_point} space utilization |
|
||||
| [ disk_inode_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf) | disk.inodes | disk ${label:mount_point} inode utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -86,10 +86,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ 10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf) | system.cpu | average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) |
|
||||
| [ 10min_cpu_iowait ](https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf) | system.cpu | average CPU iowait time over the last 10 minutes |
|
||||
| [ 20min_steal_cpu ](https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf) | system.cpu | average CPU steal time over the last 20 minutes |
|
||||
| [ 10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf) | system.cpu | average CPU utilization over the last 10 minutes (excluding nice) |
|
||||
| [ 10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf) | system.cpu | average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) |
|
||||
| [ 10min_cpu_iowait ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf) | system.cpu | average CPU iowait time over the last 10 minutes |
|
||||
| [ 20min_steal_cpu ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf) | system.cpu | average CPU steal time over the last 20 minutes |
|
||||
| [ 10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf) | system.cpu | average CPU utilization over the last 10 minutes (excluding nice) |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -75,8 +75,8 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ semaphores_used ](https://github.com/netdata/netdata/blob/master/health/health.d/ipc.conf) | system.ipc_semaphores | IPC semaphore utilization |
|
||||
| [ semaphore_arrays_used ](https://github.com/netdata/netdata/blob/master/health/health.d/ipc.conf) | system.ipc_semaphore_arrays | IPC semaphore arrays utilization |
|
||||
| [ semaphores_used ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ipc.conf) | system.ipc_semaphores | IPC semaphore utilization |
|
||||
| [ semaphore_arrays_used ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ipc.conf) | system.ipc_semaphore_arrays | IPC semaphore arrays utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -74,7 +74,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ tcp_connections ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_conn.conf) | ipv4.tcpsock | IPv4 TCP connections utilization |
|
||||
| [ tcp_connections ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_conn.conf) | ipv4.tcpsock | IPv4 TCP connections utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -81,10 +81,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ 1m_ipv4_tcp_resets_sent ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf) | ipv4.tcphandshake | average number of sent TCP RESETS over the last minute |
|
||||
| [ 10s_ipv4_tcp_resets_sent ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf) | ipv4.tcphandshake | average number of sent TCP RESETS over the last 10 seconds. This can indicate a port scan, or that a service running on this host has crashed. Netdata will not send a clear notification for this alarm. |
|
||||
| [ 1m_ipv4_tcp_resets_received ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf) | ipv4.tcphandshake | average number of received TCP RESETS over the last minute |
|
||||
| [ 10s_ipv4_tcp_resets_received ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf) | ipv4.tcphandshake | average number of received TCP RESETS over the last 10 seconds. This can be an indication that a service this host needs has crashed. Netdata will not send a clear notification for this alarm. |
|
||||
| [ 1m_ipv4_tcp_resets_sent ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf) | ipv4.tcphandshake | average number of sent TCP RESETS over the last minute |
|
||||
| [ 10s_ipv4_tcp_resets_sent ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf) | ipv4.tcphandshake | average number of sent TCP RESETS over the last 10 seconds. This can indicate a port scan, or that a service running on this host has crashed. Netdata will not send a clear notification for this alarm. |
|
||||
| [ 1m_ipv4_tcp_resets_received ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf) | ipv4.tcphandshake | average number of received TCP RESETS over the last minute |
|
||||
| [ 10s_ipv4_tcp_resets_received ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf) | ipv4.tcphandshake | average number of received TCP RESETS over the last 10 seconds. This can be an indication that a service this host needs has crashed. Netdata will not send a clear notification for this alarm. |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -75,8 +75,8 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ 1m_ipv4_udp_receive_buffer_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf) | ipv4.udperrors | average number of UDP receive buffer errors over the last minute |
|
||||
| [ 1m_ipv4_udp_send_buffer_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf) | ipv4.udperrors | average number of UDP send buffer errors over the last minute |
|
||||
| [ 1m_ipv4_udp_receive_buffer_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/udp_errors.conf) | ipv4.udperrors | average number of UDP receive buffer errors over the last minute |
|
||||
| [ 1m_ipv4_udp_send_buffer_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/udp_errors.conf) | ipv4.udperrors | average number of UDP send buffer errors over the last minute |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -86,9 +86,9 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ 1min_netdev_backlog_exceeded ](https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf) | system.softnet_stat | average number of dropped packets in the last minute due to exceeded net.core.netdev_max_backlog |
|
||||
| [ 1min_netdev_budget_ran_outs ](https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf) | system.softnet_stat | average number of times ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs with work remaining over the last minute (this can be a cause for dropped packets) |
|
||||
| [ 10min_netisr_backlog_exceeded ](https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf) | system.softnet_stat | average number of drops in the last minute due to exceeded sysctl net.route.netisr_maxqlen (this can be a cause for dropped packets) |
|
||||
| [ 1min_netdev_backlog_exceeded ](https://github.com/netdata/netdata/blob/master/src/health/health.d/softnet.conf) | system.softnet_stat | average number of dropped packets in the last minute due to exceeded net.core.netdev_max_backlog |
|
||||
| [ 1min_netdev_budget_ran_outs ](https://github.com/netdata/netdata/blob/master/src/health/health.d/softnet.conf) | system.softnet_stat | average number of times ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs with work remaining over the last minute (this can be a cause for dropped packets) |
|
||||
| [ 10min_netisr_backlog_exceeded ](https://github.com/netdata/netdata/blob/master/src/health/health.d/softnet.conf) | system.softnet_stat | average number of drops in the last minute due to exceeded sysctl net.route.netisr_maxqlen (this can be a cause for dropped packets) |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -75,10 +75,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf) | system.ram | system memory utilization |
|
||||
| [ ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf) | system.ram | system memory utilization |
|
||||
| [ ram_available ](https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf) | mem.available | percentage of estimated amount of RAM available for userspace processes, without causing swapping |
|
||||
| [ ram_available ](https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf) | mem.available | percentage of estimated amount of RAM available for userspace processes, without causing swapping |
|
||||
| [ ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf) | system.ram | system memory utilization |
|
||||
| [ ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf) | system.ram | system memory utilization |
|
||||
| [ ram_available ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf) | mem.available | percentage of estimated amount of RAM available for userspace processes, without causing swapping |
|
||||
| [ ram_available ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf) | mem.available | percentage of estimated amount of RAM available for userspace processes, without causing swapping |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -74,10 +74,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ load_cpu_number ](https://github.com/netdata/netdata/blob/master/health/health.d/load.conf) | system.load | number of active CPU cores in the system |
|
||||
| [ load_average_15 ](https://github.com/netdata/netdata/blob/master/health/health.d/load.conf) | system.load | system fifteen-minute load average |
|
||||
| [ load_average_5 ](https://github.com/netdata/netdata/blob/master/health/health.d/load.conf) | system.load | system five-minute load average |
|
||||
| [ load_average_1 ](https://github.com/netdata/netdata/blob/master/health/health.d/load.conf) | system.load | system one-minute load average |
|
||||
| [ load_cpu_number ](https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf) | system.load | number of active CPU cores in the system |
|
||||
| [ load_average_15 ](https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf) | system.load | system fifteen-minute load average |
|
||||
| [ load_average_5 ](https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf) | system.load | system five-minute load average |
|
||||
| [ load_average_1 ](https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf) | system.load | system one-minute load average |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -74,7 +74,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ 30min_ram_swapped_out ](https://github.com/netdata/netdata/blob/master/health/health.d/swap.conf) | mem.swapio | percentage of the system RAM swapped in the last 30 minutes |
|
||||
| [ 30min_ram_swapped_out ](https://github.com/netdata/netdata/blob/master/src/health/health.d/swap.conf) | mem.swapio | percentage of the system RAM swapped in the last 30 minutes |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -74,7 +74,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ used_swap ](https://github.com/netdata/netdata/blob/master/health/health.d/swap.conf) | mem.swap | swap memory utilization |
|
||||
| [ used_swap ](https://github.com/netdata/netdata/blob/master/src/health/health.d/swap.conf) | mem.swap | swap memory utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -76,7 +76,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ active_processes ](https://github.com/netdata/netdata/blob/master/health/health.d/processes.conf) | system.active_processes | system process IDs (PID) space utilization |
|
||||
| [ active_processes ](https://github.com/netdata/netdata/blob/master/src/health/health.d/processes.conf) | system.active_processes | system process IDs (PID) space utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -101,7 +101,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ zfs_memory_throttle ](https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf) | zfs.memory_ops | number of times ZFS had to limit the ARC growth in the last 10 minutes |
|
||||
| [ zfs_memory_throttle ](https://github.com/netdata/netdata/blob/master/src/health/health.d/zfs.conf) | zfs.memory_ops | number of times ZFS had to limit the ARC growth in the last 10 minutes |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -61,22 +61,22 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: load_cpu_number
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf
|
||||
metric: system.load
|
||||
info: number of active CPU cores in the system
|
||||
os: "linux"
|
||||
- name: load_average_15
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf
|
||||
metric: system.load
|
||||
info: system fifteen-minute load average
|
||||
os: "linux"
|
||||
- name: load_average_5
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf
|
||||
metric: system.load
|
||||
info: system five-minute load average
|
||||
os: "linux"
|
||||
- name: load_average_1
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf
|
||||
metric: system.load
|
||||
info: system one-minute load average
|
||||
os: "linux"
|
||||
|
@ -168,7 +168,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: active_processes
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/processes.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/processes.conf
|
||||
metric: system.active_processes
|
||||
info: system process IDs (PID) space utilization
|
||||
metrics:
|
||||
|
@ -261,22 +261,22 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: 10min_cpu_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf
|
||||
metric: system.cpu
|
||||
info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
|
||||
os: "linux"
|
||||
- name: 10min_cpu_iowait
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf
|
||||
metric: system.cpu
|
||||
info: average CPU iowait time over the last 10 minutes
|
||||
os: "linux"
|
||||
- name: 20min_steal_cpu
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf
|
||||
metric: system.cpu
|
||||
info: average CPU steal time over the last 20 minutes
|
||||
os: "linux"
|
||||
- name: 10min_cpu_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf
|
||||
metric: system.cpu
|
||||
info: average CPU utilization over the last 10 minutes (excluding nice)
|
||||
os: "freebsd"
|
||||
|
@ -850,7 +850,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: used_swap
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/swap.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/swap.conf
|
||||
metric: mem.swap
|
||||
info: swap memory utilization
|
||||
os: "linux freebsd"
|
||||
|
@ -933,22 +933,22 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: ram_in_use
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf
|
||||
metric: system.ram
|
||||
info: system memory utilization
|
||||
os: "linux"
|
||||
- name: ram_in_use
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf
|
||||
metric: system.ram
|
||||
info: system memory utilization
|
||||
os: "freebsd"
|
||||
- name: ram_available
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf
|
||||
metric: mem.available
|
||||
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
|
||||
os: "linux"
|
||||
- name: ram_available
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf
|
||||
metric: mem.available
|
||||
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
|
||||
os: "freebsd"
|
||||
|
@ -1042,7 +1042,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: 30min_ram_swapped_out
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/swap.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/swap.conf
|
||||
metric: mem.swapio
|
||||
info: percentage of the system RAM swapped in the last 30 minutes
|
||||
os: "linux freebsd"
|
||||
|
@ -1206,12 +1206,12 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: semaphores_used
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ipc.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ipc.conf
|
||||
metric: system.ipc_semaphores
|
||||
info: IPC semaphore utilization
|
||||
os: "linux"
|
||||
- name: semaphore_arrays_used
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ipc.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ipc.conf
|
||||
metric: system.ipc_semaphore_arrays
|
||||
info: IPC semaphore arrays utilization
|
||||
os: "linux"
|
||||
|
@ -1553,19 +1553,19 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: 1min_netdev_backlog_exceeded
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/softnet.conf
|
||||
metric: system.softnet_stat
|
||||
info: average number of dropped packets in the last minute due to exceeded net.core.netdev_max_backlog
|
||||
os: "linux"
|
||||
- name: 1min_netdev_budget_ran_outs
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/softnet.conf
|
||||
metric: system.softnet_stat
|
||||
info:
|
||||
average number of times ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs with work remaining over the last
|
||||
minute (this can be a cause for dropped packets)
|
||||
os: "linux"
|
||||
- name: 10min_netisr_backlog_exceeded
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/softnet.conf
|
||||
metric: system.softnet_stat
|
||||
info: average number of drops in the last minute due to exceeded sysctl net.route.netisr_maxqlen (this can be a cause for dropped packets)
|
||||
os: "freebsd"
|
||||
|
@ -1707,7 +1707,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: 10min_disk_utilization
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf
|
||||
metric: disk.util
|
||||
info: average percentage of time ${label:device} disk was busy over the last 10 minutes
|
||||
os: "linux freebsd"
|
||||
|
@ -1855,7 +1855,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: tcp_connections
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_conn.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_conn.conf
|
||||
metric: ipv4.tcpsock
|
||||
info: IPv4 TCP connections utilization
|
||||
os: "linux"
|
||||
|
@ -1965,24 +1965,24 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: 1m_ipv4_tcp_resets_sent
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf
|
||||
metric: ipv4.tcphandshake
|
||||
info: average number of sent TCP RESETS over the last minute
|
||||
os: "linux"
|
||||
- name: 10s_ipv4_tcp_resets_sent
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf
|
||||
metric: ipv4.tcphandshake
|
||||
info:
|
||||
average number of sent TCP RESETS over the last 10 seconds. This can indicate a port scan, or that a service running on this host has
|
||||
crashed. Netdata will not send a clear notification for this alarm.
|
||||
os: "linux"
|
||||
- name: 1m_ipv4_tcp_resets_received
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf
|
||||
metric: ipv4.tcphandshake
|
||||
info: average number of received TCP RESETS over the last minute
|
||||
os: "linux freebsd"
|
||||
- name: 10s_ipv4_tcp_resets_received
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf
|
||||
metric: ipv4.tcphandshake
|
||||
info:
|
||||
average number of received TCP RESETS over the last 10 seconds. This can be an indication that a service this host needs has crashed.
|
||||
|
@ -2128,12 +2128,12 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: 1m_ipv4_udp_receive_buffer_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/udp_errors.conf
|
||||
metric: ipv4.udperrors
|
||||
info: average number of UDP receive buffer errors over the last minute
|
||||
os: "linux freebsd"
|
||||
- name: 1m_ipv4_udp_send_buffer_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/udp_errors.conf
|
||||
metric: ipv4.udperrors
|
||||
info: average number of UDP send buffer errors over the last minute
|
||||
os: "linux"
|
||||
|
@ -2889,37 +2889,37 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: interface_speed
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.net
|
||||
info: network interface ${label:device} current speed
|
||||
os: "*"
|
||||
- name: inbound_packets_dropped_ratio
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.drops
|
||||
info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
|
||||
os: "*"
|
||||
- name: outbound_packets_dropped_ratio
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.drops
|
||||
info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
|
||||
os: "*"
|
||||
- name: 1m_received_packets_rate
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.packets
|
||||
info: average number of packets received by the network interface ${label:device} over the last minute
|
||||
os: "linux freebsd"
|
||||
- name: 10s_received_packets_storm
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.packets
|
||||
info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute
|
||||
os: "linux freebsd"
|
||||
- name: interface_inbound_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.errors
|
||||
info: number of inbound errors for the network interface ${label:device} in the last 10 minutes
|
||||
os: "freebsd"
|
||||
- name: interface_outbound_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.errors
|
||||
info: number of outbound errors for the network interface ${label:device} in the last 10 minutes
|
||||
os: "freebsd"
|
||||
|
@ -3081,12 +3081,12 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: disk_space_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf
|
||||
metric: disk.space
|
||||
info: disk ${label:mount_point} space utilization
|
||||
os: "linux freebsd"
|
||||
- name: disk_inode_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf
|
||||
metric: disk.inodes
|
||||
info: disk ${label:mount_point} inode utilization
|
||||
os: "linux freebsd"
|
||||
|
@ -3178,7 +3178,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: zfs_memory_throttle
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/zfs.conf
|
||||
metric: zfs.memory_ops
|
||||
info: number of times ZFS had to limit the ARC growth in the last 10 minutes
|
||||
metrics:
|
||||
|
|
|
@ -103,7 +103,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ ipmi_sensor_state ](https://github.com/netdata/netdata/blob/master/health/health.d/ipmi.conf) | ipmi.sensor_state | IPMI sensor ${label:sensor} (${label:component}) state |
|
||||
| [ ipmi_sensor_state ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ipmi.conf) | ipmi.sensor_state | IPMI sensor ${label:sensor} (${label:component}) state |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -263,7 +263,7 @@ modules:
|
|||
You can also use a higher number (this is the number of microseconds to poll IPMI for a response, before waiting for a tick).
|
||||
alerts:
|
||||
- name: ipmi_sensor_state
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ipmi.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ipmi.conf
|
||||
metric: ipmi.sensor_state
|
||||
info: IPMI sensor ${label:sensor} (${label:component}) state
|
||||
metrics:
|
||||
|
|
|
@ -74,7 +74,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ ioping_disk_latency ](https://github.com/netdata/netdata/blob/master/health/health.d/ioping.conf) | ioping.latency | average I/O latency over the last 10 seconds |
|
||||
| [ ioping_disk_latency ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ioping.conf) | ioping.latency | average I/O latency over the last 10 seconds |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -79,7 +79,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: ioping_disk_latency
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ioping.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ioping.conf
|
||||
metric: ioping.latency
|
||||
info: average I/O latency over the last 10 seconds
|
||||
metrics:
|
||||
|
|
|
@ -163,7 +163,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ interface_speed ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.net | network interface ${label:device} current speed |
|
||||
| [ interface_speed ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.net | network interface ${label:device} current speed |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -279,7 +279,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: interface_speed
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.net
|
||||
info: network interface ${label:device} current speed
|
||||
os: "*"
|
||||
|
|
|
@ -402,7 +402,7 @@ You can set the following values for each configuration option:
|
|||
|
||||
There are several alerts defined in `health.d/net.conf`.
|
||||
|
||||
The tricky ones are `inbound packets dropped` and `inbound packets dropped ratio`. They have quite a strict policy so that they warn users about possible issues. These alerts can be annoying for some network configurations. It is especially true for some bonding configurations if an interface is a child or a bonding interface itself. If it is expected to have a certain number of drops on an interface for a certain network configuration, a separate alert with different triggering thresholds can be created or the existing one can be disabled for this specific interface. It can be done with the help of the [families](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-families) line in the alert configuration. For example, if you want to disable the `inbound packets dropped` alert for `eth0`, set `families: !eth0 *` in the alert definition for `template: inbound_packets_dropped`.
|
||||
The tricky ones are `inbound packets dropped` and `inbound packets dropped ratio`. They have quite a strict policy so that they warn users about possible issues. These alerts can be annoying for some network configurations. It is especially true for some bonding configurations if an interface is a child or a bonding interface itself. If it is expected to have a certain number of drops on an interface for a certain network configuration, a separate alert with different triggering thresholds can be created or the existing one can be disabled for this specific interface. It can be done with the help of the [families](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-families) line in the alert configuration. For example, if you want to disable the `inbound packets dropped` alert for `eth0`, set `families: !eth0 *` in the alert definition for `template: inbound_packets_dropped`.
|
||||
|
||||
#### configuration
|
||||
|
||||
|
|
|
@ -103,15 +103,15 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ btrfs_allocated ](https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf) | btrfs.disk | percentage of allocated BTRFS physical disk space |
|
||||
| [ btrfs_data ](https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf) | btrfs.data | utilization of BTRFS data space |
|
||||
| [ btrfs_metadata ](https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf) | btrfs.metadata | utilization of BTRFS metadata space |
|
||||
| [ btrfs_system ](https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf) | btrfs.system | utilization of BTRFS system space |
|
||||
| [ btrfs_device_read_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf) | btrfs.device_errors | number of encountered BTRFS read errors |
|
||||
| [ btrfs_device_write_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf) | btrfs.device_errors | number of encountered BTRFS write errors |
|
||||
| [ btrfs_device_flush_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf) | btrfs.device_errors | number of encountered BTRFS flush errors |
|
||||
| [ btrfs_device_corruption_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf) | btrfs.device_errors | number of encountered BTRFS corruption errors |
|
||||
| [ btrfs_device_generation_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf) | btrfs.device_errors | number of encountered BTRFS generation errors |
|
||||
| [ btrfs_allocated ](https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf) | btrfs.disk | percentage of allocated BTRFS physical disk space |
|
||||
| [ btrfs_data ](https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf) | btrfs.data | utilization of BTRFS data space |
|
||||
| [ btrfs_metadata ](https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf) | btrfs.metadata | utilization of BTRFS metadata space |
|
||||
| [ btrfs_system ](https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf) | btrfs.system | utilization of BTRFS system space |
|
||||
| [ btrfs_device_read_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf) | btrfs.device_errors | number of encountered BTRFS read errors |
|
||||
| [ btrfs_device_write_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf) | btrfs.device_errors | number of encountered BTRFS write errors |
|
||||
| [ btrfs_device_flush_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf) | btrfs.device_errors | number of encountered BTRFS flush errors |
|
||||
| [ btrfs_device_corruption_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf) | btrfs.device_errors | number of encountered BTRFS corruption errors |
|
||||
| [ btrfs_device_generation_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf) | btrfs.device_errors | number of encountered BTRFS generation errors |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -79,7 +79,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ netfilter_conntrack_full ](https://github.com/netdata/netdata/blob/master/health/health.d/netfilter.conf) | netfilter.conntrack_sockets | netfilter connection tracker table size utilization |
|
||||
| [ netfilter_conntrack_full ](https://github.com/netdata/netdata/blob/master/src/health/health.d/netfilter.conf) | netfilter.conntrack_sockets | netfilter connection tracker table size utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -120,10 +120,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ 10min_disk_backlog ](https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf) | disk.backlog | average backlog size of the ${label:device} disk over the last 10 minutes |
|
||||
| [ 10min_disk_utilization ](https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf) | disk.util | average percentage of time ${label:device} disk was busy over the last 10 minutes |
|
||||
| [ bcache_cache_dirty ](https://github.com/netdata/netdata/blob/master/health/health.d/bcache.conf) | disk.bcache_cache_alloc | percentage of cache space used for dirty data and metadata (this usually means your SSD cache is too small) |
|
||||
| [ bcache_cache_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/bcache.conf) | disk.bcache_cache_read_races | number of times data was read from the cache, the bucket was reused and invalidated in the last 10 minutes (when this occurs the data is reread from the backing device) |
|
||||
| [ 10min_disk_backlog ](https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf) | disk.backlog | average backlog size of the ${label:device} disk over the last 10 minutes |
|
||||
| [ 10min_disk_utilization ](https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf) | disk.util | average percentage of time ${label:device} disk was busy over the last 10 minutes |
|
||||
| [ bcache_cache_dirty ](https://github.com/netdata/netdata/blob/master/src/health/health.d/bcache.conf) | disk.bcache_cache_alloc | percentage of cache space used for dirty data and metadata (this usually means your SSD cache is too small) |
|
||||
| [ bcache_cache_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/bcache.conf) | disk.bcache_cache_read_races | number of times data was read from the cache, the bucket was reused and invalidated in the last 10 minutes (when this occurs the data is reread from the backing device) |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -107,7 +107,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ lowest_entropy ](https://github.com/netdata/netdata/blob/master/health/health.d/entropy.conf) | system.entropy | minimum number of bits of entropy available for the kernel’s random number generator |
|
||||
| [ lowest_entropy ](https://github.com/netdata/netdata/blob/master/src/health/health.d/entropy.conf) | system.entropy | minimum number of bits of entropy available for the kernel’s random number generator |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -93,8 +93,8 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ semaphores_used ](https://github.com/netdata/netdata/blob/master/health/health.d/ipc.conf) | system.ipc_semaphores | IPC semaphore utilization |
|
||||
| [ semaphore_arrays_used ](https://github.com/netdata/netdata/blob/master/health/health.d/ipc.conf) | system.ipc_semaphore_arrays | IPC semaphore arrays utilization |
|
||||
| [ semaphores_used ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ipc.conf) | system.ipc_semaphores | IPC semaphore utilization |
|
||||
| [ semaphore_arrays_used ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ipc.conf) | system.ipc_semaphore_arrays | IPC semaphore arrays utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -96,10 +96,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ mdstat_last_collected ](https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf) | md.disks | number of seconds since the last successful data collection |
|
||||
| [ mdstat_disks ](https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf) | md.disks | number of devices in the down state for the ${label:device} ${label:raid_level} array. Any number > 0 indicates that the array is degraded. |
|
||||
| [ mdstat_mismatch_cnt ](https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf) | md.mismatch_cnt | number of unsynchronized blocks for the ${label:device} ${label:raid_level} array |
|
||||
| [ mdstat_nonredundant_last_collected ](https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf) | md.nonredundant | number of seconds since the last successful data collection |
|
||||
| [ mdstat_last_collected ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mdstat.conf) | md.disks | number of seconds since the last successful data collection |
|
||||
| [ mdstat_disks ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mdstat.conf) | md.disks | number of devices in the down state for the ${label:device} ${label:raid_level} array. Any number > 0 indicates that the array is degraded. |
|
||||
| [ mdstat_mismatch_cnt ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mdstat.conf) | md.mismatch_cnt | number of unsynchronized blocks for the ${label:device} ${label:raid_level} array |
|
||||
| [ mdstat_nonredundant_last_collected ](https://github.com/netdata/netdata/blob/master/src/health/health.d/mdstat.conf) | md.nonredundant | number of seconds since the last successful data collection |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -117,10 +117,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ ecc_memory_mc_noinfo_correctable ](https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf) | mem.edac_mc | memory controller ${label:controller} ECC correctable errors (unknown DIMM slot) in the last 10 minutes |
|
||||
| [ ecc_memory_mc_noinfo_uncorrectable ](https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf) | mem.edac_mc | memory controller ${label:controller} ECC uncorrectable errors (unknown DIMM slot) in the last 10 minutes |
|
||||
| [ ecc_memory_dimm_correctable ](https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf) | mem.edac_mc_dimm | DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes |
|
||||
| [ ecc_memory_dimm_uncorrectable ](https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf) | mem.edac_mc_dimm | DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes |
|
||||
| [ ecc_memory_mc_noinfo_correctable ](https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf) | mem.edac_mc | memory controller ${label:controller} ECC correctable errors (unknown DIMM slot) in the last 10 minutes |
|
||||
| [ ecc_memory_mc_noinfo_uncorrectable ](https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf) | mem.edac_mc | memory controller ${label:controller} ECC uncorrectable errors (unknown DIMM slot) in the last 10 minutes |
|
||||
| [ ecc_memory_dimm_correctable ](https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf) | mem.edac_mc_dimm | DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes |
|
||||
| [ ecc_memory_dimm_uncorrectable ](https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf) | mem.edac_mc_dimm | DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -111,8 +111,8 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ 30min_ram_swapped_out ](https://github.com/netdata/netdata/blob/master/health/health.d/swap.conf) | mem.swapio | percentage of the system RAM swapped in the last 30 minutes |
|
||||
| [ oom_kill ](https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf) | mem.oom_kill | number of out of memory kills in the last 30 minutes |
|
||||
| [ 30min_ram_swapped_out ](https://github.com/netdata/netdata/blob/master/src/health/health.d/swap.conf) | mem.swapio | percentage of the system RAM swapped in the last 30 minutes |
|
||||
| [ oom_kill ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf) | mem.oom_kill | number of out of memory kills in the last 30 minutes |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -106,10 +106,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ ram_in_use ](https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf) | system.ram | system memory utilization |
|
||||
| [ ram_available ](https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf) | mem.available | percentage of estimated amount of RAM available for userspace processes, without causing swapping |
|
||||
| [ used_swap ](https://github.com/netdata/netdata/blob/master/health/health.d/swap.conf) | mem.swap | swap memory utilization |
|
||||
| [ 1hour_memory_hw_corrupted ](https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf) | mem.hwcorrupt | amount of memory corrupted due to a hardware failure |
|
||||
| [ ram_in_use ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf) | system.ram | system memory utilization |
|
||||
| [ ram_available ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf) | mem.available | percentage of estimated amount of RAM available for userspace processes, without causing swapping |
|
||||
| [ used_swap ](https://github.com/netdata/netdata/blob/master/src/health/health.d/swap.conf) | mem.swap | swap memory utilization |
|
||||
| [ 1hour_memory_hw_corrupted ](https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf) | mem.hwcorrupt | amount of memory corrupted due to a hardware failure |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -102,16 +102,16 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ interface_speed ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.net | network interface ${label:device} current speed |
|
||||
| [ 1m_received_traffic_overflow ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.net | average inbound utilization for the network interface ${label:device} over the last minute |
|
||||
| [ 1m_sent_traffic_overflow ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.net | average outbound utilization for the network interface ${label:device} over the last minute |
|
||||
| [ inbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.drops | ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ outbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.drops | ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ wifi_inbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.drops | ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ wifi_outbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.drops | ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ 1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ 10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
| [ 10min_fifo_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/net.conf) | net.fifo | number of FIFO errors for the network interface ${label:device} in the last 10 minutes |
|
||||
| [ interface_speed ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.net | network interface ${label:device} current speed |
|
||||
| [ 1m_received_traffic_overflow ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.net | average inbound utilization for the network interface ${label:device} over the last minute |
|
||||
| [ 1m_sent_traffic_overflow ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.net | average outbound utilization for the network interface ${label:device} over the last minute |
|
||||
| [ inbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.drops | ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ outbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.drops | ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ wifi_inbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.drops | ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ wifi_outbound_packets_dropped_ratio ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.drops | ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes |
|
||||
| [ 1m_received_packets_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.packets | average number of packets received by the network interface ${label:device} over the last minute |
|
||||
| [ 10s_received_packets_storm ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.packets | ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute |
|
||||
| [ 10min_fifo_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf) | net.fifo | number of FIFO errors for the network interface ${label:device} in the last 10 minutes |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -125,17 +125,17 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ 1m_tcp_syn_queue_drops ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf) | ip.tcp_syn_queue | average number of SYN requests was dropped due to the full TCP SYN queue over the last minute (SYN cookies were not enabled) |
|
||||
| [ 1m_tcp_syn_queue_cookies ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf) | ip.tcp_syn_queue | average number of sent SYN cookies due to the full TCP SYN queue over the last minute |
|
||||
| [ 1m_tcp_accept_queue_overflows ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf) | ip.tcp_accept_queue | average number of overflows in the TCP accept queue over the last minute |
|
||||
| [ 1m_tcp_accept_queue_drops ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf) | ip.tcp_accept_queue | average number of dropped packets in the TCP accept queue over the last minute |
|
||||
| [ tcp_connections ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_conn.conf) | ip.tcpsock | TCP connections utilization |
|
||||
| [ 1m_ip_tcp_resets_sent ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf) | ip.tcphandshake | average number of sent TCP RESETS over the last minute |
|
||||
| [ 10s_ip_tcp_resets_sent ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf) | ip.tcphandshake | average number of sent TCP RESETS over the last 10 seconds. This can indicate a port scan, or that a service running on this host has crashed. Netdata will not send a clear notification for this alarm. |
|
||||
| [ 1m_ip_tcp_resets_received ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf) | ip.tcphandshake | average number of received TCP RESETS over the last minute |
|
||||
| [ 10s_ip_tcp_resets_received ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf) | ip.tcphandshake | average number of received TCP RESETS over the last 10 seconds. This can be an indication that a service this host needs has crashed. Netdata will not send a clear notification for this alarm. |
|
||||
| [ 1m_ipv4_udp_receive_buffer_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf) | ipv4.udperrors | average number of UDP receive buffer errors over the last minute |
|
||||
| [ 1m_ipv4_udp_send_buffer_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf) | ipv4.udperrors | average number of UDP send buffer errors over the last minute |
|
||||
| [ 1m_tcp_syn_queue_drops ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_listen.conf) | ip.tcp_syn_queue | average number of SYN requests was dropped due to the full TCP SYN queue over the last minute (SYN cookies were not enabled) |
|
||||
| [ 1m_tcp_syn_queue_cookies ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_listen.conf) | ip.tcp_syn_queue | average number of sent SYN cookies due to the full TCP SYN queue over the last minute |
|
||||
| [ 1m_tcp_accept_queue_overflows ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_listen.conf) | ip.tcp_accept_queue | average number of overflows in the TCP accept queue over the last minute |
|
||||
| [ 1m_tcp_accept_queue_drops ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_listen.conf) | ip.tcp_accept_queue | average number of dropped packets in the TCP accept queue over the last minute |
|
||||
| [ tcp_connections ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_conn.conf) | ip.tcpsock | TCP connections utilization |
|
||||
| [ 1m_ip_tcp_resets_sent ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf) | ip.tcphandshake | average number of sent TCP RESETS over the last minute |
|
||||
| [ 10s_ip_tcp_resets_sent ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf) | ip.tcphandshake | average number of sent TCP RESETS over the last 10 seconds. This can indicate a port scan, or that a service running on this host has crashed. Netdata will not send a clear notification for this alarm. |
|
||||
| [ 1m_ip_tcp_resets_received ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf) | ip.tcphandshake | average number of received TCP RESETS over the last minute |
|
||||
| [ 10s_ip_tcp_resets_received ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf) | ip.tcphandshake | average number of received TCP RESETS over the last 10 seconds. This can be an indication that a service this host needs has crashed. Netdata will not send a clear notification for this alarm. |
|
||||
| [ 1m_ipv4_udp_receive_buffer_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/udp_errors.conf) | ipv4.udperrors | average number of UDP receive buffer errors over the last minute |
|
||||
| [ 1m_ipv4_udp_send_buffer_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/udp_errors.conf) | ipv4.udperrors | average number of UDP send buffer errors over the last minute |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -81,7 +81,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ linux_power_supply_capacity ](https://github.com/netdata/netdata/blob/master/health/health.d/linux_power_supply.conf) | powersupply.capacity | percentage of remaining power supply capacity |
|
||||
| [ linux_power_supply_capacity ](https://github.com/netdata/netdata/blob/master/src/health/health.d/linux_power_supply.conf) | powersupply.capacity | percentage of remaining power supply capacity |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -82,8 +82,8 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ tcp_orphans ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_orphans.conf) | ipv4.sockstat_tcp_sockets | orphan IPv4 TCP sockets utilization |
|
||||
| [ tcp_memory ](https://github.com/netdata/netdata/blob/master/health/health.d/tcp_mem.conf) | ipv4.sockstat_tcp_mem | TCP memory utilization |
|
||||
| [ tcp_orphans ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_orphans.conf) | ipv4.sockstat_tcp_sockets | orphan IPv4 TCP sockets utilization |
|
||||
| [ tcp_memory ](https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_mem.conf) | ipv4.sockstat_tcp_mem | TCP memory utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -108,8 +108,8 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ 1min_netdev_backlog_exceeded ](https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf) | system.softnet_stat | average number of dropped packets in the last minute due to exceeded net.core.netdev_max_backlog |
|
||||
| [ 1min_netdev_budget_ran_outs ](https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf) | system.softnet_stat | average number of times ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs with work remaining over the last minute (this can be a cause for dropped packets) |
|
||||
| [ 1min_netdev_backlog_exceeded ](https://github.com/netdata/netdata/blob/master/src/health/health.d/softnet.conf) | system.softnet_stat | average number of dropped packets in the last minute due to exceeded net.core.netdev_max_backlog |
|
||||
| [ 1min_netdev_budget_ran_outs ](https://github.com/netdata/netdata/blob/master/src/health/health.d/softnet.conf) | system.softnet_stat | average number of times ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs with work remaining over the last minute (this can be a cause for dropped packets) |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -98,11 +98,11 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ load_cpu_number ](https://github.com/netdata/netdata/blob/master/health/health.d/load.conf) | system.load | number of active CPU cores in the system |
|
||||
| [ load_average_15 ](https://github.com/netdata/netdata/blob/master/health/health.d/load.conf) | system.load | system fifteen-minute load average |
|
||||
| [ load_average_5 ](https://github.com/netdata/netdata/blob/master/health/health.d/load.conf) | system.load | system five-minute load average |
|
||||
| [ load_average_1 ](https://github.com/netdata/netdata/blob/master/health/health.d/load.conf) | system.load | system one-minute load average |
|
||||
| [ active_processes ](https://github.com/netdata/netdata/blob/master/health/health.d/processes.conf) | system.active_processes | system process IDs (PID) space utilization |
|
||||
| [ load_cpu_number ](https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf) | system.load | number of active CPU cores in the system |
|
||||
| [ load_average_15 ](https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf) | system.load | system fifteen-minute load average |
|
||||
| [ load_average_5 ](https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf) | system.load | system five-minute load average |
|
||||
| [ load_average_1 ](https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf) | system.load | system one-minute load average |
|
||||
| [ active_processes ](https://github.com/netdata/netdata/blob/master/src/health/health.d/processes.conf) | system.active_processes | system process IDs (PID) space utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -122,9 +122,9 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ 10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf) | system.cpu | average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) |
|
||||
| [ 10min_cpu_iowait ](https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf) | system.cpu | average CPU iowait time over the last 10 minutes |
|
||||
| [ 20min_steal_cpu ](https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf) | system.cpu | average CPU steal time over the last 20 minutes |
|
||||
| [ 10min_cpu_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf) | system.cpu | average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) |
|
||||
| [ 10min_cpu_iowait ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf) | system.cpu | average CPU iowait time over the last 10 minutes |
|
||||
| [ 20min_steal_cpu ](https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf) | system.cpu | average CPU steal time over the last 20 minutes |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -99,7 +99,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ zfs_memory_throttle ](https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf) | zfs.memory_ops | number of times ZFS had to limit the ARC growth in the last 10 minutes |
|
||||
| [ zfs_memory_throttle ](https://github.com/netdata/netdata/blob/master/src/health/health.d/zfs.conf) | zfs.memory_ops | number of times ZFS had to limit the ARC growth in the last 10 minutes |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -78,8 +78,8 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ zfs_pool_state_warn ](https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf) | zfspool.state | ZFS pool ${label:pool} state is degraded |
|
||||
| [ zfs_pool_state_crit ](https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf) | zfspool.state | ZFS pool ${label:pool} state is faulted or unavail |
|
||||
| [ zfs_pool_state_warn ](https://github.com/netdata/netdata/blob/master/src/health/health.d/zfs.conf) | zfspool.state | ZFS pool ${label:pool} state is degraded |
|
||||
| [ zfs_pool_state_crit ](https://github.com/netdata/netdata/blob/master/src/health/health.d/zfs.conf) | zfspool.state | ZFS pool ${label:pool} state is faulted or unavail |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -81,17 +81,17 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: 10min_cpu_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf
|
||||
metric: system.cpu
|
||||
info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
|
||||
os: "linux"
|
||||
- name: 10min_cpu_iowait
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf
|
||||
metric: system.cpu
|
||||
info: average CPU iowait time over the last 10 minutes
|
||||
os: "linux"
|
||||
- name: 20min_steal_cpu
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/cpu.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/cpu.conf
|
||||
metric: system.cpu
|
||||
info: average CPU steal time over the last 20 minutes
|
||||
os: "linux"
|
||||
|
@ -279,7 +279,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: lowest_entropy
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/entropy.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/entropy.conf
|
||||
metric: system.entropy
|
||||
info: minimum number of bits of entropy available for the kernel’s random number generator
|
||||
metrics:
|
||||
|
@ -464,12 +464,12 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: 30min_ram_swapped_out
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/swap.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/swap.conf
|
||||
metric: mem.swapio
|
||||
info: percentage of the system RAM swapped in the last 30 minutes
|
||||
os: "linux freebsd"
|
||||
- name: oom_kill
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf
|
||||
metric: mem.oom_kill
|
||||
info: number of out of memory kills in the last 30 minutes
|
||||
os: "linux"
|
||||
|
@ -798,27 +798,27 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: load_cpu_number
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf
|
||||
metric: system.load
|
||||
info: number of active CPU cores in the system
|
||||
os: "linux"
|
||||
- name: load_average_15
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf
|
||||
metric: system.load
|
||||
info: system fifteen-minute load average
|
||||
os: "linux"
|
||||
- name: load_average_5
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf
|
||||
metric: system.load
|
||||
info: system five-minute load average
|
||||
os: "linux"
|
||||
- name: load_average_1
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/load.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/load.conf
|
||||
metric: system.load
|
||||
info: system one-minute load average
|
||||
os: "linux"
|
||||
- name: active_processes
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/processes.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/processes.conf
|
||||
metric: system.active_processes
|
||||
info: system process IDs (PID) space utilization
|
||||
metrics:
|
||||
|
@ -1207,12 +1207,12 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: 1min_netdev_backlog_exceeded
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/softnet.conf
|
||||
metric: system.softnet_stat
|
||||
info: average number of dropped packets in the last minute due to exceeded net.core.netdev_max_backlog
|
||||
os: "linux"
|
||||
- name: 1min_netdev_budget_ran_outs
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/softnet.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/softnet.conf
|
||||
metric: system.softnet_stat
|
||||
info:
|
||||
average number of times ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs with work remaining over the last
|
||||
|
@ -1329,22 +1329,22 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: ram_in_use
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf
|
||||
metric: system.ram
|
||||
info: system memory utilization
|
||||
os: "linux"
|
||||
- name: ram_available
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ram.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ram.conf
|
||||
metric: mem.available
|
||||
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
|
||||
os: "linux"
|
||||
- name: used_swap
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/swap.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/swap.conf
|
||||
metric: mem.swap
|
||||
info: swap memory utilization
|
||||
os: "linux freebsd"
|
||||
- name: 1hour_memory_hw_corrupted
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf
|
||||
metric: mem.hwcorrupt
|
||||
info: amount of memory corrupted due to a hardware failure
|
||||
os: "linux"
|
||||
|
@ -1660,19 +1660,19 @@ modules:
|
|||
- name: ecc_memory_mc_noinfo_correctable
|
||||
metric: mem.edac_mc
|
||||
info: memory controller ${label:controller} ECC correctable errors (unknown DIMM slot) in the last 10 minutes
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf
|
||||
- name: ecc_memory_mc_noinfo_uncorrectable
|
||||
metric: mem.edac_mc
|
||||
info: memory controller ${label:controller} ECC uncorrectable errors (unknown DIMM slot) in the last 10 minutes
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf
|
||||
- name: ecc_memory_dimm_correctable
|
||||
metric: mem.edac_mc_dimm
|
||||
info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf
|
||||
- name: ecc_memory_dimm_uncorrectable
|
||||
metric: mem.edac_mc_dimm
|
||||
info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/memory.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memory.conf
|
||||
metrics:
|
||||
folding:
|
||||
title: Metrics
|
||||
|
@ -2089,12 +2089,12 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: semaphores_used
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ipc.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ipc.conf
|
||||
metric: system.ipc_semaphores
|
||||
info: IPC semaphore utilization
|
||||
os: "linux"
|
||||
- name: semaphore_arrays_used
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ipc.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ipc.conf
|
||||
metric: system.ipc_semaphore_arrays
|
||||
info: IPC semaphore arrays utilization
|
||||
os: "linux"
|
||||
|
@ -2210,21 +2210,21 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: 10min_disk_backlog
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf
|
||||
metric: disk.backlog
|
||||
info: average backlog size of the ${label:device} disk over the last 10 minutes
|
||||
os: "linux"
|
||||
- name: 10min_disk_utilization
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/disks.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/disks.conf
|
||||
metric: disk.util
|
||||
info: average percentage of time ${label:device} disk was busy over the last 10 minutes
|
||||
os: "linux freebsd"
|
||||
- name: bcache_cache_dirty
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/bcache.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/bcache.conf
|
||||
metric: disk.bcache_cache_alloc
|
||||
info: percentage of cache space used for dirty data and metadata (this usually means your SSD cache is too small)
|
||||
- name: bcache_cache_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/bcache.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/bcache.conf
|
||||
metric: disk.bcache_cache_read_races
|
||||
info:
|
||||
number of times data was read from the cache, the bucket was reused and invalidated in the last 10 minutes (when this occurs the data is
|
||||
|
@ -2489,20 +2489,20 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: mdstat_last_collected
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/mdstat.conf
|
||||
metric: md.disks
|
||||
info: number of seconds since the last successful data collection
|
||||
- name: mdstat_disks
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/mdstat.conf
|
||||
metric: md.disks
|
||||
info:
|
||||
number of devices in the down state for the ${label:device} ${label:raid_level} array. Any number > 0 indicates that the array is degraded.
|
||||
- name: mdstat_mismatch_cnt
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/mdstat.conf
|
||||
metric: md.mismatch_cnt
|
||||
info: number of unsynchronized blocks for the ${label:device} ${label:raid_level} array
|
||||
- name: mdstat_nonredundant_last_collected
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/mdstat.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/mdstat.conf
|
||||
metric: md.nonredundant
|
||||
info: number of seconds since the last successful data collection
|
||||
metrics:
|
||||
|
@ -2627,52 +2627,52 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: interface_speed
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.net
|
||||
info: network interface ${label:device} current speed
|
||||
os: "*"
|
||||
- name: 1m_received_traffic_overflow
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.net
|
||||
info: average inbound utilization for the network interface ${label:device} over the last minute
|
||||
os: "linux"
|
||||
- name: 1m_sent_traffic_overflow
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.net
|
||||
info: average outbound utilization for the network interface ${label:device} over the last minute
|
||||
os: "linux"
|
||||
- name: inbound_packets_dropped_ratio
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.drops
|
||||
info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
|
||||
os: "linux"
|
||||
- name: outbound_packets_dropped_ratio
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.drops
|
||||
info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
|
||||
os: "linux"
|
||||
- name: wifi_inbound_packets_dropped_ratio
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.drops
|
||||
info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
|
||||
os: "linux"
|
||||
- name: wifi_outbound_packets_dropped_ratio
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.drops
|
||||
info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
|
||||
os: "linux"
|
||||
- name: 1m_received_packets_rate
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.packets
|
||||
info: average number of packets received by the network interface ${label:device} over the last minute
|
||||
os: "linux freebsd"
|
||||
- name: 10s_received_packets_storm
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.packets
|
||||
info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, compared to the rate over the last minute
|
||||
os: "linux freebsd"
|
||||
- name: 10min_fifo_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/net.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/net.conf
|
||||
metric: net.fifo
|
||||
info: number of FIFO errors for the network interface ${label:device} in the last 10 minutes
|
||||
os: "linux"
|
||||
|
@ -3107,61 +3107,61 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: 1m_tcp_syn_queue_drops
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_listen.conf
|
||||
metric: ip.tcp_syn_queue
|
||||
info: average number of SYN requests was dropped due to the full TCP SYN queue over the last minute (SYN cookies were not enabled)
|
||||
os: "linux"
|
||||
- name: 1m_tcp_syn_queue_cookies
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_listen.conf
|
||||
metric: ip.tcp_syn_queue
|
||||
info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
|
||||
os: "linux"
|
||||
- name: 1m_tcp_accept_queue_overflows
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_listen.conf
|
||||
metric: ip.tcp_accept_queue
|
||||
info: average number of overflows in the TCP accept queue over the last minute
|
||||
os: "linux"
|
||||
- name: 1m_tcp_accept_queue_drops
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_listen.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_listen.conf
|
||||
metric: ip.tcp_accept_queue
|
||||
info: average number of dropped packets in the TCP accept queue over the last minute
|
||||
os: "linux"
|
||||
- name: tcp_connections
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_conn.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_conn.conf
|
||||
metric: ip.tcpsock
|
||||
info: TCP connections utilization
|
||||
os: "linux"
|
||||
- name: 1m_ip_tcp_resets_sent
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf
|
||||
metric: ip.tcphandshake
|
||||
info: average number of sent TCP RESETS over the last minute
|
||||
os: "linux"
|
||||
- name: 10s_ip_tcp_resets_sent
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf
|
||||
metric: ip.tcphandshake
|
||||
info:
|
||||
average number of sent TCP RESETS over the last 10 seconds. This can indicate a port scan, or that a service running on this host has
|
||||
crashed. Netdata will not send a clear notification for this alarm.
|
||||
os: "linux"
|
||||
- name: 1m_ip_tcp_resets_received
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf
|
||||
metric: ip.tcphandshake
|
||||
info: average number of received TCP RESETS over the last minute
|
||||
os: "linux freebsd"
|
||||
- name: 10s_ip_tcp_resets_received
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_resets.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_resets.conf
|
||||
metric: ip.tcphandshake
|
||||
info:
|
||||
average number of received TCP RESETS over the last 10 seconds. This can be an indication that a service this host needs has crashed.
|
||||
Netdata will not send a clear notification for this alarm.
|
||||
os: "linux freebsd"
|
||||
- name: 1m_ipv4_udp_receive_buffer_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/udp_errors.conf
|
||||
metric: ipv4.udperrors
|
||||
info: average number of UDP receive buffer errors over the last minute
|
||||
os: "linux freebsd"
|
||||
- name: 1m_ipv4_udp_send_buffer_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/udp_errors.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/udp_errors.conf
|
||||
metric: ipv4.udperrors
|
||||
info: average number of UDP send buffer errors over the last minute
|
||||
os: "linux"
|
||||
|
@ -3696,12 +3696,12 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: tcp_orphans
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_orphans.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_orphans.conf
|
||||
metric: ipv4.sockstat_tcp_sockets
|
||||
info: orphan IPv4 TCP sockets utilization
|
||||
os: "linux"
|
||||
- name: tcp_memory
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/tcp_mem.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/tcp_mem.conf
|
||||
metric: ipv4.sockstat_tcp_mem
|
||||
info: TCP memory utilization
|
||||
os: "linux"
|
||||
|
@ -4355,7 +4355,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: netfilter_conntrack_full
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/netfilter.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/netfilter.conf
|
||||
metric: netfilter.conntrack_sockets
|
||||
info: netfilter connection tracker table size utilization
|
||||
os: "linux"
|
||||
|
@ -4564,11 +4564,11 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: zfs_pool_state_warn
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/zfs.conf
|
||||
metric: zfspool.state
|
||||
info: ZFS pool ${label:pool} state is degraded
|
||||
- name: zfs_pool_state_crit
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/zfs.conf
|
||||
metric: zfspool.state
|
||||
info: ZFS pool ${label:pool} state is faulted or unavail
|
||||
metrics:
|
||||
|
@ -4656,7 +4656,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: zfs_memory_throttle
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/zfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/zfs.conf
|
||||
metric: zfs.memory_ops
|
||||
info: number of times ZFS had to limit the ARC growth in the last 10 minutes
|
||||
metrics:
|
||||
|
@ -4920,47 +4920,47 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: btrfs_allocated
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf
|
||||
metric: btrfs.disk
|
||||
info: percentage of allocated BTRFS physical disk space
|
||||
os: "*"
|
||||
- name: btrfs_data
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf
|
||||
metric: btrfs.data
|
||||
info: utilization of BTRFS data space
|
||||
os: "*"
|
||||
- name: btrfs_metadata
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf
|
||||
metric: btrfs.metadata
|
||||
info: utilization of BTRFS metadata space
|
||||
os: "*"
|
||||
- name: btrfs_system
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf
|
||||
metric: btrfs.system
|
||||
info: utilization of BTRFS system space
|
||||
os: "*"
|
||||
- name: btrfs_device_read_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf
|
||||
metric: btrfs.device_errors
|
||||
info: number of encountered BTRFS read errors
|
||||
os: "*"
|
||||
- name: btrfs_device_write_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf
|
||||
metric: btrfs.device_errors
|
||||
info: number of encountered BTRFS write errors
|
||||
os: "*"
|
||||
- name: btrfs_device_flush_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf
|
||||
metric: btrfs.device_errors
|
||||
info: number of encountered BTRFS flush errors
|
||||
os: "*"
|
||||
- name: btrfs_device_corruption_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf
|
||||
metric: btrfs.device_errors
|
||||
info: number of encountered BTRFS corruption errors
|
||||
os: "*"
|
||||
- name: btrfs_device_generation_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/btrfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/btrfs.conf
|
||||
metric: btrfs.device_errors
|
||||
info: number of encountered BTRFS generation errors
|
||||
os: "*"
|
||||
|
@ -5110,7 +5110,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: linux_power_supply_capacity
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/linux_power_supply.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/linux_power_supply.conf
|
||||
metric: powersupply.capacity
|
||||
info: percentage of remaining power supply capacity
|
||||
metrics:
|
||||
|
|
|
@ -84,8 +84,8 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ adaptec_raid_ld_status ](https://github.com/netdata/netdata/blob/master/health/health.d/adaptec_raid.conf) | adaptec_raid.ld_status | logical device status is failed or degraded |
|
||||
| [ adaptec_raid_pd_state ](https://github.com/netdata/netdata/blob/master/health/health.d/adaptec_raid.conf) | adaptec_raid.pd_state | physical device state is not online |
|
||||
| [ adaptec_raid_ld_status ](https://github.com/netdata/netdata/blob/master/src/health/health.d/adaptec_raid.conf) | adaptec_raid.ld_status | logical device status is failed or degraded |
|
||||
| [ adaptec_raid_pd_state ](https://github.com/netdata/netdata/blob/master/src/health/health.d/adaptec_raid.conf) | adaptec_raid.pd_state | physical device state is not online |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -123,11 +123,11 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: adaptec_raid_ld_status
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/adaptec_raid.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/adaptec_raid.conf
|
||||
metric: adaptec_raid.ld_status
|
||||
info: logical device status is failed or degraded
|
||||
- name: adaptec_raid_pd_state
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/adaptec_raid.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/adaptec_raid.conf
|
||||
metric: adaptec_raid.pd_state
|
||||
info: physical device state is not online
|
||||
metrics:
|
||||
|
|
|
@ -55,11 +55,11 @@
|
|||
# list: []
|
||||
# alerts:
|
||||
# - name: anomalies_anomaly_probabilities
|
||||
# link: https://github.com/netdata/netdata/blob/master/health/health.d/anomalies.conf
|
||||
# link: https://github.com/netdata/netdata/blob/master/src/health/health.d/anomalies.conf
|
||||
# metric: anomalies.probability
|
||||
# info: average anomaly probability over the last 2 minutes
|
||||
# - name: anomalies_anomaly_flags
|
||||
# link: https://github.com/netdata/netdata/blob/master/health/health.d/anomalies.conf
|
||||
# link: https://github.com/netdata/netdata/blob/master/src/health/health.d/anomalies.conf
|
||||
# metric: anomalies.anomaly
|
||||
# info: number of anomalies in the last 2 minutes
|
||||
# metrics:
|
||||
|
|
|
@ -98,7 +98,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ beanstalk_server_buried_jobs ](https://github.com/netdata/netdata/blob/master/health/health.d/beanstalkd.conf) | beanstalk.current_jobs | number of buried jobs across all tubes. You need to manually kick them so they can be processed. Presence of buried jobs in a tube does not affect new jobs. |
|
||||
| [ beanstalk_server_buried_jobs ](https://github.com/netdata/netdata/blob/master/src/health/health.d/beanstalkd.conf) | beanstalk.current_jobs | number of buried jobs across all tubes. You need to manually kick them so they can be processed. Presence of buried jobs in a tube does not affect new jobs. |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -123,7 +123,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: beanstalk_server_buried_jobs
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/beanstalkd.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/beanstalkd.conf
|
||||
metric: beanstalk.current_jobs
|
||||
info: number of buried jobs across all tubes. You need to manually kick them so they can be processed. Presence of buried jobs in a tube does not affect new jobs.
|
||||
metrics:
|
||||
|
|
|
@ -77,7 +77,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ bind_rndc_stats_file_size ](https://github.com/netdata/netdata/blob/master/health/health.d/bind_rndc.conf) | bind_rndc.stats_size | BIND statistics-file size |
|
||||
| [ bind_rndc_stats_file_size ](https://github.com/netdata/netdata/blob/master/src/health/health.d/bind_rndc.conf) | bind_rndc.stats_size | BIND statistics-file size |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -140,7 +140,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: bind_rndc_stats_file_size
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/bind_rndc.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/bind_rndc.conf
|
||||
metric: bind_rndc.stats_size
|
||||
info: BIND statistics-file size
|
||||
metrics:
|
||||
|
|
|
@ -77,10 +77,10 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ boinc_total_tasks ](https://github.com/netdata/netdata/blob/master/health/health.d/boinc.conf) | boinc.tasks | average number of total tasks over the last 10 minutes |
|
||||
| [ boinc_active_tasks ](https://github.com/netdata/netdata/blob/master/health/health.d/boinc.conf) | boinc.tasks | average number of active tasks over the last 10 minutes |
|
||||
| [ boinc_compute_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/boinc.conf) | boinc.states | average number of compute errors over the last 10 minutes |
|
||||
| [ boinc_upload_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/boinc.conf) | boinc.states | average number of failed uploads over the last 10 minutes |
|
||||
| [ boinc_total_tasks ](https://github.com/netdata/netdata/blob/master/src/health/health.d/boinc.conf) | boinc.tasks | average number of total tasks over the last 10 minutes |
|
||||
| [ boinc_active_tasks ](https://github.com/netdata/netdata/blob/master/src/health/health.d/boinc.conf) | boinc.tasks | average number of active tasks over the last 10 minutes |
|
||||
| [ boinc_compute_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/boinc.conf) | boinc.states | average number of compute errors over the last 10 minutes |
|
||||
| [ boinc_upload_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/boinc.conf) | boinc.states | average number of failed uploads over the last 10 minutes |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -127,22 +127,22 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: boinc_total_tasks
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/boinc.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/boinc.conf
|
||||
metric: boinc.tasks
|
||||
info: average number of total tasks over the last 10 minutes
|
||||
os: "*"
|
||||
- name: boinc_active_tasks
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/boinc.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/boinc.conf
|
||||
metric: boinc.tasks
|
||||
info: average number of active tasks over the last 10 minutes
|
||||
os: "*"
|
||||
- name: boinc_compute_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/boinc.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/boinc.conf
|
||||
metric: boinc.states
|
||||
info: average number of compute errors over the last 10 minutes
|
||||
os: "*"
|
||||
- name: boinc_upload_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/boinc.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/boinc.conf
|
||||
metric: boinc.states
|
||||
info: average number of failed uploads over the last 10 minutes
|
||||
os: "*"
|
||||
|
|
|
@ -88,7 +88,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ ceph_cluster_space_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/ceph.conf) | ceph.general_usage | cluster disk space utilization |
|
||||
| [ ceph_cluster_space_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ceph.conf) | ceph.general_usage | cluster disk space utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -113,7 +113,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: ceph_cluster_space_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ceph.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ceph.conf
|
||||
metric: ceph.general_usage
|
||||
info: cluster disk space utilization
|
||||
metrics:
|
||||
|
|
|
@ -86,7 +86,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ gearman_workers_queued ](https://github.com/netdata/netdata/blob/master/health/health.d/gearman.conf) | gearman.single_job | average number of queued jobs over the last 10 minutes |
|
||||
| [ gearman_workers_queued ](https://github.com/netdata/netdata/blob/master/src/health/health.d/gearman.conf) | gearman.single_job | average number of queued jobs over the last 10 minutes |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -133,7 +133,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: gearman_workers_queued
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/gearman.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/gearman.conf
|
||||
metric: gearman.single_job
|
||||
info: average number of queued jobs over the last 10 minutes
|
||||
metrics:
|
||||
|
|
|
@ -122,11 +122,11 @@
|
|||
# list: []
|
||||
# alerts:
|
||||
# - name: haproxy_backend_server_status
|
||||
# link: https://github.com/netdata/netdata/blob/master/health/health.d/haproxy.conf
|
||||
# link: https://github.com/netdata/netdata/blob/master/src/health/health.d/haproxy.conf
|
||||
# metric: haproxy_hs.down
|
||||
# info: average number of failed haproxy backend servers over the last 10 seconds
|
||||
# - name: haproxy_backend_status
|
||||
# link: https://github.com/netdata/netdata/blob/master/health/health.d/haproxy.conf
|
||||
# link: https://github.com/netdata/netdata/blob/master/src/health/health.d/haproxy.conf
|
||||
# metric: haproxy_hb.down
|
||||
# info: average number of failed haproxy backends over the last 10 seconds
|
||||
# metrics:
|
||||
|
|
|
@ -81,7 +81,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ ipfs_datastore_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/ipfs.conf) | ipfs.repo_size | IPFS datastore utilization |
|
||||
| [ ipfs_datastore_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/ipfs.conf) | ipfs.repo_size | IPFS datastore utilization |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -128,7 +128,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: ipfs_datastore_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/ipfs.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/ipfs.conf
|
||||
metric: ipfs.repo_size
|
||||
info: IPFS datastore utilization
|
||||
metrics:
|
||||
|
|
|
@ -96,11 +96,11 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ megacli_adapter_state ](https://github.com/netdata/netdata/blob/master/health/health.d/megacli.conf) | megacli.adapter_degraded | adapter is in the degraded state (0: false, 1: true) |
|
||||
| [ megacli_pd_media_errors ](https://github.com/netdata/netdata/blob/master/health/health.d/megacli.conf) | megacli.pd_media_error | number of physical drive media errors |
|
||||
| [ megacli_pd_predictive_failures ](https://github.com/netdata/netdata/blob/master/health/health.d/megacli.conf) | megacli.pd_predictive_failure | number of physical drive predictive failures |
|
||||
| [ megacli_bbu_relative_charge ](https://github.com/netdata/netdata/blob/master/health/health.d/megacli.conf) | megacli.bbu_relative_charge | average battery backup unit (BBU) relative state of charge over the last 10 seconds |
|
||||
| [ megacli_bbu_cycle_count ](https://github.com/netdata/netdata/blob/master/health/health.d/megacli.conf) | megacli.bbu_cycle_count | average battery backup unit (BBU) charge cycles count over the last 10 seconds |
|
||||
| [ megacli_adapter_state ](https://github.com/netdata/netdata/blob/master/src/health/health.d/megacli.conf) | megacli.adapter_degraded | adapter is in the degraded state (0: false, 1: true) |
|
||||
| [ megacli_pd_media_errors ](https://github.com/netdata/netdata/blob/master/src/health/health.d/megacli.conf) | megacli.pd_media_error | number of physical drive media errors |
|
||||
| [ megacli_pd_predictive_failures ](https://github.com/netdata/netdata/blob/master/src/health/health.d/megacli.conf) | megacli.pd_predictive_failure | number of physical drive predictive failures |
|
||||
| [ megacli_bbu_relative_charge ](https://github.com/netdata/netdata/blob/master/src/health/health.d/megacli.conf) | megacli.bbu_relative_charge | average battery backup unit (BBU) relative state of charge over the last 10 seconds |
|
||||
| [ megacli_bbu_cycle_count ](https://github.com/netdata/netdata/blob/master/src/health/health.d/megacli.conf) | megacli.bbu_cycle_count | average battery backup unit (BBU) charge cycles count over the last 10 seconds |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -127,23 +127,23 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: megacli_adapter_state
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/megacli.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/megacli.conf
|
||||
metric: megacli.adapter_degraded
|
||||
info: "adapter is in the degraded state (0: false, 1: true)"
|
||||
- name: megacli_pd_media_errors
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/megacli.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/megacli.conf
|
||||
metric: megacli.pd_media_error
|
||||
info: number of physical drive media errors
|
||||
- name: megacli_pd_predictive_failures
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/megacli.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/megacli.conf
|
||||
metric: megacli.pd_predictive_failure
|
||||
info: number of physical drive predictive failures
|
||||
- name: megacli_bbu_relative_charge
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/megacli.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/megacli.conf
|
||||
metric: megacli.bbu_relative_charge
|
||||
info: average battery backup unit (BBU) relative state of charge over the last 10 seconds
|
||||
- name: megacli_bbu_cycle_count
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/megacli.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/megacli.conf
|
||||
metric: megacli.bbu_cycle_count
|
||||
info: average battery backup unit (BBU) charge cycles count over the last 10 seconds
|
||||
metrics:
|
||||
|
|
|
@ -88,9 +88,9 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ memcached_cache_memory_usage ](https://github.com/netdata/netdata/blob/master/health/health.d/memcached.conf) | memcached.cache | cache memory utilization |
|
||||
| [ memcached_cache_fill_rate ](https://github.com/netdata/netdata/blob/master/health/health.d/memcached.conf) | memcached.cache | average rate the cache fills up (positive), or frees up (negative) space over the last hour |
|
||||
| [ memcached_out_of_cache_space_time ](https://github.com/netdata/netdata/blob/master/health/health.d/memcached.conf) | memcached.cache | estimated time the cache will run out of space if the system continues to add data at the same rate as the past hour |
|
||||
| [ memcached_cache_memory_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/memcached.conf) | memcached.cache | cache memory utilization |
|
||||
| [ memcached_cache_fill_rate ](https://github.com/netdata/netdata/blob/master/src/health/health.d/memcached.conf) | memcached.cache | average rate the cache fills up (positive), or frees up (negative) space over the last hour |
|
||||
| [ memcached_out_of_cache_space_time ](https://github.com/netdata/netdata/blob/master/src/health/health.d/memcached.conf) | memcached.cache | estimated time the cache will run out of space if the system continues to add data at the same rate as the past hour |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -126,15 +126,15 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: memcached_cache_memory_usage
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/memcached.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memcached.conf
|
||||
metric: memcached.cache
|
||||
info: cache memory utilization
|
||||
- name: memcached_cache_fill_rate
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/memcached.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memcached.conf
|
||||
metric: memcached.cache
|
||||
info: average rate the cache fills up (positive), or frees up (negative) space over the last hour
|
||||
- name: memcached_out_of_cache_space_time
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/memcached.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/memcached.conf
|
||||
metric: memcached.cache
|
||||
info: estimated time the cache will run out of space if the system continues to add data at the same rate as the past hour
|
||||
metrics:
|
||||
|
|
|
@ -76,7 +76,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ retroshare_dht_working ](https://github.com/netdata/netdata/blob/master/health/health.d/retroshare.conf) | retroshare.dht | number of DHT peers |
|
||||
| [ retroshare_dht_working ](https://github.com/netdata/netdata/blob/master/src/health/health.d/retroshare.conf) | retroshare.dht | number of DHT peers |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -107,7 +107,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: retroshare_dht_working
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/retroshare.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/retroshare.conf
|
||||
metric: retroshare.dht
|
||||
info: number of DHT peers
|
||||
metrics:
|
||||
|
|
|
@ -99,12 +99,12 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ riakkv_1h_kv_get_mean_latency ](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.kv.latency.get | average time between reception of client GET request and subsequent response to client over the last hour |
|
||||
| [ riakkv_kv_get_slow ](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.kv.latency.get | average time between reception of client GET request and subsequent response to the client over the last 3 minutes, compared to the average over the last hour |
|
||||
| [ riakkv_1h_kv_put_mean_latency ](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.kv.latency.put | average time between reception of client PUT request and subsequent response to the client over the last hour |
|
||||
| [ riakkv_kv_put_slow ](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.kv.latency.put | average time between reception of client PUT request and subsequent response to the client over the last 3 minutes, compared to the average over the last hour |
|
||||
| [ riakkv_vm_high_process_count ](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.vm | number of processes running in the Erlang VM |
|
||||
| [ riakkv_list_keys_active ](https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf) | riak.core.fsm_active | number of currently running list keys finite state machines |
|
||||
| [ riakkv_1h_kv_get_mean_latency ](https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf) | riak.kv.latency.get | average time between reception of client GET request and subsequent response to client over the last hour |
|
||||
| [ riakkv_kv_get_slow ](https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf) | riak.kv.latency.get | average time between reception of client GET request and subsequent response to the client over the last 3 minutes, compared to the average over the last hour |
|
||||
| [ riakkv_1h_kv_put_mean_latency ](https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf) | riak.kv.latency.put | average time between reception of client PUT request and subsequent response to the client over the last hour |
|
||||
| [ riakkv_kv_put_slow ](https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf) | riak.kv.latency.put | average time between reception of client PUT request and subsequent response to the client over the last 3 minutes, compared to the average over the last hour |
|
||||
| [ riakkv_vm_high_process_count ](https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf) | riak.vm | number of processes running in the Erlang VM |
|
||||
| [ riakkv_list_keys_active ](https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf) | riak.core.fsm_active | number of currently running list keys finite state machines |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -112,27 +112,27 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: riakkv_1h_kv_get_mean_latency
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf
|
||||
metric: riak.kv.latency.get
|
||||
info: average time between reception of client GET request and subsequent response to client over the last hour
|
||||
- name: riakkv_kv_get_slow
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf
|
||||
metric: riak.kv.latency.get
|
||||
info: average time between reception of client GET request and subsequent response to the client over the last 3 minutes, compared to the average over the last hour
|
||||
- name: riakkv_1h_kv_put_mean_latency
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf
|
||||
metric: riak.kv.latency.put
|
||||
info: average time between reception of client PUT request and subsequent response to the client over the last hour
|
||||
- name: riakkv_kv_put_slow
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf
|
||||
metric: riak.kv.latency.put
|
||||
info: average time between reception of client PUT request and subsequent response to the client over the last 3 minutes, compared to the average over the last hour
|
||||
- name: riakkv_vm_high_process_count
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf
|
||||
metric: riak.vm
|
||||
info: number of processes running in the Erlang VM
|
||||
- name: riakkv_list_keys_active
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/riakkv.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/riakkv.conf
|
||||
metric: riak.core.fsm_active
|
||||
info: number of currently running list keys finite state machines
|
||||
metrics:
|
||||
|
|
|
@ -76,7 +76,7 @@ The following alerts are available:
|
|||
|
||||
| Alert name | On metric | Description |
|
||||
|:------------|:----------|:------------|
|
||||
| [ system_clock_sync_state ](https://github.com/netdata/netdata/blob/master/health/health.d/timex.conf) | system.clock_sync_state | when set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server |
|
||||
| [ system_clock_sync_state ](https://github.com/netdata/netdata/blob/master/src/health/health.d/timex.conf) | system.clock_sync_state | when set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server |
|
||||
|
||||
|
||||
## Setup
|
||||
|
|
|
@ -76,7 +76,7 @@ modules:
|
|||
list: []
|
||||
alerts:
|
||||
- name: system_clock_sync_state
|
||||
link: https://github.com/netdata/netdata/blob/master/health/health.d/timex.conf
|
||||
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/timex.conf
|
||||
metric: system.clock_sync_state
|
||||
info: when set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server
|
||||
os: "linux"
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
<mxGeometry x="1154" y="2050" width="82" height="86" as="geometry" />
|
||||
</mxCell>
|
||||
</UserObject>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/messagebird" id="6">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/messagebird" id="6">
|
||||
<mxCell style="shape=image;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;verticalAlign=top;aspect=fixed;imageAspect=0;image=https://cloud.githubusercontent.com/assets/2662304/26528607/80773520-43b8-11e7-8010-5219c75fbe9b.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="576" y="410" width="195" height="38" as="geometry" />
|
||||
</mxCell>
|
||||
|
@ -194,7 +194,7 @@
|
|||
<mxGeometry x="74" y="557" width="800" height="40" as="geometry" />
|
||||
</mxCell>
|
||||
</UserObject>
|
||||
<UserObject label="<b><font style="font-size: 16px">&nbsp; &nbsp; &nbsp; Notifications</font></b>" link="https://github.com/netdata/netdata/tree/master/health/notifications" id="77">
|
||||
<UserObject label="<b><font style="font-size: 16px">&nbsp; &nbsp; &nbsp; Notifications</font></b>" link="https://github.com/netdata/netdata/tree/master/src/health/notifications" id="77">
|
||||
<mxCell style="whiteSpace=wrap;html=1;strokeColor=#59DE9C;fillColor=#FFFFFF;fontSize=14;align=center;" parent="1" vertex="1">
|
||||
<mxGeometry x="675" y="470" width="163" height="77" as="geometry" />
|
||||
</mxCell>
|
||||
|
@ -255,32 +255,32 @@
|
|||
</Array>
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/slack" id="101">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/slack" id="101">
|
||||
<mxCell style="shape=image;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;verticalAlign=top;aspect=fixed;imageAspect=0;image=https://cloud.githubusercontent.com/assets/2662304/26529341/ebcbc5d0-43c6-11e7-9056-df89e0c10e6a.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="530" y="302" width="162" height="48" as="geometry" />
|
||||
</mxCell>
|
||||
</UserObject>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/pushbullet" id="102">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/pushbullet" id="102">
|
||||
<mxCell style="shape=image;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;verticalAlign=top;aspect=fixed;imageAspect=0;image=https://cloud.githubusercontent.com/assets/2662304/26528627/81813ef2-43b8-11e7-9371-9dfcfb1f27f8.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="553" y="363" width="188" height="32" as="geometry" />
|
||||
</mxCell>
|
||||
</UserObject>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/discord" id="103">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/discord" id="103">
|
||||
<mxCell style="shape=image;imageAspect=0;aspect=fixed;verticalLabelPosition=bottom;verticalAlign=top;image=https://cloud.githubusercontent.com/assets/2662304/26528628/8195a284-43b8-11e7-8582-e94e53feb1c7.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="440" y="227" width="146" height="40" as="geometry" />
|
||||
</mxCell>
|
||||
</UserObject>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/pagerduty" id="104">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/pagerduty" id="104">
|
||||
<mxCell style="shape=image;imageAspect=0;aspect=fixed;verticalLabelPosition=bottom;verticalAlign=top;image=https://cloud.githubusercontent.com/assets/2662304/26528630/81a3e33a-43b8-11e7-89a0-53c9d89ef2d6.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="500" y="188" width="140" height="26" as="geometry" />
|
||||
</mxCell>
|
||||
</UserObject>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/telegram" id="105">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/telegram" id="105">
|
||||
<mxCell style="shape=image;imageAspect=0;aspect=fixed;verticalLabelPosition=bottom;verticalAlign=top;image=https://cloud.githubusercontent.com/assets/2662304/26529349/256417f2-43c7-11e7-8036-9aa59efe0605.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="452" y="363" width="77" height="83" as="geometry" />
|
||||
</mxCell>
|
||||
</UserObject>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/twilio" id="106">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/twilio" id="106">
|
||||
<mxCell style="shape=image;imageAspect=0;aspect=fixed;verticalLabelPosition=bottom;verticalAlign=top;image=https://cloud.githubusercontent.com/assets/2662304/26528631/81c0f9b6-43b8-11e7-9088-77d895f50346.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="600" y="250" width="134" height="40" as="geometry" />
|
||||
</mxCell>
|
||||
|
@ -505,12 +505,12 @@
|
|||
<mxCell id="188" value="" style="shape=image;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;verticalAlign=top;aspect=fixed;imageAspect=0;image=https://cloud.githubusercontent.com/assets/2662304/26528673/84d4fc60-43b8-11e7-9512-5db073e72e9f.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="283" y="1956" width="58" height="77" as="geometry" />
|
||||
</mxCell>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/email" id="189">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/email" id="189">
|
||||
<mxCell style="shape=image;html=1;verticalAlign=top;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;imageAspect=0;aspect=fixed;image=https://cloud.githubusercontent.com/assets/2662304/26528675/84f226d2-43b8-11e7-9de6-6d7c391aaa8d.png;rounded=0;shadow=0;glass=0;dashed=1;dashPattern=1 1;comic=0;strokeColor=#C7D1E0;strokeWidth=3;fillColor=none;gradientColor=#CCFFFF;fontSize=14;fontColor=#E6E6E6;align=center;" parent="1" vertex="1">
|
||||
<mxGeometry x="766" y="250" width="56" height="56" as="geometry" />
|
||||
</mxCell>
|
||||
</UserObject>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/pushover" id="191">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/pushover" id="191">
|
||||
<mxCell style="shape=image;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;verticalAlign=top;aspect=fixed;imageAspect=0;image=https://cloud.githubusercontent.com/assets/2662304/26528605/80703edc-43b8-11e7-839d-267f613e15b0.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="654" y="205" width="134" height="28" as="geometry" />
|
||||
</mxCell>
|
||||
|
@ -518,12 +518,12 @@
|
|||
<mxCell id="192" value="" style="shape=image;html=1;verticalAlign=top;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;imageAspect=0;aspect=fixed;image=https://cloud.githubusercontent.com/assets/2662304/26528645/8275abfe-43b8-11e7-9d21-306ab844f7bb.png;strokeColor=#6A9153;fillColor=#CCFFFF;gradientColor=#CCFFFF;fontSize=20;align=center;" parent="1" vertex="1">
|
||||
<mxGeometry x="127" y="815" width="61" height="39" as="geometry" />
|
||||
</mxCell>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/flock" id="198">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/flock" id="198">
|
||||
<mxCell style="shape=image;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;verticalAlign=top;aspect=fixed;imageAspect=0;image=https://user-images.githubusercontent.com/2662304/32355748-512dda64-c039-11e7-8117-bc31a0cf50eb.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="370" y="311" width="142" height="50" as="geometry" />
|
||||
</mxCell>
|
||||
</UserObject>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/alerta" id="200">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/alerta" id="200">
|
||||
<mxCell style="shape=image;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;verticalAlign=top;aspect=fixed;imageAspect=0;image=https://user-images.githubusercontent.com/2662304/34533144-9ecf2918-f0c1-11e7-928e-c2654b092303.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="420" y="275" width="106" height="27" as="geometry" />
|
||||
</mxCell>
|
||||
|
@ -540,7 +540,7 @@
|
|||
</Array>
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/irc" id="204">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/irc" id="204">
|
||||
<mxCell style="shape=image;imageAspect=0;aspect=fixed;verticalLabelPosition=bottom;verticalAlign=top;image=https://user-images.githubusercontent.com/31221999/36795772-8b096efe-1cac-11e8-8d6d-791ee5c88179.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="680" y="153" width="69" height="35" as="geometry" />
|
||||
</mxCell>
|
||||
|
@ -582,12 +582,12 @@
|
|||
<mxGeometry x="959" y="476" width="274" height="52" as="geometry" />
|
||||
</mxCell>
|
||||
</UserObject>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/awssns" id="232">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/awssns" id="232">
|
||||
<mxCell style="shape=image;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;verticalAlign=top;aspect=fixed;imageAspect=0;image=https://user-images.githubusercontent.com/2662304/47609620-26040480-da4b-11e8-8ac3-c1b6a209619f.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="510" y="120" width="140" height="56" as="geometry" />
|
||||
</mxCell>
|
||||
</UserObject>
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/health/notifications/rocketchat" id="233">
|
||||
<UserObject label="" link="https://github.com/netdata/netdata/tree/master/src/health/notifications/rocketchat" id="233">
|
||||
<mxCell style="shape=image;verticalLabelPosition=bottom;labelBackgroundColor=#ffffff;verticalAlign=top;aspect=fixed;imageAspect=0;image=https://user-images.githubusercontent.com/2662304/47609674-ec7fc900-da4b-11e8-84cd-ff47fe586f84.png;" parent="1" vertex="1">
|
||||
<mxGeometry x="422" y="143" width="77" height="77" as="geometry" />
|
||||
</mxCell>
|
||||
|
|
|
@ -20,7 +20,7 @@ Use them to trigger [alert notifications](https://github.com/netdata/netdata/blo
|
|||
either centrally, via the
|
||||
[Cloud alert notifications](https://github.com/netdata/netdata/blob/master/docs/cloud/alerts-notifications/notifications.md)
|
||||
, or by configuring individual
|
||||
[agent notifications](https://github.com/netdata/netdata/blob/master/health/notifications/README.md).
|
||||
[agent notifications](https://github.com/netdata/netdata/blob/master/src/health/notifications/README.md).
|
||||
|
||||
We designed Netdata with interoperability in mind. The Agent collects thousands of metrics every second, and then what
|
||||
you do with them is up to you. You can
|
||||
|
|
|
@ -15,7 +15,7 @@ The Netdata Alerts Configuration Manager enables users with [Business subscripti
|
|||
4. If you want more fine-grained control or access to more advanced settings, enable **Show advanced**
|
||||
<!---->
|
||||
|
||||
5. Copy the alert definition that is generated in the code box and add it to an existing [health configuration file](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#edit-health-configuration-files) or a new custom file under `<path to netdata install>/etc/netdata/health.d/` on a Parent Agent or a Standalone Child Agent.
|
||||
5. Copy the alert definition that is generated in the code box and add it to an existing [health configuration file](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#edit-health-configuration-files) or a new custom file under `<path to netdata install>/etc/netdata/health.d/` on a Parent Agent or a Standalone Child Agent.
|
||||
<!---->
|
||||
|
||||
6. Reload Netdata Alert Health checks with `<path to netdata install>/usr/sbin/netdatacli reload-health`, and the new alert is now configured.
|
||||
|
@ -34,7 +34,7 @@ The Netdata Alerts Configuration Manager enables users with [Business subscripti
|
|||
- **Metrics Lookup, Filtering and Formula Section**
|
||||
<!---->
|
||||
|
||||
- **Metrics Lookup**: This is the **Evaluate** line of fields in the modal and it defines the parameters for the database lookup that is needed to get the value that will be compared against the alert definition. It corresponds to the [`lookup`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-lookup) line of the Alert configuration file. The Alerts Configuration Manager provides a default selection for the lookup and can be modified to suit your requirements. The parameters that can be modified are:
|
||||
- **Metrics Lookup**: This is the **Evaluate** line of fields in the modal and it defines the parameters for the database lookup that is needed to get the value that will be compared against the alert definition. It corresponds to the [`lookup`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-lookup) line of the Alert configuration file. The Alerts Configuration Manager provides a default selection for the lookup and can be modified to suit your requirements. The parameters that can be modified are:
|
||||
- METHOD (`avg`, `sum`, `min`, `max`, `cv`, `stddev`)
|
||||
- COMPUTATION (sum of all dimensions or individually for each dimension)
|
||||
- DIMENSIONS (All dimensions, or a selection of dimensions)
|
||||
|
@ -42,30 +42,30 @@ The Netdata Alerts Configuration Manager enables users with [Business subscripti
|
|||
- OPTIONS (`absolute`, `unaligned`, `percentage`, `min2max`)
|
||||
|
||||
- **Alert Filtering**: This functionality can be accessed through the **Show advanced** checkbox and it allows for filtering the alert health checks to be run only for specific components of the infrastructure. It helps in achieving a fine-grained configuration for any given alert.
|
||||
- `HOSTS` / `NODES` - By default all hosts are selected. You can pick nodes from the dropdown list, or enter a wildcard matching a list of hosts that you want the alert health check to run on. This field corresponds to the [`hosts`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-hosts) line of the Alert configuration file.
|
||||
- `INSTANCES` - All instances are selected by default. You can pick instances from the dropdown list, or enter a wildcard matching a list of instances that you want the alert health check to run on. This field corresponds to the [`charts`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-charts) line of the Alert configuration file.
|
||||
- `CHART LABELS` - All chart labels are selected by default. You can pick a chart label from the dropdown list or enter a wildcard matching a list of chart labels that you want the alert health check to run on. This field corresponds to the [`chart labels`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-chart-labels) line of the Alert configuration file.
|
||||
- `OS` - All Operating Systems are selected by default. You can choose which OS(s) an alert health check should run on. This field corresponds to the [`os`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-os) line of the Alert configuration file.
|
||||
- `HOSTS` / `NODES` - By default all hosts are selected. You can pick nodes from the dropdown list, or enter a wildcard matching a list of hosts that you want the alert health check to run on. This field corresponds to the [`hosts`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-hosts) line of the Alert configuration file.
|
||||
- `INSTANCES` - All instances are selected by default. You can pick instances from the dropdown list, or enter a wildcard matching a list of instances that you want the alert health check to run on. This field corresponds to the [`charts`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-charts) line of the Alert configuration file.
|
||||
- `CHART LABELS` - All chart labels are selected by default. You can pick a chart label from the dropdown list or enter a wildcard matching a list of chart labels that you want the alert health check to run on. This field corresponds to the [`chart labels`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-chart-labels) line of the Alert configuration file.
|
||||
- `OS` - All Operating Systems are selected by default. You can choose which OS(s) an alert health check should run on. This field corresponds to the [`os`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-os) line of the Alert configuration file.
|
||||
|
||||
- **Formula / Calculation**: This field is available through the **Show advanced** checkbox and it is used to define a formula to be run on top of the `lookup` value. The result of the lookup is available in the `$this` variable, and after the formula is run, the result is also stored in `$this` and can be accessed while setting the alert thresholds. This field corresponds to the [`calc`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-calc) line of the Alert configuration file.
|
||||
- **Formula / Calculation**: This field is available through the **Show advanced** checkbox and it is used to define a formula to be run on top of the `lookup` value. The result of the lookup is available in the `$this` variable, and after the formula is run, the result is also stored in `$this` and can be accessed while setting the alert thresholds. This field corresponds to the [`calc`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-calc) line of the Alert configuration file.
|
||||
|
||||
- **Alerting conditions**
|
||||
<!---->
|
||||
- **Warning and Critical Thresholds**: These fields are used to set the thresholds for the `Warning` and `Critical` alert states, while also having the option to set the condition for the alert to be raised if it is `above` or `below` the given threshold. If the advanced settings are selected, a **formula** option can also be used, to define a custom formula instead of a threshold. These fields correspond to the [`warn` and `crit`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-lines-warn-and-crit) lines of the Alert configuration file.
|
||||
- **Warning and Critical Thresholds**: These fields are used to set the thresholds for the `Warning` and `Critical` alert states, while also having the option to set the condition for the alert to be raised if it is `above` or `below` the given threshold. If the advanced settings are selected, a **formula** option can also be used, to define a custom formula instead of a threshold. These fields correspond to the [`warn` and `crit`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-lines-warn-and-crit) lines of the Alert configuration file.
|
||||
- **Recovery Thresholds**: This field is available through the **Show advanced** checkbox, and it is used to set the threshold that the metric value needs to meet to de-escalate from a given severity status, like `Critical to Warning` and from `Warning to Clear`. The logic is appended to the `warn` and `crit` lines of the Alert configuration file and resembles a one-line `IF-THEN-ELSE` clause.
|
||||
- **Check Interval**: This field is used to define the frequency of the health check for the alert and corresponds to the [`every`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-every) line of the Alert configuration file.
|
||||
- **Delay Notifications**: This field is available through the **Show advanced** checkbox and it is used to set delay parameters on notifications for an alert severity `escalation` or `de-escalation`. It corresponds to the [`delay`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-delay) line of the Alert configuration file.
|
||||
- **Check Interval**: This field is used to define the frequency of the health check for the alert and corresponds to the [`every`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-every) line of the Alert configuration file.
|
||||
- **Delay Notifications**: This field is available through the **Show advanced** checkbox and it is used to set delay parameters on notifications for an alert severity `escalation` or `de-escalation`. It corresponds to the [`delay`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-delay) line of the Alert configuration file.
|
||||
|
||||
- **Agent Specific Options**: These options are only available on the `Netdata Agent` and not honored on `Netdata Cloud`. They can be accessed through the **Show advanced** checkbox.
|
||||
<!---->
|
||||
- **Repeat Notifications**: This field defines the repeat frequency for the alert notification when the alert is in either `warning` or `critical` status and corresponds to the [`repeat`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-repeat) line of the Alert configuration file.
|
||||
- **Send to**: This field is used to define a user role to which the alert notifications will be sent. If set to `silent`, then the alert won't be sent to any role. It corresponds to the [`to`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-to) line of the Alert configuration file.
|
||||
- **Custom Exec Script**: This field is used to define a custom script that will be executed when the alert is triggered (but needs to be carefully designed as it needs to call the `health_alarm_notify.sh` module) and corresponds to the [`exec`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-exec) line of the Alert configuration file.
|
||||
- **Repeat Notifications**: This field defines the repeat frequency for the alert notification when the alert is in either `warning` or `critical` status and corresponds to the [`repeat`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-repeat) line of the Alert configuration file.
|
||||
- **Send to**: This field is used to define a user role to which the alert notifications will be sent. If set to `silent`, then the alert won't be sent to any role. It corresponds to the [`to`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-to) line of the Alert configuration file.
|
||||
- **Custom Exec Script**: This field is used to define a custom script that will be executed when the alert is triggered (but needs to be carefully designed as it needs to call the `health_alarm_notify.sh` module) and corresponds to the [`exec`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-exec) line of the Alert configuration file.
|
||||
|
||||
- **Alert Name, Description and Summary Section**
|
||||
<!---->
|
||||
|
||||
- **Alert Template Name**: This field uniquely identifies an alert and corresponds to the [`template`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-alarm-or-template) field of the Alert configuration file.
|
||||
- **Alert Template Name**: This field uniquely identifies an alert and corresponds to the [`template`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-alarm-or-template) field of the Alert configuration file.
|
||||
The Alerts Configuration Manager provides a default name for an Alert template but we recommend you modify this to have a meaningful name for your configured alert.
|
||||
- **Alert Template Description**: This field provides a brief explanation of the alert and corresponds to the [`info`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-info) line of the Alert configuration file.
|
||||
- **Alert Summary**: This field enables the users to create a custom title for the alert notification (via [Notification integrations](https://learn.netdata.cloud/docs/alerting/notifications/centralized-cloud-notifications)) and corresponds to the [`summary`](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-summary) line of the Alert configuration file.
|
||||
- **Alert Template Description**: This field provides a brief explanation of the alert and corresponds to the [`info`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-info) line of the Alert configuration file.
|
||||
- **Alert Summary**: This field enables the users to create a custom title for the alert notification (via [Notification integrations](https://learn.netdata.cloud/docs/alerting/notifications/centralized-cloud-notifications)) and corresponds to the [`summary`](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-summary) line of the Alert configuration file.
|
||||
|
|
|
@ -27,7 +27,7 @@ Centralized alert notifications from Netdata Cloud is a independent process from
|
|||
Netdata](https://github.com/netdata/netdata/blob/master/docs/monitor/enable-notifications.md). You can enable one or the other, or both, based on your needs. However,
|
||||
the alerts you see in Netdata Cloud are based on those streamed from your Netdata-monitoring nodes. If you want to tweak
|
||||
or add a new alert that you see in Netdata Cloud, and receive via centralized alert notifications, you must
|
||||
[configure](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md) each node's alert watchdog.
|
||||
[configure](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md) each node's alert watchdog.
|
||||
|
||||
</Callout>
|
||||
|
||||
|
@ -104,8 +104,8 @@ if the node should be silenced for the entire space or just for specific rooms (
|
|||
|
||||
### Scope definition for Alerts
|
||||
* **Alert name:** silencing a specific alert name silences all alert state transitions for that specific alert.
|
||||
* **Alert context:** silencing a specific alert context will silence all alert state transitions for alerts targeting that chart context, for more details check [alert configuration docs](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-on).
|
||||
* **Alert role:** silencing a specific alert role will silence all the alert state transitions for alerts that are configured to be specific role recipients, for more details check [alert configuration docs](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-to).
|
||||
* **Alert context:** silencing a specific alert context will silence all alert state transitions for alerts targeting that chart context, for more details check [alert configuration docs](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-on).
|
||||
* **Alert role:** silencing a specific alert role will silence all the alert state transitions for alerts that are configured to be specific role recipients, for more details check [alert configuration docs](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-to).
|
||||
|
||||
Besides the above two main entities, there are two other important settings that you can define on a silencing rule:
|
||||
* Who does the rule affect? **All users** in the space or **Myself**
|
||||
|
|
|
@ -42,7 +42,7 @@ With a quick glance you have immediate information available at your disposal:
|
|||
While Netdata's charts require no configuration and are easy to interact with, they have a lot of underlying complexity. To meaningfully organize charts out of the box based on what's happening in your nodes, Netdata uses the concepts of [dimensions](#dimensions), [contexts](#contexts), and [families](#families).
|
||||
|
||||
Understanding how these work will help you more easily navigate the dashboard,
|
||||
[write new alerts](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md), or play around
|
||||
[write new alerts](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md), or play around
|
||||
with the [API](https://github.com/netdata/netdata/blob/master/web/api/README.md).
|
||||
|
||||
### Dimensions
|
||||
|
@ -71,7 +71,7 @@ whereas anything after the `.` is specified either by the chart's developer or b
|
|||
|
||||
By default, a chart's type affects where it fits in the menu, while its family creates submenus.
|
||||
|
||||
Netdata also relies on contexts for [alert configuration](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md) (the [`on` line](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#alert-line-on)).
|
||||
Netdata also relies on contexts for [alert configuration](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md) (the [`on` line](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#alert-line-on)).
|
||||
|
||||
### Families
|
||||
|
||||
|
|
|
@ -83,9 +83,9 @@ sudo ./edit-config health.d/example-alert.conf
|
|||
|
||||
Or, append your new alert to an existing file by editing a relevant existing file in the `health.d/` directory.
|
||||
|
||||
Read more about [configuring alerts](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md) to
|
||||
Read more about [configuring alerts](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md) to
|
||||
get started, and see
|
||||
the [health monitoring reference](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md) for a full listing
|
||||
the [health monitoring reference](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md) for a full listing
|
||||
of options available in health entities.
|
||||
|
||||
### Configure a specific alert
|
||||
|
|
|
@ -41,7 +41,7 @@ exist.
|
|||
- `apps_groups.conf` is a configuration file for changing how applications/processes are grouped when viewing the
|
||||
**Application** charts from [`apps.plugin`](https://github.com/netdata/netdata/blob/master/collectors/apps.plugin/README.md) or
|
||||
[`ebpf.plugin`](https://github.com/netdata/netdata/blob/master/collectors/ebpf.plugin/README.md).
|
||||
- `health.d/` is a directory that contains [health configuration files](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md).
|
||||
- `health.d/` is a directory that contains [health configuration files](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md).
|
||||
- `health_alarm_notify.conf` enables and configures [alert notifications](https://github.com/netdata/netdata/blob/master/docs/monitor/enable-notifications.md).
|
||||
- `statsd.d/` is a directory for configuring Netdata's [statsd collector](https://github.com/netdata/netdata/blob/master/collectors/statsd.plugin/README.md).
|
||||
- `stream.conf` configures [parent-child streaming](https://github.com/netdata/netdata/blob/master/src/streaming/README.md) between separate nodes running the Agent.
|
||||
|
|
|
@ -21,7 +21,7 @@ Use the alphabatized list below to find the answer to your single-term questions
|
|||
|
||||
- [**Alerts** (formerly **Alarms**)](https://github.com/netdata/netdata/blob/master/docs/cloud/alerts-notifications/notifications.md): With the information that appears on Netdata Cloud and the local dashboard about active alerts, you can configure alerts to match your infrastructure's needs or your team's goals.
|
||||
|
||||
- [**Alarm Entity Type**](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#health-entity-reference): Entity types that are attached to specific charts and use the `alarm` label.
|
||||
- [**Alarm Entity Type**](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#health-entity-reference): Entity types that are attached to specific charts and use the `alarm` label.
|
||||
|
||||
- [**Anomaly Advisor**](https://github.com/netdata/netdata/blob/master/docs/cloud/insights/anomaly-advisor.md): A Netdata feature that lets you quickly surface potentially anomalous metrics and charts related to a particular highlight window of interest.
|
||||
|
||||
|
@ -77,9 +77,9 @@ Use the alphabatized list below to find the answer to your single-term questions
|
|||
|
||||
- [**Headless Collector Streaming**](https://github.com/netdata/netdata/blob/master/docs/metrics-storage-management/enable-streaming.md#supported-streaming-configurations): Streaming configuration where child `A`, _without_ a database or web dashboard, streams metrics to parent `B`.
|
||||
|
||||
- [**Health Configuration Files**](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#edit-health-configuration-files): Files that you can edit to configure your Agent's health watchdog service.
|
||||
- [**Health Configuration Files**](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#edit-health-configuration-files): Files that you can edit to configure your Agent's health watchdog service.
|
||||
|
||||
- [**Health Entity Reference**](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#health-entity-reference):
|
||||
- [**Health Entity Reference**](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#health-entity-reference):
|
||||
|
||||
- [**Home** tab](https://github.com/netdata/netdata/blob/master/docs/cloud/visualize/overview.md#home): Tab in Netdata Cloud that provides a predefined dashboard of relevant information about entities in the War Room.
|
||||
|
||||
|
@ -158,7 +158,7 @@ metrics, troubleshoot complex performance problems, and make data interoperable
|
|||
|
||||
## T
|
||||
|
||||
- [**Template Entity Type**](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#entity-types): Entity type that defines rules that apply to all charts of a specific context, and use the template label. Templates help you apply one entity to all disks, all network interfaces, all MySQL databases, and so on.
|
||||
- [**Template Entity Type**](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#entity-types): Entity type that defines rules that apply to all charts of a specific context, and use the template label. Templates help you apply one entity to all disks, all network interfaces, all MySQL databases, and so on.
|
||||
|
||||
- [**Tiers**](https://github.com/netdata/netdata/blob/master/src/database/engine/README.md#tiers): Tiering is a mechanism of providing multiple tiers of data with different granularity of metrics (the frequency they are collected and stored, i.e. their resolution).
|
||||
|
||||
|
|
|
@ -100,7 +100,7 @@ Over time, we've created some default alerts for web log monitoring. These alert
|
|||
web server is receiving more than 120 requests per minute. Otherwise, there's simply not enough data to make conclusions
|
||||
about what is "too few" or "too many."
|
||||
|
||||
- [web log alerts](https://raw.githubusercontent.com/netdata/netdata/master/health/health.d/web_log.conf).
|
||||
- [web log alerts](https://raw.githubusercontent.com/netdata/netdata/master/src/health/health.d/web_log.conf).
|
||||
|
||||
You can also edit this file directly with `edit-config`:
|
||||
|
||||
|
@ -109,4 +109,4 @@ You can also edit this file directly with `edit-config`:
|
|||
```
|
||||
|
||||
For more information about editing the defaults or writing new alert entities, see our
|
||||
[health monitoring documentation](https://github.com/netdata/netdata/blob/master/health/README.md).
|
||||
[health monitoring documentation](https://github.com/netdata/netdata/blob/master/src/health/README.md).
|
||||
|
|
|
@ -106,7 +106,7 @@ Netdata to see your new charts.
|
|||
|
||||
This release also includes eight pre-configured alerts for live nodes, such as whether the node is live, storage
|
||||
capacity, issues with replication, and the number of SQL connections/statements. See [health.d/cockroachdb.conf on
|
||||
GitHub](https://raw.githubusercontent.com/netdata/netdata/master/health/health.d/cockroachdb.conf) for details.
|
||||
GitHub](https://raw.githubusercontent.com/netdata/netdata/master/src/health/health.d/cockroachdb.conf) for details.
|
||||
|
||||
You can also edit these files directly with `edit-config`:
|
||||
|
||||
|
@ -115,4 +115,4 @@ cd /etc/netdata/ # Replace with your Netdata configuration directory, if not /et
|
|||
./edit-config health.d/cockroachdb.conf # You may need to use `sudo` for write privileges
|
||||
```
|
||||
|
||||
For more information about editing the defaults or writing new alert entities, see our documentation on [configuring health alerts](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md).
|
||||
For more information about editing the defaults or writing new alert entities, see our documentation on [configuring health alerts](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md).
|
||||
|
|
|
@ -178,7 +178,7 @@ showing real-time metrics for both in your Netdata dashboard. 🎉
|
|||
The Netdata community helped us create sane defaults for alerts related to both HDFS and Zookeeper. You may want to
|
||||
investigate these to ensure they work well with your Hadoop implementation.
|
||||
|
||||
- [HDFS alerts](https://raw.githubusercontent.com/netdata/netdata/master/health/health.d/hdfs.conf)
|
||||
- [HDFS alerts](https://raw.githubusercontent.com/netdata/netdata/master/src/health/health.d/hdfs.conf)
|
||||
|
||||
You can also access/edit these files directly with `edit-config`:
|
||||
|
||||
|
@ -188,4 +188,4 @@ sudo /etc/netdata/edit-config health.d/zookeeper.conf
|
|||
```
|
||||
|
||||
For more information about editing the defaults or writing new alert entities, see our
|
||||
[health monitoring documentation](https://github.com/netdata/netdata/blob/master/health/README.md).
|
||||
[health monitoring documentation](https://github.com/netdata/netdata/blob/master/src/health/README.md).
|
||||
|
|
|
@ -57,10 +57,10 @@ It is possible to use the `anomaly-bit` when defining traditional Alerts within
|
|||
|
||||
You can see some example ML based alert configurations below:
|
||||
|
||||
- [Anomaly rate based CPU dimensions alert](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#example-8---anomaly-rate-based-cpu-dimensions-alert)
|
||||
- [Anomaly rate based CPU chart alert](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#example-9---anomaly-rate-based-cpu-chart-alert)
|
||||
- [Anomaly rate based node level alert](https://github.com/netdata/netdata/blob/master/health/REFERENCE.md#example-10---anomaly-rate-based-node-level-alert)
|
||||
- More examples in the [`/health/health.d/ml.conf`](https://github.com/netdata/netdata/blob/master/health/health.d/ml.conf) file that ships with the agent.
|
||||
- [Anomaly rate based CPU dimensions alert](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#example-8---anomaly-rate-based-cpu-dimensions-alert)
|
||||
- [Anomaly rate based CPU chart alert](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#example-9---anomaly-rate-based-cpu-chart-alert)
|
||||
- [Anomaly rate based node level alert](https://github.com/netdata/netdata/blob/master/src/health/REFERENCE.md#example-10---anomaly-rate-based-node-level-alert)
|
||||
- More examples in the [`/src/health/health.d/ml.conf`](https://github.com/netdata/netdata/blob/master/src/health/health.d/ml.conf) file that ships with the agent.
|
||||
|
||||
## Learn More
|
||||
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue