diff --git a/.gitignore b/.gitignore index 52a108f7fd..1222d46ea0 100644 --- a/.gitignore +++ b/.gitignore @@ -166,6 +166,7 @@ gmon.txt sitespeed-result/ tests/acls/acl.sh tests/urls/request.sh +tests/alarm_repetition/alarm.sh # tests and temp files python.d/python-modules-installer.sh diff --git a/database/rrdcalc.h b/database/rrdcalc.h index 3400f711cf..f0c34b5439 100644 --- a/database/rrdcalc.h +++ b/database/rrdcalc.h @@ -27,6 +27,7 @@ #define RRDCALC_FLAG_RUNNABLE 0x00000040 #define RRDCALC_FLAG_DISABLED 0x00000080 #define RRDCALC_FLAG_SILENCED 0x00000100 +#define RRDCALC_FLAG_RUN_ONCE 0x00000200 #define RRDCALC_FLAG_NO_CLEAR_NOTIFICATION 0x80000000 diff --git a/health/README.md b/health/README.md index 848c1bc3ba..35f037bce6 100644 --- a/health/README.md +++ b/health/README.md @@ -347,7 +347,8 @@ delay: [[[up U] [down D] multiplier M] max X] #### Alarm line `repeat` -Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration. +Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration. When one of these intervals is bigger than 0, Netdata will activate the repeating notifications for `CRITICAL`, `CLEAR` and `WARNING` messages. 
+` Format: diff --git a/health/health.c b/health/health.c index 1ee1a37226..1460b5ba48 100644 --- a/health/health.c +++ b/health/health.c @@ -216,9 +216,11 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) { // we have not executed this alarm notification in the past // so, don't send CLEAR notifications if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) { - debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s" - , ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); - goto done; + if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) { + debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s" + , ae->chart, ae->name, rrdcalc_status2string(ae->new_status)); + goto done; + } } } } @@ -872,10 +874,21 @@ void *health_main(void *ptr) { for(rc = host->alarms; rc ; rc = rc->next) { int repeat_every = 0; if(unlikely(rrdcalc_isrepeating(rc))) { - if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) + if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) { + rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE; repeat_every = rc->warn_repeat_every; - else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) + } else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) { + rc->rrdcalc_flags &= ~RRDCALC_FLAG_RUN_ONCE; repeat_every = rc->crit_repeat_every; + } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) { + if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) { + if(rc->old_status == RRDCALC_STATUS_CRITICAL) { + repeat_every = rc->crit_repeat_every; + } else if (rc->old_status == RRDCALC_STATUS_WARNING) { + repeat_every = rc->warn_repeat_every; + } + } + } } if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { rc->last_repeat = now; @@ -890,6 +903,10 @@ void *health_main(void *ptr) { ) ); ae->last_repeat = rc->last_repeat; + if (!(rc->rrdcalc_flags & 
RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) { + ae->flags |= HEALTH_ENTRY_RUN_ONCE; + } + rc->rrdcalc_flags |= RRDCALC_FLAG_RUN_ONCE; health_process_notifications(host, ae); debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); health_alarm_log_free_one_nochecks_nounlink(ae); diff --git a/health/health.h b/health/health.h index 6c000bf458..8e4d0f7cb3 100644 --- a/health/health.h +++ b/health/health.h @@ -23,6 +23,7 @@ extern unsigned int default_health_enabled; #define HEALTH_ENTRY_FLAG_EXEC_RUN 0x00000004 #define HEALTH_ENTRY_FLAG_EXEC_FAILED 0x00000008 #define HEALTH_ENTRY_FLAG_SILENCED 0x00000010 +#define HEALTH_ENTRY_RUN_ONCE 0x00000020 #define HEALTH_ENTRY_FLAG_SAVED 0x10000000 #define HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION 0x80000000 diff --git a/health/health_json.c b/health/health_json.c index f6ff1b1a74..8a088d034a 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -43,6 +43,7 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, R "\t\t\"updates_id\": %u,\n" "\t\t\"value_string\": \"%s\",\n" "\t\t\"old_value_string\": \"%s\",\n" + "\t\t\"last_repeat\": \"%lu\",\n" "\t\t\"silenced\": \"%s\",\n" , host->hostname , ae->unique_id @@ -71,6 +72,7 @@ static inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, R , ae->updates_id , ae->new_value_string , ae->old_value_string + , (unsigned long)ae->last_repeat , (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false" ); @@ -143,6 +145,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC "\t\t\t\"warn_repeat_every\": \"%u\",\n" "\t\t\t\"crit_repeat_every\": \"%u\",\n" "\t\t\t\"value_string\": \"%s\",\n" + "\t\t\t\"last_repeat\": \"%lu\",\n" , rc->chart, rc->name , (unsigned long)rc->id , rc->name @@ -170,6 +173,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC , rc->warn_repeat_every , rc->crit_repeat_every , value_string + , (unsigned 
long)rc->last_repeat ); if(unlikely(rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)) { diff --git a/tests/Makefile.am b/tests/Makefile.am index 0aa5af247f..179b04864b 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -7,6 +7,7 @@ CLEANFILES = \ health_mgmtapi/health-cmdapi-test.sh \ acls/acl.sh \ urls/request.sh \ + alarm_repetition/alarm.sh \ $(NULL) include $(top_srcdir)/build/subst.inc @@ -26,12 +27,14 @@ dist_noinst_DATA = \ health_mgmtapi/health-cmdapi-test.sh.in \ acls/acl.sh.in \ urls/request.sh.in \ + alarm_repetition/alarm.sh.in \ $(NULL) dist_plugins_SCRIPTS = \ health_mgmtapi/health-cmdapi-test.sh \ acls/acl.sh \ urls/request.sh \ + alarm_repetition/alarm.sh \ $(NULL) dist_noinst_SCRIPTS = \ diff --git a/tests/alarm_repetition/alarm.sh.in b/tests/alarm_repetition/alarm.sh.in new file mode 100644 index 0000000000..8555e0a3c0 --- /dev/null +++ b/tests/alarm_repetition/alarm.sh.in @@ -0,0 +1,86 @@ +#!/bin/bash + +#The health directory to put the alarms +HEALTHDIR="@configdir_POST@/health.d/" + +#output directory +OUTDIR="workdir/" + +#url to do download +MURL="http://localhost:19999/api/v1/alarms?active" + +#error messages +RED='\033[0;31m' +GREEN='\033[0;32m' +NOCOLOR='\033[0m' + +MYCDIR="$(pwd)" +CONFFILE="$MYCDIR/netdata.conf" + +change_alarm_file() { + if [ -f "$1" ]; then + rm "$1" + fi + + #copy keeping the permissions + cp -a "$2" "$3" +} + +netdata_test_download() { + OPT="-e" + if [ "$3" == "I" ]; then + OPT="-v" + fi + + grep "HTTP/1.1 200 OK" "$1" 2>/dev/null 1>/dev/null + TEST="$?" 
+ if [ "$TEST" -ne "0" ]; then + echo -e "${RED} Error to get the alarms" + killall netdata + exit 1 + fi + + COUNT=$(grep -w "\"last_repeat\":" "$2" | grep -c "$OPT" "\"0\"") + if [ "$COUNT" -eq "0" ]; then + echo -e "${RED} Netdata gave an unexpected result when alarm repetition is $4" + killall netdata + exit 1 + fi + + echo -e "${GREEN} I got the expected result" +} + +get_the_logs() { + curl -v -k --create-dirs -o "$OUTDIR/$1.out" "$MURL" 2> "$OUTDIR/$1.err" + netdata_test_download "$OUTDIR/$1.err" "$OUTDIR/$1.out" "$2" "$3" +} + +process_data() { + SEC=120 + netdata -c "$CONFFILE" -D & + NETDATAPID=$! + echo -e "${NOCOLOR}Sleeping during $SEC seconds to create alarm entries" + sleep $SEC + get_the_logs "$1" "$2" "$3" + kill $NETDATAPID +} + +mkdir "$OUTDIR" +CREATEDIR="$?" +if [ "$CREATEDIR" -ne "0" ]; then + echo -e "${RED}Cannot create the output directory, it already exists. The test will overwrite previous results." +fi + +change_alarm_file "./0" "ram_without_repetition.conf" "$HEALTHDIR/ram.conf" +cp -a netdata.conf_without_repetition netdata.conf +process_data "ram_without" "K" "not activated." +rm netdata.conf + +change_alarm_file "$HEALTHDIR/ram.conf" "ram_with_repetition.conf" "$HEALTHDIR/ram.conf" +cp -a netdata.conf_with_repetition netdata.conf +process_data "ram_with" "I" "activated." +rm netdata.conf + +echo -e "${GREEN} all the tests were sucessful" +rm "$HEALTHDIR/ram.conf" +rm -rf $OUTDIR diff --git a/tests/alarm_repetition/netdata.conf_with_repetition b/tests/alarm_repetition/netdata.conf_with_repetition new file mode 100644 index 0000000000..5e02288dbf --- /dev/null +++ b/tests/alarm_repetition/netdata.conf_with_repetition @@ -0,0 +1,57 @@ +# netdata configuration +# +# You can download the latest version of this file, using: +# +# wget -O /etc/netdata/netdata.conf http://localhost:19999/netdata.conf +# or +# curl -o /etc/netdata/netdata.conf http://localhost:19999/netdata.conf +# +# You can uncomment and change any of the options below. 
+# The value shown in the commented settings, is the default value. +# + +# global netdata configuration + +[global] + #run as user = netdata + +[web] + #ssl key = /etc/netdata/ssl/key2048.pem + #ssl certificate = /etc/netdata/ssl/cert2048.pem + mode = static-threaded + # listen backlog = 4096 + default port = 19999 + #bind to = *=dashboard|registry|streaming|netdata.conf|badges|management *:20000=dashboard|registry|streaming|netdata.conf|badges|management^SSL=optional *:20001=dashboard|registry|streaming|netdata.conf|badges|management^SSL=force unix:/tmp/netdata/netdata.sock + # web files owner = netdata + # web files group = netdata + #accept a streaming request every seconds = 2 + +[plugins] + proc = yes + diskspace = no + cgroups = no + tc = no + idlejitter = no + enable running new plugins = no + check for new plugins every = 60 + go.d = no + node.d = no + charts.d = no + nfacct = no + python.d = no + apps = no + fping = no + cups = no + +[health] + enabled = yes + in memory max health log entries = 1000 + default repeat warning = 4s + default repeat critical = 2s + +[registry] + enabled = yes + allow from = * + +[cloud] + cloud base url = https://netdata.cloud diff --git a/tests/alarm_repetition/netdata.conf_without_repetition b/tests/alarm_repetition/netdata.conf_without_repetition new file mode 100644 index 0000000000..80513ecb79 --- /dev/null +++ b/tests/alarm_repetition/netdata.conf_without_repetition @@ -0,0 +1,57 @@ +# netdata configuration +# +# You can download the latest version of this file, using: +# +# wget -O /etc/netdata/netdata.conf http://localhost:19999/netdata.conf +# or +# curl -o /etc/netdata/netdata.conf http://localhost:19999/netdata.conf +# +# You can uncomment and change any of the options below. +# The value shown in the commented settings, is the default value. 
+# + +# global netdata configuration + +[global] + #run as user = netdata + +[web] + #ssl key = /etc/netdata/ssl/key2048.pem + #ssl certificate = /etc/netdata/ssl/cert2048.pem + mode = static-threaded + # listen backlog = 4096 + default port = 19999 + #bind to = *=dashboard|registry|streaming|netdata.conf|badges|management *:20000=dashboard|registry|streaming|netdata.conf|badges|management^SSL=optional *:20001=dashboard|registry|streaming|netdata.conf|badges|management^SSL=force unix:/tmp/netdata/netdata.sock + # web files owner = netdata + # web files group = netdata + #accept a streaming request every seconds = 2 + +[plugins] + proc = yes + diskspace = no + cgroups = no + tc = no + idlejitter = no + enable running new plugins = no + check for new plugins every = 60 + go.d = no + node.d = no + charts.d = no + nfacct = no + python.d = no + apps = no + fping = no + cups = no + +[health] + enabled = yes + in memory max health log entries = 1000 + #default repeat warning = 4s + #default repeat critical = 2s + +[registry] + enabled = yes + allow from = * + +[cloud] + cloud base url = https://netdata.cloud diff --git a/tests/alarm_repetition/ram_with_repetition.conf b/tests/alarm_repetition/ram_with_repetition.conf new file mode 100644 index 0000000000..c215a71d78 --- /dev/null +++ b/tests/alarm_repetition/ram_with_repetition.conf @@ -0,0 +1,64 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: used_ram_to_ignore + on: system.ram + os: linux freebsd + hosts: * + calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz) + every: 10s + info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. 
ZFS ARC) + + alarm: ram_in_use + on: system.ram + os: linux + hosts: * +# calc: $used * 100 / ($used + $cached + $free) + calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free) + units: % + every: 1s + warn: $this > 1 + crit: $this > 5 + delay: down 15m multiplier 1.5 max 1h + info: system RAM used + to: sysadmin #alarms + repeat: warning 30s critical 60s + + alarm: ram_available + on: mem.available + os: linux + hosts: * + calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? (15) : (10)) + crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) + delay: down 15m multiplier 1.5 max 1h + info: estimated amount of RAM available for userspace processes, without causing swapping + to: sysadmin #alarms + +## FreeBSD +alarm: ram_in_use + on: system.ram + os: freebsd +hosts: * + calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) +units: % +every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) +delay: down 15m multiplier 1.5 max 1h + info: system RAM usage + to: sysadmin #alarms + + alarm: ram_available + on: system.ram + os: freebsd + hosts: * + calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? (15) : (10)) + crit: $this < (($status == $CRITICAL) ? 
(10) : ( 5)) + delay: down 15m multiplier 1.5 max 1h + info: estimated amount of RAM available for userspace processes, without causing swapping + to: sysadmin #alarms diff --git a/tests/alarm_repetition/ram_without_repetition.conf b/tests/alarm_repetition/ram_without_repetition.conf new file mode 100644 index 0000000000..edfc492e0e --- /dev/null +++ b/tests/alarm_repetition/ram_without_repetition.conf @@ -0,0 +1,63 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: used_ram_to_ignore + on: system.ram + os: linux freebsd + hosts: * + calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz) + every: 10s + info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) + + alarm: ram_in_use + on: system.ram + os: linux + hosts: * +# calc: $used * 100 / ($used + $cached + $free) + calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free) + units: % + every: 1s + warn: $this > 1 + crit: $this > 5 + delay: down 15m multiplier 1.5 max 1h + info: system RAM used + to: sysadmin #alarms + + alarm: ram_available + on: mem.available + os: linux + hosts: * + calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? (15) : (10)) + crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) + delay: down 15m multiplier 1.5 max 1h + info: estimated amount of RAM available for userspace processes, without causing swapping + to: sysadmin #alarms + +## FreeBSD +alarm: ram_in_use + on: system.ram + os: freebsd +hosts: * + calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) +units: % +every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) +delay: down 15m multiplier 1.5 max 1h + info: system RAM usage + to: sysadmin #alarms + + alarm: ram_available + on: system.ram + os: freebsd + hosts: * + calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? (15) : (10)) + crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) + delay: down 15m multiplier 1.5 max 1h + info: estimated amount of RAM available for userspace processes, without causing swapping + to: sysadmin #alarms