0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-14 09:38:34 +00:00

Repeating alarm notifications ()

* Alarm_repeat merging the original!

* Alarm_repeat binary tree!

* Alarm_repeat binary tree finished!

* Alarm_repeat move function and format string

* Alarms bringing a new Binary tree

* Alarms fixing the last two

* Alarm_repeat useless var!

* Alarm fix format and repeat alarm!

* Alarm_backend steps!

* Alarm_repeat stopping to test cloud!

* Alarm_repeat stopping to test cloud 2!

* Alarm_repeat fixing when restart!
This commit is contained in:
thiagoftsm 2019-07-01 11:55:16 +00:00 committed by GitHub
parent 266cbec7a8
commit dd73f3e0cd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
20 changed files with 513 additions and 105 deletions

View file

@ -14,6 +14,7 @@
// Convenience wrappers: each config_*() macro forwards its arguments to the
// appconfig_*() function of the same suffix, passing &netdata_config as the
// configuration to operate on.
#define config_get_float(section, name, value) appconfig_get_float(&netdata_config, section, name, value)
#define config_get_boolean(section, name, value) appconfig_get_boolean(&netdata_config, section, name, value)
#define config_get_boolean_ondemand(section, name, value) appconfig_get_boolean_ondemand(&netdata_config, section, name, value)
#define config_get_duration(section, name, value) appconfig_get_duration(&netdata_config, section, name, value)
#define config_set(section, name, default_value) appconfig_set(&netdata_config, section, name, default_value)
#define config_set_default(section, name, value) appconfig_set_default(&netdata_config, section, name, value)

View file

@ -1217,7 +1217,7 @@ int main(int argc, char **argv) {
info("netdata initialization completed. Enjoy real-time performance monitoring!");
netdata_ready = 1;
send_statistics("START","-", "-");
send_statistics("START", "-", "-");
// ------------------------------------------------------------------------
// unblock signals

View file

@ -132,7 +132,6 @@ const char *rrdset_type_name(RRDSET_TYPE chart_type) {
}
}
// ----------------------------------------------------------------------------
// RRD - cache directory
@ -154,3 +153,4 @@ char *rrdset_cache_dir(RRDHOST *host, const char *id, const char *config_section
return ret;
}

View file

@ -572,6 +572,8 @@ struct alarm_entry {
uint32_t updated_by_id;
uint32_t updates_id;
time_t last_repeat;
struct alarm_entry *next;
};
@ -686,11 +688,16 @@ struct rrdhost {
char *health_log_filename; // the alarms event log filename
size_t health_log_entries_written; // the number of alarm events writtern to the alarms event log
FILE *health_log_fp; // the FILE pointer to the open alarms event log file
uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications
uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications
// all RRDCALCs are primarily allocated and linked here
// RRDCALCs may be linked to charts at any point
// (charts may or may not exist when these are loaded)
RRDCALC *alarms;
avl_tree_lock alarms_idx_health_log;
avl_tree_lock alarms_idx_name;
ALARM_LOG health_log; // alarms historical events (event log)
uint32_t health_last_processed_id; // the last processed health id from the log
@ -1021,6 +1028,12 @@ extern collected_number rrddim_set(RRDSET *st, const char *id, collected_number
extern long align_entries_to_pagesize(RRD_MEMORY_MODE mode, long entries);
// ----------------------------------------------------------------------------
// Miscellaneous functions
extern int alarm_compare_id(void *a, void *b);
extern int alarm_compare_name(void *a, void *b);
// ----------------------------------------------------------------------------
// RRD internal functions

View file

@ -81,9 +81,9 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
if(!rc->units) rc->units = strdupz(st->units);
{
if(!rrdcalc_isrepeating(rc)) {
time_t now = now_realtime_sec();
health_alarm_log(
ALARM_ENTRY *ae = health_create_alarm_entry(
host,
rc->id,
rc->next_event_id++,
@ -104,6 +104,7 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
0,
0
);
health_alarm_log(host, ae);
}
}
@ -142,9 +143,9 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) {
RRDHOST *host = st->rrdhost;
{
if(!rrdcalc_isrepeating(rc)) {
time_t now = now_realtime_sec();
health_alarm_log(
ALARM_ENTRY *ae = health_create_alarm_entry(
host,
rc->id,
rc->next_event_id++,
@ -165,6 +166,7 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) {
0,
0
);
health_alarm_log(host, ae);
}
debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
@ -253,7 +255,7 @@ inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const ch
return host->health_log.next_alarm_id++;
}
inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
inline void rrdcalc_add_to_host(RRDHOST *host, RRDCALC *rc) {
rrdhost_check_rdlock(host);
if(rc->calculation) {
@ -301,8 +303,7 @@ inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
}
}
inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
inline RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
if(rrdcalc_exists(host, chart, rt->name, 0, 0))
@ -328,6 +329,10 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *c
rc->delay_max_duration = rt->delay_max_duration;
rc->delay_multiplier = rt->delay_multiplier;
rc->last_repeat = 0;
rc->warn_repeat_every = rt->warn_repeat_every;
rc->crit_repeat_every = rt->crit_repeat_every;
rc->group = rt->group;
rc->after = rt->after;
rc->before = rt->before;
@ -356,7 +361,7 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *c
error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
}
debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
(rc->chart)?rc->chart:"NOCHART",
rc->name,
(rc->exec)?rc->exec:"DEFAULT",
@ -376,16 +381,24 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *c
rc->delay_up_duration,
rc->delay_down_duration,
rc->delay_max_duration,
rc->delay_multiplier
rc->delay_multiplier,
rc->warn_repeat_every,
rc->crit_repeat_every
);
rrdcalc_create_part2(host, rc);
rrdcalc_add_to_host(host, rc);
RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl *)rc);
if (rdcmp != rc) {
error("Cannot insert the alarm index ID %s",rc->name);
}
return rc;
}
void rrdcalc_free(RRDCALC *rc) {
if(unlikely(!rc)) return;
expression_free(rc->calculation);
expression_free(rc->warning);
expression_free(rc->critical);
@ -413,7 +426,6 @@ void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc) {
// unlink it from RRDHOST
if(unlikely(rc == host->alarms))
host->alarms = rc->next;
else {
RRDCALC *t;
for(t = host->alarms; t && t->next != rc; t = t->next) ;
@ -425,5 +437,73 @@ void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc) {
error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
}
if (rc) {
RRDCALC *rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_health_log, (avl *)rc);
if (!rdcmp) {
error("Cannot remove the health alarm index");
}
rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_name, (avl *)rc);
if (!rdcmp) {
error("Cannot remove the health alarm index");
}
}
rrdcalc_free(rc);
}
// ----------------------------------------------------------------------------
// Alarm
/**
 * Alarm is repeating
 *
 * Look an alarm up by its id in the host's alarms_idx_health_log index and
 * report whether it has a repeat interval configured.
 *
 * @param host     the host whose alarms_idx_health_log index is searched
 * @param alarm_id the id of the alarm to look for
 *
 * @return 1 when the alarm is in the index and rrdcalc_isrepeating() is true
 *         for it, 0 when the id is not in the index or the alarm does not repeat.
 */
int alarm_isrepeating(RRDHOST *host, uint32_t alarm_id) {
    // search key: only the id is filled in, because the index is ordered
    // by alarm_compare_id(), which reads nothing else
    RRDCALC key;
    key.id = alarm_id;

    RRDCALC *found = (RRDCALC *)avl_search_lock(&host->alarms_idx_health_log, (avl *)&key);
    return found ? rrdcalc_isrepeating(found) : 0;
}
/**
 * Entry is repeating
 *
 * Check whether the alarm an alarm log entry belongs to is a repeating alarm,
 * by passing the entry's alarm_id to alarm_isrepeating().
 *
 * @param host the host whose alarm index is searched
 * @param ae   the alarm entry; only its alarm_id is read
 *
 * @return 1 when the alarm is found on the host and is repeating, 0 otherwise
 */
int alarm_entry_isrepeating(RRDHOST *host, ALARM_ENTRY *ae) {
    uint32_t alarm_id = ae->alarm_id;
    return alarm_isrepeating(host, alarm_id);
}
/**
 * Max last repeat
 *
 * Look an alarm up by name in the host's alarms_idx_name index.
 * Despite its name, this function does not compute anything itself: it only
 * returns the matching alarm, leaving it to the caller to read its fields.
 *
 * @param host       the host whose alarms_idx_name index is searched
 * @param alarm_name the name of the alarm to look for
 * @param hash       the hash of alarm_name, compared before the name itself
 *
 * @return the matching RRDCALC, or whatever avl_search_lock() returns when
 *         nothing matches (NOTE(review): presumably NULL - confirm).
 */
RRDCALC *alarm_max_last_repeat(RRDHOST *host, char *alarm_name,uint32_t hash) {
    // search key: alarm_compare_name() reads only the hash and the name
    RRDCALC key;
    key.hash = hash;
    key.name = alarm_name;

    return (RRDCALC *)avl_search_lock(&host->alarms_idx_name, (avl *)&key);
}

View file

@ -29,7 +29,9 @@
#define RRDCALC_FLAG_SILENCED 0x00000100
#define RRDCALC_FLAG_NO_CLEAR_NOTIFICATION 0x80000000
struct rrdcalc {
avl avl; // the index, with key the id - this has to be first!
uint32_t id; // the unique id of this alarm
uint32_t next_event_id; // the next event id that will be used for this alarm
@ -77,9 +79,16 @@ struct rrdcalc {
float delay_multiplier; // multiplier for all delays when alarms switch status
// while now < delay_up_to
// ------------------------------------------------------------------------
// notification repeat settings
uint32_t warn_repeat_every; // interval between repeating warning notifications
uint32_t crit_repeat_every; // interval between repeating critical notifications
// ------------------------------------------------------------------------
// runtime information
RRDCALC_STATUS old_status; // the old status of the alarm
RRDCALC_STATUS status; // the current status of the alarm
calculated_number value; // the current value of the alarm
@ -90,6 +99,7 @@ struct rrdcalc {
time_t last_updated; // the last update timestamp of the alarm
time_t next_update; // the next update timestamp of the alarm
time_t last_status_change; // the timestamp of the last time this alarm changed status
time_t last_repeat; // the last time the alarm got repeated
time_t db_after; // the first timestamp evaluated by the db lookup
time_t db_before; // the last timestamp evaluated by the db lookup
@ -119,6 +129,10 @@ struct rrdcalc {
struct rrdcalc *next;
};
extern int alarm_isrepeating(RRDHOST *host, uint32_t alarm_id);
extern int alarm_entry_isrepeating(RRDHOST *host, ALARM_ENTRY *ae);
extern RRDCALC *alarm_max_last_repeat(RRDHOST *host, char *alarm_name, uint32_t hash);
#define RRDCALC_HAS_DB_LOOKUP(rc) ((rc)->after)
extern void rrdsetcalc_link_matching(RRDSET *st);
@ -132,7 +146,14 @@ extern void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc);
extern int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name);
extern uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id);
extern RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart);
extern void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc);
extern RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart);
extern void rrdcalc_add_to_host(RRDHOST *host, RRDCALC *rc);
// Returns 1 when the alarm has a non-zero repeat interval for the warning
// or for the critical state, 0 when both intervals are zero.
static inline int rrdcalc_isrepeating(RRDCALC *rc) {
    // the common case: no repeat interval is set at all
    if (likely(rc->warn_repeat_every == 0 && rc->crit_repeat_every == 0))
        return 0;

    return 1;
}
#endif //NETDATA_RRDCALC_H

View file

@ -13,7 +13,7 @@ void rrdcalctemplate_link_matching(RRDSET *st) {
for(rt = host->templates; rt ; rt = rt->next) {
if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)
&& (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) {
RRDCALC *rc = rrdcalc_create(host, rt, st->id);
RRDCALC *rc = rrdcalc_create_from_template(host, rt, st->id);
if(unlikely(!rc))
info("Health tried to create alarm from template '%s' on chart '%s' of host '%s', but it failed", rt->name, st->id, host->hostname);

View file

@ -48,6 +48,12 @@ struct rrdcalctemplate {
int delay_max_duration; // the absolute max delay to apply to this alarm
float delay_multiplier; // multiplier for all delays when alarms switch status
// ------------------------------------------------------------------------
// notification repeat settings
uint32_t warn_repeat_every; // interval between repeating warning notifications
uint32_t crit_repeat_every; // interval between repeating critical notifications
// ------------------------------------------------------------------------
// expressions related to the alarm

View file

@ -179,6 +179,10 @@ RRDHOST *rrdhost_create(const char *hostname,
if(config_get_boolean(CONFIG_SECTION_GLOBAL, "delete orphan hosts files", 1) && !is_localhost)
rrdhost_flag_set(host, RRDHOST_FLAG_DELETE_ORPHAN_HOST);
host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
avl_init_lock(&(host->alarms_idx_health_log), alarm_compare_id);
avl_init_lock(&(host->alarms_idx_name), alarm_compare_name);
// ------------------------------------------------------------------------
// initialize health variables
@ -274,12 +278,12 @@ RRDHOST *rrdhost_create(const char *hostname,
// load health configuration
if(host->health_enabled) {
health_alarm_log_load(host);
health_alarm_log_open(host);
rrdhost_wrlock(host);
health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL);
rrdhost_unlock(host);
health_alarm_log_load(host);
health_alarm_log_open(host);
}
@ -876,3 +880,43 @@ int rrdhost_set_system_info_variable(struct rrdhost_system_info *system_info, ch
return res;
}
/**
 * Alarm Compare ID
 *
 * Comparison callback for the binary tree that indexes RRDCALC entries by id.
 *
 * @param a pointer to the first RRDCALC to compare
 * @param b pointer to the second RRDCALC to compare
 *
 * @return 0 when both ids are equal, 1 when the id of a is bigger than the
 *         id of b and -1 when it is smaller.
 */
int alarm_compare_id(void *a, void *b) {
    uint32_t left  = ((RRDCALC *)a)->id;
    uint32_t right = ((RRDCALC *)b)->id;

    // (left > right) and (left < right) are each 0 or 1, so the difference is -1, 0 or 1
    return (left > right) - (left < right);
}
/**
* Alarm Compare NAME
*
* Callback function used with the binary trees to compare the name of RRDCALC
*
* @param a a pointer to the RRDCAL item to insert,compare or update the binary tree
* @param b the pointer to the binary tree.
*
* @return It returns 0 case the values are equal, 1 case a is bigger than b and -1 case a is smaller than b.
*/
int alarm_compare_name(void *a, void *b) {
RRDCALC *in1 = (RRDCALC *)a;
RRDCALC *in2 = (RRDCALC *)b;
if(in1->hash < in2->hash) return -1;
else if(in1->hash > in2->hash) return 1;
return strcmp(in1->name,in2->name);
}

View file

@ -11,7 +11,6 @@ packet dropped).
Netdata also supports alarm **templates**, so that an alarm can be attached to all the charts of the same context (i.e. all network interfaces, or all disks, or all mysql servers, etc.).
Each alarm can execute a single query to the database using statistical algorithms against past data,
but alarms can be combined. So, if you need 2 queries in the database, you can combine
2 alarms together (both will run a query to the database, and the results can be combined).
@ -342,6 +341,24 @@ delay: [[[up U] [down D] multiplier M] max X]
their matching one) and a delay is in place.
- All are reset to their defaults when the alarm switches state without a delay in place.
---
#### Alarm line `repeat`
Defines the interval between repeated notifications while the alarm is in the CRITICAL or WARNING state. This overrides the default intervals inherited from the health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION`, which can be found in the health stock configuration.
Format:
```
repeat: [off] [warning DURATION] [critical DURATION]
```
* `off`: Turns off the repeating feature for the current alarm. This is effective when the default repeat settings have been enabled in health configuration.
* `warning DURATION`: Defines the interval when the alarm is in WARNING state. Use `0s` to turn off the repeating notification for WARNING mode.
* `critical DURATION`: Defines the interval when the alarm is in CRITICAL state. Use `0s` to turn off the repeating notification for CRITICAL mode.
---
#### Alarm line `option`
The only possible value for the `option` line is
@ -567,12 +584,15 @@ template: disk_full_percent
every: 1m
warn: $this > 80
crit: $this > 95
repeat: warning 120s critical 10s
```
`$used` and `$avail` are the `used` and `avail` chart dimensions as shown on the dashboard.
So, the `calc` line finds the percentage of used space. `$this` resolves to this percentage.
This is a repeating alarm and if the alarm becomes CRITICAL it repeats the notifications every 10 seconds. It also repeats notifications every 2 minutes if the alarm goes into WARNING mode.
### Example 3
Predict if any disk will run out of space in the near future.

View file

@ -255,17 +255,18 @@ static inline void health_alarm_log_process(RRDHOST *host) {
netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
ALARM_ENTRY *ae;
for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id ; ae = ae->next) {
if(unlikely(
!(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
!(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
if(likely(!alarm_entry_isrepeating(host, ae))) {
if(unlikely(
!(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
!(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
)) {
if(unlikely(ae->unique_id < first_waiting))
first_waiting = ae->unique_id;
if(unlikely(ae->unique_id < first_waiting))
first_waiting = ae->unique_id;
if(likely(now >= ae->delay_up_to_timestamp))
health_process_notifications(host, ae);
if(likely(now >= ae->delay_up_to_timestamp))
health_process_notifications(host, ae);
}
}
}
@ -295,6 +296,10 @@ static inline void health_alarm_log_process(RRDHOST *host) {
ALARM_ENTRY *t = ae->next;
health_alarm_log_free_one_nochecks_nounlink(ae);
if(likely(!alarm_entry_isrepeating(host, ae))) {
health_alarm_log_free_one_nochecks_nounlink(ae);
host->health_log.count--;
}
ae = t;
host->health_log.count--;
@ -411,7 +416,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
} else {
debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
, (silencers->stype==STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
, (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
, rc->name
, (rc->rrdset)?rc->rrdset->context:""
, rc->chart
@ -756,20 +761,22 @@ void *health_main(void *ptr) {
rc->delay_last = delay;
rc->delay_up_to_timestamp = now + delay;
health_alarm_log(
host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
rc->delay_last,
(
((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
)
);
rc->last_status_change = now;
rc->status = status;
if(likely(!rrdcalc_isrepeating(rc))) {
ALARM_ENTRY *ae = health_create_alarm_entry(
host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
rc->delay_last,
(
((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
)
);
health_alarm_log(host, ae);
}
rc->last_status_change = now;
rc->old_status = rc->status;
rc->status = status;
}
rc->last_updated = now;
@ -779,6 +786,35 @@ void *health_main(void *ptr) {
next_run = rc->next_update;
}
// process repeating alarms
RRDCALC *rc;
for(rc = host->alarms; rc ; rc = rc->next) {
int repeat_every = 0;
if(unlikely(rrdcalc_isrepeating(rc))) {
if(unlikely(rc->status == RRDCALC_STATUS_WARNING))
repeat_every = rc->warn_repeat_every;
else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL))
repeat_every = rc->crit_repeat_every;
}
if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
rc->last_repeat = now;
ALARM_ENTRY *ae = health_create_alarm_entry(
host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
rc->delay_last,
(
((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
)
);
ae->last_repeat = rc->last_repeat;
health_process_notifications(host, ae);
debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
health_alarm_log_free_one_nochecks_nounlink(ae);
}
}
rrdhost_unlock(host);
}

View file

@ -108,7 +108,7 @@ extern void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae);
extern ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename);
extern void health_alarm_log_load(RRDHOST *host);
extern void health_alarm_log(
extern ALARM_ENTRY* health_create_alarm_entry(
RRDHOST *host,
uint32_t alarm_id,
uint32_t alarm_event_id,
@ -129,6 +129,8 @@ extern void health_alarm_log(
int delay,
uint32_t flags);
extern void health_alarm_log(RRDHOST *host, ALARM_ENTRY *ae);
extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath);
extern char *health_user_config_dir(void);
extern char *health_stock_config_dir(void);

View file

@ -23,6 +23,7 @@
// Key names of health configuration lines. health_readfile() hashes these
// and compares them (case-insensitively) with the key of each line it reads;
// HEALTH_REPEAT_KEY selects the line handled by health_parse_repeat().
#define HEALTH_INFO_KEY "info"
#define HEALTH_DELAY_KEY "delay"
#define HEALTH_OPTIONS_KEY "options"
#define HEALTH_REPEAT_KEY "repeat"
static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
if(!rc->chart) {
@ -45,7 +46,7 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id);
debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
rc->chart?rc->chart:"NOCHART",
rc->name,
rc->id,
@ -66,10 +67,12 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
rc->delay_up_duration,
rc->delay_down_duration,
rc->delay_max_duration,
rc->delay_multiplier
rc->delay_multiplier,
rc->warn_repeat_every,
rc->crit_repeat_every
);
rrdcalc_create_part2(host, rc);
rrdcalc_add_to_host(host, rc);
return 1;
}
@ -100,7 +103,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
}
}
debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
rt->name,
(rt->context)?rt->context:"NONE",
(rt->exec)?rt->exec:"DEFAULT",
@ -120,7 +123,9 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
rt->delay_up_duration,
rt->delay_down_duration,
rt->delay_max_duration,
rt->delay_multiplier
rt->delay_multiplier,
rt->warn_repeat_every,
rt->crit_repeat_every
);
if(likely(last)) {
@ -134,48 +139,6 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
return 1;
}
/**
 * Parse a duration
 *
 * Convert a string made of a number and an optional one-letter unit into
 * seconds. The unit is the first character that str2ld() did not consume:
 *
 *   Y = 365 days, M = 30 days, w = 7 days, d = days,
 *   h = hours, m = minutes, s = seconds
 *
 * Any other trailing character, or no trailing character at all, means
 * seconds. The letters are case sensitive ('M' is months, 'm' is minutes).
 *
 * @param string the text to parse; it has to start with a digit, '+' or '-'
 * @param result where the duration in seconds is stored; set to 0 on failure
 *
 * @return 1 on success, 0 when the string is empty or does not start
 *         with a digit or a sign.
 */
static inline int health_parse_duration(char *string, int *result) {
    // make sure it is a number
    // (the <ctype.h> functions are undefined for negative values, so the
    // character is converted to unsigned char before it is classified)
    unsigned char first = (unsigned char)*string;
    if(!first || !(isdigit(first) || first == '+' || first == '-')) {
        *result = 0;
        return 0;
    }

    char *e = NULL;
    calculated_number n = str2ld(string, &e);

    if(e && *e) {
        switch (*e) {
            case 'Y':
                *result = (int) (n * 86400 * 365);
                break;

            case 'M':
                *result = (int) (n * 86400 * 30);
                break;

            case 'w':
                *result = (int) (n * 86400 * 7);
                break;

            case 'd':
                *result = (int) (n * 86400);
                break;

            case 'h':
                *result = (int) (n * 3600);
                break;

            case 'm':
                *result = (int) (n * 60);
                break;

            // unknown units are treated as seconds
            default:
            case 's':
                *result = (int) (n);
                break;
        }
    }
    else
        *result = (int)(n);

    return 1;
}
static inline int health_parse_delay(
size_t line, const char *filename, char *string,
int *delay_up_duration,
@ -202,14 +165,14 @@ static inline int health_parse_delay(
while(*s && isspace(*s)) *s++ = '\0';
if(!strcasecmp(key, "up")) {
if (!health_parse_duration(value, delay_up_duration)) {
if (!config_parse_duration(value, delay_up_duration)) {
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
line, filename, value, key);
}
else given_up = 1;
}
else if(!strcasecmp(key, "down")) {
if (!health_parse_duration(value, delay_down_duration)) {
if (!config_parse_duration(value, delay_down_duration)) {
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
line, filename, value, key);
}
@ -224,7 +187,7 @@ static inline int health_parse_delay(
else given_multiplier = 1;
}
else if(!strcasecmp(key, "max")) {
if (!health_parse_duration(value, delay_max_duration)) {
if (!config_parse_duration(value, delay_max_duration)) {
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
line, filename, value, key);
}
@ -285,6 +248,50 @@ static inline uint32_t health_parse_options(const char *s) {
return options;
}
/**
 * Parse one repeat interval
 *
 * Helper of health_parse_repeat(): parse the duration given for the
 * 'warning' or the 'critical' keyword and store it.
 *
 * The duration is parsed into a signed int and converted afterwards, instead
 * of letting the parser write through a casted uint32_t pointer. A negative
 * duration would otherwise be stored as a huge unsigned interval, which
 * rrdcalc_isrepeating() accepts as "repeating" although no repeated
 * notification can ever be scheduled with it.
 *
 * @param line         the line number, for error reporting
 * @param file         the file name, for error reporting
 * @param key          the keyword the value was given for
 * @param value        the duration to parse
 * @param repeat_every the interval to update
 */
static inline void health_parse_repeat_interval(
        size_t line,
        const char *file,
        const char *key,
        char *value,
        uint32_t *repeat_every
) {
    // start from the current setting: what the parser leaves in its output
    // when it fails is not visible here, so this keeps storing exactly
    // what it leaves there
    int seconds = (int)*repeat_every;

    if(!config_parse_duration(value, &seconds)) {
        error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
              line, file, value, key);
    }
    else if(seconds < 0) {
        error("Health configuration at line %zu of file '%s': negative duration '%s' for '%s' keyword, repeating is turned off for it",
              line, file, value, key);
        seconds = 0;
    }

    *repeat_every = (uint32_t)seconds;
}

/**
 * Parse the 'repeat' line
 *
 * Parse the value of a 'repeat' health configuration line:
 *
 *   repeat: [off] [warning DURATION] [critical DURATION]
 *
 * The string is split in place (whitespace is overwritten with NUL bytes)
 * and consumed as KEY VALUE pairs:
 *
 *   - 'off' sets both intervals to 0 and stops parsing,
 *   - 'warning' and 'critical' set the respective interval,
 *   - any other key is skipped, together with the word following it,
 *     without an error.
 *
 * Keys are matched case-insensitively. A duration that cannot be parsed,
 * or that is negative, is reported with error().
 *
 * @param line              the line number, for error reporting
 * @param file              the file name, for error reporting
 * @param string            the value of the line; it is modified
 * @param warn_repeat_every the interval between repeated warning notifications
 * @param crit_repeat_every the interval between repeated critical notifications
 *
 * @return always 1
 */
static inline int health_parse_repeat(
        size_t line,
        const char *file,
        char *string,
        uint32_t *warn_repeat_every,
        uint32_t *crit_repeat_every
) {
    char *s = string;

    while(*s) {
        // the <ctype.h> functions are undefined for negative values,
        // so every character is converted to unsigned char first
        char *key = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!strcasecmp(key, "off")) {
            *warn_repeat_every = 0;
            *crit_repeat_every = 0;
            return 1;
        }

        if(!strcasecmp(key, "warning"))
            health_parse_repeat_interval(line, file, key, value, warn_repeat_every);
        else if(!strcasecmp(key, "critical"))
            health_parse_repeat_interval(line, file, key, value, crit_repeat_every);
    }

    return 1;
}
static inline int health_parse_db_lookup(
size_t line, const char *filename, char *string,
RRDR_GROUPING *group_method, int *after, int *before, int *every,
@ -322,7 +329,7 @@ static inline int health_parse_db_lookup(
while(*s && !isspace(*s)) s++;
while(*s && isspace(*s)) *s++ = '\0';
if(!health_parse_duration(key, after)) {
if(!config_parse_duration(key, after)) {
error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method",
line, filename, key);
return 0;
@ -343,7 +350,7 @@ static inline int health_parse_db_lookup(
while(*s && !isspace(*s)) s++;
while(*s && isspace(*s)) *s++ = '\0';
if (!health_parse_duration(value, before)) {
if (!config_parse_duration(value, before)) {
error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
line, filename, value, key);
}
@ -353,7 +360,7 @@ static inline int health_parse_db_lookup(
while(*s && !isspace(*s)) s++;
while(*s && isspace(*s)) *s++ = '\0';
if (!health_parse_duration(value, every)) {
if (!config_parse_duration(value, every)) {
error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
line, filename, value, key);
}
@ -430,7 +437,8 @@ static int health_readfile(const char *filename, void *data) {
hash_info = 0,
hash_recipient = 0,
hash_delay = 0,
hash_options = 0;
hash_options = 0,
hash_repeat = 0;
char buffer[HEALTH_CONF_MAX_LINE + 1];
@ -454,6 +462,7 @@ static int health_readfile(const char *filename, void *data) {
hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
hash_delay = simple_uhash(HEALTH_DELAY_KEY);
hash_options = simple_uhash(HEALTH_OPTIONS_KEY);
hash_repeat = simple_uhash(HEALTH_REPEAT_KEY);
}
FILE *fp = fopen(filename, "r");
@ -532,6 +541,9 @@ static int health_readfile(const char *filename, void *data) {
rc->value = NAN;
rc->old_value = NAN;
rc->delay_multiplier = 1.0;
rc->old_status = RRDCALC_STATUS_UNINITIALIZED;
rc->warn_repeat_every = host->health_default_warn_repeat_every;
rc->crit_repeat_every = host->health_default_crit_repeat_every;
if(rrdvar_fix_name(rc->name))
error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
@ -556,6 +568,8 @@ static int health_readfile(const char *filename, void *data) {
rt->green = NAN;
rt->red = NAN;
rt->delay_multiplier = 1.0;
rt->warn_repeat_every = host->health_default_warn_repeat_every;
rt->crit_repeat_every = host->health_default_crit_repeat_every;
if(rrdvar_fix_name(rt->name))
error("Health configuration renamed template '%s' to '%s'", value, rt->name);
@ -612,7 +626,7 @@ static int health_readfile(const char *filename, void *data) {
&rc->options, &rc->dimensions);
}
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
if(!health_parse_duration(value, &rc->update_every))
if(!config_parse_duration(value, &rc->update_every))
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
line, filename, rc->name, key, value);
}
@ -707,6 +721,11 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
rc->options |= health_parse_options(value);
}
else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
health_parse_repeat(line, filename, value,
&rc->warn_repeat_every,
&rc->crit_repeat_every);
}
else {
error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.",
line, filename, rc->name, key);
@ -736,7 +755,7 @@ static int health_readfile(const char *filename, void *data) {
&rt->update_every, &rt->options, &rt->dimensions);
}
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
if(!health_parse_duration(value, &rt->update_every))
if(!config_parse_duration(value, &rt->update_every))
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
line, filename, rt->name, key, value);
}
@ -831,6 +850,11 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
rt->options |= health_parse_options(value);
}
else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
health_parse_repeat(line, filename, value,
&rt->warn_repeat_every,
&rt->crit_repeat_every);
}
else {
error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.",
line, filename, rt->name, key);

View file

@ -140,6 +140,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
"\t\t\t\"delay_multiplier\": %f,\n"
"\t\t\t\"delay\": %d,\n"
"\t\t\t\"delay_up_to_timestamp\": %lu,\n"
"\t\t\t\"warn_repeat_every\": \"%u\",\n"
"\t\t\t\"crit_repeat_every\": \"%u\",\n"
"\t\t\t\"value_string\": \"%s\",\n"
, rc->chart, rc->name
, (unsigned long)rc->id
@ -165,6 +167,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
, rc->delay_multiplier
, rc->delay_last
, (unsigned long)rc->delay_up_to_timestamp
, rc->warn_repeat_every
, rc->crit_repeat_every
, value_string
);

View file

@ -79,6 +79,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
"\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
"\t%d\t%d\t%d\t%d"
"\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO
"\t%016lx"
"\n"
, (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
, host->hostname
@ -112,6 +113,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
, ae->new_value
, ae->old_value
, (uint64_t)ae->last_repeat
) < 0))
error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename);
else {
@ -174,10 +176,40 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
continue;
}
// Check if we got last_repeat field
time_t last_repeat = 0;
if(entries > 27) {
char* alarm_name = pointers[13];
last_repeat = (time_t)strtoul(pointers[27], NULL, 16);
RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
if (!rc) {
for(rc = host->alarms; rc ; rc = rc->next) {
RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl *)rc);
if(rdcmp != rc) {
error("Cannot insert the alarm index ID using log %s", rc->name);
}
}
rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
}
if(unlikely(rc)) {
if (rrdcalc_isrepeating(rc)) {
rc->last_repeat = last_repeat;
// We iterate through repeating alarm entries only to
// find the latest last_repeat timestamp. Otherwise,
// there is no need to keep them in memory.
continue;
}
}
}
if(unlikely(*pointers[0] == 'A')) {
// make sure it is properly numbered
if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
error("HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it.", host->hostname, line, filename, unique_id);
error( "HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it."
, host->hostname, line, filename, unique_id);
errored++;
continue;
}
@ -186,11 +218,11 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
}
else if(unlikely(*pointers[0] == 'U')) {
// find the original
for(ae = host->health_log.alarms; ae; ae = ae->next) {
for(ae = host->health_log.alarms; ae ; ae = ae->next) {
if(unlikely(unique_id == ae->unique_id)) {
if(unlikely(*pointers[0] == 'A')) {
error("HEALTH [%s]: line %zu of file '%s' adds duplicate alarm log entry %u. Using the later."
, host->hostname, line, filename, unique_id);
, host->hostname, line, filename, unique_id);
*pointers[0] = 'U';
duplicate++;
}
@ -270,6 +302,8 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
ae->new_value = str2l(pointers[25]);
ae->old_value = str2l(pointers[26]);
ae->last_repeat = last_repeat;
char value_string[100 + 1];
freez(ae->old_value_string);
freez(ae->new_value_string);
@ -339,7 +373,7 @@ inline void health_alarm_log_load(RRDHOST *host) {
// ----------------------------------------------------------------------------
// health alarm log management
inline void health_alarm_log(
inline ALARM_ENTRY* health_create_alarm_entry(
RRDHOST *host,
uint32_t alarm_id,
uint32_t alarm_event_id,
@ -398,9 +432,24 @@ inline void health_alarm_log(
ae->delay_up_to_timestamp = when + delay;
ae->flags |= flags;
ae->last_repeat = 0;
if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
ae->non_clear_duration += ae->duration;
return ae;
}
inline void health_alarm_log(
RRDHOST *host,
ALARM_ENTRY *ae
) {
debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id);
if(unlikely(alarm_entry_isrepeating(host, ae))) {
error("Repeating alarms cannot be added to host's alarm log entries. It seems somewhere in the logic, API is being misused. Alarm id: %u", ae->alarm_id);
return;
}
// link it
netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
ae->next = host->health_log.alarms;

View file

@ -411,6 +411,27 @@ int appconfig_set_boolean(struct config *root, const char *section, const char *
return value;
}
/**
 * Get Duration
 *
 * Read a configuration option and parse it as a duration.
 *
 * The option is looked up with appconfig_get(), passing `value` as its default,
 * and the string obtained is parsed with config_parse_duration(). When there is
 * no string to parse, or it cannot be parsed, the default `value` is parsed instead.
 *
 * @param root    the configuration to read the option from
 * @param section the section of the option
 * @param name    the name of the option
 * @param value   the default duration, given as a string
 *
 * @return The parsed duration, or 0 when neither the configured string nor the default can be parsed.
 */
int appconfig_get_duration(struct config *root, const char *section, const char *name, const char *value)
{
    int result = 0;

    const char *s = appconfig_get(root, section, name, value);
    if(s) {
        if(config_parse_duration(s, &result))
            return result;

        error("config option '[%s].%s = %s' is configured with an invalid duration", section, name, s);
    }

    // the configured value is missing or unusable: fall back to the supplied default.
    // A NULL default is reported instead of being handed to the parser, which dereferences its input.
    if(!value || !config_parse_duration(value, &result))
        error("INTERNAL ERROR: default duration supplied for option '[%s].%s = %s' is not a valid duration", section, name, value ? value : "(null)");

    return result;
}
// ----------------------------------------------------------------------------
// config load/save
@ -586,3 +607,65 @@ void appconfig_generate(struct config *root, BUFFER *wb, int only_changed)
appconfig_unlock(root);
}
}
/**
 * Parse Duration
 *
 * Parse a duration string and store the result, in seconds.
 *
 * Leading whitespace is skipped. The word "never" is accepted and gives 0.
 * Otherwise the string must start with a digit or a sign; the number is read
 * with str2ld() and scaled by the unit suffix that follows it:
 * Y (x31536000), M (x2592000), w (x604800), d (x86400), h (x3600), m (x60), s (x1).
 * No suffix, or any other character right after the number, leaves the number unscaled.
 *
 * @param string the duration string
 * @param result the output variable; it is set to 0 when parsing fails
 *
 * @return It returns 1 on success and 0 otherwise
 */
int config_parse_duration(const char* string, int* result) {
    // the <ctype.h> classifiers need a value representable as unsigned char;
    // a plain char may be negative for bytes above 0x7f
    while(*string && isspace((unsigned char)*string)) string++;

    if(unlikely(!*string)) goto fallback;

    if(*string == 'n' && !strcmp(string, "never")) {
        // this is a valid option
        *result = 0;
        return 1;
    }

    // make sure it is a number
    if(!(isdigit((unsigned char)*string) || *string == '+' || *string == '-')) goto fallback;

    char *e = NULL;
    calculated_number n = str2ld(string, &e);

    // seconds per unit; stays 1 when there is no suffix or the suffix is not recognized
    int multiplier = 1;
    if(e && *e) {
        switch (*e) {
            case 'Y':
                multiplier = 31536000;
                break;

            case 'M':
                multiplier = 2592000;
                break;

            case 'w':
                multiplier = 604800;
                break;

            case 'd':
                multiplier = 86400;
                break;

            case 'h':
                multiplier = 3600;
                break;

            case 'm':
                multiplier = 60;
                break;

            case 's':
            default:
                break;
        }
    }

    *result = (int) (n * multiplier);
    return 1;

fallback:
    *result = 0;
    return 0;
}

View file

@ -119,6 +119,7 @@ extern long long appconfig_get_number(struct config *root, const char *section,
extern LONG_DOUBLE appconfig_get_float(struct config *root, const char *section, const char *name, LONG_DOUBLE value);
extern int appconfig_get_boolean(struct config *root, const char *section, const char *name, int value);
extern int appconfig_get_boolean_ondemand(struct config *root, const char *section, const char *name, int value);
extern int appconfig_get_duration(struct config *root, const char *section, const char *name, const char *value);
extern const char *appconfig_set(struct config *root, const char *section, const char *name, const char *value);
extern const char *appconfig_set_default(struct config *root, const char *section, const char *name, const char *value);
@ -133,4 +134,6 @@ extern void appconfig_generate(struct config *root, BUFFER *wb, int only_changed
extern int appconfig_section_compare(void *a, void *b);
extern int config_parse_duration(const char* string, int* result);
#endif /* NETDATA_CONFIG_H */

View file

@ -1305,6 +1305,14 @@
"crit_parsed": {
"type": "string"
},
"warn_repeat_every": {
"type": "integer",
"format": "int32"
},
"crit_repeat_every": {
"type": "integer",
"format": "int32"
},
"green": {
"type": "string",
"format": "nullable"

View file

@ -892,6 +892,12 @@ definitions:
type: string
crit_parsed:
type: string
warn_repeat_every:
type: integer
format: int32
crit_repeat_every:
type: integer
format: int32
green:
type: string
format: nullable

View file

@ -2100,6 +2100,14 @@ function alarmsUpdateModal() {
+ ((chart.red !== null) ? ('<tr><td width="10%" style="text-align:right">red&nbsp;threshold</td><td><code>' + chart.red + ' ' + units + '</code></td></tr>') : '');
}
if (alarm.warn_repeat_every > 0) {
html += '<tr><td width="10%" style="text-align:right">repeat&nbsp;warning</td><td>' + NETDATA.seconds4human(alarm.warn_repeat_every) + '</td></tr>';
}
if (alarm.crit_repeat_every > 0) {
html += '<tr><td width="10%" style="text-align:right">repeat&nbsp;critical</td><td>' + NETDATA.seconds4human(alarm.crit_repeat_every) + '</td></tr>';
}
var delay = '';
if ((alarm.delay_up_duration > 0 || alarm.delay_down_duration > 0) && alarm.delay_multiplier !== 0 && alarm.delay_max_duration > 0) {
if (alarm.delay_up_duration === alarm.delay_down_duration) {