mirror of
https://github.com/netdata/netdata.git
synced 2025-04-14 09:38:34 +00:00
Repeating alarm notifications (#6309)
* Alarm_repeat mergin the original! * Alarm_repeat binary tree! * Alarm_repeat binary tree finished! * Alarm_repeat move function and format string * Alarms bringing a new Binary tree * Alarms fixing the last two * Alarm_repeat useless var! * Alarm fix format and repeat alarm! * Alarm_backend steps! * Alarm_repeat stopping to test cloud! * Alarm_repeat stopping to test cloud 2! * Alarm_repeat fixing when restart!
This commit is contained in:
parent
266cbec7a8
commit
dd73f3e0cd
20 changed files with 513 additions and 105 deletions
|
@ -14,6 +14,7 @@
|
|||
#define config_get_float(section, name, value) appconfig_get_float(&netdata_config, section, name, value)
|
||||
#define config_get_boolean(section, name, value) appconfig_get_boolean(&netdata_config, section, name, value)
|
||||
#define config_get_boolean_ondemand(section, name, value) appconfig_get_boolean_ondemand(&netdata_config, section, name, value)
|
||||
#define config_get_duration(section, name, value) appconfig_get_duration(&netdata_config, section, name, value)
|
||||
|
||||
#define config_set(section, name, default_value) appconfig_set(&netdata_config, section, name, default_value)
|
||||
#define config_set_default(section, name, value) appconfig_set_default(&netdata_config, section, name, value)
|
||||
|
|
|
@ -1217,7 +1217,7 @@ int main(int argc, char **argv) {
|
|||
info("netdata initialization completed. Enjoy real-time performance monitoring!");
|
||||
netdata_ready = 1;
|
||||
|
||||
send_statistics("START","-", "-");
|
||||
send_statistics("START", "-", "-");
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// unblock signals
|
||||
|
|
|
@ -132,7 +132,6 @@ const char *rrdset_type_name(RRDSET_TYPE chart_type) {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// RRD - cache directory
|
||||
|
||||
|
@ -154,3 +153,4 @@ char *rrdset_cache_dir(RRDHOST *host, const char *id, const char *config_section
|
|||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
|
|
@ -572,6 +572,8 @@ struct alarm_entry {
|
|||
uint32_t updated_by_id;
|
||||
uint32_t updates_id;
|
||||
|
||||
time_t last_repeat;
|
||||
|
||||
struct alarm_entry *next;
|
||||
};
|
||||
|
||||
|
@ -686,11 +688,16 @@ struct rrdhost {
|
|||
char *health_log_filename; // the alarms event log filename
|
||||
size_t health_log_entries_written; // the number of alarm events writtern to the alarms event log
|
||||
FILE *health_log_fp; // the FILE pointer to the open alarms event log file
|
||||
uint32_t health_default_warn_repeat_every; // the default value for the interval between repeating warning notifications
|
||||
uint32_t health_default_crit_repeat_every; // the default value for the interval between repeating critical notifications
|
||||
|
||||
|
||||
// all RRDCALCs are primarily allocated and linked here
|
||||
// RRDCALCs may be linked to charts at any point
|
||||
// (charts may or may not exist when these are loaded)
|
||||
RRDCALC *alarms;
|
||||
avl_tree_lock alarms_idx_health_log;
|
||||
avl_tree_lock alarms_idx_name;
|
||||
|
||||
ALARM_LOG health_log; // alarms historical events (event log)
|
||||
uint32_t health_last_processed_id; // the last processed health id from the log
|
||||
|
@ -1021,6 +1028,12 @@ extern collected_number rrddim_set(RRDSET *st, const char *id, collected_number
|
|||
|
||||
extern long align_entries_to_pagesize(RRD_MEMORY_MODE mode, long entries);
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Miscellaneous functions
|
||||
|
||||
extern int alarm_compare_id(void *a, void *b);
|
||||
extern int alarm_compare_name(void *a, void *b);
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// RRD internal functions
|
||||
|
||||
|
|
|
@ -81,9 +81,9 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
|
|||
|
||||
if(!rc->units) rc->units = strdupz(st->units);
|
||||
|
||||
{
|
||||
if(!rrdcalc_isrepeating(rc)) {
|
||||
time_t now = now_realtime_sec();
|
||||
health_alarm_log(
|
||||
ALARM_ENTRY *ae = health_create_alarm_entry(
|
||||
host,
|
||||
rc->id,
|
||||
rc->next_event_id++,
|
||||
|
@ -104,6 +104,7 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
|
|||
0,
|
||||
0
|
||||
);
|
||||
health_alarm_log(host, ae);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -142,9 +143,9 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) {
|
|||
|
||||
RRDHOST *host = st->rrdhost;
|
||||
|
||||
{
|
||||
if(!rrdcalc_isrepeating(rc)) {
|
||||
time_t now = now_realtime_sec();
|
||||
health_alarm_log(
|
||||
ALARM_ENTRY *ae = health_create_alarm_entry(
|
||||
host,
|
||||
rc->id,
|
||||
rc->next_event_id++,
|
||||
|
@ -165,6 +166,7 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) {
|
|||
0,
|
||||
0
|
||||
);
|
||||
health_alarm_log(host, ae);
|
||||
}
|
||||
|
||||
debug(D_HEALTH, "Health unlinking alarm '%s.%s' from chart '%s' of host '%s'", rc->chart?rc->chart:"NOCHART", rc->name, st->id, host->hostname);
|
||||
|
@ -253,7 +255,7 @@ inline uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const ch
|
|||
return host->health_log.next_alarm_id++;
|
||||
}
|
||||
|
||||
inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
|
||||
inline void rrdcalc_add_to_host(RRDHOST *host, RRDCALC *rc) {
|
||||
rrdhost_check_rdlock(host);
|
||||
|
||||
if(rc->calculation) {
|
||||
|
@ -301,8 +303,7 @@ inline void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc) {
|
|||
}
|
||||
}
|
||||
|
||||
inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
|
||||
|
||||
inline RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart) {
|
||||
debug(D_HEALTH, "Health creating dynamic alarm (from template) '%s.%s'", chart, rt->name);
|
||||
|
||||
if(rrdcalc_exists(host, chart, rt->name, 0, 0))
|
||||
|
@ -328,6 +329,10 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *c
|
|||
rc->delay_max_duration = rt->delay_max_duration;
|
||||
rc->delay_multiplier = rt->delay_multiplier;
|
||||
|
||||
rc->last_repeat = 0;
|
||||
rc->warn_repeat_every = rt->warn_repeat_every;
|
||||
rc->crit_repeat_every = rt->crit_repeat_every;
|
||||
|
||||
rc->group = rt->group;
|
||||
rc->after = rt->after;
|
||||
rc->before = rt->before;
|
||||
|
@ -356,7 +361,7 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *c
|
|||
error("Health alarm '%s.%s': failed to re-parse critical expression '%s'", chart, rt->name, rt->critical->source);
|
||||
}
|
||||
|
||||
debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
|
||||
debug(D_HEALTH, "Health runtime added alarm '%s.%s': exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
|
||||
(rc->chart)?rc->chart:"NOCHART",
|
||||
rc->name,
|
||||
(rc->exec)?rc->exec:"DEFAULT",
|
||||
|
@ -376,16 +381,24 @@ inline RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *c
|
|||
rc->delay_up_duration,
|
||||
rc->delay_down_duration,
|
||||
rc->delay_max_duration,
|
||||
rc->delay_multiplier
|
||||
rc->delay_multiplier,
|
||||
rc->warn_repeat_every,
|
||||
rc->crit_repeat_every
|
||||
);
|
||||
|
||||
rrdcalc_create_part2(host, rc);
|
||||
rrdcalc_add_to_host(host, rc);
|
||||
RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_health_log,(avl *)rc);
|
||||
if (rdcmp != rc) {
|
||||
error("Cannot insert the alarm index ID %s",rc->name);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
void rrdcalc_free(RRDCALC *rc) {
|
||||
if(unlikely(!rc)) return;
|
||||
|
||||
|
||||
expression_free(rc->calculation);
|
||||
expression_free(rc->warning);
|
||||
expression_free(rc->critical);
|
||||
|
@ -413,7 +426,6 @@ void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc) {
|
|||
// unlink it from RRDHOST
|
||||
if(unlikely(rc == host->alarms))
|
||||
host->alarms = rc->next;
|
||||
|
||||
else {
|
||||
RRDCALC *t;
|
||||
for(t = host->alarms; t && t->next != rc; t = t->next) ;
|
||||
|
@ -425,5 +437,73 @@ void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc) {
|
|||
error("Cannot unlink alarm '%s.%s' from host '%s': not found", rc->chart?rc->chart:"NOCHART", rc->name, host->hostname);
|
||||
}
|
||||
|
||||
if (rc) {
|
||||
RRDCALC *rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_health_log, (avl *)rc);
|
||||
if (!rdcmp) {
|
||||
error("Cannot remove the health alarm index");
|
||||
}
|
||||
|
||||
rdcmp = (RRDCALC *) avl_remove_lock(&(host)->alarms_idx_name, (avl *)rc);
|
||||
if (!rdcmp) {
|
||||
error("Cannot remove the health alarm index");
|
||||
}
|
||||
}
|
||||
|
||||
rrdcalc_free(rc);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Alarm
|
||||
|
||||
|
||||
/**
|
||||
* Alarm is repeating
|
||||
*
|
||||
* Is this alarm repeating ?
|
||||
*
|
||||
* @param host The structure that has the binary tree
|
||||
* @param alarm_id the id of the alarm to search
|
||||
*
|
||||
* @return It returns 1 case it is repeating and 0 otherwise
|
||||
*/
|
||||
int alarm_isrepeating(RRDHOST *host, uint32_t alarm_id) {
|
||||
RRDCALC findme;
|
||||
findme.id = alarm_id;
|
||||
RRDCALC *rc = (RRDCALC *)avl_search_lock(&host->alarms_idx_health_log, (avl *)&findme);
|
||||
if (!rc) {
|
||||
return 0;
|
||||
}
|
||||
return rrdcalc_isrepeating(rc);
|
||||
}
|
||||
|
||||
/**
|
||||
* Entry is repeating
|
||||
*
|
||||
* Check whether the id of alarm entry is yet present in the host structure
|
||||
*
|
||||
* @param host The structure that has the binary tree
|
||||
* @param ae the alarm entry
|
||||
*
|
||||
* @return It returns 1 case it is repeating and 0 otherwise
|
||||
*/
|
||||
int alarm_entry_isrepeating(RRDHOST *host, ALARM_ENTRY *ae) {
|
||||
return alarm_isrepeating(host, ae->alarm_id);
|
||||
}
|
||||
|
||||
/**
|
||||
* Max last repeat
|
||||
*
|
||||
* Check the maximum last_repeat for the alarms associated a host
|
||||
*
|
||||
* @param host The structure that has the binary tree
|
||||
*
|
||||
* @return It returns 1 case it is repeating and 0 otherwise
|
||||
*/
|
||||
RRDCALC *alarm_max_last_repeat(RRDHOST *host, char *alarm_name,uint32_t hash) {
|
||||
RRDCALC findme;
|
||||
findme.name = alarm_name;
|
||||
findme.hash = hash;
|
||||
RRDCALC *rc = (RRDCALC *)avl_search_lock(&host->alarms_idx_name, (avl *)&findme);
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
|
|
@ -29,7 +29,9 @@
|
|||
#define RRDCALC_FLAG_SILENCED 0x00000100
|
||||
#define RRDCALC_FLAG_NO_CLEAR_NOTIFICATION 0x80000000
|
||||
|
||||
|
||||
struct rrdcalc {
|
||||
avl avl; // the index, with key the id - this has to be first!
|
||||
uint32_t id; // the unique id of this alarm
|
||||
uint32_t next_event_id; // the next event id that will be used for this alarm
|
||||
|
||||
|
@ -77,9 +79,16 @@ struct rrdcalc {
|
|||
float delay_multiplier; // multiplier for all delays when alarms switch status
|
||||
// while now < delay_up_to
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// notification repeat settings
|
||||
|
||||
uint32_t warn_repeat_every; // interval between repeating warning notifications
|
||||
uint32_t crit_repeat_every; // interval between repeating critical notifications
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// runtime information
|
||||
|
||||
RRDCALC_STATUS old_status; // the old status of the alarm
|
||||
RRDCALC_STATUS status; // the current status of the alarm
|
||||
|
||||
calculated_number value; // the current value of the alarm
|
||||
|
@ -90,6 +99,7 @@ struct rrdcalc {
|
|||
time_t last_updated; // the last update timestamp of the alarm
|
||||
time_t next_update; // the next update timestamp of the alarm
|
||||
time_t last_status_change; // the timestamp of the last time this alarm changed status
|
||||
time_t last_repeat; // the last time the alarm got repeated
|
||||
|
||||
time_t db_after; // the first timestamp evaluated by the db lookup
|
||||
time_t db_before; // the last timestamp evaluated by the db lookup
|
||||
|
@ -119,6 +129,10 @@ struct rrdcalc {
|
|||
struct rrdcalc *next;
|
||||
};
|
||||
|
||||
extern int alarm_isrepeating(RRDHOST *host, uint32_t alarm_id);
|
||||
extern int alarm_entry_isrepeating(RRDHOST *host, ALARM_ENTRY *ae);
|
||||
extern RRDCALC *alarm_max_last_repeat(RRDHOST *host, char *alarm_name, uint32_t hash);
|
||||
|
||||
#define RRDCALC_HAS_DB_LOOKUP(rc) ((rc)->after)
|
||||
|
||||
extern void rrdsetcalc_link_matching(RRDSET *st);
|
||||
|
@ -132,7 +146,14 @@ extern void rrdcalc_unlink_and_free(RRDHOST *host, RRDCALC *rc);
|
|||
|
||||
extern int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name);
|
||||
extern uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id);
|
||||
extern RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart);
|
||||
extern void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc);
|
||||
extern RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart);
|
||||
extern void rrdcalc_add_to_host(RRDHOST *host, RRDCALC *rc);
|
||||
|
||||
/**
 * Is this alarm repeating?
 *
 * @param rc the alarm to check
 *
 * @return 1 when either warn_repeat_every or crit_repeat_every is greater
 *         than zero, 0 otherwise.
 */
static inline int rrdcalc_isrepeating(RRDCALC *rc) {
    return unlikely(rc->warn_repeat_every > 0 || rc->crit_repeat_every > 0) ? 1 : 0;
}
|
||||
|
||||
#endif //NETDATA_RRDCALC_H
|
||||
|
|
|
@ -13,7 +13,7 @@ void rrdcalctemplate_link_matching(RRDSET *st) {
|
|||
for(rt = host->templates; rt ; rt = rt->next) {
|
||||
if(rt->hash_context == st->hash_context && !strcmp(rt->context, st->context)
|
||||
&& (!rt->family_pattern || simple_pattern_matches(rt->family_pattern, st->family))) {
|
||||
RRDCALC *rc = rrdcalc_create(host, rt, st->id);
|
||||
RRDCALC *rc = rrdcalc_create_from_template(host, rt, st->id);
|
||||
if(unlikely(!rc))
|
||||
info("Health tried to create alarm from template '%s' on chart '%s' of host '%s', but it failed", rt->name, st->id, host->hostname);
|
||||
|
||||
|
|
|
@ -48,6 +48,12 @@ struct rrdcalctemplate {
|
|||
int delay_max_duration; // the absolute max delay to apply to this alarm
|
||||
float delay_multiplier; // multiplier for all delays when alarms switch status
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// notification repeat settings
|
||||
|
||||
uint32_t warn_repeat_every; // interval between repeating warning notifications
|
||||
uint32_t crit_repeat_every; // interval between repeating critical notifications
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// expressions related to the alarm
|
||||
|
||||
|
|
|
@ -179,6 +179,10 @@ RRDHOST *rrdhost_create(const char *hostname,
|
|||
if(config_get_boolean(CONFIG_SECTION_GLOBAL, "delete orphan hosts files", 1) && !is_localhost)
|
||||
rrdhost_flag_set(host, RRDHOST_FLAG_DELETE_ORPHAN_HOST);
|
||||
|
||||
host->health_default_warn_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat warning", "never");
|
||||
host->health_default_crit_repeat_every = config_get_duration(CONFIG_SECTION_HEALTH, "default repeat critical", "never");
|
||||
avl_init_lock(&(host->alarms_idx_health_log), alarm_compare_id);
|
||||
avl_init_lock(&(host->alarms_idx_name), alarm_compare_name);
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// initialize health variables
|
||||
|
@ -274,12 +278,12 @@ RRDHOST *rrdhost_create(const char *hostname,
|
|||
// load health configuration
|
||||
|
||||
if(host->health_enabled) {
|
||||
health_alarm_log_load(host);
|
||||
health_alarm_log_open(host);
|
||||
|
||||
rrdhost_wrlock(host);
|
||||
health_readdir(host, health_user_config_dir(), health_stock_config_dir(), NULL);
|
||||
rrdhost_unlock(host);
|
||||
|
||||
health_alarm_log_load(host);
|
||||
health_alarm_log_open(host);
|
||||
}
|
||||
|
||||
|
||||
|
@ -876,3 +880,43 @@ int rrdhost_set_system_info_variable(struct rrdhost_system_info *system_info, ch
|
|||
|
||||
return res;
|
||||
}
|
||||
|
||||
/**
|
||||
* Alarm Compare ID
|
||||
*
|
||||
* Callback function used with the binary trees to compare the id of RRDCALC
|
||||
*
|
||||
* @param a a pointer to the RRDCAL item to insert,compare or update the binary tree
|
||||
* @param b the pointer to the binary tree.
|
||||
*
|
||||
* @return It returns 0 case the values are equal, 1 case a is bigger than b and -1 case a is smaller than b.
|
||||
*/
|
||||
int alarm_compare_id(void *a, void *b) {
|
||||
register uint32_t hash1 = ((RRDCALC *)a)->id;
|
||||
register uint32_t hash2 = ((RRDCALC *)b)->id;
|
||||
|
||||
if(hash1 < hash2) return -1;
|
||||
else if(hash1 > hash2) return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Alarm Compare NAME
|
||||
*
|
||||
* Callback function used with the binary trees to compare the name of RRDCALC
|
||||
*
|
||||
* @param a a pointer to the RRDCAL item to insert,compare or update the binary tree
|
||||
* @param b the pointer to the binary tree.
|
||||
*
|
||||
* @return It returns 0 case the values are equal, 1 case a is bigger than b and -1 case a is smaller than b.
|
||||
*/
|
||||
int alarm_compare_name(void *a, void *b) {
|
||||
RRDCALC *in1 = (RRDCALC *)a;
|
||||
RRDCALC *in2 = (RRDCALC *)b;
|
||||
|
||||
if(in1->hash < in2->hash) return -1;
|
||||
else if(in1->hash > in2->hash) return 1;
|
||||
|
||||
return strcmp(in1->name,in2->name);
|
||||
}
|
||||
|
|
|
@ -11,7 +11,6 @@ packet dropped).
|
|||
|
||||
Netdata also supports alarm **templates**, so that an alarm can be attached to all the charts of the same context (i.e. all network interfaces, or all disks, or all mysql servers, etc.).
|
||||
|
||||
|
||||
Each alarm can execute a single query to the database using statistical algorithms against past data,
|
||||
but alarms can be combined. So, if you need 2 queries in the database, you can combine
|
||||
2 alarms together (both will run a query to the database, and the results can be combined).
|
||||
|
@ -342,6 +341,24 @@ delay: [[[up U] [down D] multiplier M] max X]
|
|||
their matching one) and a delay is in place.
|
||||
- All are reset to their defaults when the alarm switches state without a delay in place.
|
||||
|
||||
---
|
||||
|
||||
#### Alarm line `repeat`
|
||||
|
||||
Defines the interval between repeating notifications for alarms in the CRITICAL or WARNING state. This overrides the default intervals inherited from the health settings in `netdata.conf`. The defaults for repeating notifications are set with `default repeat warning = DURATION` and `default repeat critical = DURATION`, which can be found in health stock configuration.
|
||||
|
||||
Format:
|
||||
|
||||
```
|
||||
repeat: [off] [warning DURATION] [critical DURATION]
|
||||
```
|
||||
|
||||
* `off`: Turns off the repeating feature for the current alarm. This is effective when the default repeat settings have been enabled in health configuration.
|
||||
* `warning DURATION`: Defines the interval when the alarm is in WARNING state. Use `0s` to turn off the repeating notification for WARNING mode.
|
||||
* `critical DURATION`: Defines the interval when the alarm is in CRITICAL state. Use `0s` to turn off the repeating notification for CRITICAL mode.
|
||||
|
||||
---
|
||||
|
||||
#### Alarm line `option`
|
||||
|
||||
The only possible value for the `option` line is
|
||||
|
@ -567,12 +584,15 @@ template: disk_full_percent
|
|||
every: 1m
|
||||
warn: $this > 80
|
||||
crit: $this > 95
|
||||
repeat: warning 120s critical 10s
|
||||
```
|
||||
|
||||
`$used` and `$avail` are the `used` and `avail` chart dimensions as shown on the dashboard.
|
||||
|
||||
So, the `calc` line finds the percentage of used space. `$this` resolves to this percentage.
|
||||
|
||||
This is a repeating alarm and if the alarm becomes CRITICAL it repeats the notifications every 10 seconds. It also repeats notifications every 2 minutes if the alarm goes into WARNING mode.
|
||||
|
||||
### Example 3
|
||||
|
||||
Predict if any disk will run out of space in the near future.
|
||||
|
|
|
@ -255,17 +255,18 @@ static inline void health_alarm_log_process(RRDHOST *host) {
|
|||
netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
|
||||
|
||||
ALARM_ENTRY *ae;
|
||||
for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id ; ae = ae->next) {
|
||||
if(unlikely(
|
||||
!(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
|
||||
!(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
|
||||
for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
|
||||
if(likely(!alarm_entry_isrepeating(host, ae))) {
|
||||
if(unlikely(
|
||||
!(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
|
||||
!(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
|
||||
)) {
|
||||
if(unlikely(ae->unique_id < first_waiting))
|
||||
first_waiting = ae->unique_id;
|
||||
|
||||
if(unlikely(ae->unique_id < first_waiting))
|
||||
first_waiting = ae->unique_id;
|
||||
|
||||
if(likely(now >= ae->delay_up_to_timestamp))
|
||||
health_process_notifications(host, ae);
|
||||
if(likely(now >= ae->delay_up_to_timestamp))
|
||||
health_process_notifications(host, ae);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -295,6 +296,10 @@ static inline void health_alarm_log_process(RRDHOST *host) {
|
|||
ALARM_ENTRY *t = ae->next;
|
||||
|
||||
health_alarm_log_free_one_nochecks_nounlink(ae);
|
||||
if(likely(!alarm_entry_isrepeating(host, ae))) {
|
||||
health_alarm_log_free_one_nochecks_nounlink(ae);
|
||||
host->health_log.count--;
|
||||
}
|
||||
|
||||
ae = t;
|
||||
host->health_log.count--;
|
||||
|
@ -411,7 +416,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
|
|||
debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
|
||||
} else {
|
||||
debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
|
||||
, (silencers->stype==STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
|
||||
, (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
|
||||
, rc->name
|
||||
, (rc->rrdset)?rc->rrdset->context:""
|
||||
, rc->chart
|
||||
|
@ -756,20 +761,22 @@ void *health_main(void *ptr) {
|
|||
rc->delay_last = delay;
|
||||
rc->delay_up_to_timestamp = now + delay;
|
||||
|
||||
health_alarm_log(
|
||||
host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
|
||||
rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
|
||||
rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
|
||||
rc->delay_last,
|
||||
(
|
||||
((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
|
||||
((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
|
||||
)
|
||||
|
||||
);
|
||||
|
||||
rc->last_status_change = now;
|
||||
rc->status = status;
|
||||
if(likely(!rrdcalc_isrepeating(rc))) {
|
||||
ALARM_ENTRY *ae = health_create_alarm_entry(
|
||||
host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
|
||||
rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
|
||||
rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
|
||||
rc->delay_last,
|
||||
(
|
||||
((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
|
||||
((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
|
||||
)
|
||||
);
|
||||
health_alarm_log(host, ae);
|
||||
}
|
||||
rc->last_status_change = now;
|
||||
rc->old_status = rc->status;
|
||||
rc->status = status;
|
||||
}
|
||||
|
||||
rc->last_updated = now;
|
||||
|
@ -779,6 +786,35 @@ void *health_main(void *ptr) {
|
|||
next_run = rc->next_update;
|
||||
}
|
||||
|
||||
// process repeating alarms
|
||||
RRDCALC *rc;
|
||||
for(rc = host->alarms; rc ; rc = rc->next) {
|
||||
int repeat_every = 0;
|
||||
if(unlikely(rrdcalc_isrepeating(rc))) {
|
||||
if(unlikely(rc->status == RRDCALC_STATUS_WARNING))
|
||||
repeat_every = rc->warn_repeat_every;
|
||||
else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL))
|
||||
repeat_every = rc->crit_repeat_every;
|
||||
}
|
||||
if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
|
||||
rc->last_repeat = now;
|
||||
ALARM_ENTRY *ae = health_create_alarm_entry(
|
||||
host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
|
||||
rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
|
||||
rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
|
||||
rc->delay_last,
|
||||
(
|
||||
((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
|
||||
((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
|
||||
)
|
||||
);
|
||||
ae->last_repeat = rc->last_repeat;
|
||||
health_process_notifications(host, ae);
|
||||
debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
|
||||
health_alarm_log_free_one_nochecks_nounlink(ae);
|
||||
}
|
||||
}
|
||||
|
||||
rrdhost_unlock(host);
|
||||
}
|
||||
|
||||
|
|
|
@ -108,7 +108,7 @@ extern void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae);
|
|||
extern ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename);
|
||||
extern void health_alarm_log_load(RRDHOST *host);
|
||||
|
||||
extern void health_alarm_log(
|
||||
extern ALARM_ENTRY* health_create_alarm_entry(
|
||||
RRDHOST *host,
|
||||
uint32_t alarm_id,
|
||||
uint32_t alarm_event_id,
|
||||
|
@ -129,6 +129,8 @@ extern void health_alarm_log(
|
|||
int delay,
|
||||
uint32_t flags);
|
||||
|
||||
extern void health_alarm_log(RRDHOST *host, ALARM_ENTRY *ae);
|
||||
|
||||
extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath);
|
||||
extern char *health_user_config_dir(void);
|
||||
extern char *health_stock_config_dir(void);
|
||||
|
|
|
@ -23,6 +23,7 @@
|
|||
#define HEALTH_INFO_KEY "info"
|
||||
#define HEALTH_DELAY_KEY "delay"
|
||||
#define HEALTH_OPTIONS_KEY "options"
|
||||
#define HEALTH_REPEAT_KEY "repeat"
|
||||
|
||||
static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
|
||||
if(!rc->chart) {
|
||||
|
@ -45,7 +46,7 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
|
|||
|
||||
rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id);
|
||||
|
||||
debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
|
||||
debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
|
||||
rc->chart?rc->chart:"NOCHART",
|
||||
rc->name,
|
||||
rc->id,
|
||||
|
@ -66,10 +67,12 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
|
|||
rc->delay_up_duration,
|
||||
rc->delay_down_duration,
|
||||
rc->delay_max_duration,
|
||||
rc->delay_multiplier
|
||||
rc->delay_multiplier,
|
||||
rc->warn_repeat_every,
|
||||
rc->crit_repeat_every
|
||||
);
|
||||
|
||||
rrdcalc_create_part2(host, rc);
|
||||
rrdcalc_add_to_host(host, rc);
|
||||
return 1;
|
||||
}
|
||||
|
||||
|
@ -100,7 +103,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
|
|||
}
|
||||
}
|
||||
|
||||
debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
|
||||
debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
|
||||
rt->name,
|
||||
(rt->context)?rt->context:"NONE",
|
||||
(rt->exec)?rt->exec:"DEFAULT",
|
||||
|
@ -120,7 +123,9 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
|
|||
rt->delay_up_duration,
|
||||
rt->delay_down_duration,
|
||||
rt->delay_max_duration,
|
||||
rt->delay_multiplier
|
||||
rt->delay_multiplier,
|
||||
rt->warn_repeat_every,
|
||||
rt->crit_repeat_every
|
||||
);
|
||||
|
||||
if(likely(last)) {
|
||||
|
@ -134,48 +139,6 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
|
|||
return 1;
|
||||
}
|
||||
|
||||
static inline int health_parse_duration(char *string, int *result) {
|
||||
// make sure it is a number
|
||||
if(!*string || !(isdigit(*string) || *string == '+' || *string == '-')) {
|
||||
*result = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
char *e = NULL;
|
||||
calculated_number n = str2ld(string, &e);
|
||||
if(e && *e) {
|
||||
switch (*e) {
|
||||
case 'Y':
|
||||
*result = (int) (n * 86400 * 365);
|
||||
break;
|
||||
case 'M':
|
||||
*result = (int) (n * 86400 * 30);
|
||||
break;
|
||||
case 'w':
|
||||
*result = (int) (n * 86400 * 7);
|
||||
break;
|
||||
case 'd':
|
||||
*result = (int) (n * 86400);
|
||||
break;
|
||||
case 'h':
|
||||
*result = (int) (n * 3600);
|
||||
break;
|
||||
case 'm':
|
||||
*result = (int) (n * 60);
|
||||
break;
|
||||
|
||||
default:
|
||||
case 's':
|
||||
*result = (int) (n);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
*result = (int)(n);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
static inline int health_parse_delay(
|
||||
size_t line, const char *filename, char *string,
|
||||
int *delay_up_duration,
|
||||
|
@ -202,14 +165,14 @@ static inline int health_parse_delay(
|
|||
while(*s && isspace(*s)) *s++ = '\0';
|
||||
|
||||
if(!strcasecmp(key, "up")) {
|
||||
if (!health_parse_duration(value, delay_up_duration)) {
|
||||
if (!config_parse_duration(value, delay_up_duration)) {
|
||||
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
|
||||
line, filename, value, key);
|
||||
}
|
||||
else given_up = 1;
|
||||
}
|
||||
else if(!strcasecmp(key, "down")) {
|
||||
if (!health_parse_duration(value, delay_down_duration)) {
|
||||
if (!config_parse_duration(value, delay_down_duration)) {
|
||||
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
|
||||
line, filename, value, key);
|
||||
}
|
||||
|
@ -224,7 +187,7 @@ static inline int health_parse_delay(
|
|||
else given_multiplier = 1;
|
||||
}
|
||||
else if(!strcasecmp(key, "max")) {
|
||||
if (!health_parse_duration(value, delay_max_duration)) {
|
||||
if (!config_parse_duration(value, delay_max_duration)) {
|
||||
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
|
||||
line, filename, value, key);
|
||||
}
|
||||
|
@ -285,6 +248,50 @@ static inline uint32_t health_parse_options(const char *s) {
|
|||
return options;
|
||||
}
|
||||
|
||||
/**
 * Parse Repeat
 *
 * Parse the value of a health configuration "repeat" line. The value is a
 * whitespace separated list of "key value" pairs, modified in place
 * (separators are overwritten with NUL bytes):
 *
 *   off                 set both intervals to 0 and stop parsing
 *   warning DURATION    set the warning repeat interval
 *   critical DURATION   set the critical repeat interval
 *
 * Unknown keys (and the token following them) are ignored. A duration that
 * cannot be parsed, or that is negative, is reported with error() and the
 * corresponding interval is set to 0.
 *
 * @param line               line number in the configuration file (for messages)
 * @param file               configuration file name (for messages)
 * @param string             the text to parse; it is modified
 * @param warn_repeat_every  output: warning repeat interval in seconds
 * @param crit_repeat_every  output: critical repeat interval in seconds
 *
 * @return It always returns 1
 */
static inline int health_parse_repeat(
        size_t line,
        const char *file,
        char *string,
        uint32_t *warn_repeat_every,
        uint32_t *crit_repeat_every
) {

    char *s = string;
    while(*s) {
        char *key = s;

        // <ctype.h> functions require a value representable as unsigned char
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!*key) break;

        char *value = s;
        while(*s && !isspace((unsigned char)*s)) s++;
        while(*s && isspace((unsigned char)*s)) *s++ = '\0';

        if(!strcasecmp(key, "off")) {
            *warn_repeat_every = 0;
            *crit_repeat_every = 0;
            return 1;
        }

        uint32_t *target = NULL;
        if(!strcasecmp(key, "warning"))
            target = warn_repeat_every;
        else if(!strcasecmp(key, "critical"))
            target = crit_repeat_every;

        if(target) {
            // Parse into a signed local instead of aliasing the uint32_t
            // output as int: a negative duration must not wrap around into
            // a huge unsigned interval.
            int seconds = 0;
            if (!config_parse_duration(value, &seconds) || seconds < 0) {
                error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
                      line, file, value, key);
                seconds = 0;
            }

            *target = (uint32_t)seconds;
        }
    }

    return 1;
}
|
||||
|
||||
|
||||
static inline int health_parse_db_lookup(
|
||||
size_t line, const char *filename, char *string,
|
||||
RRDR_GROUPING *group_method, int *after, int *before, int *every,
|
||||
|
@ -322,7 +329,7 @@ static inline int health_parse_db_lookup(
|
|||
while(*s && !isspace(*s)) s++;
|
||||
while(*s && isspace(*s)) *s++ = '\0';
|
||||
|
||||
if(!health_parse_duration(key, after)) {
|
||||
if(!config_parse_duration(key, after)) {
|
||||
error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method",
|
||||
line, filename, key);
|
||||
return 0;
|
||||
|
@ -343,7 +350,7 @@ static inline int health_parse_db_lookup(
|
|||
while(*s && !isspace(*s)) s++;
|
||||
while(*s && isspace(*s)) *s++ = '\0';
|
||||
|
||||
if (!health_parse_duration(value, before)) {
|
||||
if (!config_parse_duration(value, before)) {
|
||||
error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
|
||||
line, filename, value, key);
|
||||
}
|
||||
|
@ -353,7 +360,7 @@ static inline int health_parse_db_lookup(
|
|||
while(*s && !isspace(*s)) s++;
|
||||
while(*s && isspace(*s)) *s++ = '\0';
|
||||
|
||||
if (!health_parse_duration(value, every)) {
|
||||
if (!config_parse_duration(value, every)) {
|
||||
error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
|
||||
line, filename, value, key);
|
||||
}
|
||||
|
@ -430,7 +437,8 @@ static int health_readfile(const char *filename, void *data) {
|
|||
hash_info = 0,
|
||||
hash_recipient = 0,
|
||||
hash_delay = 0,
|
||||
hash_options = 0;
|
||||
hash_options = 0,
|
||||
hash_repeat = 0;
|
||||
|
||||
char buffer[HEALTH_CONF_MAX_LINE + 1];
|
||||
|
||||
|
@ -454,6 +462,7 @@ static int health_readfile(const char *filename, void *data) {
|
|||
hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
|
||||
hash_delay = simple_uhash(HEALTH_DELAY_KEY);
|
||||
hash_options = simple_uhash(HEALTH_OPTIONS_KEY);
|
||||
hash_repeat = simple_uhash(HEALTH_REPEAT_KEY);
|
||||
}
|
||||
|
||||
FILE *fp = fopen(filename, "r");
|
||||
|
@ -532,6 +541,9 @@ static int health_readfile(const char *filename, void *data) {
|
|||
rc->value = NAN;
|
||||
rc->old_value = NAN;
|
||||
rc->delay_multiplier = 1.0;
|
||||
rc->old_status = RRDCALC_STATUS_UNINITIALIZED;
|
||||
rc->warn_repeat_every = host->health_default_warn_repeat_every;
|
||||
rc->crit_repeat_every = host->health_default_crit_repeat_every;
|
||||
|
||||
if(rrdvar_fix_name(rc->name))
|
||||
error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
|
||||
|
@ -556,6 +568,8 @@ static int health_readfile(const char *filename, void *data) {
|
|||
rt->green = NAN;
|
||||
rt->red = NAN;
|
||||
rt->delay_multiplier = 1.0;
|
||||
rt->warn_repeat_every = host->health_default_warn_repeat_every;
|
||||
rt->crit_repeat_every = host->health_default_crit_repeat_every;
|
||||
|
||||
if(rrdvar_fix_name(rt->name))
|
||||
error("Health configuration renamed template '%s' to '%s'", value, rt->name);
|
||||
|
@ -612,7 +626,7 @@ static int health_readfile(const char *filename, void *data) {
|
|||
&rc->options, &rc->dimensions);
|
||||
}
|
||||
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
|
||||
if(!health_parse_duration(value, &rc->update_every))
|
||||
if(!config_parse_duration(value, &rc->update_every))
|
||||
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
|
||||
line, filename, rc->name, key, value);
|
||||
}
|
||||
|
@ -707,6 +721,11 @@ static int health_readfile(const char *filename, void *data) {
|
|||
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
|
||||
rc->options |= health_parse_options(value);
|
||||
}
|
||||
else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
|
||||
health_parse_repeat(line, filename, value,
|
||||
&rc->warn_repeat_every,
|
||||
&rc->crit_repeat_every);
|
||||
}
|
||||
else {
|
||||
error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.",
|
||||
line, filename, rc->name, key);
|
||||
|
@ -736,7 +755,7 @@ static int health_readfile(const char *filename, void *data) {
|
|||
&rt->update_every, &rt->options, &rt->dimensions);
|
||||
}
|
||||
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
|
||||
if(!health_parse_duration(value, &rt->update_every))
|
||||
if(!config_parse_duration(value, &rt->update_every))
|
||||
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
|
||||
line, filename, rt->name, key, value);
|
||||
}
|
||||
|
@ -831,6 +850,11 @@ static int health_readfile(const char *filename, void *data) {
|
|||
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
|
||||
rt->options |= health_parse_options(value);
|
||||
}
|
||||
else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
|
||||
health_parse_repeat(line, filename, value,
|
||||
&rt->warn_repeat_every,
|
||||
&rt->crit_repeat_every);
|
||||
}
|
||||
else {
|
||||
error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.",
|
||||
line, filename, rt->name, key);
|
||||
|
|
|
@ -140,6 +140,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
|
|||
"\t\t\t\"delay_multiplier\": %f,\n"
|
||||
"\t\t\t\"delay\": %d,\n"
|
||||
"\t\t\t\"delay_up_to_timestamp\": %lu,\n"
|
||||
"\t\t\t\"warn_repeat_every\": \"%u\",\n"
|
||||
"\t\t\t\"crit_repeat_every\": \"%u\",\n"
|
||||
"\t\t\t\"value_string\": \"%s\",\n"
|
||||
, rc->chart, rc->name
|
||||
, (unsigned long)rc->id
|
||||
|
@ -165,6 +167,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
|
|||
, rc->delay_multiplier
|
||||
, rc->delay_last
|
||||
, (unsigned long)rc->delay_up_to_timestamp
|
||||
, rc->warn_repeat_every
|
||||
, rc->crit_repeat_every
|
||||
, value_string
|
||||
);
|
||||
|
||||
|
|
|
@ -79,6 +79,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
|
|||
"\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
|
||||
"\t%d\t%d\t%d\t%d"
|
||||
"\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO
|
||||
"\t%016lx"
|
||||
"\n"
|
||||
, (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
|
||||
, host->hostname
|
||||
|
@ -112,6 +113,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
|
|||
|
||||
, ae->new_value
|
||||
, ae->old_value
|
||||
, (uint64_t)ae->last_repeat
|
||||
) < 0))
|
||||
error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename);
|
||||
else {
|
||||
|
@ -174,10 +176,40 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
|
|||
continue;
|
||||
}
|
||||
|
||||
// Check if we got last_repeat field
|
||||
time_t last_repeat = 0;
|
||||
if(entries > 27) {
|
||||
char* alarm_name = pointers[13];
|
||||
last_repeat = (time_t)strtoul(pointers[27], NULL, 16);
|
||||
|
||||
RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
|
||||
if (!rc) {
|
||||
for(rc = host->alarms; rc ; rc = rc->next) {
|
||||
RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl *)rc);
|
||||
if(rdcmp != rc) {
|
||||
error("Cannot insert the alarm index ID using log %s", rc->name);
|
||||
}
|
||||
}
|
||||
|
||||
rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
|
||||
}
|
||||
|
||||
if(unlikely(rc)) {
|
||||
if (rrdcalc_isrepeating(rc)) {
|
||||
rc->last_repeat = last_repeat;
|
||||
// We iterate through repeating alarm entries only to
|
||||
// find the latest last_repeat timestamp. Otherwise,
|
||||
// there is no need to keep them in memory.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(unlikely(*pointers[0] == 'A')) {
|
||||
// make sure it is properly numbered
|
||||
if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
|
||||
error("HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it.", host->hostname, line, filename, unique_id);
|
||||
error( "HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it."
|
||||
, host->hostname, line, filename, unique_id);
|
||||
errored++;
|
||||
continue;
|
||||
}
|
||||
|
@ -186,11 +218,11 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
|
|||
}
|
||||
else if(unlikely(*pointers[0] == 'U')) {
|
||||
// find the original
|
||||
for(ae = host->health_log.alarms; ae; ae = ae->next) {
|
||||
for(ae = host->health_log.alarms; ae ; ae = ae->next) {
|
||||
if(unlikely(unique_id == ae->unique_id)) {
|
||||
if(unlikely(*pointers[0] == 'A')) {
|
||||
error("HEALTH [%s]: line %zu of file '%s' adds duplicate alarm log entry %u. Using the later."
|
||||
, host->hostname, line, filename, unique_id);
|
||||
, host->hostname, line, filename, unique_id);
|
||||
*pointers[0] = 'U';
|
||||
duplicate++;
|
||||
}
|
||||
|
@ -270,6 +302,8 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
|
|||
ae->new_value = str2l(pointers[25]);
|
||||
ae->old_value = str2l(pointers[26]);
|
||||
|
||||
ae->last_repeat = last_repeat;
|
||||
|
||||
char value_string[100 + 1];
|
||||
freez(ae->old_value_string);
|
||||
freez(ae->new_value_string);
|
||||
|
@ -339,7 +373,7 @@ inline void health_alarm_log_load(RRDHOST *host) {
|
|||
// ----------------------------------------------------------------------------
|
||||
// health alarm log management
|
||||
|
||||
inline void health_alarm_log(
|
||||
inline ALARM_ENTRY* health_create_alarm_entry(
|
||||
RRDHOST *host,
|
||||
uint32_t alarm_id,
|
||||
uint32_t alarm_event_id,
|
||||
|
@ -398,9 +432,24 @@ inline void health_alarm_log(
|
|||
ae->delay_up_to_timestamp = when + delay;
|
||||
ae->flags |= flags;
|
||||
|
||||
ae->last_repeat = 0;
|
||||
|
||||
if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
|
||||
ae->non_clear_duration += ae->duration;
|
||||
|
||||
return ae;
|
||||
}
|
||||
|
||||
inline void health_alarm_log(
|
||||
RRDHOST *host,
|
||||
ALARM_ENTRY *ae
|
||||
) {
|
||||
debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id);
|
||||
|
||||
if(unlikely(alarm_entry_isrepeating(host, ae))) {
|
||||
error("Repeating alarms cannot be added to host's alarm log entries. It seems somewhere in the logic, API is being misused. Alarm id: %u", ae->alarm_id);
|
||||
return;
|
||||
}
|
||||
// link it
|
||||
netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
|
||||
ae->next = host->health_log.alarms;
|
||||
|
|
|
@ -411,6 +411,27 @@ int appconfig_set_boolean(struct config *root, const char *section, const char *
|
|||
return value;
|
||||
}
|
||||
|
||||
/**
 * Get Duration
 *
 * Read a configuration option and parse it as a duration.
 *
 * When the option cannot be read, or its text is not a valid duration, the
 * supplied default is parsed and returned instead.
 *
 * @param root    the configuration to query
 * @param section the section of the option
 * @param name    the name of the option
 * @param value   the default duration, as a string
 *
 * @return The parsed duration as produced by config_parse_duration(), from
 *         the option or, failing that, from the default.
 */
int appconfig_get_duration(struct config *root, const char *section, const char *name, const char *value)
{
    int result = 0;
    const char *s;

    s = appconfig_get(root, section, name, value);
    if(!s) goto fallback;

    if(!config_parse_duration(s, &result)) {
        error("config option '[%s].%s = %s' is configured with an invalid duration", section, name, s);
        goto fallback;
    }

    return result;

fallback:
    // the configured text was unusable; fall back to the caller's default
    if(!config_parse_duration(value, &result))
        error("INTERNAL ERROR: default duration supplied for option '[%s].%s = %s' is not a valid duration", section, name, value);

    return result;
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// config load/save
|
||||
|
@ -586,3 +607,65 @@ void appconfig_generate(struct config *root, BUFFER *wb, int only_changed)
|
|||
appconfig_unlock(root);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse Duration
|
||||
*
|
||||
* Parse the string setting the result
|
||||
*
|
||||
* @param string the timestamp string
|
||||
* @param result the output variable
|
||||
*
|
||||
* @return It returns 1 on success and 0 otherwise
|
||||
*/
|
||||
int config_parse_duration(const char* string, int* result) {
|
||||
while(*string && isspace(*string)) string++;
|
||||
|
||||
if(unlikely(!*string)) goto fallback;
|
||||
|
||||
if(*string == 'n' && !strcmp(string, "never")) {
|
||||
// this is a valid option
|
||||
*result = 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
// make sure it is a number
|
||||
if(!(isdigit(*string) || *string == '+' || *string == '-')) goto fallback;
|
||||
|
||||
char *e = NULL;
|
||||
calculated_number n = str2ld(string, &e);
|
||||
if(e && *e) {
|
||||
switch (*e) {
|
||||
case 'Y':
|
||||
*result = (int) (n * 31536000);
|
||||
break;
|
||||
case 'M':
|
||||
*result = (int) (n * 2592000);
|
||||
break;
|
||||
case 'w':
|
||||
*result = (int) (n * 604800);
|
||||
break;
|
||||
case 'd':
|
||||
*result = (int) (n * 86400);
|
||||
break;
|
||||
case 'h':
|
||||
*result = (int) (n * 3600);
|
||||
break;
|
||||
case 'm':
|
||||
*result = (int) (n * 60);
|
||||
break;
|
||||
case 's':
|
||||
default:
|
||||
*result = (int) (n);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
*result = (int)(n);
|
||||
|
||||
return 1;
|
||||
|
||||
fallback:
|
||||
*result = 0;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -119,6 +119,7 @@ extern long long appconfig_get_number(struct config *root, const char *section,
|
|||
extern LONG_DOUBLE appconfig_get_float(struct config *root, const char *section, const char *name, LONG_DOUBLE value);
|
||||
extern int appconfig_get_boolean(struct config *root, const char *section, const char *name, int value);
|
||||
extern int appconfig_get_boolean_ondemand(struct config *root, const char *section, const char *name, int value);
|
||||
extern int appconfig_get_duration(struct config *root, const char *section, const char *name, const char *value);
|
||||
|
||||
extern const char *appconfig_set(struct config *root, const char *section, const char *name, const char *value);
|
||||
extern const char *appconfig_set_default(struct config *root, const char *section, const char *name, const char *value);
|
||||
|
@ -133,4 +134,6 @@ extern void appconfig_generate(struct config *root, BUFFER *wb, int only_changed
|
|||
|
||||
extern int appconfig_section_compare(void *a, void *b);
|
||||
|
||||
extern int config_parse_duration(const char* string, int* result);
|
||||
|
||||
#endif /* NETDATA_CONFIG_H */
|
||||
|
|
|
@ -1305,6 +1305,14 @@
|
|||
"crit_parsed": {
|
||||
"type": "string"
|
||||
},
|
||||
"warn_repeat_every": {
|
||||
"type": "integer",
|
||||
"format": "int32"
|
||||
},
|
||||
"crit_repeat_every": {
|
||||
"type": "integer",
|
||||
"format": "int32"
|
||||
},
|
||||
"green": {
|
||||
"type": "string",
|
||||
"format": "nullable"
|
||||
|
|
|
@ -892,6 +892,12 @@ definitions:
|
|||
type: string
|
||||
crit_parsed:
|
||||
type: string
|
||||
warn_repeat_every:
|
||||
type: integer
|
||||
format: int32
|
||||
crit_repeat_every:
|
||||
type: integer
|
||||
format: int32
|
||||
green:
|
||||
type: string
|
||||
format: nullable
|
||||
|
|
|
@ -2100,6 +2100,14 @@ function alarmsUpdateModal() {
|
|||
+ ((chart.red !== null) ? ('<tr><td width="10%" style="text-align:right">red threshold</td><td><code>' + chart.red + ' ' + units + '</code></td></tr>') : '');
|
||||
}
|
||||
|
||||
if (alarm.warn_repeat_every > 0) {
|
||||
html += '<tr><td width="10%" style="text-align:right">repeat warning</td><td>' + NETDATA.seconds4human(alarm.warn_repeat_every) + '</td></tr>';
|
||||
}
|
||||
|
||||
if (alarm.crit_repeat_every > 0) {
|
||||
html += '<tr><td width="10%" style="text-align:right">repeat critical</td><td>' + NETDATA.seconds4human(alarm.crit_repeat_every) + '</td></tr>';
|
||||
}
|
||||
|
||||
var delay = '';
|
||||
if ((alarm.delay_up_duration > 0 || alarm.delay_down_duration > 0) && alarm.delay_multiplier !== 0 && alarm.delay_max_duration > 0) {
|
||||
if (alarm.delay_up_duration === alarm.delay_down_duration) {
|
||||
|
|
Loading…
Add table
Reference in a new issue