0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-14 17:48:37 +00:00
netdata_netdata/health/rrdcalc.c
Costa Tsaousis f466b8aef5
DYNCFG: dynamically configured alerts ()
* cleanup alerts

* fix references

* fix references

* fix references

* load alerts once and apply them to each node

* simplify health_create_alarm_entry()

* Compile without warnings with compiler flags:

   -Wall -Wextra -Wformat=2 -Wshadow -Wno-format-nonliteral -Winit-self

* code re-organization and cleanup

* generate patterns when applying prototypes; give unique dyncfg names to all alerts

* eval expressions keep the source and the parsed_as as STRING pointers

* renamed host to node in dyncfg ids

* renamed host to node in dyncfg ids

* add all cloud roles to the list of parsed X-Netdata-Role header and also default to member access level

* working functionality

* code re-organization: moved health event-loop to a new file, moved health globals to health.c

* rrdcalctemplate is removed; alert_cfg is removed; foreach dimension is removed; RRDCALCs are now instanciated only when they are linked to RRDSETs

* dyncfg alert prototypes initialization for alerts

* health dyncfg split to separate file

* cleanup not-needed code

* normalize matches between parsing and json

* also detect !* for disabled alerts

* dyncfg capability disabled

* Store alert config part1

* Add rrdlabels_common_count

* wip health variables lookup without indexes

* Improve rrdlabels_common_count by reusing rrdlabels_find_label_with_key_unsafe with an additional parameter

* working variables with runtime lookup

* working variables with runtime lookup

* delete rrddimvar and rrdfamily index

* remove rrdsetvar; now all variables are in RRDVARs inside hosts and charts

* added /api/v1/variable that resolves a variable the same way alerts do

* remove rrdcalc from eval

* remove debug code

* remove duplicate assignment

* Fix memory leak

* all alert variables are now handled by alert_variable_lookup() and EVAL is now independent of alerts

* hide all internal structures of EVAL

* Enable -Wformat flag

Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>

* Adjust binding for calculation, warning, critical

* Remove unused macro

* Update config hash id

* use the right info and summary in alerts log

* use synchronous queries for alerts

* Handle cases when config_hash_id is missing from health_log

* remove deadlock from health worker

* parsing to json payload for health alert prototypes

* cleaner parsing and avoiding memory leaks in case of duplicate members in json

* fix left-over rename of function

* Keep original lookup field to send to the cloud
Cleanup / rename function to store config
Remove unused DEFINEs, functions

* Use ac->lookup

* link jobs to the host when the template is registered; do not accept running a function without a host

* full dyncfg support for health alerts, except action TEST

* working dyncfg additions, updates, removals

* fixed missing source, wrong status updates

* add alerts by type, component, classification, recipient and module at the /api/v2/alerts endpoint

* fix dyncfg unittest

* rename functions

* generalize the json-c parser macros and move them to libnetdata

* report progress when enabling and disabling dyncfg templates

* moved rrdcalc and rrdvar to health

* update alarms

* added schema for alerts; separated alert_action_options from rrdr_options; restructured the json payload for alerts

* enable parsed json alerts; allow sending back accepted but disabled

* added format_version for alerts payload; enables/disables status now is also inheritted by the status of the rules; fixed variable names in json output

* remove the RRDHOST pointer from DYNCFG

* Fix command field submitted to the cloud

* do not send updates to creation requests, for DYNCFG jobs

---------

Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Co-authored-by: Tasos Katsoulas <tasos@netdata.cloud>
Co-authored-by: ilyam8 <ilya@netdata.cloud>
2024-01-23 20:20:41 +02:00

539 lines
17 KiB
C

// SPDX-License-Identifier: GPL-3.0-or-later
#include "database/rrd.h"
#include "health_internals.h"
// ----------------------------------------------------------------------------
// RRDCALC helpers
void rrdcalc_flags_to_json_array(BUFFER *wb, const char *key, RRDCALC_FLAGS flags) {
buffer_json_member_add_array(wb, key);
if(flags & RRDCALC_FLAG_DB_ERROR)
buffer_json_add_array_item_string(wb, "DB_ERROR");
if(flags & RRDCALC_FLAG_DB_NAN)
buffer_json_add_array_item_string(wb, "DB_NAN");
if(flags & RRDCALC_FLAG_CALC_ERROR)
buffer_json_add_array_item_string(wb, "CALC_ERROR");
if(flags & RRDCALC_FLAG_WARN_ERROR)
buffer_json_add_array_item_string(wb, "WARN_ERROR");
if(flags & RRDCALC_FLAG_CRIT_ERROR)
buffer_json_add_array_item_string(wb, "CRIT_ERROR");
if(flags & RRDCALC_FLAG_RUNNABLE)
buffer_json_add_array_item_string(wb, "RUNNABLE");
if(flags & RRDCALC_FLAG_DISABLED)
buffer_json_add_array_item_string(wb, "DISABLED");
if(flags & RRDCALC_FLAG_SILENCED)
buffer_json_add_array_item_string(wb, "SILENCED");
if(flags & RRDCALC_FLAG_RUN_ONCE)
buffer_json_add_array_item_string(wb, "RUN_ONCE");
buffer_json_array_close(wb);
}
inline const char *rrdcalc_status2string(RRDCALC_STATUS status) {
switch(status) {
case RRDCALC_STATUS_REMOVED:
return "REMOVED";
case RRDCALC_STATUS_UNDEFINED:
return "UNDEFINED";
case RRDCALC_STATUS_UNINITIALIZED:
return "UNINITIALIZED";
case RRDCALC_STATUS_CLEAR:
return "CLEAR";
case RRDCALC_STATUS_RAISED:
return "RAISED";
case RRDCALC_STATUS_WARNING:
return "WARNING";
case RRDCALC_STATUS_CRITICAL:
return "CRITICAL";
default:
netdata_log_error("Unknown alarm status %d", status);
return "UNKNOWN";
}
}
uint32_t rrdcalc_get_unique_id(RRDHOST *host, STRING *chart, STRING *name, uint32_t *next_event_id, uuid_t *config_hash_id) {
rw_spinlock_read_lock(&host->health_log.spinlock);
// re-use old IDs, by looking them up in the alarm log
ALARM_ENTRY *ae = NULL;
for(ae = host->health_log.alarms; ae ;ae = ae->next) {
if(unlikely(name == ae->name && chart == ae->chart && !uuid_memcmp(&ae->config_hash_id, config_hash_id))) {
if(next_event_id) *next_event_id = ae->alarm_event_id + 1;
break;
}
}
uint32_t alarm_id;
if(ae)
alarm_id = ae->alarm_id;
else {
alarm_id = sql_get_alarm_id(host, chart, name, next_event_id);
if (!alarm_id) {
if (unlikely(!host->health_log.next_alarm_id))
host->health_log.next_alarm_id = (uint32_t)now_realtime_sec();
alarm_id = host->health_log.next_alarm_id++;
}
}
rw_spinlock_read_unlock(&host->health_log.spinlock);
return alarm_id;
}
// ----------------------------------------------------------------------------
// RRDCALC replacing info/summary text variables with RRDSET labels
static STRING *rrdcalc_replace_variables_with_rrdset_labels(const char *line, RRDCALC *rc) {
if (!line || !*line)
return NULL;
size_t pos = 0;
char *temp = strdupz(line);
char var[RRDCALC_VAR_MAX];
char *m, *lbl_value = NULL;
while ((m = strchr(temp + pos, '$')) && *(m+1) == '{') {
int i = 0;
char *e = m;
while (*e) {
var[i++] = *e;
if (*e == '}' || i == RRDCALC_VAR_MAX - 1)
break;
e++;
}
var[i] = '\0';
pos = m - temp + 1;
if (!strcmp(var, RRDCALC_VAR_FAMILY)) {
char *buf = find_and_replace(temp, var, (rc->rrdset && rc->rrdset->family) ? rrdset_family(rc->rrdset) : "", m);
freez(temp);
temp = buf;
}
else if (!strncmp(var, RRDCALC_VAR_LABEL, RRDCALC_VAR_LABEL_LEN)) {
char label_val[RRDCALC_VAR_MAX + RRDCALC_VAR_LABEL_LEN + 1] = { 0 };
strcpy(label_val, var+RRDCALC_VAR_LABEL_LEN);
label_val[i - RRDCALC_VAR_LABEL_LEN - 1] = '\0';
if(likely(rc->rrdset && rc->rrdset->rrdlabels)) {
lbl_value = NULL;
rrdlabels_get_value_strdup_or_null(rc->rrdset->rrdlabels, &lbl_value, label_val);
if (lbl_value) {
char *buf = find_and_replace(temp, var, lbl_value, m);
freez(temp);
temp = buf;
freez(lbl_value);
}
}
}
}
STRING *ret = string_strdupz(temp);
freez(temp);
return ret;
}
void rrdcalc_update_info_using_rrdset_labels(RRDCALC *rc) {
if(rc->rrdset && rc->rrdset->rrdlabels) {
size_t labels_version = rrdlabels_version(rc->rrdset->rrdlabels);
if (rc->labels_version != labels_version) {
STRING *old;
old = rc->info;
rc->info = rrdcalc_replace_variables_with_rrdset_labels(string2str(rc->config.info), rc);
string_freez(old);
old = rc->summary;
rc->summary = rrdcalc_replace_variables_with_rrdset_labels(string2str(rc->config.summary), rc);
string_freez(old);
rc->labels_version = labels_version;
}
}
if(!rc->summary)
rc->summary = string_dup(rc->config.summary);
if(!rc->info)
rc->info = string_dup(rc->config.info);
}
// ----------------------------------------------------------------------------
// RRDCALC index management for RRDSET
// the dictionary requires a unique key for every item
// we use {chart id}.{alert name} for both the RRDHOST and RRDSET alert indexes.
#define RRDCALC_MAX_KEY_SIZE 1024
static size_t rrdcalc_key(char *dst, size_t dst_len, const char *chart, const char *alert) {
return snprintfz(dst, dst_len, "%s,on[%s]", alert, chart);
}
const RRDCALC_ACQUIRED *rrdcalc_from_rrdset_get(RRDSET *st, const char *alert_name) {
char key[RRDCALC_MAX_KEY_SIZE + 1];
size_t key_len = rrdcalc_key(key, RRDCALC_MAX_KEY_SIZE, rrdset_id(st), alert_name);
const RRDCALC_ACQUIRED *rca = (const RRDCALC_ACQUIRED *)dictionary_get_and_acquire_item_advanced(st->rrdhost->rrdcalc_root_index, key, (ssize_t)key_len);
if(!rca) {
key_len = rrdcalc_key(key, RRDCALC_MAX_KEY_SIZE, rrdset_name(st), alert_name);
rca = (const RRDCALC_ACQUIRED *)dictionary_get_and_acquire_item_advanced(st->rrdhost->rrdcalc_root_index, key, (ssize_t)key_len);
}
return rca;
}
void rrdcalc_from_rrdset_release(RRDSET *st, const RRDCALC_ACQUIRED *rca) {
if(!rca) return;
dictionary_acquired_item_release(st->rrdhost->rrdcalc_root_index, (const DICTIONARY_ITEM *)rca);
}
RRDCALC *rrdcalc_acquired_to_rrdcalc(const RRDCALC_ACQUIRED *rca) {
if(rca)
return dictionary_acquired_item_value((const DICTIONARY_ITEM *)rca);
return NULL;
}
// ----------------------------------------------------------------------------
// RRDCALC managing the linking with RRDSET
static void rrdcalc_link_to_rrdset(RRDCALC *rc) {
RRDSET *st = rc->rrdset;
RRDHOST *host = st->rrdhost;
rw_spinlock_write_lock(&st->alerts.spinlock);
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(st->alerts.base, rc, prev, next);
rw_spinlock_write_unlock(&st->alerts.spinlock);
char buf[RRDVAR_MAX_LENGTH + 1];
snprintfz(buf, RRDVAR_MAX_LENGTH, "%s.%s", rrdset_name(st), rrdcalc_name(rc));
STRING *rrdset_name_rrdcalc_name = string_strdupz(buf);
snprintfz(buf, RRDVAR_MAX_LENGTH, "%s.%s", rrdset_id(st), rrdcalc_name(rc));
STRING *rrdset_id_rrdcalc_name = string_strdupz(buf);
string_freez(rrdset_id_rrdcalc_name);
string_freez(rrdset_name_rrdcalc_name);
time_t now = now_realtime_sec();
ALARM_ENTRY *ae = health_create_alarm_entry(
host,
rc,
now,
now - rc->last_status_change,
rc->old_value,
rc->value,
RRDCALC_STATUS_REMOVED,
rc->status,
0,
rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
health_alarm_log_add_entry(host, ae);
rrdset_flag_set(st, RRDSET_FLAG_HAS_RRDCALC_LINKED);
}
static void rrdcalc_unlink_from_rrdset(RRDCALC *rc, bool having_ll_wrlock) {
RRDSET *st = rc->rrdset;
if(!st) {
netdata_log_error(
"Requested to unlink RRDCALC '%s.%s' which is not linked to any RRDSET",
rrdcalc_chart_name(rc), rrdcalc_name(rc));
return;
}
RRDHOST *host = st->rrdhost;
time_t now = now_realtime_sec();
if (likely(rc->status != RRDCALC_STATUS_REMOVED)) {
ALARM_ENTRY *ae = health_create_alarm_entry(
host,
rc,
now,
now - rc->last_status_change,
rc->old_value,
rc->value,
rc->status,
RRDCALC_STATUS_REMOVED,
0,
0);
health_alarm_log_add_entry(host, ae);
}
// unlink it
if(!having_ll_wrlock)
rw_spinlock_write_lock(&st->alerts.spinlock);
DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(st->alerts.base, rc, prev, next);
if(!having_ll_wrlock)
rw_spinlock_write_unlock(&st->alerts.spinlock);
rc->rrdset = NULL;
}
static inline bool rrdcalc_check_if_it_matches_rrdset(RRDCALC *rc, RRDSET *st) {
if ( (rc->chart != st->id)
&& (rc->chart != st->name))
return false;
if (rc->match.module_pattern && !simple_pattern_matches_string(rc->match.module_pattern, st->module_name))
return false;
if (rc->match.plugin_pattern && !simple_pattern_matches_string(rc->match.plugin_pattern, st->module_name))
return false;
if (st->rrdlabels && rc->match.chart_labels_pattern && !rrdlabels_match_simple_pattern_parsed(
st->rrdlabels, rc->match.chart_labels_pattern, '=', NULL))
return false;
return true;
}
// ----------------------------------------------------------------------------
// RRDCALC rrdhost index management - constructor
struct rrdcalc_constructor {
RRDSET *rrdset;
RRD_ALERT_PROTOTYPE *ap;
enum {
RRDCALC_REACT_NONE,
RRDCALC_REACT_NEW,
} react_action;
};
static void rrdcalc_rrdhost_insert_callback(const DICTIONARY_ITEM *item __maybe_unused, void *rrdcalc, void *constructor_data) {
RRDCALC *rc = rrdcalc;
struct rrdcalc_constructor *ctr = constructor_data;
RRDSET *st = ctr->rrdset;
RRDHOST *host = st->rrdhost;
RRD_ALERT_PROTOTYPE *ap = ctr->ap;
rc->key = string_strdupz(dictionary_acquired_item_name(item));
rc->rrdset = st;
rc->chart = string_dup(st->id);
health_prototype_copy_config(&rc->config, &ap->config);
health_prototype_copy_match_without_patterns(&rc->match, &ap->match);
rc->next_event_id = 1;
rc->value = NAN;
rc->old_value = NAN;
rc->last_repeat = 0;
rc->times_repeat = 0;
rc->last_status_change_value = rc->value;
rc->last_status_change = now_realtime_sec();
if(!rc->config.units)
rc->config.units = string_dup(st->units);
if(rc->config.update_every < rc->rrdset->update_every) {
netdata_log_info(
"HEALTH: alert '%s.%s' has update every %d, less than chart update every %d. "
"Setting alarm update frequency to %d.",
string2str(st->id), string2str(rc->config.name),
rc->config.update_every, rc->rrdset->update_every, rc->rrdset->update_every);
rc->config.update_every = st->update_every;
}
rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->config.name, &rc->next_event_id, &rc->config.hash_id);
if(!isnan(rc->config.green) && isnan(st->green))
st->green = rc->config.green;
if(!isnan(rc->config.red) && isnan(st->red))
st->red = rc->config.red;
expression_set_variable_lookup_callback(rc->config.calculation, alert_variable_lookup, rc);
expression_set_variable_lookup_callback(rc->config.warning, alert_variable_lookup, rc);
expression_set_variable_lookup_callback(rc->config.critical, alert_variable_lookup, rc);
rrdcalc_update_info_using_rrdset_labels(rc);
ctr->react_action = RRDCALC_REACT_NEW;
}
static bool rrdcalc_rrdhost_conflict_callback(const DICTIONARY_ITEM *item __maybe_unused, void *rrdcalc __maybe_unused, void *rrdcalc_new __maybe_unused, void *constructor_data) {
struct rrdcalc_constructor *ctr = constructor_data;
ctr->react_action = RRDCALC_REACT_NONE;
return false;
}
static void rrdcalc_rrdhost_react_callback(const DICTIONARY_ITEM *item __maybe_unused, void *rrdcalc, void *constructor_data) {
RRDCALC *rc = rrdcalc;
struct rrdcalc_constructor *ctr = constructor_data;
if(ctr->react_action == RRDCALC_REACT_NEW)
rrdcalc_link_to_rrdset(rc);
}
// ----------------------------------------------------------------------------
// RRDCALC rrdhost index management - destructor
static void rrdcalc_free_internals(RRDCALC *rc) {
if(unlikely(!rc)) return;
rrd_alert_match_cleanup(&rc->match);
rrd_alert_config_cleanup(&rc->config);
string_freez(rc->key);
string_freez(rc->chart);
string_freez(rc->info);
string_freez(rc->summary);
}
static void rrdcalc_rrdhost_delete_callback(const DICTIONARY_ITEM *item __maybe_unused, void *rrdcalc, void *rrdhost __maybe_unused) {
RRDCALC *rc = rrdcalc;
//RRDHOST *host = rrdhost;
if(unlikely(rc->rrdset))
rrdcalc_unlink_from_rrdset(rc, false);
// any destruction actions that require other locks
// have to be placed in rrdcalc_del(), because the object is actually locked for deletion
rrdcalc_free_internals(rc);
}
// ----------------------------------------------------------------------------
// RRDCALC rrdhost index management - index API
void rrdcalc_rrdhost_index_init(RRDHOST *host) {
if(!host->rrdcalc_root_index) {
host->rrdcalc_root_index = dictionary_create_advanced(DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE,
&dictionary_stats_category_rrdhealth, sizeof(RRDCALC));
dictionary_register_insert_callback(host->rrdcalc_root_index, rrdcalc_rrdhost_insert_callback, NULL);
dictionary_register_conflict_callback(host->rrdcalc_root_index, rrdcalc_rrdhost_conflict_callback, NULL);
dictionary_register_react_callback(host->rrdcalc_root_index, rrdcalc_rrdhost_react_callback, NULL);
dictionary_register_delete_callback(host->rrdcalc_root_index, rrdcalc_rrdhost_delete_callback, host);
}
}
void rrdcalc_rrdhost_index_destroy(RRDHOST *host) {
dictionary_destroy(host->rrdcalc_root_index);
host->rrdcalc_root_index = NULL;
}
bool rrdcalc_add_from_prototype(RRDHOST *host, RRDSET *st, RRD_ALERT_PROTOTYPE *ap) {
char key[RRDCALC_MAX_KEY_SIZE + 1];
size_t key_len = rrdcalc_key(key, RRDCALC_MAX_KEY_SIZE,
string2str(st->id), string2str(ap->config.name));
struct rrdcalc_constructor tmp = {
.ap = ap,
.rrdset = st,
.react_action = RRDCALC_REACT_NONE,
};
bool ret = true;
dictionary_set_advanced(host->rrdcalc_root_index, key, (ssize_t)key_len,
NULL, sizeof(RRDCALC), &tmp);
if(tmp.react_action != RRDCALC_REACT_NEW)
ret = false;
return ret;
}
void rrdcalc_unlink_and_delete(RRDHOST *host, RRDCALC *rc, bool having_ll_wrlock) {
if(rc->rrdset)
rrdcalc_unlink_from_rrdset(rc, having_ll_wrlock);
dictionary_del_advanced(host->rrdcalc_root_index, string2str(rc->key), (ssize_t)string_strlen(rc->key));
}
// ----------------------------------------------------------------------------
// RRDCALC cleanup API functions
void rrdcalc_unlink_and_delete_all_rrdset_alerts(RRDSET *st) {
RRDCALC *rc, *last = NULL;
rw_spinlock_write_lock(&st->alerts.spinlock);
while((rc = st->alerts.base)) {
if(last == rc) {
netdata_log_error("RRDCALC: malformed list of alerts linked to chart - cannot cleanup - giving up.");
break;
}
last = rc;
rrdcalc_unlink_and_delete(st->rrdhost, rc, true);
}
rw_spinlock_write_unlock(&st->alerts.spinlock);
}
void rrdcalc_delete_all(RRDHOST *host) {
dictionary_flush(host->rrdcalc_root_index);
}
void rrd_alert_match_cleanup(struct rrd_alert_match *am) {
if(am->is_template)
string_freez(am->on.context);
else
string_freez(am->on.chart);
string_freez(am->os);
simple_pattern_free(am->os_pattern);
string_freez(am->host);
simple_pattern_free(am->host_pattern);
string_freez(am->plugin);
simple_pattern_free(am->plugin_pattern);
string_freez(am->module);
simple_pattern_free(am->module_pattern);
string_freez(am->charts);
simple_pattern_free(am->charts_pattern);
string_freez(am->host_labels);
simple_pattern_free(am->host_labels_pattern);
string_freez(am->chart_labels);
simple_pattern_free(am->chart_labels_pattern);
}
void rrd_alert_config_cleanup(struct rrd_alert_config *ac) {
string_freez(ac->name);
string_freez(ac->exec);
string_freez(ac->recipient);
string_freez(ac->classification);
string_freez(ac->component);
string_freez(ac->type);
string_freez(ac->source);
string_freez(ac->units);
string_freez(ac->summary);
string_freez(ac->info);
string_freez(ac->lookup);
string_freez(ac->dimensions);
expression_free(ac->calculation);
expression_free(ac->warning);
expression_free(ac->critical);
}