mirror of
https://github.com/netdata/netdata.git
synced 2025-04-15 10:04:15 +00:00

* cleanup alerts * fix references * fix references * fix references * load alerts once and apply them to each node * simplify health_create_alarm_entry() * Compile without warnings with compiler flags: -Wall -Wextra -Wformat=2 -Wshadow -Wno-format-nonliteral -Winit-self * code re-organization and cleanup * generate patterns when applying prototypes; give unique dyncfg names to all alerts * eval expressions keep the source and the parsed_as as STRING pointers * renamed host to node in dyncfg ids * renamed host to node in dyncfg ids * add all cloud roles to the list of parsed X-Netdata-Role header and also default to member access level * working functionality * code re-organization: moved health event-loop to a new file, moved health globals to health.c * rrdcalctemplate is removed; alert_cfg is removed; foreach dimension is removed; RRDCALCs are now instanciated only when they are linked to RRDSETs * dyncfg alert prototypes initialization for alerts * health dyncfg split to separate file * cleanup not-needed code * normalize matches between parsing and json * also detect !* for disabled alerts * dyncfg capability disabled * Store alert config part1 * Add rrdlabels_common_count * wip health variables lookup without indexes * Improve rrdlabels_common_count by reusing rrdlabels_find_label_with_key_unsafe with an additional parameter * working variables with runtime lookup * working variables with runtime lookup * delete rrddimvar and rrdfamily index * remove rrdsetvar; now all variables are in RRDVARs inside hosts and charts * added /api/v1/variable that resolves a variable the same way alerts do * remove rrdcalc from eval * remove debug code * remove duplicate assignment * Fix memory leak * all alert variables are now handled by alert_variable_lookup() and EVAL is now independent of alerts * hide all internal structures of EVAL * Enable -Wformat flag Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * Adjust binding for calculation, warning, critical * Remove 
unused macro * Update config hash id * use the right info and summary in alerts log * use synchronous queries for alerts * Handle cases when config_hash_id is missing from health_log * remove deadlock from health worker * parsing to json payload for health alert prototypes * cleaner parsing and avoiding memory leaks in case of duplicate members in json * fix left-over rename of function * Keep original lookup field to send to the cloud Cleanup / rename function to store config Remove unused DEFINEs, functions * Use ac->lookup * link jobs to the host when the template is registered; do not accept running a function without a host * full dyncfg support for health alerts, except action TEST * working dyncfg additions, updates, removals * fixed missing source, wrong status updates * add alerts by type, component, classification, recipient and module at the /api/v2/alerts endpoint * fix dyncfg unittest * rename functions * generalize the json-c parser macros and move them to libnetdata * report progress when enabling and disabling dyncfg templates * moved rrdcalc and rrdvar to health * update alarms * added schema for alerts; separated alert_action_options from rrdr_options; restructured the json payload for alerts * enable parsed json alerts; allow sending back accepted but disabled * added format_version for alerts payload; enables/disables status now is also inheritted by the status of the rules; fixed variable names in json output * remove the RRDHOST pointer from DYNCFG * Fix command field submitted to the cloud * do not send updates to creation requests, for DYNCFG jobs --------- Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com> Co-authored-by: Tasos Katsoulas <tasos@netdata.cloud> Co-authored-by: ilyam8 <ilya@netdata.cloud>
232 lines
8.2 KiB
C
232 lines
8.2 KiB
C
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
#include "health.h"
|
|
|
|
// ----------------------------------------------------------------------------
|
|
|
|
/*
 * Save one alarm log entry of a host.
 *
 * Thin forwarder: both arguments are handed, unchanged, to
 * sql_health_alarm_log_save().
 */
inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae)
{
    sql_health_alarm_log_save(host, ae);
}
|
|
|
|
|
|
/*
 * Log an alert status transition to the NDLS_HEALTH log source.
 *
 * The alarm entry's fields (ids, names, classification, values, statuses,
 * duration, ...) are declared as structured log fields and pushed with
 * ND_LOG_STACK_PUSH() before a human readable message is emitted with
 * netdata_logger().
 *
 * host     - the node the alert belongs to; its hostname is logged
 * ae       - the alarm entry describing the transition
 * line,
 * file,
 * function - caller's source location, forwarded as-is to netdata_logger()
 *
 * The log priority is derived from the new status:
 *   UNDEFINED                        -> NOTICE when the old status was CLEAR
 *                                       or above, DEBUG otherwise
 *   UNINITIALIZED, REMOVED, unknown  -> DEBUG
 *   CLEAR                            -> INFO
 *   WARNING                          -> WARNING when raised from a lower
 *                                       status, otherwise the initial INFO
 *   CRITICAL                         -> CRIT when raised from a lower
 *                                       status, otherwise the initial INFO
 */
void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function) {
    ND_LOG_STACK lgs[] = {
            ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &health_alert_transition_msgid),
            ND_LOG_FIELD_STR(NDF_NIDL_NODE, host->hostname),
            ND_LOG_FIELD_STR(NDF_NIDL_INSTANCE, ae->chart_name),
            ND_LOG_FIELD_STR(NDF_NIDL_CONTEXT, ae->chart_context),
            ND_LOG_FIELD_U64(NDF_ALERT_ID, ae->alarm_id),
            ND_LOG_FIELD_U64(NDF_ALERT_UNIQUE_ID, ae->unique_id),
            ND_LOG_FIELD_U64(NDF_ALERT_EVENT_ID, ae->alarm_event_id),
            ND_LOG_FIELD_UUID(NDF_ALERT_CONFIG_HASH, &ae->config_hash_id),
            ND_LOG_FIELD_UUID(NDF_ALERT_TRANSITION_ID, &ae->transition_id),
            ND_LOG_FIELD_STR(NDF_ALERT_NAME, ae->name),
            ND_LOG_FIELD_STR(NDF_ALERT_CLASS, ae->classification),
            ND_LOG_FIELD_STR(NDF_ALERT_COMPONENT, ae->component),
            ND_LOG_FIELD_STR(NDF_ALERT_TYPE, ae->type),
            ND_LOG_FIELD_STR(NDF_ALERT_EXEC, ae->exec),
            ND_LOG_FIELD_STR(NDF_ALERT_RECIPIENT, ae->recipient),
            // the source field must carry the alert's source, not its exec command
            ND_LOG_FIELD_STR(NDF_ALERT_SOURCE, ae->source),
            ND_LOG_FIELD_STR(NDF_ALERT_UNITS, ae->units),
            ND_LOG_FIELD_STR(NDF_ALERT_SUMMARY, ae->summary),
            ND_LOG_FIELD_STR(NDF_ALERT_INFO, ae->info),
            ND_LOG_FIELD_DBL(NDF_ALERT_VALUE, ae->new_value),
            ND_LOG_FIELD_DBL(NDF_ALERT_VALUE_OLD, ae->old_value),
            ND_LOG_FIELD_TXT(NDF_ALERT_STATUS, rrdcalc_status2string(ae->new_status)),
            ND_LOG_FIELD_TXT(NDF_ALERT_STATUS_OLD, rrdcalc_status2string(ae->old_status)),
            ND_LOG_FIELD_I64(NDF_ALERT_DURATION, ae->duration),
            ND_LOG_FIELD_I64(NDF_RESPONSE_CODE, ae->exec_code),
            // delay_up_to_timestamp is kept in seconds; the field is in microseconds
            ND_LOG_FIELD_U64(NDF_ALERT_NOTIFICATION_REALTIME_USEC, ae->delay_up_to_timestamp * USEC_PER_SEC),
            ND_LOG_FIELD_END(),
    };
    ND_LOG_STACK_PUSH(lgs);

    // NOTE(review): presumably cleared so that no stale errno text is
    // appended to the log line - confirm against netdata_logger()
    errno = 0;

    // stays INFO for WARNING/CRITICAL entries that are not an escalation
    ND_LOG_FIELD_PRIORITY priority = NDLP_INFO;

    switch(ae->new_status) {
        case RRDCALC_STATUS_UNDEFINED:
            if(ae->old_status >= RRDCALC_STATUS_CLEAR)
                priority = NDLP_NOTICE;
            else
                priority = NDLP_DEBUG;
            break;

        default:
        case RRDCALC_STATUS_UNINITIALIZED:
        case RRDCALC_STATUS_REMOVED:
            priority = NDLP_DEBUG;
            break;

        case RRDCALC_STATUS_CLEAR:
            priority = NDLP_INFO;
            break;

        case RRDCALC_STATUS_WARNING:
            if(ae->old_status < RRDCALC_STATUS_WARNING)
                priority = NDLP_WARNING;
            break;

        case RRDCALC_STATUS_CRITICAL:
            if(ae->old_status < RRDCALC_STATUS_CRITICAL)
                priority = NDLP_CRIT;
            break;
    }

    netdata_logger(NDLS_HEALTH, priority, file, function, line,
           "ALERT '%s' of instance '%s' on node '%s', transitioned from %s to %s",
           string2str(ae->name), string2str(ae->chart), string2str(host->hostname),
           rrdcalc_status2string(ae->old_status), rrdcalc_status2string(ae->new_status)
           );
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// health alarm log management
|
|
|
|
inline ALARM_ENTRY* health_create_alarm_entry(
|
|
RRDHOST *host,
|
|
RRDCALC *rc,
|
|
time_t when,
|
|
time_t duration,
|
|
NETDATA_DOUBLE old_value,
|
|
NETDATA_DOUBLE new_value,
|
|
RRDCALC_STATUS old_status,
|
|
RRDCALC_STATUS new_status,
|
|
int delay,
|
|
HEALTH_ENTRY_FLAGS flags
|
|
) {
|
|
uint32_t alarm_id = rc->id;
|
|
uint32_t alarm_event_id = rc->next_event_id++;
|
|
STRING *name = rc->config.name;
|
|
STRING *chart = rc->rrdset->id;
|
|
STRING *chart_context = rc->rrdset->context;
|
|
STRING *chart_name = rc->rrdset->name;
|
|
STRING *class = rc->config.classification;
|
|
STRING *component = rc->config.component;
|
|
STRING *type = rc->config.type;
|
|
STRING *exec = rc->config.exec;
|
|
STRING *recipient = rc->config.recipient;
|
|
STRING *source = rc->config.source;
|
|
STRING *units = rc->config.units;
|
|
STRING *summary = rc->summary;
|
|
STRING *info = rc->info;
|
|
|
|
netdata_log_debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id);
|
|
|
|
ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY));
|
|
ae->name = string_dup(name);
|
|
ae->chart = string_dup(chart);
|
|
ae->chart_context = string_dup(chart_context);
|
|
ae->chart_name = string_dup(chart_name);
|
|
|
|
uuid_copy(ae->config_hash_id, rc->config.hash_id);
|
|
|
|
uuid_generate_random(ae->transition_id);
|
|
ae->global_id = now_realtime_usec();
|
|
|
|
ae->classification = string_dup(class);
|
|
ae->component = string_dup(component);
|
|
ae->type = string_dup(type);
|
|
ae->exec = string_dup(exec);
|
|
ae->recipient = string_dup(recipient);
|
|
ae->source = string_dup(source);
|
|
ae->units = string_dup(units);
|
|
|
|
ae->unique_id = host->health_log.next_log_id++;
|
|
ae->alarm_id = alarm_id;
|
|
ae->alarm_event_id = alarm_event_id;
|
|
ae->when = when;
|
|
ae->old_value = old_value;
|
|
ae->new_value = new_value;
|
|
|
|
char value_string[100 + 1];
|
|
ae->old_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae_units(ae), -1));
|
|
ae->new_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae_units(ae), -1));
|
|
|
|
ae->summary = string_dup(summary);
|
|
ae->info = string_dup(info);
|
|
ae->old_status = old_status;
|
|
ae->new_status = new_status;
|
|
ae->duration = duration;
|
|
ae->delay = delay;
|
|
ae->delay_up_to_timestamp = when + delay;
|
|
ae->flags |= flags;
|
|
|
|
ae->last_repeat = 0;
|
|
|
|
if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
|
|
ae->non_clear_duration += ae->duration;
|
|
|
|
return ae;
|
|
}
|
|
|
|
inline void health_alarm_log_add_entry(
|
|
RRDHOST *host,
|
|
ALARM_ENTRY *ae
|
|
) {
|
|
netdata_log_debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id);
|
|
|
|
__atomic_add_fetch(&host->health_transitions, 1, __ATOMIC_RELAXED);
|
|
|
|
// link it
|
|
rw_spinlock_write_lock(&host->health_log.spinlock);
|
|
ae->next = host->health_log.alarms;
|
|
host->health_log.alarms = ae;
|
|
host->health_log.count++;
|
|
rw_spinlock_write_unlock(&host->health_log.spinlock);
|
|
|
|
// match previous alarms
|
|
rw_spinlock_read_lock(&host->health_log.spinlock);
|
|
ALARM_ENTRY *t;
|
|
for(t = host->health_log.alarms ; t ; t = t->next) {
|
|
if(t != ae && t->alarm_id == ae->alarm_id) {
|
|
if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) {
|
|
t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
|
|
t->updated_by_id = ae->unique_id;
|
|
ae->updates_id = t->unique_id;
|
|
|
|
if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) &&
|
|
(t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL))
|
|
ae->non_clear_duration += t->non_clear_duration;
|
|
|
|
health_alarm_log_save(host, t);
|
|
}
|
|
|
|
// no need to continue
|
|
break;
|
|
}
|
|
}
|
|
rw_spinlock_read_unlock(&host->health_log.spinlock);
|
|
|
|
health_alarm_log_save(host, ae);
|
|
}
|
|
|
|
/*
 * Release one alarm entry: every STRING it holds, then the entry itself.
 *
 * As the name says, no checks are made and the entry is NOT unlinked from
 * any list - the caller must have detached it (or be tearing down the
 * whole list) before calling this.
 *
 * chart_name and summary are released too: health_create_alarm_entry()
 * takes references on them with string_dup(), so skipping them here would
 * leak one reference per freed entry.
 */
inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) {
    string_freez(ae->name);
    string_freez(ae->chart);
    string_freez(ae->chart_context);
    string_freez(ae->chart_name);
    string_freez(ae->classification);
    string_freez(ae->component);
    string_freez(ae->type);
    string_freez(ae->exec);
    string_freez(ae->recipient);
    string_freez(ae->source);
    string_freez(ae->units);
    string_freez(ae->summary);
    string_freez(ae->info);
    string_freez(ae->old_value_string);
    string_freez(ae->new_value_string);
    freez(ae);
}
|
|
|
|
/*
 * Free every entry of the host's alarm log.
 *
 * The whole operation runs under the health log write lock: the list head
 * is advanced past each entry before that entry is released, so the list
 * never points to freed memory. On return host->health_log.alarms is NULL.
 */
inline void health_alarm_log_free(RRDHOST *host) {
    rw_spinlock_write_lock(&host->health_log.spinlock);

    for(ALARM_ENTRY *head = host->health_log.alarms; head ; head = host->health_log.alarms) {
        // detach first, then release
        host->health_log.alarms = head->next;
        health_alarm_log_free_one_nochecks_nounlink(head);
    }

    rw_spinlock_write_unlock(&host->health_log.spinlock);
}
|