mirror of
https://github.com/netdata/netdata.git
synced 2025-04-14 17:48:37 +00:00

* cleanup alerts * fix references * fix references * fix references * load alerts once and apply them to each node * simplify health_create_alarm_entry() * Compile without warnings with compiler flags: -Wall -Wextra -Wformat=2 -Wshadow -Wno-format-nonliteral -Winit-self * code re-organization and cleanup * generate patterns when applying prototypes; give unique dyncfg names to all alerts * eval expressions keep the source and the parsed_as as STRING pointers * renamed host to node in dyncfg ids * renamed host to node in dyncfg ids * add all cloud roles to the list of parsed X-Netdata-Role header and also default to member access level * working functionality * code re-organization: moved health event-loop to a new file, moved health globals to health.c * rrdcalctemplate is removed; alert_cfg is removed; foreach dimension is removed; RRDCALCs are now instanciated only when they are linked to RRDSETs * dyncfg alert prototypes initialization for alerts * health dyncfg split to separate file * cleanup not-needed code * normalize matches between parsing and json * also detect !* for disabled alerts * dyncfg capability disabled * Store alert config part1 * Add rrdlabels_common_count * wip health variables lookup without indexes * Improve rrdlabels_common_count by reusing rrdlabels_find_label_with_key_unsafe with an additional parameter * working variables with runtime lookup * working variables with runtime lookup * delete rrddimvar and rrdfamily index * remove rrdsetvar; now all variables are in RRDVARs inside hosts and charts * added /api/v1/variable that resolves a variable the same way alerts do * remove rrdcalc from eval * remove debug code * remove duplicate assignment * Fix memory leak * all alert variables are now handled by alert_variable_lookup() and EVAL is now independent of alerts * hide all internal structures of EVAL * Enable -Wformat flag Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> * Adjust binding for calculation, warning, critical * Remove 
unused macro * Update config hash id * use the right info and summary in alerts log * use synchronous queries for alerts * Handle cases when config_hash_id is missing from health_log * remove deadlock from health worker * parsing to json payload for health alert prototypes * cleaner parsing and avoiding memory leaks in case of duplicate members in json * fix left-over rename of function * Keep original lookup field to send to the cloud Cleanup / rename function to store config Remove unused DEFINEs, functions * Use ac->lookup * link jobs to the host when the template is registered; do not accept running a function without a host * full dyncfg support for health alerts, except action TEST * working dyncfg additions, updates, removals * fixed missing source, wrong status updates * add alerts by type, component, classification, recipient and module at the /api/v2/alerts endpoint * fix dyncfg unittest * rename functions * generalize the json-c parser macros and move them to libnetdata * report progress when enabling and disabling dyncfg templates * moved rrdcalc and rrdvar to health * update alarms * added schema for alerts; separated alert_action_options from rrdr_options; restructured the json payload for alerts * enable parsed json alerts; allow sending back accepted but disabled * added format_version for alerts payload; enables/disables status now is also inheritted by the status of the rules; fixed variable names in json output * remove the RRDHOST pointer from DYNCFG * Fix command field submitted to the cloud * do not send updates to creation requests, for DYNCFG jobs --------- Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud> Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com> Co-authored-by: Tasos Katsoulas <tasos@netdata.cloud> Co-authored-by: ilyam8 <ilya@netdata.cloud>
286 lines
12 KiB
C
286 lines
12 KiB
C
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
#include "health.h"
|
|
|
|
// Appends one `"label":"value"` JSON member to wb, wrapped in prefix and suffix.
//
//  wb     - output buffer
//  prefix - text written before the member (typically indentation)
//  label  - member name, written verbatim between double quotes
//  value  - member value; when NULL or empty, `null` is written instead of a string
//  suffix - text written after the member (typically ",\n")
//
// The value is passed through buffer_strcat_htmlescape() before being written;
// the label is not escaped.
void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
    // nothing to quote: emit an explicit null and stop
    if(!value || !*value) {
        buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
        return;
    }

    buffer_sprintf(wb, "%s\"%s\":\"", prefix, label);
    buffer_strcat_htmlescape(wb, value);
    buffer_strcat(wb, "\"");
    buffer_strcat(wb, suffix);
}
|
|
|
|
// Appends the short JSON object of one alert to wb: id, value, last_updated
// and status, under the key "<chart name>.<alert name>".
//
// Nothing is written after the closing brace, so the caller is responsible
// for any separator between consecutive objects.
static inline void health_rrdcalc_values2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) {
    // host is not needed here; the parameter exists so this function can be
    // passed as the callback of health_alarms2json_fill_alarms()
    (void)host;

    // object key and numeric id
    buffer_sprintf(wb,
                   "\t\t\"%s.%s\": {\n"
                   "\t\t\t\"id\": %lu,\n",
                   rrdcalc_chart_name(rc), rrdcalc_name(rc),
                   (unsigned long)rc->id);

    // the value goes through the dedicated double printer
    buffer_strcat(wb, "\t\t\t\"value\":");
    buffer_print_netdata_double(wb, rc->value);

    // remaining members and the closing brace
    buffer_sprintf(wb,
                   ",\n"
                   "\t\t\t\"last_updated\":%lu,\n"
                   "\t\t\t\"status\": \"%s\"\n"
                   "\t\t}",
                   (unsigned long)rc->last_updated,
                   rrdcalc_status2string(rc->status));
}
|
|
|
|
// Appends the full JSON object of one alert to wb, under the key
// "<chart name>.<alert name>".
//
//  host - used only for the default exec and recipient, when the alert
//         configuration does not set its own
//  wb   - output buffer
//  rc   - the alert to describe
//
// Nothing is written after the closing brace, so the caller is responsible
// for any separator between consecutive objects.
//
// NOTE(review): most string members are written with a plain %s between
// double quotes, without any escaping - confirm these values can never
// contain characters that would break the JSON output.
static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC *rc) {
    // human readable rendering of the current value, including its units
    char formatted_value[100 + 1];
    format_value_and_unit(formatted_value, sizeof(formatted_value) - 1, rc->value, rrdcalc_units(rc), -1);

    // textual form of the configuration hash
    char config_hash[GUID_LEN + 1];
    uuid_unparse_lower(rc->config.hash_id, config_hash);

    const char *chart_name = rrdcalc_chart_name(rc);
    const char *alert_name = rrdcalc_name(rc);

    // unset classification fields are reported as "Unknown"
    const char *classification = rc->config.classification ? rrdcalc_classification(rc) : "Unknown";
    const char *component      = rc->config.component      ? rrdcalc_component(rc)      : "Unknown";
    const char *alert_type     = rc->config.type           ? rrdcalc_type(rc)           : "Unknown";

    // alerts without their own exec / recipient fall back to the host defaults
    const char *exec_command = rc->config.exec
                               ? rrdcalc_exec(rc)
                               : string2str(host->health.health_default_exec);
    const char *recipient    = rc->config.recipient
                               ? rrdcalc_recipient(rc)
                               : string2str(host->health.health_default_recipient);

    // --- identity and classification ---
    buffer_sprintf(wb,
                   "\t\t\"%s.%s\": {\n"
                   "\t\t\t\"id\": %lu,\n"
                   "\t\t\t\"config_hash_id\": \"%s\",\n"
                   "\t\t\t\"name\": \"%s\",\n"
                   "\t\t\t\"chart\": \"%s\",\n"
                   "\t\t\t\"class\": \"%s\",\n"
                   "\t\t\t\"component\": \"%s\",\n"
                   "\t\t\t\"type\": \"%s\",\n",
                   chart_name, alert_name,
                   (unsigned long)rc->id,
                   config_hash,
                   alert_name,
                   chart_name,
                   classification,
                   component,
                   alert_type);

    // --- run state, notification settings, descriptions and status ---
    buffer_sprintf(wb,
                   "\t\t\t\"active\": %s,\n"
                   "\t\t\t\"disabled\": %s,\n"
                   "\t\t\t\"silenced\": %s,\n"
                   "\t\t\t\"exec\": \"%s\",\n"
                   "\t\t\t\"recipient\": \"%s\",\n"
                   "\t\t\t\"source\": \"%s\",\n"
                   "\t\t\t\"units\": \"%s\",\n"
                   "\t\t\t\"summary\": \"%s\",\n"
                   "\t\t\t\"info\": \"%s\",\n"
                   "\t\t\t\"status\": \"%s\",\n",
                   (rc->rrdset) ? "true" : "false",
                   (rc->run_flags & RRDCALC_FLAG_DISABLED) ? "true" : "false",
                   (rc->run_flags & RRDCALC_FLAG_SILENCED) ? "true" : "false",
                   exec_command,
                   recipient,
                   rrdcalc_source(rc),
                   rrdcalc_units(rc),
                   string2str(rc->summary),
                   string2str(rc->info),
                   rrdcalc_status2string(rc->status));

    // --- timestamps, update frequency and notification delays ---
    buffer_sprintf(wb,
                   "\t\t\t\"last_status_change\": %lu,\n"
                   "\t\t\t\"last_updated\": %lu,\n"
                   "\t\t\t\"next_update\": %lu,\n"
                   "\t\t\t\"update_every\": %d,\n"
                   "\t\t\t\"delay_up_duration\": %d,\n"
                   "\t\t\t\"delay_down_duration\": %d,\n"
                   "\t\t\t\"delay_max_duration\": %d,\n"
                   "\t\t\t\"delay_multiplier\": %f,\n"
                   "\t\t\t\"delay\": %d,\n"
                   "\t\t\t\"delay_up_to_timestamp\": %lu,\n",
                   (unsigned long)rc->last_status_change,
                   (unsigned long)rc->last_updated,
                   (unsigned long)rc->next_update,
                   rc->config.update_every,
                   rc->config.delay_up_duration,
                   rc->config.delay_down_duration,
                   rc->config.delay_max_duration,
                   rc->config.delay_multiplier,
                   rc->delay_last,
                   (unsigned long)rc->delay_up_to_timestamp);

    // --- repeat settings and the formatted value ---
    // warn_repeat_every, crit_repeat_every and last_repeat are emitted as quoted strings
    buffer_sprintf(wb,
                   "\t\t\t\"warn_repeat_every\": \"%u\",\n"
                   "\t\t\t\"crit_repeat_every\": \"%u\",\n"
                   "\t\t\t\"value_string\": \"%s\",\n"
                   "\t\t\t\"last_repeat\": \"%lu\",\n"
                   "\t\t\t\"times_repeat\": %lu,\n",
                   rc->config.warn_repeat_every,
                   rc->config.crit_repeat_every,
                   formatted_value,
                   (unsigned long)rc->last_repeat,
                   (unsigned long)rc->times_repeat);

    // only present when the option is set
    if(unlikely(rc->config.alert_action_options & ALERT_ACTION_OPTION_NO_CLEAR_NOTIFICATION))
        buffer_strcat(wb, "\t\t\t\"no_clear_notification\": true,\n");

    // --- database lookup details, only for alerts that have one ---
    if(RRDCALC_HAS_DB_LOOKUP(rc)) {
        if(rc->config.dimensions)
            health_string2json(wb, "\t\t\t", "lookup_dimensions", rrdcalc_dimensions(rc), ",\n");

        // the format ends inside the opening quote of "lookup_options";
        // the options text and the closing quote follow below
        buffer_sprintf(wb,
                       "\t\t\t\"db_after\": %lu,\n"
                       "\t\t\t\"db_before\": %lu,\n"
                       "\t\t\t\"lookup_method\": \"%s\",\n"
                       "\t\t\t\"lookup_after\": %d,\n"
                       "\t\t\t\"lookup_before\": %d,\n"
                       "\t\t\t\"lookup_options\": \"",
                       (unsigned long)rc->db_after,
                       (unsigned long)rc->db_before,
                       time_grouping_id2txt(rc->config.group),
                       rc->config.after,
                       rc->config.before);

        buffer_data_options2string(wb, rc->config.options);
        buffer_strcat(wb, "\",\n");
    }

    // --- expressions: both the source text and the parsed form ---
    if(rc->config.calculation) {
        health_string2json(wb, "\t\t\t", "calc", expression_source(rc->config.calculation), ",\n");
        health_string2json(wb, "\t\t\t", "calc_parsed", expression_parsed_as(rc->config.calculation), ",\n");
    }

    if(rc->config.warning) {
        health_string2json(wb, "\t\t\t", "warn", expression_source(rc->config.warning), ",\n");
        health_string2json(wb, "\t\t\t", "warn_parsed", expression_parsed_as(rc->config.warning), ",\n");
    }

    if(rc->config.critical) {
        health_string2json(wb, "\t\t\t", "crit", expression_source(rc->config.critical), ",\n");
        health_string2json(wb, "\t\t\t", "crit_parsed", expression_parsed_as(rc->config.critical), ",\n");
    }

    // --- thresholds and current value; "value" is the last member, so no trailing comma ---
    buffer_strcat(wb, "\t\t\t\"green\":");
    buffer_print_netdata_double(wb, rc->config.green);

    buffer_strcat(wb, ",\n\t\t\t\"red\":");
    buffer_print_netdata_double(wb, rc->config.red);

    buffer_strcat(wb, ",\n\t\t\t\"value\":");
    buffer_print_netdata_double(wb, rc->value);

    buffer_strcat(wb, "\n\t\t}");
}
|
|
|
|
void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* contexts, RRDCALC_STATUS status) {
|
|
RRDCALC *rc;
|
|
int numberOfAlarms = 0;
|
|
char *tok = NULL;
|
|
char *p = NULL;
|
|
|
|
if (contexts) {
|
|
p = (char*)buffer_tostring(contexts);
|
|
while(p && *p && (tok = strsep_skip_consecutive_separators(&p, ", |"))) {
|
|
if(!*tok) continue;
|
|
|
|
STRING *tok_string = string_strdupz(tok);
|
|
|
|
foreach_rrdcalc_in_rrdhost_read(host, rc) {
|
|
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
|
|
continue;
|
|
if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
|
|
continue;
|
|
if(unlikely(rc->rrdset
|
|
&& rc->rrdset->context == tok_string
|
|
&& ((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status)))
|
|
numberOfAlarms++;
|
|
}
|
|
foreach_rrdcalc_in_rrdhost_done(rc);
|
|
|
|
string_freez(tok_string);
|
|
}
|
|
}
|
|
else {
|
|
foreach_rrdcalc_in_rrdhost_read(host, rc) {
|
|
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
|
|
continue;
|
|
if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
|
|
continue;
|
|
if(unlikely((status==RRDCALC_STATUS_RAISED)?(rc->status >= RRDCALC_STATUS_WARNING):rc->status == status))
|
|
numberOfAlarms++;
|
|
}
|
|
foreach_rrdcalc_in_rrdhost_done(rc);
|
|
}
|
|
|
|
buffer_sprintf(wb, "%d", numberOfAlarms);
|
|
}
|
|
|
|
// Walks the alerts of host and calls fp for each one that qualifies, writing
// ",\n" to wb between consecutive entries.
//
//  host - the host whose alerts are listed
//  wb   - output buffer
//  all  - when zero, only alerts in WARNING or CRITICAL status are emitted;
//         otherwise the status is not checked
//  fp   - callback that writes a single alert to wb
//
// Alerts without a chart, with a chart whose last_collected_time is zero, or
// with a chart rejected by rrdset_is_available_for_exporting_and_alarms()
// are always skipped.
static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, void (*fp)(RRDHOST *, BUFFER *, RRDCALC *)) {
    RRDCALC *rc;
    int written = 0;

    foreach_rrdcalc_in_rrdhost_read(host, rc) {
        if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
            continue;

        if(unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
            continue;

        const int raised = (rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL);
        if(likely(!all && !raised))
            continue;

        // separator goes before every entry except the first one
        if(likely(written))
            buffer_strcat(wb, ",\n");

        fp(host, wb, rc);
        written++;
    }
    foreach_rrdcalc_in_rrdhost_done(rc);
}
|
|
|
|
// Writes the alerts of host to wb as a JSON document: a header with the
// hostname, the latest alarm log unique id, whether health is enabled and the
// current time, followed by an "alarms" object with the full description of
// each alert.
//
//  host - the host to report
//  wb   - output buffer
//  all  - when zero, only alerts in WARNING or CRITICAL status are listed
void health_alarms2json(RRDHOST *host, BUFFER *wb, int all) {
    const char *hostname = rrdhost_hostname(host);
    const char *health_enabled = host->health.health_enabled ? "true" : "false";
    const unsigned long now = (unsigned long)now_realtime_sec();

    buffer_sprintf(wb,
                   "{\n"
                   "\t\"hostname\": \"%s\",\n"
                   "\t\"latest_alarm_log_unique_id\": %u,\n"
                   "\t\"status\": %s,\n"
                   "\t\"now\": %lu,\n"
                   "\t\"alarms\": {\n",
                   hostname,
                   // the id before next_log_id, or 0 when next_log_id is still 0
                   (host->health_log.next_log_id > 0) ? (host->health_log.next_log_id - 1) : 0,
                   health_enabled,
                   now);

    health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc2json_nolock);

    // close the "alarms" object and the document
    buffer_strcat(wb, "\n\t}\n}\n");
}
|
|
|
|
// Writes the alerts of host to wb as a JSON document containing the hostname
// and an "alarms" object with the short form (id, value, last_updated,
// status) of each alert.
//
//  host - the host to report
//  wb   - output buffer
//  all  - when zero, only alerts in WARNING or CRITICAL status are listed
void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all) {
    const char *hostname = rrdhost_hostname(host);

    buffer_sprintf(wb,
                   "{\n"
                   "\t\"hostname\": \"%s\",\n"
                   "\t\"alarms\": {\n",
                   hostname);

    health_alarms2json_fill_alarms(host, wb, all, health_rrdcalc_values2json_nolock);

    // close the "alarms" object and the document
    buffer_strcat(wb, "\n\t}\n}\n");
}
|
|
|
|
// Adds a JSON array member named key to wb, holding one string for each
// flag that is set in flags.
//
//  wb    - output buffer, written through the buffer_json_*() API
//  key   - name of the array member
//  flags - the health entry flags to describe
//
// The strings are emitted in the fixed order of the table below, not in the
// numeric order of the flags.
void health_entry_flags_to_json_array(BUFFER *wb, const char *key, HEALTH_ENTRY_FLAGS flags) {
    static const struct {
        HEALTH_ENTRY_FLAGS flag;
        const char *name;
    } flag_names[] = {
        { HEALTH_ENTRY_FLAG_PROCESSED,             "PROCESSED" },
        { HEALTH_ENTRY_FLAG_UPDATED,               "UPDATED" },
        { HEALTH_ENTRY_FLAG_EXEC_RUN,              "EXEC_RUN" },
        { HEALTH_ENTRY_FLAG_EXEC_FAILED,           "EXEC_FAILED" },
        { HEALTH_ENTRY_FLAG_SILENCED,              "SILENCED" },
        { HEALTH_ENTRY_RUN_ONCE,                   "RUN_ONCE" },
        { HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS,      "EXEC_IN_PROGRESS" },
        { HEALTH_ENTRY_FLAG_IS_REPEATING,          "RECURRING" }, // reported under a different name than the flag
        { HEALTH_ENTRY_FLAG_SAVED,                 "SAVED" },
        { HEALTH_ENTRY_FLAG_ACLK_QUEUED,           "ACLK_QUEUED" },
        { HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION, "NO_CLEAR_NOTIFICATION" },
    };

    buffer_json_member_add_array(wb, key);

    for(unsigned idx = 0; idx < sizeof(flag_names) / sizeof(flag_names[0]); idx++) {
        if(flags & flag_names[idx].flag)
            buffer_json_add_array_item_string(wb, flag_names[idx].name);
    }

    buffer_json_array_close(wb);
}
|