0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-15 10:04:15 +00:00
netdata_netdata/health/health_internals.h
Costa Tsaousis f466b8aef5
DYNCFG: dynamically configured alerts
* cleanup alerts

* fix references

* fix references

* fix references

* load alerts once and apply them to each node

* simplify health_create_alarm_entry()

* Compile without warnings with compiler flags:

   -Wall -Wextra -Wformat=2 -Wshadow -Wno-format-nonliteral -Winit-self

* code re-organization and cleanup

* generate patterns when applying prototypes; give unique dyncfg names to all alerts

* eval expressions keep the source and the parsed_as as STRING pointers

* renamed host to node in dyncfg ids

* renamed host to node in dyncfg ids

* add all cloud roles to the list of parsed X-Netdata-Role header and also default to member access level

* working functionality

* code re-organization: moved health event-loop to a new file, moved health globals to health.c

* rrdcalctemplate is removed; alert_cfg is removed; foreach dimension is removed; RRDCALCs are now instantiated only when they are linked to RRDSETs

* dyncfg alert prototypes initialization for alerts

* health dyncfg split to separate file

* cleanup not-needed code

* normalize matches between parsing and json

* also detect !* for disabled alerts

* dyncfg capability disabled

* Store alert config part1

* Add rrdlabels_common_count

* wip health variables lookup without indexes

* Improve rrdlabels_common_count by reusing rrdlabels_find_label_with_key_unsafe with an additional parameter

* working variables with runtime lookup

* working variables with runtime lookup

* delete rrddimvar and rrdfamily index

* remove rrdsetvar; now all variables are in RRDVARs inside hosts and charts

* added /api/v1/variable that resolves a variable the same way alerts do

* remove rrdcalc from eval

* remove debug code

* remove duplicate assignment

* Fix memory leak

* all alert variables are now handled by alert_variable_lookup() and EVAL is now independent of alerts

* hide all internal structures of EVAL

* Enable -Wformat flag

Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>

* Adjust binding for calculation, warning, critical

* Remove unused macro

* Update config hash id

* use the right info and summary in alerts log

* use synchronous queries for alerts

* Handle cases when config_hash_id is missing from health_log

* remove deadlock from health worker

* parsing to json payload for health alert prototypes

* cleaner parsing and avoiding memory leaks in case of duplicate members in json

* fix left-over rename of function

* Keep original lookup field to send to the cloud
Cleanup / rename function to store config
Remove unused DEFINEs, functions

* Use ac->lookup

* link jobs to the host when the template is registered; do not accept running a function without a host

* full dyncfg support for health alerts, except action TEST

* working dyncfg additions, updates, removals

* fixed missing source, wrong status updates

* add alerts by type, component, classification, recipient and module at the /api/v2/alerts endpoint

* fix dyncfg unittest

* rename functions

* generalize the json-c parser macros and move them to libnetdata

* report progress when enabling and disabling dyncfg templates

* moved rrdcalc and rrdvar to health

* update alarms

* added schema for alerts; separated alert_action_options from rrdr_options; restructured the json payload for alerts

* enable parsed json alerts; allow sending back accepted but disabled

* added format_version for alerts payload; enabled/disabled status is now also inherited from the status of the rules; fixed variable names in json output

* remove the RRDHOST pointer from DYNCFG

* Fix command field submitted to the cloud

* do not send updates to creation requests, for DYNCFG jobs

---------

Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Co-authored-by: Tasos Katsoulas <tasos@netdata.cloud>
Co-authored-by: ilyam8 <ilya@netdata.cloud>
2024-01-23 20:20:41 +02:00

130 lines
4.5 KiB
C

// SPDX-License-Identifier: GPL-3.0-or-later
#ifndef NETDATA_HEALTH_INTERNALS_H
#define NETDATA_HEALTH_INTERNALS_H
#include "health.h"
// Default, upper bound and lower bound for a number of health log entries (unsigned constants).
// NOTE(review): presumably these clamp config.health_log_entries_max - confirm in the .c files.
#define HEALTH_LOG_ENTRIES_DEFAULT 1000U
#define HEALTH_LOG_ENTRIES_MAX 100000U
#define HEALTH_LOG_ENTRIES_MIN 10U
// 5 * 86400 = 432000. If the unit is seconds (as it is for config.health_log_history), this is 5 days.
#define HEALTH_LOG_HISTORY_DEFAULT (5 * 86400)
// NOTE(review): looks like the maximum length of a single health configuration line - confirm against the parser.
#define HEALTH_CONF_MAX_LINE 4096
// String keywords, one per alert attribute.
// NOTE(review): presumably the keys recognized when parsing health configuration lines;
// the code that consumes them is not visible in this header.
#define HEALTH_ALARM_KEY "alarm"
#define HEALTH_TEMPLATE_KEY "template"
#define HEALTH_CHART_KEY "chart"
#define HEALTH_CONTEXT_KEY "context"
#define HEALTH_ON_KEY "on"
// Note the value is plural ("hosts") although the macro name is singular.
#define HEALTH_HOST_KEY "hosts"
#define HEALTH_OS_KEY "os"
#define HEALTH_PLUGIN_KEY "plugin"
#define HEALTH_MODULE_KEY "module"
// Distinct from HEALTH_CHART_KEY ("chart"): this one is the plural "charts".
#define HEALTH_CHARTS_KEY "charts"
#define HEALTH_LOOKUP_KEY "lookup"
#define HEALTH_CALC_KEY "calc"
#define HEALTH_EVERY_KEY "every"
#define HEALTH_GREEN_KEY "green"
#define HEALTH_RED_KEY "red"
#define HEALTH_WARN_KEY "warn"
#define HEALTH_CRIT_KEY "crit"
#define HEALTH_EXEC_KEY "exec"
// The macro is named RECIPIENT but the keyword itself is "to".
#define HEALTH_RECIPIENT_KEY "to"
#define HEALTH_UNITS_KEY "units"
#define HEALTH_SUMMARY_KEY "summary"
#define HEALTH_INFO_KEY "info"
#define HEALTH_CLASS_KEY "class"
#define HEALTH_COMPONENT_KEY "component"
#define HEALTH_TYPE_KEY "type"
#define HEALTH_DELAY_KEY "delay"
#define HEALTH_OPTIONS_KEY "options"
#define HEALTH_REPEAT_KEY "repeat"
// The next keys contain a space inside the keyword.
#define HEALTH_HOST_LABEL_KEY "host labels"
// NOTE(review): verify that "foreach" still has a consumer; nothing else in this header refers to it.
#define HEALTH_FOREACH_KEY "foreach"
#define HEALTH_CHART_LABEL_KEY "chart labels"
// Helpers for ALERT_ACTION_OPTIONS values (the type is declared outside this header).
// NOTE(review): judging by the name, this writes `options` into `wb` as a JSON array
// stored under `key` - confirm in the implementation.
void alert_action_options_to_buffer_json_array(BUFFER *wb, const char *key, ALERT_ACTION_OPTIONS options);
// Parses a string into ALERT_ACTION_OPTIONS.
// NOTE(review): `o` is a non-const char *, unlike the _parse_one() variant; this suggests
// the input may be modified while parsing - confirm before passing shared or literal strings.
ALERT_ACTION_OPTIONS alert_action_options_parse(char *o);
// Parses a read-only string into ALERT_ACTION_OPTIONS.
// NOTE(review): presumably handles a single option keyword - confirm.
ALERT_ACTION_OPTIONS alert_action_options_parse_one(const char *o);
// An alert prototype: a `match` part, a `config` part, and private bookkeeping in `_internal`.
// struct rrd_alert_match and struct rrd_alert_config are defined outside this header.
typedef struct rrd_alert_prototype {
// NOTE(review): presumably the criteria selecting where the alert applies - confirm.
struct rrd_alert_match match;
// NOTE(review): presumably the alert definition itself - confirm.
struct rrd_alert_config config;
// Bookkeeping members, grouped separately from the match/config data.
struct {
// NOTE(review): presumably a usage counter - confirm who increments/decrements it.
uint32_t uses;
bool enabled;
// NOTE(review): presumably true when the prototype originates from a file on disk - confirm.
bool is_on_disk;
// NOTE(review): which members this lock protects is not visible here - confirm.
SPINLOCK spinlock;
// Links to other prototypes of the same type (doubly linked: prev and next).
struct rrd_alert_prototype *prev, *next;
} _internal;
} RRD_ALERT_PROTOTYPE;
// Lifecycle functions for RRD_ALERT_PROTOTYPE; implementations are outside this header.
// Returns a bool. NOTE(review): presumably true when `ap` was accepted - confirm,
// and confirm who owns `ap` after the call in both outcomes.
bool health_prototype_add(RRD_ALERT_PROTOTYPE *ap);
// NOTE(review): cleanup vs. free - presumably cleanup releases the members of `ap`
// while free also releases `ap` itself; confirm before choosing one.
void health_prototype_cleanup(RRD_ALERT_PROTOTYPE *ap);
void health_prototype_free(RRD_ALERT_PROTOTYPE *ap);
// Global state of the health plugin, grouped into initialization state, configuration
// and the collection of alert prototypes. An instance named `health_globals` is declared
// extern right after the struct.
struct health_plugin_globals {
// Initialization state: a `done` flag and a spinlock.
// NOTE(review): presumably the spinlock serializes one-time initialization - confirm.
struct {
SPINLOCK spinlock;
bool done;
} initialization;
// Settings of the health plugin.
// NOTE(review): where these are loaded from is not visible in this header.
struct {
bool enabled;
bool stock_enabled;
bool use_summary_for_notifications;
// NOTE(review): presumably bounded by HEALTH_LOG_ENTRIES_MIN / HEALTH_LOG_ENTRIES_MAX - confirm.
unsigned int health_log_entries_max;
uint32_t health_log_history; // the health log history in seconds to be kept in db
STRING *silencers_filename;
STRING *default_exec;
STRING *default_recipient;
SIMPLE_PATTERN *enabled_alerts;
uint32_t default_warn_repeat_every; // the default value for the interval between repeating warning notifications
uint32_t default_crit_repeat_every; // the default value for the interval between repeating critical notifications
// Signed values, both expressed in seconds as their names state.
int32_t run_at_least_every_seconds;
int32_t postpone_alarms_during_hibernation_for_seconds;
} config;
// Dictionary of alert prototypes.
// NOTE(review): key and value types of the dictionary are not visible here - confirm.
struct {
DICTIONARY *dict;
} prototypes;
};
// Declaration only; the definition lives in a .c file outside this header.
extern struct health_plugin_globals health_globals;
// Reads one health configuration file.
// `stock_config` flags the file as stock configuration; `data` is an opaque pointer.
// NOTE(review): the meaning of the int return value and of `data` is not visible here - confirm.
int health_readfile(const char *filename, void *data, bool stock_config);
// Helpers around alarm notifications that are still in progress; each takes the ALARM_ENTRY involved.
// NOTE(review): presumably removes `ae` from the set of notifications in progress - confirm.
void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae);
// As the name states: waits until all notifications have finished, so health can be cleaned up.
void wait_for_all_notifications_to_finish_before_allowing_health_to_be_cleaned_up(void);
// NOTE(review): presumably blocks until the notification execution for `ae` completes - confirm.
void health_alarm_wait_for_execution(ALARM_ENTRY *ae);
// Adds an alert for chart `st` of `host`, based on prototype `ap`.
// NOTE(review): presumably returns true when an alert was actually created - confirm.
bool rrdcalc_add_from_prototype(RRDHOST *host, RRDSET *st, RRD_ALERT_PROTOTYPE *ap);
// DYNCFG callback for health: receives a transaction, an id, a DYNCFG_CMDS command and an
// optional payload, and writes its response into `result`.
// NOTE(review): presumably registered with the dynamic configuration subsystem; the meaning
// of the int return value (possibly an HTTP-like status code) is not visible here - confirm.
// `stop_monotonic_ut` and `cancelled` are passed by pointer.
// NOTE(review): they look like timeout/cancellation controls - confirm how they must be honored.
int dyncfg_health_cb(const char *transaction, const char *id, DYNCFG_CMDS cmd, const char *add_name,
BUFFER *payload, usec_t *stop_monotonic_ut, bool *cancelled,
BUFFER *result, const char *source, void *data);
// Unregister / register all alert prototypes with DYNCFG, as the names state.
void health_dyncfg_unregister_all_prototypes(void);
void health_dyncfg_register_all_prototypes(void);
// Serializes prototype `ap` as JSON into `wb`.
// NOTE(review): `for_hashing` presumably selects a variant of the output suitable for
// computing a hash of the configuration - confirm which members it includes or omits.
void health_prototype_to_json(BUFFER *wb, RRD_ALERT_PROTOTYPE *ap, bool for_hashing);
// Looks up `variable` and stores its value through `result`; returns a bool.
// NOTE(review): presumably returns true only when the variable was resolved, and `data` is an
// opaque context supplied by the caller (its actual type is not visible here) - confirm both.
bool alert_variable_lookup(STRING *variable, void *data, NETDATA_DOUBLE *result);
// Forward declaration only: the layout of struct health_raised_summary is not exposed by this
// header, so includers can only handle it through pointers and the functions declared here.
struct health_raised_summary;
// Create / populate / free a raised-alerts summary.
// NOTE(review): presumably the pointer returned by _create() must be released with _free() - confirm.
struct health_raised_summary *alerts_raised_summary_create(RRDHOST *host);
void alerts_raised_summary_populate(struct health_raised_summary *hrm);
void alerts_raised_summary_free(struct health_raised_summary *hrm);
// Sends the notification for alarm entry `ae` of `host`; takes the summary `hrm` as well.
// NOTE(review): whether `hrm` may be NULL is not visible here - confirm.
void health_send_notification(RRDHOST *host, ALARM_ENTRY *ae, struct health_raised_summary *hrm);
// NOTE(review): presumably walks the alarm log of `host` and sends the pending notifications - confirm.
void health_alarm_log_process_to_send_notifications(RRDHOST *host, struct health_raised_summary *hrm);
// Applies alert prototype `ap` to a single host.
// NOTE(review): presumably creates alerts on the matching charts of `host` - confirm.
void health_apply_prototype_to_host(RRDHOST *host, RRD_ALERT_PROTOTYPE *ap);
// Same operation for all hosts, as the name states.
void health_prototype_apply_to_all_hosts(RRD_ALERT_PROTOTYPE *ap);
#endif //NETDATA_HEALTH_INTERNALS_H