mirror of
https://github.com/netdata/netdata.git
synced 2025-04-07 06:45:39 +00:00

* rrdset - in progress * rrdset optimal constructor; rrdset conflict * rrdset final touches * re-organization of rrdset object members * prevent use-after-free * dictionary dfe supports also counting of iterations * rrddim managed by dictionary * rrd.h cleanup * DICTIONARY_ITEM now is referencing actual dictionary items in the code * removed rrdset linked list * Revert "removed rrdset linked list" This reverts commit 690d6a588b4b99619c2c5e10f84e8f868ae6def5. * removed rrdset linked list * added comments * Switch chart uuid to static allocation in rrdset Remove unused functions * rrdset_archive() and friends... * always create rrdfamily * enable ml_free_dimension * rrddim_foreach done with dfe * most custom rrddim loops replaced with rrddim_foreach * removed accesses to rrddim->dimensions * removed locks that are no longer needed * rrdsetvar is now managed by the dictionary * set rrdset is rrdsetvar, fixes https://github.com/netdata/netdata/pull/13646#issuecomment-1242574853 * conflict callback of rrdsetvar now properly checks if it has to reset the variable * dictionary registered callbacks accept as first parameter the DICTIONARY_ITEM * dictionary dfe now uses internal counter to report; avoided excess variables defined with dfe * dictionary walkthrough callbacks get dictionary acquired items * dictionary reference counters that can be dupped from zero * added advanced functions for get and del * rrdvar managed by dictionaries * thread safety for rrdsetvar * faster rrdvar initialization * rrdvar string lengths should match in all add, del, get functions * rrdvar internals hidden from the rest of the world * rrdvar is now acquired throughout netdata * hide the internal structures of rrdsetvar * rrdsetvar is now acquired through out netdata * rrddimvar managed by dictionary; rrddimvar linked list removed; rrddimvar structures hidden from the rest of netdata * better error handling * dont create variables if not initialized for health * dont create variables if not initialized for health again * rrdfamily is now managed by dictionaries; references of it are acquired dictionary items * type checking on acquired objects * rrdcalc renaming of functions * type checking for rrdfamily_acquired * rrdcalc managed by dictionaries * rrdcalc double free fix * host rrdvars is always needed * attempt to fix deadlock 1 * attempt to fix deadlock 2 * Remove unused variable * attempt to fix deadlock 3 * snprintfz * rrdcalc index in rrdset fix * Stop storing active charts and computing chart hashes * Remove store active chart function * Remove compute chart hash function * Remove sql_store_chart_hash function * Remove store_active_dimension function * dictionary delayed destruction * formatting and cleanup * zero dictionary base on rrdsetvar * added internal error to log delayed destructions of dictionaries * typo in rrddimvar * added debugging info to dictionary * debug info * fix for rrdcalc keys being empty * remove forgotten unlock * remove deadlock * Switch to metadata version 5 and drop chart_hash chart_hash_map chart_active dimension_active v_chart_hash * SQL cosmetic changes * do not busy wait while destroying a referenced dictionary * remove deadlock * code cleanup; re-organization; * fast cleanup and flushing of dictionaries * number formatting fixes * do not delete configured alerts when archiving a chart * rrddim obsolete linked list management outside dictionaries * removed duplicate contexts call * fix crash when rrdfamily is not initialized * dont keep rrddimvar referenced * properly cleanup rrdvar * removed some locks * Do not attempt to cleanup chart_hash / chart_hash_map * rrdcalctemplate managed by dictionary * register callbacks on the right dictionary * removed some more locks * rrdcalc secondary index replaced with linked-list; rrdcalc labels updates are now executed by health thread * when looking up for an alarm look using both chart id and chart name * host initialization a bit more modular * init rrdlabels on host update * preparation for dictionary views * improved comment * unused variables without internal checks * service threads isolation and worker info * more worker info in service thread * thread cancelability debugging with internal checks * strings data races addressed; fixes https://github.com/netdata/netdata/issues/13647 * dictionary modularization * Remove unused SQL statement definition * unit-tested thread safety of dictionaries; removed data race conditions on dictionaries and strings; dictionaries now can detect if the caller is holds a write lock and automatically all the calls become their unsafe versions; all direct calls to unsafe version is eliminated * remove worker_is_idle() from the exit of service functions, because we lose the lock time between loops * rewritten dictionary to have 2 separate locks, one for indexing and another for traversal * Update collectors/cgroups.plugin/sys_fs_cgroup.c Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update collectors/cgroups.plugin/sys_fs_cgroup.c Co-authored-by: Vladimir Kobal <vlad@prokk.net> * Update collectors/proc.plugin/proc_net_dev.c Co-authored-by: Vladimir Kobal <vlad@prokk.net> * fix memory leak in rrdset cache_dir * minor dictionary changes * dont use index locks in single threaded * obsolete dict option * rrddim options and flags separation; rrdset_done() optimization to keep array of reference pointers to rrddim; * fix jump on uninitialized value in dictionary; remove double free of cache_dir * addressed codacy findings * removed debugging code * use the private refcount on dictionaries * make dictionary item desctructors work on dictionary destruction; strictier control on dictionary API; proper cleanup sequence on rrddim; * more dictionary statistics * global statistics about dictionary operations, memory, items, callbacks * dictionary support for views - missing the public API * removed warning about unused parameter * chart and context name for cloud * chart and context name for cloud, again * dictionary statistics fixed; first implementation of dictionary views - not currently used * only the master can globally delete an item * context needs netdata prefix * fix context and chart it of spins * fix for host variables when health is not enabled * run garbage collector on item insert too * Fix info message; remove extra "using" * update dict unittest for new placement of garbage collector * we need RRDHOST->rrdvars for maintaining custom host variables * Health initialization needs the host->host_uuid * split STRING to its own files; no code changes other than that * initialize health unconditionally * unit tests do not pollute the global scope with their variables * Skip initialization when creating archived hosts on startup. When a child connects it will initialize properly Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com> Co-authored-by: Vladimir Kobal <vlad@prokk.net>
1282 lines
52 KiB
C
1282 lines
52 KiB
C
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
#include "health.h"
|
|
|
|
unsigned int default_health_enabled = 1;
|
|
char *silencers_filename;
|
|
|
|
// the queue of executed alarm notifications that haven't been waited for yet
|
|
static struct {
|
|
ALARM_ENTRY *head; // oldest
|
|
ALARM_ENTRY *tail; // latest
|
|
} alarm_notifications_in_progress = {NULL, NULL};
|
|
|
|
typedef struct active_alerts {
|
|
char *name;
|
|
time_t last_status_change;
|
|
RRDCALC_STATUS status;
|
|
} active_alerts_t;
|
|
|
|
static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
|
|
{
|
|
ae->prev_in_progress = NULL;
|
|
ae->next_in_progress = NULL;
|
|
|
|
if (NULL != alarm_notifications_in_progress.tail) {
|
|
ae->prev_in_progress = alarm_notifications_in_progress.tail;
|
|
alarm_notifications_in_progress.tail->next_in_progress = ae;
|
|
}
|
|
if (NULL == alarm_notifications_in_progress.head) {
|
|
alarm_notifications_in_progress.head = ae;
|
|
}
|
|
alarm_notifications_in_progress.tail = ae;
|
|
|
|
}
|
|
|
|
static inline void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
|
|
{
|
|
struct alarm_entry *prev = ae->prev_in_progress;
|
|
struct alarm_entry *next = ae->next_in_progress;
|
|
|
|
if (NULL != prev) {
|
|
prev->next_in_progress = next;
|
|
}
|
|
if (NULL != next) {
|
|
next->prev_in_progress = prev;
|
|
}
|
|
if (ae == alarm_notifications_in_progress.head) {
|
|
alarm_notifications_in_progress.head = next;
|
|
}
|
|
if (ae == alarm_notifications_in_progress.tail) {
|
|
alarm_notifications_in_progress.tail = prev;
|
|
}
|
|
}
|
|
// ----------------------------------------------------------------------------
|
|
// health initialization
|
|
|
|
/**
|
|
* User Config directory
|
|
*
|
|
* Get the config directory for health and return it.
|
|
*
|
|
* @return a pointer to the user config directory
|
|
*/
|
|
inline char *health_user_config_dir(void) {
|
|
char buffer[FILENAME_MAX + 1];
|
|
snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
|
|
return config_get(CONFIG_SECTION_DIRECTORIES, "health config", buffer);
|
|
}
|
|
|
|
/**
|
|
* Stock Config Directory
|
|
*
|
|
* Get the Stock config directory and return it.
|
|
*
|
|
* @return a pointer to the stock config directory.
|
|
*/
|
|
inline char *health_stock_config_dir(void) {
|
|
char buffer[FILENAME_MAX + 1];
|
|
snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
|
|
return config_get(CONFIG_SECTION_DIRECTORIES, "stock health config", buffer);
|
|
}
|
|
|
|
/**
|
|
* Silencers init
|
|
*
|
|
* Function used to initialize the silencer structure.
|
|
*/
|
|
static void health_silencers_init(void) {
|
|
FILE *fd = fopen(silencers_filename, "r");
|
|
if (fd) {
|
|
fseek(fd, 0 , SEEK_END);
|
|
off_t length = (off_t) ftell(fd);
|
|
fseek(fd, 0 , SEEK_SET);
|
|
|
|
if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
|
|
char *str = mallocz((length+1)* sizeof(char));
|
|
if(str) {
|
|
size_t copied;
|
|
copied = fread(str, sizeof(char), length, fd);
|
|
if (copied == (length* sizeof(char))) {
|
|
str[length] = 0x00;
|
|
json_parse(str, NULL, health_silencers_json_read_callback);
|
|
info("Parsed health silencers file %s", silencers_filename);
|
|
} else {
|
|
error("Cannot read the data from health silencers file %s", silencers_filename);
|
|
}
|
|
freez(str);
|
|
}
|
|
} else {
|
|
error(
|
|
"Health silencers file %s has the size %" PRId64 " that is out of range[ 1 , %d ]. Aborting read.",
|
|
silencers_filename,
|
|
(int64_t)length,
|
|
HEALTH_SILENCERS_MAX_FILE_LEN);
|
|
}
|
|
fclose(fd);
|
|
} else {
|
|
info("Cannot open the file %s, so Netdata will work with the default health configuration.",silencers_filename);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Health Init
|
|
*
|
|
* Initialize the health thread.
|
|
*/
|
|
void health_init(void) {
|
|
debug(D_HEALTH, "Health configuration initializing");
|
|
|
|
if(!(default_health_enabled = (unsigned int)config_get_boolean(CONFIG_SECTION_HEALTH, "enabled", default_health_enabled))) {
|
|
debug(D_HEALTH, "Health is disabled.");
|
|
return;
|
|
}
|
|
|
|
health_silencers_init();
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// re-load health configuration
|
|
|
|
/**
|
|
* Reload host
|
|
*
|
|
* Reload configuration for a specific host.
|
|
*
|
|
* @param host the structure of the host that the function will reload the configuration.
|
|
*/
|
|
static void health_reload_host(RRDHOST *host) {
|
|
if(unlikely(!host->health_enabled))
|
|
return;
|
|
|
|
char *user_path = health_user_config_dir();
|
|
char *stock_path = health_stock_config_dir();
|
|
|
|
// free all running alarms
|
|
rrdcalc_delete_all(host);
|
|
rrdcalctemplate_delete_all(host);
|
|
|
|
// invalidate all previous entries in the alarm log
|
|
netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
|
|
ALARM_ENTRY *t;
|
|
for(t = host->health_log.alarms ; t ; t = t->next) {
|
|
if(t->new_status != RRDCALC_STATUS_REMOVED)
|
|
t->flags |= HEALTH_ENTRY_FLAG_UPDATED;
|
|
}
|
|
netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
|
|
|
|
// reset all thresholds to all charts
|
|
RRDSET *st;
|
|
rrdset_foreach_read(st, host) {
|
|
st->green = NAN;
|
|
st->red = NAN;
|
|
}
|
|
rrdset_foreach_done(st);
|
|
|
|
// load the new alarms
|
|
health_readdir(host, user_path, stock_path, NULL);
|
|
|
|
//Discard alarms with labels that do not apply to host
|
|
rrdcalc_delete_alerts_not_matching_host_labels_from_this_host(host);
|
|
|
|
// link the loaded alarms to their charts
|
|
rrdset_foreach_write(st, host) {
|
|
if (rrdset_flag_check(st, RRDSET_FLAG_ARCHIVED))
|
|
continue;
|
|
|
|
rrdcalc_link_matching_alerts_to_rrdset(st);
|
|
rrdcalctemplate_link_matching_templates_to_rrdset(st);
|
|
}
|
|
rrdset_foreach_done(st);
|
|
|
|
}
|
|
|
|
/**
|
|
* Reload
|
|
*
|
|
* Reload the host configuration for all hosts.
|
|
*/
|
|
void health_reload(void) {
|
|
sql_refresh_hashes();
|
|
|
|
rrd_rdlock();
|
|
|
|
RRDHOST *host;
|
|
rrdhost_foreach_read(host)
|
|
health_reload_host(host);
|
|
|
|
rrd_unlock();
|
|
#ifdef ENABLE_ACLK
|
|
if (netdata_cloud_setting) {
|
|
aclk_alert_reloaded = 1;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// health main thread and friends
|
|
|
|
static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) {
|
|
if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
|
|
if(n) return RRDCALC_STATUS_RAISED;
|
|
return RRDCALC_STATUS_CLEAR;
|
|
}
|
|
|
|
#define ALARM_EXEC_COMMAND_LENGTH 8192
|
|
#define ACTIVE_ALARMS_LIST_EXAMINE 500
|
|
#define ACTIVE_ALARMS_LIST 15
|
|
|
|
static inline int compare_active_alerts(const void * a, const void * b) {
|
|
active_alerts_t *active_alerts_a = (active_alerts_t *)a;
|
|
active_alerts_t *active_alerts_b = (active_alerts_t *)b;
|
|
|
|
return ( active_alerts_b->last_status_change - active_alerts_a->last_status_change );
|
|
}
|
|
|
|
static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
|
|
ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
|
|
|
|
if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
|
|
// do not send notifications for internal statuses
|
|
debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
|
|
goto done;
|
|
}
|
|
|
|
if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
|
|
// do not send notifications for disabled statuses
|
|
debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
|
|
// mark it as run, so that we will send the same alarm if it happens again
|
|
goto done;
|
|
}
|
|
|
|
// find the previous notification for the same alarm
|
|
// which we have run the exec script
|
|
// exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
|
|
if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
|
|
uint32_t id = ae->alarm_id;
|
|
ALARM_ENTRY *t;
|
|
for(t = ae->next; t ; t = t->next) {
|
|
if(t->alarm_id == id && t->flags & HEALTH_ENTRY_FLAG_EXEC_RUN)
|
|
break;
|
|
}
|
|
|
|
if(likely(t)) {
|
|
// we have executed this alarm notification in the past
|
|
if(t && t->new_status == ae->new_status) {
|
|
// don't send the notification for the same status again
|
|
debug(D_HEALTH, "Health not sending again notification for alarm '%s.%s' status %s", ae_chart_name(ae), ae_name(ae)
|
|
, rrdcalc_status2string(ae->new_status));
|
|
goto done;
|
|
}
|
|
}
|
|
else {
|
|
// we have not executed this alarm notification in the past
|
|
// so, don't send CLEAR notifications
|
|
if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
|
|
if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
|
|
debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
|
|
, ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
|
|
goto done;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if alarm notifications are silenced
|
|
if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
|
|
info("Health not sending notification for alarm '%s.%s' status %s (command API has disabled notifications)", ae_chart_name(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
|
|
goto done;
|
|
}
|
|
|
|
static char command_to_run[ALARM_EXEC_COMMAND_LENGTH + 1];
|
|
|
|
const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health_default_exec);
|
|
const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health_default_recipient);
|
|
|
|
int n_warn=0, n_crit=0;
|
|
RRDCALC *rc;
|
|
EVAL_EXPRESSION *expr=NULL;
|
|
BUFFER *warn_alarms, *crit_alarms;
|
|
active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t));
|
|
|
|
warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
|
|
crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
|
|
|
|
foreach_rrdcalc_in_rrdhost_read(host, rc) {
|
|
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
|
|
continue;
|
|
|
|
if(unlikely((n_warn + n_crit) >= ACTIVE_ALARMS_LIST_EXAMINE))
|
|
break;
|
|
|
|
if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
|
|
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
|
|
active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
|
|
active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
|
|
active_alerts[n_warn+n_crit].status = rc->status;
|
|
n_warn++;
|
|
} else if (ae->alarm_id == rc->id)
|
|
expr = rc->warning;
|
|
} else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
|
|
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
|
|
active_alerts[n_warn+n_crit].name = (char *)rrdcalc_name(rc);
|
|
active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
|
|
active_alerts[n_warn+n_crit].status = rc->status;
|
|
n_crit++;
|
|
} else if (ae->alarm_id == rc->id)
|
|
expr = rc->critical;
|
|
} else if (unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
|
|
if (ae->alarm_id == rc->id)
|
|
expr = rc->warning;
|
|
}
|
|
}
|
|
foreach_rrdcalc_in_rrdhost_done(rc);
|
|
|
|
if (n_warn+n_crit>1)
|
|
qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);
|
|
|
|
int count_w = 0, count_c = 0;
|
|
while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) {
|
|
if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) {
|
|
if (count_w)
|
|
buffer_strcat(warn_alarms, ",");
|
|
buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name);
|
|
buffer_strcat(warn_alarms, "=");
|
|
buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
|
|
count_w++;
|
|
}
|
|
else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) {
|
|
if (count_c)
|
|
buffer_strcat(crit_alarms, ",");
|
|
buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name);
|
|
buffer_strcat(crit_alarms, "=");
|
|
buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
|
|
count_c++;
|
|
}
|
|
}
|
|
|
|
char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
|
|
|
|
snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO
|
|
"' '" NETDATA_DOUBLE_FORMAT_ZERO
|
|
"' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
|
|
exec,
|
|
recipient,
|
|
rrdhost_registry_hostname(host),
|
|
ae->unique_id,
|
|
ae->alarm_id,
|
|
ae->alarm_event_id,
|
|
(unsigned long)ae->when,
|
|
ae_name(ae),
|
|
ae->chart?ae_chart_name(ae):"NOCHART",
|
|
ae->family?ae_family(ae):"NOFAMILY",
|
|
rrdcalc_status2string(ae->new_status),
|
|
rrdcalc_status2string(ae->old_status),
|
|
ae->new_value,
|
|
ae->old_value,
|
|
ae->source?ae_source(ae):"UNKNOWN",
|
|
(uint32_t)ae->duration,
|
|
(uint32_t)ae->non_clear_duration,
|
|
ae_units(ae),
|
|
ae_info(ae),
|
|
ae_new_value_string(ae),
|
|
ae_old_value_string(ae),
|
|
(expr && expr->source)?expr->source:"NOSOURCE",
|
|
(expr && expr->error_msg)?buffer_tostring(expr->error_msg):"NOERRMSG",
|
|
n_warn,
|
|
n_crit,
|
|
buffer_tostring(warn_alarms),
|
|
buffer_tostring(crit_alarms),
|
|
ae->classification?ae_classification(ae):"Unknown",
|
|
edit_command,
|
|
host != localhost ? host->machine_guid:""
|
|
);
|
|
|
|
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
|
|
ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
|
|
|
|
debug(D_HEALTH, "executing command '%s'", command_to_run);
|
|
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
|
|
ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
|
|
enqueue_alarm_notify_in_progress(ae);
|
|
|
|
freez(edit_command);
|
|
buffer_free(warn_alarms);
|
|
buffer_free(crit_alarms);
|
|
freez(active_alerts);
|
|
|
|
return; //health_alarm_wait_for_execution
|
|
done:
|
|
health_alarm_log_save(host, ae);
|
|
}
|
|
|
|
static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
|
|
if (!(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS))
|
|
return;
|
|
|
|
spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
|
|
debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
|
|
ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
|
|
|
|
if(ae->exec_code != 0)
|
|
ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
|
|
|
|
unlink_alarm_notify_in_progress(ae);
|
|
}
|
|
|
|
static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
|
|
debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
|
|
ae->chart?ae_chart_name(ae):"NOCHART", ae_name(ae),
|
|
ae->new_value,
|
|
rrdcalc_status2string(ae->old_status),
|
|
rrdcalc_status2string(ae->new_status)
|
|
);
|
|
|
|
health_alarm_execute(host, ae);
|
|
}
|
|
|
|
static inline void health_alarm_log_process(RRDHOST *host) {
|
|
uint32_t first_waiting = (host->health_log.alarms)?host->health_log.alarms->unique_id:0;
|
|
time_t now = now_realtime_sec();
|
|
|
|
netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
|
|
|
|
ALARM_ENTRY *ae;
|
|
for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
|
|
if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
|
|
if(unlikely(
|
|
!(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
|
|
!(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
|
|
)) {
|
|
if(unlikely(ae->unique_id < first_waiting))
|
|
first_waiting = ae->unique_id;
|
|
|
|
if(likely(now >= ae->delay_up_to_timestamp))
|
|
health_process_notifications(host, ae);
|
|
}
|
|
}
|
|
}
|
|
|
|
netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
|
|
|
|
// remember this for the next iteration
|
|
host->health_last_processed_id = first_waiting;
|
|
|
|
bool cleanup_excess_log_entries = host->health_log.count > host->health_log.max;
|
|
|
|
if (!cleanup_excess_log_entries)
|
|
return;
|
|
|
|
// cleanup excess entries in the log
|
|
netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
|
|
|
|
ALARM_ENTRY *last = NULL;
|
|
unsigned int count = host->health_log.max * 2 / 3;
|
|
for(ae = host->health_log.alarms; ae && count ; count--, last = ae, ae = ae->next) ;
|
|
|
|
if(ae && last && last->next == ae)
|
|
last->next = NULL;
|
|
else
|
|
ae = NULL;
|
|
|
|
while(ae) {
|
|
debug(D_HEALTH, "Health removing alarm log entry with id: %u", ae->unique_id);
|
|
|
|
ALARM_ENTRY *t = ae->next;
|
|
|
|
if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING))) {
|
|
health_alarm_wait_for_execution(ae);
|
|
health_alarm_log_free_one_nochecks_nounlink(ae);
|
|
host->health_log.count--;
|
|
}
|
|
|
|
ae = t;
|
|
}
|
|
|
|
netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
|
|
}
|
|
|
|
static inline int rrdcalc_isrunnable(RRDCALC *rc, time_t now, time_t *next_run) {
|
|
if(unlikely(!rc->rrdset)) {
|
|
debug(D_HEALTH, "Health not running alarm '%s.%s'. It is not linked to a chart.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
|
|
return 0;
|
|
}
|
|
|
|
if(unlikely(rc->next_update > now)) {
|
|
if (unlikely(*next_run > rc->next_update)) {
|
|
// update the next_run time of the main loop
|
|
// to run this alarm precisely the time required
|
|
*next_run = rc->next_update;
|
|
}
|
|
|
|
debug(D_HEALTH, "Health not examining alarm '%s.%s' yet (will do in %d secs).", rrdcalc_chart_name(rc), rrdcalc_name(rc), (int) (rc->next_update - now));
|
|
return 0;
|
|
}
|
|
|
|
if(unlikely(!rc->update_every)) {
|
|
debug(D_HEALTH, "Health not running alarm '%s.%s'. It does not have an update frequency", rrdcalc_chart_name(rc), rrdcalc_name(rc));
|
|
return 0;
|
|
}
|
|
|
|
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE))) {
|
|
debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as obsolete", rrdcalc_chart_name(rc), rrdcalc_name(rc));
|
|
return 0;
|
|
}
|
|
|
|
if(unlikely(rrdset_flag_check(rc->rrdset, RRDSET_FLAG_ARCHIVED))) {
|
|
debug(D_HEALTH, "Health not running alarm '%s.%s'. The chart has been marked as archived", rrdcalc_chart_name(rc), rrdcalc_name(rc));
|
|
return 0;
|
|
}
|
|
|
|
if(unlikely(!rc->rrdset->last_collected_time.tv_sec || rc->rrdset->counter_done < 2)) {
|
|
debug(D_HEALTH, "Health not running alarm '%s.%s'. Chart is not fully collected yet.", rrdcalc_chart_name(rc), rrdcalc_name(rc));
|
|
return 0;
|
|
}
|
|
|
|
int update_every = rc->rrdset->update_every;
|
|
time_t first = rrdset_first_entry_t(rc->rrdset);
|
|
time_t last = rrdset_last_entry_t(rc->rrdset);
|
|
|
|
if(unlikely(now + update_every < first /* || now - update_every > last */)) {
|
|
debug(D_HEALTH
|
|
, "Health not examining alarm '%s.%s' yet (wanted time is out of bounds - we need %lu but got %lu - %lu)."
|
|
, rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) now, (unsigned long) first
|
|
, (unsigned long) last);
|
|
return 0;
|
|
}
|
|
|
|
if(RRDCALC_HAS_DB_LOOKUP(rc)) {
|
|
time_t needed = now + rc->before + rc->after;
|
|
|
|
if(needed + update_every < first || needed - update_every > last) {
|
|
debug(D_HEALTH
|
|
, "Health not examining alarm '%s.%s' yet (not enough data yet - we need %lu but got %lu - %lu)."
|
|
, rrdcalc_chart_name(rc), rrdcalc_name(rc), (unsigned long) needed, (unsigned long) first
|
|
, (unsigned long) last);
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
return 1;
|
|
}
|
|
|
|
static inline int check_if_resumed_from_suspension(void) {
|
|
static usec_t last_realtime = 0, last_monotonic = 0;
|
|
usec_t realtime = now_realtime_usec(), monotonic = now_monotonic_usec();
|
|
int ret = 0;
|
|
|
|
// detect if monotonic and realtime have twice the difference
|
|
// in which case we assume the system was just waken from hibernation
|
|
|
|
if(last_realtime && last_monotonic && realtime - last_realtime > 2 * (monotonic - last_monotonic))
|
|
ret = 1;
|
|
|
|
last_realtime = realtime;
|
|
last_monotonic = monotonic;
|
|
|
|
return ret;
|
|
}
|
|
|
|
static void health_main_cleanup(void *ptr) {
|
|
worker_unregister();
|
|
|
|
struct netdata_static_thread *static_thread = (struct netdata_static_thread *)ptr;
|
|
static_thread->enabled = NETDATA_MAIN_THREAD_EXITING;
|
|
|
|
info("cleaning up...");
|
|
|
|
static_thread->enabled = NETDATA_MAIN_THREAD_EXITED;
|
|
}
|
|
|
|
static SILENCE_TYPE check_silenced(RRDCALC *rc, const char *host, SILENCERS *silencers) {
|
|
SILENCER *s;
|
|
debug(D_HEALTH, "Checking if alarm was silenced via the command API. Alarm info name:%s context:%s chart:%s host:%s family:%s",
|
|
rrdcalc_name(rc), (rc->rrdset)?rrdset_context(rc->rrdset):"", rrdcalc_chart_name(rc), host, (rc->rrdset)?rrdset_family(rc->rrdset):"");
|
|
|
|
for (s = silencers->silencers; s!=NULL; s=s->next){
|
|
if (
|
|
(!s->alarms_pattern || (rc->name && s->alarms_pattern && simple_pattern_matches(s->alarms_pattern, rrdcalc_name(rc)))) &&
|
|
(!s->contexts_pattern || (rc->rrdset && rc->rrdset->context && s->contexts_pattern && simple_pattern_matches(s->contexts_pattern, rrdset_context(rc->rrdset)))) &&
|
|
(!s->hosts_pattern || (host && s->hosts_pattern && simple_pattern_matches(s->hosts_pattern,host))) &&
|
|
(!s->charts_pattern || (rc->chart && s->charts_pattern && simple_pattern_matches(s->charts_pattern, rrdcalc_chart_name(rc)))) &&
|
|
(!s->families_pattern || (rc->rrdset && rc->rrdset->family && s->families_pattern && simple_pattern_matches(s->families_pattern, rrdset_family(rc->rrdset))))
|
|
) {
|
|
debug(D_HEALTH, "Alarm matches command API silence entry %s:%s:%s:%s:%s", s->alarms,s->charts, s->contexts, s->hosts, s->families);
|
|
if (unlikely(silencers->stype == STYPE_NONE)) {
|
|
debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rrdcalc_name(rc));
|
|
} else {
|
|
debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
|
|
, (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
|
|
, rrdcalc_name(rc)
|
|
, (rc->rrdset)?rrdset_context(rc->rrdset):""
|
|
, rrdcalc_chart_name(rc)
|
|
, host
|
|
, (rc->rrdset)?rrdset_family(rc->rrdset):""
|
|
);
|
|
}
|
|
return silencers->stype;
|
|
}
|
|
}
|
|
return STYPE_NONE;
|
|
}
|
|
|
|
/**
|
|
* Update Disabled Silenced
|
|
*
|
|
* Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure
|
|
*
|
|
* @param host structure that contains information about the host monitored.
|
|
* @param rc structure with information about the alarm
|
|
*
|
|
* @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise
|
|
*/
|
|
static int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
|
|
uint32_t rrdcalc_flags_old = rc->run_flags;
|
|
// Clear the flags
|
|
rc->run_flags &= ~(RRDCALC_FLAG_DISABLED | RRDCALC_FLAG_SILENCED);
|
|
if (unlikely(silencers->all_alarms)) {
|
|
if (silencers->stype == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
|
|
else if (silencers->stype == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
|
|
} else {
|
|
SILENCE_TYPE st = check_silenced(rc, rrdhost_hostname(host), silencers);
|
|
if (st == STYPE_DISABLE_ALARMS) rc->run_flags |= RRDCALC_FLAG_DISABLED;
|
|
else if (st == STYPE_SILENCE_NOTIFICATIONS) rc->run_flags |= RRDCALC_FLAG_SILENCED;
|
|
}
|
|
|
|
if (rrdcalc_flags_old != rc->run_flags) {
|
|
info("Alarm silencing changed for host '%s' alarm '%s': Disabled %s->%s Silenced %s->%s",
|
|
rrdhost_hostname(host),
|
|
rrdcalc_name(rc),
|
|
(rrdcalc_flags_old & RRDCALC_FLAG_DISABLED)?"true":"false",
|
|
(rc->run_flags & RRDCALC_FLAG_DISABLED)?"true":"false",
|
|
(rrdcalc_flags_old & RRDCALC_FLAG_SILENCED)?"true":"false",
|
|
(rc->run_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
|
|
);
|
|
}
|
|
if (rc->run_flags & RRDCALC_FLAG_DISABLED)
|
|
return 1;
|
|
else
|
|
return 0;
|
|
}
|
|
|
|
// Create alarms for dimensions that have been added to charts
|
|
// since the previous iteration.
|
|
static void health_execute_pending_updates(RRDHOST *host) {
|
|
RRDSET *st;
|
|
|
|
if (!rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS))
|
|
return;
|
|
|
|
rrdset_foreach_reentrant(st, host) {
|
|
if(!rrdset_flag_check(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS))
|
|
continue;
|
|
|
|
RRDDIM *rd;
|
|
rrddim_foreach_read(rd, st) {
|
|
if(!rrddim_flag_check(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARMS))
|
|
continue;
|
|
|
|
RRDCALCTEMPLATE *rt;
|
|
foreach_rrdcalctemplate_read(host, rt) {
|
|
if(!rt->foreach_dimension_pattern)
|
|
continue;
|
|
|
|
if(rrdcalctemplate_check_rrdset_conditions(rt, st, host))
|
|
rrdcalctemplate_check_rrddim_conditions_and_link(rt, st, rd, host);
|
|
}
|
|
foreach_rrdcalctemplate_done(rt);
|
|
|
|
rrddim_flag_clear(rd, RRDDIM_FLAG_PENDING_FOREACH_ALARMS);
|
|
}
|
|
rrddim_foreach_done(rd);
|
|
rrdset_flag_clear(st, RRDSET_FLAG_PENDING_FOREACH_ALARMS);
|
|
}
|
|
rrdset_foreach_done(st);
|
|
rrdhost_flag_clear(host, RRDHOST_FLAG_PENDING_FOREACH_ALARMS);
|
|
}
|
|
|
|
/**
|
|
* Health Main
|
|
*
|
|
* The main thread of the health system. In this function all the alarms will be processed.
|
|
*
|
|
* @param ptr is a pointer to the netdata_static_thread structure.
|
|
*
|
|
* @return It always returns NULL
|
|
*/
|
|
|
|
#define WORKER_HEALTH_JOB_RRD_LOCK 0
|
|
#define WORKER_HEALTH_JOB_HOST_LOCK 1
|
|
#define WORKER_HEALTH_JOB_DB_QUERY 2
|
|
#define WORKER_HEALTH_JOB_CALC_EVAL 3
|
|
#define WORKER_HEALTH_JOB_WARNING_EVAL 4
|
|
#define WORKER_HEALTH_JOB_CRITICAL_EVAL 5
|
|
#define WORKER_HEALTH_JOB_ALARM_LOG_ENTRY 6
|
|
#define WORKER_HEALTH_JOB_ALARM_LOG_PROCESS 7
|
|
|
|
#if WORKER_UTILIZATION_MAX_JOB_TYPES < 8
|
|
#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 8
|
|
#endif
|
|
|
|
void *health_main(void *ptr) {
|
|
worker_register("HEALTH");
|
|
worker_register_job_name(WORKER_HEALTH_JOB_RRD_LOCK, "rrd lock");
|
|
worker_register_job_name(WORKER_HEALTH_JOB_HOST_LOCK, "host lock");
|
|
worker_register_job_name(WORKER_HEALTH_JOB_DB_QUERY, "db lookup");
|
|
worker_register_job_name(WORKER_HEALTH_JOB_CALC_EVAL, "calc eval");
|
|
worker_register_job_name(WORKER_HEALTH_JOB_WARNING_EVAL, "warning eval");
|
|
worker_register_job_name(WORKER_HEALTH_JOB_CRITICAL_EVAL, "critical eval");
|
|
worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY, "alarm log entry");
|
|
worker_register_job_name(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS, "alarm log process");
|
|
|
|
netdata_thread_cleanup_push(health_main_cleanup, ptr);
|
|
|
|
int min_run_every = (int)config_get_number(CONFIG_SECTION_HEALTH, "run at least every seconds", 10);
|
|
if(min_run_every < 1) min_run_every = 1;
|
|
|
|
int cleanup_sql_every_loop = 7200 / min_run_every;
|
|
|
|
time_t now = now_realtime_sec();
|
|
time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
|
|
|
|
rrdcalc_delete_alerts_not_matching_host_labels_from_all_hosts();
|
|
|
|
unsigned int loop = 0;
|
|
#ifdef ENABLE_ACLK
|
|
unsigned int marked_aclk_reload_loop = 0;
|
|
#endif
|
|
while(!netdata_exit) {
|
|
loop++;
|
|
debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
|
|
|
|
int runnable = 0, apply_hibernation_delay = 0;
|
|
time_t next_run = now + min_run_every;
|
|
RRDCALC *rc;
|
|
|
|
if (unlikely(check_if_resumed_from_suspension())) {
|
|
apply_hibernation_delay = 1;
|
|
|
|
info(
|
|
"Postponing alarm checks for %"PRId64" seconds, "
|
|
"because it seems that the system was just resumed from suspension.",
|
|
(int64_t)hibernation_delay);
|
|
}
|
|
|
|
if (unlikely(silencers->all_alarms && silencers->stype == STYPE_DISABLE_ALARMS)) {
|
|
static int logged=0;
|
|
if (!logged) {
|
|
info("Skipping health checks, because all alarms are disabled via a %s command.",
|
|
HEALTH_CMDAPI_CMD_DISABLEALL);
|
|
logged = 1;
|
|
}
|
|
}
|
|
|
|
#ifdef ENABLE_ACLK
|
|
if (aclk_alert_reloaded && !marked_aclk_reload_loop)
|
|
marked_aclk_reload_loop = loop;
|
|
#endif
|
|
|
|
worker_is_busy(WORKER_HEALTH_JOB_RRD_LOCK);
|
|
rrd_rdlock();
|
|
|
|
RRDHOST *host;
|
|
rrdhost_foreach_read(host) {
|
|
if (unlikely(!host->health_enabled))
|
|
continue;
|
|
|
|
if (unlikely(apply_hibernation_delay)) {
|
|
info(
|
|
"Postponing health checks for %"PRId64" seconds, on host '%s'.",
|
|
(int64_t)hibernation_delay,
|
|
rrdhost_hostname(host));
|
|
|
|
host->health_delay_up_to = now + hibernation_delay;
|
|
}
|
|
|
|
if (unlikely(host->health_delay_up_to)) {
|
|
if (unlikely(now < host->health_delay_up_to))
|
|
continue;
|
|
|
|
info("Resuming health checks on host '%s'.", rrdhost_hostname(host));
|
|
host->health_delay_up_to = 0;
|
|
}
|
|
|
|
// wait until cleanup of obsolete charts on children is complete
|
|
if (host != localhost)
|
|
if (unlikely(host->trigger_chart_obsoletion_check == 1))
|
|
continue;
|
|
|
|
if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
|
|
sql_health_alarm_log_cleanup(host);
|
|
|
|
health_execute_pending_updates(host);
|
|
|
|
worker_is_busy(WORKER_HEALTH_JOB_HOST_LOCK);
|
|
|
|
// the first loop is to lookup values from the db
|
|
foreach_rrdcalc_in_rrdhost_read(host, rc) {
|
|
|
|
rrdcalc_update_info_using_rrdset_labels(rc);
|
|
|
|
if (update_disabled_silenced(host, rc))
|
|
continue;
|
|
|
|
// create an alert removed event if the chart is obsolete and
|
|
// has stopped being collected for 60 seconds
|
|
if (unlikely(rc->rrdset && rc->status != RRDCALC_STATUS_REMOVED &&
|
|
rrdset_flag_check(rc->rrdset, RRDSET_FLAG_OBSOLETE) &&
|
|
now > (rc->rrdset->last_collected_time.tv_sec + 60))) {
|
|
if (!rrdcalc_isrepeating(rc)) {
|
|
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
|
|
time_t now = now_realtime_sec();
|
|
|
|
ALARM_ENTRY *ae = health_create_alarm_entry(
|
|
host,
|
|
rc->id,
|
|
rc->next_event_id++,
|
|
rc->config_hash_id,
|
|
now,
|
|
rc->name,
|
|
rc->rrdset->id,
|
|
rc->rrdset->context,
|
|
rc->rrdset->family,
|
|
rc->classification,
|
|
rc->component,
|
|
rc->type,
|
|
rc->exec,
|
|
rc->recipient,
|
|
now - rc->last_status_change,
|
|
rc->value,
|
|
NAN,
|
|
rc->status,
|
|
RRDCALC_STATUS_REMOVED,
|
|
rc->source,
|
|
rc->units,
|
|
rc->info,
|
|
0,
|
|
rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0);
|
|
|
|
if (ae) {
|
|
health_alarm_log_add_entry(host, ae);
|
|
rc->old_status = rc->status;
|
|
rc->status = RRDCALC_STATUS_REMOVED;
|
|
rc->last_status_change = now;
|
|
rc->last_updated = now;
|
|
rc->value = NAN;
|
|
#ifdef ENABLE_ACLK
|
|
if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
|
|
sql_queue_alarm_to_aclk(host, ae, 1);
|
|
#endif
|
|
}
|
|
}
|
|
}
|
|
|
|
if (unlikely(!rrdcalc_isrunnable(rc, now, &next_run))) {
|
|
if (unlikely(rc->run_flags & RRDCALC_FLAG_RUNNABLE))
|
|
rc->run_flags &= ~RRDCALC_FLAG_RUNNABLE;
|
|
continue;
|
|
}
|
|
|
|
runnable++;
|
|
rc->old_value = rc->value;
|
|
rc->run_flags |= RRDCALC_FLAG_RUNNABLE;
|
|
|
|
// ------------------------------------------------------------
|
|
// if there is database lookup, do it
|
|
|
|
if (unlikely(RRDCALC_HAS_DB_LOOKUP(rc))) {
|
|
worker_is_busy(WORKER_HEALTH_JOB_DB_QUERY);
|
|
|
|
/* time_t old_db_timestamp = rc->db_before; */
|
|
int value_is_null = 0;
|
|
|
|
int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rrdcalc_dimensions(rc), 1,
|
|
rc->after, rc->before, rc->group, NULL,
|
|
0, rc->options,
|
|
&rc->db_after,&rc->db_before,
|
|
NULL, NULL, NULL,
|
|
&value_is_null, NULL, 0, 0);
|
|
|
|
if (unlikely(ret != 200)) {
|
|
// database lookup failed
|
|
rc->value = NAN;
|
|
rc->run_flags |= RRDCALC_FLAG_DB_ERROR;
|
|
|
|
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup returned error %d",
|
|
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), ret
|
|
);
|
|
} else
|
|
rc->run_flags &= ~RRDCALC_FLAG_DB_ERROR;
|
|
|
|
/* - RRDCALC_FLAG_DB_STALE not currently used
|
|
if (unlikely(old_db_timestamp == rc->db_before)) {
|
|
// database is stale
|
|
|
|
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
|
|
|
|
if (unlikely(!(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))) {
|
|
rc->rrdcalc_flags |= RRDCALC_FLAG_DB_STALE;
|
|
error("Health on host '%s', alarm '%s.%s': database is stale", host->hostname, rc->chart?rc->chart:"NOCHART", rc->name);
|
|
}
|
|
}
|
|
else if (unlikely(rc->rrdcalc_flags & RRDCALC_FLAG_DB_STALE))
|
|
rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_STALE;
|
|
*/
|
|
|
|
if (unlikely(value_is_null)) {
|
|
// collected value is null
|
|
rc->value = NAN;
|
|
rc->run_flags |= RRDCALC_FLAG_DB_NAN;
|
|
|
|
debug(D_HEALTH,
|
|
"Health on host '%s', alarm '%s.%s': database lookup returned empty value (possibly value is not collected yet)",
|
|
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc)
|
|
);
|
|
} else
|
|
rc->run_flags &= ~RRDCALC_FLAG_DB_NAN;
|
|
|
|
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT,
|
|
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc), rc->value
|
|
);
|
|
}
|
|
|
|
// ------------------------------------------------------------
|
|
// if there is calculation expression, run it
|
|
|
|
if (unlikely(rc->calculation)) {
|
|
worker_is_busy(WORKER_HEALTH_JOB_CALC_EVAL);
|
|
|
|
if (unlikely(!expression_evaluate(rc->calculation))) {
|
|
// calculation failed
|
|
rc->value = NAN;
|
|
rc->run_flags |= RRDCALC_FLAG_CALC_ERROR;
|
|
|
|
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' failed: %s",
|
|
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
|
|
rc->calculation->parsed_as, buffer_tostring(rc->calculation->error_msg)
|
|
);
|
|
} else {
|
|
rc->run_flags &= ~RRDCALC_FLAG_CALC_ERROR;
|
|
|
|
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
|
|
NETDATA_DOUBLE_FORMAT
|
|
": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
|
|
rc->calculation->parsed_as, rc->calculation->result,
|
|
buffer_tostring(rc->calculation->error_msg), rrdcalc_source(rc)
|
|
);
|
|
|
|
rc->value = rc->calculation->result;
|
|
}
|
|
}
|
|
}
|
|
foreach_rrdcalc_in_rrdhost_done(rc);
|
|
|
|
if (unlikely(runnable && !netdata_exit)) {
|
|
foreach_rrdcalc_in_rrdhost_read(host, rc) {
|
|
if (unlikely(!(rc->run_flags & RRDCALC_FLAG_RUNNABLE)))
|
|
continue;
|
|
|
|
if (rc->run_flags & RRDCALC_FLAG_DISABLED) {
|
|
continue;
|
|
}
|
|
RRDCALC_STATUS warning_status = RRDCALC_STATUS_UNDEFINED;
|
|
RRDCALC_STATUS critical_status = RRDCALC_STATUS_UNDEFINED;
|
|
|
|
// --------------------------------------------------------
|
|
// check the warning expression
|
|
|
|
if (likely(rc->warning)) {
|
|
worker_is_busy(WORKER_HEALTH_JOB_WARNING_EVAL);
|
|
|
|
if (unlikely(!expression_evaluate(rc->warning))) {
|
|
// calculation failed
|
|
rc->run_flags |= RRDCALC_FLAG_WARN_ERROR;
|
|
|
|
debug(D_HEALTH,
|
|
"Health on host '%s', alarm '%s.%s': warning expression failed with error: %s",
|
|
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
|
|
buffer_tostring(rc->warning->error_msg)
|
|
);
|
|
} else {
|
|
rc->run_flags &= ~RRDCALC_FLAG_WARN_ERROR;
|
|
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
|
|
NETDATA_DOUBLE_FORMAT
|
|
": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
|
|
rrdcalc_name(rc), rc->warning->result, buffer_tostring(rc->warning->error_msg), rrdcalc_source(rc)
|
|
);
|
|
warning_status = rrdcalc_value2status(rc->warning->result);
|
|
}
|
|
}
|
|
|
|
// --------------------------------------------------------
|
|
// check the critical expression
|
|
|
|
if (likely(rc->critical)) {
|
|
worker_is_busy(WORKER_HEALTH_JOB_CRITICAL_EVAL);
|
|
|
|
if (unlikely(!expression_evaluate(rc->critical))) {
|
|
// calculation failed
|
|
rc->run_flags |= RRDCALC_FLAG_CRIT_ERROR;
|
|
|
|
debug(D_HEALTH,
|
|
"Health on host '%s', alarm '%s.%s': critical expression failed with error: %s",
|
|
rrdhost_hostname(host), rrdcalc_chart_name(rc), rrdcalc_name(rc),
|
|
buffer_tostring(rc->critical->error_msg)
|
|
);
|
|
} else {
|
|
rc->run_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
|
|
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
|
|
NETDATA_DOUBLE_FORMAT
|
|
": %s (source: %s)", rrdhost_hostname(host), rrdcalc_chart_name(rc),
|
|
rrdcalc_name(rc), rc->critical->result, buffer_tostring(rc->critical->error_msg),
|
|
rrdcalc_source(rc)
|
|
);
|
|
critical_status = rrdcalc_value2status(rc->critical->result);
|
|
}
|
|
}
|
|
|
|
// --------------------------------------------------------
|
|
// decide the final alarm status
|
|
|
|
RRDCALC_STATUS status = RRDCALC_STATUS_UNDEFINED;
|
|
|
|
switch (warning_status) {
|
|
case RRDCALC_STATUS_CLEAR:
|
|
status = RRDCALC_STATUS_CLEAR;
|
|
break;
|
|
|
|
case RRDCALC_STATUS_RAISED:
|
|
status = RRDCALC_STATUS_WARNING;
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
switch (critical_status) {
|
|
case RRDCALC_STATUS_CLEAR:
|
|
if (status == RRDCALC_STATUS_UNDEFINED)
|
|
status = RRDCALC_STATUS_CLEAR;
|
|
break;
|
|
|
|
case RRDCALC_STATUS_RAISED:
|
|
status = RRDCALC_STATUS_CRITICAL;
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
// --------------------------------------------------------
|
|
// check if the new status and the old differ
|
|
|
|
if (status != rc->status) {
|
|
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
|
|
int delay = 0;
|
|
|
|
// apply trigger hysteresis
|
|
|
|
if (now > rc->delay_up_to_timestamp) {
|
|
rc->delay_up_current = rc->delay_up_duration;
|
|
rc->delay_down_current = rc->delay_down_duration;
|
|
rc->delay_last = 0;
|
|
rc->delay_up_to_timestamp = 0;
|
|
} else {
|
|
rc->delay_up_current = (int) (rc->delay_up_current * rc->delay_multiplier);
|
|
if (rc->delay_up_current > rc->delay_max_duration)
|
|
rc->delay_up_current = rc->delay_max_duration;
|
|
|
|
rc->delay_down_current = (int) (rc->delay_down_current * rc->delay_multiplier);
|
|
if (rc->delay_down_current > rc->delay_max_duration)
|
|
rc->delay_down_current = rc->delay_max_duration;
|
|
}
|
|
|
|
if (status > rc->status)
|
|
delay = rc->delay_up_current;
|
|
else
|
|
delay = rc->delay_down_current;
|
|
|
|
// COMMENTED: because we do need to send raising alarms
|
|
// if(now + delay < rc->delay_up_to_timestamp)
|
|
// delay = (int)(rc->delay_up_to_timestamp - now);
|
|
|
|
rc->delay_last = delay;
|
|
rc->delay_up_to_timestamp = now + delay;
|
|
|
|
|
|
ALARM_ENTRY *ae = health_create_alarm_entry(
|
|
host,
|
|
rc->id,
|
|
rc->next_event_id++,
|
|
rc->config_hash_id,
|
|
now,
|
|
rc->name,
|
|
rc->rrdset->id,
|
|
rc->rrdset->context,
|
|
rc->rrdset->family,
|
|
rc->classification,
|
|
rc->component,
|
|
rc->type,
|
|
rc->exec,
|
|
rc->recipient,
|
|
now - rc->last_status_change,
|
|
rc->old_value,
|
|
rc->value,
|
|
rc->status,
|
|
status,
|
|
rc->source,
|
|
rc->units,
|
|
rc->info,
|
|
rc->delay_last,
|
|
(
|
|
((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
|
|
((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
|
|
(rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
|
|
)
|
|
);
|
|
|
|
health_alarm_log_add_entry(host, ae);
|
|
|
|
rc->last_status_change = now;
|
|
rc->old_status = rc->status;
|
|
rc->status = status;
|
|
}
|
|
|
|
rc->last_updated = now;
|
|
rc->next_update = now + rc->update_every;
|
|
|
|
if (next_run > rc->next_update)
|
|
next_run = rc->next_update;
|
|
}
|
|
foreach_rrdcalc_in_rrdhost_done(rc);
|
|
|
|
// process repeating alarms
|
|
foreach_rrdcalc_in_rrdhost_read(host, rc) {
|
|
int repeat_every = 0;
|
|
if(unlikely(rrdcalc_isrepeating(rc) && rc->delay_up_to_timestamp <= now)) {
|
|
if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
|
|
rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
|
|
repeat_every = rc->warn_repeat_every;
|
|
} else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
|
|
rc->run_flags &= ~RRDCALC_FLAG_RUN_ONCE;
|
|
repeat_every = rc->crit_repeat_every;
|
|
} else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
|
|
if(!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE)) {
|
|
if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
|
|
repeat_every = 1;
|
|
} else if (rc->old_status == RRDCALC_STATUS_WARNING) {
|
|
repeat_every = 1;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
continue;
|
|
}
|
|
|
|
if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
|
|
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
|
|
rc->last_repeat = now;
|
|
if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
|
|
|
|
ALARM_ENTRY *ae = health_create_alarm_entry(
|
|
host,
|
|
rc->id,
|
|
rc->next_event_id++,
|
|
rc->config_hash_id,
|
|
now,
|
|
rc->name,
|
|
rc->rrdset->id,
|
|
rc->rrdset->context,
|
|
rc->rrdset->family,
|
|
rc->classification,
|
|
rc->component,
|
|
rc->type,
|
|
rc->exec,
|
|
rc->recipient,
|
|
now - rc->last_status_change,
|
|
rc->old_value,
|
|
rc->value,
|
|
rc->old_status,
|
|
rc->status,
|
|
rc->source,
|
|
rc->units,
|
|
rc->info,
|
|
rc->delay_last,
|
|
(
|
|
((rc->options & RRDCALC_OPTION_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
|
|
((rc->run_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) |
|
|
(rrdcalc_isrepeating(rc)?HEALTH_ENTRY_FLAG_IS_REPEATING:0)
|
|
)
|
|
);
|
|
|
|
ae->last_repeat = rc->last_repeat;
|
|
if (!(rc->run_flags & RRDCALC_FLAG_RUN_ONCE) && rc->status == RRDCALC_STATUS_CLEAR) {
|
|
ae->flags |= HEALTH_ENTRY_RUN_ONCE;
|
|
}
|
|
rc->run_flags |= RRDCALC_FLAG_RUN_ONCE;
|
|
health_process_notifications(host, ae);
|
|
debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
|
|
health_alarm_wait_for_execution(ae);
|
|
health_alarm_log_free_one_nochecks_nounlink(ae);
|
|
}
|
|
}
|
|
foreach_rrdcalc_in_rrdhost_done(rc);
|
|
}
|
|
|
|
if (unlikely(netdata_exit))
|
|
break;
|
|
|
|
// execute notifications
|
|
// and cleanup
|
|
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_PROCESS);
|
|
health_alarm_log_process(host);
|
|
|
|
if (unlikely(netdata_exit)) {
|
|
// wait for all notifications to finish before allowing health to be cleaned up
|
|
ALARM_ENTRY *ae;
|
|
while (NULL != (ae = alarm_notifications_in_progress.head)) {
|
|
health_alarm_wait_for_execution(ae);
|
|
}
|
|
break;
|
|
}
|
|
|
|
} /* rrdhost_foreach */
|
|
|
|
// wait for all notifications to finish before allowing health to be cleaned up
|
|
ALARM_ENTRY *ae;
|
|
while (NULL != (ae = alarm_notifications_in_progress.head)) {
|
|
health_alarm_wait_for_execution(ae);
|
|
}
|
|
|
|
#ifdef ENABLE_ACLK
|
|
if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
|
|
rrdhost_foreach_read(host) {
|
|
if (unlikely(!host->health_enabled))
|
|
continue;
|
|
sql_queue_removed_alerts_to_aclk(host);
|
|
}
|
|
aclk_alert_reloaded = 0;
|
|
marked_aclk_reload_loop = 0;
|
|
}
|
|
#endif
|
|
|
|
rrd_unlock();
|
|
|
|
if(unlikely(netdata_exit))
|
|
break;
|
|
|
|
now = now_realtime_sec();
|
|
if(now < next_run) {
|
|
worker_is_idle();
|
|
debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration in %d secs", loop, (int) (next_run - now));
|
|
sleep_usec(USEC_PER_SEC * (usec_t) (next_run - now));
|
|
now = now_realtime_sec();
|
|
}
|
|
else
|
|
debug(D_HEALTH, "Health monitoring iteration no %u done. Next iteration now", loop);
|
|
|
|
} // forever
|
|
|
|
netdata_thread_cleanup_pop(1);
|
|
return NULL;
|
|
}
|