0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-14 01:29:11 +00:00
netdata_netdata/libnetdata/clocks/clocks.c
Costa Tsaousis f466b8aef5
DYNCFG: dynamically configured alerts ()
* cleanup alerts

* fix references

* fix references

* fix references

* load alerts once and apply them to each node

* simplify health_create_alarm_entry()

* Compile without warnings with compiler flags:

   -Wall -Wextra -Wformat=2 -Wshadow -Wno-format-nonliteral -Winit-self

* code re-organization and cleanup

* generate patterns when applying prototypes; give unique dyncfg names to all alerts

* eval expressions keep the source and the parsed_as as STRING pointers

* renamed host to node in dyncfg ids

* renamed host to node in dyncfg ids

* add all cloud roles to the list of parsed X-Netdata-Role header and also default to member access level

* working functionality

* code re-organization: moved health event-loop to a new file, moved health globals to health.c

* rrdcalctemplate is removed; alert_cfg is removed; foreach dimension is removed; RRDCALCs are now instantiated only when they are linked to RRDSETs

* dyncfg alert prototypes initialization for alerts

* health dyncfg split to separate file

* cleanup not-needed code

* normalize matches between parsing and json

* also detect !* for disabled alerts

* dyncfg capability disabled

* Store alert config part1

* Add rrdlabels_common_count

* wip health variables lookup without indexes

* Improve rrdlabels_common_count by reusing rrdlabels_find_label_with_key_unsafe with an additional parameter

* working variables with runtime lookup

* working variables with runtime lookup

* delete rrddimvar and rrdfamily index

* remove rrdsetvar; now all variables are in RRDVARs inside hosts and charts

* added /api/v1/variable that resolves a variable the same way alerts do

* remove rrdcalc from eval

* remove debug code

* remove duplicate assignment

* Fix memory leak

* all alert variables are now handled by alert_variable_lookup() and EVAL is now independent of alerts

* hide all internal structures of EVAL

* Enable -Wformat flag

Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>

* Adjust binding for calculation, warning, critical

* Remove unused macro

* Update config hash id

* use the right info and summary in alerts log

* use synchronous queries for alerts

* Handle cases when config_hash_id is missing from health_log

* remove deadlock from health worker

* parsing to json payload for health alert prototypes

* cleaner parsing and avoiding memory leaks in case of duplicate members in json

* fix left-over rename of function

* Keep original lookup field to send to the cloud
Cleanup / rename function to store config
Remove unused DEFINEs, functions

* Use ac->lookup

* link jobs to the host when the template is registered; do not accept running a function without a host

* full dyncfg support for health alerts, except action TEST

* working dyncfg additions, updates, removals

* fixed missing source, wrong status updates

* add alerts by type, component, classification, recipient and module at the /api/v2/alerts endpoint

* fix dyncfg unittest

* rename functions

* generalize the json-c parser macros and move them to libnetdata

* report progress when enabling and disabling dyncfg templates

* moved rrdcalc and rrdvar to health

* update alarms

* added schema for alerts; separated alert_action_options from rrdr_options; restructured the json payload for alerts

* enable parsed json alerts; allow sending back accepted but disabled

* added format_version for alerts payload; the enabled/disabled status is now also inherited from the status of the rules; fixed variable names in json output

* remove the RRDHOST pointer from DYNCFG

* Fix command field submitted to the cloud

* do not send updates to creation requests, for DYNCFG jobs

---------

Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Co-authored-by: Tasos Katsoulas <tasos@netdata.cloud>
Co-authored-by: ilyam8 <ilya@netdata.cloud>
2024-01-23 20:20:41 +02:00

478 lines
16 KiB
C

// SPDX-License-Identifier: GPL-3.0-or-later
#include "../libnetdata.h"
// defaults are for compatibility
// call clocks_init() once, to optimize these default settings
// clock id used by the now_boottime_*() functions; set by test_clock_boottime()
static clockid_t clock_boottime_to_use = CLOCK_MONOTONIC;
// clock id used by the now_monotonic_*() functions; set by test_clock_monotonic_raw()
static clockid_t clock_monotonic_to_use = CLOCK_MONOTONIC;
// the default clock resolution is 1ms
#define DEFAULT_CLOCK_RESOLUTION_UT ((usec_t)0 * USEC_PER_SEC + (usec_t)1 * USEC_PER_MS)
// the max clock resolution is 10ms
#define MAX_CLOCK_RESOLUTION_UT ((usec_t)0 * USEC_PER_SEC + (usec_t)10 * USEC_PER_MS)
// clock resolutions in microseconds; clocks_init() replaces these defaults
// with the values returned by get_clock_resolution()
usec_t clock_monotonic_resolution = DEFAULT_CLOCK_RESOLUTION_UT;
usec_t clock_realtime_resolution = DEFAULT_CLOCK_RESOLUTION_UT;
#ifndef HAVE_CLOCK_GETTIME
// Replacement clock_gettime(), compiled only when HAVE_CLOCK_GETTIME is not defined.
// The requested clock id is ignored; the time always comes from gettimeofday().
// Returns 0 on success, or -1 (after logging) when gettimeofday() fails.
inline int clock_gettime(clockid_t clk_id __maybe_unused, struct timespec *ts) {
    struct timeval now;
    int rc = gettimeofday(&now, NULL);

    if(unlikely(rc == -1)) {
        netdata_log_error("gettimeofday() failed.");
        return -1;
    }

    // convert the microseconds part of the timeval to nanoseconds
    ts->tv_nsec = (long)((now.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC);
    ts->tv_sec = now.tv_sec;

    return 0;
}
#endif
// Similar to CLOCK_MONOTONIC, but provides access to a raw hardware-based time that is not subject to NTP adjustments
// or the incremental adjustments performed by adjtime(3). This clock does not count time that the system is suspended
static void test_clock_monotonic_raw(void) {
#ifdef CLOCK_MONOTONIC_RAW
struct timespec ts;
if(clock_gettime(CLOCK_MONOTONIC_RAW, &ts) == -1 && errno == EINVAL)
clock_monotonic_to_use = CLOCK_MONOTONIC;
else
clock_monotonic_to_use = CLOCK_MONOTONIC_RAW;
#else
clock_monotonic_to_use = CLOCK_MONOTONIC;
#endif
}
// A binary built with CLOCK_BOOTTIME may run on a linux kernel older than Linux 2.6.39, where the
// clock_gettime(2) system call fails with EINVAL for that clock id.
//
// Selects the clock used by the now_boottime_*() functions: CLOCK_BOOTTIME unless the probe fails
// with EINVAL, in which case the already selected monotonic clock is used instead.
static void test_clock_boottime(void) {
    struct timespec probe;
    int rejected = (clock_gettime(CLOCK_BOOTTIME, &probe) == -1 && errno == EINVAL);

    clock_boottime_to_use = rejected ? clock_monotonic_to_use : CLOCK_BOOTTIME;
}
// Returns the resolution of the given clock in microseconds, as reported by clock_getres().
//
// - a resolution finer than 1 microsecond is rounded up to 1
// - a resolution above MAX_CLOCK_RESOLUTION_UT, a zero resolution, or a failure of clock_getres()
//   is logged and replaced by DEFAULT_CLOCK_RESOLUTION_UT
//
// The result is never zero: the heartbeat code uses the resolution as a modulo divisor,
// so a zero value would cause a division by zero.
static usec_t get_clock_resolution(clockid_t clock) {
    struct timespec ts = { 0 };

    if(clock_getres(clock, &ts) != 0) {
        nd_log(NDLS_DAEMON, NDLP_ERR, "clock_getres(%d) failed, using defaults for clock resolution.", (int)clock);
        return DEFAULT_CLOCK_RESOLUTION_UT;
    }

    usec_t ret = (usec_t)ts.tv_sec * USEC_PER_SEC + (usec_t)ts.tv_nsec / NSEC_PER_USEC;

    if(!ret) {
        // sub-microsecond resolution: the finest granularity we can express is 1 usec
        if(ts.tv_nsec > 0 && ts.tv_nsec < (long int)NSEC_PER_USEC)
            return (usec_t)1;

        // the clock reported no resolution at all
        nd_log(NDLS_DAEMON, NDLP_ERR, "clock_getres(%d) returned a zero resolution, using defaults for clock resolution.", (int)clock);
        return DEFAULT_CLOCK_RESOLUTION_UT;
    }

    if(ret > MAX_CLOCK_RESOLUTION_UT) {
        nd_log(NDLS_DAEMON, NDLP_ERR, "clock_getres(%d) returned %"PRIu64" usec is out of range, using defaults for clock resolution.", (int)clock, ret);
        return DEFAULT_CLOCK_RESOLUTION_UT;
    }

    return ret;
}
// perform any initializations required for clocks:
// selects the monotonic and boottime clock ids to use and
// measures the resolution of the monotonic and realtime clocks
void clocks_init(void) {
// monotonic raw has to be tested before boottime
test_clock_monotonic_raw();
// boottime has to be tested after monotonic raw,
// because it falls back to the monotonic clock selected above
test_clock_boottime();
// cache the clock resolutions (in microseconds) in the global variables
clock_monotonic_resolution = get_clock_resolution(clock_monotonic_to_use);
clock_realtime_resolution = get_clock_resolution(CLOCK_REALTIME);
}
// Returns the current time of the given clock in whole seconds.
// On clock_gettime() failure an error is logged and 0 is returned.
inline time_t now_sec(clockid_t clk_id) {
    struct timespec now;
    time_t seconds = 0;

    if(unlikely(clock_gettime(clk_id, &now) == -1))
        netdata_log_error("clock_gettime(%d, &timespec) failed.", clk_id);
    else
        seconds = now.tv_sec;

    return seconds;
}
// Returns the current time of the given clock in microseconds.
// On clock_gettime() failure an error is logged and 0 is returned.
inline usec_t now_usec(clockid_t clk_id) {
    struct timespec now;

    if(unlikely(clock_gettime(clk_id, &now) == -1)) {
        netdata_log_error("clock_gettime(%d, &timespec) failed.", clk_id);
        return 0;
    }

    // whole seconds and the sub-second part, both expressed in microseconds
    usec_t seconds_ut = (usec_t)now.tv_sec * USEC_PER_SEC;
    usec_t fraction_ut = (usec_t)(now.tv_nsec % NSEC_PER_SEC) / NSEC_PER_USEC;

    return seconds_ut + fraction_ut;
}
// Fills *tv with the current time of the given clock (seconds + microseconds).
// Returns 0 on success. On clock_gettime() failure an error is logged,
// *tv is zeroed and -1 is returned.
inline int now_timeval(clockid_t clk_id, struct timeval *tv) {
    struct timespec now;
    int failed = (clock_gettime(clk_id, &now) == -1);

    if(unlikely(failed)) {
        netdata_log_error("clock_gettime(%d, &timespec) failed.", clk_id);
        tv->tv_usec = 0;
        tv->tv_sec = 0;
        return -1;
    }

    // nanoseconds are truncated to microseconds
    tv->tv_usec = (suseconds_t)((now.tv_nsec % NSEC_PER_SEC) / NSEC_PER_USEC);
    tv->tv_sec = now.tv_sec;

    return 0;
}
// returns the CLOCK_REALTIME time in whole seconds (0 if the clock cannot be read, see now_sec())
inline time_t now_realtime_sec(void) {
return now_sec(CLOCK_REALTIME);
}
// returns the CLOCK_REALTIME time in milliseconds, derived from now_usec()
inline msec_t now_realtime_msec(void) {
return now_usec(CLOCK_REALTIME) / USEC_PER_MS;
}
// returns the CLOCK_REALTIME time in microseconds (0 if the clock cannot be read, see now_usec())
inline usec_t now_realtime_usec(void) {
return now_usec(CLOCK_REALTIME);
}
// fills *tv with the CLOCK_REALTIME time; returns 0 on success, -1 on failure (see now_timeval())
inline int now_realtime_timeval(struct timeval *tv) {
return now_timeval(CLOCK_REALTIME, tv);
}
// returns the time of the selected monotonic clock (clock_monotonic_to_use) in whole seconds
inline time_t now_monotonic_sec(void) {
return now_sec(clock_monotonic_to_use);
}
// returns the time of the selected monotonic clock (clock_monotonic_to_use) in microseconds
inline usec_t now_monotonic_usec(void) {
return now_usec(clock_monotonic_to_use);
}
// fills *tv with the time of the selected monotonic clock (clock_monotonic_to_use);
// returns 0 on success, -1 on failure (see now_timeval())
inline int now_monotonic_timeval(struct timeval *tv) {
return now_timeval(clock_monotonic_to_use, tv);
}
// returns the CLOCK_MONOTONIC time in whole seconds;
// always uses CLOCK_MONOTONIC, regardless of the clock selected by clocks_init()
inline time_t now_monotonic_high_precision_sec(void) {
return now_sec(CLOCK_MONOTONIC);
}
// returns the CLOCK_MONOTONIC time in microseconds;
// always uses CLOCK_MONOTONIC, regardless of the clock selected by clocks_init()
inline usec_t now_monotonic_high_precision_usec(void) {
return now_usec(CLOCK_MONOTONIC);
}
// fills *tv with the CLOCK_MONOTONIC time; returns 0 on success, -1 on failure (see now_timeval());
// always uses CLOCK_MONOTONIC, regardless of the clock selected by clocks_init()
inline int now_monotonic_high_precision_timeval(struct timeval *tv) {
return now_timeval(CLOCK_MONOTONIC, tv);
}
// returns the time of the selected boottime clock (clock_boottime_to_use) in whole seconds
inline time_t now_boottime_sec(void) {
return now_sec(clock_boottime_to_use);
}
// returns the time of the selected boottime clock (clock_boottime_to_use) in microseconds
inline usec_t now_boottime_usec(void) {
return now_usec(clock_boottime_to_use);
}
// fills *tv with the time of the selected boottime clock (clock_boottime_to_use);
// returns 0 on success, -1 on failure (see now_timeval())
inline int now_boottime_timeval(struct timeval *tv) {
return now_timeval(clock_boottime_to_use, tv);
}
inline usec_t timeval_usec(struct timeval *tv) {
return (usec_t)tv->tv_sec * USEC_PER_SEC + (tv->tv_usec % USEC_PER_SEC);
}
inline msec_t timeval_msec(struct timeval *tv) {
return (msec_t)tv->tv_sec * MSEC_PER_SEC + ((tv->tv_usec % USEC_PER_SEC) / MSEC_PER_SEC);
}
// Returns the signed difference (now - old) in microseconds:
// positive when 'now' is later than 'old', negative when it is earlier.
inline susec_t dt_usec_signed(struct timeval *now, struct timeval *old) {
    const usec_t newer_ut = timeval_usec(now);
    const usec_t older_ut = timeval_usec(old);
    susec_t delta;

    // subtract the smaller from the larger, then apply the sign
    if(likely(newer_ut >= older_ut))
        delta = (susec_t)(newer_ut - older_ut);
    else
        delta = -((susec_t)(older_ut - newer_ut));

    return delta;
}
// Returns the absolute difference between two timevals, in microseconds.
inline usec_t dt_usec(struct timeval *now, struct timeval *old) {
    const usec_t first_ut = timeval_usec(now);
    const usec_t second_ut = timeval_usec(old);

    if(first_ut > second_ut)
        return first_ut - second_ut;

    return second_ut - first_ut;
}
#ifdef __linux__
// Sleeps until the CLOCK_REALTIME clock reaches the absolute time 'usec'
// (microseconds, on the same scale as now_realtime_usec()).
//
// Interrupted sleeps (EINTR) are restarted. Any other clock_nanosleep() failure is logged
// (once per failure kind) and the function falls back to a relative sleep for the time
// that is still remaining, then returns.
void sleep_to_absolute_time(usec_t usec) {
    static int einval_printed = 0, enotsup_printed = 0, eunknown_printed = 0;
    clockid_t clock = CLOCK_REALTIME;

    struct timespec req = {
        .tv_sec = (time_t)(usec / USEC_PER_SEC),
        .tv_nsec = (long)((usec % USEC_PER_SEC) * NSEC_PER_USEC)
    };

    errno = 0;
    int ret = 0;
    // clock_nanosleep() returns the error number directly, it does not set errno
    while((ret = clock_nanosleep(clock, TIMER_ABSTIME, &req, NULL)) != 0) {
        if(ret == EINTR) {
            // interrupted by a signal: the deadline is absolute, so just retry
            errno = 0;
            continue;
        }

        if(ret == EINVAL) {
            if(!einval_printed) {
                einval_printed++;
                netdata_log_error("Invalid time given to clock_nanosleep(): clockid = %d, tv_sec = %lld, tv_nsec = %ld",
                                  clock,
                                  (long long)req.tv_sec,
                                  req.tv_nsec);
            }
        }
        else if(ret == ENOTSUP) {
            if(!enotsup_printed) {
                enotsup_printed++;
                netdata_log_error("Invalid clock id given to clock_nanosleep(): clockid = %d, tv_sec = %lld, tv_nsec = %ld",
                                  clock,
                                  (long long)req.tv_sec,
                                  req.tv_nsec);
            }
        }
        else {
            if(!eunknown_printed) {
                eunknown_printed++;
                netdata_log_error("Unknown return value %d from clock_nanosleep(): clockid = %d, tv_sec = %lld, tv_nsec = %ld",
                                  ret,
                                  clock,
                                  (long long)req.tv_sec,
                                  req.tv_nsec);
            }
        }

        // Fallback: 'usec' is an absolute timestamp, so it must be converted to the
        // remaining relative duration before handing it to sleep_usec().
        // now_realtime_usec() returns 0 when the clock cannot be read; do not sleep then.
        usec_t now_ut = now_realtime_usec();
        if(now_ut && usec > now_ut)
            sleep_usec(usec - now_ut);

        // the same request would fail again; do not retry it
        break;
    }
}
#endif
// number of heartbeats for which alignment statistics are kept
#define HEARTBEAT_ALIGNMENT_STATISTICS_SIZE 10
// held by heartbeat_init() while it assigns and increments heartbeat_alignment_id
netdata_mutex_t heartbeat_alignment_mutex = NETDATA_MUTEX_INITIALIZER;
// the statistics id the next heartbeat_init() call will assign
static size_t heartbeat_alignment_id = 0;
// per-heartbeat accumulators, updated by heartbeat_next():
// - sequence: incremented on every heartbeat_next() wake-up
// - dt:       running sum of (wake-up time - scheduled time)
struct heartbeat_thread_statistics {
size_t sequence;
usec_t dt;
};
// one slot per heartbeat with a statistics id below HEARTBEAT_ALIGNMENT_STATISTICS_SIZE
static struct heartbeat_thread_statistics heartbeat_alignment_values[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE] = { 0 };
// Reports how far heartbeats woke up from their scheduled time since the previous call.
//
// For every statistics slot whose sequence changed since the last call, the increase of its
// accumulated dt is taken as one sample. The minimum, maximum and average of these samples and
// their count are stored through the corresponding pointers (each pointer may be NULL).
// All outputs are zero when no slot changed.
void heartbeat_statistics(usec_t *min_ptr, usec_t *max_ptr, usec_t *average_ptr, size_t *count_ptr) {
    // snapshot taken by the previous call
    static struct heartbeat_thread_statistics old[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE] = { 0 };
    struct heartbeat_thread_statistics current[HEARTBEAT_ALIGNMENT_STATISTICS_SIZE];

    memcpy(current, heartbeat_alignment_values, sizeof(current));

    usec_t min = 0, max = 0, total = 0;
    size_t count = 0;

    for(size_t slot = 0; slot < HEARTBEAT_ALIGNMENT_STATISTICS_SIZE; slot++) {
        // skip heartbeats that did not tick since the previous call
        if(current[slot].sequence == old[slot].sequence)
            continue;

        usec_t value = current[slot].dt - old[slot].dt;

        // the first sample initializes both min and max
        if(!count || value < min) min = value;
        if(!count || value > max) max = value;

        total += value;
        count++;
    }

    usec_t average = count ? total / count : 0;

    if(min_ptr) *min_ptr = min;
    if(max_ptr) *max_ptr = max;
    if(average_ptr) *average_ptr = average;
    if(count_ptr) *count_ptr = count;

    // remember this snapshot for the next call
    memcpy(old, current, sizeof(old));
}
// Initializes a heartbeat:
// - clears the last wake-up time
// - picks a per-heartbeat offset of 250ms plus up to another 250ms derived from the current time,
//   rounded down to a multiple of the realtime clock resolution
// - assigns the next statistics id and, if it fits in the statistics table, resets its slot
inline void heartbeat_init(heartbeat_t *hb) {
    const usec_t jitter_ut = (usec_t)(now_realtime_usec() * clock_realtime_resolution) % (250 * USEC_PER_MS);

    hb->realtime = 0ULL;
    hb->randomness = (usec_t)250 * USEC_PER_MS + jitter_ut;
    hb->randomness -= (hb->randomness % clock_realtime_resolution);

    // ids are handed out under the mutex, so that every heartbeat gets a distinct one
    netdata_mutex_lock(&heartbeat_alignment_mutex);
    hb->statistics_id = heartbeat_alignment_id++;
    netdata_mutex_unlock(&heartbeat_alignment_mutex);

    // heartbeats beyond the table size are not tracked
    if(hb->statistics_id >= HEARTBEAT_ALIGNMENT_STATISTICS_SIZE)
        return;

    struct heartbeat_thread_statistics *slot = &heartbeat_alignment_values[hb->statistics_id];
    slot->dt = 0;
    slot->sequence = 0;
}
// waits for the next heartbeat
// the wake-up time is the next multiple of tick on the realtime clock, plus hb->randomness,
// rounded up to the realtime clock resolution
// it waits with a relative sleep (sleep_usec_with_now())
// it returns the dt using the realtime clock: the microseconds elapsed since the previous
// call on this heartbeat, or zero on the first call
usec_t heartbeat_next(heartbeat_t *hb, usec_t tick) {
// the offset of the heartbeat must fit in half a tick; if not, pick a new one that does
if(unlikely(hb->randomness > tick / 2)) {
// TODO: The heartbeat tick should be specified at the heartbeat_init() function
usec_t tmp = (now_realtime_usec() * clock_realtime_resolution) % (tick / 2);
nd_log_limit_static_global_var(erl, 10, 0);
nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE,
"heartbeat randomness of %"PRIu64" is too big for a tick of %"PRIu64" - setting it to %"PRIu64"",
hb->randomness, tick, tmp);
hb->randomness = tmp;
}
usec_t dt;
usec_t now = now_realtime_usec();
// start of the next tick, shifted by the per-heartbeat offset
usec_t next = now - (now % tick) + tick + hb->randomness;
// align the next time we want to the clock resolution
if(next % clock_realtime_resolution)
next = next - (next % clock_realtime_resolution) + clock_realtime_resolution;
// sleep_usec() has a loop to guarantee we will sleep for at least the requested time.
// According the specs, when we sleep for a relative time, clock adjustments should not affect the duration
// we sleep.
sleep_usec_with_now(next - now, now);
now = now_realtime_usec();
// time since the previous wake-up; overridden below on the first call
dt = now - hb->realtime;
// account how far from the scheduled time we woke up, for heartbeat_statistics()
// NOTE(review): when now < next this subtraction presumably wraps around (usec_t looks unsigned) - confirm
if(hb->statistics_id < HEARTBEAT_ALIGNMENT_STATISTICS_SIZE) {
heartbeat_alignment_values[hb->statistics_id].dt += now - next;
heartbeat_alignment_values[hb->statistics_id].sequence++;
}
if(unlikely(now < next)) {
errno = 0;
nd_log_limit_static_global_var(erl, 10, 0);
nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE,
"heartbeat clock: woke up %"PRIu64" microseconds earlier than expected "
"(can be due to the CLOCK_REALTIME set to the past).",
next - now);
}
else if(unlikely(now - next > tick / 2)) {
errno = 0;
nd_log_limit_static_global_var(erl, 10, 0);
nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE,
"heartbeat clock: woke up %"PRIu64" microseconds later than expected "
"(can be due to system load or the CLOCK_REALTIME set to the future).",
now - next);
}
if(unlikely(!hb->realtime)) {
// the first time return zero
dt = 0;
}
// remember this wake-up, to compute dt on the next call
hb->realtime = now;
return dt;
}
// Sleeps for 'usec' microseconds, restarting the sleep when it is interrupted by a signal.
//
// usec:       the relative duration to sleep, in microseconds
// started_ut: the realtime timestamp (microseconds) at which the sleep is considered to have
//             started; when zero, the current realtime clock is used
//
// The sleep ends when nanosleep() completes, or when the realtime clock reaches
// started_ut + usec after an interruption. A restarted sleep never extends beyond that
// point. A nanosleep() failure other than an interruption with time remaining is logged
// and ends the sleep.
void sleep_usec_with_now(usec_t usec, usec_t started_ut) {
    // we expect microseconds (1.000.000 per second)
    // but timespec is nanoseconds (1.000.000.000 per second)
    struct timespec rem = { 0, 0 }, req = {
        .tv_sec = (time_t)(usec / USEC_PER_SEC),
        .tv_nsec = (long)((usec % USEC_PER_SEC) * NSEC_PER_USEC)
    };

    // make sure errno is not EINTR
    errno = 0;

    if(!started_ut)
        started_ut = now_realtime_usec();

    usec_t end_ut = started_ut + usec;

    while(nanosleep(&req, &rem) != 0) {
        if(likely(errno == EINTR && (rem.tv_sec || rem.tv_nsec))) {
            // interrupted: continue with the time nanosleep() reports as remaining
            req = rem;
            rem = (struct timespec){ 0, 0 };

            // break an infinite loop
            errno = 0;

            usec_t now_ut = now_realtime_usec();
            if(now_ut >= end_ut)
                break;

            // never sleep past the requested end time
            usec_t remaining_ut = (usec_t)req.tv_sec * USEC_PER_SEC + (usec_t)req.tv_nsec / NSEC_PER_USEC;
            usec_t until_end_ut = end_ut - now_ut;
            if(remaining_ut > until_end_ut) {
                req = (struct timespec){
                    .tv_sec = (time_t)(until_end_ut / USEC_PER_SEC),
                    .tv_nsec = (long)((until_end_ut % USEC_PER_SEC) * NSEC_PER_USEC)
                };
            }
        }
        else {
            netdata_log_error("Cannot nanosleep() for %"PRIu64" microseconds.", usec);
            break;
        }
    }
}
// Returns the uptime in milliseconds, as read from the boottime clock.
// When CLOCK_BOOTTIME_IS_AVAILABLE is not defined, an error is logged and 0 is returned.
static inline collected_number uptime_from_boottime(void) {
#ifdef CLOCK_BOOTTIME_IS_AVAILABLE
    usec_t boottime_ut = now_boottime_usec();

    return (collected_number)(boottime_ut / USEC_PER_MS);
#else
    netdata_log_error("uptime cannot be read from CLOCK_BOOTTIME on this system.");

    return 0;
#endif
}
// the procfile of the uptime file, kept open between calls of read_proc_uptime()
static procfile *read_proc_uptime_ff = NULL;

// Returns the uptime in milliseconds, parsed from the first word of the first line of 'filename'
// (the value in the file is multiplied by 1000).
// Returns 0 when the file cannot be opened or read, or when it has no lines or no words.
static inline collected_number read_proc_uptime(char *filename) {
    procfile *ff = read_proc_uptime_ff;

    // open the file on first use (or after a failed read)
    if(unlikely(!ff)) {
        ff = procfile_open(filename, " \t", PROCFILE_FLAG_DEFAULT);
        if(unlikely(!ff)) return 0;
    }

    ff = procfile_readall(ff);
    read_proc_uptime_ff = ff;
    if(unlikely(!ff)) return 0;

    if(unlikely(procfile_lines(ff) < 1)) {
        netdata_log_error("/proc/uptime has no lines.");
        return 0;
    }

    if(unlikely(procfile_linewords(ff, 0) < 1)) {
        netdata_log_error("/proc/uptime has less than 1 word in it.");
        return 0;
    }

    return (collected_number)(strtondd(procfile_lineword(ff, 0, 0), NULL) * 1000.0);
}
// Returns the system uptime in milliseconds.
//
// On the first successful call it decides which source to use and keeps that decision:
// - the boottime clock, when it is readable and within 1000 ms of the value read from 'filename'
// - otherwise 'filename' (e.g. /proc/uptime), when it is readable
// When neither source works, an error is logged and 1 is returned; the detection is
// repeated on the next call.
inline collected_number uptime_msec(char *filename){
    static int use_boottime = -1;

    if(unlikely(use_boottime == -1)) {
        collected_number uptime_boottime = uptime_from_boottime();
        collected_number uptime_proc = read_proc_uptime(filename);

        long long delta = (long long)uptime_boottime - (long long)uptime_proc;
        if(delta < 0) delta = -delta;

        if(delta <= 1000 && uptime_boottime != 0) {
            // the file is not needed any more; release it and clear the file-scope pointer,
            // so that it is never left dangling
            if(read_proc_uptime_ff) {
                procfile_close(read_proc_uptime_ff);
                read_proc_uptime_ff = NULL;
            }

            netdata_log_info("Using now_boottime_usec() for uptime (dt is %lld ms)", delta);
            use_boottime = 1;
        }
        else if(uptime_proc != 0) {
            netdata_log_info("Using /proc/uptime for uptime (dt is %lld ms)", delta);
            use_boottime = 0;
        }
        else {
            netdata_log_error("Cannot find any way to read uptime on this system.");
            return 1;
        }
    }

    collected_number uptime;
    if(use_boottime)
        uptime = uptime_from_boottime();
    else
        uptime = read_proc_uptime(filename);

    return uptime;
}