0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-16 02:24:15 +00:00
netdata_netdata/database/rrdfunctions-inflight.c
Costa Tsaousis f466b8aef5
DYNCFG: dynamically configured alerts ()
* cleanup alerts

* fix references

* fix references

* fix references

* load alerts once and apply them to each node

* simplify health_create_alarm_entry()

* Compile without warnings with compiler flags:

   -Wall -Wextra -Wformat=2 -Wshadow -Wno-format-nonliteral -Winit-self

* code re-organization and cleanup

* generate patterns when applying prototypes; give unique dyncfg names to all alerts

* eval expressions keep the source and the parsed_as as STRING pointers

* renamed host to node in dyncfg ids

* renamed host to node in dyncfg ids

* add all cloud roles to the list of parsed X-Netdata-Role header and also default to member access level

* working functionality

* code re-organization: moved health event-loop to a new file, moved health globals to health.c

* rrdcalctemplate is removed; alert_cfg is removed; foreach dimension is removed; RRDCALCs are now instantiated only when they are linked to RRDSETs

* dyncfg alert prototypes initialization for alerts

* health dyncfg split to separate file

* cleanup not-needed code

* normalize matches between parsing and json

* also detect !* for disabled alerts

* dyncfg capability disabled

* Store alert config part1

* Add rrdlabels_common_count

* wip health variables lookup without indexes

* Improve rrdlabels_common_count by reusing rrdlabels_find_label_with_key_unsafe with an additional parameter

* working variables with runtime lookup

* working variables with runtime lookup

* delete rrddimvar and rrdfamily index

* remove rrdsetvar; now all variables are in RRDVARs inside hosts and charts

* added /api/v1/variable that resolves a variable the same way alerts do

* remove rrdcalc from eval

* remove debug code

* remove duplicate assignment

* Fix memory leak

* all alert variables are now handled by alert_variable_lookup() and EVAL is now independent of alerts

* hide all internal structures of EVAL

* Enable -Wformat flag

Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>

* Adjust binding for calculation, warning, critical

* Remove unused macro

* Update config hash id

* use the right info and summary in alerts log

* use synchronous queries for alerts

* Handle cases when config_hash_id is missing from health_log

* remove deadlock from health worker

* parsing to json payload for health alert prototypes

* cleaner parsing and avoiding memory leaks in case of duplicate members in json

* fix left-over rename of function

* Keep original lookup field to send to the cloud
Cleanup / rename function to store config
Remove unused DEFINEs, functions

* Use ac->lookup

* link jobs to the host when the template is registered; do not accept running a function without a host

* full dyncfg support for health alerts, except action TEST

* working dyncfg additions, updates, removals

* fixed missing source, wrong status updates

* add alerts by type, component, classification, recipient and module at the /api/v2/alerts endpoint

* fix dyncfg unittest

* rename functions

* generalize the json-c parser macros and move them to libnetdata

* report progress when enabling and disabling dyncfg templates

* moved rrdcalc and rrdvar to health

* update alarms

* added schema for alerts; separated alert_action_options from rrdr_options; restructured the json payload for alerts

* enable parsed json alerts; allow sending back accepted but disabled

* added format_version for alerts payload; enables/disables status now is also inherited by the status of the rules; fixed variable names in json output

* remove the RRDHOST pointer from DYNCFG

* Fix command field submitted to the cloud

* do not send updates to creation requests, for DYNCFG jobs

---------

Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Co-authored-by: Tasos Katsoulas <tasos@netdata.cloud>
Co-authored-by: ilyam8 <ilya@netdata.cloud>
2024-01-23 20:20:41 +02:00

654 lines
22 KiB
C

// SPDX-License-Identifier: GPL-3.0-or-later
#define NETDATA_RRD_INTERNALS
#include "rrdcollector-internals.h"
#include "rrdfunctions-internals.h"
#include "rrdfunctions-inflight.h"
// The state of one function call that is currently in flight.
// Instances are stored as the values of the rrd_functions_inflight_requests
// dictionary, keyed by the transaction string.
struct rrd_function_inflight {
// false in the template passed to dictionary_set(); set to true by rrd_function_run()
// once the entry is claimed. Finding it already true means a duplicate transaction.
bool used;
// the host the function is executed on
RRDHOST *host;
// the transaction id in binary form (parsed from the caller, or randomly generated)
uuid_t transaction_uuid;
// heap copy of the compact, lower-case string form of the transaction id
const char *transaction;
// heap copy of the command exactly as the caller supplied it
const char *cmd;
// heap copy of the sanitized command; this is what is handed to the function as its name
const char *sanitized_cmd;
// heap copy of the sanitized source of the request
const char *source;
// the length returned by rrd_functions_sanitize() for sanitized_cmd
size_t sanitized_cmd_length;
// timeout in seconds (the caller's value, or the function's own when the caller gave <= 0)
int timeout;
// cancellation flag; read and written with __atomic builtins
bool cancelled;
// monotonic clock deadline, in microseconds
usec_t stop_monotonic_ut;
// a private duplicate (buffer_dup) of the caller's payload
BUFFER *payload;
// the acquired item of host->functions for this function;
// released by the delete callback of the inflight dictionary
const DICTIONARY_ITEM *host_function_acquired;
// the collector
// we acquire this structure at the beginning,
// and we release it at the end
struct rrd_host_function *rdcf;
// where and how the result is delivered
struct {
// the caller supplied buffer to receive the response
BUFFER *wb;
// in async mode,
// the function to call to send the result back
rrd_function_result_callback_t cb;
void *data;
} result;
struct {
// to be called in sync mode
// while the function is running
// to check if the function has been canceled
rrd_function_is_cancelled_cb_t cb;
void *data;
} is_cancelled;
struct {
// to be registered by the function itself
// used to signal the function to cancel
rrd_function_cancel_cb_t cb;
void *data;
} canceller;
struct {
// callback to receive progress reports from function
rrd_function_progress_cb_t cb;
void *data;
} progress;
struct {
// to be registered by the function itself
// used to send progress requests to function
rrd_function_progresser_cb_t cb;
void *data;
} progresser;
};
// All function calls currently in flight, keyed by the transaction string.
// Created by rrd_functions_inflight_init(), destroyed by rrd_functions_inflight_destroy().
static DICTIONARY *rrd_functions_inflight_requests = NULL;
// forward declaration: used by rrd_call_function_async_and_wait(), defined near the end of the file
static void rrd_function_cancel_inflight(struct rrd_function_inflight *r);
// ----------------------------------------------------------------------------
// Releases every heap allocation owned by an inflight request: the payload buffer
// and the transaction, cmd, sanitized_cmd and source strings.
// All freed pointers are reset to NULL, so that no stale pointer remains in the
// structure after the call. The structure itself is not freed.
static void rrd_functions_inflight_cleanup(struct rrd_function_inflight *r) {
    buffer_free(r->payload);
    freez((void *)r->transaction);
    freez((void *)r->cmd);
    freez((void *)r->sanitized_cmd);
    freez((void *)r->source);

    r->payload = NULL;
    r->transaction = NULL;
    r->cmd = NULL;
    r->sanitized_cmd = NULL;
    r->source = NULL; // previously left dangling after being freed
}
// Delete callback of the inflight dictionary: frees what the request owns and
// gives back the function item it acquired from its host's functions dictionary.
static void rrd_functions_inflight_delete_cb(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) {
    struct rrd_function_inflight *inflight = value;

    rrd_functions_inflight_cleanup(inflight);

    dictionary_acquired_item_release(
        inflight->host->functions,
        inflight->host_function_acquired);
}
// Creates the inflight requests dictionary and registers its delete callback.
// Does nothing when the dictionary already exists.
void rrd_functions_inflight_init(void) {
    if(!rrd_functions_inflight_requests) {
        rrd_functions_inflight_requests = dictionary_create_advanced(
            DICT_OPTION_DONT_OVERWRITE_VALUE | DICT_OPTION_FIXED_SIZE,
            NULL,
            sizeof(struct rrd_function_inflight));

        dictionary_register_delete_callback(
            rrd_functions_inflight_requests,
            rrd_functions_inflight_delete_cb,
            NULL);
    }
}
// Destroys the inflight requests dictionary, if there is one, and forgets it.
void rrd_functions_inflight_destroy(void) {
    if(rrd_functions_inflight_requests) {
        dictionary_destroy(rrd_functions_inflight_requests);
        rrd_functions_inflight_requests = NULL;
    }
}
// Handed to the function being executed, so that it can register the callback
// (and its data) to be used for signalling it to cancel.
// register_canceller_cb_data is the inflight request the canceller is stored in.
static void rrd_inflight_async_function_register_canceller_cb(void *register_canceller_cb_data, rrd_function_cancel_cb_t canceller_cb, void *canceller_cb_data) {
    struct rrd_function_inflight *inflight = register_canceller_cb_data;

    inflight->canceller.cb = canceller_cb;
    inflight->canceller.data = canceller_cb_data;
}
// Handed to the function being executed, so that it can register the callback
// (and its data) to be used for sending progress requests to it.
// register_progresser_cb_data is the inflight request the progresser is stored in.
static void rrd_inflight_async_function_register_progresser_cb(void *register_progresser_cb_data, rrd_function_progresser_cb_t progresser_cb, void *progresser_cb_data) {
    struct rrd_function_inflight *inflight = register_progresser_cb_data;

    inflight->progresser.cb = progresser_cb;
    inflight->progresser.data = progresser_cb_data;
}
// ----------------------------------------------------------------------------
// waiting for async function completion
// Shared between the thread waiting in rrd_call_function_async_and_wait()
// and the result callback rrd_async_function_signal_when_ready().
struct rrd_function_call_wait {
// copied from the inflight request
RRDHOST *host;
// copied from the inflight request
const DICTIONARY_ITEM *host_function_acquired;
// private heap copy of the transaction string; freed by rrd_function_call_wait_free()
char *transaction;
// when true, the waiter has given up and the result callback must free this structure
bool free_with_signal;
// set to true by the result callback when the response has arrived
bool data_are_ready;
// locked by both sides while they access the fields of this structure
netdata_mutex_t mutex;
// signalled by the result callback when the response has arrived
pthread_cond_t cond;
// the response code reported by the result callback
int code;
};
// Removes a transaction from the inflight requests dictionary
// and then asks the dictionary to garbage collect.
static void rrd_inflight_function_cleanup(RRDHOST *host __maybe_unused, const char *transaction) {
    DICTIONARY *inflight = rrd_functions_inflight_requests;

    dictionary_del(inflight, transaction);
    dictionary_garbage_collect(inflight);
}
// Ends the life of a wait structure: the transaction is removed from the inflight
// requests first (while its name is still allocated), then the name, the
// synchronization primitives and the structure itself are released.
static void rrd_function_call_wait_free(struct rrd_function_call_wait *tmp) {
    char *transaction = tmp->transaction;

    rrd_inflight_function_cleanup(tmp->host, transaction);
    freez(transaction);

    pthread_cond_destroy(&tmp->cond);
    netdata_mutex_destroy(&tmp->mutex);

    freez(tmp);
}
// Result callback used when a thread waits for an async function.
// It records the response code, marks the data as ready and wakes up the waiter.
// When the waiter has already given up (free_with_signal is set), nobody else
// will release the temporary buffer and the wait structure, so they are freed here.
static void rrd_async_function_signal_when_ready(BUFFER *temp_wb __maybe_unused, int code, void *callback_data) {
    struct rrd_function_call_wait *tmp = callback_data;

    netdata_mutex_lock(&tmp->mutex);

    // holding the mutex means the waiting thread is either
    // inside pthread_cond_timedwait(), or it gave up and left

    tmp->code = code;
    tmp->data_are_ready = true;

    // decide under the lock, act after releasing it
    const bool waiter_has_left = tmp->free_with_signal ? true : false;

    pthread_cond_signal(&tmp->cond);
    netdata_mutex_unlock(&tmp->mutex);

    if(!waiter_has_left)
        return;

    buffer_free(temp_wb);
    rrd_function_call_wait_free(tmp);
}
// Result callback used when nobody waits for the async function:
// forwards the result to the caller's callback (when one was given)
// and then removes the transaction from the inflight requests.
static void rrd_inflight_async_function_nowait_finished(BUFFER *wb, int code, void *data) {
    struct rrd_function_inflight *inflight = data;

    if(inflight->result.cb != NULL)
        inflight->result.cb(wb, code, inflight->result.data);

    rrd_inflight_function_cleanup(inflight->host, inflight->transaction);
}
// Given to async functions so they can check if their request has been cancelled.
// data is the inflight request; its cancelled flag is read atomically.
static bool rrd_inflight_async_function_is_cancelled(void *data) {
    struct rrd_function_inflight *inflight = data;
    bool is_cancelled = __atomic_load_n(&inflight->cancelled, __ATOMIC_RELAXED);
    return is_cancelled;
}
// Starts an async function and returns immediately with whatever the function's
// execute callback returned. The result is delivered later, through
// rrd_inflight_async_function_nowait_finished(), straight into the caller's buffer.
static inline int rrd_call_function_async_and_dont_wait(struct rrd_function_inflight *r) {
    struct rrd_function_execute rfe = {
        .transaction = &r->transaction_uuid,
        .function = r->sanitized_cmd,
        .payload = r->payload,
        .source = r->source,
        .stop_monotonic_ut = &r->stop_monotonic_ut,

        // the function writes directly to the caller's buffer
        .result = {
            .wb = r->result.wb,
            .cb = rrd_inflight_async_function_nowait_finished,
            .data = r,
        },

        // progress reports go directly to the caller
        .progress = {
            .cb = r->progress.cb,
            .data = r->progress.data,
        },

        // these three go through the inflight request
        .is_cancelled = {
            .cb = rrd_inflight_async_function_is_cancelled,
            .data = r,
        },
        .register_canceller = {
            .cb = rrd_inflight_async_function_register_canceller_cb,
            .data = r,
        },
        .register_progresser = {
            .cb = rrd_inflight_async_function_register_progresser_cb,
            .data = r,
        },
    };

    return r->rdcf->execute_cb(&rfe, r->rdcf->execute_cb_data);
}
// Starts an async function and blocks the calling thread until one of:
//  - the function responds (the response is copied to r->result.wb),
//  - the caller's is_cancelled callback reports the request as cancelled,
//  - the deadline r->stop_monotonic_ut + RRDFUNCTIONS_TIMEOUT_EXTENSION_UT passes.
//
// The function is given a private temporary buffer, not the caller's one, because
// this thread may give up while the function is still running.
//
// Ownership of the wait structure and of the temporary buffer:
//  - when the response is received here, this function frees both;
//  - in every other case free_with_signal is set, and they are freed by
//    rrd_async_function_signal_when_ready() when the function calls it.
//
// Returns the response code: the one reported by the function when a response
// was received, the result of rrd_call_function_error() on cancellation, timeout
// or wait failure, or the return value of the execute callback when that was
// not HTTP_RESP_OK and no data were ready.
static int rrd_call_function_async_and_wait(struct rrd_function_inflight *r) {
    struct rrd_function_call_wait *tmp = mallocz(sizeof(struct rrd_function_call_wait));
    tmp->free_with_signal = false;
    tmp->data_are_ready = false;
    tmp->host = r->host;
    tmp->host_function_acquired = r->host_function_acquired;
    tmp->transaction = strdupz(r->transaction);
    netdata_mutex_init(&tmp->mutex);
    pthread_cond_init(&tmp->cond, NULL);

    // we need a temporary BUFFER, because we may time out and the caller supplied one may vanish,
    // so we create a new one we guarantee will survive until the collector finishes...

    bool we_should_free = false;
    BUFFER *temp_wb = buffer_create(1024, &netdata_buffers_statistics.buffers_functions); // we need it because we may give up on it
    temp_wb->content_type = r->result.wb->content_type;

    struct rrd_function_execute rfe = {
        .transaction = &r->transaction_uuid,
        .function = r->sanitized_cmd,
        .payload = r->payload,
        .source = r->source,
        .stop_monotonic_ut = &r->stop_monotonic_ut,
        .result = {
            .wb = temp_wb,

            // we overwrite the result callbacks,
            // so that we can clean up the allocations made
            .cb = rrd_async_function_signal_when_ready,
            .data = tmp,
        },
        .progress = {
            .cb = r->progress.cb,
            .data = r->progress.data,
        },
        .is_cancelled = {
            .cb = rrd_inflight_async_function_is_cancelled,
            .data = r,
        },
        .register_canceller = {
            .cb = rrd_inflight_async_function_register_canceller_cb,
            .data = r,
        },
        .register_progresser = {
            .cb = rrd_inflight_async_function_register_progresser_cb,
            .data = r,
        },
    };
    int code = r->rdcf->execute_cb(&rfe, r->rdcf->execute_cb_data);

    // this has to happen after we execute the callback
    // because if an async call is responded in sync mode, there will be a deadlock.
    netdata_mutex_lock(&tmp->mutex);

    if (code == HTTP_RESP_OK || tmp->data_are_ready) {
        bool cancelled = false;
        int rc = 0;
        while (rc == 0 && !cancelled && !tmp->data_are_ready) {
            usec_t now_mono_ut = now_monotonic_usec();
            usec_t stop_mono_ut = __atomic_load_n(&r->stop_monotonic_ut, __ATOMIC_RELAXED) + RRDFUNCTIONS_TIMEOUT_EXTENSION_UT;
            if(now_mono_ut > stop_mono_ut) {
                rc = ETIMEDOUT;
                break;
            }

            // wait for 10ms, and loop again...
            struct timespec tp;
            clock_gettime(CLOCK_REALTIME, &tp);
            tp.tv_nsec += 10 * NSEC_PER_MSEC;

            // tv_nsec must stay below one second: a value of exactly NSEC_PER_SEC
            // is invalid too and makes pthread_cond_timedwait() fail with EINVAL,
            // so normalize with >= (not >)
            if(tp.tv_nsec >= (long)(1 * NSEC_PER_SEC)) {
                tp.tv_sec++;
                tp.tv_nsec -= 1 * NSEC_PER_SEC;
            }

            // the mutex is unlocked within pthread_cond_timedwait()
            rc = pthread_cond_timedwait(&tmp->cond, &tmp->mutex, &tp);
            // the mutex is again ours

            if(rc == ETIMEDOUT) {
                // 10ms have passed

                rc = 0;
                if (!tmp->data_are_ready && r->is_cancelled.cb &&
                    r->is_cancelled.cb(r->is_cancelled.data)) {
                    // internal_error(true, "FUNCTIONS: transaction '%s' is cancelled while waiting for response",
                    //                r->transaction);
                    cancelled = true;
                    rrd_function_cancel_inflight(r);
                    break;
                }
            }
        }

        if (tmp->data_are_ready) {
            // we have a response

            buffer_contents_replace(r->result.wb, buffer_tostring(temp_wb), buffer_strlen(temp_wb));
            r->result.wb->content_type = temp_wb->content_type;
            r->result.wb->expires = temp_wb->expires;

            if(r->result.wb->expires)
                buffer_cacheable(r->result.wb);
            else
                buffer_no_cacheable(r->result.wb);

            code = tmp->code;

            tmp->free_with_signal = false;
            we_should_free = true;
        }
        else if (rc == ETIMEDOUT || cancelled) {
            // timeout
            // we will go away and let the callback free the structure

            if(cancelled)
                code = rrd_call_function_error(r->result.wb,
                                               "Request cancelled",
                                               HTTP_RESP_CLIENT_CLOSED_REQUEST);
            else
                code = rrd_call_function_error(r->result.wb,
                                               "Timeout while waiting for a response from the collector.",
                                               HTTP_RESP_GATEWAY_TIMEOUT);

            tmp->free_with_signal = true;
            we_should_free = false;
        }
        else {
            // pthread_cond_timedwait() failed with something other than a timeout
            code = rrd_call_function_error(
                r->result.wb, "Internal error while communicating with the collector",
                HTTP_RESP_INTERNAL_SERVER_ERROR);

            tmp->free_with_signal = true;
            we_should_free = false;
        }
    }
    else {
        // the response is not ok, and we don't have the data
        tmp->free_with_signal = true;
        we_should_free = false;
    }

    netdata_mutex_unlock(&tmp->mutex);

    if (we_should_free) {
        rrd_function_call_wait_free(tmp);
        buffer_free(temp_wb);
    }

    return code;
}
// Runs an async function, either blocking until its result is available (wait)
// or returning immediately and letting the result callback deliver it (!wait).
static inline int rrd_call_function_async(struct rrd_function_inflight *r, bool wait) {
    return wait
        ? rrd_call_function_async_and_wait(r)
        : rrd_call_function_async_and_dont_wait(r);
}
// ----------------------------------------------------------------------------
// Runs a function of a host.
//
//  host             the host to run the function on; NULL is rejected with
//                   HTTP_RESP_INTERNAL_SERVER_ERROR
//  result_wb        the buffer that receives the response, or the error message
//  timeout_s        timeout in seconds; when <= 0 the function's own timeout is used
//  access           the access level of the caller, checked against the function's
//  cmd              the function to run; it is sanitized before it is looked up
//  wait             for async functions: block until the result is available
//  transaction      optional transaction UUID; when it is missing, empty or cannot
//                   be parsed, a random one is generated
//  result_cb        optional callback to receive the result; it is also called
//                   for the errors detected in this function
//  progress_cb      optional callback handed to the function for progress reports
//  is_cancelled_cb  optional callback to check if the caller cancelled the request
//  payload          optional payload; the function receives a private duplicate
//  source           optional source of the request; it is sanitized and copied
//
// Returns an HTTP response code.
int rrd_function_run(RRDHOST *host, BUFFER *result_wb, int timeout_s, HTTP_ACCESS access, const char *cmd,
                     bool wait, const char *transaction,
                     rrd_function_result_callback_t result_cb, void *result_cb_data,
                     rrd_function_progress_cb_t progress_cb, void *progress_cb_data,
                     rrd_function_is_cancelled_cb_t is_cancelled_cb, void *is_cancelled_cb_data,
                     BUFFER *payload, const char *source) {

    int code;
    char sanitized_cmd[PLUGINSD_LINE_MAX + 1];
    const DICTIONARY_ITEM *host_function_acquired = NULL;

    char sanitized_source[(source ? strlen(source) : 0) + 1];
    rrd_functions_sanitize(sanitized_source, source ? source : "", sizeof(sanitized_source));

    // ------------------------------------------------------------------------
    // check for the host

    if(!host) {
        code = HTTP_RESP_INTERNAL_SERVER_ERROR;

        rrd_call_function_error(result_wb, "no host given for running the function", code);

        if(result_cb)
            result_cb(result_wb, code, result_cb_data);

        return code;
    }

    // ------------------------------------------------------------------------
    // find the function

    size_t sanitized_cmd_length = rrd_functions_sanitize(sanitized_cmd, cmd, sizeof(sanitized_cmd));

    code = rrd_functions_find_by_name(host, result_wb, sanitized_cmd, sanitized_cmd_length, &host_function_acquired);
    if(code != HTTP_RESP_OK) {
        rrd_call_function_error(result_wb, "not found", code);

        if(result_cb)
            result_cb(result_wb, code, result_cb_data);

        return code;
    }

    struct rrd_host_function *rdcf = dictionary_acquired_item_value(host_function_acquired);

    if(!web_client_has_enough_access_level(access, rdcf->access)) {

        if(!aclk_connected)
            rrd_call_function_error(result_wb, "This Netdata must be connected to Netdata Cloud to access this function.", HTTP_RESP_PRECOND_FAIL);
        else if(access >= HTTP_ACCESS_ANY)
            rrd_call_function_error(result_wb, "You need to login to the Netdata Cloud space this agent is claimed to, to access this function.", HTTP_RESP_PRECOND_FAIL);
        else /* if(access < HTTP_ACCESS_ANY && rdcf->access < access) */
            rrd_call_function_error(result_wb, "To access this function you need to be an admin in this Netdata Cloud space.", HTTP_RESP_PRECOND_FAIL);

        dictionary_acquired_item_release(host->functions, host_function_acquired);

        if(result_cb)
            result_cb(result_wb, HTTP_RESP_PRECOND_FAIL, result_cb_data);

        return HTTP_RESP_PRECOND_FAIL;
    }

    if(timeout_s <= 0)
        timeout_s = rdcf->timeout;

    // ------------------------------------------------------------------------
    // validate and parse the transaction, or generate a new transaction id

    char uuid_str[UUID_COMPACT_STR_LEN];
    uuid_t uuid;

    if(!transaction || !*transaction || uuid_parse_flexi(transaction, uuid) != 0)
        uuid_generate_random(uuid);

    // from now on, the transaction is always the compact lower-case form of the uuid
    uuid_unparse_lower_compact(uuid, uuid_str);
    transaction = uuid_str;

    // ------------------------------------------------------------------------
    // put the function into the inflight requests

    struct rrd_function_inflight t = {
        .used = false,
        .host = host,
        .cmd = strdupz(cmd),
        .sanitized_cmd = strdupz(sanitized_cmd),
        .sanitized_cmd_length = sanitized_cmd_length,
        .transaction = strdupz(transaction),
        .source = strdupz(sanitized_source),
        .payload = buffer_dup(payload),
        .timeout = timeout_s,
        .cancelled = false,
        .stop_monotonic_ut = now_monotonic_usec() + timeout_s * USEC_PER_SEC,
        .host_function_acquired = host_function_acquired,
        .rdcf = rdcf,
        .result = {
            .wb = result_wb,
            .cb = result_cb,
            .data = result_cb_data,
        },
        .is_cancelled = {
            .cb = is_cancelled_cb,
            .data = is_cancelled_cb_data,
        },
        .progress = {
            .cb = progress_cb,
            .data = progress_cb_data,
        },
    };
    uuid_copy(t.transaction_uuid, uuid);

    // the dictionary does not overwrite values, so when the transaction already
    // exists we get back the existing entry, which has 'used' set
    struct rrd_function_inflight *r = dictionary_set(rrd_functions_inflight_requests, transaction, &t, sizeof(t));
    if(r->used) {
        nd_log(NDLS_DAEMON, NDLP_NOTICE,
               "FUNCTIONS: duplicate transaction '%s', function: '%s'",
               t.transaction, t.cmd);

        code = rrd_call_function_error(result_wb, "duplicate transaction", HTTP_RESP_BAD_REQUEST);

        rrd_functions_inflight_cleanup(&t);

        // the item was acquired from the functions of OUR host;
        // r is the pre-existing request, which may belong to another host
        dictionary_acquired_item_release(host->functions, t.host_function_acquired);

        if(result_cb)
            result_cb(result_wb, code, result_cb_data);

        return code;
    }
    r->used = true;
    // internal_error(true, "FUNCTIONS: transaction '%s' started", r->transaction);

    if(r->rdcf->sync) {
        // the caller has to wait

        struct rrd_function_execute rfe = {
            .transaction = &r->transaction_uuid,
            .function = r->sanitized_cmd,
            .payload = r->payload,
            .source = r->source,
            .stop_monotonic_ut = &r->stop_monotonic_ut,
            .result = {
                .wb = r->result.wb,

                // sync functions get the caller's result callback as is;
                // the inflight request is cleaned up right after they return
                .cb = r->result.cb,
                .data = r->result.data,
            },
            .progress = {
                .cb = r->progress.cb,
                .data = r->progress.data,
            },
            .is_cancelled = {
                .cb = r->is_cancelled.cb,
                .data = r->is_cancelled.data,
            },
            .register_canceller = {
                .cb = NULL,
                .data = NULL,
            },
            .register_progresser = {
                .cb = NULL,
                .data = NULL,
            },
        };
        code = r->rdcf->execute_cb(&rfe, r->rdcf->execute_cb_data);

        rrd_inflight_function_cleanup(host, r->transaction);
        return code;
    }

    return rrd_call_function_async(r, wait);
}
// Returns true when the given transaction is in flight and the result callback
// its caller supplied is cb; false when it is not in flight, or the callback differs.
bool rrd_function_has_this_original_result_callback(uuid_t *transaction, rrd_function_result_callback_t cb) {
    char key[UUID_COMPACT_STR_LEN];
    uuid_unparse_lower_compact(*transaction, key);

    const DICTIONARY_ITEM *acquired = dictionary_get_and_acquire_item(rrd_functions_inflight_requests, key);
    if(!acquired)
        return false;

    struct rrd_function_inflight *inflight = dictionary_acquired_item_value(acquired);
    bool same_callback = (inflight->result.cb == cb) ? true : false;

    dictionary_acquired_item_release(rrd_functions_inflight_requests, acquired);
    return same_callback;
}
// Marks an inflight request as cancelled and, when its collector is still
// running, calls the canceller the function has registered (if any).
// A request is cancelled only once: further calls just log and return.
// r may be NULL, in which case nothing happens.
static void rrd_function_cancel_inflight(struct rrd_function_inflight *r) {
    if(!r)
        return;

    // set the flag and learn its previous value in a single atomic step,
    // so that of many concurrent cancel requests only one proceeds
    bool cancelled = __atomic_exchange_n(&r->cancelled, true, __ATOMIC_RELAXED);
    if(cancelled) {
        nd_log(NDLS_DAEMON, NDLP_DEBUG,
               "FUNCTIONS: received a CANCEL request for transaction '%s', but it is already cancelled.",
               r->transaction);
        return;
    }

    if(!rrd_collector_dispatcher_acquire(r->rdcf->collector)) {
        nd_log(NDLS_DAEMON, NDLP_DEBUG,
               "FUNCTIONS: received a CANCEL request for transaction '%s', but the collector is not running.",
               r->transaction);
        return;
    }

    if(r->canceller.cb)
        r->canceller.cb(r->canceller.data);

    rrd_collector_dispatcher_release(r->rdcf->collector);
}
// Cancels the inflight request with the given transaction string.
// When there is no such request, the event is logged and nothing else happens.
void rrd_function_cancel(const char *transaction) {
    const DICTIONARY_ITEM *acquired = dictionary_get_and_acquire_item(rrd_functions_inflight_requests, transaction);

    if(acquired) {
        rrd_function_cancel_inflight(dictionary_acquired_item_value(acquired));
        dictionary_acquired_item_release(rrd_functions_inflight_requests, acquired);
    }
    else {
        nd_log(NDLS_DAEMON, NDLP_DEBUG,
               "FUNCTIONS: received a CANCEL request for transaction '%s', but the transaction is not running.",
               transaction);
    }
}
// Handles a progress request for the inflight request with the given transaction
// string: when its collector is running, the stop time of the request is updated
// and the progresser the function has registered (if any) is called.
// A missing transaction, or a collector that is not running, is only logged.
void rrd_function_progress(const char *transaction) {
    const DICTIONARY_ITEM *acquired = dictionary_get_and_acquire_item(rrd_functions_inflight_requests, transaction);
    if(!acquired) {
        nd_log(NDLS_DAEMON, NDLP_DEBUG,
               "FUNCTIONS: received a PROGRESS request for transaction '%s', but the transaction is not running.",
               transaction);
        return;
    }

    struct rrd_function_inflight *inflight = dictionary_acquired_item_value(acquired);

    if(rrd_collector_dispatcher_acquire(inflight->rdcf->collector)) {
        functions_stop_monotonic_update_on_progress(&inflight->stop_monotonic_ut);

        if(inflight->progresser.cb)
            inflight->progresser.cb(inflight->progresser.data);

        rrd_collector_dispatcher_release(inflight->rdcf->collector);
    }
    else {
        nd_log(NDLS_DAEMON, NDLP_DEBUG,
               "FUNCTIONS: received a PROGRESS request for transaction '%s', but the collector is not running.",
               transaction);
    }

    // the acquired item is given back on both paths
    dictionary_acquired_item_release(rrd_functions_inflight_requests, acquired);
}
// Same as rrd_function_progress(), for callers that have the transaction
// as a binary UUID: it is converted to its compact lower-case string form first.
void rrd_function_call_progresser(uuid_t *transaction) {
    char transaction_str[UUID_COMPACT_STR_LEN];

    uuid_unparse_lower_compact(*transaction, transaction_str);
    rrd_function_progress(transaction_str);
}