mirror of
https://github.com/netdata/netdata.git
synced 2025-05-13 05:02:03 +00:00

* cleanup of logging - wip
* first working iteration
* add errno annotator
* replace old logging functions with netdata_logger()
* cleanup
* update error_limit
* fix remaining error_limit references
* work on fatal()
* started working on structured logs
* full cleanup
* default logging to files; fix all plugins initialization
* fix formatting of numbers
* cleanup and reorg
* fix coverity issues
* cleanup obsolete code
* fix formatting of numbers
* fix log rotation
* fix for older systems
* add detection of systemd journal via stderr
* finished on access.log
* remove left-over transport
* do not add empty fields to the logs
* journal get compact uuids; X-Transaction-ID header is added in web responses
* allow compiling on systems without memfd sealing
* added libnetdata/uuid directory
* move datetime formatters to libnetdata
* add missing files
* link the makefiles in libnetdata
* added uuid_parse_flexi() to parse UUIDs with and without hyphens; the web server now read X-Transaction-ID and uses it for functions and web responses
* added stream receiver, sender, proc plugin and pluginsd log stack
* iso8601 advanced usage; line_splitter module in libnetdata; code cleanup
* add message ids to streaming inbound and outbound connections
* cleanup line_splitter between lines to avoid logging garbage; when killing children, kill them with SIGABRT if internal checks is enabled
* send SIGABRT to external plugins only if we are not shutting down
* fix cross cleanup in pluginsd parser
* fatal when there is a stack error in logs
* compile netdata with -fexceptions
* do not kill external plugins with SIGABRT
* metasync info logs to debug level
* added severity to logs
* added json output; added options per log output; added documentation; fixed issues mentioned
* allow memfd only on linux
* moved journal low level functions to journal.c/h
* move health logs to daemon.log with proper priorities
* fixed a couple of bugs; health log in journal
* updated docs
* systemd-cat-native command to push structured logs to journal from the command line
* fix makefiles
* restored NETDATA_LOG_SEVERITY_LEVEL
* fix makefiles
* systemd-cat-native can also work as the logger of Netdata scripts
* do not require a socket to systemd-journal to log-as-netdata
* alarm notify logs in native format
* properly compare log ids
* fatals log alerts; alarm-notify.sh working
* fix overflow warning
* alarm-notify.sh now logs the request (command line)
* annotate external plugins logs with the function cmd they run
* added context, component and type to alarm-notify.sh; shell sanitization removes control character and characters that may be expanded by bash
* reformatted alarm-notify logs
* unify cgroup-network-helper.sh
* added quotes around params
* charts.d.plugin switched logging to journal native
* quotes for logfmt
* unify the status codes of streaming receivers and senders
* alarm-notify: dont log anything, if there is nothing to do
* all external plugins log to stderr when running outside netdata; alarm-notify now shows an error when notification methods are needed but are not available
* migrate cgroup-name.sh to new logging
* systemd-cat-native now supports messages with newlines
* socket.c logs use priority
* cleanup log field types
* inherit the systemd set INVOCATION_ID if found
* allow systemd-cat-native to send messages to a systemd-journal-remote URL
* log2journal command that can convert structured logs to journal export format
* various fixes and documentation of log2journal
* updated log2journal docs
* updated log2journal docs
* updated documentation of fields
* allow compiling without libcurl
* do not use socket as format string
* added version information to newly added tools
* updated documentation and help messages
* fix the namespace socket path
* print errno with error
* do not timeout
* updated docs
* updated docs
* updated docs
* log2journal updated docs and params
* when talking to a remote journal, systemd-cat-native batches the messages
* enable lz4 compression for systemd-cat-native when sending messages to a systemd-journal-remote
* Revert "enable lz4 compression for systemd-cat-native when sending messages to a systemd-journal-remote"
This reverts commit b079d53c11.
* note about uncompressed traffic
* log2journal: code reorg and cleanup to make modular
* finished rewriting log2journal
* more comments
* rewriting rules support
* increased limits
* updated docs
* updated docs
* fix old log call
* use journal only when stderr is connected to journal
* update netdata.spec for libcurl, libpcre2 and log2journal
* pcre2-devel
* do not require pcre2 in centos < 8, amazonlinux < 2023, open suse
* log2journal only on systems pcre2 is available
* ignore log2journal in .gitignore
* avoid log2journal on centos 7, amazonlinux 2 and opensuse
* add pcre2-8 to static build
* undo last commit
* Bundle to static
Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
* Add build deps for deb packages
Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
* Add dependencies; build from source
Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
* Test build for amazon linux and centos expect to fail for suse
Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
* fix minor oversight
Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
* Reorg code
* Add the install from source (deps) as a TODO
* Not enable the build on suse ecosystem
Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
---------
Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
Co-authored-by: Tasos Katsoulas <tasos@netdata.cloud>
346 lines
12 KiB
C
346 lines
12 KiB
C
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
#include "internal.h"
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// visualizing flags
|
|
|
|
struct rrdcontext_reason rrdcontext_reasons[] = {
|
|
// context related
|
|
{RRD_FLAG_UPDATE_REASON_TRIGGERED, "triggered transition", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_NEW_OBJECT, "object created", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_UPDATED_OBJECT, "object updated", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_LOAD_SQL, "loaded from sql", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_CHANGED_METADATA, "changed metadata", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_ZERO_RETENTION, "has no retention", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_CHANGED_FIRST_TIME_T, "updated first_time_t", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_CHANGED_LAST_TIME_T, "updated last_time_t", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_STOPPED_BEING_COLLECTED, "stopped collected", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_STARTED_BEING_COLLECTED, "started collected", 5 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_UNUSED, "unused", 5 * USEC_PER_SEC },
|
|
|
|
// not context related
|
|
{RRD_FLAG_UPDATE_REASON_CHANGED_LINKING, "changed rrd link", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD, "child disconnected", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_DB_ROTATION, "db rotation", 65 * USEC_PER_SEC },
|
|
{RRD_FLAG_UPDATE_REASON_UPDATE_RETENTION, "updated retention", 65 * USEC_PER_SEC },
|
|
|
|
// terminator
|
|
{0, NULL, 0 },
|
|
};
|
|
|
|
// Append to the JSON array currently open in `wb` one string item for every
// update reason whose bit is set in `flags`.
//
// flags - bitmask tested against the .flag member of each rrdcontext_reasons[] entry
// wb    - destination buffer, passed to buffer_json_add_array_item_string()
//
// The table is walked until its terminator entry (the one with a NULL name).
void rrd_reasons_to_buffer_json_array_items(RRD_FLAGS flags, BUFFER *wb) {
    for(const struct rrdcontext_reason *reason = rrdcontext_reasons; reason->name; reason++) {
        if(flags & reason->flag)
            buffer_json_add_array_item_string(wb, reason->name);
    }
}
|
|
// ----------------------------------------------------------------------------
|
|
// public API
|
|
|
|
// Public hook for an updated dimension: hands `rd` over to
// rrdmetric_from_rrddim().
void rrdcontext_updated_rrddim(RRDDIM *rd) {
    rrdmetric_from_rrddim(rd);
}
|
|
|
|
// Public hook for a removed dimension: hands `rd` over to
// rrdmetric_rrddim_is_freed().
void rrdcontext_removed_rrddim(RRDDIM *rd) {
    rrdmetric_rrddim_is_freed(rd);
}
|
|
|
|
// Public hook for a dimension whose algorithm changed. It is handled by
// the same routine as a flags update: rrdmetric_updated_rrddim_flags().
void rrdcontext_updated_rrddim_algorithm(RRDDIM *rd) {
    rrdmetric_updated_rrddim_flags(rd);
}
|
|
|
|
// Public hook for a dimension whose multiplier changed. It is handled by
// the same routine as a flags update: rrdmetric_updated_rrddim_flags().
void rrdcontext_updated_rrddim_multiplier(RRDDIM *rd) {
    rrdmetric_updated_rrddim_flags(rd);
}
|
|
|
|
// Public hook for a dimension whose divisor changed. It is handled by
// the same routine as a flags update: rrdmetric_updated_rrddim_flags().
void rrdcontext_updated_rrddim_divisor(RRDDIM *rd) {
    rrdmetric_updated_rrddim_flags(rd);
}
|
|
|
|
// Public hook for a dimension whose flags changed: hands `rd` over to
// rrdmetric_updated_rrddim_flags().
void rrdcontext_updated_rrddim_flags(RRDDIM *rd) {
    rrdmetric_updated_rrddim_flags(rd);
}
|
|
|
|
// Public hook for a collected dimension: hands `rd` over to
// rrdmetric_collected_rrddim().
void rrdcontext_collected_rrddim(RRDDIM *rd) {
    rrdmetric_collected_rrddim(rd);
}
|
|
|
|
// Public hook for an updated chart: hands `st` over to
// rrdinstance_from_rrdset().
void rrdcontext_updated_rrdset(RRDSET *st) {
    rrdinstance_from_rrdset(st);
}
|
|
|
|
// Public hook for a removed chart: hands `st` over to
// rrdinstance_rrdset_is_freed().
void rrdcontext_removed_rrdset(RRDSET *st) {
    rrdinstance_rrdset_is_freed(st);
}
|
|
|
|
// Public hook for a chart whose retention was updated: hands `st` over to
// rrdinstance_rrdset_has_updated_retention().
void rrdcontext_updated_retention_rrdset(RRDSET *st) {
    rrdinstance_rrdset_has_updated_retention(st);
}
|
|
|
|
// Public hook for a chart whose name changed: hands `st` over to
// rrdinstance_updated_rrdset_name().
void rrdcontext_updated_rrdset_name(RRDSET *st) {
    rrdinstance_updated_rrdset_name(st);
}
|
|
|
|
// Public hook for a chart whose flags changed: hands `st` over to
// rrdinstance_updated_rrdset_flags().
void rrdcontext_updated_rrdset_flags(RRDSET *st) {
    rrdinstance_updated_rrdset_flags(st);
}
|
|
|
|
// Public hook for a collected chart: hands `st` over to
// rrdinstance_collected_rrdset().
void rrdcontext_collected_rrdset(RRDSET *st) {
    rrdinstance_collected_rrdset(st);
}
|
|
|
|
// Public hook for a child host that connected.
// Intentionally a no-op: there is nothing to do here.
void rrdcontext_host_child_connected(RRDHOST *host) {
    (void)host; // parameter is unused on purpose
}
|
|
|
|
usec_t rrdcontext_next_db_rotation_ut = 0;
|
|
void rrdcontext_db_rotation(void) {
|
|
// called when the db rotates its database
|
|
rrdcontext_next_db_rotation_ut = now_realtime_usec() + FULL_RETENTION_SCAN_DELAY_AFTER_DB_ROTATION_SECS * USEC_PER_SEC;
|
|
}
|
|
|
|
// Look up the UUID of dimension `id` of chart `st` by walking
// host contexts -> context instances -> instance metrics.
//
// st         - chart; its rrdhost, context and id select the context and instance
// id         - key of the metric inside the instance's rrdmetrics dictionary
// store_uuid - receives a copy of the metric's uuid on success
//
// Returns 0 on success, or a code identifying the step that failed:
//   1 = chart has no host, 2 = chart has no context, 3 = context not found,
//   4 = instance not found, 5 = metric not found.
// Every dictionary item acquired here is released before returning.
int rrdcontext_find_dimension_uuid(RRDSET *st, const char *id, uuid_t *store_uuid) {
    if(!st->rrdhost)
        return 1;

    if(!st->context)
        return 2;

    RRDCONTEXT_ACQUIRED *ctx_item = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item(
        st->rrdhost->rrdctx.contexts, string2str(st->context));
    if(!ctx_item)
        return 3;

    int result = 4; // pessimistic default: instance not found

    RRDCONTEXT *ctx = rrdcontext_acquired_value(ctx_item);
    RRDINSTANCE_ACQUIRED *inst_item = (RRDINSTANCE_ACQUIRED *)dictionary_get_and_acquire_item(
        ctx->rrdinstances, string2str(st->id));

    if(inst_item) {
        result = 5; // instance found, metric not found (yet)

        RRDINSTANCE *inst = rrdinstance_acquired_value(inst_item);
        RRDMETRIC_ACQUIRED *metric_item = (RRDMETRIC_ACQUIRED *)dictionary_get_and_acquire_item(
            inst->rrdmetrics, id);

        if(metric_item) {
            RRDMETRIC *metric = rrdmetric_acquired_value(metric_item);
            uuid_copy(*store_uuid, metric->uuid);
            rrdmetric_release(metric_item);
            result = 0;
        }

        rrdinstance_release(inst_item);
    }

    rrdcontext_release(ctx_item);
    return result;
}
|
|
|
|
// Look up the UUID of chart `st` by walking host contexts -> context instances.
//
// st         - chart; its rrdhost, context and id select the context and instance
// store_uuid - receives a copy of the instance's uuid on success
//
// Returns 0 on success, or a code identifying the step that failed:
//   1 = chart has no host, 2 = chart has no context, 3 = context not found,
//   4 = instance not found.
// Every dictionary item acquired here is released before returning.
int rrdcontext_find_chart_uuid(RRDSET *st, uuid_t *store_uuid) {
    if(!st->rrdhost)
        return 1;

    if(!st->context)
        return 2;

    RRDCONTEXT_ACQUIRED *ctx_item = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item(
        st->rrdhost->rrdctx.contexts, string2str(st->context));
    if(!ctx_item)
        return 3;

    int result = 4; // pessimistic default: instance not found

    RRDCONTEXT *ctx = rrdcontext_acquired_value(ctx_item);
    RRDINSTANCE_ACQUIRED *inst_item = (RRDINSTANCE_ACQUIRED *)dictionary_get_and_acquire_item(
        ctx->rrdinstances, string2str(st->id));

    if(inst_item) {
        RRDINSTANCE *inst = rrdinstance_acquired_value(inst_item);
        uuid_copy(*store_uuid, inst->uuid);
        rrdinstance_release(inst_item);
        result = 0;
    }

    rrdcontext_release(ctx_item);
    return result;
}
|
|
|
|
// Public hook for a child host that disconnected: recalculates the retention
// of `host`, tagging the update with RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD.
void rrdcontext_host_child_disconnected(RRDHOST *host) {
    rrdcontext_recalculate_host_retention(
        host, RRD_FLAG_UPDATE_REASON_DISCONNECTED_CHILD, false);
}
|
|
|
|
// Invoke `callback` for every instance of context `context` on `host` that
// currently has an RRDSET attached.
//
// host     - host whose contexts dictionary is searched
// context  - name of the context; must be a non-empty string
// callback - called as callback(rrdset, data) for each matching instance
// data     - opaque pointer forwarded to the callback
//
// Returns -1 when an argument is invalid or the context cannot be found.
// Otherwise returns the sum of the non-negative values returned by the
// callback; if the callback returns a negative value, iteration stops and
// that value is returned as-is.
int rrdcontext_foreach_instance_with_rrdset_in_context(RRDHOST *host, const char *context, int (*callback)(RRDSET *st, void *data), void *data) {
    if(unlikely(!host || !context || !*context || !callback))
        return -1;

    RRDCONTEXT_ACQUIRED *rca = (RRDCONTEXT_ACQUIRED *)dictionary_get_and_acquire_item(host->rrdctx.contexts, context);
    if(unlikely(!rca)) return -1;

    RRDCONTEXT *rc = rrdcontext_acquired_value(rca);
    if(unlikely(!rc)) {
        // the item was acquired above, so it must be released on this path too
        rrdcontext_release(rca);
        return -1;
    }

    int ret = 0;
    RRDINSTANCE *ri;
    dfe_start_read(rc->rrdinstances, ri) {
        if(ri->rrdset) {
            int r = callback(ri->rrdset, data);
            if(r >= 0) ret += r;
            else {
                // a negative value aborts the walk and becomes the result
                ret = r;
                break;
            }
        }
    }
    dfe_done(ri);

    rrdcontext_release(rca);

    return ret;
}
|
|
|
|
// ----------------------------------------------------------------------------
|
|
// ACLK interface
|
|
|
|
static bool rrdhost_check_our_claim_id(const char *claim_id) {
|
|
if(!localhost->aclk_state.claimed_id) return false;
|
|
return (strcasecmp(claim_id, localhost->aclk_state.claimed_id) == 0) ? true : false;
|
|
}
|
|
|
|
static RRDHOST *rrdhost_find_by_node_id(const char *node_id) {
|
|
uuid_t uuid;
|
|
if (uuid_parse(node_id, uuid))
|
|
return NULL;
|
|
|
|
RRDHOST *host = NULL;
|
|
dfe_start_read(rrdhost_root_index, host) {
|
|
if(!host->node_id) continue;
|
|
|
|
if(uuid_memcmp(&uuid, host->node_id) == 0)
|
|
break;
|
|
}
|
|
dfe_done(host);
|
|
|
|
return host;
|
|
}
|
|
|
|
void rrdcontext_hub_checkpoint_command(void *ptr) {
|
|
struct ctxs_checkpoint *cmd = ptr;
|
|
|
|
if(!rrdhost_check_our_claim_id(cmd->claim_id)) {
|
|
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
|
"RRDCONTEXT: received checkpoint command for claim_id '%s', node id '%s', "
|
|
"but this is not our claim id. Ours '%s', received '%s'. Ignoring command.",
|
|
cmd->claim_id, cmd->node_id,
|
|
localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET",
|
|
cmd->claim_id);
|
|
|
|
return;
|
|
}
|
|
|
|
RRDHOST *host = rrdhost_find_by_node_id(cmd->node_id);
|
|
if(!host) {
|
|
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
|
"RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', "
|
|
"but there is no node with such node id here. Ignoring command.",
|
|
cmd->claim_id, cmd->node_id);
|
|
|
|
return;
|
|
}
|
|
|
|
if(rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS)) {
|
|
nd_log(NDLS_DAEMON, NDLP_NOTICE,
|
|
"RRDCONTEXT: received checkpoint command for claim id '%s', node id '%s', "
|
|
"while node '%s' has an active context streaming.",
|
|
cmd->claim_id, cmd->node_id, rrdhost_hostname(host));
|
|
|
|
// disable it temporarily, so that our worker will not attempt to send messages in parallel
|
|
rrdhost_flag_clear(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS);
|
|
}
|
|
|
|
uint64_t our_version_hash = rrdcontext_version_hash(host);
|
|
|
|
if(cmd->version_hash != our_version_hash) {
|
|
nd_log(NDLS_DAEMON, NDLP_NOTICE,
|
|
"RRDCONTEXT: received version hash %"PRIu64" for host '%s', does not match our version hash %"PRIu64". "
|
|
"Sending snapshot of all contexts.",
|
|
cmd->version_hash, rrdhost_hostname(host), our_version_hash);
|
|
|
|
#ifdef ENABLE_ACLK
|
|
// prepare the snapshot
|
|
char uuid[UUID_STR_LEN];
|
|
uuid_unparse_lower(*host->node_id, uuid);
|
|
contexts_snapshot_t bundle = contexts_snapshot_new(cmd->claim_id, uuid, our_version_hash);
|
|
|
|
// do a deep scan on every metric of the host to make sure all our data are updated
|
|
rrdcontext_recalculate_host_retention(host, RRD_FLAG_NONE, false);
|
|
|
|
// calculate version hash and pack all the messages together in one go
|
|
our_version_hash = rrdcontext_version_hash_with_callback(host, rrdcontext_message_send_unsafe, true, bundle);
|
|
|
|
// update the version
|
|
contexts_snapshot_set_version(bundle, our_version_hash);
|
|
|
|
// send it
|
|
aclk_send_contexts_snapshot(bundle);
|
|
#endif
|
|
}
|
|
|
|
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
|
"RRDCONTEXT: host '%s' enabling streaming of contexts",
|
|
rrdhost_hostname(host));
|
|
|
|
rrdhost_flag_set(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS);
|
|
char node_str[UUID_STR_LEN];
|
|
uuid_unparse_lower(*host->node_id, node_str);
|
|
nd_log(NDLS_ACCESS, NDLP_DEBUG,
|
|
"ACLK REQ [%s (%s)]: STREAM CONTEXTS ENABLED",
|
|
node_str, rrdhost_hostname(host));
|
|
}
|
|
|
|
void rrdcontext_hub_stop_streaming_command(void *ptr) {
|
|
struct stop_streaming_ctxs *cmd = ptr;
|
|
|
|
if(!rrdhost_check_our_claim_id(cmd->claim_id)) {
|
|
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
|
"RRDCONTEXT: received stop streaming command for claim_id '%s', node id '%s', "
|
|
"but this is not our claim id. Ours '%s', received '%s'. Ignoring command.",
|
|
cmd->claim_id, cmd->node_id,
|
|
localhost->aclk_state.claimed_id?localhost->aclk_state.claimed_id:"NOT SET",
|
|
cmd->claim_id);
|
|
|
|
return;
|
|
}
|
|
|
|
RRDHOST *host = rrdhost_find_by_node_id(cmd->node_id);
|
|
if(!host) {
|
|
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
|
"RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', "
|
|
"but there is no node with such node id here. Ignoring command.",
|
|
cmd->claim_id, cmd->node_id);
|
|
|
|
return;
|
|
}
|
|
|
|
if(!rrdhost_flag_check(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS)) {
|
|
nd_log(NDLS_DAEMON, NDLP_NOTICE,
|
|
"RRDCONTEXT: received stop streaming command for claim id '%s', node id '%s', "
|
|
"but node '%s' does not have active context streaming. Ignoring command.",
|
|
cmd->claim_id, cmd->node_id, rrdhost_hostname(host));
|
|
|
|
return;
|
|
}
|
|
|
|
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
|
"RRDCONTEXT: host '%s' disabling streaming of contexts",
|
|
rrdhost_hostname(host));
|
|
|
|
rrdhost_flag_clear(host, RRDHOST_FLAG_ACLK_STREAM_CONTEXTS);
|
|
}
|
|
|
|
// Tell whether the [after, before] query window matches the retention of a context.
//
// rca    - acquired context; NULL yields false
// after  - start of the query window
// before - end of the query window
//
// The decision is delegated to query_matches_retention(), using the context's
// first_time_s and last_time_s. For a context that is currently collected, the
// end of the retention passed on is the later of `before` and last_time_s.
bool rrdcontext_retention_match(RRDCONTEXT_ACQUIRED *rca, time_t after, time_t before) {
    if(unlikely(!rca))
        return false;

    RRDCONTEXT *rc = rrdcontext_acquired_value(rca);

    // not collected: use the stored retention as-is
    if(!rrd_flag_is_collected(rc))
        return query_matches_retention(after, before, rc->first_time_s, rc->last_time_s, 1);

    // collected: never let the retention end before the requested `before`
    return query_matches_retention(
        after, before, rc->first_time_s,
        (rc->last_time_s < before) ? before : rc->last_time_s,
        1);
}
|