mirror of https://github.com/netdata/netdata.git synced 2025-03-29 02:55:52 +00:00

improve status file deduplication ()

* move restarts out of the dedup structure and into the agent section - keep reading it from dedup for v3 status files

* use up to 10 hashes for deduplicating crash events

* monitor shutdown steps

* leftover x-ray vision installer message

* fix the out-of-memory handler

* the out-of-memory handler now properly calculates max RSS
Costa Tsaousis 2025-02-27 12:03:30 +00:00 committed by GitHub
parent 1cb57b50f2
commit acb738e873
8 changed files with 241 additions and 54 deletions
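
A minimal standalone sketch of the crash-report deduplication summarized above (simplified names and plain time_t timestamps; the real code in daemon-status-file.c uses XXH64 hashes, a spinlock and microsecond timestamps):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <time.h>

#define DEDUP_SLOTS   10                // up to 10 remembered crash hashes
#define DEDUP_WINDOW  (24 * 3600)       // suppress identical reports for 24 hours

struct dedup_slot { uint64_t hash; time_t timestamp; };
static struct dedup_slot dedup[DEDUP_SLOTS];

// true when the same crash hash was already posted within the window
static bool already_posted(uint64_t hash, time_t now) {
    for (size_t i = 0; i < DEDUP_SLOTS; i++)
        if (dedup[i].timestamp && dedup[i].hash == hash &&
            now - dedup[i].timestamp < DEDUP_WINDOW)
            return true;
    return false;
}

// remember a posted hash: refresh an existing entry, else take an empty slot,
// else evict the oldest entry
static void keep_hash(uint64_t hash, time_t now) {
    for (size_t i = 0; i < DEDUP_SLOTS; i++)
        if (dedup[i].hash == hash) { dedup[i].timestamp = now; return; }

    for (size_t i = 0; i < DEDUP_SLOTS; i++)
        if (!dedup[i].hash) { dedup[i] = (struct dedup_slot){ hash, now }; return; }

    size_t oldest = 0;
    for (size_t i = 1; i < DEDUP_SLOTS; i++)
        if (dedup[i].timestamp < dedup[oldest].timestamp)
            oldest = i;
    dedup[oldest] = (struct dedup_slot){ hash, now };
}

The real implementation checks this ring via dedup_already_posted() before posting a crash report, and records the hash via dedup_keep_hash() only after a successful POST.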

View file

@@ -1167,6 +1167,6 @@ else
progress "is installed now!"
fi
echo >&2 " enjoy real-time performance and health monitoring..."
echo >&2 " Enjoy X-Ray Vision for your infrastructure..."
echo >&2
exit 0

View file

@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-3.0-or-later
#include "daemon-shutdown-watcher.h"
#include "daemon-status-file.h"
watcher_step_t *watcher_steps;
@@ -33,6 +34,8 @@ static void watcher_wait_for_step(const watcher_step_id_t step_id, usec_t shutdo
(int)step_id + 1, (int)WATCHER_STEP_ID_MAX, start_duration_txt,
watcher_steps[step_id].msg);
daemon_status_file_shutdown_step(watcher_steps[step_id].msg);
#ifdef ENABLE_SENTRY
// Wait with a timeout
time_t timeout = 135; // systemd gives us 150, we timeout at 135
@@ -65,6 +68,7 @@ static void watcher_wait_for_step(const watcher_step_id_t step_id, usec_t shutdo
(int)step_id + 1, (int)WATCHER_STEP_ID_MAX, start_duration_txt,
watcher_steps[step_id].msg, step_duration_txt);
daemon_status_file_shutdown_step("sentry timeout");
abort();
}
}
@@ -104,7 +108,6 @@ void *watcher_main(void *arg)
watcher_wait_for_step(WATCHER_STEP_ID_CLOSE_SQL_DATABASES, shutdown_start_time);
watcher_wait_for_step(WATCHER_STEP_ID_REMOVE_PID_FILE, shutdown_start_time);
watcher_wait_for_step(WATCHER_STEP_ID_FREE_OPENSSL_STRUCTURES, shutdown_start_time);
watcher_wait_for_step(WATCHER_STEP_ID_REMOVE_INCOMPLETE_SHUTDOWN_FILE, shutdown_start_time);
completion_wait_for(&shutdown_end_completion);
usec_t shutdown_end_time = now_monotonic_usec();
@@ -165,8 +168,6 @@ void watcher_thread_start() {
"remove pid file";
watcher_steps[WATCHER_STEP_ID_FREE_OPENSSL_STRUCTURES].msg =
"free openssl structures";
watcher_steps[WATCHER_STEP_ID_REMOVE_INCOMPLETE_SHUTDOWN_FILE].msg =
"remove incomplete shutdown file";
for (size_t i = 0; i != WATCHER_STEP_ID_MAX; i++) {
completion_init(&watcher_steps[i].p);
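
For illustration, a condensed, self-contained sketch of the shutdown-step monitoring added here; record_shutdown_step() and step_completed_within() are hypothetical stand-ins for daemon_status_file_shutdown_step() and the watcher's completion/timeout machinery:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

// hypothetical stand-in: the real daemon_status_file_shutdown_step() persists
// "shutdown(<step>)" in the status file, or clears it when step is NULL
static void record_shutdown_step(const char *step) {
    if (step) printf("status file: shutdown(%s)\n", step);
    else      printf("status file: shutdown step cleared\n");
}

// hypothetical stand-in for waiting on the step's completion with a timeout
static bool step_completed_within(int step_id, int timeout_s) {
    (void)step_id; (void)timeout_s;
    return true;                                  // pretend every step finishes in time
}

static void wait_for_step(int step_id, const char *msg) {
    // record the step before blocking on it: if shutdown hangs, is killed,
    // or aborts below, the status file names the step it was stuck on
    record_shutdown_step(msg);
    if (!step_completed_within(step_id, 135)) {   // 135s budget; systemd allows 150s
        record_shutdown_step("sentry timeout");
        abort();                                  // the crash report then carries the step name
    }
}

int main(void) {
    wait_for_step(0, "close SQL databases");
    wait_for_step(1, "free openssl structures");
    record_shutdown_step(NULL);                   // clean exit: clear the step
    return 0;
}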

View file

@@ -29,7 +29,6 @@ typedef enum {
WATCHER_STEP_ID_CLOSE_SQL_DATABASES,
WATCHER_STEP_ID_REMOVE_PID_FILE,
WATCHER_STEP_ID_FREE_OPENSSL_STRUCTURES,
WATCHER_STEP_ID_REMOVE_INCOMPLETE_SHUTDOWN_FILE,
// Always keep this as the last enum value
WATCHER_STEP_ID_MAX

View file

@@ -316,12 +316,11 @@ void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char
netdata_ssl_cleanup();
watcher_step_complete(WATCHER_STEP_ID_FREE_OPENSSL_STRUCTURES);
watcher_step_complete(WATCHER_STEP_ID_REMOVE_INCOMPLETE_SHUTDOWN_FILE);
watcher_shutdown_end();
watcher_thread_stop();
curl_global_cleanup();
daemon_status_file_shutdown_step(NULL);
daemon_status_file_update_status(DAEMON_STATUS_EXITED);
#ifdef OS_WINDOWS

View file

@@ -9,7 +9,7 @@
#include <openssl/pem.h>
#include <openssl/err.h>
#define STATUS_FILE_VERSION 3
#define STATUS_FILE_VERSION 4
#define STATUS_FILENAME "status-netdata.json"
@@ -81,6 +81,7 @@ static void daemon_status_file_to_json(BUFFER *wb, DAEMON_STATUS_FILE *ds) {
buffer_json_member_add_uuid(wb, "ND_node_id", ds->node_id.uuid); // custom
buffer_json_member_add_uuid(wb, "ND_claim_id", ds->claim_id.uuid); // custom
buffer_json_member_add_uint64(wb, "ND_restarts", ds->restarts); // custom
ND_PROFILE_2json(wb, "ND_profile", ds->profile); // custom
buffer_json_member_add_string(wb, "ND_status", DAEMON_STATUS_2str(ds->status)); // custom
@@ -155,13 +156,21 @@ static void daemon_status_file_to_json(BUFFER *wb, DAEMON_STATUS_FILE *ds) {
}
buffer_json_object_close(wb);
buffer_json_member_add_object(wb, "dedup"); // custom
buffer_json_member_add_array(wb, "dedup"); // custom
{
buffer_json_member_add_datetime_rfc3339(wb, "@timestamp", ds->dedup.timestamp_ut, true); // custom
buffer_json_member_add_uint64(wb, "hash", ds->dedup.hash); // custom
buffer_json_member_add_uint64(wb, "restarts", ds->dedup.restarts); // custom
for(size_t i = 0; i < _countof(ds->dedup); i++) {
if (ds->dedup[i].timestamp_ut == 0)
continue;
buffer_json_add_array_item_object(wb); // custom
{
buffer_json_member_add_datetime_rfc3339(wb, "@timestamp", ds->dedup[i].timestamp_ut, true); // custom
buffer_json_member_add_uint64(wb, "hash", ds->dedup[i].hash); // custom
}
buffer_json_object_close(wb);
}
}
buffer_json_object_close(wb);
buffer_json_array_close(wb);
}
// --------------------------------------------------------------------------------------------------------------------
@@ -180,6 +189,7 @@ static bool daemon_status_file_from_json(json_object *jobj, void *data, BUFFER *
bool strict = false; // allow missing fields and values
bool required_v1 = version >= 1 ? strict : false;
bool required_v3 = version >= 3 ? strict : false;
bool required_v4 = version >= 4 ? strict : false;
// Parse timestamp
JSONC_PARSE_TXT2CHAR_OR_ERROR_AND_RETURN(jobj, path, "@timestamp", datetime, error, required_v1);
@@ -203,6 +213,9 @@ static bool daemon_status_file_from_json(json_object *jobj, void *data, BUFFER *
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "init", ds->timings.init, error, required_v1);
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "exit", ds->timings.exit, error, required_v1);
});
if(version >= 4)
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "ND_restarts", ds->restarts, error, required_v4);
});
// Parse host object
@@ -257,15 +270,32 @@ static bool daemon_status_file_from_json(json_object *jobj, void *data, BUFFER *
});
// Parse the last posted object
JSONC_PARSE_SUBOBJECT(jobj, path, "dedup", error, required_v3, {
datetime[0] = '\0';
JSONC_PARSE_TXT2CHAR_OR_ERROR_AND_RETURN(jobj, path, "@timestamp", datetime, error, required_v1);
if(datetime[0])
ds->dedup.timestamp_ut = rfc3339_parse_ut(datetime, NULL);
if(version == 3) {
JSONC_PARSE_SUBOBJECT(jobj, path, "dedup", error, required_v3, {
datetime[0] = '\0';
JSONC_PARSE_TXT2CHAR_OR_ERROR_AND_RETURN(jobj, path, "@timestamp", datetime, error, required_v3);
if (datetime[0])
ds->dedup[0].timestamp_ut = rfc3339_parse_ut(datetime, NULL);
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "hash", ds->dedup.hash, error, required_v3);
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "restarts", ds->dedup.restarts, error, required_v3);
});
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "hash", ds->dedup[0].hash, error, required_v3);
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "restarts", ds->restarts, error, required_v3);
});
}
else if(version >= 4) {
JSONC_PARSE_ARRAY(jobj, path, "dedup", error, required_v4, {
size_t i = 0;
JSONC_PARSE_ARRAY_ITEM_OBJECT(jobj, path, i, required_v4, {
if(i >= _countof(ds->dedup))
break;
JSONC_PARSE_TXT2CHAR_OR_ERROR_AND_RETURN(jobj, path, "@timestamp", datetime, error, required_v4);
if (datetime[0])
ds->dedup[i].timestamp_ut = rfc3339_parse_ut(datetime, NULL);
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "hash", ds->dedup[i].hash, error, required_v4);
});
});
}
return true;
}
@@ -349,11 +379,11 @@ static void daemon_status_file_refresh(DAEMON_STATUS status) {
session_status.os_id = strdupz(last_session_status.os_id);
if(!session_status.os_id_like && last_session_status.os_id_like)
session_status.os_id_like = strdupz(last_session_status.os_id_like);
if(!session_status.dedup.restarts)
session_status.dedup.restarts = last_session_status.dedup.restarts + 1;
if(!session_status.dedup.timestamp_ut || !session_status.dedup.hash) {
session_status.dedup.timestamp_ut = last_session_status.dedup.timestamp_ut;
session_status.dedup.hash = last_session_status.dedup.hash;
if(!session_status.restarts)
session_status.restarts = last_session_status.restarts + 1;
if(!session_status.dedup[0].timestamp_ut || !session_status.dedup[0].hash) {
for (size_t i = 0; i < _countof(session_status.dedup); i++)
session_status.dedup[i] = last_session_status.dedup[i];
}
if(!session_status.install_type) {
@@ -561,6 +591,65 @@ static void daemon_status_file_out_of_memory(void) {
daemon_status_file_exit_reason_save(EXIT_REASON_OUT_OF_MEMORY);
}
// --------------------------------------------------------------------------------------------------------------------
// deduplication hashes management
static bool dedup_already_posted(XXH64_hash_t hash) {
spinlock_lock(&dsf_spinlock);
usec_t now_ut = now_realtime_usec();
for(size_t i = 0; i < _countof(session_status.dedup); i++) {
if(session_status.dedup[i].timestamp_ut == 0)
continue;
if(hash == session_status.dedup[i].hash &&
now_ut - session_status.dedup[i].timestamp_ut < 86400 * USEC_PER_SEC) {
// we have already posted this crash
spinlock_unlock(&dsf_spinlock);
return true;
}
}
spinlock_unlock(&dsf_spinlock);
return false;
}
static void dedup_keep_hash(DAEMON_STATUS_FILE *ds, XXH64_hash_t hash) {
spinlock_lock(&dsf_spinlock);
// find the same hash
for(size_t i = 0; i < _countof(ds->dedup); i++) {
if(ds->dedup[i].hash == hash) {
ds->dedup[i].timestamp_ut = now_realtime_usec();
spinlock_unlock(&dsf_spinlock);
return;
}
}
// find an empty slot
for(size_t i = 0; i < _countof(ds->dedup); i++) {
if(!ds->dedup[i].hash) {
ds->dedup[i].hash = hash;
ds->dedup[i].timestamp_ut = now_realtime_usec();
spinlock_unlock(&dsf_spinlock);
return;
}
}
// find the oldest slot
size_t store_at_slot = 0;
for(size_t i = 1; i < _countof(ds->dedup); i++) {
if(ds->dedup[i].timestamp_ut < ds->dedup[store_at_slot].timestamp_ut)
store_at_slot = i;
}
ds->dedup[store_at_slot].timestamp_ut = now_realtime_usec();
ds->dedup[store_at_slot].hash = hash;
spinlock_unlock(&dsf_spinlock);
}
// --------------------------------------------------------------------------------------------------------------------
// POST the last status to agent-events
@@ -596,10 +685,7 @@ void post_status_file(struct post_status_file_thread_data *d) {
CURLcode rc = curl_easy_perform(curl);
if(rc == CURLE_OK) {
XXH64_hash_t hash = daemon_status_file_hash(&d->status, d->msg, d->cause);
spinlock_lock(&dsf_spinlock);
session_status.dedup.timestamp_ut = now_realtime_usec();
session_status.dedup.hash = hash;
spinlock_unlock(&dsf_spinlock);
dedup_keep_hash(&session_status, hash);
daemon_status_file_save(&session_status);
}
@@ -797,14 +883,9 @@ void daemon_status_file_check_crash(void) {
"Last exit status: %s (%s):\n\n%s",
NETDATA_VERSION, msg, cause, buffer_tostring(wb));
if(last_session_status.dedup.timestamp_ut && last_session_status.dedup.hash) {
XXH64_hash_t hash = daemon_status_file_hash(&last_session_status, msg, cause);
if(hash == last_session_status.dedup.hash &&
now_realtime_usec() - last_session_status.dedup.timestamp_ut < 86400 * USEC_PER_SEC) {
// we have already posted this crash
disable_crash_report = true;
}
}
// check if we have already posted this crash in the last 24 hours
XXH64_hash_t hash = daemon_status_file_hash(&last_session_status, msg, cause);
disable_crash_report = dedup_already_posted(hash);
if(!disable_crash_report && (analytics_check_enabled() || post_crash_report)) {
netdata_conf_ssl();
@@ -827,12 +908,34 @@ bool daemon_status_file_was_incomplete_shutdown(void) {
}
void daemon_status_file_startup_step(const char *step) {
if(session_status.fatal.filename)
// we have a fatal logged
return;
freez((char *)session_status.fatal.function);
session_status.fatal.function = step ? strdupz(step) : NULL;
if(step != NULL)
daemon_status_file_update_status(DAEMON_STATUS_NONE);
}
void daemon_status_file_shutdown_step(const char *step) {
if(session_status.fatal.filename)
// we have a fatal logged
return;
freez((char *)session_status.fatal.function);
if(!step)
session_status.fatal.function = NULL;
else {
char buf[1024];
snprintfz(buf, sizeof(buf), "shutdown(%s)", step);
session_status.fatal.function = strdupz(buf);
}
daemon_status_file_update_status(DAEMON_STATUS_NONE);
}
// --------------------------------------------------------------------------------------------------------------------
// ng_log() hook for receiving fatal message information

View file

@@ -34,6 +34,7 @@ typedef struct daemon_status_file {
time_t boottime; // system boottime
time_t uptime; // netdata uptime
usec_t timestamp_ut; // the timestamp of the status file
size_t restarts; // the number of times this agent has restarted
ND_UUID boot_id; // the boot id of the system
ND_UUID invocation; // the netdata invocation id generated the file
@@ -72,8 +73,7 @@ typedef struct daemon_status_file {
struct {
XXH64_hash_t hash;
usec_t timestamp_ut;
size_t restarts;
} dedup;
} dedup[10];
} DAEMON_STATUS_FILE;
// loads the last status saved
@@ -88,7 +88,9 @@ void daemon_status_file_check_crash(void);
bool daemon_status_file_has_last_crashed(void);
bool daemon_status_file_was_incomplete_shutdown(void);
void daemon_status_file_startup_step(const char *step);
void daemon_status_file_shutdown_step(const char *step);
void daemon_status_file_register_fatal(const char *filename, const char *function, const char *message, const char *errno_str, const char *stack_trace, long line);

View file

@@ -262,6 +262,17 @@
strncpyz(path + len, member, sizeof_path - len); \
} while(0)
#define JSONC_PATH_CONCAT_INDEX(path, sizeof_path, index, error) do { \
char _idx_str[32]; \
snprintfz(_idx_str, sizeof(_idx_str), "[%zu]", index); \
size_t _path_len = strlen(path); \
if (_path_len + strlen(_idx_str) >= sizeof_path) { \
buffer_sprintf(error, "path too long while adding array index"); \
return false; \
} \
strncpyz(path + _path_len, _idx_str, sizeof_path - _path_len); \
} while(0)
#define JSONC_PARSE_SUBOBJECT(jobj, path, member, error, required, block) do { \
BUILD_BUG_ON(sizeof(path) < 128); /* ensure path is an array of at least 128 bytes */ \
json_object *JSONC_TEMP_VAR(_j, __LINE__); \
@@ -273,19 +284,79 @@
} \
else { \
if (!json_object_is_type(JSONC_TEMP_VAR(_j, __LINE__), json_type_object)) { \
buffer_sprintf(error, "not an object '%s.%s'", *path ? path : "", member); \
return false; \
if(required) { \
buffer_sprintf(error, "not an object '%s.%s'", *path ? path : "", member); \
return false; \
} \
} \
else { \
json_object *JSONC_TEMP_VAR(saved_jobj, __LINE__) = jobj; \
jobj = JSONC_TEMP_VAR(_j, __LINE__); \
char JSONC_TEMP_VAR(saved_path, __LINE__)[strlen(path) + 1]; \
strncpyz(JSONC_TEMP_VAR(saved_path, __LINE__), path, sizeof(JSONC_TEMP_VAR(saved_path, __LINE__))); \
JSONC_PATH_CONCAT(path, sizeof(path), path, member, error); \
/* Run the user's code block */ \
block \
/* Restore the previous scope's values */ \
jobj = JSONC_TEMP_VAR(saved_jobj, __LINE__); \
strncpyz(path, JSONC_TEMP_VAR(saved_path, __LINE__), sizeof(path)); \
} \
} \
} while(0)
#define JSONC_PARSE_ARRAY(jobj, path, member, error, required, block) do { \
BUILD_BUG_ON(sizeof(path) < 128); /* ensure path is an array of at least 128 bytes */ \
json_object *JSONC_TEMP_VAR(_jarray, __LINE__); \
if (!json_object_object_get_ex(jobj, member, &JSONC_TEMP_VAR(_jarray, __LINE__))) { \
if (required) { \
buffer_sprintf(error, "missing '%s.%s' array", *path ? path : "", member); \
return false; \
} \
} \
else { \
if (!json_object_is_type(JSONC_TEMP_VAR(_jarray, __LINE__), json_type_array)) { \
if (required) { \
buffer_sprintf(error, "not an array '%s.%s'", *path ? path : "", member); \
return false; \
} \
} \
else { \
json_object *JSONC_TEMP_VAR(saved_jobj, __LINE__) = jobj; \
jobj = JSONC_TEMP_VAR(_jarray, __LINE__); \
char JSONC_TEMP_VAR(saved_path, __LINE__)[strlen(path) + 1]; \
strncpyz(JSONC_TEMP_VAR(saved_path, __LINE__), path, sizeof(JSONC_TEMP_VAR(saved_path, __LINE__))); \
JSONC_PATH_CONCAT(path, sizeof(path), path, member, error); \
/* Run the user's code block */ \
block \
/* Restore the previous scope's values */ \
jobj = JSONC_TEMP_VAR(saved_jobj, __LINE__); \
strncpyz(path, JSONC_TEMP_VAR(saved_path, __LINE__), sizeof(path)); \
} \
} \
} while(0)
#define JSONC_PARSE_ARRAY_ITEM_OBJECT(jobj, path, index, required, block) do { \
size_t JSONC_TEMP_VAR(_array_len, __LINE__) = json_object_array_length(jobj); \
for (index = 0; index < JSONC_TEMP_VAR(_array_len, __LINE__); index++) { \
json_object *JSONC_TEMP_VAR(_jitem, __LINE__) = json_object_array_get_idx(jobj, index); \
if (!json_object_is_type(JSONC_TEMP_VAR(_jitem, __LINE__), json_type_object)) { \
if(required) { \
buffer_sprintf(error, "not an object '%s[%zu]'", *path ? path : "", index); \
return false; \
} \
} \
else { \
json_object *JSONC_TEMP_VAR(saved_jobj, __LINE__) = jobj; \
jobj = JSONC_TEMP_VAR(_jitem, __LINE__); \
char JSONC_TEMP_VAR(saved_path, __LINE__)[strlen(path) + 1]; \
strncpyz(JSONC_TEMP_VAR(saved_path, __LINE__), path, sizeof(JSONC_TEMP_VAR(saved_path, __LINE__))); \
JSONC_PATH_CONCAT_INDEX(path, sizeof(path), index, error); \
/* Run the user's code block */ \
block \
/* Restore the previous scope's values */ \
jobj = JSONC_TEMP_VAR(saved_jobj, __LINE__); \
strncpyz(path, JSONC_TEMP_VAR(saved_path, __LINE__), sizeof(path)); \
} \
} \
} while(0)
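
The two array macros above are thin wrappers around plain json-c iteration. As a rough, standalone equivalent of what the v4 "dedup" parsing expands to (stock json-c calls only, hypothetical sample values, path bookkeeping and error reporting trimmed):

#include <stdio.h>
#include <json-c/json.h>

int main(void) {
    // hypothetical sample of the v4 status file "dedup" member
    const char *sample =
        "{ \"dedup\": [ { \"@timestamp\": \"2025-02-27T10:00:00Z\", \"hash\": 123 },"
        "              { \"@timestamp\": \"2025-02-27T11:00:00Z\", \"hash\": 456 } ] }";

    json_object *root = json_tokener_parse(sample);
    if (!root)
        return 1;

    json_object *dedup;
    if (json_object_object_get_ex(root, "dedup", &dedup) &&
        json_object_is_type(dedup, json_type_array)) {

        size_t len = json_object_array_length(dedup);     // JSONC_PARSE_ARRAY
        for (size_t i = 0; i < len; i++) {                // JSONC_PARSE_ARRAY_ITEM_OBJECT
            json_object *item = json_object_array_get_idx(dedup, i);
            if (!json_object_is_type(item, json_type_object))
                continue;

            json_object *ts, *hash;
            if (json_object_object_get_ex(item, "@timestamp", &ts) &&
                json_object_object_get_ex(item, "hash", &hash))
                printf("dedup[%zu]: %s hash=%lld\n", i,
                       json_object_get_string(ts),
                       (long long)json_object_get_int64(hash));
        }
    }

    json_object_put(root);
    return 0;
}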

View file

@@ -12,16 +12,28 @@ static void out_of_memory(const char *call, size_t size) {
if(out_of_memory_callback)
out_of_memory_callback();
#if defined(OS_LINUX) || defined(OS_WINDOWS)
int rss_multiplier = 1024;
#else
int rss_multiplier = 1;
#endif
struct rusage usage = { 0 };
if(getrusage(RUSAGE_SELF, &usage) != 0)
usage.ru_maxrss = 0;
char mem_available[64];
char rss_used[64];
OS_SYSTEM_MEMORY sm = os_last_reported_system_memory();
size_snprintf(mem_available, sizeof(mem_available), sm.ram_available_bytes, "B", false);
size_snprintf(rss_used, sizeof(rss_used), usage.ru_maxrss * rss_multiplier, "B", false);
fatal("Out of memory on %s(%zu bytes)!\n"
"System memory available: %lu, while our max RSS usage is: %ld\n"
"System memory available: %s, while our max RSS usage is: %s\n"
"O/S mmap limit: %llu, while our mmap count is: %zu",
call, size,
sm.ram_available_bytes, usage.ru_maxrss,
mem_available, rss_used,
os_mmap_limit(), __atomic_load_n(&nd_mmap_count, __ATOMIC_RELAXED));
}
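
For reference, a standalone sketch of the max-RSS normalization applied above, assuming POSIX getrusage(): Linux reports ru_maxrss in kibibytes while macOS reports bytes, so the value must be scaled before it is formatted as a byte count:

#include <stdio.h>
#include <sys/resource.h>

int main(void) {
    struct rusage usage = { 0 };
    if (getrusage(RUSAGE_SELF, &usage) != 0)
        usage.ru_maxrss = 0;                    // as in the diff: fall back to 0 on failure

#if defined(__linux__)
    long long max_rss_bytes = (long long)usage.ru_maxrss * 1024;  // Linux: KiB
#else
    long long max_rss_bytes = (long long)usage.ru_maxrss;         // e.g. macOS: already bytes
#endif

    printf("max RSS so far: %lld bytes\n", max_rss_bytes);
    return 0;
}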