0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-13 09:11:50 +00:00

Avoid posting initialization errors repeatedly ()

do not post initialization errors repeatedly; detect almost full disks; increase status file version
This commit is contained in:
Costa Tsaousis 2025-02-25 21:03:10 +00:00 committed by GitHub
parent ede4ba0182
commit 84a207f559
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 77 additions and 25 deletions

View file

@ -173,7 +173,7 @@ void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char
exit(ret);
}
run = true;
daemon_status_file_save(DAEMON_STATUS_EXITING);
daemon_status_file_update_status(DAEMON_STATUS_EXITING);
nd_log_limits_unlimited();
netdata_log_exit_reason();
@ -322,7 +322,7 @@ void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char
watcher_thread_stop();
curl_global_cleanup();
daemon_status_file_save(DAEMON_STATUS_EXITED);
daemon_status_file_update_status(DAEMON_STATUS_EXITED);
#ifdef OS_WINDOWS
return;

View file

@ -37,13 +37,14 @@ ENUM_STR_DEFINE_FUNCTIONS(DAEMON_OS_TYPE, DAEMON_OS_TYPE_UNKNOWN, "unknown");
static DAEMON_STATUS_FILE last_session_status = { 0 };
static DAEMON_STATUS_FILE session_status = { 0 };
static SPINLOCK dsf_spinlock = SPINLOCK_INITIALIZER;
// --------------------------------------------------------------------------------------------------------------------
// json generation
static void daemon_status_file_to_json(BUFFER *wb, DAEMON_STATUS_FILE *ds) {
buffer_json_member_add_datetime_rfc3339(wb, "@timestamp", ds->timestamp_ut, true); // ECS
buffer_json_member_add_uint64(wb, "version", 1); // custom
buffer_json_member_add_uint64(wb, "version", 2); // custom
buffer_json_member_add_object(wb, "agent"); // ECS
{
@ -116,7 +117,7 @@ static void daemon_status_file_to_json(BUFFER *wb, DAEMON_STATUS_FILE *ds) {
}
buffer_json_object_close(wb);
buffer_json_member_add_object(wb, "fatal");
buffer_json_member_add_object(wb, "fatal"); // custom
{
buffer_json_member_add_uint64(wb, "line", ds->fatal.line);
buffer_json_member_add_string_or_empty(wb, "filename", ds->fatal.filename);
@ -125,6 +126,14 @@ static void daemon_status_file_to_json(BUFFER *wb, DAEMON_STATUS_FILE *ds) {
buffer_json_member_add_string_or_empty(wb, "stack_trace", ds->fatal.stack_trace);
}
buffer_json_object_close(wb);
buffer_json_member_add_object(wb, "dedup"); // custom
{
buffer_json_member_add_time_t(wb, "timestamp", ds->dedup.timestamp); // custom
buffer_json_member_add_string(wb, "status", DAEMON_STATUS_2str(ds->dedup.status)); // custom
EXIT_REASON_2json(wb, "exit_reason", ds->dedup.exit_reason); // custom
}
buffer_json_object_close(wb);
}
// --------------------------------------------------------------------------------------------------------------------
@ -141,6 +150,7 @@ static bool daemon_status_file_from_json(json_object *jobj, void *data, BUFFER *
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "version", version, error, true);
bool required = false; // allow missing fields and values
bool required_v2 = false; // allow missing fields and values for version 2
// Parse timestamp
JSONC_PARSE_TXT2CHAR_OR_ERROR_AND_RETURN(jobj, path, "@timestamp", datetime, error, required);
@ -215,13 +225,22 @@ static bool daemon_status_file_from_json(json_object *jobj, void *data, BUFFER *
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "line", ds->fatal.line, error, required);
});
// Parse the last posted object
JSONC_PARSE_SUBOBJECT(jobj, path, "dedup", error, required_v2, {
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "timestamp", ds->dedup.timestamp, error, required_v2);
JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "status", DAEMON_STATUS_2id, ds->dedup.status, error, required_v2);
JSONC_PARSE_ARRAY_OF_TXT2BITMAP_OR_ERROR_AND_RETURN(jobj, path, "exit_reason", EXIT_REASON_2id_one, ds->dedup.exit_reason, error, required_v2);
});
return true;
}
// --------------------------------------------------------------------------------------------------------------------
// get the current status
static DAEMON_STATUS_FILE daemon_status_file_get(DAEMON_STATUS status) {
static void daemon_status_file_refresh(DAEMON_STATUS status) {
spinlock_lock(&dsf_spinlock);
usec_t now_ut = now_realtime_usec();
#if defined(OS_LINUX)
@ -295,6 +314,11 @@ static DAEMON_STATUS_FILE daemon_status_file_get(DAEMON_STATUS status) {
session_status.os_id = strdupz(last_session_status.os_id);
if(!session_status.os_id_like && last_session_status.os_id_like)
session_status.os_id_like = strdupz(last_session_status.os_id_like);
if(!session_status.dedup.timestamp) {
session_status.dedup.timestamp = last_session_status.dedup.timestamp;
session_status.dedup.status = last_session_status.dedup.status;
session_status.dedup.exit_reason = last_session_status.dedup.exit_reason;
}
get_daemon_status_fields_from_system_info(&session_status);
@ -307,7 +331,7 @@ static DAEMON_STATUS_FILE daemon_status_file_get(DAEMON_STATUS status) {
session_status.memory = os_system_memory(true);
session_status.var_cache = os_disk_space(netdata_configured_cache_dir);
return session_status;
spinlock_unlock(&dsf_spinlock);
}
// --------------------------------------------------------------------------------------------------------------------
@ -439,17 +463,13 @@ static bool save_status_file(const char *directory, const char *content, size_t
return true;
}
void daemon_status_file_save(DAEMON_STATUS status) {
static SPINLOCK spinlock = SPINLOCK_INITIALIZER;
spinlock_lock(&spinlock);
// Get current status
DAEMON_STATUS_FILE ds = daemon_status_file_get(status);
static void daemon_status_file_save(DAEMON_STATUS_FILE *ds) {
spinlock_lock(&dsf_spinlock);
// Prepare JSON content
CLEAN_BUFFER *wb = buffer_create(0, NULL);
buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT);
daemon_status_file_to_json(wb, &ds);
daemon_status_file_to_json(wb, ds);
buffer_json_finalize(wb);
const char *content = buffer_tostring(wb);
@ -476,7 +496,12 @@ void daemon_status_file_save(DAEMON_STATUS status) {
if (!saved)
nd_log(NDLS_DAEMON, NDLP_ERR, "Failed to save status file in any location");
spinlock_unlock(&spinlock);
spinlock_unlock(&dsf_spinlock);
}
// Public entry point: refresh the global session_status for the given `status`
// and then write it to the status file.
// Each helper acquires and releases dsf_spinlock on its own, so no lock is held
// here between the refresh and the save.
void daemon_status_file_update_status(DAEMON_STATUS status) {
daemon_status_file_refresh(status);
daemon_status_file_save(&session_status);
}
// --------------------------------------------------------------------------------------------------------------------
@ -512,7 +537,14 @@ void post_status_file(struct post_status_file_thread_data *d) {
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
CURLcode rc = curl_easy_perform(curl);
(void)rc;
if(rc == CURLE_OK) {
spinlock_lock(&dsf_spinlock);
session_status.dedup.timestamp = now_realtime_sec();
session_status.dedup.status = d->status.status;
session_status.dedup.exit_reason = d->status.exit_reason;
spinlock_unlock(&dsf_spinlock);
daemon_status_file_save(&session_status);
}
curl_easy_cleanup(curl);
curl_slist_free_all(headers);
@ -532,7 +564,7 @@ void *post_status_file_thread(void *ptr) {
void daemon_status_file_check_crash(void) {
last_session_status = daemon_status_file_load();
daemon_status_file_save(DAEMON_STATUS_INITIALIZING);
daemon_status_file_update_status(DAEMON_STATUS_INITIALIZING);
ND_LOG_FIELD_PRIORITY pri = NDLP_NOTICE;
bool new_version = strcmp(last_session_status.version, session_status.version) != 0;
@ -592,12 +624,24 @@ void daemon_status_file_check_crash(void) {
cause = "disk full";
msg = "Netdata couldn't start because the disk is full";
}
else if (OS_SYSTEM_DISK_SPACE_OK(last_session_status.var_cache) &&
last_session_status.var_cache.free_bytes < 1 * 1024 * 1024) {
cause = "disk almost full";
msg = "Netdata couldn't start while the disk is almost full";
}
else {
cause = "crashed on start";
msg = "Netdata was last killed/crashed while starting";
}
pri = NDLP_ERR;
post_crash_report = true;
if(session_status.dedup.status == DAEMON_STATUS_INITIALIZING &&
now_realtime_sec() - last_session_status.dedup.timestamp < 86400) {
// we have already posted this crash
disable_crash_report = true;
}
break;
case DAEMON_STATUS_EXITING:
@ -678,19 +722,18 @@ void daemon_status_file_startup_step(const char *step) {
freez((char *)session_status.fatal.function);
session_status.fatal.function = step ? strdupz(step) : NULL;
if(step != NULL)
daemon_status_file_save(DAEMON_STATUS_NONE);
daemon_status_file_update_status(DAEMON_STATUS_NONE);
}
// --------------------------------------------------------------------------------------------------------------------
// nd_log() hook for receiving fatal message information
void daemon_status_file_register_fatal(const char *filename, const char *function, const char *message, const char *stack_trace, long line) {
static SPINLOCK spinlock = SPINLOCK_INITIALIZER;
spinlock_lock(&spinlock);
spinlock_lock(&dsf_spinlock);
// do not check the function, because it may have a startup step in it
if(session_status.fatal.filename || session_status.fatal.message || session_status.fatal.stack_trace) {
spinlock_unlock(&spinlock);
spinlock_unlock(&dsf_spinlock);
freez((void *)filename);
freez((void *)function);
freez((void *)message);
@ -705,5 +748,8 @@ void daemon_status_file_register_fatal(const char *filename, const char *functio
session_status.fatal.stack_trace = stack_trace;
session_status.fatal.line = line;
spinlock_unlock(&spinlock);
spinlock_unlock(&dsf_spinlock);
exit_initiated |= EXIT_REASON_FATAL;
daemon_status_file_save(&session_status);
}

View file

@ -66,13 +66,19 @@ typedef struct daemon_status_file {
const char *stack_trace;
const char *message;
} fatal;
struct {
time_t timestamp;
DAEMON_STATUS status;
EXIT_REASON exit_reason;
} dedup;
} DAEMON_STATUS_FILE;
// loads the last status saved
DAEMON_STATUS_FILE daemon_status_file_load(void);
// saves the current status
void daemon_status_file_save(DAEMON_STATUS status);
void daemon_status_file_update_status(DAEMON_STATUS status);
// check for a crash
void daemon_status_file_check_crash(void);

View file

@ -1058,7 +1058,7 @@ int netdata_main(int argc, char **argv) {
webrtc_initialize();
daemon_status_file_startup_step(NULL);
daemon_status_file_save(DAEMON_STATUS_RUNNING);
daemon_status_file_update_status(DAEMON_STATUS_RUNNING);
return 10;
}

View file

@ -113,7 +113,7 @@ void nd_process_signals(void) {
// is delivered that either terminates the process or causes the invocation
// of a signal-catching function.
if(pause() == -1 && errno == EINTR) {
daemon_status_file_save(DAEMON_STATUS_NONE);
daemon_status_file_update_status(DAEMON_STATUS_NONE);
errno_clear();
// loop once, but keep looping while signals are coming in,
@ -157,7 +157,7 @@ void nd_process_signals(void) {
case NETDATA_SIGNAL_FATAL:
nd_log_limits_unlimited();
exit_initiated_set(signals_waiting[i].reason);
daemon_status_file_save(DAEMON_STATUS_NONE);
daemon_status_file_update_status(DAEMON_STATUS_NONE);
fatal("SIGNAL: Received %s. netdata now exits.", name);
break;