mirror of
https://github.com/netdata/netdata.git
synced 2025-04-23 04:50:22 +00:00
Avoid post initialization errors repeateadly (#19709)
do not post initialization errors repeateadly; detect almost full disks; increase status file version
This commit is contained in:
parent
ede4ba0182
commit
84a207f559
5 changed files with 77 additions and 25 deletions
|
@ -173,7 +173,7 @@ void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char
|
||||||
exit(ret);
|
exit(ret);
|
||||||
}
|
}
|
||||||
run = true;
|
run = true;
|
||||||
daemon_status_file_save(DAEMON_STATUS_EXITING);
|
daemon_status_file_update_status(DAEMON_STATUS_EXITING);
|
||||||
|
|
||||||
nd_log_limits_unlimited();
|
nd_log_limits_unlimited();
|
||||||
netdata_log_exit_reason();
|
netdata_log_exit_reason();
|
||||||
|
@ -322,7 +322,7 @@ void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char
|
||||||
watcher_thread_stop();
|
watcher_thread_stop();
|
||||||
curl_global_cleanup();
|
curl_global_cleanup();
|
||||||
|
|
||||||
daemon_status_file_save(DAEMON_STATUS_EXITED);
|
daemon_status_file_update_status(DAEMON_STATUS_EXITED);
|
||||||
|
|
||||||
#ifdef OS_WINDOWS
|
#ifdef OS_WINDOWS
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -37,13 +37,14 @@ ENUM_STR_DEFINE_FUNCTIONS(DAEMON_OS_TYPE, DAEMON_OS_TYPE_UNKNOWN, "unknown");
|
||||||
|
|
||||||
static DAEMON_STATUS_FILE last_session_status = { 0 };
|
static DAEMON_STATUS_FILE last_session_status = { 0 };
|
||||||
static DAEMON_STATUS_FILE session_status = { 0 };
|
static DAEMON_STATUS_FILE session_status = { 0 };
|
||||||
|
static SPINLOCK dsf_spinlock = SPINLOCK_INITIALIZER;
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------------
|
||||||
// json generation
|
// json generation
|
||||||
|
|
||||||
static void daemon_status_file_to_json(BUFFER *wb, DAEMON_STATUS_FILE *ds) {
|
static void daemon_status_file_to_json(BUFFER *wb, DAEMON_STATUS_FILE *ds) {
|
||||||
buffer_json_member_add_datetime_rfc3339(wb, "@timestamp", ds->timestamp_ut, true); // ECS
|
buffer_json_member_add_datetime_rfc3339(wb, "@timestamp", ds->timestamp_ut, true); // ECS
|
||||||
buffer_json_member_add_uint64(wb, "version", 1); // custom
|
buffer_json_member_add_uint64(wb, "version", 2); // custom
|
||||||
|
|
||||||
buffer_json_member_add_object(wb, "agent"); // ECS
|
buffer_json_member_add_object(wb, "agent"); // ECS
|
||||||
{
|
{
|
||||||
|
@ -116,7 +117,7 @@ static void daemon_status_file_to_json(BUFFER *wb, DAEMON_STATUS_FILE *ds) {
|
||||||
}
|
}
|
||||||
buffer_json_object_close(wb);
|
buffer_json_object_close(wb);
|
||||||
|
|
||||||
buffer_json_member_add_object(wb, "fatal");
|
buffer_json_member_add_object(wb, "fatal"); // custom
|
||||||
{
|
{
|
||||||
buffer_json_member_add_uint64(wb, "line", ds->fatal.line);
|
buffer_json_member_add_uint64(wb, "line", ds->fatal.line);
|
||||||
buffer_json_member_add_string_or_empty(wb, "filename", ds->fatal.filename);
|
buffer_json_member_add_string_or_empty(wb, "filename", ds->fatal.filename);
|
||||||
|
@ -125,6 +126,14 @@ static void daemon_status_file_to_json(BUFFER *wb, DAEMON_STATUS_FILE *ds) {
|
||||||
buffer_json_member_add_string_or_empty(wb, "stack_trace", ds->fatal.stack_trace);
|
buffer_json_member_add_string_or_empty(wb, "stack_trace", ds->fatal.stack_trace);
|
||||||
}
|
}
|
||||||
buffer_json_object_close(wb);
|
buffer_json_object_close(wb);
|
||||||
|
|
||||||
|
buffer_json_member_add_object(wb, "dedup"); // custom
|
||||||
|
{
|
||||||
|
buffer_json_member_add_time_t(wb, "timestamp", ds->dedup.timestamp); // custom
|
||||||
|
buffer_json_member_add_string(wb, "status", DAEMON_STATUS_2str(ds->dedup.status)); // custom
|
||||||
|
EXIT_REASON_2json(wb, "exit_reason", ds->dedup.exit_reason); // custom
|
||||||
|
}
|
||||||
|
buffer_json_object_close(wb);
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------------
|
||||||
|
@ -141,6 +150,7 @@ static bool daemon_status_file_from_json(json_object *jobj, void *data, BUFFER *
|
||||||
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "version", version, error, true);
|
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "version", version, error, true);
|
||||||
|
|
||||||
bool required = false; // allow missing fields and values
|
bool required = false; // allow missing fields and values
|
||||||
|
bool required_v2 = false; // allow missing fields and values for version 2
|
||||||
|
|
||||||
// Parse timestamp
|
// Parse timestamp
|
||||||
JSONC_PARSE_TXT2CHAR_OR_ERROR_AND_RETURN(jobj, path, "@timestamp", datetime, error, required);
|
JSONC_PARSE_TXT2CHAR_OR_ERROR_AND_RETURN(jobj, path, "@timestamp", datetime, error, required);
|
||||||
|
@ -215,13 +225,22 @@ static bool daemon_status_file_from_json(json_object *jobj, void *data, BUFFER *
|
||||||
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "line", ds->fatal.line, error, required);
|
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "line", ds->fatal.line, error, required);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Parse the last posted object
|
||||||
|
JSONC_PARSE_SUBOBJECT(jobj, path, "dedup", error, required_v2, {
|
||||||
|
JSONC_PARSE_UINT64_OR_ERROR_AND_RETURN(jobj, path, "timestamp", ds->dedup.timestamp, error, required_v2);
|
||||||
|
JSONC_PARSE_TXT2ENUM_OR_ERROR_AND_RETURN(jobj, path, "status", DAEMON_STATUS_2id, ds->dedup.status, error, required_v2);
|
||||||
|
JSONC_PARSE_ARRAY_OF_TXT2BITMAP_OR_ERROR_AND_RETURN(jobj, path, "exit_reason", EXIT_REASON_2id_one, ds->dedup.exit_reason, error, required_v2);
|
||||||
|
});
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------------
|
||||||
// get the current status
|
// get the current status
|
||||||
|
|
||||||
static DAEMON_STATUS_FILE daemon_status_file_get(DAEMON_STATUS status) {
|
static void daemon_status_file_refresh(DAEMON_STATUS status) {
|
||||||
|
spinlock_lock(&dsf_spinlock);
|
||||||
|
|
||||||
usec_t now_ut = now_realtime_usec();
|
usec_t now_ut = now_realtime_usec();
|
||||||
|
|
||||||
#if defined(OS_LINUX)
|
#if defined(OS_LINUX)
|
||||||
|
@ -295,6 +314,11 @@ static DAEMON_STATUS_FILE daemon_status_file_get(DAEMON_STATUS status) {
|
||||||
session_status.os_id = strdupz(last_session_status.os_id);
|
session_status.os_id = strdupz(last_session_status.os_id);
|
||||||
if(!session_status.os_id_like && last_session_status.os_id_like)
|
if(!session_status.os_id_like && last_session_status.os_id_like)
|
||||||
session_status.os_id_like = strdupz(last_session_status.os_id_like);
|
session_status.os_id_like = strdupz(last_session_status.os_id_like);
|
||||||
|
if(!session_status.dedup.timestamp) {
|
||||||
|
session_status.dedup.timestamp = last_session_status.dedup.timestamp;
|
||||||
|
session_status.dedup.status = last_session_status.dedup.status;
|
||||||
|
session_status.dedup.exit_reason = last_session_status.dedup.exit_reason;
|
||||||
|
}
|
||||||
|
|
||||||
get_daemon_status_fields_from_system_info(&session_status);
|
get_daemon_status_fields_from_system_info(&session_status);
|
||||||
|
|
||||||
|
@ -307,7 +331,7 @@ static DAEMON_STATUS_FILE daemon_status_file_get(DAEMON_STATUS status) {
|
||||||
session_status.memory = os_system_memory(true);
|
session_status.memory = os_system_memory(true);
|
||||||
session_status.var_cache = os_disk_space(netdata_configured_cache_dir);
|
session_status.var_cache = os_disk_space(netdata_configured_cache_dir);
|
||||||
|
|
||||||
return session_status;
|
spinlock_unlock(&dsf_spinlock);
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------------
|
||||||
|
@ -439,17 +463,13 @@ static bool save_status_file(const char *directory, const char *content, size_t
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void daemon_status_file_save(DAEMON_STATUS status) {
|
static void daemon_status_file_save(DAEMON_STATUS_FILE *ds) {
|
||||||
static SPINLOCK spinlock = SPINLOCK_INITIALIZER;
|
spinlock_lock(&dsf_spinlock);
|
||||||
spinlock_lock(&spinlock);
|
|
||||||
|
|
||||||
// Get current status
|
|
||||||
DAEMON_STATUS_FILE ds = daemon_status_file_get(status);
|
|
||||||
|
|
||||||
// Prepare JSON content
|
// Prepare JSON content
|
||||||
CLEAN_BUFFER *wb = buffer_create(0, NULL);
|
CLEAN_BUFFER *wb = buffer_create(0, NULL);
|
||||||
buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT);
|
buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_DEFAULT);
|
||||||
daemon_status_file_to_json(wb, &ds);
|
daemon_status_file_to_json(wb, ds);
|
||||||
buffer_json_finalize(wb);
|
buffer_json_finalize(wb);
|
||||||
|
|
||||||
const char *content = buffer_tostring(wb);
|
const char *content = buffer_tostring(wb);
|
||||||
|
@ -476,7 +496,12 @@ void daemon_status_file_save(DAEMON_STATUS status) {
|
||||||
if (!saved)
|
if (!saved)
|
||||||
nd_log(NDLS_DAEMON, NDLP_ERR, "Failed to save status file in any location");
|
nd_log(NDLS_DAEMON, NDLP_ERR, "Failed to save status file in any location");
|
||||||
|
|
||||||
spinlock_unlock(&spinlock);
|
spinlock_unlock(&dsf_spinlock);
|
||||||
|
}
|
||||||
|
|
||||||
|
void daemon_status_file_update_status(DAEMON_STATUS status) {
|
||||||
|
daemon_status_file_refresh(status);
|
||||||
|
daemon_status_file_save(&session_status);
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------------
|
||||||
|
@ -512,7 +537,14 @@ void post_status_file(struct post_status_file_thread_data *d) {
|
||||||
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
|
curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
|
||||||
|
|
||||||
CURLcode rc = curl_easy_perform(curl);
|
CURLcode rc = curl_easy_perform(curl);
|
||||||
(void)rc;
|
if(rc == CURLE_OK) {
|
||||||
|
spinlock_lock(&dsf_spinlock);
|
||||||
|
session_status.dedup.timestamp = now_realtime_sec();
|
||||||
|
session_status.dedup.status = d->status.status;
|
||||||
|
session_status.dedup.exit_reason = d->status.exit_reason;
|
||||||
|
spinlock_unlock(&dsf_spinlock);
|
||||||
|
daemon_status_file_save(&session_status);
|
||||||
|
}
|
||||||
|
|
||||||
curl_easy_cleanup(curl);
|
curl_easy_cleanup(curl);
|
||||||
curl_slist_free_all(headers);
|
curl_slist_free_all(headers);
|
||||||
|
@ -532,7 +564,7 @@ void *post_status_file_thread(void *ptr) {
|
||||||
|
|
||||||
void daemon_status_file_check_crash(void) {
|
void daemon_status_file_check_crash(void) {
|
||||||
last_session_status = daemon_status_file_load();
|
last_session_status = daemon_status_file_load();
|
||||||
daemon_status_file_save(DAEMON_STATUS_INITIALIZING);
|
daemon_status_file_update_status(DAEMON_STATUS_INITIALIZING);
|
||||||
ND_LOG_FIELD_PRIORITY pri = NDLP_NOTICE;
|
ND_LOG_FIELD_PRIORITY pri = NDLP_NOTICE;
|
||||||
|
|
||||||
bool new_version = strcmp(last_session_status.version, session_status.version) != 0;
|
bool new_version = strcmp(last_session_status.version, session_status.version) != 0;
|
||||||
|
@ -592,12 +624,24 @@ void daemon_status_file_check_crash(void) {
|
||||||
cause = "disk full";
|
cause = "disk full";
|
||||||
msg = "Netdata couldn't start because the disk is full";
|
msg = "Netdata couldn't start because the disk is full";
|
||||||
}
|
}
|
||||||
|
else if (OS_SYSTEM_DISK_SPACE_OK(last_session_status.var_cache) &&
|
||||||
|
last_session_status.var_cache.free_bytes < 1 * 1024 * 1024) {
|
||||||
|
cause = "disk almost full";
|
||||||
|
msg = "Netdata couldn't start while the disk is almost full";
|
||||||
|
}
|
||||||
else {
|
else {
|
||||||
cause = "crashed on start";
|
cause = "crashed on start";
|
||||||
msg = "Netdata was last killed/crashed while starting";
|
msg = "Netdata was last killed/crashed while starting";
|
||||||
}
|
}
|
||||||
pri = NDLP_ERR;
|
pri = NDLP_ERR;
|
||||||
post_crash_report = true;
|
post_crash_report = true;
|
||||||
|
|
||||||
|
if(session_status.dedup.status == DAEMON_STATUS_INITIALIZING &&
|
||||||
|
now_realtime_sec() - last_session_status.dedup.timestamp < 86400) {
|
||||||
|
// we have already posted this crash
|
||||||
|
disable_crash_report = true;
|
||||||
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case DAEMON_STATUS_EXITING:
|
case DAEMON_STATUS_EXITING:
|
||||||
|
@ -678,19 +722,18 @@ void daemon_status_file_startup_step(const char *step) {
|
||||||
freez((char *)session_status.fatal.function);
|
freez((char *)session_status.fatal.function);
|
||||||
session_status.fatal.function = step ? strdupz(step) : NULL;
|
session_status.fatal.function = step ? strdupz(step) : NULL;
|
||||||
if(step != NULL)
|
if(step != NULL)
|
||||||
daemon_status_file_save(DAEMON_STATUS_NONE);
|
daemon_status_file_update_status(DAEMON_STATUS_NONE);
|
||||||
}
|
}
|
||||||
|
|
||||||
// --------------------------------------------------------------------------------------------------------------------
|
// --------------------------------------------------------------------------------------------------------------------
|
||||||
// ng_log() hook for receiving fatal message information
|
// ng_log() hook for receiving fatal message information
|
||||||
|
|
||||||
void daemon_status_file_register_fatal(const char *filename, const char *function, const char *message, const char *stack_trace, long line) {
|
void daemon_status_file_register_fatal(const char *filename, const char *function, const char *message, const char *stack_trace, long line) {
|
||||||
static SPINLOCK spinlock = SPINLOCK_INITIALIZER;
|
spinlock_lock(&dsf_spinlock);
|
||||||
spinlock_lock(&spinlock);
|
|
||||||
|
|
||||||
// do not check the function, because it may have a startup step in it
|
// do not check the function, because it may have a startup step in it
|
||||||
if(session_status.fatal.filename || session_status.fatal.message || session_status.fatal.stack_trace) {
|
if(session_status.fatal.filename || session_status.fatal.message || session_status.fatal.stack_trace) {
|
||||||
spinlock_unlock(&spinlock);
|
spinlock_unlock(&dsf_spinlock);
|
||||||
freez((void *)filename);
|
freez((void *)filename);
|
||||||
freez((void *)function);
|
freez((void *)function);
|
||||||
freez((void *)message);
|
freez((void *)message);
|
||||||
|
@ -705,5 +748,8 @@ void daemon_status_file_register_fatal(const char *filename, const char *functio
|
||||||
session_status.fatal.stack_trace = stack_trace;
|
session_status.fatal.stack_trace = stack_trace;
|
||||||
session_status.fatal.line = line;
|
session_status.fatal.line = line;
|
||||||
|
|
||||||
spinlock_unlock(&spinlock);
|
spinlock_unlock(&dsf_spinlock);
|
||||||
|
|
||||||
|
exit_initiated |= EXIT_REASON_FATAL;
|
||||||
|
daemon_status_file_save(&session_status);
|
||||||
}
|
}
|
||||||
|
|
|
@ -66,13 +66,19 @@ typedef struct daemon_status_file {
|
||||||
const char *stack_trace;
|
const char *stack_trace;
|
||||||
const char *message;
|
const char *message;
|
||||||
} fatal;
|
} fatal;
|
||||||
|
|
||||||
|
struct {
|
||||||
|
time_t timestamp;
|
||||||
|
DAEMON_STATUS status;
|
||||||
|
EXIT_REASON exit_reason;
|
||||||
|
} dedup;
|
||||||
} DAEMON_STATUS_FILE;
|
} DAEMON_STATUS_FILE;
|
||||||
|
|
||||||
// loads the last status saved
|
// loads the last status saved
|
||||||
DAEMON_STATUS_FILE daemon_status_file_load(void);
|
DAEMON_STATUS_FILE daemon_status_file_load(void);
|
||||||
|
|
||||||
// saves the current status
|
// saves the current status
|
||||||
void daemon_status_file_save(DAEMON_STATUS status);
|
void daemon_status_file_update_status(DAEMON_STATUS status);
|
||||||
|
|
||||||
// check for a crash
|
// check for a crash
|
||||||
void daemon_status_file_check_crash(void);
|
void daemon_status_file_check_crash(void);
|
||||||
|
|
|
@ -1058,7 +1058,7 @@ int netdata_main(int argc, char **argv) {
|
||||||
webrtc_initialize();
|
webrtc_initialize();
|
||||||
|
|
||||||
daemon_status_file_startup_step(NULL);
|
daemon_status_file_startup_step(NULL);
|
||||||
daemon_status_file_save(DAEMON_STATUS_RUNNING);
|
daemon_status_file_update_status(DAEMON_STATUS_RUNNING);
|
||||||
return 10;
|
return 10;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -113,7 +113,7 @@ void nd_process_signals(void) {
|
||||||
// is delivered that either terminates the process or causes the invocation
|
// is delivered that either terminates the process or causes the invocation
|
||||||
// of a signal-catching function.
|
// of a signal-catching function.
|
||||||
if(pause() == -1 && errno == EINTR) {
|
if(pause() == -1 && errno == EINTR) {
|
||||||
daemon_status_file_save(DAEMON_STATUS_NONE);
|
daemon_status_file_update_status(DAEMON_STATUS_NONE);
|
||||||
errno_clear();
|
errno_clear();
|
||||||
|
|
||||||
// loop once, but keep looping while signals are coming in,
|
// loop once, but keep looping while signals are coming in,
|
||||||
|
@ -157,7 +157,7 @@ void nd_process_signals(void) {
|
||||||
case NETDATA_SIGNAL_FATAL:
|
case NETDATA_SIGNAL_FATAL:
|
||||||
nd_log_limits_unlimited();
|
nd_log_limits_unlimited();
|
||||||
exit_initiated_set(signals_waiting[i].reason);
|
exit_initiated_set(signals_waiting[i].reason);
|
||||||
daemon_status_file_save(DAEMON_STATUS_NONE);
|
daemon_status_file_update_status(DAEMON_STATUS_NONE);
|
||||||
fatal("SIGNAL: Received %s. netdata now exits.", name);
|
fatal("SIGNAL: Received %s. netdata now exits.", name);
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue