mirror of
https://github.com/netdata/netdata.git
synced 2025-05-02 08:20:10 +00:00
User configurable crash reporting (#19789)
* use a counter for temp status files * provide user configuration for reporting crashes * docs for crash reports * updated documentation * revert condition
This commit is contained in:
parent
fb67eb9c9c
commit
35c6875bbb
2 changed files with 83 additions and 61 deletions
src/daemon
|
@ -35,17 +35,18 @@ After `netdata.conf` has been modified, Netdata needs to be [restarted](/docs/ne
|
||||||
|
|
||||||
### `global` section options
|
### `global` section options
|
||||||
|
|
||||||
| setting | default | info |
|
| setting | default | info |
|
||||||
|:----------------------------------:|:-------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|:----------------------------------:|:--------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| process scheduling policy | `keep` | See [Netdata process scheduling policy](/src/daemon/README.md#process-scheduling-policy-unix-only) |
|
| process scheduling policy | `keep` | See [Netdata process scheduling policy](/src/daemon/README.md#process-scheduling-policy-unix-only) |
|
||||||
| OOM score | `0` | |
|
| OOM score | `0` | |
|
||||||
| glibc malloc arena max for plugins | `1` | |
|
| glibc malloc arena max for plugins | `1` | |
|
||||||
| glibc malloc arena max for Netdata | `1` | |
|
| glibc malloc arena max for Netdata | `1` | |
|
||||||
| hostname | auto-detected | The hostname of the computer running Netdata. |
|
| hostname | auto-detected | The hostname of the computer running Netdata. |
|
||||||
| host access prefix | empty | This is used in Docker environments where /proc, /sys, etc have to be accessed via another path. You may also have to set the SYS_PTRACE capability on the Docker container for this to work. Check [issue 43](https://github.com/netdata/netdata/issues/43). |
|
| host access prefix | empty | This is used in Docker environments where /proc, /sys, etc have to be accessed via another path. You may also have to set the SYS_PTRACE capability on the Docker container for this to work. Check [issue 43](https://github.com/netdata/netdata/issues/43). |
|
||||||
| timezone | auto-detected | The timezone retrieved from the environment variable |
|
| timezone | auto-detected | The timezone retrieved from the environment variable |
|
||||||
| run as user | `netdata` | The user Netdata will run as. |
|
| run as user | `netdata` | The user Netdata will run as. |
|
||||||
| pthread stack size | auto-detected | |
|
| pthread stack size | auto-detected | |
|
||||||
|
| crash reports | `all` or `off` | Defaults to `off` when anonymous telemetry is disabled, otherwise to `all`. When set to `all`, Netdata reports agent restarts and crashes. It can also be set to `crashes` to report only crashes. Each kind of event is deduplicated and reported at most once per day. |
|
||||||
|
|
||||||
### `db` section options
|
### `db` section options
|
||||||
|
|
||||||
|
|
|
@ -568,45 +568,34 @@ static bool save_status_file(const char *directory, const char *content, size_t
|
||||||
if(!directory || !*directory)
|
if(!directory || !*directory)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
|
static uint64_t tmp_attempt_counter = 0;
|
||||||
|
|
||||||
char filename[FILENAME_MAX];
|
char filename[FILENAME_MAX];
|
||||||
char temp_filename[FILENAME_MAX];
|
char temp_filename[FILENAME_MAX];
|
||||||
|
char tid_str[UINT64_MAX_LENGTH];
|
||||||
|
|
||||||
/* Construct filenames using async-safe string operations */
|
print_uint64(tid_str, __atomic_add_fetch(&tmp_attempt_counter, 1, __ATOMIC_RELAXED));
|
||||||
/* Using simple string concatenation instead of snprintf */
|
|
||||||
size_t dir_len = strlen(directory);
|
size_t dir_len = strlen(directory);
|
||||||
if (dir_len + 1 + strlen(STATUS_FILENAME) >= FILENAME_MAX)
|
size_t fil_len = strlen(STATUS_FILENAME);
|
||||||
return false; /* Path too long */
|
size_t tid_len = strlen(tid_str);
|
||||||
|
|
||||||
memcpy(filename, directory, dir_len);
|
if (dir_len + 1 + fil_len + 1 + tid_len + 1 >= sizeof(filename))
|
||||||
filename[dir_len] = '/';
|
return false; // cannot fit the filename
|
||||||
memcpy(filename + dir_len + 1, STATUS_FILENAME, strlen(STATUS_FILENAME) + 1);
|
|
||||||
|
|
||||||
/* Create a unique temp filename using thread id */
|
// create the filename
|
||||||
unsigned int tid = (unsigned int)gettid_cached();
|
size_t pos = 0;
|
||||||
char tid_str[16];
|
memcpy(&filename[pos], directory, dir_len); pos += dir_len;
|
||||||
char *tid_ptr = tid_str + sizeof(tid_str) - 1;
|
filename[pos] = '/'; pos++;
|
||||||
*tid_ptr = '\0';
|
memcpy(&filename[pos], STATUS_FILENAME, fil_len); pos += fil_len;
|
||||||
|
filename[pos] = '\0';
|
||||||
|
|
||||||
unsigned int tid_copy = tid;
|
// create the temp filename
|
||||||
do {
|
memcpy(temp_filename, filename, pos);
|
||||||
tid_ptr--;
|
temp_filename[pos] = '-'; pos++;
|
||||||
*tid_ptr = "0123456789abcdef"[tid_copy & 0xf];
|
memcpy(&temp_filename[pos], tid_str, tid_len); pos += tid_len;
|
||||||
tid_copy >>= 4;
|
temp_filename[pos] = '\0';
|
||||||
} while (tid_copy && tid_ptr > tid_str);
|
|
||||||
|
|
||||||
size_t temp_name_len = dir_len + 1 + strlen(STATUS_FILENAME) + 1 + (sizeof(tid_str) - (tid_ptr - tid_str));
|
// Open file with O_WRONLY, O_CREAT, and O_TRUNC flags
|
||||||
if (temp_name_len >= FILENAME_MAX)
|
|
||||||
return false; /* Path too long */
|
|
||||||
|
|
||||||
memcpy(temp_filename, directory, dir_len);
|
|
||||||
temp_filename[dir_len] = '/';
|
|
||||||
char *ptr = temp_filename + dir_len + 1;
|
|
||||||
memcpy(ptr, STATUS_FILENAME, strlen(STATUS_FILENAME));
|
|
||||||
ptr += strlen(STATUS_FILENAME);
|
|
||||||
*ptr++ = '-';
|
|
||||||
memcpy(ptr, tid_ptr, strlen(tid_ptr) + 1);
|
|
||||||
|
|
||||||
/* Open file with O_WRONLY, O_CREAT, and O_TRUNC flags */
|
|
||||||
int fd = open(temp_filename, O_WRONLY | O_CREAT | O_TRUNC, 0664);
|
int fd = open(temp_filename, O_WRONLY | O_CREAT | O_TRUNC, 0664);
|
||||||
if (fd == -1)
|
if (fd == -1)
|
||||||
return false;
|
return false;
|
||||||
|
@ -620,7 +609,7 @@ static bool save_status_file(const char *directory, const char *content, size_t
|
||||||
|
|
||||||
if (bytes_written == -1) {
|
if (bytes_written == -1) {
|
||||||
if (errno == EINTR)
|
if (errno == EINTR)
|
||||||
continue; /* Retry if interrupted by signal */
|
continue; /* Retry if interrupted by signal */
|
||||||
|
|
||||||
close(fd);
|
close(fd);
|
||||||
unlink(temp_filename); /* Remove the temp file */
|
unlink(temp_filename); /* Remove the temp file */
|
||||||
|
@ -822,6 +811,30 @@ static bool is_ci(void) {
|
||||||
return ci && *ci && strcasecmp(ci, "true") == 0;
|
return ci && *ci && strcasecmp(ci, "true") == 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
enum crash_report_t {
|
||||||
|
DSF_REPORT_DISABLED = 0,
|
||||||
|
DSF_REPORT_ALL,
|
||||||
|
DSF_REPORT_CRASHES,
|
||||||
|
};
|
||||||
|
|
||||||
|
static enum crash_report_t check_crash_reports_config(void) {
|
||||||
|
bool analytics = analytics_check_enabled();
|
||||||
|
|
||||||
|
const char *t = inicfg_get(&netdata_config, CONFIG_SECTION_GLOBAL, "crash reports", analytics ? "all" : "off");
|
||||||
|
|
||||||
|
enum crash_report_t rc;
|
||||||
|
if(!t || !*t)
|
||||||
|
rc = analytics ? DSF_REPORT_ALL : DSF_REPORT_DISABLED;
|
||||||
|
else if(strcmp(t, "all") == 0)
|
||||||
|
rc = DSF_REPORT_ALL;
|
||||||
|
else if(strcmp(t, "crashes") == 0)
|
||||||
|
rc = DSF_REPORT_CRASHES;
|
||||||
|
else
|
||||||
|
rc = DSF_REPORT_DISABLED;
|
||||||
|
|
||||||
|
return rc;
|
||||||
|
}
|
||||||
|
|
||||||
void daemon_status_file_check_crash(void) {
|
void daemon_status_file_check_crash(void) {
|
||||||
FUNCTION_RUN_ONCE();
|
FUNCTION_RUN_ONCE();
|
||||||
|
|
||||||
|
@ -834,8 +847,8 @@ void daemon_status_file_check_crash(void) {
|
||||||
struct log_priority pri = PRI_ALL_NORMAL;
|
struct log_priority pri = PRI_ALL_NORMAL;
|
||||||
|
|
||||||
bool new_version = strcmp(last_session_status.version, session_status.version) != 0;
|
bool new_version = strcmp(last_session_status.version, session_status.version) != 0;
|
||||||
bool post_crash_report = false;
|
bool this_is_a_crash = false;
|
||||||
bool disable_crash_report = false;
|
bool crash_report_ignore = false;
|
||||||
bool dump_json = true;
|
bool dump_json = true;
|
||||||
const char *msg = "", *cause = "";
|
const char *msg = "", *cause = "";
|
||||||
switch(last_session_status.status) {
|
switch(last_session_status.status) {
|
||||||
|
@ -844,7 +857,7 @@ void daemon_status_file_check_crash(void) {
|
||||||
// probably a previous version of netdata was running
|
// probably a previous version of netdata was running
|
||||||
cause = "no last status";
|
cause = "no last status";
|
||||||
msg = "No status found for the previous Netdata session";
|
msg = "No status found for the previous Netdata session";
|
||||||
disable_crash_report = true;
|
crash_report_ignore = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case DAEMON_STATUS_EXITED:
|
case DAEMON_STATUS_EXITED:
|
||||||
|
@ -858,14 +871,14 @@ void daemon_status_file_check_crash(void) {
|
||||||
cause = "deadly signal and exit";
|
cause = "deadly signal and exit";
|
||||||
msg = "Netdata was last stopped gracefully after receiving a deadly signal";
|
msg = "Netdata was last stopped gracefully after receiving a deadly signal";
|
||||||
pri = PRI_NETDATA_BUG;
|
pri = PRI_NETDATA_BUG;
|
||||||
post_crash_report = true;
|
this_is_a_crash = true;
|
||||||
}
|
}
|
||||||
else if(last_session_status.exit_reason != EXIT_REASON_NONE &&
|
else if(last_session_status.exit_reason != EXIT_REASON_NONE &&
|
||||||
!is_exit_reason_normal(last_session_status.exit_reason)) {
|
!is_exit_reason_normal(last_session_status.exit_reason)) {
|
||||||
cause = "fatal and exit";
|
cause = "fatal and exit";
|
||||||
msg = "Netdata was last stopped gracefully after it encountered a fatal error";
|
msg = "Netdata was last stopped gracefully after it encountered a fatal error";
|
||||||
pri = PRI_NETDATA_BUG;
|
pri = PRI_NETDATA_BUG;
|
||||||
post_crash_report = true;
|
this_is_a_crash = true;
|
||||||
}
|
}
|
||||||
else if(last_session_status.exit_reason & EXIT_REASON_SYSTEM_SHUTDOWN) {
|
else if(last_session_status.exit_reason & EXIT_REASON_SYSTEM_SHUTDOWN) {
|
||||||
cause = "exit on system shutdown";
|
cause = "exit on system shutdown";
|
||||||
|
@ -899,7 +912,7 @@ void daemon_status_file_check_crash(void) {
|
||||||
cause = "deadly signal on start";
|
cause = "deadly signal on start";
|
||||||
msg = "Netdata was last crashed while starting after receiving a deadly signal";
|
msg = "Netdata was last crashed while starting after receiving a deadly signal";
|
||||||
pri = PRI_NETDATA_BUG;
|
pri = PRI_NETDATA_BUG;
|
||||||
post_crash_report = true;
|
this_is_a_crash = true;
|
||||||
}
|
}
|
||||||
else if (last_session_status.exit_reason & EXIT_REASON_OUT_OF_MEMORY) {
|
else if (last_session_status.exit_reason & EXIT_REASON_OUT_OF_MEMORY) {
|
||||||
cause = "out of memory";
|
cause = "out of memory";
|
||||||
|
@ -940,8 +953,7 @@ void daemon_status_file_check_crash(void) {
|
||||||
msg = "Netdata was last killed/crashed while starting";
|
msg = "Netdata was last killed/crashed while starting";
|
||||||
pri = PRI_BAD_BUT_NO_REASON;
|
pri = PRI_BAD_BUT_NO_REASON;
|
||||||
}
|
}
|
||||||
post_crash_report = true;
|
this_is_a_crash = true;
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case DAEMON_STATUS_EXITING:
|
case DAEMON_STATUS_EXITING:
|
||||||
|
@ -949,7 +961,7 @@ void daemon_status_file_check_crash(void) {
|
||||||
cause = "deadly signal on exit";
|
cause = "deadly signal on exit";
|
||||||
msg = "Netdata was last crashed while exiting after receiving a deadly signal";
|
msg = "Netdata was last crashed while exiting after receiving a deadly signal";
|
||||||
pri = PRI_NETDATA_BUG;
|
pri = PRI_NETDATA_BUG;
|
||||||
post_crash_report = true;
|
this_is_a_crash = true;
|
||||||
}
|
}
|
||||||
else if(last_session_status.exit_reason != EXIT_REASON_NONE &&
|
else if(last_session_status.exit_reason != EXIT_REASON_NONE &&
|
||||||
!is_exit_reason_normal(last_session_status.exit_reason)) {
|
!is_exit_reason_normal(last_session_status.exit_reason)) {
|
||||||
|
@ -969,7 +981,7 @@ void daemon_status_file_check_crash(void) {
|
||||||
msg = "Netdata was last killed/crashed while it was instructed to exit";
|
msg = "Netdata was last killed/crashed while it was instructed to exit";
|
||||||
}
|
}
|
||||||
pri = PRI_NETDATA_BUG;
|
pri = PRI_NETDATA_BUG;
|
||||||
post_crash_report = true;
|
this_is_a_crash = true;
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case DAEMON_STATUS_RUNNING: {
|
case DAEMON_STATUS_RUNNING: {
|
||||||
|
@ -990,7 +1002,7 @@ void daemon_status_file_check_crash(void) {
|
||||||
cause = "deadly signal";
|
cause = "deadly signal";
|
||||||
msg = "Netdata was last crashed after receiving a deadly signal";
|
msg = "Netdata was last crashed after receiving a deadly signal";
|
||||||
pri = PRI_NETDATA_BUG;
|
pri = PRI_NETDATA_BUG;
|
||||||
post_crash_report = true;
|
this_is_a_crash = true;
|
||||||
}
|
}
|
||||||
else if (last_session_status.exit_reason != EXIT_REASON_NONE &&
|
else if (last_session_status.exit_reason != EXIT_REASON_NONE &&
|
||||||
!is_exit_reason_normal(last_session_status.exit_reason)) {
|
!is_exit_reason_normal(last_session_status.exit_reason)) {
|
||||||
|
@ -1002,7 +1014,7 @@ void daemon_status_file_check_crash(void) {
|
||||||
cause = "killed hard";
|
cause = "killed hard";
|
||||||
msg = "Netdata was last killed/crashed while operating normally";
|
msg = "Netdata was last killed/crashed while operating normally";
|
||||||
pri = PRI_BAD_BUT_NO_REASON;
|
pri = PRI_BAD_BUT_NO_REASON;
|
||||||
post_crash_report = true;
|
this_is_a_crash = true;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1025,22 +1037,31 @@ void daemon_status_file_check_crash(void) {
|
||||||
"Last exit status: %s (%s):\n\n%s",
|
"Last exit status: %s (%s):\n\n%s",
|
||||||
NETDATA_VERSION, msg, cause, buffer_tostring(wb));
|
NETDATA_VERSION, msg, cause, buffer_tostring(wb));
|
||||||
|
|
||||||
// check if we have already posted this crash in the last 24 hours
|
enum crash_report_t r = check_crash_reports_config();
|
||||||
XXH64_hash_t hash = daemon_status_file_hash(&last_session_status, msg, cause);
|
if( // must be first for netdata.conf option to be used
|
||||||
if(dedup_already_posted(&session_status, hash) || (last_session_status.restarts < 10 && is_ci()))
|
(r == DSF_REPORT_ALL || (this_is_a_crash && r == DSF_REPORT_CRASHES)) &&
|
||||||
disable_crash_report = true;
|
|
||||||
|
|
||||||
if(!disable_crash_report && (analytics_check_enabled() || post_crash_report)) {
|
// not a useful report (no previous status file)
|
||||||
netdata_conf_ssl();
|
!crash_report_ignore &&
|
||||||
|
|
||||||
|
// we are not running in CI
|
||||||
|
(last_session_status.restarts >= 10 || !is_ci()) &&
|
||||||
|
|
||||||
|
// we have not already reported this
|
||||||
|
!dedup_already_posted(&session_status, daemon_status_file_hash(&last_session_status, msg, cause))
|
||||||
|
|
||||||
|
) {
|
||||||
daemon_status_file_startup_step("startup(post status file)");
|
daemon_status_file_startup_step("startup(post status file)");
|
||||||
|
|
||||||
|
netdata_conf_ssl();
|
||||||
|
|
||||||
struct post_status_file_thread_data d = {
|
struct post_status_file_thread_data d = {
|
||||||
.cause = cause,
|
.cause = cause,
|
||||||
.msg = msg,
|
.msg = msg,
|
||||||
.status = &last_session_status,
|
.status = &last_session_status,
|
||||||
.priority = pri.post,
|
.priority = pri.post,
|
||||||
};
|
};
|
||||||
|
|
||||||
post_status_file(&d);
|
post_status_file(&d);
|
||||||
|
|
||||||
// MacOS crashes when starting under launchctl, when we create a thread to post the status file,
|
// MacOS crashes when starting under launchctl, when we create a thread to post the status file,
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue