0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-27 22:26:21 +00:00

User configurable crash reporting ()

* use a counter for temp status files

* provide user configuration to reporting crashes

* docs for crash reports

* updated documentation

* revert condition
This commit is contained in:
Costa Tsaousis 2025-03-06 19:39:13 +00:00 committed by GitHub
parent fb67eb9c9c
commit 35c6875bbb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 83 additions and 61 deletions

View file

@ -35,17 +35,18 @@ After `netdata.conf` has been modified, Netdata needs to be [restarted](/docs/ne
### `global` section options
| setting | default | info |
|:----------------------------------:|:-------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| process scheduling policy | `keep` | See [Netdata process scheduling policy](/src/daemon/README.md#process-scheduling-policy-unix-only) |
| OOM score | `0` | |
| glibc malloc arena max for plugins | `1` | |
| glibc malloc arena max for Netdata | `1` | |
| hostname | auto-detected | The hostname of the computer running Netdata. |
| host access prefix | empty | This is used in Docker environments where /proc, /sys, etc have to be accessed via another path. You may also have to set SYS_PTRACE capability on the docker for this work. Check [issue 43](https://github.com/netdata/netdata/issues/43). |
| timezone | auto-detected | The timezone retrieved from the environment variable |
| run as user | `netdata` | The user Netdata will run as. |
| pthread stack size | auto-detected | |
| setting | default | info |
|:----------------------------------:|:--------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| process scheduling policy | `keep` | See [Netdata process scheduling policy](/src/daemon/README.md#process-scheduling-policy-unix-only) |
| OOM score | `0` | |
| glibc malloc arena max for plugins | `1` | |
| glibc malloc arena max for Netdata | `1` | |
| hostname | auto-detected | The hostname of the computer running Netdata. |
| host access prefix | empty | This is used in Docker environments where /proc, /sys, etc have to be accessed via another path. You may also have to set SYS_PTRACE capability on the docker for this work. Check [issue 43](https://github.com/netdata/netdata/issues/43). |
| timezone | auto-detected | The timezone retrieved from the environment variable |
| run as user | `netdata` | The user Netdata will run as. |
| pthread stack size | auto-detected | |
| crash reports | `all` or `off` | It is `off` when anonymous telemetry is disabled, otherwise `all`. When it is `all` Netdata reports agent restarts and crashes. It can also be `crashes` to report only crashes. Each kind of event is deduplicated and reported at most once per day. |
### `db` section options

View file

@ -568,45 +568,34 @@ static bool save_status_file(const char *directory, const char *content, size_t
if(!directory || !*directory)
return false;
static uint64_t tmp_attempt_counter = 0;
char filename[FILENAME_MAX];
char temp_filename[FILENAME_MAX];
char tid_str[UINT64_MAX_LENGTH];
/* Construct filenames using async-safe string operations */
/* Using simple string concatenation instead of snprintf */
print_uint64(tid_str, __atomic_add_fetch(&tmp_attempt_counter, 1, __ATOMIC_RELAXED));
size_t dir_len = strlen(directory);
if (dir_len + 1 + strlen(STATUS_FILENAME) >= FILENAME_MAX)
return false; /* Path too long */
size_t fil_len = strlen(STATUS_FILENAME);
size_t tid_len = strlen(tid_str);
memcpy(filename, directory, dir_len);
filename[dir_len] = '/';
memcpy(filename + dir_len + 1, STATUS_FILENAME, strlen(STATUS_FILENAME) + 1);
if (dir_len + 1 + fil_len + 1 + tid_len + 1 >= sizeof(filename))
return false; // cannot fit the filename
/* Create a unique temp filename using thread id */
unsigned int tid = (unsigned int)gettid_cached();
char tid_str[16];
char *tid_ptr = tid_str + sizeof(tid_str) - 1;
*tid_ptr = '\0';
// create the filename
size_t pos = 0;
memcpy(&filename[pos], directory, dir_len); pos += dir_len;
filename[pos] = '/'; pos++;
memcpy(&filename[pos], STATUS_FILENAME, fil_len); pos += fil_len;
filename[pos] = '\0';
unsigned int tid_copy = tid;
do {
tid_ptr--;
*tid_ptr = "0123456789abcdef"[tid_copy & 0xf];
tid_copy >>= 4;
} while (tid_copy && tid_ptr > tid_str);
// create the temp filename
memcpy(temp_filename, filename, pos);
temp_filename[pos] = '-'; pos++;
memcpy(&temp_filename[pos], tid_str, tid_len); pos += tid_len;
temp_filename[pos] = '\0';
size_t temp_name_len = dir_len + 1 + strlen(STATUS_FILENAME) + 1 + (sizeof(tid_str) - (tid_ptr - tid_str));
if (temp_name_len >= FILENAME_MAX)
return false; /* Path too long */
memcpy(temp_filename, directory, dir_len);
temp_filename[dir_len] = '/';
char *ptr = temp_filename + dir_len + 1;
memcpy(ptr, STATUS_FILENAME, strlen(STATUS_FILENAME));
ptr += strlen(STATUS_FILENAME);
*ptr++ = '-';
memcpy(ptr, tid_ptr, strlen(tid_ptr) + 1);
/* Open file with O_WRONLY, O_CREAT, and O_TRUNC flags */
// Open file with O_WRONLY, O_CREAT, and O_TRUNC flags
int fd = open(temp_filename, O_WRONLY | O_CREAT | O_TRUNC, 0664);
if (fd == -1)
return false;
@ -620,7 +609,7 @@ static bool save_status_file(const char *directory, const char *content, size_t
if (bytes_written == -1) {
if (errno == EINTR)
continue; /* Retry if interrupted by signal */
continue; /* Retry if interrupted by signal */
close(fd);
unlink(temp_filename); /* Remove the temp file */
@ -822,6 +811,30 @@ static bool is_ci(void) {
return ci && *ci && strcasecmp(ci, "true") == 0;
}
enum crash_report_t {
DSF_REPORT_DISABLED = 0,
DSF_REPORT_ALL,
DSF_REPORT_CRASHES,
};
static enum crash_report_t check_crash_reports_config(void) {
bool analytics = analytics_check_enabled();
const char *t = inicfg_get(&netdata_config, CONFIG_SECTION_GLOBAL, "crash reports", analytics ? "all" : "off");
enum crash_report_t rc;
if(!t || !*t)
rc = analytics ? DSF_REPORT_ALL : DSF_REPORT_DISABLED;
else if(strcmp(t, "all") == 0)
rc = DSF_REPORT_ALL;
else if(strcmp(t, "crashes") == 0)
rc = DSF_REPORT_CRASHES;
else
rc = DSF_REPORT_DISABLED;
return rc;
}
void daemon_status_file_check_crash(void) {
FUNCTION_RUN_ONCE();
@ -834,8 +847,8 @@ void daemon_status_file_check_crash(void) {
struct log_priority pri = PRI_ALL_NORMAL;
bool new_version = strcmp(last_session_status.version, session_status.version) != 0;
bool post_crash_report = false;
bool disable_crash_report = false;
bool this_is_a_crash = false;
bool crash_report_ignore = false;
bool dump_json = true;
const char *msg = "", *cause = "";
switch(last_session_status.status) {
@ -844,7 +857,7 @@ void daemon_status_file_check_crash(void) {
// probably a previous version of netdata was running
cause = "no last status";
msg = "No status found for the previous Netdata session";
disable_crash_report = true;
crash_report_ignore = true;
break;
case DAEMON_STATUS_EXITED:
@ -858,14 +871,14 @@ void daemon_status_file_check_crash(void) {
cause = "deadly signal and exit";
msg = "Netdata was last stopped gracefully after receiving a deadly signal";
pri = PRI_NETDATA_BUG;
post_crash_report = true;
this_is_a_crash = true;
}
else if(last_session_status.exit_reason != EXIT_REASON_NONE &&
!is_exit_reason_normal(last_session_status.exit_reason)) {
cause = "fatal and exit";
msg = "Netdata was last stopped gracefully after it encountered a fatal error";
pri = PRI_NETDATA_BUG;
post_crash_report = true;
this_is_a_crash = true;
}
else if(last_session_status.exit_reason & EXIT_REASON_SYSTEM_SHUTDOWN) {
cause = "exit on system shutdown";
@ -899,7 +912,7 @@ void daemon_status_file_check_crash(void) {
cause = "deadly signal on start";
msg = "Netdata was last crashed while starting after receiving a deadly signal";
pri = PRI_NETDATA_BUG;
post_crash_report = true;
this_is_a_crash = true;
}
else if (last_session_status.exit_reason & EXIT_REASON_OUT_OF_MEMORY) {
cause = "out of memory";
@ -940,8 +953,7 @@ void daemon_status_file_check_crash(void) {
msg = "Netdata was last killed/crashed while starting";
pri = PRI_BAD_BUT_NO_REASON;
}
post_crash_report = true;
this_is_a_crash = true;
break;
case DAEMON_STATUS_EXITING:
@ -949,7 +961,7 @@ void daemon_status_file_check_crash(void) {
cause = "deadly signal on exit";
msg = "Netdata was last crashed while exiting after receiving a deadly signal";
pri = PRI_NETDATA_BUG;
post_crash_report = true;
this_is_a_crash = true;
}
else if(last_session_status.exit_reason != EXIT_REASON_NONE &&
!is_exit_reason_normal(last_session_status.exit_reason)) {
@ -969,7 +981,7 @@ void daemon_status_file_check_crash(void) {
msg = "Netdata was last killed/crashed while it was instructed to exit";
}
pri = PRI_NETDATA_BUG;
post_crash_report = true;
this_is_a_crash = true;
break;
case DAEMON_STATUS_RUNNING: {
@ -990,7 +1002,7 @@ void daemon_status_file_check_crash(void) {
cause = "deadly signal";
msg = "Netdata was last crashed after receiving a deadly signal";
pri = PRI_NETDATA_BUG;
post_crash_report = true;
this_is_a_crash = true;
}
else if (last_session_status.exit_reason != EXIT_REASON_NONE &&
!is_exit_reason_normal(last_session_status.exit_reason)) {
@ -1002,7 +1014,7 @@ void daemon_status_file_check_crash(void) {
cause = "killed hard";
msg = "Netdata was last killed/crashed while operating normally";
pri = PRI_BAD_BUT_NO_REASON;
post_crash_report = true;
this_is_a_crash = true;
}
break;
}
@ -1025,22 +1037,31 @@ void daemon_status_file_check_crash(void) {
"Last exit status: %s (%s):\n\n%s",
NETDATA_VERSION, msg, cause, buffer_tostring(wb));
// check if we have already posted this crash in the last 24 hours
XXH64_hash_t hash = daemon_status_file_hash(&last_session_status, msg, cause);
if(dedup_already_posted(&session_status, hash) || (last_session_status.restarts < 10 && is_ci()))
disable_crash_report = true;
enum crash_report_t r = check_crash_reports_config();
if( // must be first for netdata.conf option to be used
(r == DSF_REPORT_ALL || (this_is_a_crash && r == DSF_REPORT_CRASHES)) &&
if(!disable_crash_report && (analytics_check_enabled() || post_crash_report)) {
netdata_conf_ssl();
// not a useful report (no previous status file)
!crash_report_ignore &&
// we are not running in CI
(last_session_status.restarts >= 10 || !is_ci()) &&
// we have not already reported this
!dedup_already_posted(&session_status, daemon_status_file_hash(&last_session_status, msg, cause))
) {
daemon_status_file_startup_step("startup(post status file)");
netdata_conf_ssl();
struct post_status_file_thread_data d = {
.cause = cause,
.msg = msg,
.status = &last_session_status,
.priority = pri.post,
};
post_status_file(&d);
// MacOS crashes when starting under launchctl, when we create a thread to post the status file,