mirror of
https://github.com/netdata/netdata.git
synced 2025-04-27 22:26:21 +00:00
User configurable crash reporting (#19789)
* use a counter for temp status files * provide user configuration to reporting crashes * docs for crash reports * updated documentation * revert condition
This commit is contained in:
parent
fb67eb9c9c
commit
35c6875bbb
2 changed files with 83 additions and 61 deletions
src/daemon
|
@ -35,17 +35,18 @@ After `netdata.conf` has been modified, Netdata needs to be [restarted](/docs/ne
|
|||
|
||||
### `global` section options
|
||||
|
||||
| setting | default | info |
|
||||
|:----------------------------------:|:-------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| process scheduling policy | `keep` | See [Netdata process scheduling policy](/src/daemon/README.md#process-scheduling-policy-unix-only) |
|
||||
| OOM score | `0` | |
|
||||
| glibc malloc arena max for plugins | `1` | |
|
||||
| glibc malloc arena max for Netdata | `1` | |
|
||||
| hostname | auto-detected | The hostname of the computer running Netdata. |
|
||||
| host access prefix | empty | This is used in Docker environments where /proc, /sys, etc. have to be accessed via another path. You may also have to set the SYS_PTRACE capability on the Docker container for this to work. Check [issue 43](https://github.com/netdata/netdata/issues/43). |
|
||||
| timezone | auto-detected | The timezone retrieved from the environment variable |
|
||||
| run as user | `netdata` | The user Netdata will run as. |
|
||||
| pthread stack size | auto-detected | |
|
||||
| setting | default | info |
|
||||
|:----------------------------------:|:--------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||
| process scheduling policy | `keep` | See [Netdata process scheduling policy](/src/daemon/README.md#process-scheduling-policy-unix-only) |
|
||||
| OOM score | `0` | |
|
||||
| glibc malloc arena max for plugins | `1` | |
|
||||
| glibc malloc arena max for Netdata | `1` | |
|
||||
| hostname | auto-detected | The hostname of the computer running Netdata. |
|
||||
| host access prefix | empty | This is used in Docker environments where /proc, /sys, etc. have to be accessed via another path. You may also have to set the SYS_PTRACE capability on the Docker container for this to work. Check [issue 43](https://github.com/netdata/netdata/issues/43). |
|
||||
| timezone | auto-detected | The timezone retrieved from the environment variable |
|
||||
| run as user | `netdata` | The user Netdata will run as. |
|
||||
| pthread stack size | auto-detected | |
|
||||
| crash reports | `all` or `off` | Defaults to `off` when anonymous telemetry is disabled, otherwise `all`. When set to `all`, Netdata reports both agent restarts and crashes; set it to `crashes` to report only crashes, or to `off` to disable reporting. Each kind of event is deduplicated and reported at most once per day. |
|
||||
|
||||
### `db` section options
|
||||
|
||||
|
|
|
@ -568,45 +568,34 @@ static bool save_status_file(const char *directory, const char *content, size_t
|
|||
if(!directory || !*directory)
|
||||
return false;
|
||||
|
||||
static uint64_t tmp_attempt_counter = 0;
|
||||
|
||||
char filename[FILENAME_MAX];
|
||||
char temp_filename[FILENAME_MAX];
|
||||
char tid_str[UINT64_MAX_LENGTH];
|
||||
|
||||
/* Construct filenames using async-safe string operations */
|
||||
/* Using simple string concatenation instead of snprintf */
|
||||
print_uint64(tid_str, __atomic_add_fetch(&tmp_attempt_counter, 1, __ATOMIC_RELAXED));
|
||||
size_t dir_len = strlen(directory);
|
||||
if (dir_len + 1 + strlen(STATUS_FILENAME) >= FILENAME_MAX)
|
||||
return false; /* Path too long */
|
||||
size_t fil_len = strlen(STATUS_FILENAME);
|
||||
size_t tid_len = strlen(tid_str);
|
||||
|
||||
memcpy(filename, directory, dir_len);
|
||||
filename[dir_len] = '/';
|
||||
memcpy(filename + dir_len + 1, STATUS_FILENAME, strlen(STATUS_FILENAME) + 1);
|
||||
if (dir_len + 1 + fil_len + 1 + tid_len + 1 >= sizeof(filename))
|
||||
return false; // cannot fit the filename
|
||||
|
||||
/* Create a unique temp filename using thread id */
|
||||
unsigned int tid = (unsigned int)gettid_cached();
|
||||
char tid_str[16];
|
||||
char *tid_ptr = tid_str + sizeof(tid_str) - 1;
|
||||
*tid_ptr = '\0';
|
||||
// create the filename
|
||||
size_t pos = 0;
|
||||
memcpy(&filename[pos], directory, dir_len); pos += dir_len;
|
||||
filename[pos] = '/'; pos++;
|
||||
memcpy(&filename[pos], STATUS_FILENAME, fil_len); pos += fil_len;
|
||||
filename[pos] = '\0';
|
||||
|
||||
unsigned int tid_copy = tid;
|
||||
do {
|
||||
tid_ptr--;
|
||||
*tid_ptr = "0123456789abcdef"[tid_copy & 0xf];
|
||||
tid_copy >>= 4;
|
||||
} while (tid_copy && tid_ptr > tid_str);
|
||||
// create the temp filename
|
||||
memcpy(temp_filename, filename, pos);
|
||||
temp_filename[pos] = '-'; pos++;
|
||||
memcpy(&temp_filename[pos], tid_str, tid_len); pos += tid_len;
|
||||
temp_filename[pos] = '\0';
|
||||
|
||||
size_t temp_name_len = dir_len + 1 + strlen(STATUS_FILENAME) + 1 + (sizeof(tid_str) - (tid_ptr - tid_str));
|
||||
if (temp_name_len >= FILENAME_MAX)
|
||||
return false; /* Path too long */
|
||||
|
||||
memcpy(temp_filename, directory, dir_len);
|
||||
temp_filename[dir_len] = '/';
|
||||
char *ptr = temp_filename + dir_len + 1;
|
||||
memcpy(ptr, STATUS_FILENAME, strlen(STATUS_FILENAME));
|
||||
ptr += strlen(STATUS_FILENAME);
|
||||
*ptr++ = '-';
|
||||
memcpy(ptr, tid_ptr, strlen(tid_ptr) + 1);
|
||||
|
||||
/* Open file with O_WRONLY, O_CREAT, and O_TRUNC flags */
|
||||
// Open file with O_WRONLY, O_CREAT, and O_TRUNC flags
|
||||
int fd = open(temp_filename, O_WRONLY | O_CREAT | O_TRUNC, 0664);
|
||||
if (fd == -1)
|
||||
return false;
|
||||
|
@ -620,7 +609,7 @@ static bool save_status_file(const char *directory, const char *content, size_t
|
|||
|
||||
if (bytes_written == -1) {
|
||||
if (errno == EINTR)
|
||||
continue; /* Retry if interrupted by signal */
|
||||
continue; /* Retry if interrupted by signal */
|
||||
|
||||
close(fd);
|
||||
unlink(temp_filename); /* Remove the temp file */
|
||||
|
@ -822,6 +811,30 @@ static bool is_ci(void) {
|
|||
return ci && *ci && strcasecmp(ci, "true") == 0;
|
||||
}
|
||||
|
||||
// What the agent is allowed to report about the previous session.
enum crash_report_t {
    DSF_REPORT_DISABLED = 0,    // report nothing
    DSF_REPORT_ALL      = 1,    // report every event, whether or not it is a crash
    DSF_REPORT_CRASHES  = 2,    // report only events flagged as crashes
};
|
||||
|
||||
static enum crash_report_t check_crash_reports_config(void) {
|
||||
bool analytics = analytics_check_enabled();
|
||||
|
||||
const char *t = inicfg_get(&netdata_config, CONFIG_SECTION_GLOBAL, "crash reports", analytics ? "all" : "off");
|
||||
|
||||
enum crash_report_t rc;
|
||||
if(!t || !*t)
|
||||
rc = analytics ? DSF_REPORT_ALL : DSF_REPORT_DISABLED;
|
||||
else if(strcmp(t, "all") == 0)
|
||||
rc = DSF_REPORT_ALL;
|
||||
else if(strcmp(t, "crashes") == 0)
|
||||
rc = DSF_REPORT_CRASHES;
|
||||
else
|
||||
rc = DSF_REPORT_DISABLED;
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
void daemon_status_file_check_crash(void) {
|
||||
FUNCTION_RUN_ONCE();
|
||||
|
||||
|
@ -834,8 +847,8 @@ void daemon_status_file_check_crash(void) {
|
|||
struct log_priority pri = PRI_ALL_NORMAL;
|
||||
|
||||
bool new_version = strcmp(last_session_status.version, session_status.version) != 0;
|
||||
bool post_crash_report = false;
|
||||
bool disable_crash_report = false;
|
||||
bool this_is_a_crash = false;
|
||||
bool crash_report_ignore = false;
|
||||
bool dump_json = true;
|
||||
const char *msg = "", *cause = "";
|
||||
switch(last_session_status.status) {
|
||||
|
@ -844,7 +857,7 @@ void daemon_status_file_check_crash(void) {
|
|||
// probably a previous version of netdata was running
|
||||
cause = "no last status";
|
||||
msg = "No status found for the previous Netdata session";
|
||||
disable_crash_report = true;
|
||||
crash_report_ignore = true;
|
||||
break;
|
||||
|
||||
case DAEMON_STATUS_EXITED:
|
||||
|
@ -858,14 +871,14 @@ void daemon_status_file_check_crash(void) {
|
|||
cause = "deadly signal and exit";
|
||||
msg = "Netdata was last stopped gracefully after receiving a deadly signal";
|
||||
pri = PRI_NETDATA_BUG;
|
||||
post_crash_report = true;
|
||||
this_is_a_crash = true;
|
||||
}
|
||||
else if(last_session_status.exit_reason != EXIT_REASON_NONE &&
|
||||
!is_exit_reason_normal(last_session_status.exit_reason)) {
|
||||
cause = "fatal and exit";
|
||||
msg = "Netdata was last stopped gracefully after it encountered a fatal error";
|
||||
pri = PRI_NETDATA_BUG;
|
||||
post_crash_report = true;
|
||||
this_is_a_crash = true;
|
||||
}
|
||||
else if(last_session_status.exit_reason & EXIT_REASON_SYSTEM_SHUTDOWN) {
|
||||
cause = "exit on system shutdown";
|
||||
|
@ -899,7 +912,7 @@ void daemon_status_file_check_crash(void) {
|
|||
cause = "deadly signal on start";
|
||||
msg = "Netdata was last crashed while starting after receiving a deadly signal";
|
||||
pri = PRI_NETDATA_BUG;
|
||||
post_crash_report = true;
|
||||
this_is_a_crash = true;
|
||||
}
|
||||
else if (last_session_status.exit_reason & EXIT_REASON_OUT_OF_MEMORY) {
|
||||
cause = "out of memory";
|
||||
|
@ -940,8 +953,7 @@ void daemon_status_file_check_crash(void) {
|
|||
msg = "Netdata was last killed/crashed while starting";
|
||||
pri = PRI_BAD_BUT_NO_REASON;
|
||||
}
|
||||
post_crash_report = true;
|
||||
|
||||
this_is_a_crash = true;
|
||||
break;
|
||||
|
||||
case DAEMON_STATUS_EXITING:
|
||||
|
@ -949,7 +961,7 @@ void daemon_status_file_check_crash(void) {
|
|||
cause = "deadly signal on exit";
|
||||
msg = "Netdata was last crashed while exiting after receiving a deadly signal";
|
||||
pri = PRI_NETDATA_BUG;
|
||||
post_crash_report = true;
|
||||
this_is_a_crash = true;
|
||||
}
|
||||
else if(last_session_status.exit_reason != EXIT_REASON_NONE &&
|
||||
!is_exit_reason_normal(last_session_status.exit_reason)) {
|
||||
|
@ -969,7 +981,7 @@ void daemon_status_file_check_crash(void) {
|
|||
msg = "Netdata was last killed/crashed while it was instructed to exit";
|
||||
}
|
||||
pri = PRI_NETDATA_BUG;
|
||||
post_crash_report = true;
|
||||
this_is_a_crash = true;
|
||||
break;
|
||||
|
||||
case DAEMON_STATUS_RUNNING: {
|
||||
|
@ -990,7 +1002,7 @@ void daemon_status_file_check_crash(void) {
|
|||
cause = "deadly signal";
|
||||
msg = "Netdata was last crashed after receiving a deadly signal";
|
||||
pri = PRI_NETDATA_BUG;
|
||||
post_crash_report = true;
|
||||
this_is_a_crash = true;
|
||||
}
|
||||
else if (last_session_status.exit_reason != EXIT_REASON_NONE &&
|
||||
!is_exit_reason_normal(last_session_status.exit_reason)) {
|
||||
|
@ -1002,7 +1014,7 @@ void daemon_status_file_check_crash(void) {
|
|||
cause = "killed hard";
|
||||
msg = "Netdata was last killed/crashed while operating normally";
|
||||
pri = PRI_BAD_BUT_NO_REASON;
|
||||
post_crash_report = true;
|
||||
this_is_a_crash = true;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -1025,22 +1037,31 @@ void daemon_status_file_check_crash(void) {
|
|||
"Last exit status: %s (%s):\n\n%s",
|
||||
NETDATA_VERSION, msg, cause, buffer_tostring(wb));
|
||||
|
||||
// check if we have already posted this crash in the last 24 hours
|
||||
XXH64_hash_t hash = daemon_status_file_hash(&last_session_status, msg, cause);
|
||||
if(dedup_already_posted(&session_status, hash) || (last_session_status.restarts < 10 && is_ci()))
|
||||
disable_crash_report = true;
|
||||
enum crash_report_t r = check_crash_reports_config();
|
||||
if( // must be first for netdata.conf option to be used
|
||||
(r == DSF_REPORT_ALL || (this_is_a_crash && r == DSF_REPORT_CRASHES)) &&
|
||||
|
||||
if(!disable_crash_report && (analytics_check_enabled() || post_crash_report)) {
|
||||
netdata_conf_ssl();
|
||||
// not a useful report (no previous status file)
|
||||
!crash_report_ignore &&
|
||||
|
||||
// we are not running in CI
|
||||
(last_session_status.restarts >= 10 || !is_ci()) &&
|
||||
|
||||
// we have not already reported this
|
||||
!dedup_already_posted(&session_status, daemon_status_file_hash(&last_session_status, msg, cause))
|
||||
|
||||
) {
|
||||
daemon_status_file_startup_step("startup(post status file)");
|
||||
|
||||
netdata_conf_ssl();
|
||||
|
||||
struct post_status_file_thread_data d = {
|
||||
.cause = cause,
|
||||
.msg = msg,
|
||||
.status = &last_session_status,
|
||||
.priority = pri.post,
|
||||
};
|
||||
|
||||
post_status_file(&d);
|
||||
|
||||
// MacOS crashes when starting under launchctl, when we create a thread to post the status file,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue