diff --git a/src/daemon/config/README.md b/src/daemon/config/README.md index 3dd095b65d..384778baf9 100644 --- a/src/daemon/config/README.md +++ b/src/daemon/config/README.md @@ -35,17 +35,18 @@ After `netdata.conf` has been modified, Netdata needs to be [restarted](/docs/ne ### `global` section options -| setting | default | info | -|:----------------------------------:|:-------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| process scheduling policy | `keep` | See [Netdata process scheduling policy](/src/daemon/README.md#process-scheduling-policy-unix-only) | -| OOM score | `0` | | -| glibc malloc arena max for plugins | `1` | | -| glibc malloc arena max for Netdata | `1` | | -| hostname | auto-detected | The hostname of the computer running Netdata. | -| host access prefix | empty | This is used in Docker environments where /proc, /sys, etc have to be accessed via another path. You may also have to set SYS_PTRACE capability on the docker for this work. Check [issue 43](https://github.com/netdata/netdata/issues/43). | -| timezone | auto-detected | The timezone retrieved from the environment variable | -| run as user | `netdata` | The user Netdata will run as. | -| pthread stack size | auto-detected | | +| setting | default | info | +|:----------------------------------:|:--------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| process scheduling policy | `keep` | See [Netdata process scheduling policy](/src/daemon/README.md#process-scheduling-policy-unix-only) | +| OOM score | `0` | | +| glibc malloc arena max for plugins | `1` | | +| glibc malloc arena max for Netdata | `1` | | +| hostname | auto-detected | The hostname of the computer running Netdata. | +| host access prefix | empty | This is used in Docker environments where /proc, /sys, etc have to be accessed via another path. You may also have to set SYS_PTRACE capability on the docker for this work. Check [issue 43](https://github.com/netdata/netdata/issues/43). | +| timezone | auto-detected | The timezone retrieved from the environment variable | +| run as user | `netdata` | The user Netdata will run as. | +| pthread stack size | auto-detected | | +| crash reports | `all` or `off` | It is `off` when anonymous telemetry is disabled, otherwise `all`. When it is `all` Netdata reports agent restarts and crashes. It can also be `crashes` to report only crashes. Each kind of event is deduplicated and reported at most once per day. | ### `db` section options diff --git a/src/daemon/daemon-status-file.c b/src/daemon/daemon-status-file.c index 9c35921458..6476b5ed5f 100644 --- a/src/daemon/daemon-status-file.c +++ b/src/daemon/daemon-status-file.c @@ -568,45 +568,34 @@ static bool save_status_file(const char *directory, const char *content, size_t if(!directory || !*directory) return false; + static uint64_t tmp_attempt_counter = 0; + char filename[FILENAME_MAX]; char temp_filename[FILENAME_MAX]; + char tid_str[UINT64_MAX_LENGTH]; - /* Construct filenames using async-safe string operations */ - /* Using simple string concatenation instead of snprintf */ + print_uint64(tid_str, __atomic_add_fetch(&tmp_attempt_counter, 1, __ATOMIC_RELAXED)); size_t dir_len = strlen(directory); - if (dir_len + 1 + strlen(STATUS_FILENAME) >= FILENAME_MAX) - return false; /* Path too long */ + size_t fil_len = strlen(STATUS_FILENAME); + size_t tid_len = strlen(tid_str); - memcpy(filename, directory, dir_len); - filename[dir_len] = '/'; - memcpy(filename + dir_len + 1, STATUS_FILENAME, strlen(STATUS_FILENAME) + 1); + if (dir_len + 1 + fil_len + 1 + tid_len + 1 >= sizeof(filename)) + return false; // cannot fit the filename - /* Create a unique temp filename using thread id */ - unsigned int tid = (unsigned int)gettid_cached(); - char tid_str[16]; - char *tid_ptr = tid_str + sizeof(tid_str) - 1; - *tid_ptr = '\0'; + // create the filename + size_t pos = 0; + memcpy(&filename[pos], directory, dir_len); pos += dir_len; + filename[pos] = '/'; pos++; + memcpy(&filename[pos], STATUS_FILENAME, fil_len); pos += fil_len; + filename[pos] = '\0'; - unsigned int tid_copy = tid; - do { - tid_ptr--; - *tid_ptr = "0123456789abcdef"[tid_copy & 0xf]; - tid_copy >>= 4; - } while (tid_copy && tid_ptr > tid_str); + // create the temp filename + memcpy(temp_filename, filename, pos); + temp_filename[pos] = '-'; pos++; + memcpy(&temp_filename[pos], tid_str, tid_len); pos += tid_len; + temp_filename[pos] = '\0'; - size_t temp_name_len = dir_len + 1 + strlen(STATUS_FILENAME) + 1 + (sizeof(tid_str) - (tid_ptr - tid_str)); - if (temp_name_len >= FILENAME_MAX) - return false; /* Path too long */ - - memcpy(temp_filename, directory, dir_len); - temp_filename[dir_len] = '/'; - char *ptr = temp_filename + dir_len + 1; - memcpy(ptr, STATUS_FILENAME, strlen(STATUS_FILENAME)); - ptr += strlen(STATUS_FILENAME); - *ptr++ = '-'; - memcpy(ptr, tid_ptr, strlen(tid_ptr) + 1); - - /* Open file with O_WRONLY, O_CREAT, and O_TRUNC flags */ + // Open file with O_WRONLY, O_CREAT, and O_TRUNC flags int fd = open(temp_filename, O_WRONLY | O_CREAT | O_TRUNC, 0664); if (fd == -1) return false; @@ -620,7 +609,7 @@ static bool save_status_file(const char *directory, const char *content, size_t if (bytes_written == -1) { if (errno == EINTR) - continue; /* Retry if interrupted by signal */ + continue; /* Retry if interrupted by signal */ close(fd); unlink(temp_filename); /* Remove the temp file */ @@ -822,6 +811,30 @@ static bool is_ci(void) { return ci && *ci && strcasecmp(ci, "true") == 0; } +enum crash_report_t { + DSF_REPORT_DISABLED = 0, + DSF_REPORT_ALL, + DSF_REPORT_CRASHES, +}; + +static enum crash_report_t check_crash_reports_config(void) { + bool analytics = analytics_check_enabled(); + + const char *t = inicfg_get(&netdata_config, CONFIG_SECTION_GLOBAL, "crash reports", analytics ? "all" : "off"); + + enum crash_report_t rc; + if(!t || !*t) + rc = analytics ? DSF_REPORT_ALL : DSF_REPORT_DISABLED; + else if(strcmp(t, "all") == 0) + rc = DSF_REPORT_ALL; + else if(strcmp(t, "crashes") == 0) + rc = DSF_REPORT_CRASHES; + else + rc = DSF_REPORT_DISABLED; + + return rc; +} + void daemon_status_file_check_crash(void) { FUNCTION_RUN_ONCE(); @@ -834,8 +847,8 @@ void daemon_status_file_check_crash(void) { struct log_priority pri = PRI_ALL_NORMAL; bool new_version = strcmp(last_session_status.version, session_status.version) != 0; - bool post_crash_report = false; - bool disable_crash_report = false; + bool this_is_a_crash = false; + bool crash_report_ignore = false; bool dump_json = true; const char *msg = "", *cause = ""; switch(last_session_status.status) { @@ -844,7 +857,7 @@ void daemon_status_file_check_crash(void) { // probably a previous version of netdata was running cause = "no last status"; msg = "No status found for the previous Netdata session"; - disable_crash_report = true; + crash_report_ignore = true; break; case DAEMON_STATUS_EXITED: @@ -858,14 +871,14 @@ void daemon_status_file_check_crash(void) { cause = "deadly signal and exit"; msg = "Netdata was last stopped gracefully after receiving a deadly signal"; pri = PRI_NETDATA_BUG; - post_crash_report = true; + this_is_a_crash = true; } else if(last_session_status.exit_reason != EXIT_REASON_NONE && !is_exit_reason_normal(last_session_status.exit_reason)) { cause = "fatal and exit"; msg = "Netdata was last stopped gracefully after it encountered a fatal error"; pri = PRI_NETDATA_BUG; - post_crash_report = true; + this_is_a_crash = true; } else if(last_session_status.exit_reason & EXIT_REASON_SYSTEM_SHUTDOWN) { cause = "exit on system shutdown"; @@ -899,7 +912,7 @@ void daemon_status_file_check_crash(void) { cause = "deadly signal on start"; msg = "Netdata was last crashed while starting after receiving a deadly signal"; pri = PRI_NETDATA_BUG; - post_crash_report = true; + this_is_a_crash = true; } else if (last_session_status.exit_reason & EXIT_REASON_OUT_OF_MEMORY) { cause = "out of memory"; @@ -940,8 +953,7 @@ void daemon_status_file_check_crash(void) { msg = "Netdata was last killed/crashed while starting"; pri = PRI_BAD_BUT_NO_REASON; } - post_crash_report = true; - + this_is_a_crash = true; break; case DAEMON_STATUS_EXITING: @@ -949,7 +961,7 @@ void daemon_status_file_check_crash(void) { cause = "deadly signal on exit"; msg = "Netdata was last crashed while exiting after receiving a deadly signal"; pri = PRI_NETDATA_BUG; - post_crash_report = true; + this_is_a_crash = true; } else if(last_session_status.exit_reason != EXIT_REASON_NONE && !is_exit_reason_normal(last_session_status.exit_reason)) { @@ -969,7 +981,7 @@ void daemon_status_file_check_crash(void) { msg = "Netdata was last killed/crashed while it was instructed to exit"; } pri = PRI_NETDATA_BUG; - post_crash_report = true; + this_is_a_crash = true; break; case DAEMON_STATUS_RUNNING: { @@ -990,7 +1002,7 @@ void daemon_status_file_check_crash(void) { cause = "deadly signal"; msg = "Netdata was last crashed after receiving a deadly signal"; pri = PRI_NETDATA_BUG; - post_crash_report = true; + this_is_a_crash = true; } else if (last_session_status.exit_reason != EXIT_REASON_NONE && !is_exit_reason_normal(last_session_status.exit_reason)) { @@ -1002,7 +1014,7 @@ void daemon_status_file_check_crash(void) { cause = "killed hard"; msg = "Netdata was last killed/crashed while operating normally"; pri = PRI_BAD_BUT_NO_REASON; - post_crash_report = true; + this_is_a_crash = true; } break; } @@ -1025,22 +1037,31 @@ void daemon_status_file_check_crash(void) { "Last exit status: %s (%s):\n\n%s", NETDATA_VERSION, msg, cause, buffer_tostring(wb)); - // check if we have already posted this crash in the last 24 hours - XXH64_hash_t hash = daemon_status_file_hash(&last_session_status, msg, cause); - if(dedup_already_posted(&session_status, hash) || (last_session_status.restarts < 10 && is_ci())) - disable_crash_report = true; + enum crash_report_t r = check_crash_reports_config(); + if( // must be first for netdata.conf option to be used + (r == DSF_REPORT_ALL || (this_is_a_crash && r == DSF_REPORT_CRASHES)) && - if(!disable_crash_report && (analytics_check_enabled() || post_crash_report)) { - netdata_conf_ssl(); + // not a useful report (no previous status file) + !crash_report_ignore && + // we are not running in CI + (last_session_status.restarts >= 10 || !is_ci()) && + + // we have not already reported this + !dedup_already_posted(&session_status, daemon_status_file_hash(&last_session_status, msg, cause)) + + ) { daemon_status_file_startup_step("startup(post status file)"); + netdata_conf_ssl(); + struct post_status_file_thread_data d = { .cause = cause, .msg = msg, .status = &last_session_status, .priority = pri.post, }; + post_status_file(&d); // MacOS crashes when starting under launchctl, when we create a thread to post the status file,