
minor fixes ()

* extended dbengine stats should be enabled

* crash reports are enabled when the agent is claimed (directly or indirectly)

* make mmap() report out of memory

* for open cache, use the current hot size, not the max ever used
Costa Tsaousis 2025-03-13 14:25:55 +00:00 committed by GitHub
parent 35214fd7ac
commit b51fdecd43
9 changed files with 52 additions and 53 deletions

View file

@@ -35,18 +35,18 @@ After `netdata.conf` has been modified, Netdata needs to be [restarted](/docs/ne
### `global` section options
| setting | default | info |
|:----------------------------------:|:--------------:|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| process scheduling policy | `keep` | See [Netdata process scheduling policy](/src/daemon/README.md#process-scheduling-policy-unix-only) |
| OOM score | `0` | |
| glibc malloc arena max for plugins | `1` | |
| glibc malloc arena max for Netdata | `1` | |
| hostname | auto-detected | The hostname of the computer running Netdata. |
| host access prefix | empty | This is used in Docker environments where /proc, /sys, etc have to be accessed via another path. You may also have to set SYS_PTRACE capability on the docker for this work. Check [issue 43](https://github.com/netdata/netdata/issues/43). |
| timezone | auto-detected | The timezone retrieved from the environment variable |
| run as user | `netdata` | The user Netdata will run as. |
| pthread stack size | auto-detected | |
| crash reports | `all` or `off` | It is `off` when anonymous telemetry is disabled, otherwise `all`. When it is `all` Netdata reports agent restarts and crashes. It can also be `crashes` to report only crashes. Each kind of event is deduplicated and reported at most once per day. |
| setting | default | info |
|:----------------------------------:|:--------------:|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| process scheduling policy | `keep` | See [Netdata process scheduling policy](/src/daemon/README.md#process-scheduling-policy-unix-only) |
| OOM score | `0` | |
| glibc malloc arena max for plugins | `1` | |
| glibc malloc arena max for Netdata | `1` | |
| hostname | auto-detected | The hostname of the computer running Netdata. |
| host access prefix | empty | This is used in Docker environments where /proc, /sys, etc have to be accessed via another path. You may also have to set SYS_PTRACE capability on the docker for this work. Check [issue 43](https://github.com/netdata/netdata/issues/43). |
| timezone | auto-detected | The timezone retrieved from the environment variable |
| run as user | `netdata` | The user Netdata will run as. |
| pthread stack size | auto-detected | |
| crash reports | `all` or `off` | `all` when anonymous telemetry is enabled, or the agent is claimed or connected to Netdata Cloud (directly or via a Netdata Parent). When it is `all` Netdata reports restarts and crashes. It can also be `crashes` to report only crashes. When it is `off` nothing is reported. Each kind of event is deduplicated and reported at most once per day. [Read more at this blog post](https://www.netdata.cloud/blog/2025-03-06-monitoring-netdata-restarts/). |
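
For example, to force crash-only reporting regardless of the telemetry and claiming defaults, the option can be set in the `[global]` section of `netdata.conf` (a minimal sketch; valid values are `all`, `crashes`, and `off` as described above):

```
[global]
    crash reports = crashes
```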
### `db` section options

View file

@@ -866,13 +866,15 @@ enum crash_report_t {
};
static enum crash_report_t check_crash_reports_config(void) {
bool analytics = analytics_check_enabled();
bool default_enabled = analytics_check_enabled() ||
!UUIDiszero(session_status.node_id) || !UUIDiszero(last_session_status.node_id) ||
!UUIDiszero(session_status.claim_id) || !UUIDiszero(last_session_status.claim_id);
const char *t = inicfg_get(&netdata_config, CONFIG_SECTION_GLOBAL, "crash reports", analytics ? "all" : "off");
const char *t = inicfg_get(&netdata_config, CONFIG_SECTION_GLOBAL, "crash reports", default_enabled ? "all" : "off");
enum crash_report_t rc;
if(!t || !*t)
rc = analytics ? DSF_REPORT_ALL : DSF_REPORT_DISABLED;
rc = default_enabled ? DSF_REPORT_ALL : DSF_REPORT_DISABLED;
else if(strcmp(t, "all") == 0)
rc = DSF_REPORT_ALL;
else if(strcmp(t, "crashes") == 0)

View file

@@ -42,11 +42,6 @@ void *pulse_thread_memory_extended_main(void *ptr);
#define p1_store(variable, value) __atomic_store_n(variable, value, __ATOMIC_RELAXED)
#define p1_load(variable) __atomic_load_n(variable, __ATOMIC_RELAXED)
#if !defined(PULSE_EXTENDED_STATISTICS) && (defined(NETDATA_INTERNAL_CHECKS) || defined(NETDATA_GOD_MODE))
#define PULSE_EXTENDED_STATISTICS
#endif
#if defined(PULSE_EXTENDED_STATISTICS)
#define p2_add_fetch(variable, value) __atomic_add_fetch(variable, value, __ATOMIC_RELAXED)
#define p2_sub_fetch(variable, value) __atomic_sub_fetch(variable, value, __ATOMIC_RELAXED)
@@ -55,15 +50,5 @@ void *pulse_thread_memory_extended_main(void *ptr);
#define p2_store(variable, value) __atomic_store_n(variable, value, __ATOMIC_RELAXED)
#define p2_load(variable) __atomic_load_n(variable, __ATOMIC_RELAXED)
#else
#define p2_add_fetch(variable, value) debug_dummy()
#define p2_sub_fetch(variable, value) debug_dummy()
#define p2_fetch_add(variable, value) debug_dummy()
#define p2_fetch_sub(variable, value) debug_dummy()
#define p2_store(variable, value) debug_dummy()
#define p2_load(variable) debug_dummy()
#endif
#endif /* NETDATA_PULSE_H */

View file

@@ -334,7 +334,8 @@ static inline void pgc_size_histogram_del(PGC *cache, struct pgc_size_histogram
// ----------------------------------------------------------------------------
// evictions control
static ALWAYS_INLINE int64_t pgc_threshold(ssize_t threshold, int64_t wanted, int64_t current, int64_t clean) {
ALWAYS_INLINE
static int64_t pgc_threshold(ssize_t threshold, int64_t wanted, int64_t current, int64_t clean) {
if(current < clean)
current = clean;
@@ -348,6 +349,18 @@ static ALWAYS_INLINE int64_t pgc_threshold(ssize_t threshold, int64_t wanted, in
return ret;
}
ALWAYS_INLINE
static int64_t pgc_wanted_size(const int64_t hot, const int64_t hot_max, const int64_t dirty_max, const int64_t index) {
// our promise to users
const int64_t max_size1 = MAX(hot_max, hot) * 2;
// protection against slow flushing
const int64_t max_size2 = hot_max + MAX(dirty_max * 2, hot_max * 2 / 3) + index;
// the final wanted cache size
return MIN(max_size1, max_size2);
}
static ssize_t cache_usage_per1000(PGC *cache, int64_t *size_to_evict) {
if(size_to_evict)
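
To make the arithmetic of the new `pgc_wanted_size()` helper concrete, here is a small standalone sketch; the sizes are hypothetical (in MiB), chosen only to walk through the formula:

```c
#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))

int main(void) {
    // hypothetical cache sizes in MiB; index_size corresponds to 'index' in the helper
    const int64_t hot = 100, hot_max = 120, dirty_max = 10, index_size = 5;

    const int64_t max_size1 = MAX(hot_max, hot) * 2;                                       // 240: our promise to users
    const int64_t max_size2 = hot_max + MAX(dirty_max * 2, hot_max * 2 / 3) + index_size;  // 120 + 80 + 5 = 205: slow-flushing protection
    const int64_t wanted = MIN(max_size1, max_size2);                                      // 205

    printf("wanted cache size = %lld MiB\n", (long long)wanted);
    return 0;
}
```

With these made-up figures the slow-flushing bound (205) is tighter than the doubled hot maximum (240). The refactor keeps the formula intact; it only lets the `dynamic_target_size_cb` path below feed it the current hot and dirty sizes instead of the historical maxima, per the "open cache" note in the commit message.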
@@ -372,20 +385,15 @@ static ssize_t cache_usage_per1000(PGC *cache, int64_t *size_to_evict) {
const int64_t dirty_max = __atomic_load_n(&cache->dirty.stats->max_size, __ATOMIC_RELAXED);
const int64_t hot_max = __atomic_load_n(&cache->hot.stats->max_size, __ATOMIC_RELAXED);
// our promise to users
const int64_t max_size1 = MAX(hot_max, hot) * 2;
// protection against slow flushing
const int64_t max_size2 = hot_max + ((dirty_max * 2 < hot_max * 2 / 3) ? hot_max * 2 / 3 : dirty_max * 2) + index;
// the final wanted cache size
wanted_cache_size = MIN(max_size1, max_size2);
if(cache->config.dynamic_target_size_cb) {
wanted_cache_size = pgc_wanted_size(hot, hot, dirty, index);
const int64_t wanted_cache_size_cb = cache->config.dynamic_target_size_cb();
if(wanted_cache_size_cb > wanted_cache_size)
wanted_cache_size = wanted_cache_size_cb;
}
else
wanted_cache_size = pgc_wanted_size(hot, hot_max, dirty_max, index);
if (wanted_cache_size < hot + dirty + index + cache->config.clean_size)
wanted_cache_size = hot + dirty + index + cache->config.clean_size;

View file

@@ -1329,7 +1329,7 @@ void journalfile_migrate_to_v2_callback(Word_t section, unsigned datafile_fileno
int fd_v2;
uint8_t *data_start = nd_mmap_advanced(path, total_file_size, MAP_SHARED, 0, false, true, &fd_v2);
if(!data_start)
fatal("DBENGINE: failed to memory map file '%s' of size %zu.", path, total_file_size);
out_of_memory(__FUNCTION__, total_file_size, path);
memset(data_start, 0, extent_offset);

View file

@@ -544,8 +544,7 @@ static ARAL_PAGE *aral_create_page___no_lock_needed(ARAL *ar, size_t size TRACE_
page->data =
nd_mmap_advanced(page->filename, size, MAP_SHARED, 0, false, ar->config.options & ARAL_DONT_DUMP, NULL);
if (unlikely(!page->data))
fatal("ARAL: '%s' cannot allocate aral buffer of size %zu on filename '%s'",
ar->config.name, size, page->filename);
out_of_memory(__FUNCTION__, size, page->filename);
total_size = size + sizeof(ARAL_PAGE);
stats = &ar->stats->mmap;

View file

@@ -8,7 +8,7 @@ void mallocz_register_out_of_memory_cb(out_of_memory_cb cb) {
}
ALWAYS_INLINE NORETURN
static void out_of_memory(const char *call, size_t size) {
void out_of_memory(const char *call, size_t size, const char *details) {
exit_initiated_add(EXIT_REASON_OUT_OF_MEMORY);
if(out_of_memory_callback)
@@ -33,10 +33,12 @@ static void out_of_memory(const char *call, size_t size) {
fatal("Out of memory on %s(%zu bytes)!\n"
"System memory available: %s, while our max RSS usage is: %s\n"
"O/S mmap limit: %llu, while our mmap count is: %zu",
"O/S mmap limit: %llu, while our mmap count is: %zu\n"
"Additional details: %s",
call, size,
mem_available, rss_used,
os_mmap_limit(), __atomic_load_n(&nd_mmap_count, __ATOMIC_RELAXED));
os_mmap_limit(), __atomic_load_n(&nd_mmap_count, __ATOMIC_RELAXED),
details ? details : "none");
}
// ----------------------------------------------------------------------------
@@ -429,7 +431,7 @@ char *strdupz(const char *s) {
char *t = strdup(s);
if (unlikely(!t))
out_of_memory(__FUNCTION__ , strlen(s) + 1);
out_of_memory(__FUNCTION__ , strlen(s) + 1, NULL);
return t;
}
@@ -440,7 +442,7 @@ char *strndupz(const char *s, size_t len) {
char *t = strndup(s, len);
if (unlikely(!t))
out_of_memory(__FUNCTION__ , len + 1);
out_of_memory(__FUNCTION__ , len + 1, NULL);
return t;
}
@@ -459,7 +461,7 @@ void *mallocz(size_t size) {
workers_memory_call(WORKERS_MEMORY_CALL_LIBC_MALLOC);
void *p = malloc(size);
if (unlikely(!p))
out_of_memory(__FUNCTION__, size);
out_of_memory(__FUNCTION__, size, NULL);
return p;
}
@@ -469,7 +471,7 @@ void *callocz(size_t nmemb, size_t size) {
workers_memory_call(WORKERS_MEMORY_CALL_LIBC_CALLOC);
void *p = calloc(nmemb, size);
if (unlikely(!p))
out_of_memory(__FUNCTION__, nmemb * size);
out_of_memory(__FUNCTION__, nmemb * size, NULL);
return p;
}
@@ -479,7 +481,7 @@ void *reallocz(void *ptr, size_t size) {
workers_memory_call(WORKERS_MEMORY_CALL_LIBC_REALLOC);
void *p = realloc(ptr, size);
if (unlikely(!p))
out_of_memory(__FUNCTION__, size);
out_of_memory(__FUNCTION__, size, NULL);
return p;
}
@@ -489,7 +491,7 @@ int posix_memalignz(void **memptr, size_t alignment, size_t size) {
workers_memory_call(WORKERS_MEMORY_CALL_LIBC_POSIX_MEMALIGN);
int rc = posix_memalign(memptr, alignment, size);
if(unlikely(rc))
out_of_memory(__FUNCTION__, size);
out_of_memory(__FUNCTION__, size, NULL);
return rc;
}

View file

@@ -67,4 +67,7 @@ void posix_memalign_freez(void *ptr);
typedef void (*out_of_memory_cb)(void);
void mallocz_register_out_of_memory_cb(out_of_memory_cb cb);
NORETURN
void out_of_memory(const char *call, size_t size, const char *details);
#endif //NETDATA_ND_MALLOCZ_H
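
For reference, a hedged usage sketch of the newly exported declaration, mirroring the dbengine journalfile call site changed earlier in this commit; `nd_mmap_advanced()` and the surrounding variables are netdata internals, so this is not a standalone program:

```c
// sketch: an mmap failure now reports which file could not be mapped
uint8_t *data_start = nd_mmap_advanced(path, total_file_size, MAP_SHARED, 0, false, true, &fd_v2);
if (!data_start)
    out_of_memory(__FUNCTION__, total_file_size, path);   // 'path' is printed as "Additional details: ..."
// callers without extra context (e.g. the mallocz/strdupz wrappers) pass NULL, which is printed as "none"
```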

View file

@@ -61,7 +61,7 @@ static void uuidmap_init_aral(void) {
static UUIDMAP_ID get_next_id_unsafe(struct uuidmap_partition *partition) {
// Check if we've reached the maximum ID value
if (partition->next_id >= 0x1FFFFFFF)
if (unlikely(partition->next_id >= 0x1FFFFFFF))
fatal("UUIDMAP: Maximum ID limit reached for partition %u. UUIDs exhausted.",
(unsigned int)(partition - uuid_map.p));