From 87dbbaadb625edb5d05c3605f7dd02fc0a18237c Mon Sep 17 00:00:00 2001
From: Costa Tsaousis <costa@netdata.cloud>
Date: Mon, 17 Mar 2025 23:48:57 +0000
Subject: [PATCH] do not recurse cleanup on shutdown (#19894)

* do not recurse cleanup on shutdown

* make exit_initiated sigatomic_t and hide it behind a function

* fix freebsd and windows
---
 src/collectors/cups.plugin/cups_plugin.c      |  6 ++--
 .../freebsd.plugin/plugin_freebsd.c           |  2 +-
 src/collectors/nfacct.plugin/plugin_nfacct.c  |  2 +-
 src/collectors/perf.plugin/perf_plugin.c      |  2 +-
 .../xenstat.plugin/xenstat_plugin.c           |  2 +-
 src/daemon/commands.c                         |  2 +-
 src/daemon/daemon-service.c                   |  2 +-
 src/daemon/daemon-shutdown.c                  | 32 ++++++++++++-------
 src/daemon/daemon-shutdown.h                  |  7 ++--
 src/daemon/daemon-status-file.c               |  2 +-
 src/daemon/daemon-systemd-watcher.c           |  2 +-
 src/daemon/main.c                             |  2 +-
 src/daemon/signal-handler.c                   |  2 +-
 src/daemon/winsvc.cc                          |  2 +-
 src/database/engine/cache.c                   |  2 +-
 src/database/engine/page.c                    |  4 +--
 src/health/rrdcalc.c                          |  2 +-
 src/libnetdata/exit/exit_initiated.c          | 16 +++++++---
 src/libnetdata/exit/exit_initiated.h          |  3 +-
 src/web/server/web_client.c                   |  4 +--
 20 files changed, 54 insertions(+), 44 deletions(-)

diff --git a/src/collectors/cups.plugin/cups_plugin.c b/src/collectors/cups.plugin/cups_plugin.c
index 82bfffffa3..4b794d05e0 100644
--- a/src/collectors/cups.plugin/cups_plugin.c
+++ b/src/collectors/cups.plugin/cups_plugin.c
@@ -243,7 +243,7 @@ int main(int argc, char **argv) {
     for (iteration = 0; 1; iteration++) {
         heartbeat_next(&hb);
 
-        if (unlikely(exit_initiated))
+        if (unlikely(exit_initiated_get()))
             break;
 
         reset_metrics();
@@ -315,7 +315,7 @@ int main(int argc, char **argv) {
         }
         cupsFreeDests(num_dest_total, dests);
 
-        if (unlikely(exit_initiated))
+        if (unlikely(exit_initiated_get()))
             break;
 
         cups_job_t *jobs, *curr_job;
@@ -410,7 +410,7 @@ int main(int argc, char **argv) {
 
         fflush(stdout);
 
-        if (unlikely(exit_initiated))
+        if (unlikely(exit_initiated_get()))
             break;
 
         // restart check (14400 seconds)
diff --git a/src/collectors/freebsd.plugin/plugin_freebsd.c b/src/collectors/freebsd.plugin/plugin_freebsd.c
index c2a6d900f0..5a01436108 100644
--- a/src/collectors/freebsd.plugin/plugin_freebsd.c
+++ b/src/collectors/freebsd.plugin/plugin_freebsd.c
@@ -91,7 +91,7 @@ void *freebsd_main(void *ptr)
 
     // initialize FreeBSD plugin
     if (freebsd_plugin_init())
-        netdata_cleanup_and_exit(EXIT_REASON_FATAL, NULL, NULL, NULL);
+        netdata_cleanup_and_exit_fatal(EXIT_REASON_FATAL);
 
     // check the enabled status for each module
     int i;
diff --git a/src/collectors/nfacct.plugin/plugin_nfacct.c b/src/collectors/nfacct.plugin/plugin_nfacct.c
index 18544abac5..f9bbe0efa8 100644
--- a/src/collectors/nfacct.plugin/plugin_nfacct.c
+++ b/src/collectors/nfacct.plugin/plugin_nfacct.c
@@ -837,7 +837,7 @@ int main(int argc, char **argv) {
     for(iteration = 0; 1; iteration++) {
         usec_t dt = heartbeat_next(&hb);
 
-        if(unlikely(exit_initiated)) break;
+        if(unlikely(exit_initiated_get())) break;
 
         if(debug && iteration)
             fprintf(stderr, "nfacct.plugin: iteration %zu, dt %"PRIu64" usec\n"
diff --git a/src/collectors/perf.plugin/perf_plugin.c b/src/collectors/perf.plugin/perf_plugin.c
index f69f9ab179..f139f3d277 100644
--- a/src/collectors/perf.plugin/perf_plugin.c
+++ b/src/collectors/perf.plugin/perf_plugin.c
@@ -1325,7 +1325,7 @@ int main(int argc, char **argv) {
     for(iteration = 0; 1; iteration++) {
         usec_t dt = heartbeat_next(&hb);
 
-        if (unlikely(exit_initiated))
+        if (unlikely(exit_initiated_get()))
             break;
 
         if (unlikely(debug && iteration))
diff --git a/src/collectors/xenstat.plugin/xenstat_plugin.c b/src/collectors/xenstat.plugin/xenstat_plugin.c
index d229009ce8..9fb55a63b0 100644
--- a/src/collectors/xenstat.plugin/xenstat_plugin.c
+++ b/src/collectors/xenstat.plugin/xenstat_plugin.c
@@ -1026,7 +1026,7 @@ int main(int argc, char **argv) {
     for(iteration = 0; 1; iteration++) {
         usec_t dt = heartbeat_next(&hb);
 
-        if(unlikely(exit_initiated)) break;
+        if(unlikely(exit_initiated_get())) break;
 
         if(unlikely(debug && iteration))
             fprintf(stderr, "xenstat.plugin: iteration %zu, dt %lu usec\n", iteration, dt);
diff --git a/src/daemon/commands.c b/src/daemon/commands.c
index 44ff29aa80..0d53db16d5 100644
--- a/src/daemon/commands.c
+++ b/src/daemon/commands.c
@@ -164,7 +164,7 @@ static cmd_status_t cmd_exit_execute(char *args, char **message)
 
     nd_log_limits_unlimited();
     netdata_log_info("COMMAND: Cleaning up to exit.");
-    netdata_cleanup_and_exit(EXIT_REASON_CMD_EXIT, NULL, NULL, NULL);
+    netdata_cleanup_and_exit_gracefully(EXIT_REASON_CMD_EXIT);
     exit(0);
 
     return CMD_STATUS_SUCCESS;
diff --git a/src/daemon/daemon-service.c b/src/daemon/daemon-service.c
index 9f1d30202b..8e0c68b291 100644
--- a/src/daemon/daemon-service.c
+++ b/src/daemon/daemon-service.c
@@ -90,7 +90,7 @@ bool service_running(SERVICE_TYPE service) {
     if (sth->type == SERVICE_THREAD_TYPE_NETDATA)
         cancelled = nd_thread_signaled_to_cancel();
 
-    return !sth->stop_immediately && !exit_initiated && !cancelled;
+    return !sth->stop_immediately && !exit_initiated_get() && !cancelled;
 }
 
 void service_signal_exit(SERVICE_TYPE service) {
diff --git a/src/daemon/daemon-shutdown.c b/src/daemon/daemon-shutdown.c
index 942a7dd411..7c03998f31 100644
--- a/src/daemon/daemon-shutdown.c
+++ b/src/daemon/daemon-shutdown.c
@@ -38,7 +38,7 @@ extern struct netdata_static_thread *static_threads;
 
 void netdata_log_exit_reason(void) {
     CLEAN_BUFFER *wb = buffer_create(0, NULL);
-    EXIT_REASON_2buffer(wb, exit_initiated, ", ");
+    EXIT_REASON_2buffer(wb, exit_initiated_get(), ", ");
 
     ND_LOG_STACK lgs[] = {
         ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &netdata_exit_msgid),
@@ -46,7 +46,7 @@ void netdata_log_exit_reason(void) {
     };
     ND_LOG_STACK_PUSH(lgs);
 
-    nd_log(NDLS_DAEMON, is_exit_reason_normal(exit_initiated) ? NDLP_NOTICE : NDLP_CRIT,
+    nd_log(NDLS_DAEMON, is_exit_reason_normal(exit_initiated_get()) ? NDLP_NOTICE : NDLP_CRIT,
            "NETDATA SHUTDOWN: initializing shutdown with code due to: %s",
            buffer_tostring(wb));
 }
@@ -109,8 +109,7 @@ static void *rrdeng_exit_background(void *ptr) {
 }
 
 #ifdef ENABLE_DBENGINE
-static void rrdeng_flush_everything_and_wait(bool wait_flush, bool wait_collectors, bool dirty_only)
-{
+static void rrdeng_flush_everything_and_wait(bool wait_flush, bool wait_collectors, bool dirty_only) {
     static size_t starting_size_to_flush = 0;
 
     if(!pgc_hot_and_dirty_entries(main_cache))
@@ -172,9 +171,12 @@ static void rrdeng_flush_everything_and_wait(bool wait_flush, bool wait_collecto
 }
 #endif
 
-void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char *action_result, const char *action_data) {
+#if !defined(OS_WINDOWS)
+NORETURN
+#endif
+static void netdata_cleanup_and_exit(EXIT_REASON reason) {
     exit_initiated_set(reason);
-    int ret = is_exit_reason_normal(exit_initiated) ? 0 : 1;
+    int ret = is_exit_reason_normal(exit_initiated_get()) ? 0 : 1;
 
     // don't recurse (due to a fatal, while exiting)
     static bool run = false;
@@ -198,13 +200,9 @@ void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char
         rrdeng_flush_everything_and_wait(false, false, true);
 #endif
 
-    // send the stat from our caller
-    analytics_statistic_t statistic = { action, action_result, action_data };
-    analytics_statistic_send(&statistic);
-
     // notify we are exiting
-    statistic = (analytics_statistic_t) {"EXIT", ret?"ERROR":"OK","-"};
-    analytics_statistic_send(&statistic);
+    //analytics_statistic_t statistic = (analytics_statistic_t) {"EXIT", ret?"ERROR":"OK","-"};
+    //analytics_statistic_send(&statistic);
 
     netdata_main_spawn_server_cleanup();
     watcher_step_complete(WATCHER_STEP_ID_DESTROY_MAIN_SPAWN_SERVER);
@@ -393,3 +391,13 @@ void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char
     }
 #endif
 }
+
+void netdata_cleanup_and_exit_gracefully(EXIT_REASON reason) {
+    exit_initiated_add(reason);
+    FUNCTION_RUN_ONCE();
+    netdata_cleanup_and_exit(reason);
+}
+
+void netdata_cleanup_and_exit_fatal(EXIT_REASON reason) {
+    netdata_cleanup_and_exit(reason);
+}
diff --git a/src/daemon/daemon-shutdown.h b/src/daemon/daemon-shutdown.h
index c5de7ac1ad..a3276dbeeb 100644
--- a/src/daemon/daemon-shutdown.h
+++ b/src/daemon/daemon-shutdown.h
@@ -10,10 +10,7 @@ void cancel_main_threads(void);
 void abort_on_fatal_disable(void);
 void abort_on_fatal_enable(void);
 
-#ifdef OS_WINDOWS
-void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char *action_result, const char *action_data);
-#else
-void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char *action_result, const char *action_data) NORETURN;
-#endif
+void netdata_cleanup_and_exit_gracefully(EXIT_REASON reason);
+void netdata_cleanup_and_exit_fatal(EXIT_REASON reason);
 
 #endif //NETDATA_DAEMON_SHUTDOWN_H
diff --git a/src/daemon/daemon-status-file.c b/src/daemon/daemon-status-file.c
index 274028d88a..38bbbf6170 100644
--- a/src/daemon/daemon-status-file.c
+++ b/src/daemon/daemon-status-file.c
@@ -614,7 +614,7 @@ static void daemon_status_file_refresh(DAEMON_STATUS status) {
 
     get_daemon_status_fields_from_system_info(&session_status);
 
-    session_status.exit_reason = exit_initiated;
+    session_status.exit_reason = exit_initiated_get();
     session_status.profile = nd_profile_detect_and_configure(false);
 
     if(status != DAEMON_STATUS_NONE)
diff --git a/src/daemon/daemon-systemd-watcher.c b/src/daemon/daemon-systemd-watcher.c
index 7de847771b..d283663d35 100644
--- a/src/daemon/daemon-systemd-watcher.c
+++ b/src/daemon/daemon-systemd-watcher.c
@@ -28,7 +28,7 @@ static int shutdown_event_handler(sd_bus_message *m, void *userdata __maybe_unus
            shutdown ? "true" : "false");
 
     if(shutdown)
-        netdata_cleanup_and_exit(EXIT_REASON_SYSTEM_SHUTDOWN, NULL, NULL, NULL);
+        netdata_cleanup_and_exit_gracefully(EXIT_REASON_SYSTEM_SHUTDOWN);
 
     return 0;
 }
diff --git a/src/daemon/main.c b/src/daemon/main.c
index dbb430658e..d70b7189c7 100644
--- a/src/daemon/main.c
+++ b/src/daemon/main.c
@@ -240,7 +240,7 @@ int unittest_prepare_rrd(const char **user) {
 }
 
 static void fatal_cleanup_and_exit_cb(void) {
-    netdata_cleanup_and_exit(EXIT_REASON_FATAL, "fatal error", "exiting", NULL);
+    netdata_cleanup_and_exit_fatal(EXIT_REASON_FATAL);
     exit(1);
 }
 
diff --git a/src/daemon/signal-handler.c b/src/daemon/signal-handler.c
index 6d5579effc..3d7d8b8af4 100644
--- a/src/daemon/signal-handler.c
+++ b/src/daemon/signal-handler.c
@@ -237,7 +237,7 @@ static void process_triggered_signals(void) {
                     nd_log_limits_unlimited();
                     netdata_log_info("SIGNAL: Received %s. Cleaning up to exit...", name);
                     commands_exit();
-                    netdata_cleanup_and_exit(signals_waiting[i].reason, NULL, NULL, NULL);
+                    netdata_cleanup_and_exit_gracefully(signals_waiting[i].reason);
                     exit(0);
                     break;
 
diff --git a/src/daemon/winsvc.cc b/src/daemon/winsvc.cc
index c177785e94..4e570c1f23 100644
--- a/src/daemon/winsvc.cc
+++ b/src/daemon/winsvc.cc
@@ -109,7 +109,7 @@ static void *call_netdata_cleanup(void *arg)
             reason = EXIT_REASON_SERVICE_STOP;
             break;
     }
-    netdata_cleanup_and_exit(reason, NULL, NULL, NULL);
+    netdata_cleanup_and_exit_gracefully(reason);
 
     // Close event handle
     netdata_service_log("Closing stop event handle...");
diff --git a/src/database/engine/cache.c b/src/database/engine/cache.c
index 34242cf258..48da8123ea 100644
--- a/src/database/engine/cache.c
+++ b/src/database/engine/cache.c
@@ -2312,7 +2312,7 @@ bool pgc_flush_pages(PGC *cache) {
 }
 
 void pgc_page_hot_set_end_time_s(PGC *cache __maybe_unused, PGC_PAGE *page, time_t end_time_s, size_t additional_bytes) {
-    internal_fatal(!is_page_hot(page) && !exit_initiated,
+    internal_fatal(!is_page_hot(page) && !exit_initiated_get(),
                    "DBENGINE CACHE: end_time_s update on non-hot page");
 
     internal_fatal(end_time_s < __atomic_load_n(&page->end_time_s, __ATOMIC_RELAXED),
diff --git a/src/database/engine/page.c b/src/database/engine/page.c
index 9fa3634aef..376ae8d358 100644
--- a/src/database/engine/page.c
+++ b/src/database/engine/page.c
@@ -897,14 +897,14 @@ size_t pgd_append_point(
     uint32_t expected_slot)
 {
     if (pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING) {
-        if(exit_initiated == EXIT_REASON_NONE)
+        if(exit_initiated_get() == EXIT_REASON_NONE)
             pgd_fatal(pg, "Data collection on page already scheduled for flushing");
         else
             return 0;
     }
 
     if (!(pg->states & PGD_STATE_CREATED_FROM_COLLECTOR)) {
-        if(exit_initiated == EXIT_REASON_NONE)
+        if(exit_initiated_get() == EXIT_REASON_NONE)
             pgd_fatal(pg, "DBENGINE: collection on page not created from a collector");
         else
             return 0;
diff --git a/src/health/rrdcalc.c b/src/health/rrdcalc.c
index 61822febc8..08e8b6ff14 100644
--- a/src/health/rrdcalc.c
+++ b/src/health/rrdcalc.c
@@ -250,7 +250,7 @@ static void rrdcalc_link_to_rrdset(RRDCALC *rc) {
 static void rrdcalc_unlink_from_rrdset(RRDCALC *rc, bool having_ll_wrlock) {
     RRDSET *st = rc->rrdset;
 
-    if (!exit_initiated) {
+    if (!exit_initiated_get()) {
         RRDHOST *host = st->rrdhost;
 
         time_t now = now_realtime_sec();
diff --git a/src/libnetdata/exit/exit_initiated.c b/src/libnetdata/exit/exit_initiated.c
index adb871edef..108e30653f 100644
--- a/src/libnetdata/exit/exit_initiated.c
+++ b/src/libnetdata/exit/exit_initiated.c
@@ -2,7 +2,7 @@
 
 #include "../libnetdata.h"
 
-volatile EXIT_REASON exit_initiated = EXIT_REASON_NONE;
+static volatile sig_atomic_t exit_initiated = EXIT_REASON_NONE;
 
 ENUM_STR_MAP_DEFINE(EXIT_REASON) = {
     { EXIT_REASON_SIGBUS, "signal-bus-error"},
@@ -108,15 +108,22 @@ void exit_initiated_init(void) {
         self = os_get_file_metadata(self_path);
 }
 
+ALWAYS_INLINE
+EXIT_REASON exit_initiated_get(void) {
+    return (EXIT_REASON)exit_initiated;
+}
+
 void exit_initiated_add(EXIT_REASON reason) {
-    exit_initiated |= reason;
+    exit_initiated |= (sig_atomic_t)reason;
 }
 
 void exit_initiated_set(EXIT_REASON reason) {
-    if(exit_initiated == EXIT_REASON_NONE && !(reason & EXIT_REASON_SYSTEM_SHUTDOWN) && is_system_shutdown())
+    EXIT_REASON old = exit_initiated_get();
+
+    if(old == EXIT_REASON_NONE && !(reason & EXIT_REASON_SYSTEM_SHUTDOWN) && is_system_shutdown())
         reason |= EXIT_REASON_SYSTEM_SHUTDOWN;
 
-    if(exit_initiated == EXIT_REASON_NONE && self_path && OS_FILE_METADATA_OK(self)) {
+    if(old == EXIT_REASON_NONE && self_path && OS_FILE_METADATA_OK(self)) {
         OS_FILE_METADATA self_now = os_get_file_metadata(self_path);
         if(OS_FILE_METADATA_OK(self_now) && (self_now.modified_time != self.modified_time || self_now.size_bytes != self.size_bytes))
             reason |= EXIT_REASON_UPDATE;
@@ -127,4 +134,3 @@ void exit_initiated_set(EXIT_REASON reason) {
     // we will have all of them
     exit_initiated_add(reason);
 }
-
diff --git a/src/libnetdata/exit/exit_initiated.h b/src/libnetdata/exit/exit_initiated.h
index eebe8e90c9..3fcc445717 100644
--- a/src/libnetdata/exit/exit_initiated.h
+++ b/src/libnetdata/exit/exit_initiated.h
@@ -82,9 +82,8 @@ typedef enum {
 typedef struct web_buffer BUFFER;
 BITMAP_STR_DEFINE_FUNCTIONS_EXTERN(EXIT_REASON);
 
-extern volatile EXIT_REASON exit_initiated;
-
 void exit_initiated_init(void);
+EXIT_REASON exit_initiated_get(void);
 void exit_initiated_set(EXIT_REASON reason);
 void exit_initiated_add(EXIT_REASON reason);
 
diff --git a/src/web/server/web_client.c b/src/web/server/web_client.c
index f58ff31233..582bf30944 100644
--- a/src/web/server/web_client.c
+++ b/src/web/server/web_client.c
@@ -1185,13 +1185,13 @@ static inline int web_client_process_url(RRDHOST *host, struct web_client *w, ch
             w->response.data->content_type = CT_TEXT_PLAIN;
             buffer_flush(w->response.data);
 
-            if(!exit_initiated)
+            if(!exit_initiated_get())
                 buffer_strcat(w->response.data, "ok, will do...");
             else
                 buffer_strcat(w->response.data, "I am doing it already");
 
             netdata_log_error("web request to exit received.");
-            netdata_cleanup_and_exit(EXIT_REASON_API_QUIT, NULL, NULL, NULL);
+            netdata_cleanup_and_exit_gracefully(EXIT_REASON_API_QUIT);
             return HTTP_RESP_OK;
         }
         else if(unlikely(hash == hash_debug && strcmp(tok, "debug") == 0)) {