From 87dbbaadb625edb5d05c3605f7dd02fc0a18237c Mon Sep 17 00:00:00 2001 From: Costa Tsaousis <costa@netdata.cloud> Date: Mon, 17 Mar 2025 23:48:57 +0000 Subject: [PATCH] do not recurse cleanup on shutdown (#19894) * do not recurse cleanup on shutdown * make exit_initiated sigatomic_t and hide it behind a function * fix freebsd and windows --- src/collectors/cups.plugin/cups_plugin.c | 6 ++-- .../freebsd.plugin/plugin_freebsd.c | 2 +- src/collectors/nfacct.plugin/plugin_nfacct.c | 2 +- src/collectors/perf.plugin/perf_plugin.c | 2 +- .../xenstat.plugin/xenstat_plugin.c | 2 +- src/daemon/commands.c | 2 +- src/daemon/daemon-service.c | 2 +- src/daemon/daemon-shutdown.c | 32 ++++++++++++------- src/daemon/daemon-shutdown.h | 7 ++-- src/daemon/daemon-status-file.c | 2 +- src/daemon/daemon-systemd-watcher.c | 2 +- src/daemon/main.c | 2 +- src/daemon/signal-handler.c | 2 +- src/daemon/winsvc.cc | 2 +- src/database/engine/cache.c | 2 +- src/database/engine/page.c | 4 +-- src/health/rrdcalc.c | 2 +- src/libnetdata/exit/exit_initiated.c | 16 +++++++--- src/libnetdata/exit/exit_initiated.h | 3 +- src/web/server/web_client.c | 4 +-- 20 files changed, 54 insertions(+), 44 deletions(-) diff --git a/src/collectors/cups.plugin/cups_plugin.c b/src/collectors/cups.plugin/cups_plugin.c index 82bfffffa3..4b794d05e0 100644 --- a/src/collectors/cups.plugin/cups_plugin.c +++ b/src/collectors/cups.plugin/cups_plugin.c @@ -243,7 +243,7 @@ int main(int argc, char **argv) { for (iteration = 0; 1; iteration++) { heartbeat_next(&hb); - if (unlikely(exit_initiated)) + if (unlikely(exit_initiated_get())) break; reset_metrics(); @@ -315,7 +315,7 @@ int main(int argc, char **argv) { } cupsFreeDests(num_dest_total, dests); - if (unlikely(exit_initiated)) + if (unlikely(exit_initiated_get())) break; cups_job_t *jobs, *curr_job; @@ -410,7 +410,7 @@ int main(int argc, char **argv) { fflush(stdout); - if (unlikely(exit_initiated)) + if (unlikely(exit_initiated_get())) break; // restart check (14400 seconds) diff --git a/src/collectors/freebsd.plugin/plugin_freebsd.c b/src/collectors/freebsd.plugin/plugin_freebsd.c index c2a6d900f0..5a01436108 100644 --- a/src/collectors/freebsd.plugin/plugin_freebsd.c +++ b/src/collectors/freebsd.plugin/plugin_freebsd.c @@ -91,7 +91,7 @@ void *freebsd_main(void *ptr) // initialize FreeBSD plugin if (freebsd_plugin_init()) - netdata_cleanup_and_exit(EXIT_REASON_FATAL, NULL, NULL, NULL); + netdata_cleanup_and_exit_fatal(EXIT_REASON_FATAL); // check the enabled status for each module int i; diff --git a/src/collectors/nfacct.plugin/plugin_nfacct.c b/src/collectors/nfacct.plugin/plugin_nfacct.c index 18544abac5..f9bbe0efa8 100644 --- a/src/collectors/nfacct.plugin/plugin_nfacct.c +++ b/src/collectors/nfacct.plugin/plugin_nfacct.c @@ -837,7 +837,7 @@ int main(int argc, char **argv) { for(iteration = 0; 1; iteration++) { usec_t dt = heartbeat_next(&hb); - if(unlikely(exit_initiated)) break; + if(unlikely(exit_initiated_get())) break; if(debug && iteration) fprintf(stderr, "nfacct.plugin: iteration %zu, dt %"PRIu64" usec\n" diff --git a/src/collectors/perf.plugin/perf_plugin.c b/src/collectors/perf.plugin/perf_plugin.c index f69f9ab179..f139f3d277 100644 --- a/src/collectors/perf.plugin/perf_plugin.c +++ b/src/collectors/perf.plugin/perf_plugin.c @@ -1325,7 +1325,7 @@ int main(int argc, char **argv) { for(iteration = 0; 1; iteration++) { usec_t dt = heartbeat_next(&hb); - if (unlikely(exit_initiated)) + if (unlikely(exit_initiated_get())) break; if (unlikely(debug && iteration)) diff --git a/src/collectors/xenstat.plugin/xenstat_plugin.c b/src/collectors/xenstat.plugin/xenstat_plugin.c index d229009ce8..9fb55a63b0 100644 --- a/src/collectors/xenstat.plugin/xenstat_plugin.c +++ b/src/collectors/xenstat.plugin/xenstat_plugin.c @@ -1026,7 +1026,7 @@ int main(int argc, char **argv) { for(iteration = 0; 1; iteration++) { usec_t dt = heartbeat_next(&hb); - if(unlikely(exit_initiated)) break; + if(unlikely(exit_initiated_get())) break; if(unlikely(debug && iteration)) fprintf(stderr, "xenstat.plugin: iteration %zu, dt %lu usec\n", iteration, dt); diff --git a/src/daemon/commands.c b/src/daemon/commands.c index 44ff29aa80..0d53db16d5 100644 --- a/src/daemon/commands.c +++ b/src/daemon/commands.c @@ -164,7 +164,7 @@ static cmd_status_t cmd_exit_execute(char *args, char **message) nd_log_limits_unlimited(); netdata_log_info("COMMAND: Cleaning up to exit."); - netdata_cleanup_and_exit(EXIT_REASON_CMD_EXIT, NULL, NULL, NULL); + netdata_cleanup_and_exit_gracefully(EXIT_REASON_CMD_EXIT); exit(0); return CMD_STATUS_SUCCESS; diff --git a/src/daemon/daemon-service.c b/src/daemon/daemon-service.c index 9f1d30202b..8e0c68b291 100644 --- a/src/daemon/daemon-service.c +++ b/src/daemon/daemon-service.c @@ -90,7 +90,7 @@ bool service_running(SERVICE_TYPE service) { if (sth->type == SERVICE_THREAD_TYPE_NETDATA) cancelled = nd_thread_signaled_to_cancel(); - return !sth->stop_immediately && !exit_initiated && !cancelled; + return !sth->stop_immediately && !exit_initiated_get() && !cancelled; } void service_signal_exit(SERVICE_TYPE service) { diff --git a/src/daemon/daemon-shutdown.c b/src/daemon/daemon-shutdown.c index 942a7dd411..7c03998f31 100644 --- a/src/daemon/daemon-shutdown.c +++ b/src/daemon/daemon-shutdown.c @@ -38,7 +38,7 @@ extern struct netdata_static_thread *static_threads; void netdata_log_exit_reason(void) { CLEAN_BUFFER *wb = buffer_create(0, NULL); - EXIT_REASON_2buffer(wb, exit_initiated, ", "); + EXIT_REASON_2buffer(wb, exit_initiated_get(), ", "); ND_LOG_STACK lgs[] = { ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &netdata_exit_msgid), @@ -46,7 +46,7 @@ void netdata_log_exit_reason(void) { }; ND_LOG_STACK_PUSH(lgs); - nd_log(NDLS_DAEMON, is_exit_reason_normal(exit_initiated) ? NDLP_NOTICE : NDLP_CRIT, + nd_log(NDLS_DAEMON, is_exit_reason_normal(exit_initiated_get()) ? NDLP_NOTICE : NDLP_CRIT, "NETDATA SHUTDOWN: initializing shutdown with code due to: %s", buffer_tostring(wb)); } @@ -109,8 +109,7 @@ static void *rrdeng_exit_background(void *ptr) { } #ifdef ENABLE_DBENGINE -static void rrdeng_flush_everything_and_wait(bool wait_flush, bool wait_collectors, bool dirty_only) -{ +static void rrdeng_flush_everything_and_wait(bool wait_flush, bool wait_collectors, bool dirty_only) { static size_t starting_size_to_flush = 0; if(!pgc_hot_and_dirty_entries(main_cache)) @@ -172,9 +171,12 @@ static void rrdeng_flush_everything_and_wait(bool wait_flush, bool wait_collecto } #endif -void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char *action_result, const char *action_data) { +#if !defined(OS_WINDOWS) +NORETURN +#endif +static void netdata_cleanup_and_exit(EXIT_REASON reason) { exit_initiated_set(reason); - int ret = is_exit_reason_normal(exit_initiated) ? 0 : 1; + int ret = is_exit_reason_normal(exit_initiated_get()) ? 0 : 1; // don't recurse (due to a fatal, while exiting) static bool run = false; @@ -198,13 +200,9 @@ void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char rrdeng_flush_everything_and_wait(false, false, true); #endif - // send the stat from our caller - analytics_statistic_t statistic = { action, action_result, action_data }; - analytics_statistic_send(&statistic); - // notify we are exiting - statistic = (analytics_statistic_t) {"EXIT", ret?"ERROR":"OK","-"}; - analytics_statistic_send(&statistic); + //analytics_statistic_t statistic = (analytics_statistic_t) {"EXIT", ret?"ERROR":"OK","-"}; + //analytics_statistic_send(&statistic); netdata_main_spawn_server_cleanup(); watcher_step_complete(WATCHER_STEP_ID_DESTROY_MAIN_SPAWN_SERVER); @@ -393,3 +391,13 @@ void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char } #endif } + +void netdata_cleanup_and_exit_gracefully(EXIT_REASON reason) { + exit_initiated_add(reason); + FUNCTION_RUN_ONCE(); + netdata_cleanup_and_exit(reason); +} + +void netdata_cleanup_and_exit_fatal(EXIT_REASON reason) { + netdata_cleanup_and_exit(reason); +} diff --git a/src/daemon/daemon-shutdown.h b/src/daemon/daemon-shutdown.h index c5de7ac1ad..a3276dbeeb 100644 --- a/src/daemon/daemon-shutdown.h +++ b/src/daemon/daemon-shutdown.h @@ -10,10 +10,7 @@ void cancel_main_threads(void); void abort_on_fatal_disable(void); void abort_on_fatal_enable(void); -#ifdef OS_WINDOWS -void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char *action_result, const char *action_data); -#else -void netdata_cleanup_and_exit(EXIT_REASON reason, const char *action, const char *action_result, const char *action_data) NORETURN; -#endif +void netdata_cleanup_and_exit_gracefully(EXIT_REASON reason); +void netdata_cleanup_and_exit_fatal(EXIT_REASON reason); #endif //NETDATA_DAEMON_SHUTDOWN_H diff --git a/src/daemon/daemon-status-file.c b/src/daemon/daemon-status-file.c index 274028d88a..38bbbf6170 100644 --- a/src/daemon/daemon-status-file.c +++ b/src/daemon/daemon-status-file.c @@ -614,7 +614,7 @@ static void daemon_status_file_refresh(DAEMON_STATUS status) { get_daemon_status_fields_from_system_info(&session_status); - session_status.exit_reason = exit_initiated; + session_status.exit_reason = exit_initiated_get(); session_status.profile = nd_profile_detect_and_configure(false); if(status != DAEMON_STATUS_NONE) diff --git a/src/daemon/daemon-systemd-watcher.c b/src/daemon/daemon-systemd-watcher.c index 7de847771b..d283663d35 100644 --- a/src/daemon/daemon-systemd-watcher.c +++ b/src/daemon/daemon-systemd-watcher.c @@ -28,7 +28,7 @@ static int shutdown_event_handler(sd_bus_message *m, void *userdata __maybe_unus shutdown ? "true" : "false"); if(shutdown) - netdata_cleanup_and_exit(EXIT_REASON_SYSTEM_SHUTDOWN, NULL, NULL, NULL); + netdata_cleanup_and_exit_gracefully(EXIT_REASON_SYSTEM_SHUTDOWN); return 0; } diff --git a/src/daemon/main.c b/src/daemon/main.c index dbb430658e..d70b7189c7 100644 --- a/src/daemon/main.c +++ b/src/daemon/main.c @@ -240,7 +240,7 @@ int unittest_prepare_rrd(const char **user) { } static void fatal_cleanup_and_exit_cb(void) { - netdata_cleanup_and_exit(EXIT_REASON_FATAL, "fatal error", "exiting", NULL); + netdata_cleanup_and_exit_fatal(EXIT_REASON_FATAL); exit(1); } diff --git a/src/daemon/signal-handler.c b/src/daemon/signal-handler.c index 6d5579effc..3d7d8b8af4 100644 --- a/src/daemon/signal-handler.c +++ b/src/daemon/signal-handler.c @@ -237,7 +237,7 @@ static void process_triggered_signals(void) { nd_log_limits_unlimited(); netdata_log_info("SIGNAL: Received %s. Cleaning up to exit...", name); commands_exit(); - netdata_cleanup_and_exit(signals_waiting[i].reason, NULL, NULL, NULL); + netdata_cleanup_and_exit_gracefully(signals_waiting[i].reason); exit(0); break; diff --git a/src/daemon/winsvc.cc b/src/daemon/winsvc.cc index c177785e94..4e570c1f23 100644 --- a/src/daemon/winsvc.cc +++ b/src/daemon/winsvc.cc @@ -109,7 +109,7 @@ static void *call_netdata_cleanup(void *arg) reason = EXIT_REASON_SERVICE_STOP; break; } - netdata_cleanup_and_exit(reason, NULL, NULL, NULL); + netdata_cleanup_and_exit_gracefully(reason); // Close event handle netdata_service_log("Closing stop event handle..."); diff --git a/src/database/engine/cache.c b/src/database/engine/cache.c index 34242cf258..48da8123ea 100644 --- a/src/database/engine/cache.c +++ b/src/database/engine/cache.c @@ -2312,7 +2312,7 @@ bool pgc_flush_pages(PGC *cache) { } void pgc_page_hot_set_end_time_s(PGC *cache __maybe_unused, PGC_PAGE *page, time_t end_time_s, size_t additional_bytes) { - internal_fatal(!is_page_hot(page) && !exit_initiated, + internal_fatal(!is_page_hot(page) && !exit_initiated_get(), "DBENGINE CACHE: end_time_s update on non-hot page"); internal_fatal(end_time_s < __atomic_load_n(&page->end_time_s, __ATOMIC_RELAXED), diff --git a/src/database/engine/page.c b/src/database/engine/page.c index 9fa3634aef..376ae8d358 100644 --- a/src/database/engine/page.c +++ b/src/database/engine/page.c @@ -897,14 +897,14 @@ size_t pgd_append_point( uint32_t expected_slot) { if (pg->states & PGD_STATE_SCHEDULED_FOR_FLUSHING) { - if(exit_initiated == EXIT_REASON_NONE) + if(exit_initiated_get() == EXIT_REASON_NONE) pgd_fatal(pg, "Data collection on page already scheduled for flushing"); else return 0; } if (!(pg->states & PGD_STATE_CREATED_FROM_COLLECTOR)) { - if(exit_initiated == EXIT_REASON_NONE) + if(exit_initiated_get() == EXIT_REASON_NONE) pgd_fatal(pg, "DBENGINE: collection on page not created from a collector"); else return 0; diff --git a/src/health/rrdcalc.c b/src/health/rrdcalc.c index 61822febc8..08e8b6ff14 100644 --- a/src/health/rrdcalc.c +++ b/src/health/rrdcalc.c @@ -250,7 +250,7 @@ static void rrdcalc_link_to_rrdset(RRDCALC *rc) { static void rrdcalc_unlink_from_rrdset(RRDCALC *rc, bool having_ll_wrlock) { RRDSET *st = rc->rrdset; - if (!exit_initiated) { + if (!exit_initiated_get()) { RRDHOST *host = st->rrdhost; time_t now = now_realtime_sec(); diff --git a/src/libnetdata/exit/exit_initiated.c b/src/libnetdata/exit/exit_initiated.c index adb871edef..108e30653f 100644 --- a/src/libnetdata/exit/exit_initiated.c +++ b/src/libnetdata/exit/exit_initiated.c @@ -2,7 +2,7 @@ #include "../libnetdata.h" -volatile EXIT_REASON exit_initiated = EXIT_REASON_NONE; +static volatile sig_atomic_t exit_initiated = EXIT_REASON_NONE; ENUM_STR_MAP_DEFINE(EXIT_REASON) = { { EXIT_REASON_SIGBUS, "signal-bus-error"}, @@ -108,15 +108,22 @@ void exit_initiated_init(void) { self = os_get_file_metadata(self_path); } +ALWAYS_INLINE +EXIT_REASON exit_initiated_get(void) { + return (EXIT_REASON)exit_initiated; +} + void exit_initiated_add(EXIT_REASON reason) { - exit_initiated |= reason; + exit_initiated |= (sig_atomic_t)reason; } void exit_initiated_set(EXIT_REASON reason) { - if(exit_initiated == EXIT_REASON_NONE && !(reason & EXIT_REASON_SYSTEM_SHUTDOWN) && is_system_shutdown()) + EXIT_REASON old = exit_initiated_get(); + + if(old == EXIT_REASON_NONE && !(reason & EXIT_REASON_SYSTEM_SHUTDOWN) && is_system_shutdown()) reason |= EXIT_REASON_SYSTEM_SHUTDOWN; - if(exit_initiated == EXIT_REASON_NONE && self_path && OS_FILE_METADATA_OK(self)) { + if(old == EXIT_REASON_NONE && self_path && OS_FILE_METADATA_OK(self)) { OS_FILE_METADATA self_now = os_get_file_metadata(self_path); if(OS_FILE_METADATA_OK(self_now) && (self_now.modified_time != self.modified_time || self_now.size_bytes != self.size_bytes)) reason |= EXIT_REASON_UPDATE; @@ -127,4 +134,3 @@ void exit_initiated_set(EXIT_REASON reason) { // we will have all of them exit_initiated_add(reason); } - diff --git a/src/libnetdata/exit/exit_initiated.h b/src/libnetdata/exit/exit_initiated.h index eebe8e90c9..3fcc445717 100644 --- a/src/libnetdata/exit/exit_initiated.h +++ b/src/libnetdata/exit/exit_initiated.h @@ -82,9 +82,8 @@ typedef enum { typedef struct web_buffer BUFFER; BITMAP_STR_DEFINE_FUNCTIONS_EXTERN(EXIT_REASON); -extern volatile EXIT_REASON exit_initiated; - void exit_initiated_init(void); +EXIT_REASON exit_initiated_get(void); void exit_initiated_set(EXIT_REASON reason); void exit_initiated_add(EXIT_REASON reason); diff --git a/src/web/server/web_client.c b/src/web/server/web_client.c index f58ff31233..582bf30944 100644 --- a/src/web/server/web_client.c +++ b/src/web/server/web_client.c @@ -1185,13 +1185,13 @@ static inline int web_client_process_url(RRDHOST *host, struct web_client *w, ch w->response.data->content_type = CT_TEXT_PLAIN; buffer_flush(w->response.data); - if(!exit_initiated) + if(!exit_initiated_get()) buffer_strcat(w->response.data, "ok, will do..."); else buffer_strcat(w->response.data, "I am doing it already"); netdata_log_error("web request to exit received."); - netdata_cleanup_and_exit(EXIT_REASON_API_QUIT, NULL, NULL, NULL); + netdata_cleanup_and_exit_gracefully(EXIT_REASON_API_QUIT); return HTTP_RESP_OK; } else if(unlikely(hash == hash_debug && strcmp(tok, "debug") == 0)) {