0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-06 22:38:55 +00:00
netdata_netdata/src/daemon/pulse/pulse-aral.c
Costa Tsaousis 5f72d4279b
Streaming improvements No 3 ()
* ML uses synchronous queries

* do not call malloc_trim() to free memory, since it locks everything

* Reschedule dimensions for training from worker threads.

* when we collect or read from the database, it is SAMPLES. When we generate points for a chart, it is POINTS

* keep the receiver send buffer 10x the default

* support autoscaling stream circular buffers

* nd_poll() prefers sending data vs receiving data - in an attempt to dequeue as soon as possible

* fix last commit

* allow removing receiver and senders inline, if the stream thread is not working on them

* fix logs

* Revert "nd_poll() prefers sending data vs receiving data - in an attempt to dequeue as soon as possible"

This reverts commit 51539a97da.

* do not access receiver or sender after it has been removed

* open cache hot2clean

* open cache hot2clean does not need flushing

* use aral for extent pages up to 65k

* track aral malloc and mmap allocations separately; add 8192 as a possible value to PGD

* do not evict too frequently if not needed

* fix aral metrics

* fix aral metrics again

* accurate accounting of memory for dictionaries, strings, labels and MRG

* log during shutdown the progress of dbengine flushing

* move metasync shutdown after dbengine

* max iterations per I/O events

* max iterations per I/O events - break the loop

* max iterations per I/O events - break the loop - again

* disable inline evictions for all caches

* when writing to sockets, send everything that can be sent

* cleanup code to trigger evictions

* fix calculation of eviction size

* fix calculation of eviction size once more

* fix calculation of eviction size once more - again

* ml and replication stop while backfilling is running

* process opcodes while draining the sockets; log with limit when asking to disconnect a node

* fix log

* ml stops when replication queries are running

* report pgd_padding to pulse

* aral precise memory accounting

* removed all alignas() and fixed the 2 issues that resulted in unaligned memory accesses (one in mqtt and another in streaming)

* remove the bigger sizes from PGD, but keep multiples of gorilla buffers

* exclude judy from sanitizers

* use 16 bytes alignment on 32 bit machines

* internal check about memory alignment

* experiment: do not allow more children to connect while there is backfilling or replication queries running

* when the node is initializing, retry in 30 seconds

* connector cleanup and isolation of control logic about enabling/disabling various parts

* stop also health queries while backfilling is running

* tuning

* drain the input

* improve interactivity when suspending

* more interactive stream_control

* debug logs to find the connection issue

* abstracted everything about stream control

* Add ml_host_{start,stop} again.

* Do not create/update anomaly-detection charts when ML is not running for a host.

* rrdhost flag RECEIVER_DISCONNECTED has been reversed to COLLECTOR_ONLINE and has been used for localhost and virtual hosts too, to have a single point of truth about the availability of collected data or not

* ml_host_start() and ml_host_stop() are used by streaming receivers; ml_host_start() is used for localhost and virtual hosts

* fixed typo

* allow up to 3 backfills at a time

* add throttling based on user queries

* restore cache line paddings

* unify streaming logs to make it easier to grep logs

* tuning of stream_control

* more logs unification

* use mallocz_release_as_much_memory_to_the_system() under extreme conditions

* do not rely on the response code of evict_pages()

* log the gap of the database every time a node is connected

* updated ram requirements

---------

Co-authored-by: vkalintiris <vasilis@netdata.cloud>
2024-12-11 18:02:17 +02:00

172 lines
6.6 KiB
C

// SPDX-License-Identifier: GPL-3.0-or-later
#define PULSE_INTERNALS 1
#include "pulse-aral.h"
// Per-ARAL bookkeeping kept in the registry: the display name plus the chart
// and dimension handles that pulse_aral_do() creates on first use.
struct aral_info {
    // Heap copy of the name (strdupz() at registration, freez() at unregistration).
    const char *name;

    // "aral_<name>_memory" chart and its dimensions.
    RRDSET *st_memory;
    RRDDIM *rd_malloc_used;
    RRDDIM *rd_malloc_free;
    RRDDIM *rd_mmap_used;
    RRDDIM *rd_mmap_free;
    RRDDIM *rd_structures;
    RRDDIM *rd_padding;

    // "aral_<name>_utilization" chart and its single dimension.
    RRDSET *st_utilization;
    RRDDIM *rd_utilization;
};
// Typed JudyL wrapper for `struct aral_info *` values. Presumably this macro
// generates the ARAL_STATS_JudyLSet type and the ARAL_STATS_GET/SET/DEL/FIRST/NEXT
// accessors used in this file - confirm against the macro definition.
DEFINE_JUDYL_TYPED(ARAL_STATS, struct aral_info *);
// Registry of the ARAL statistics registered with pulse.
static struct {
// Taken around every access to `idx` in this file.
SPINLOCK spinlock;
// Maps the address of a struct aral_statistics (cast to Word_t) to its aral_info.
ARAL_STATS_JudyLSet idx;
} globals = { 0 };
static void pulse_aral_register_statistics(struct aral_statistics *stats, const char *name) {
if(!name || !stats)
return;
spinlock_lock(&globals.spinlock);
struct aral_info *ai = ARAL_STATS_GET(&globals.idx, (Word_t)stats);
if(!ai) {
ai = callocz(1, sizeof(struct aral_info));
ai->name = strdupz(name);
ARAL_STATS_SET(&globals.idx, (Word_t)stats, ai);
}
spinlock_unlock(&globals.spinlock);
}
// Registers the statistics of `ar` with pulse.
// `name` may be NULL, in which case aral_name(ar) is used instead.
// A NULL `ar` is ignored.
void pulse_aral_register(ARAL *ar, const char *name) {
    if(!ar)
        return;

    const char *label = name ? name : aral_name(ar);
    pulse_aral_register_statistics(aral_get_statistics(ar), label);
}
// Removes the registry entry for the statistics of `ar`, if there is one,
// and frees the entry together with its name.
// The chart handles stored in the entry are not touched by this function.
// A NULL `ar` is ignored.
void pulse_aral_unregister(ARAL *ar) {
    if(!ar)
        return;

    const Word_t key = (Word_t)aral_get_statistics(ar);

    spinlock_lock(&globals.spinlock);

    struct aral_info *entry = ARAL_STATS_GET(&globals.idx, key);
    if(entry) {
        ARAL_STATS_DEL(&globals.idx, key);
        freez((void *)entry->name); // cast drops the const qualifier of the field
        freez(entry);
    }

    spinlock_unlock(&globals.spinlock);
}
// Registers the statistics returned by aral_by_size_statistics() under the
// name "by-size".
void pulse_aral_init(void) {
    struct aral_statistics *by_size = aral_by_size_statistics();
    pulse_aral_register_statistics(by_size, "by-size");
}
void pulse_aral_do(bool extended) {
if(!extended) return;
spinlock_lock(&globals.spinlock);
Word_t s = 0;
for(struct aral_info *ai = ARAL_STATS_FIRST(&globals.idx, &s);
ai;
ai = ARAL_STATS_NEXT(&globals.idx, &s)) {
struct aral_statistics *stats = (void *)(uintptr_t)s;
if (!stats)
continue;
size_t malloc_allocated_bytes = __atomic_load_n(&stats->malloc.allocated_bytes, __ATOMIC_RELAXED);
size_t malloc_used_bytes = __atomic_load_n(&stats->malloc.used_bytes, __ATOMIC_RELAXED);
if(malloc_used_bytes > malloc_allocated_bytes)
malloc_allocated_bytes = malloc_used_bytes;
size_t malloc_free_bytes = malloc_allocated_bytes - malloc_used_bytes;
size_t mmap_allocated_bytes = __atomic_load_n(&stats->mmap.allocated_bytes, __ATOMIC_RELAXED);
size_t mmap_used_bytes = __atomic_load_n(&stats->mmap.used_bytes, __ATOMIC_RELAXED);
if(mmap_used_bytes > mmap_allocated_bytes)
mmap_allocated_bytes = mmap_used_bytes;
size_t mmap_free_bytes = mmap_allocated_bytes - mmap_used_bytes;
size_t structures_bytes = __atomic_load_n(&stats->structures.allocated_bytes, __ATOMIC_RELAXED);
size_t padding_bytes = __atomic_load_n(&stats->malloc.padding_bytes, __ATOMIC_RELAXED) +
__atomic_load_n(&stats->mmap.padding_bytes, __ATOMIC_RELAXED);
NETDATA_DOUBLE utilization;
if((malloc_used_bytes + mmap_used_bytes != 0) && (malloc_allocated_bytes + mmap_allocated_bytes != 0))
utilization = 100.0 * (NETDATA_DOUBLE)(malloc_used_bytes + mmap_used_bytes) / (NETDATA_DOUBLE)(malloc_allocated_bytes + mmap_allocated_bytes);
else
utilization = 100.0;
{
if (unlikely(!ai->st_memory)) {
char id[256];
snprintfz(id, sizeof(id), "aral_%s_memory", ai->name);
netdata_fix_chart_id(id);
ai->st_memory = rrdset_create_localhost(
"netdata",
id,
NULL,
"ARAL",
"netdata.aral_memory",
"Array Allocator Memory Utilization",
"bytes",
"netdata",
"pulse",
910000,
localhost->rrd_update_every,
RRDSET_TYPE_STACKED);
rrdlabels_add(ai->st_memory->rrdlabels, "ARAL", ai->name, RRDLABEL_SRC_AUTO);
ai->rd_malloc_free = rrddim_add(ai->st_memory, "malloc free", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
ai->rd_mmap_free = rrddim_add(ai->st_memory, "mmap free", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
ai->rd_malloc_used = rrddim_add(ai->st_memory, "malloc used", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
ai->rd_mmap_used = rrddim_add(ai->st_memory, "mmap used", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
ai->rd_structures = rrddim_add(ai->st_memory, "structures", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
ai->rd_padding = rrddim_add(ai->st_memory, "padding", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
}
rrddim_set_by_pointer(ai->st_memory, ai->rd_malloc_used, (collected_number)malloc_used_bytes);
rrddim_set_by_pointer(ai->st_memory, ai->rd_malloc_free, (collected_number)malloc_free_bytes);
rrddim_set_by_pointer(ai->st_memory, ai->rd_mmap_used, (collected_number)mmap_used_bytes);
rrddim_set_by_pointer(ai->st_memory, ai->rd_mmap_free, (collected_number)mmap_free_bytes);
rrddim_set_by_pointer(ai->st_memory, ai->rd_structures, (collected_number)structures_bytes);
rrddim_set_by_pointer(ai->st_memory, ai->rd_padding, (collected_number)padding_bytes);
rrdset_done(ai->st_memory);
}
{
if (unlikely(!ai->st_utilization)) {
char id[256];
snprintfz(id, sizeof(id), "aral_%s_utilization", ai->name);
netdata_fix_chart_id(id);
ai->st_utilization = rrdset_create_localhost(
"netdata",
id,
NULL,
"ARAL",
"netdata.aral_utilization",
"Array Allocator Memory Utilization",
"%",
"netdata",
"pulse",
910001,
localhost->rrd_update_every,
RRDSET_TYPE_LINE);
rrdlabels_add(ai->st_utilization->rrdlabels, "ARAL", ai->name, RRDLABEL_SRC_AUTO);
ai->rd_utilization = rrddim_add(ai->st_utilization, "utilization", NULL, 1, 10000, RRD_ALGORITHM_ABSOLUTE);
}
rrddim_set_by_pointer(ai->st_utilization, ai->rd_utilization, (collected_number)(utilization * 10000.0));
rrdset_done(ai->st_utilization);
}
}
spinlock_unlock(&globals.spinlock);
}