mirror of
https://github.com/netdata/netdata.git
synced 2025-04-03 04:55:33 +00:00
Streaming improvements No 3 (#19168)
* ML uses synchronous queries
* do not call malloc_trim() to free memory, since it locks everything
* Reschedule dimensions for training from worker threads.
* when we collect or read from the database, it is SAMPLES. When we generate points for a chart is POINTS
* keep the receiver send buffer 10x the default
* support autoscaling stream circular buffers
* nd_poll() prefers sending data vs receiving data - in an attempt to dequeue as soon as possible
* fix last commit
* allow removing receiver and senders inline, if the stream thread is not working on them
* fix logs
* Revert "nd_poll() prefers sending data vs receiving data - in an attempt to dequeue as soon as possible"
This reverts commit 51539a97da
.
* do not access receiver or sender after it has been removed
* open cache hot2clean
* open cache hot2clean does not need flushing
* use aral for extent pages up to 65k
* track aral malloc and mmap allocations separately; add 8192 as a possible value to PGD
* do not evict too frequently if not needed
* fix aral metrics
* fix aral metrics again
* accurate accounting of memory for dictionaries, strings, labels and MRG
* log during shutdown the progress of dbengine flushing
* move metasync shutdown after dbengine
* max iterations per I/O events
* max iterations per I/O events - break the loop
* max iterations per I/O events - break the loop - again
* disable inline evictions for all caches
* when writing to sockets, send everything that can be sent
* cleanup code to trigger evictions
* fix calculation of eviction size
* fix calculation of eviction size once more
* fix calculation of eviction size once more - again
* ml and replication stop while backfilling is running
* process opcodes while draining the sockets; log with limit when asking to disconnect a node
* fix log
* ml stops when replication queries are running
* report pgd_padding to pulse
* aral precise memory accounting
* removed all alignas() and fix the 2 issues that resulted in unaligned memory accesses (one in mqtt and another in streaming)
* remove the bigger sizes from PGD, but keep multiples of gorilla buffers
* exclude judy from sanitizers
* use 16 bytes alignment on 32 bit machines
* internal check about memory alignment
* experiment: do not allow more children to connect while there is backfilling or replication queries running
* when the node is initializing, retry in 30 seconds
* connector cleanup and isolation of control logic about enabling/disabling various parts
* stop also health queries while backfilling is running
* tuning
* drain the input
* improve interactivity when suspending
* more interactive stream_control
* debug logs to find the connection issue
* abstracted everything about stream control
* Add ml_host_{start,stop} again.
* Do not create/update anomaly-detection charts when ML is not running for a host.
* rrdhost flag RECEIVER_DISCONNECTED has been reversed to COLLECTOR_ONLINE and has been used for localhost and virtual hosts too, to have a single point of truth about the availability of collected data or not
* ml_host_start() and ml_host_stop() are used by streaming receivers; ml_host_start() is used for localhost and virtual hosts
* fixed typo
* allow up to 3 backfills at a time
* add throttling based on user queries
* restore cache line paddings
* unify streaming logs to make it easier to grep logs
* tuning of stream_control
* more logs unification
* use mallocz_release_as_much_memory_to_the_system() under extreme conditions
* do not rely on the response code of evict_pages()
* log the gap of the database every time a node is connected
* updated ram requirements
---------
Co-authored-by: vkalintiris <vasilis@netdata.cloud>
This commit is contained in:
parent
2956244a2a
commit
5f72d4279b
90 changed files with 1791 additions and 1131 deletions
CMakeLists.txt
docs/netdata-agent/sizing-netdata-agents
src
aclk/mqtt_websockets
daemon
main.c
pulse
pulse-aral.cpulse-daemon-memory.cpulse-dbengine.cpulse-gorilla.cpulse-http-api.cpulse-ml.cpulse-queries.cpulse-sqlite3.c
watcher.cwatcher.hdatabase
engine
cache.ccache.hmetric.cmetric.hpage.cpage.hpagecache.cpdc.cpdc.hrrdengine.crrdengine.hrrdengineapi.crrdengineapi.h
rrd.hrrdhost.crrdlabels.crrdset.chealth
libnetdata
aral
common.hdictionary
libjudy
libnetdata.csimple_hashtable
string
url
ml
plugins.d
streaming
protocol
replication.creplication.hrrdhost-status.cstream-capabilities.cstream-circular-buffer.cstream-circular-buffer.hstream-compression
stream-conf.cstream-connector.cstream-control.cstream-control.hstream-parents.cstream-path.cstream-receiver-connection.cstream-receiver-internals.hstream-receiver.cstream-sender-commit.cstream-sender-execute.cstream-sender-internals.hstream-sender.cstream-thread.cstream-thread.hstream-traffic-types.hstream.hweb/api
|
@ -1549,6 +1549,8 @@ set(STREAMING_PLUGIN_FILES
|
|||
src/streaming/stream-traffic-types.h
|
||||
src/streaming/stream-circular-buffer.c
|
||||
src/streaming/stream-circular-buffer.h
|
||||
src/streaming/stream-control.c
|
||||
src/streaming/stream-control.h
|
||||
)
|
||||
|
||||
set(WEB_PLUGIN_FILES
|
||||
|
|
|
@ -19,7 +19,7 @@ This number can be lowered by limiting the number of Database Tiers or switching
|
|||
| nodes currently received | nodes collected | 512 KiB | Structures and reception buffers |
|
||||
| nodes currently sent | nodes collected | 512 KiB | Structures and dispatch buffers |
|
||||
|
||||
These numbers vary depending on name length, the number of dimensions per instance and per context, the number and length of the labels added, the number of Machine Learning models maintained and similar parameters. For most use cases, they represent the worst case scenario, so you may find out Netdata actually needs less than that.
|
||||
These numbers vary depending on metric name length, the average number of dimensions per instance and per context, the number and length of the labels added, the number of database tiers configured, the number of Machine Learning models maintained per metric and similar parameters. For most use cases, they represent the worst case scenario, so you may find out Netdata actually needs less than that.
|
||||
|
||||
Each metric currently being collected needs (1 index + 20 collection + 5 ml) = 26 KiB. When it stops being collected, it needs 1 KiB (index).
|
||||
|
||||
|
@ -84,3 +84,11 @@ We frequently see that the following strategy gives the best results:
|
|||
3. Set the page cache in `netdata.conf` to use 1/3 of the available memory.
|
||||
|
||||
This will allow Netdata queries to have more caches, while leaving plenty of available memory for logs and the operating system.
|
||||
|
||||
In Netdata 2.1 we added the `netdata.conf` option `[db].dbengine use all ram for caches` and `[db].dbengine out of memory protection`.
|
||||
Combining these two parameters is probably the simplest way to get the best results:
|
||||
|
||||
- `[db].dbengine out of memory protection` is by default 10% of total system RAM, but not more than 5GiB. When the amount of free memory is less than this, Netdata automatically starts releasing memory from its caches to avoid getting out of memory. On `systemd-journal` centralization points, set this to the amount of memory to be dedicated for systemd journal.
|
||||
- `[db].dbengine use all ram for caches` is by default `no`. Set it to `yes` to use all the memory except the memory given above.
|
||||
|
||||
With these settings, netdata will use all the memory available but leave the amount specified for systemd journal.
|
||||
|
|
|
@ -745,8 +745,13 @@ static size_t mqtt_ng_connect_size(struct mqtt_auth_properties *auth,
|
|||
#define WRITE_POS(frag) (&(frag->data[frag->len]))
|
||||
|
||||
// [MQTT-1.5.2] Two Byte Integer
|
||||
#define PACK_2B_INT(buffer, integer, frag) { *(uint16_t *)WRITE_POS(frag) = htobe16((integer)); \
|
||||
DATA_ADVANCE(buffer, sizeof(uint16_t), frag); }
|
||||
#define PACK_2B_INT(buffer, integer, frag) { \
|
||||
uint16_t temp = htobe16((integer)); \
|
||||
memcpy(WRITE_POS(frag), &temp, sizeof(uint16_t)); \
|
||||
DATA_ADVANCE(buffer, sizeof(uint16_t), frag); \
|
||||
}
|
||||
// #define PACK_2B_INT(buffer, integer, frag) { *(uint16_t *)WRITE_POS(frag) = htobe16((integer));
|
||||
// DATA_ADVANCE(buffer, sizeof(uint16_t), frag); }
|
||||
|
||||
static int _optimized_add(struct header_buffer *buf, void *data, size_t data_len, free_fnc_t data_free_fnc, struct buffer_fragment **frag)
|
||||
{
|
||||
|
|
|
@ -394,11 +394,10 @@ void netdata_cleanup_and_exit(int ret, const char *action, const char *action_re
|
|||
{
|
||||
watcher_step_complete(WATCHER_STEP_ID_FLUSH_DBENGINE_TIERS);
|
||||
watcher_step_complete(WATCHER_STEP_ID_STOP_COLLECTION_FOR_ALL_HOSTS);
|
||||
watcher_step_complete(WATCHER_STEP_ID_STOP_METASYNC_THREADS);
|
||||
|
||||
watcher_step_complete(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_COLLECTORS_TO_FINISH);
|
||||
watcher_step_complete(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_MAIN_CACHE_TO_FINISH_FLUSHING);
|
||||
watcher_step_complete(WATCHER_STEP_ID_STOP_DBENGINE_TIERS);
|
||||
watcher_step_complete(WATCHER_STEP_ID_STOP_METASYNC_THREADS);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
@ -406,15 +405,44 @@ void netdata_cleanup_and_exit(int ret, const char *action, const char *action_re
|
|||
|
||||
#ifdef ENABLE_DBENGINE
|
||||
if(dbengine_enabled) {
|
||||
nd_log(NDLS_DAEMON, NDLP_INFO, "Preparing DBENGINE shutdown...");
|
||||
for (size_t tier = 0; tier < storage_tiers; tier++)
|
||||
rrdeng_prepare_exit(multidb_ctx[tier]);
|
||||
|
||||
for (size_t tier = 0; tier < storage_tiers; tier++) {
|
||||
if (!multidb_ctx[tier])
|
||||
continue;
|
||||
completion_wait_for(&multidb_ctx[tier]->quiesce.completion);
|
||||
completion_destroy(&multidb_ctx[tier]->quiesce.completion);
|
||||
}
|
||||
struct pgc_statistics pgc_main_stats = pgc_get_statistics(main_cache);
|
||||
nd_log(NDLS_DAEMON, NDLP_INFO, "Waiting for DBENGINE to commit unsaved data to disk (%zu pages, %zu bytes)...",
|
||||
pgc_main_stats.queues[PGC_QUEUE_HOT].entries + pgc_main_stats.queues[PGC_QUEUE_DIRTY].entries,
|
||||
pgc_main_stats.queues[PGC_QUEUE_HOT].size + pgc_main_stats.queues[PGC_QUEUE_DIRTY].size);
|
||||
|
||||
bool finished_tiers[RRD_STORAGE_TIERS] = { 0 };
|
||||
size_t waiting_tiers, iterations = 0;
|
||||
do {
|
||||
waiting_tiers = 0;
|
||||
iterations++;
|
||||
|
||||
for (size_t tier = 0; tier < storage_tiers; tier++) {
|
||||
if (!multidb_ctx[tier] || finished_tiers[tier])
|
||||
continue;
|
||||
|
||||
waiting_tiers++;
|
||||
if (completion_timedwait_for(&multidb_ctx[tier]->quiesce.completion, 1)) {
|
||||
completion_destroy(&multidb_ctx[tier]->quiesce.completion);
|
||||
finished_tiers[tier] = true;
|
||||
waiting_tiers--;
|
||||
nd_log(NDLS_DAEMON, NDLP_INFO, "DBENGINE tier %zu finished!", tier);
|
||||
}
|
||||
else if(iterations % 10 == 0) {
|
||||
pgc_main_stats = pgc_get_statistics(main_cache);
|
||||
nd_log(NDLS_DAEMON, NDLP_INFO,
|
||||
"Still waiting for DBENGINE tier %zu to finish "
|
||||
"(cache still has %zu pages, %zu bytes hot, for all tiers)...",
|
||||
tier,
|
||||
pgc_main_stats.queues[PGC_QUEUE_HOT].entries + pgc_main_stats.queues[PGC_QUEUE_DIRTY].entries,
|
||||
pgc_main_stats.queues[PGC_QUEUE_HOT].size + pgc_main_stats.queues[PGC_QUEUE_DIRTY].size);
|
||||
}
|
||||
}
|
||||
} while(waiting_tiers);
|
||||
nd_log(NDLS_DAEMON, NDLP_INFO, "DBENGINE shutdown completed...");
|
||||
}
|
||||
#endif
|
||||
watcher_step_complete(WATCHER_STEP_ID_FLUSH_DBENGINE_TIERS);
|
||||
|
@ -422,9 +450,6 @@ void netdata_cleanup_and_exit(int ret, const char *action, const char *action_re
|
|||
rrd_finalize_collection_for_all_hosts();
|
||||
watcher_step_complete(WATCHER_STEP_ID_STOP_COLLECTION_FOR_ALL_HOSTS);
|
||||
|
||||
metadata_sync_shutdown();
|
||||
watcher_step_complete(WATCHER_STEP_ID_STOP_METASYNC_THREADS);
|
||||
|
||||
#ifdef ENABLE_DBENGINE
|
||||
if(dbengine_enabled) {
|
||||
size_t running = 1;
|
||||
|
@ -452,18 +477,22 @@ void netdata_cleanup_and_exit(int ret, const char *action, const char *action_re
|
|||
rrdeng_exit(multidb_ctx[tier]);
|
||||
rrdeng_enq_cmd(NULL, RRDENG_OPCODE_SHUTDOWN_EVLOOP, NULL, NULL, STORAGE_PRIORITY_BEST_EFFORT, NULL, NULL);
|
||||
watcher_step_complete(WATCHER_STEP_ID_STOP_DBENGINE_TIERS);
|
||||
} else {
|
||||
}
|
||||
else {
|
||||
// Skip these steps
|
||||
watcher_step_complete(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_COLLECTORS_TO_FINISH);
|
||||
watcher_step_complete(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_MAIN_CACHE_TO_FINISH_FLUSHING);
|
||||
watcher_step_complete(WATCHER_STEP_ID_STOP_DBENGINE_TIERS);
|
||||
}
|
||||
#else
|
||||
// Skip these steps
|
||||
watcher_step_complete(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_COLLECTORS_TO_FINISH);
|
||||
watcher_step_complete(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_MAIN_CACHE_TO_FINISH_FLUSHING);
|
||||
watcher_step_complete(WATCHER_STEP_ID_STOP_DBENGINE_TIERS);
|
||||
// Skip these steps
|
||||
watcher_step_complete(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_COLLECTORS_TO_FINISH);
|
||||
watcher_step_complete(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_MAIN_CACHE_TO_FINISH_FLUSHING);
|
||||
watcher_step_complete(WATCHER_STEP_ID_STOP_DBENGINE_TIERS);
|
||||
#endif
|
||||
|
||||
metadata_sync_shutdown();
|
||||
watcher_step_complete(WATCHER_STEP_ID_STOP_METASYNC_THREADS);
|
||||
}
|
||||
|
||||
// Don't register a shutdown event if we crashed
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
struct aral_info {
|
||||
const char *name;
|
||||
RRDSET *st_memory;
|
||||
RRDDIM *rd_used, *rd_free, *rd_structures;
|
||||
RRDDIM *rd_malloc_used, *rd_malloc_free, *rd_mmap_used, *rd_mmap_free, *rd_structures, *rd_padding;
|
||||
|
||||
RRDSET *st_utilization;
|
||||
RRDDIM *rd_utilization;
|
||||
|
@ -74,24 +74,26 @@ void pulse_aral_do(bool extended) {
|
|||
if (!stats)
|
||||
continue;
|
||||
|
||||
size_t allocated_bytes = __atomic_load_n(&stats->malloc.allocated_bytes, __ATOMIC_RELAXED) +
|
||||
__atomic_load_n(&stats->mmap.allocated_bytes, __ATOMIC_RELAXED);
|
||||
size_t malloc_allocated_bytes = __atomic_load_n(&stats->malloc.allocated_bytes, __ATOMIC_RELAXED);
|
||||
size_t malloc_used_bytes = __atomic_load_n(&stats->malloc.used_bytes, __ATOMIC_RELAXED);
|
||||
if(malloc_used_bytes > malloc_allocated_bytes)
|
||||
malloc_allocated_bytes = malloc_used_bytes;
|
||||
size_t malloc_free_bytes = malloc_allocated_bytes - malloc_used_bytes;
|
||||
|
||||
size_t used_bytes = __atomic_load_n(&stats->malloc.used_bytes, __ATOMIC_RELAXED) +
|
||||
__atomic_load_n(&stats->mmap.used_bytes, __ATOMIC_RELAXED);
|
||||
|
||||
// slight difference may exist, due to the time needed to get these values
|
||||
// fix the obvious discrepancies
|
||||
if(used_bytes > allocated_bytes)
|
||||
used_bytes = allocated_bytes;
|
||||
size_t mmap_allocated_bytes = __atomic_load_n(&stats->mmap.allocated_bytes, __ATOMIC_RELAXED);
|
||||
size_t mmap_used_bytes = __atomic_load_n(&stats->mmap.used_bytes, __ATOMIC_RELAXED);
|
||||
if(mmap_used_bytes > mmap_allocated_bytes)
|
||||
mmap_allocated_bytes = mmap_used_bytes;
|
||||
size_t mmap_free_bytes = mmap_allocated_bytes - mmap_used_bytes;
|
||||
|
||||
size_t structures_bytes = __atomic_load_n(&stats->structures.allocated_bytes, __ATOMIC_RELAXED);
|
||||
|
||||
size_t free_bytes = allocated_bytes - used_bytes;
|
||||
size_t padding_bytes = __atomic_load_n(&stats->malloc.padding_bytes, __ATOMIC_RELAXED) +
|
||||
__atomic_load_n(&stats->mmap.padding_bytes, __ATOMIC_RELAXED);
|
||||
|
||||
NETDATA_DOUBLE utilization;
|
||||
if(used_bytes && allocated_bytes)
|
||||
utilization = 100.0 * (NETDATA_DOUBLE)used_bytes / (NETDATA_DOUBLE)allocated_bytes;
|
||||
if((malloc_used_bytes + mmap_used_bytes != 0) && (malloc_allocated_bytes + mmap_allocated_bytes != 0))
|
||||
utilization = 100.0 * (NETDATA_DOUBLE)(malloc_used_bytes + mmap_used_bytes) / (NETDATA_DOUBLE)(malloc_allocated_bytes + mmap_allocated_bytes);
|
||||
else
|
||||
utilization = 100.0;
|
||||
|
||||
|
@ -118,14 +120,20 @@ void pulse_aral_do(bool extended) {
|
|||
|
||||
rrdlabels_add(ai->st_memory->rrdlabels, "ARAL", ai->name, RRDLABEL_SRC_AUTO);
|
||||
|
||||
ai->rd_free = rrddim_add(ai->st_memory, "free", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
ai->rd_used = rrddim_add(ai->st_memory, "used", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
ai->rd_structures = rrddim_add(ai->st_memory, "structures", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
ai->rd_malloc_free = rrddim_add(ai->st_memory, "malloc free", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
ai->rd_mmap_free = rrddim_add(ai->st_memory, "mmap free", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
ai->rd_malloc_used = rrddim_add(ai->st_memory, "malloc used", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
ai->rd_mmap_used = rrddim_add(ai->st_memory, "mmap used", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
ai->rd_structures = rrddim_add(ai->st_memory, "structures", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
ai->rd_padding = rrddim_add(ai->st_memory, "padding", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
}
|
||||
|
||||
rrddim_set_by_pointer(ai->st_memory, ai->rd_used, (collected_number)allocated_bytes);
|
||||
rrddim_set_by_pointer(ai->st_memory, ai->rd_free, (collected_number)free_bytes);
|
||||
rrddim_set_by_pointer(ai->st_memory, ai->rd_malloc_used, (collected_number)malloc_used_bytes);
|
||||
rrddim_set_by_pointer(ai->st_memory, ai->rd_malloc_free, (collected_number)malloc_free_bytes);
|
||||
rrddim_set_by_pointer(ai->st_memory, ai->rd_mmap_used, (collected_number)mmap_used_bytes);
|
||||
rrddim_set_by_pointer(ai->st_memory, ai->rd_mmap_free, (collected_number)mmap_free_bytes);
|
||||
rrddim_set_by_pointer(ai->st_memory, ai->rd_structures, (collected_number)structures_bytes);
|
||||
rrddim_set_by_pointer(ai->st_memory, ai->rd_padding, (collected_number)padding_bytes);
|
||||
rrdset_done(ai->st_memory);
|
||||
}
|
||||
|
||||
|
|
|
@ -87,9 +87,7 @@ void pulse_daemon_memory_do(bool extended) {
|
|||
netdata_buffers_statistics.buffers_streaming +
|
||||
netdata_buffers_statistics.cbuffers_streaming +
|
||||
netdata_buffers_statistics.buffers_web +
|
||||
replication_allocated_buffers() +
|
||||
aral_by_size_overhead() +
|
||||
judy_aral_overhead();
|
||||
replication_allocated_buffers() + aral_by_size_free_bytes() + judy_aral_free_bytes();
|
||||
|
||||
size_t strings = 0;
|
||||
string_statistics(NULL, NULL, NULL, NULL, NULL, &strings, NULL, NULL);
|
||||
|
@ -101,8 +99,7 @@ void pulse_daemon_memory_do(bool extended) {
|
|||
rrddim_set_by_pointer(st_memory, rd_collectors,
|
||||
(collected_number)dictionary_stats_memory_total(dictionary_stats_category_collectors));
|
||||
|
||||
rrddim_set_by_pointer(st_memory,
|
||||
rd_rrdhosts,
|
||||
rrddim_set_by_pointer(st_memory,rd_rrdhosts,
|
||||
(collected_number)dictionary_stats_memory_total(dictionary_stats_category_rrdhost) + (collected_number)netdata_buffers_statistics.rrdhost_allocations_size);
|
||||
|
||||
rrddim_set_by_pointer(st_memory, rd_rrdsets,
|
||||
|
@ -124,14 +121,15 @@ void pulse_daemon_memory_do(bool extended) {
|
|||
(collected_number)dictionary_stats_memory_total(dictionary_stats_category_replication) + (collected_number)replication_allocated_memory());
|
||||
#else
|
||||
uint64_t metadata =
|
||||
aral_by_size_used_bytes() +
|
||||
dictionary_stats_category_rrdhost.memory.dict +
|
||||
dictionary_stats_category_rrdset.memory.dict +
|
||||
dictionary_stats_category_rrddim.memory.dict +
|
||||
dictionary_stats_category_rrdcontext.memory.dict +
|
||||
dictionary_stats_category_rrdhealth.memory.dict +
|
||||
dictionary_stats_category_functions.memory.dict +
|
||||
dictionary_stats_category_replication.memory.dict +
|
||||
aral_by_size_structures_bytes() + aral_by_size_used_bytes() +
|
||||
dictionary_stats_category_rrdhost.memory.dict + dictionary_stats_category_rrdhost.memory.index +
|
||||
dictionary_stats_category_rrdset.memory.dict + dictionary_stats_category_rrdset.memory.index +
|
||||
dictionary_stats_category_rrddim.memory.dict + dictionary_stats_category_rrddim.memory.index +
|
||||
dictionary_stats_category_rrdcontext.memory.dict + dictionary_stats_category_rrdcontext.memory.index +
|
||||
dictionary_stats_category_rrdhealth.memory.dict + dictionary_stats_category_rrdhealth.memory.index +
|
||||
dictionary_stats_category_functions.memory.dict + dictionary_stats_category_functions.memory.index +
|
||||
dictionary_stats_category_replication.memory.dict + dictionary_stats_category_replication.memory.index +
|
||||
netdata_buffers_statistics.rrdhost_allocations_size +
|
||||
replication_allocated_memory();
|
||||
|
||||
rrddim_set_by_pointer(st_memory, rd_metadata, (collected_number)metadata);
|
||||
|
@ -157,7 +155,7 @@ void pulse_daemon_memory_do(bool extended) {
|
|||
(collected_number) workers_allocated_memory());
|
||||
|
||||
rrddim_set_by_pointer(st_memory, rd_aral,
|
||||
(collected_number) aral_by_size_structures());
|
||||
(collected_number)aral_by_size_structures_bytes());
|
||||
|
||||
rrddim_set_by_pointer(st_memory,
|
||||
rd_judy, (collected_number) judy_aral_structures());
|
||||
|
@ -168,6 +166,13 @@ void pulse_daemon_memory_do(bool extended) {
|
|||
rrdset_done(st_memory);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
if(!extended)
|
||||
return;
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
{
|
||||
static RRDSET *st_memory_buffers = NULL;
|
||||
static RRDDIM *rd_queries = NULL;
|
||||
|
@ -212,8 +217,8 @@ void pulse_daemon_memory_do(bool extended) {
|
|||
rd_cbuffers_streaming = rrddim_add(st_memory_buffers, "streaming cbuf", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_buffers_replication = rrddim_add(st_memory_buffers, "replication", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_buffers_web = rrddim_add(st_memory_buffers, "web", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_buffers_aral = rrddim_add(st_memory_buffers, "aral", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_buffers_judy = rrddim_add(st_memory_buffers, "judy", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_buffers_aral = rrddim_add(st_memory_buffers, "aral-by-size free", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_buffers_judy = rrddim_add(st_memory_buffers, "aral-judy free", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
}
|
||||
|
||||
rrddim_set_by_pointer(st_memory_buffers, rd_queries, (collected_number)netdata_buffers_statistics.query_targets_size + (collected_number) onewayalloc_allocated_memory());
|
||||
|
@ -228,17 +233,12 @@ void pulse_daemon_memory_do(bool extended) {
|
|||
rrddim_set_by_pointer(st_memory_buffers, rd_cbuffers_streaming, (collected_number)netdata_buffers_statistics.cbuffers_streaming);
|
||||
rrddim_set_by_pointer(st_memory_buffers, rd_buffers_replication, (collected_number)replication_allocated_buffers());
|
||||
rrddim_set_by_pointer(st_memory_buffers, rd_buffers_web, (collected_number)netdata_buffers_statistics.buffers_web);
|
||||
rrddim_set_by_pointer(st_memory_buffers, rd_buffers_aral, (collected_number)aral_by_size_overhead());
|
||||
rrddim_set_by_pointer(st_memory_buffers, rd_buffers_judy, (collected_number)judy_aral_overhead());
|
||||
rrddim_set_by_pointer(st_memory_buffers, rd_buffers_aral, (collected_number)aral_by_size_free_bytes());
|
||||
rrddim_set_by_pointer(st_memory_buffers, rd_buffers_judy, (collected_number)judy_aral_free_bytes());
|
||||
|
||||
rrdset_done(st_memory_buffers);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
if(!extended)
|
||||
return;
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------------
|
||||
|
||||
}
|
||||
|
|
|
@ -668,15 +668,26 @@ void pulse_dbengine_do(bool extended) {
|
|||
mrg_stats_old = mrg_stats;
|
||||
mrg_get_statistics(main_mrg, &mrg_stats);
|
||||
|
||||
struct rrdeng_buffer_sizes buffers = rrdeng_get_buffer_sizes();
|
||||
size_t buffers_total_size = buffers.handles + buffers.xt_buf + buffers.xt_io + buffers.pdc + buffers.descriptors +
|
||||
buffers.opcodes + buffers.wal + buffers.workers + buffers.epdl + buffers.deol + buffers.pd + buffers.pgc + buffers.pgd + buffers.mrg;
|
||||
struct rrdeng_buffer_sizes dbmem = rrdeng_pulse_memory_sizes();
|
||||
|
||||
size_t buffers_total_size = dbmem.xt_buf + dbmem.wal;
|
||||
#ifdef PDC_USE_JULYL
|
||||
buffers_total_size += buffers.julyl;
|
||||
#endif
|
||||
|
||||
pulse_dbengine_total_memory = pgc_main_stats.size + pgc_open_stats.size + pgc_extent_stats.size + mrg_stats.size + buffers_total_size;
|
||||
size_t aral_structures_total_size = 0, aral_used_total_size = 0;
|
||||
size_t aral_padding_total_size = 0;
|
||||
for(size_t i = 0; i < RRDENG_MEM_MAX ; i++) {
|
||||
buffers_total_size += aral_free_bytes_from_stats(dbmem.as[i]);
|
||||
aral_structures_total_size += aral_structures_bytes_from_stats(dbmem.as[i]);
|
||||
aral_used_total_size += aral_used_bytes_from_stats(dbmem.as[i]);
|
||||
aral_padding_total_size += aral_padding_bytes_from_stats(dbmem.as[i]);
|
||||
}
|
||||
|
||||
pulse_dbengine_total_memory =
|
||||
pgc_main_stats.size + (ssize_t)pgc_open_stats.size + pgc_extent_stats.size +
|
||||
mrg_stats.size +
|
||||
buffers_total_size + aral_structures_total_size + aral_padding_total_size + pgd_padding_bytes();
|
||||
|
||||
size_t priority = 135000;
|
||||
|
||||
|
@ -687,6 +698,9 @@ void pulse_dbengine_do(bool extended) {
|
|||
static RRDDIM *rd_pgc_memory_extent = NULL; // extent compresses cache memory
|
||||
static RRDDIM *rd_pgc_memory_metrics = NULL; // metric registry memory
|
||||
static RRDDIM *rd_pgc_memory_buffers = NULL;
|
||||
static RRDDIM *rd_pgc_memory_aral_padding = NULL;
|
||||
static RRDDIM *rd_pgc_memory_pgd_padding = NULL;
|
||||
static RRDDIM *rd_pgc_memory_aral_structures = NULL;
|
||||
|
||||
if (unlikely(!st_pgc_memory)) {
|
||||
st_pgc_memory = rrdset_create_localhost(
|
||||
|
@ -708,6 +722,9 @@ void pulse_dbengine_do(bool extended) {
|
|||
rd_pgc_memory_extent = rrddim_add(st_pgc_memory, "extent cache", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_memory_metrics = rrddim_add(st_pgc_memory, "metrics registry", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_memory_buffers = rrddim_add(st_pgc_memory, "buffers", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_memory_aral_padding = rrddim_add(st_pgc_memory, "aral padding", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_memory_pgd_padding = rrddim_add(st_pgc_memory, "pgd padding", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_memory_aral_structures = rrddim_add(st_pgc_memory, "aral structures", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
}
|
||||
priority++;
|
||||
|
||||
|
@ -717,6 +734,9 @@ void pulse_dbengine_do(bool extended) {
|
|||
rrddim_set_by_pointer(st_pgc_memory, rd_pgc_memory_extent, (collected_number)pgc_extent_stats.size);
|
||||
rrddim_set_by_pointer(st_pgc_memory, rd_pgc_memory_metrics, (collected_number)mrg_stats.size);
|
||||
rrddim_set_by_pointer(st_pgc_memory, rd_pgc_memory_buffers, (collected_number)buffers_total_size);
|
||||
rrddim_set_by_pointer(st_pgc_memory, rd_pgc_memory_aral_padding, (collected_number)aral_padding_total_size);
|
||||
rrddim_set_by_pointer(st_pgc_memory, rd_pgc_memory_pgd_padding, (collected_number)pgd_padding_bytes());
|
||||
rrddim_set_by_pointer(st_pgc_memory, rd_pgc_memory_aral_structures, (collected_number)aral_structures_total_size);
|
||||
|
||||
rrdset_done(st_pgc_memory);
|
||||
}
|
||||
|
@ -756,9 +776,9 @@ void pulse_dbengine_do(bool extended) {
|
|||
localhost->rrd_update_every,
|
||||
RRDSET_TYPE_STACKED);
|
||||
|
||||
rd_pgc_buffers_pgc = rrddim_add(st_pgc_buffers, "pgc", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_buffers_pgd = rrddim_add(st_pgc_buffers, "pgd", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_buffers_mrg = rrddim_add(st_pgc_buffers, "mrg", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_buffers_pgc = rrddim_add(st_pgc_buffers, "pgc", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_buffers_pgd = rrddim_add(st_pgc_buffers, "pgd", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_buffers_mrg = rrddim_add(st_pgc_buffers, "mrg", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_buffers_opcodes = rrddim_add(st_pgc_buffers, "opcodes", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_buffers_handles = rrddim_add(st_pgc_buffers, "query handles", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
rd_pgc_buffers_descriptors = rrddim_add(st_pgc_buffers, "descriptors", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
|
||||
|
@ -776,20 +796,20 @@ void pulse_dbengine_do(bool extended) {
|
|||
}
|
||||
priority++;
|
||||
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_pgc, (collected_number)buffers.pgc);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_pgd, (collected_number)buffers.pgd);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_mrg, (collected_number)buffers.mrg);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_opcodes, (collected_number)buffers.opcodes);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_handles, (collected_number)buffers.handles);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_descriptors, (collected_number)buffers.descriptors);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_wal, (collected_number)buffers.wal);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_workers, (collected_number)buffers.workers);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_pdc, (collected_number)buffers.pdc);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_pd, (collected_number)buffers.pd);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_xt_io, (collected_number)buffers.xt_io);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_xt_buf, (collected_number)buffers.xt_buf);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_epdl, (collected_number)buffers.epdl);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_deol, (collected_number)buffers.deol);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_pgc, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_PGC]));
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_pgd, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_PGD]));
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_mrg, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_MRG]));
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_opcodes, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_OPCODES]));
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_handles, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_HANDLES]));
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_descriptors, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_DESCRIPTORS]));
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_wal, (collected_number)dbmem.wal);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_workers, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_WORKERS]));
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_pdc, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_PDC]));
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_pd, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_PD]));
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_xt_io, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_XT_IO]));
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_xt_buf, (collected_number)dbmem.xt_buf);
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_epdl, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_EPDL]));
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_deol, (collected_number)aral_free_bytes_from_stats(dbmem.as[RRDENG_MEM_DEOL]));
|
||||
#ifdef PDC_USE_JULYL
|
||||
rrddim_set_by_pointer(st_pgc_buffers, rd_pgc_buffers_julyl, (collected_number)buffers.julyl);
|
||||
#endif
|
||||
|
|
|
@ -6,11 +6,15 @@
|
|||
static struct gorilla_statistics {
|
||||
bool enabled;
|
||||
|
||||
alignas(64) uint64_t tier0_hot_gorilla_buffers;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t tier0_hot_gorilla_buffers;
|
||||
|
||||
alignas(64) uint64_t gorilla_tier0_disk_actual_bytes;
|
||||
alignas(64) uint64_t gorilla_tier0_disk_optimal_bytes;
|
||||
alignas(64) uint64_t gorilla_tier0_disk_original_bytes;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t gorilla_tier0_disk_actual_bytes;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t gorilla_tier0_disk_optimal_bytes;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t gorilla_tier0_disk_original_bytes;
|
||||
} gorilla_statistics = { 0 };
|
||||
|
||||
void pulse_gorilla_hot_buffer_added() {
|
||||
|
|
|
@ -8,16 +8,25 @@
|
|||
static struct web_statistics {
|
||||
bool extended;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
uint16_t connected_clients;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t web_client_count; // oops! this is used for giving unique IDs to web_clients!
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t web_requests;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t web_usec;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t web_usec_max;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t bytes_received;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t bytes_sent;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t content_size_uncompressed;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t content_size_compressed;
|
||||
} web_statistics;
|
||||
|
||||
|
|
|
@ -4,14 +4,22 @@
|
|||
#include "pulse-ml.h"
|
||||
|
||||
static struct ml_statistics {
|
||||
alignas(64) uint64_t ml_models_consulted;
|
||||
alignas(64) uint64_t ml_models_received;
|
||||
alignas(64) uint64_t ml_models_ignored;
|
||||
alignas(64) uint64_t ml_models_sent;
|
||||
alignas(64) uint64_t ml_models_deserialization_failures;
|
||||
alignas(64) uint64_t ml_memory_consumption;
|
||||
alignas(64) uint64_t ml_memory_new;
|
||||
alignas(64) uint64_t ml_memory_delete;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t ml_models_consulted;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t ml_models_received;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t ml_models_ignored;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t ml_models_sent;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t ml_models_deserialization_failures;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t ml_memory_consumption;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t ml_memory_new;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t ml_memory_delete;
|
||||
} ml_statistics = {0};
|
||||
|
||||
void pulse_ml_models_received()
|
||||
|
|
|
@ -5,30 +5,49 @@
|
|||
#include "streaming/replication.h"
|
||||
|
||||
static struct query_statistics {
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t api_data_queries_made;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t api_data_db_points_read;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t api_data_result_points_generated;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t api_weights_queries_made;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t api_weights_db_points_read;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t api_weights_result_points_generated;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t api_badges_queries_made;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t api_badges_db_points_read;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t api_badges_result_points_generated;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t health_queries_made;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t health_db_points_read;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t health_result_points_generated;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t ml_queries_made;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t ml_db_points_read;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t ml_result_points_generated;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t backfill_queries_made;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t backfill_db_points_read;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t exporters_queries_made;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t exporters_db_points_read;
|
||||
} query_statistics;
|
||||
|
||||
|
@ -182,12 +201,12 @@ void pulse_queries_do(bool extended __maybe_unused) {
|
|||
if (unlikely(!st_points_read)) {
|
||||
st_points_read = rrdset_create_localhost(
|
||||
"netdata"
|
||||
, "db_points_read"
|
||||
, "db_samples_read"
|
||||
, NULL
|
||||
, "Time-Series Queries"
|
||||
, NULL
|
||||
, "Netdata Time-Series DB Samples Read"
|
||||
, "points/s"
|
||||
, "samples/s"
|
||||
, "netdata"
|
||||
, "pulse"
|
||||
, 131001
|
||||
|
@ -233,7 +252,7 @@ void pulse_queries_do(bool extended __maybe_unused) {
|
|||
, NULL
|
||||
, "Time-Series Queries"
|
||||
, NULL
|
||||
, "Netdata Time-Series Samples Generated"
|
||||
, "Netdata Time-Series Points Generated"
|
||||
, "points/s"
|
||||
, "netdata"
|
||||
, "pulse"
|
||||
|
|
|
@ -6,20 +6,34 @@
|
|||
static struct sqlite3_statistics {
|
||||
bool enabled;
|
||||
|
||||
alignas(64) uint64_t sqlite3_queries_made;
|
||||
alignas(64) uint64_t sqlite3_queries_ok;
|
||||
alignas(64) uint64_t sqlite3_queries_failed;
|
||||
alignas(64) uint64_t sqlite3_queries_failed_busy;
|
||||
alignas(64) uint64_t sqlite3_queries_failed_locked;
|
||||
alignas(64) uint64_t sqlite3_rows;
|
||||
alignas(64) uint64_t sqlite3_metadata_cache_hit;
|
||||
alignas(64) uint64_t sqlite3_context_cache_hit;
|
||||
alignas(64) uint64_t sqlite3_metadata_cache_miss;
|
||||
alignas(64) uint64_t sqlite3_context_cache_miss;
|
||||
alignas(64) uint64_t sqlite3_metadata_cache_spill;
|
||||
alignas(64) uint64_t sqlite3_context_cache_spill;
|
||||
alignas(64) uint64_t sqlite3_metadata_cache_write;
|
||||
alignas(64) uint64_t sqlite3_context_cache_write;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_queries_made;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_queries_ok;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_queries_failed;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_queries_failed_busy;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_queries_failed_locked;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_rows;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_metadata_cache_hit;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_context_cache_hit;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_metadata_cache_miss;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_context_cache_miss;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_metadata_cache_spill;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_context_cache_spill;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_metadata_cache_write;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t sqlite3_context_cache_write;
|
||||
} sqlite3_statistics = { };
|
||||
|
||||
void pulse_sqlite3_query_completed(bool success, bool busy, bool locked) {
|
||||
|
|
|
@ -82,10 +82,10 @@ void *watcher_main(void *arg)
|
|||
watcher_wait_for_step(WATCHER_STEP_ID_CANCEL_MAIN_THREADS);
|
||||
watcher_wait_for_step(WATCHER_STEP_ID_FLUSH_DBENGINE_TIERS);
|
||||
watcher_wait_for_step(WATCHER_STEP_ID_STOP_COLLECTION_FOR_ALL_HOSTS);
|
||||
watcher_wait_for_step(WATCHER_STEP_ID_STOP_METASYNC_THREADS);
|
||||
watcher_wait_for_step(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_COLLECTORS_TO_FINISH);
|
||||
watcher_wait_for_step(WATCHER_STEP_ID_WAIT_FOR_DBENGINE_MAIN_CACHE_TO_FINISH_FLUSHING);
|
||||
watcher_wait_for_step(WATCHER_STEP_ID_STOP_DBENGINE_TIERS);
|
||||
watcher_wait_for_step(WATCHER_STEP_ID_STOP_METASYNC_THREADS);
|
||||
watcher_wait_for_step(WATCHER_STEP_ID_CLOSE_SQL_DATABASES);
|
||||
watcher_wait_for_step(WATCHER_STEP_ID_REMOVE_PID_FILE);
|
||||
watcher_wait_for_step(WATCHER_STEP_ID_FREE_OPENSSL_STRUCTURES);
|
||||
|
@ -140,14 +140,14 @@ void watcher_thread_start() {
|
|||
"flush dbengine tiers";
|
||||
watcher_steps[WATCHER_STEP_ID_STOP_COLLECTION_FOR_ALL_HOSTS].msg =
|
||||
"stop collection for all hosts";
|
||||
watcher_steps[WATCHER_STEP_ID_STOP_METASYNC_THREADS].msg =
|
||||
"stop metasync threads";
|
||||
watcher_steps[WATCHER_STEP_ID_WAIT_FOR_DBENGINE_COLLECTORS_TO_FINISH].msg =
|
||||
"wait for dbengine collectors to finish";
|
||||
watcher_steps[WATCHER_STEP_ID_WAIT_FOR_DBENGINE_MAIN_CACHE_TO_FINISH_FLUSHING].msg =
|
||||
"wait for dbengine main cache to finish flushing";
|
||||
watcher_steps[WATCHER_STEP_ID_STOP_DBENGINE_TIERS].msg =
|
||||
"stop dbengine tiers";
|
||||
watcher_steps[WATCHER_STEP_ID_STOP_METASYNC_THREADS].msg =
|
||||
"stop metasync threads";
|
||||
watcher_steps[WATCHER_STEP_ID_CLOSE_SQL_DATABASES].msg =
|
||||
"close SQL databases";
|
||||
watcher_steps[WATCHER_STEP_ID_REMOVE_PID_FILE].msg =
|
||||
|
|
|
@ -24,10 +24,10 @@ typedef enum {
|
|||
WATCHER_STEP_ID_CANCEL_MAIN_THREADS,
|
||||
WATCHER_STEP_ID_FLUSH_DBENGINE_TIERS,
|
||||
WATCHER_STEP_ID_STOP_COLLECTION_FOR_ALL_HOSTS,
|
||||
WATCHER_STEP_ID_STOP_METASYNC_THREADS,
|
||||
WATCHER_STEP_ID_WAIT_FOR_DBENGINE_COLLECTORS_TO_FINISH,
|
||||
WATCHER_STEP_ID_WAIT_FOR_DBENGINE_MAIN_CACHE_TO_FINISH_FLUSHING,
|
||||
WATCHER_STEP_ID_STOP_DBENGINE_TIERS,
|
||||
WATCHER_STEP_ID_STOP_METASYNC_THREADS,
|
||||
WATCHER_STEP_ID_CLOSE_SQL_DATABASES,
|
||||
WATCHER_STEP_ID_REMOVE_PID_FILE,
|
||||
WATCHER_STEP_ID_FREE_OPENSSL_STRUCTURES,
|
||||
|
|
|
@ -71,7 +71,7 @@ struct pgc_page {
|
|||
};
|
||||
|
||||
struct pgc_queue {
|
||||
alignas(64) SPINLOCK spinlock;
|
||||
SPINLOCK spinlock;
|
||||
union {
|
||||
PGC_PAGE *base;
|
||||
Pvoid_t sections_judy;
|
||||
|
@ -113,13 +113,12 @@ struct pgc {
|
|||
} config;
|
||||
|
||||
struct {
|
||||
SPINLOCK spinlock; // when locked, the evict_thread is currently evicting pages
|
||||
ND_THREAD *thread; // the thread
|
||||
struct completion completion; // signal the thread to wake up
|
||||
} evictor;
|
||||
|
||||
struct pgc_index {
|
||||
alignas(64) RW_SPINLOCK rw_spinlock;
|
||||
RW_SPINLOCK rw_spinlock;
|
||||
Pvoid_t sections_judy;
|
||||
#ifdef PGC_WITH_ARAL
|
||||
ARAL *aral;
|
||||
|
@ -127,7 +126,7 @@ struct pgc {
|
|||
} *index;
|
||||
|
||||
struct {
|
||||
alignas(64) SPINLOCK spinlock;
|
||||
SPINLOCK spinlock;
|
||||
size_t per1000;
|
||||
} usage;
|
||||
|
||||
|
@ -137,7 +136,7 @@ struct pgc {
|
|||
struct pgc_statistics stats; // statistics
|
||||
|
||||
#ifdef NETDATA_PGC_POINTER_CHECK
|
||||
alignas(64) netdata_mutex_t global_pointer_registry_mutex;
|
||||
netdata_mutex_t global_pointer_registry_mutex;
|
||||
Pvoid_t global_pointer_registry;
|
||||
#endif
|
||||
};
|
||||
|
@ -343,6 +342,20 @@ static inline void pgc_size_histogram_del(PGC *cache, struct pgc_size_histogram
|
|||
// ----------------------------------------------------------------------------
|
||||
// evictions control
|
||||
|
||||
static inline uint64_t pgc_threshold(size_t threshold, uint64_t wanted, uint64_t current, uint64_t clean) {
|
||||
if(current < clean)
|
||||
current = clean;
|
||||
|
||||
if(wanted < current - clean)
|
||||
wanted = current - clean;
|
||||
|
||||
uint64_t ret = wanted * threshold / 1000ULL;
|
||||
if(ret < current - clean)
|
||||
ret = current - clean;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) {
|
||||
|
||||
if(size_to_evict)
|
||||
|
@ -351,33 +364,33 @@ static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) {
|
|||
else if(!spinlock_trylock(&cache->usage.spinlock))
|
||||
return __atomic_load_n(&cache->usage.per1000, __ATOMIC_RELAXED);
|
||||
|
||||
size_t wanted_cache_size;
|
||||
uint64_t wanted_cache_size;
|
||||
|
||||
const size_t dirty = __atomic_load_n(&cache->dirty.stats->size, __ATOMIC_RELAXED);
|
||||
const size_t hot = __atomic_load_n(&cache->hot.stats->size, __ATOMIC_RELAXED);
|
||||
const size_t clean = __atomic_load_n(&cache->clean.stats->size, __ATOMIC_RELAXED);
|
||||
const size_t evicting = __atomic_load_n(&cache->stats.evicting_size, __ATOMIC_RELAXED);
|
||||
const size_t flushing = __atomic_load_n(&cache->stats.flushing_size, __ATOMIC_RELAXED);
|
||||
const size_t current_cache_size = __atomic_load_n(&cache->stats.size, __ATOMIC_RELAXED);
|
||||
const size_t all_pages_size = hot + dirty + clean + evicting + flushing;
|
||||
const size_t index = current_cache_size > all_pages_size ? current_cache_size - all_pages_size : 0;
|
||||
const size_t referenced_size = __atomic_load_n(&cache->stats.referenced_size, __ATOMIC_RELAXED);
|
||||
const uint64_t dirty = __atomic_load_n(&cache->dirty.stats->size, __ATOMIC_RELAXED);
|
||||
const uint64_t hot = __atomic_load_n(&cache->hot.stats->size, __ATOMIC_RELAXED);
|
||||
const uint64_t clean = __atomic_load_n(&cache->clean.stats->size, __ATOMIC_RELAXED);
|
||||
const uint64_t evicting = __atomic_load_n(&cache->stats.evicting_size, __ATOMIC_RELAXED);
|
||||
const uint64_t flushing = __atomic_load_n(&cache->stats.flushing_size, __ATOMIC_RELAXED);
|
||||
const uint64_t current_cache_size = __atomic_load_n(&cache->stats.size, __ATOMIC_RELAXED);
|
||||
const uint64_t all_pages_size = hot + dirty + clean + evicting + flushing;
|
||||
const uint64_t index = current_cache_size > all_pages_size ? current_cache_size - all_pages_size : 0;
|
||||
const uint64_t referenced_size = __atomic_load_n(&cache->stats.referenced_size, __ATOMIC_RELAXED);
|
||||
|
||||
if(cache->config.options & PGC_OPTIONS_AUTOSCALE) {
|
||||
const size_t dirty_max = __atomic_load_n(&cache->dirty.stats->max_size, __ATOMIC_RELAXED);
|
||||
const size_t hot_max = __atomic_load_n(&cache->hot.stats->max_size, __ATOMIC_RELAXED);
|
||||
const uint64_t dirty_max = __atomic_load_n(&cache->dirty.stats->max_size, __ATOMIC_RELAXED);
|
||||
const uint64_t hot_max = __atomic_load_n(&cache->hot.stats->max_size, __ATOMIC_RELAXED);
|
||||
|
||||
// our promise to users
|
||||
const size_t max_size1 = MAX(hot_max, hot) * 2;
|
||||
const uint64_t max_size1 = MAX(hot_max, hot) * 2;
|
||||
|
||||
// protection against slow flushing
|
||||
const size_t max_size2 = hot_max + ((dirty_max * 2 < hot_max * 2 / 3) ? hot_max * 2 / 3 : dirty_max * 2) + index;
|
||||
const uint64_t max_size2 = hot_max + ((dirty_max * 2 < hot_max * 2 / 3) ? hot_max * 2 / 3 : dirty_max * 2) + index;
|
||||
|
||||
// the final wanted cache size
|
||||
wanted_cache_size = MIN(max_size1, max_size2);
|
||||
|
||||
if(cache->config.dynamic_target_size_cb) {
|
||||
const size_t wanted_cache_size_cb = cache->config.dynamic_target_size_cb();
|
||||
const uint64_t wanted_cache_size_cb = cache->config.dynamic_target_size_cb();
|
||||
if(wanted_cache_size_cb > wanted_cache_size)
|
||||
wanted_cache_size = wanted_cache_size_cb;
|
||||
}
|
||||
|
@ -395,21 +408,19 @@ static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) {
|
|||
wanted_cache_size = referenced_size + dirty;
|
||||
|
||||
// if we don't have enough clean pages, there is no reason to be aggressive or critical
|
||||
if(current_cache_size > wanted_cache_size && wanted_cache_size < current_cache_size - clean)
|
||||
if(wanted_cache_size < current_cache_size - clean)
|
||||
wanted_cache_size = current_cache_size - clean;
|
||||
|
||||
bool signal_the_evictor = false;
|
||||
if(cache->config.out_of_memory_protection_bytes) {
|
||||
// out of memory protection
|
||||
OS_SYSTEM_MEMORY sm = os_system_memory(false);
|
||||
if(sm.ram_total_bytes) {
|
||||
// when the total exists, ram_available_bytes is also right
|
||||
|
||||
const size_t min_available = cache->config.out_of_memory_protection_bytes;
|
||||
const uint64_t min_available = cache->config.out_of_memory_protection_bytes;
|
||||
if (sm.ram_available_bytes < min_available) {
|
||||
// we must shrink
|
||||
wanted_cache_size = current_cache_size - (min_available - sm.ram_available_bytes);
|
||||
signal_the_evictor = true;
|
||||
}
|
||||
else if(cache->config.use_all_ram) {
|
||||
// we can grow
|
||||
|
@ -418,38 +429,40 @@ static inline size_t cache_usage_per1000(PGC *cache, size_t *size_to_evict) {
|
|||
}
|
||||
}
|
||||
|
||||
const size_t per1000 = (size_t)((unsigned long long)current_cache_size * 1000ULL / (unsigned long long)wanted_cache_size);
|
||||
|
||||
const size_t per1000 = (size_t)(current_cache_size * 1000ULL / wanted_cache_size);
|
||||
__atomic_store_n(&cache->usage.per1000, per1000, __ATOMIC_RELAXED);
|
||||
__atomic_store_n(&cache->stats.wanted_cache_size, wanted_cache_size, __ATOMIC_RELAXED);
|
||||
__atomic_store_n(&cache->stats.current_cache_size, current_cache_size, __ATOMIC_RELAXED);
|
||||
|
||||
uint64_t healthy_target = pgc_threshold(cache->config.healthy_size_per1000, wanted_cache_size, current_cache_size, clean);
|
||||
if(current_cache_size > healthy_target) {
|
||||
uint64_t low_watermark_target = pgc_threshold(cache->config.evict_low_threshold_per1000, wanted_cache_size, current_cache_size, clean);
|
||||
|
||||
uint64_t size_to_evict_now = current_cache_size - low_watermark_target;
|
||||
if(size_to_evict_now > clean)
|
||||
size_to_evict_now = clean;
|
||||
|
||||
if(size_to_evict)
|
||||
*size_to_evict = (size_t)size_to_evict_now;
|
||||
|
||||
bool signal = false;
|
||||
if(per1000 >= cache->config.severe_pressure_per1000) {
|
||||
__atomic_add_fetch(&cache->stats.events_cache_under_severe_pressure, 1, __ATOMIC_RELAXED);
|
||||
signal = true;
|
||||
}
|
||||
else if(per1000 >= cache->config.aggressive_evict_per1000) {
|
||||
__atomic_add_fetch(&cache->stats.events_cache_needs_space_aggressively, 1, __ATOMIC_RELAXED);
|
||||
signal = true;
|
||||
}
|
||||
|
||||
if(signal) {
|
||||
completion_mark_complete_a_job(&cache->evictor.completion);
|
||||
__atomic_add_fetch(&cache->stats.waste_evict_thread_signals, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
}
|
||||
|
||||
spinlock_unlock(&cache->usage.spinlock);
|
||||
|
||||
if(size_to_evict) {
|
||||
size_t target = (size_t)((uint64_t)wanted_cache_size * (uint64_t)cache->config.evict_low_threshold_per1000 / 1000ULL);
|
||||
|
||||
if(target < wanted_cache_size - clean)
|
||||
target = wanted_cache_size - clean;
|
||||
|
||||
if(current_cache_size > target)
|
||||
*size_to_evict = current_cache_size - target;
|
||||
else
|
||||
*size_to_evict = 0;
|
||||
}
|
||||
|
||||
if(per1000 >= cache->config.severe_pressure_per1000)
|
||||
__atomic_add_fetch(&cache->stats.events_cache_under_severe_pressure, 1, __ATOMIC_RELAXED);
|
||||
|
||||
else if(per1000 >= cache->config.aggressive_evict_per1000)
|
||||
__atomic_add_fetch(&cache->stats.events_cache_needs_space_aggressively, 1, __ATOMIC_RELAXED);
|
||||
|
||||
if (signal_the_evictor && spinlock_trylock(&cache->evictor.spinlock)) {
|
||||
completion_mark_complete_a_job(&cache->evictor.completion);
|
||||
spinlock_unlock(&cache->evictor.spinlock);
|
||||
__atomic_add_fetch(&cache->stats.waste_evict_thread_signals, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
return per1000;
|
||||
}
|
||||
|
||||
|
@ -558,7 +571,7 @@ struct section_pages {
|
|||
PGC_PAGE *base;
|
||||
};
|
||||
|
||||
static struct aral_statistics aral_statistics_for_pgc = { 0 };
|
||||
static struct aral_statistics pgc_aral_statistics = { 0 };
|
||||
|
||||
static ARAL *pgc_sections_aral = NULL;
|
||||
|
||||
|
@ -1169,6 +1182,7 @@ static bool evict_pages_with_filter(PGC *cache, size_t max_skip, size_t max_evic
|
|||
else if(unlikely(wait)) {
|
||||
// evict as many as necessary for the cache to go at the predefined threshold
|
||||
per1000 = cache_usage_per1000(cache, &max_size_to_evict);
|
||||
max_size_to_evict /= 2; // do it in 2 steps
|
||||
if(per1000 >= cache->config.severe_pressure_per1000) {
|
||||
under_sever_pressure = true;
|
||||
max_pages_to_evict = max_pages_to_evict ? max_pages_to_evict * 2 : 4096;
|
||||
|
@ -1934,30 +1948,35 @@ static void *pgc_evict_thread(void *ptr) {
|
|||
worker_register_job_name(0, "signaled");
|
||||
worker_register_job_name(1, "scheduled");
|
||||
|
||||
unsigned job_id = 0;
|
||||
unsigned job_id = 0, severe_pressure_counter = 0;
|
||||
|
||||
while (true) {
|
||||
worker_is_idle();
|
||||
unsigned new_job_id = completion_wait_for_a_job_with_timeout(
|
||||
&cache->evictor.completion, job_id, 100);
|
||||
&cache->evictor.completion, job_id, 1000);
|
||||
|
||||
bool was_signaled = new_job_id > job_id;
|
||||
worker_is_busy(was_signaled ? 1 : 0);
|
||||
worker_is_busy(new_job_id > job_id ? 1 : 0);
|
||||
job_id = new_job_id;
|
||||
|
||||
if (nd_thread_signaled_to_cancel())
|
||||
return NULL;
|
||||
|
||||
evict_pages(cache, 0, 0, true, false);
|
||||
|
||||
size_t size_to_evict = 0;
|
||||
size_t per1000 = cache_usage_per1000(cache, &size_to_evict);
|
||||
bool was_critical = per1000 >= cache->config.severe_pressure_per1000;
|
||||
if(cache_usage_per1000(cache, &size_to_evict) > cache->config.severe_pressure_per1000) {
|
||||
severe_pressure_counter++;
|
||||
|
||||
if(size_to_evict > 0) {
|
||||
evict_pages(cache, 0, 0, true, false);
|
||||
if(severe_pressure_counter > 100) {
|
||||
// so, we tried 100 times to reduce memory,
|
||||
// but it is still severe!
|
||||
|
||||
if (was_signaled || was_critical)
|
||||
mallocz_release_as_much_memory_to_the_system();
|
||||
severe_pressure_counter = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
severe_pressure_counter = 0;
|
||||
}
|
||||
|
||||
worker_unregister();
|
||||
|
@ -2040,7 +2059,7 @@ PGC *pgc_create(const char *name,
|
|||
sizeof(PGC_PAGE) + cache->config.additional_bytes_per_page,
|
||||
0,
|
||||
0,
|
||||
&aral_statistics_for_pgc,
|
||||
&pgc_aral_statistics,
|
||||
NULL,
|
||||
NULL,
|
||||
false,
|
||||
|
@ -2075,7 +2094,6 @@ PGC *pgc_create(const char *name,
|
|||
|
||||
// last create the eviction thread
|
||||
{
|
||||
spinlock_init(&cache->evictor.spinlock);
|
||||
completion_init(&cache->evictor.completion);
|
||||
cache->evictor.thread = nd_thread_create(name, NETDATA_THREAD_OPTION_JOINABLE, pgc_evict_thread, cache);
|
||||
}
|
||||
|
@ -2083,12 +2101,8 @@ PGC *pgc_create(const char *name,
|
|||
return cache;
|
||||
}
|
||||
|
||||
size_t pgc_aral_structures(void) {
|
||||
return aral_structures_from_stats(&aral_statistics_for_pgc);
|
||||
}
|
||||
|
||||
size_t pgc_aral_overhead(void) {
|
||||
return aral_overhead_from_stats(&aral_statistics_for_pgc);
|
||||
struct aral_statistics *pgc_aral_stats(void) {
|
||||
return &pgc_aral_statistics;
|
||||
}
|
||||
|
||||
void pgc_flush_all_hot_and_dirty_pages(PGC *cache, Word_t section) {
|
||||
|
@ -2455,6 +2469,7 @@ void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_
|
|||
|
||||
if(!page_acquire(cache, page)) {
|
||||
internal_fatal(true, "Migration to journal v2: cannot acquire page for migration to v2");
|
||||
page_transition_unlock(cache, page);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -2561,8 +2576,17 @@ void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_
|
|||
Word_t start_time = 0;
|
||||
while ((PValue2 = JudyLFirstThenNext(mi->JudyL_pages_by_start_time, &start_time, &start_time_first))) {
|
||||
struct jv2_page_info *pi = *PValue2;
|
||||
|
||||
// balance-parents: transition from hot to clean directly
|
||||
page_set_clean(cache, pi->page, true, false);
|
||||
page_transition_unlock(cache, pi->page);
|
||||
pgc_page_hot_to_dirty_and_release(cache, pi->page, true);
|
||||
page_release(cache, pi->page, true);
|
||||
|
||||
// before balance-parents:
|
||||
// page_transition_unlock(cache, pi->page);
|
||||
// pgc_page_hot_to_dirty_and_release(cache, pi->page, true);
|
||||
|
||||
// old test - don't enable:
|
||||
// make_acquired_page_clean_and_evict_or_page_release(cache, pi->page);
|
||||
aral_freez(ar_pi, pi);
|
||||
}
|
||||
|
@ -2590,7 +2614,8 @@ void pgc_open_cache_to_journal_v2(PGC *cache, Word_t section, unsigned datafile_
|
|||
|
||||
__atomic_sub_fetch(&cache->stats.workers_jv2_flush, 1, __ATOMIC_RELAXED);
|
||||
|
||||
flush_pages(cache, cache->config.max_flushes_inline, PGC_SECTION_ALL, false, false);
|
||||
// balance-parents: do not flush, there is nothing dirty
|
||||
// flush_pages(cache, cache->config.max_flushes_inline, PGC_SECTION_ALL, false, false);
|
||||
}
|
||||
|
||||
static bool match_page_data(PGC_PAGE *page, void *data) {
|
||||
|
|
|
@ -48,116 +48,180 @@ struct pgc_size_histogram {
|
|||
};
|
||||
|
||||
struct pgc_queue_statistics {
|
||||
CACHE_LINE_PADDING();
|
||||
struct pgc_size_histogram size_histogram;
|
||||
|
||||
alignas(64) size_t entries;
|
||||
alignas(64) size_t size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t entries;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t size;
|
||||
|
||||
alignas(64) size_t max_entries;
|
||||
alignas(64) size_t max_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t max_entries;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t max_size;
|
||||
|
||||
alignas(64) size_t added_entries;
|
||||
alignas(64) size_t added_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t added_entries;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t added_size;
|
||||
|
||||
alignas(64) size_t removed_entries;
|
||||
alignas(64) size_t removed_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t removed_entries;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t removed_size;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
};
|
||||
|
||||
struct pgc_statistics {
|
||||
alignas(64) size_t wanted_cache_size;
|
||||
alignas(64) size_t current_cache_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t wanted_cache_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t current_cache_size;
|
||||
CACHE_LINE_PADDING();
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------------
|
||||
// volume
|
||||
|
||||
alignas(64) size_t entries; // all the entries (includes clean, dirty, hot)
|
||||
alignas(64) size_t size; // all the entries (includes clean, dirty, hot)
|
||||
CACHE_LINE_PADDING();
|
||||
size_t entries; // all the entries (includes clean, dirty, hot)
|
||||
CACHE_LINE_PADDING();
|
||||
size_t size; // all the entries (includes clean, dirty, hot)
|
||||
|
||||
alignas(64) size_t referenced_entries; // all the entries currently referenced
|
||||
alignas(64) size_t referenced_size; // all the entries currently referenced
|
||||
CACHE_LINE_PADDING();
|
||||
size_t referenced_entries; // all the entries currently referenced
|
||||
CACHE_LINE_PADDING();
|
||||
size_t referenced_size; // all the entries currently referenced
|
||||
|
||||
alignas(64) size_t added_entries;
|
||||
alignas(64) size_t added_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t added_entries;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t added_size;
|
||||
|
||||
alignas(64) size_t removed_entries;
|
||||
alignas(64) size_t removed_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t removed_entries;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t removed_size;
|
||||
|
||||
#ifdef PGC_COUNT_POINTS_COLLECTED
|
||||
alignas(64) size_t points_collected;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t points_collected;
|
||||
#endif
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------------
|
||||
// migrations
|
||||
|
||||
alignas(64) size_t evicting_entries;
|
||||
alignas(64) size_t evicting_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t evicting_entries;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t evicting_size;
|
||||
|
||||
alignas(64) size_t flushing_entries;
|
||||
alignas(64) size_t flushing_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t flushing_entries;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t flushing_size;
|
||||
|
||||
alignas(64) size_t hot2dirty_entries;
|
||||
alignas(64) size_t hot2dirty_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t hot2dirty_entries;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t hot2dirty_size;
|
||||
|
||||
alignas(64) size_t hot_empty_pages_evicted_immediately;
|
||||
alignas(64) size_t hot_empty_pages_evicted_later;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t hot_empty_pages_evicted_immediately;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t hot_empty_pages_evicted_later;
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------------
|
||||
// workload
|
||||
|
||||
alignas(64) size_t acquires;
|
||||
alignas(64) size_t releases;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t acquires;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t releases;
|
||||
|
||||
alignas(64) size_t acquires_for_deletion;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t acquires_for_deletion;
|
||||
|
||||
alignas(64) size_t searches_exact;
|
||||
alignas(64) size_t searches_exact_hits;
|
||||
alignas(64) size_t searches_exact_misses;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t searches_exact;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t searches_exact_hits;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t searches_exact_misses;
|
||||
|
||||
alignas(64) size_t searches_closest;
|
||||
alignas(64) size_t searches_closest_hits;
|
||||
alignas(64) size_t searches_closest_misses;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t searches_closest;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t searches_closest_hits;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t searches_closest_misses;
|
||||
|
||||
alignas(64) size_t flushes_completed;
|
||||
alignas(64) size_t flushes_completed_size;
|
||||
alignas(64) size_t flushes_cancelled_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t flushes_completed;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t flushes_completed_size;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t flushes_cancelled_size;
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------------
|
||||
// critical events
|
||||
|
||||
alignas(64) size_t events_cache_under_severe_pressure;
|
||||
alignas(64) size_t events_cache_needs_space_aggressively;
|
||||
alignas(64) size_t events_flush_critical;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t events_cache_under_severe_pressure;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t events_cache_needs_space_aggressively;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t events_flush_critical;
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------------
|
||||
// worker threads
|
||||
|
||||
alignas(64) size_t workers_search;
|
||||
alignas(64) size_t workers_add;
|
||||
alignas(64) size_t workers_evict;
|
||||
alignas(64) size_t workers_flush;
|
||||
alignas(64) size_t workers_jv2_flush;
|
||||
alignas(64) size_t workers_hot2dirty;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t workers_search;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t workers_add;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t workers_evict;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t workers_flush;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t workers_jv2_flush;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t workers_hot2dirty;
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------------
|
||||
// waste events
|
||||
|
||||
// waste events - spins
|
||||
alignas(64) size_t waste_insert_spins;
|
||||
alignas(64) size_t waste_evict_useless_spins;
|
||||
alignas(64) size_t waste_release_spins;
|
||||
alignas(64) size_t waste_acquire_spins;
|
||||
alignas(64) size_t waste_delete_spins;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_insert_spins;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_evict_useless_spins;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_release_spins;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_acquire_spins;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_delete_spins;
|
||||
|
||||
// waste events - eviction
|
||||
alignas(64) size_t waste_evict_relocated;
|
||||
alignas(64) size_t waste_evict_thread_signals;
|
||||
alignas(64) size_t waste_evictions_inline_on_add;
|
||||
alignas(64) size_t waste_evictions_inline_on_release;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_evict_relocated;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_evict_thread_signals;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_evictions_inline_on_add;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_evictions_inline_on_release;
|
||||
|
||||
// waste events - flushing
|
||||
alignas(64) size_t waste_flush_on_add;
|
||||
alignas(64) size_t waste_flush_on_release;
|
||||
alignas(64) size_t waste_flushes_cancelled;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_flush_on_add;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_flush_on_release;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t waste_flushes_cancelled;
|
||||
|
||||
// ----------------------------------------------------------------------------------------------------------------
|
||||
// per queue statistics
|
||||
|
@ -248,8 +312,7 @@ bool pgc_flush_pages(PGC *cache);
|
|||
struct pgc_statistics pgc_get_statistics(PGC *cache);
|
||||
size_t pgc_hot_and_dirty_entries(PGC *cache);
|
||||
|
||||
size_t pgc_aral_structures(void);
|
||||
size_t pgc_aral_overhead(void);
|
||||
struct aral_statistics *pgc_aral_stats(void);
|
||||
|
||||
static inline size_t indexing_partition(Word_t ptr, Word_t modulo) __attribute__((const));
|
||||
static inline size_t indexing_partition(Word_t ptr, Word_t modulo) {
|
||||
|
|
|
@ -90,19 +90,16 @@ static inline void MRG_STATS_DELETE_MISS(MRG *mrg, size_t partition) {
|
|||
#define mrg_index_write_lock(mrg, partition) rw_spinlock_write_lock(&(mrg)->index[partition].rw_spinlock)
|
||||
#define mrg_index_write_unlock(mrg, partition) rw_spinlock_write_unlock(&(mrg)->index[partition].rw_spinlock)
|
||||
|
||||
static inline void mrg_stats_size_judyl_change(MRG *mrg, size_t mem_before_judyl, size_t mem_after_judyl, size_t partition) {
|
||||
if(mem_after_judyl > mem_before_judyl)
|
||||
__atomic_add_fetch(&mrg->index[partition].stats.size, mem_after_judyl - mem_before_judyl, __ATOMIC_RELAXED);
|
||||
else if(mem_after_judyl < mem_before_judyl)
|
||||
__atomic_sub_fetch(&mrg->index[partition].stats.size, mem_before_judyl - mem_after_judyl, __ATOMIC_RELAXED);
|
||||
static inline void mrg_stats_size_judyl_change(MRG *mrg, int64_t judy_mem, size_t partition) {
|
||||
__atomic_add_fetch(&mrg->index[partition].stats.size, judy_mem, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
static inline void mrg_stats_size_judyhs_added_uuid(MRG *mrg, size_t partition) {
|
||||
__atomic_add_fetch(&mrg->index[partition].stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(nd_uuid_t)), __ATOMIC_RELAXED);
|
||||
static inline void mrg_stats_size_judyhs_added_uuid(MRG *mrg, size_t partition, int64_t judy_mem) {
|
||||
__atomic_add_fetch(&mrg->index[partition].stats.size, judy_mem, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
static inline void mrg_stats_size_judyhs_removed_uuid(MRG *mrg, size_t partition) {
|
||||
__atomic_sub_fetch(&mrg->index[partition].stats.size, JUDYHS_INDEX_SIZE_ESTIMATE(sizeof(nd_uuid_t)), __ATOMIC_RELAXED);
|
||||
static inline void mrg_stats_size_judyhs_removed_uuid(MRG *mrg, size_t partition, int64_t judy_mem) {
|
||||
__atomic_sub_fetch(&mrg->index[partition].stats.size, judy_mem, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
static inline size_t uuid_partition(MRG *mrg __maybe_unused, nd_uuid_t *uuid) {
|
||||
|
@ -163,7 +160,7 @@ static inline bool acquired_metric_has_retention(MRG *mrg, METRIC *metric) {
|
|||
static inline void acquired_for_deletion_metric_delete(MRG *mrg, METRIC *metric) {
|
||||
size_t partition = metric->partition;
|
||||
|
||||
size_t mem_before_judyl, mem_after_judyl;
|
||||
int64_t judy_mem;
|
||||
|
||||
mrg_index_write_lock(mrg, partition);
|
||||
|
||||
|
@ -174,10 +171,10 @@ static inline void acquired_for_deletion_metric_delete(MRG *mrg, METRIC *metric)
|
|||
return;
|
||||
}
|
||||
|
||||
mem_before_judyl = JudyLMemUsed(*sections_judy_pptr);
|
||||
judy_mem = -(int64_t)JudyLMemUsed(*sections_judy_pptr);
|
||||
int rc = JudyLDel(sections_judy_pptr, metric->section, PJE0);
|
||||
mem_after_judyl = JudyLMemUsed(*sections_judy_pptr);
|
||||
mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl, partition);
|
||||
judy_mem += (int64_t)JudyLMemUsed(*sections_judy_pptr);
|
||||
mrg_stats_size_judyl_change(mrg, judy_mem, partition);
|
||||
|
||||
if(unlikely(!rc)) {
|
||||
MRG_STATS_DELETE_MISS(mrg, partition);
|
||||
|
@ -186,10 +183,15 @@ static inline void acquired_for_deletion_metric_delete(MRG *mrg, METRIC *metric)
|
|||
}
|
||||
|
||||
if(!*sections_judy_pptr) {
|
||||
JudyAllocThreadPulseReset();
|
||||
|
||||
rc = JudyHSDel(&mrg->index[partition].uuid_judy, &metric->uuid, sizeof(nd_uuid_t), PJE0);
|
||||
|
||||
int64_t judy_mem = JudyAllocThreadPulseGetAndReset();
|
||||
|
||||
if(unlikely(!rc))
|
||||
fatal("DBENGINE METRIC: cannot delete UUID from JudyHS");
|
||||
mrg_stats_size_judyhs_removed_uuid(mrg, partition);
|
||||
mrg_stats_size_judyhs_removed_uuid(mrg, partition, judy_mem);
|
||||
}
|
||||
|
||||
MRG_STATS_DELETED_METRIC(mrg, partition);
|
||||
|
@ -262,19 +264,22 @@ static inline METRIC *metric_add_and_acquire(MRG *mrg, MRG_ENTRY *entry, bool *r
|
|||
while(1) {
|
||||
mrg_index_write_lock(mrg, partition);
|
||||
|
||||
size_t mem_before_judyl, mem_after_judyl;
|
||||
JudyAllocThreadPulseReset();
|
||||
|
||||
Pvoid_t *sections_judy_pptr = JudyHSIns(&mrg->index[partition].uuid_judy, entry->uuid, sizeof(nd_uuid_t), PJE0);
|
||||
|
||||
int64_t judy_mem = JudyAllocThreadPulseGetAndReset();
|
||||
|
||||
if (unlikely(!sections_judy_pptr || sections_judy_pptr == PJERR))
|
||||
fatal("DBENGINE METRIC: corrupted UUIDs JudyHS array");
|
||||
|
||||
if (unlikely(!*sections_judy_pptr))
|
||||
mrg_stats_size_judyhs_added_uuid(mrg, partition);
|
||||
mrg_stats_size_judyhs_added_uuid(mrg, partition, judy_mem);
|
||||
|
||||
mem_before_judyl = JudyLMemUsed(*sections_judy_pptr);
|
||||
judy_mem = -(int64_t)JudyLMemUsed(*sections_judy_pptr);
|
||||
PValue = JudyLIns(sections_judy_pptr, entry->section, PJE0);
|
||||
mem_after_judyl = JudyLMemUsed(*sections_judy_pptr);
|
||||
mrg_stats_size_judyl_change(mrg, mem_before_judyl, mem_after_judyl, partition);
|
||||
judy_mem += (int64_t)JudyLMemUsed(*sections_judy_pptr);
|
||||
mrg_stats_size_judyl_change(mrg, judy_mem, partition);
|
||||
|
||||
if (unlikely(!PValue || PValue == PJERR))
|
||||
fatal("DBENGINE METRIC: corrupted section JudyL array");
|
||||
|
@ -380,12 +385,8 @@ inline MRG *mrg_create(ssize_t partitions) {
|
|||
return mrg;
|
||||
}
|
||||
|
||||
inline size_t mrg_aral_structures(void) {
|
||||
return aral_structures_from_stats(&mrg_aral_statistics);
|
||||
}
|
||||
|
||||
inline size_t mrg_aral_overhead(void) {
|
||||
return aral_overhead_from_stats(&mrg_aral_statistics);
|
||||
struct aral_statistics *mrg_aral_stats(void) {
|
||||
return &mrg_aral_statistics;
|
||||
}
|
||||
|
||||
inline void mrg_destroy(MRG *mrg __maybe_unused) {
|
||||
|
|
|
@ -4,8 +4,6 @@
|
|||
|
||||
#include "../rrd.h"
|
||||
|
||||
#define MRG_CACHE_LINE_PADDING(x) uint8_t padding##x[64]
|
||||
|
||||
typedef struct metric METRIC;
|
||||
typedef struct mrg MRG;
|
||||
|
||||
|
@ -21,7 +19,7 @@ struct mrg_statistics {
|
|||
// --- non-atomic --- under a write lock
|
||||
|
||||
size_t entries;
|
||||
size_t size; // total memory used, with indexing
|
||||
ssize_t size; // total memory used, with indexing
|
||||
|
||||
size_t additions;
|
||||
size_t additions_duplicate;
|
||||
|
@ -30,21 +28,22 @@ struct mrg_statistics {
|
|||
size_t delete_having_retention_or_referenced;
|
||||
size_t delete_misses;
|
||||
|
||||
MRG_CACHE_LINE_PADDING(0);
|
||||
|
||||
// --- atomic --- multiple readers / writers
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
size_t entries_referenced;
|
||||
|
||||
MRG_CACHE_LINE_PADDING(2);
|
||||
CACHE_LINE_PADDING();
|
||||
size_t current_references;
|
||||
|
||||
MRG_CACHE_LINE_PADDING(3);
|
||||
CACHE_LINE_PADDING();
|
||||
size_t search_hits;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t search_misses;
|
||||
|
||||
MRG_CACHE_LINE_PADDING(4);
|
||||
CACHE_LINE_PADDING();
|
||||
size_t writers;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t writers_conflicts;
|
||||
};
|
||||
|
||||
|
@ -83,9 +82,7 @@ bool mrg_metric_set_writer(MRG *mrg, METRIC *metric);
|
|||
bool mrg_metric_clear_writer(MRG *mrg, METRIC *metric);
|
||||
|
||||
void mrg_get_statistics(MRG *mrg, struct mrg_statistics *s);
|
||||
size_t mrg_aral_structures(void);
|
||||
size_t mrg_aral_overhead(void);
|
||||
|
||||
struct aral_statistics *mrg_aral_stats(void);
|
||||
|
||||
void mrg_update_metric_retention_and_granularity_by_uuid(
|
||||
MRG *mrg, Word_t section, nd_uuid_t *uuid,
|
||||
|
|
|
@ -62,6 +62,7 @@ struct pgd {
|
|||
#define PGD_ARAL_PARTITIONS_MAX 256
|
||||
|
||||
struct {
|
||||
int64_t padding_used;
|
||||
size_t partitions;
|
||||
|
||||
size_t sizeof_pgd;
|
||||
|
@ -77,7 +78,7 @@ struct {
|
|||
#error "You need to update the slots reserved for storage tiers"
|
||||
#endif
|
||||
|
||||
static struct aral_statistics aral_statistics_for_pgd = { 0 };
|
||||
static struct aral_statistics pgd_aral_statistics = { 0 };
|
||||
|
||||
static size_t aral_sizes_delta;
|
||||
static size_t aral_sizes_count;
|
||||
|
@ -89,8 +90,11 @@ static size_t aral_sizes[] = {
|
|||
[RRD_STORAGE_TIERS - 2] = 0,
|
||||
[RRD_STORAGE_TIERS - 1] = 0,
|
||||
|
||||
// gorilla buffer size
|
||||
// gorilla buffer sizes
|
||||
RRDENG_GORILLA_32BIT_BUFFER_SIZE,
|
||||
RRDENG_GORILLA_32BIT_BUFFER_SIZE * 2,
|
||||
RRDENG_GORILLA_32BIT_BUFFER_SIZE * 3,
|
||||
RRDENG_GORILLA_32BIT_BUFFER_SIZE * 4,
|
||||
|
||||
// our structures
|
||||
sizeof(gorilla_writer_t),
|
||||
|
@ -101,12 +105,13 @@ static ARAL **arals = NULL;
|
|||
#define arals_slot(slot, partition) ((partition) * aral_sizes_count + (slot))
|
||||
static ARAL *pgd_get_aral_by_size_and_partition(size_t size, size_t partition);
|
||||
|
||||
size_t pgd_aral_structures(void) {
|
||||
return aral_structures(pgd_alloc_globals.aral_pgd[0]);
|
||||
size_t pgd_padding_bytes(void) {
|
||||
int64_t x = __atomic_load_n(&pgd_alloc_globals.padding_used, __ATOMIC_RELAXED);
|
||||
return (x > 0) ? x : 0;
|
||||
}
|
||||
|
||||
size_t pgd_aral_overhead(void) {
|
||||
return aral_overhead(pgd_alloc_globals.aral_pgd[0]);
|
||||
struct aral_statistics *pgd_aral_stats(void) {
|
||||
return &pgd_aral_statistics;
|
||||
}
|
||||
|
||||
int aral_size_sort_compare(const void *a, const void *b) {
|
||||
|
@ -175,7 +180,7 @@ void pgd_init_arals(void) {
|
|||
aral_sizes[slot],
|
||||
0,
|
||||
0,
|
||||
&aral_statistics_for_pgd,
|
||||
&pgd_aral_statistics,
|
||||
NULL, NULL, false, false);
|
||||
}
|
||||
}
|
||||
|
@ -254,6 +259,9 @@ static inline PGD *pgd_alloc(bool for_collector) {
|
|||
static inline void *pgd_data_alloc(size_t size, size_t partition, bool for_collector) {
|
||||
ARAL *ar = pgd_get_aral_by_size_and_partition(size, partition);
|
||||
if(ar) {
|
||||
int64_t padding = (int64_t)aral_requested_element_size(ar) - (int64_t)size;
|
||||
__atomic_add_fetch(&pgd_alloc_globals.padding_used, padding, __ATOMIC_RELAXED);
|
||||
|
||||
if(for_collector)
|
||||
return aral_mallocz_marked(ar);
|
||||
else
|
||||
|
@ -265,8 +273,12 @@ static inline void *pgd_data_alloc(size_t size, size_t partition, bool for_colle
|
|||
|
||||
static void pgd_data_free(void *page, size_t size, size_t partition) {
|
||||
ARAL *ar = pgd_get_aral_by_size_and_partition(size, partition);
|
||||
if(ar)
|
||||
if(ar) {
|
||||
int64_t padding = (int64_t)aral_requested_element_size(ar) - (int64_t)size;
|
||||
__atomic_sub_fetch(&pgd_alloc_globals.padding_used, padding, __ATOMIC_RELAXED);
|
||||
|
||||
aral_freez(ar, page);
|
||||
}
|
||||
else
|
||||
freez(page);
|
||||
timing_dbengine_evict_step(TIMING_STEP_DBENGINE_EVICT_FREE_MAIN_PGD_TIER1_ARAL);
|
||||
|
|
|
@ -38,8 +38,8 @@ uint32_t pgd_memory_footprint(PGD *pg);
|
|||
uint32_t pgd_capacity(PGD *pg);
|
||||
uint32_t pgd_disk_footprint(PGD *pg);
|
||||
|
||||
size_t pgd_aral_structures(void);
|
||||
size_t pgd_aral_overhead(void);
|
||||
struct aral_statistics *pgd_aral_stats(void);
|
||||
size_t pgd_padding_bytes(void);
|
||||
|
||||
void pgd_copy_to_extent(PGD *pg, uint8_t *dst, uint32_t dst_size);
|
||||
|
||||
|
|
|
@ -1033,7 +1033,7 @@ void pgc_open_add_hot_page(Word_t section, Word_t metric_id, time_t start_time_s
|
|||
|
||||
size_t dynamic_open_cache_size(void) {
|
||||
size_t main_wanted_cache_size = pgc_get_wanted_cache_size(main_cache);
|
||||
size_t target_size = main_wanted_cache_size / 100 * 10; // 10%
|
||||
size_t target_size = main_wanted_cache_size / 100 * 5;
|
||||
|
||||
if(target_size < 2 * 1024 * 1024)
|
||||
target_size = 2 * 1024 * 1024;
|
||||
|
@ -1048,7 +1048,7 @@ size_t dynamic_open_cache_size(void) {
|
|||
|
||||
size_t dynamic_extent_cache_size(void) {
|
||||
size_t main_wanted_cache_size = pgc_get_wanted_cache_size(main_cache);
|
||||
size_t target_size = main_wanted_cache_size / 100 * 10; // 10%
|
||||
size_t target_size = main_wanted_cache_size / 100 * 30;
|
||||
|
||||
if(target_size < 5 * 1024 * 1024)
|
||||
target_size = 5 * 1024 * 1024;
|
||||
|
@ -1070,12 +1070,12 @@ void pgc_and_mrg_initialize(void)
|
|||
main_mrg = mrg_create(0);
|
||||
|
||||
size_t target_cache_size = (size_t)default_rrdeng_page_cache_mb * 1024ULL * 1024ULL;
|
||||
size_t main_cache_size = (target_cache_size / 100) * 95;
|
||||
size_t main_cache_size = (target_cache_size / 100) * 70;
|
||||
size_t open_cache_size = 0;
|
||||
size_t extent_cache_size = (target_cache_size / 100) * 5;
|
||||
size_t extent_cache_size = (target_cache_size / 100) * 30;
|
||||
|
||||
if(extent_cache_size < 3 * 1024 * 1024) {
|
||||
extent_cache_size = 3 * 1024 * 1024;
|
||||
if(extent_cache_size < 5 * 1024 * 1024) {
|
||||
extent_cache_size = 5 * 1024 * 1024;
|
||||
main_cache_size = target_cache_size - extent_cache_size;
|
||||
}
|
||||
|
||||
|
@ -1092,7 +1092,7 @@ void pgc_and_mrg_initialize(void)
|
|||
pgc_max_evictors(),
|
||||
1000,
|
||||
1,
|
||||
PGC_OPTIONS_AUTOSCALE,
|
||||
PGC_OPTIONS_AUTOSCALE | PGC_OPTIONS_EVICT_PAGES_NO_INLINE,
|
||||
0,
|
||||
0
|
||||
);
|
||||
|
@ -1109,7 +1109,7 @@ void pgc_and_mrg_initialize(void)
|
|||
pgc_max_evictors(),
|
||||
1000,
|
||||
1,
|
||||
PGC_OPTIONS_AUTOSCALE, // flushing inline: all dirty pages are just converted to clean
|
||||
PGC_OPTIONS_AUTOSCALE | PGC_OPTIONS_FLUSH_PAGES_NO_INLINE | PGC_OPTIONS_EVICT_PAGES_NO_INLINE,
|
||||
0,
|
||||
sizeof(struct extent_io_data)
|
||||
);
|
||||
|
@ -1126,7 +1126,7 @@ void pgc_and_mrg_initialize(void)
|
|||
pgc_max_evictors(),
|
||||
1000,
|
||||
1,
|
||||
PGC_OPTIONS_AUTOSCALE | PGC_OPTIONS_FLUSH_PAGES_NO_INLINE, // no flushing needed
|
||||
PGC_OPTIONS_AUTOSCALE | PGC_OPTIONS_FLUSH_PAGES_NO_INLINE | PGC_OPTIONS_EVICT_PAGES_NO_INLINE, // no flushing needed
|
||||
0,
|
||||
0
|
||||
);
|
||||
|
|
|
@ -71,8 +71,8 @@ static void pdc_release(PDC *pdc) {
|
|||
aral_freez(pdc_globals.pdc.ar, pdc);
|
||||
}
|
||||
|
||||
size_t pdc_cache_size(void) {
|
||||
return aral_overhead(pdc_globals.pdc.ar) + aral_structures(pdc_globals.pdc.ar);
|
||||
struct aral_statistics *pdc_aral_stats(void) {
|
||||
return aral_get_statistics(pdc_globals.pdc.ar);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
@ -100,8 +100,8 @@ static void page_details_release(struct page_details *pd) {
|
|||
aral_freez(pdc_globals.pd.ar, pd);
|
||||
}
|
||||
|
||||
size_t pd_cache_size(void) {
|
||||
return aral_overhead(pdc_globals.pd.ar) + aral_structures(pdc_globals.pd.ar);
|
||||
struct aral_statistics *pd_aral_stats(void) {
|
||||
return aral_get_statistics(pdc_globals.pd.ar);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
@ -129,8 +129,8 @@ static void epdl_release(EPDL *epdl) {
|
|||
aral_freez(pdc_globals.epdl.ar, epdl);
|
||||
}
|
||||
|
||||
size_t epdl_cache_size(void) {
|
||||
return aral_overhead(pdc_globals.epdl.ar) + aral_structures(pdc_globals.epdl.ar);
|
||||
struct aral_statistics *epdl_aral_stats(void) {
|
||||
return aral_get_statistics(pdc_globals.epdl.ar);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
@ -159,8 +159,8 @@ static void deol_release(DEOL *deol) {
|
|||
aral_freez(pdc_globals.deol.ar, deol);
|
||||
}
|
||||
|
||||
size_t deol_cache_size(void) {
|
||||
return aral_overhead(pdc_globals.deol.ar) + aral_structures(pdc_globals.deol.ar);
|
||||
struct aral_statistics *deol_aral_stats(void) {
|
||||
return aral_get_statistics(pdc_globals.deol.ar);
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
|
|
@ -34,10 +34,11 @@ typedef void (*execute_extent_page_details_list_t)(struct rrdengine_instance *ct
|
|||
void pdc_to_epdl_router(struct rrdengine_instance *ctx, struct page_details_control *pdc, execute_extent_page_details_list_t exec_first_extent_list, execute_extent_page_details_list_t exec_rest_extent_list);
|
||||
void epdl_find_extent_and_populate_pages(struct rrdengine_instance *ctx, EPDL *epdl, bool worker);
|
||||
|
||||
size_t pdc_cache_size(void);
|
||||
size_t pd_cache_size(void);
|
||||
size_t epdl_cache_size(void);
|
||||
size_t deol_cache_size(void);
|
||||
struct aral_statistics *pdc_aral_stats(void);
|
||||
struct aral_statistics *pd_aral_stats(void);
|
||||
struct aral_statistics *epdl_aral_stats(void);
|
||||
struct aral_statistics *deol_aral_stats(void);
|
||||
|
||||
size_t extent_buffer_cache_size(void);
|
||||
|
||||
void pdc_init(void);
|
||||
|
|
|
@ -5,11 +5,7 @@
|
|||
#include "pdc.h"
|
||||
#include "dbengine-compression.h"
|
||||
|
||||
rrdeng_stats_t global_io_errors = 0;
|
||||
rrdeng_stats_t global_fs_errors = 0;
|
||||
rrdeng_stats_t rrdeng_reserved_file_descriptors = 0;
|
||||
rrdeng_stats_t global_pg_cache_over_half_dirty_events = 0;
|
||||
rrdeng_stats_t global_flushing_pressure_page_deletions = 0;
|
||||
struct rrdeng_global_stats global_stats = { 0 };
|
||||
|
||||
unsigned rrdeng_pages_per_extent = DEFAULT_PAGES_PER_EXTENT;
|
||||
|
||||
|
@ -1587,25 +1583,27 @@ static void after_journal_v2_indexing(struct rrdengine_instance *ctx __maybe_unu
|
|||
rrdeng_enq_cmd(ctx, RRDENG_OPCODE_DATABASE_ROTATE, NULL, NULL, STORAGE_PRIORITY_INTERNAL_DBENGINE, NULL, NULL);
|
||||
}
|
||||
|
||||
struct rrdeng_buffer_sizes rrdeng_get_buffer_sizes(void) {
|
||||
struct rrdeng_buffer_sizes rrdeng_pulse_memory_sizes(void) {
|
||||
return (struct rrdeng_buffer_sizes) {
|
||||
.pgc = pgc_aral_overhead() + pgc_aral_structures(),
|
||||
.pgd = pgd_aral_overhead() + pgd_aral_structures(),
|
||||
.mrg = mrg_aral_overhead() + mrg_aral_structures(),
|
||||
.opcodes = aral_overhead(rrdeng_main.cmd_queue.ar) + aral_structures(rrdeng_main.cmd_queue.ar),
|
||||
.handles = aral_overhead(rrdeng_main.handles.ar) + aral_structures(rrdeng_main.handles.ar),
|
||||
.descriptors = aral_overhead(rrdeng_main.descriptors.ar) + aral_structures(rrdeng_main.descriptors.ar),
|
||||
.wal = __atomic_load_n(&wal_globals.atomics.allocated, __ATOMIC_RELAXED) * (sizeof(WAL) + RRDENG_BLOCK_SIZE),
|
||||
.workers = aral_overhead(rrdeng_main.work_cmd.ar),
|
||||
.pdc = pdc_cache_size(),
|
||||
.xt_io = aral_overhead(rrdeng_main.xt_io_descr.ar) + aral_structures(rrdeng_main.xt_io_descr.ar),
|
||||
.xt_buf = extent_buffer_cache_size(),
|
||||
.epdl = epdl_cache_size(),
|
||||
.deol = deol_cache_size(),
|
||||
.pd = pd_cache_size(),
|
||||
.as = {
|
||||
[RRDENG_MEM_PGC] = pgc_aral_stats(),
|
||||
[RRDENG_MEM_PGD] = pgd_aral_stats(),
|
||||
[RRDENG_MEM_MRG] = mrg_aral_stats(),
|
||||
[RRDENG_MEM_PDC] = pdc_aral_stats(),
|
||||
[RRDENG_MEM_EPDL] = epdl_aral_stats(),
|
||||
[RRDENG_MEM_DEOL] = deol_aral_stats(),
|
||||
[RRDENG_MEM_PD] = pd_aral_stats(),
|
||||
[RRDENG_MEM_OPCODES] = aral_get_statistics(rrdeng_main.cmd_queue.ar),
|
||||
[RRDENG_MEM_HANDLES] = aral_get_statistics(rrdeng_main.handles.ar),
|
||||
[RRDENG_MEM_DESCRIPTORS] = aral_get_statistics(rrdeng_main.descriptors.ar),
|
||||
[RRDENG_MEM_WORKERS] = aral_get_statistics(rrdeng_main.work_cmd.ar),
|
||||
[RRDENG_MEM_XT_IO] = aral_get_statistics(rrdeng_main.xt_io_descr.ar),
|
||||
},
|
||||
.wal = __atomic_load_n(&wal_globals.atomics.allocated, __ATOMIC_RELAXED) * (sizeof(WAL) + RRDENG_BLOCK_SIZE),
|
||||
.xt_buf = extent_buffer_cache_size(),
|
||||
|
||||
#ifdef PDC_USE_JULYL
|
||||
.julyl = julyl_cache_size(),
|
||||
.julyl = julyl_cache_size(),
|
||||
#endif
|
||||
};
|
||||
}
|
||||
|
|
|
@ -327,34 +327,60 @@ void wal_release(WAL *wal);
|
|||
* They only describe operations since DB engine instance load time.
|
||||
*/
|
||||
struct rrdengine_statistics {
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t before_decompress_bytes;
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t after_decompress_bytes;
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t before_compress_bytes;
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t after_compress_bytes;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t io_write_bytes;
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t io_write_requests;
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t io_read_bytes;
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t io_read_requests;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t datafile_creations;
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t datafile_deletions;
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t journalfile_creations;
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t journalfile_deletions;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t io_errors;
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t fs_errors;
|
||||
};
|
||||
|
||||
/* I/O errors global counter */
|
||||
extern rrdeng_stats_t global_io_errors;
|
||||
/* File-System errors global counter */
|
||||
extern rrdeng_stats_t global_fs_errors;
|
||||
/* number of File-Descriptors that have been reserved by dbengine */
|
||||
extern rrdeng_stats_t rrdeng_reserved_file_descriptors;
|
||||
/* inability to flush global counters */
|
||||
extern rrdeng_stats_t global_pg_cache_over_half_dirty_events;
|
||||
extern rrdeng_stats_t global_flushing_pressure_page_deletions; /* number of deleted pages */
|
||||
struct rrdeng_global_stats {
|
||||
CACHE_LINE_PADDING();
|
||||
/* I/O errors global counter */
|
||||
rrdeng_stats_t global_io_errors;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
/* File-System errors global counter */
|
||||
rrdeng_stats_t global_fs_errors;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
/* number of File-Descriptors that have been reserved by dbengine */
|
||||
rrdeng_stats_t rrdeng_reserved_file_descriptors;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
/* inability to flush global counters */
|
||||
rrdeng_stats_t global_pg_cache_over_half_dirty_events;
|
||||
CACHE_LINE_PADDING();
|
||||
rrdeng_stats_t global_flushing_pressure_page_deletions; /* number of deleted pages */
|
||||
};
|
||||
|
||||
extern struct rrdeng_global_stats global_stats;
|
||||
|
||||
typedef struct tier_config_prototype {
|
||||
int tier; // the tier of this ctx
|
||||
|
@ -387,22 +413,35 @@ struct rrdengine_instance {
|
|||
} njfv2idx;
|
||||
|
||||
struct {
|
||||
CACHE_LINE_PADDING();
|
||||
unsigned last_fileno; // newest index of datafile and journalfile
|
||||
CACHE_LINE_PADDING();
|
||||
unsigned last_flush_fileno; // newest index of datafile received data
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
size_t collectors_running;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t collectors_running_duplicate;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t inflight_queries; // the number of queries currently running
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t current_disk_space; // the current disk space size used
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t transaction_id; // the transaction id of the next extent flushing
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
bool migration_to_v2_running;
|
||||
CACHE_LINE_PADDING();
|
||||
bool now_deleting_files;
|
||||
CACHE_LINE_PADDING();
|
||||
unsigned extents_currently_being_flushed; // non-zero until we commit data to disk (both datafile and journal file)
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
time_t first_time_s;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t metrics;
|
||||
CACHE_LINE_PADDING();
|
||||
uint64_t samples;
|
||||
} atomic;
|
||||
|
||||
|
@ -440,12 +479,12 @@ static inline void ctx_io_write_op_bytes(struct rrdengine_instance *ctx, size_t
|
|||
|
||||
static inline void ctx_io_error(struct rrdengine_instance *ctx) {
|
||||
__atomic_add_fetch(&ctx->stats.io_errors, 1, __ATOMIC_RELAXED);
|
||||
rrd_stat_atomic_add(&global_io_errors, 1);
|
||||
rrd_stat_atomic_add(&global_stats.global_io_errors, 1);
|
||||
}
|
||||
|
||||
static inline void ctx_fs_error(struct rrdengine_instance *ctx) {
|
||||
__atomic_add_fetch(&ctx->stats.fs_errors, 1, __ATOMIC_RELAXED);
|
||||
rrd_stat_atomic_add(&global_fs_errors, 1);
|
||||
rrd_stat_atomic_add(&global_stats.global_fs_errors, 1);
|
||||
}
|
||||
|
||||
#define ctx_last_fileno_get(ctx) __atomic_load_n(&(ctx)->atomic.last_fileno, __ATOMIC_RELAXED)
|
||||
|
|
|
@ -1046,13 +1046,13 @@ void rrdeng_get_37_statistics(struct rrdengine_instance *ctx, unsigned long long
|
|||
array[27] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.page_cache_descriptors, __ATOMIC_RELAXED);
|
||||
array[28] = (uint64_t)__atomic_load_n(&ctx->stats.io_errors, __ATOMIC_RELAXED);
|
||||
array[29] = (uint64_t)__atomic_load_n(&ctx->stats.fs_errors, __ATOMIC_RELAXED);
|
||||
array[30] = (uint64_t)__atomic_load_n(&global_io_errors, __ATOMIC_RELAXED); // used
|
||||
array[31] = (uint64_t)__atomic_load_n(&global_fs_errors, __ATOMIC_RELAXED); // used
|
||||
array[32] = (uint64_t)__atomic_load_n(&rrdeng_reserved_file_descriptors, __ATOMIC_RELAXED); // used
|
||||
array[30] = (uint64_t)__atomic_load_n(&global_stats.global_io_errors, __ATOMIC_RELAXED); // used
|
||||
array[31] = (uint64_t)__atomic_load_n(&global_stats.global_fs_errors, __ATOMIC_RELAXED); // used
|
||||
array[32] = (uint64_t)__atomic_load_n(&global_stats.rrdeng_reserved_file_descriptors, __ATOMIC_RELAXED); // used
|
||||
array[33] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.pg_cache_over_half_dirty_events, __ATOMIC_RELAXED);
|
||||
array[34] = (uint64_t)__atomic_load_n(&global_pg_cache_over_half_dirty_events, __ATOMIC_RELAXED); // used
|
||||
array[34] = (uint64_t)__atomic_load_n(&global_stats.global_pg_cache_over_half_dirty_events, __ATOMIC_RELAXED); // used
|
||||
array[35] = 0; // (uint64_t)__atomic_load_n(&ctx->stats.flushing_pressure_page_deletions, __ATOMIC_RELAXED);
|
||||
array[36] = (uint64_t)__atomic_load_n(&global_flushing_pressure_page_deletions, __ATOMIC_RELAXED); // used
|
||||
array[36] = (uint64_t)__atomic_load_n(&global_stats.global_flushing_pressure_page_deletions, __ATOMIC_RELAXED); // used
|
||||
array[37] = 0; //(uint64_t)pg_cache->active_descriptors;
|
||||
|
||||
fatal_assert(RRDENG_NR_STATS == 38);
|
||||
|
@ -1144,15 +1144,15 @@ int rrdeng_init(
|
|||
max_open_files = rlimit_nofile.rlim_cur / 4;
|
||||
|
||||
/* reserve RRDENG_FD_BUDGET_PER_INSTANCE file descriptors for this instance */
|
||||
rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, RRDENG_FD_BUDGET_PER_INSTANCE);
|
||||
if (rrdeng_reserved_file_descriptors > max_open_files) {
|
||||
rrd_stat_atomic_add(&global_stats.rrdeng_reserved_file_descriptors, RRDENG_FD_BUDGET_PER_INSTANCE);
|
||||
if (global_stats.rrdeng_reserved_file_descriptors > max_open_files) {
|
||||
netdata_log_error(
|
||||
"Exceeded the budget of available file descriptors (%u/%u), cannot create new dbengine instance.",
|
||||
(unsigned)rrdeng_reserved_file_descriptors,
|
||||
(unsigned)global_stats.rrdeng_reserved_file_descriptors,
|
||||
(unsigned)max_open_files);
|
||||
|
||||
rrd_stat_atomic_add(&global_fs_errors, 1);
|
||||
rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE);
|
||||
rrd_stat_atomic_add(&global_stats.global_fs_errors, 1);
|
||||
rrd_stat_atomic_add(&global_stats.rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE);
|
||||
return UV_EMFILE;
|
||||
}
|
||||
|
||||
|
@ -1196,7 +1196,7 @@ int rrdeng_init(
|
|||
*ctxp = NULL;
|
||||
}
|
||||
|
||||
rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE);
|
||||
rrd_stat_atomic_add(&global_stats.rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE);
|
||||
return UV_EIO;
|
||||
}
|
||||
|
||||
|
@ -1243,7 +1243,7 @@ int rrdeng_exit(struct rrdengine_instance *ctx) {
|
|||
if (unittest_running) //(ctx->config.unittest)
|
||||
freez(ctx);
|
||||
|
||||
rrd_stat_atomic_add(&rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE);
|
||||
rrd_stat_atomic_add(&global_stats.rrdeng_reserved_file_descriptors, -RRDENG_FD_BUDGET_PER_INSTANCE);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -208,27 +208,35 @@ struct rrdeng_cache_efficiency_stats {
|
|||
size_t metrics_retention_started;
|
||||
};
|
||||
|
||||
typedef enum rrdeng_mem {
|
||||
RRDENG_MEM_PGC = 0,
|
||||
RRDENG_MEM_PGD,
|
||||
RRDENG_MEM_MRG,
|
||||
RRDENG_MEM_OPCODES,
|
||||
RRDENG_MEM_HANDLES,
|
||||
RRDENG_MEM_DESCRIPTORS,
|
||||
RRDENG_MEM_WORKERS,
|
||||
RRDENG_MEM_PDC,
|
||||
RRDENG_MEM_XT_IO,
|
||||
RRDENG_MEM_EPDL,
|
||||
RRDENG_MEM_DEOL,
|
||||
RRDENG_MEM_PD,
|
||||
|
||||
// terminator
|
||||
RRDENG_MEM_MAX,
|
||||
} RRDENG_MEM;
|
||||
|
||||
struct rrdeng_buffer_sizes {
|
||||
size_t workers;
|
||||
size_t pdc;
|
||||
struct aral_statistics *as[RRDENG_MEM_MAX];
|
||||
|
||||
size_t wal;
|
||||
size_t descriptors;
|
||||
size_t xt_io;
|
||||
size_t xt_buf;
|
||||
size_t handles;
|
||||
size_t opcodes;
|
||||
size_t epdl;
|
||||
size_t deol;
|
||||
size_t pd;
|
||||
size_t pgc;
|
||||
size_t pgd;
|
||||
size_t mrg;
|
||||
#ifdef PDC_USE_JULYL
|
||||
size_t julyl;
|
||||
#endif
|
||||
};
|
||||
|
||||
struct rrdeng_buffer_sizes rrdeng_get_buffer_sizes(void);
|
||||
struct rrdeng_buffer_sizes rrdeng_pulse_memory_sizes(void);
|
||||
struct rrdeng_cache_efficiency_stats rrdeng_get_cache_efficiency_stats(void);
|
||||
|
||||
RRDENG_SIZE_STATS rrdeng_size_statistics(struct rrdengine_instance *ctx);
|
||||
|
|
|
@ -278,7 +278,7 @@ struct rrddim_tier {
|
|||
STORAGE_COLLECT_HANDLE *sch; // the data collection handle
|
||||
};
|
||||
|
||||
void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now_s);
|
||||
void backfill_tier_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now_s);
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// RRD DIMENSION - this is a metric
|
||||
|
@ -921,8 +921,23 @@ typedef enum __attribute__ ((__packed__)) rrdhost_flags {
|
|||
// Careful not to overlap with rrdhost_options to avoid bugs if
|
||||
// rrdhost_flags_xxx is used instead of rrdhost_option_xxx or vice-versa
|
||||
// Orphan, Archived and Obsolete flags
|
||||
|
||||
/*
|
||||
* 3 BASE FLAGS FOR HOSTS:
|
||||
*
|
||||
* - COLLECTOR_ONLINE = the collector is currently collecting data for this node
|
||||
* this is true FOR ALL KINDS OF NODES (including localhost, virtual hosts, children)
|
||||
*
|
||||
* - ORPHAN = the node had a collector online recently, but does not have it now
|
||||
*
|
||||
* - ARCHIVED = the node does not have data collection structures attached to it
|
||||
*
|
||||
*/
|
||||
|
||||
RRDHOST_FLAG_COLLECTOR_ONLINE = (1 << 7), // the collector of this host is online
|
||||
RRDHOST_FLAG_ORPHAN = (1 << 8), // this host is orphan (not receiving data)
|
||||
RRDHOST_FLAG_ARCHIVED = (1 << 9), // The host is archived, no collected charts yet
|
||||
|
||||
RRDHOST_FLAG_PENDING_OBSOLETE_CHARTS = (1 << 10), // the host has pending chart obsoletions
|
||||
RRDHOST_FLAG_PENDING_OBSOLETE_DIMENSIONS = (1 << 11), // the host has pending dimension obsoletions
|
||||
|
||||
|
@ -951,7 +966,6 @@ typedef enum __attribute__ ((__packed__)) rrdhost_flags {
|
|||
RRDHOST_FLAG_PENDING_CONTEXT_LOAD = (1 << 26), // Context needs to be loaded
|
||||
|
||||
RRDHOST_FLAG_METADATA_CLAIMID = (1 << 27), // metadata needs to be stored in the database
|
||||
RRDHOST_FLAG_STREAM_RECEIVER_DISCONNECTED = (1 << 28), // set when the receiver part is disconnected
|
||||
|
||||
RRDHOST_FLAG_GLOBAL_FUNCTIONS_UPDATED = (1 << 29), // set when the host has updated global functions
|
||||
} RRDHOST_FLAGS;
|
||||
|
@ -990,7 +1004,7 @@ typedef enum __attribute__ ((__packed__)) {
|
|||
#define rrdhost_can_stream_metadata_to_parent(host) \
|
||||
(rrdhost_has_stream_sender_enabled(host) && \
|
||||
rrdhost_flag_check(host, RRDHOST_FLAG_STREAM_SENDER_READY_4_METRICS) && \
|
||||
!rrdhost_flag_check(host, RRDHOST_FLAG_STREAM_RECEIVER_DISCONNECTED) \
|
||||
rrdhost_flag_check(host, RRDHOST_FLAG_COLLECTOR_ONLINE) \
|
||||
)
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
@ -1358,7 +1372,12 @@ extern RRDHOST *localhost;
|
|||
#define rrdhost_sender_replicating_charts_minus_one(host) (__atomic_sub_fetch(&((host)->stream.snd.status.replication.charts), 1, __ATOMIC_RELAXED))
|
||||
#define rrdhost_sender_replicating_charts_zero(host) (__atomic_store_n(&((host)->stream.snd.status.replication.charts), 0, __ATOMIC_RELAXED))
|
||||
|
||||
#define rrdhost_is_online(host) ((host) == localhost || rrdhost_option_check(host, RRDHOST_OPTION_VIRTUAL_HOST) || !rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN | RRDHOST_FLAG_STREAM_RECEIVER_DISCONNECTED))
|
||||
#define rrdhost_is_online(host) ( \
|
||||
(host) == localhost || \
|
||||
rrdhost_option_check(host, RRDHOST_OPTION_VIRTUAL_HOST) || \
|
||||
(rrdhost_flag_check(host, RRDHOST_FLAG_COLLECTOR_ONLINE) && !rrdhost_flag_check(host, RRDHOST_FLAG_ORPHAN)) \
|
||||
)
|
||||
|
||||
bool rrdhost_matches_window(RRDHOST *host, time_t after, time_t before, time_t now);
|
||||
|
||||
extern DICTIONARY *rrdhost_root_index;
|
||||
|
|
|
@ -841,6 +841,9 @@ int rrd_init(const char *hostname, struct rrdhost_system_info *system_info, bool
|
|||
if (unlikely(!localhost))
|
||||
return 1;
|
||||
|
||||
rrdhost_flag_set(localhost, RRDHOST_FLAG_COLLECTOR_ONLINE);
|
||||
|
||||
ml_host_start(localhost);
|
||||
dyncfg_host_init(localhost);
|
||||
|
||||
if(!unittest)
|
||||
|
|
|
@ -65,9 +65,9 @@ typedef struct rrdlabels {
|
|||
} \
|
||||
while (0)
|
||||
|
||||
static inline void STATS_PLUS_MEMORY(struct dictionary_stats *stats, size_t key_size, size_t item_size, size_t value_size) {
|
||||
if(key_size)
|
||||
__atomic_fetch_add(&stats->memory.index, (long)JUDYHS_INDEX_SIZE_ESTIMATE(key_size), __ATOMIC_RELAXED);
|
||||
static inline void STATS_PLUS_MEMORY(struct dictionary_stats *stats, int64_t judy_mem, size_t item_size, size_t value_size) {
|
||||
if(judy_mem)
|
||||
__atomic_fetch_add(&stats->memory.index, judy_mem, __ATOMIC_RELAXED);
|
||||
|
||||
if(item_size)
|
||||
__atomic_fetch_add(&stats->memory.dict, (long)item_size, __ATOMIC_RELAXED);
|
||||
|
@ -76,9 +76,9 @@ static inline void STATS_PLUS_MEMORY(struct dictionary_stats *stats, size_t key_
|
|||
__atomic_fetch_add(&stats->memory.values, (long)value_size, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
static inline void STATS_MINUS_MEMORY(struct dictionary_stats *stats, size_t key_size, size_t item_size, size_t value_size) {
|
||||
if(key_size)
|
||||
__atomic_fetch_sub(&stats->memory.index, (long)JUDYHS_INDEX_SIZE_ESTIMATE(key_size), __ATOMIC_RELAXED);
|
||||
static inline void STATS_MINUS_MEMORY(struct dictionary_stats *stats, int64_t judy_mem, size_t item_size, size_t value_size) {
|
||||
if(judy_mem)
|
||||
__atomic_fetch_add(&stats->memory.index, judy_mem, __ATOMIC_RELAXED);
|
||||
|
||||
if(item_size)
|
||||
__atomic_fetch_sub(&stats->memory.dict, (long)item_size, __ATOMIC_RELAXED);
|
||||
|
@ -131,7 +131,12 @@ static RRDLABEL *add_label_name_value(const char *name, const char *value)
|
|||
|
||||
spinlock_lock(&global_labels.spinlock);
|
||||
|
||||
JudyAllocThreadPulseReset();
|
||||
|
||||
Pvoid_t *PValue = JudyHSIns(&global_labels.JudyHS, (void *)&label_index, sizeof(label_index), PJE0);
|
||||
|
||||
int64_t judy_mem = JudyAllocThreadPulseGetAndReset();
|
||||
|
||||
if(unlikely(!PValue || PValue == PJERR))
|
||||
fatal("RRDLABELS: corrupted judyHS array");
|
||||
|
||||
|
@ -139,11 +144,12 @@ static RRDLABEL *add_label_name_value(const char *name, const char *value)
|
|||
rrdlabel = *PValue;
|
||||
string_freez(label_index.key);
|
||||
string_freez(label_index.value);
|
||||
STATS_PLUS_MEMORY(&dictionary_stats_category_rrdlabels, judy_mem, 0, 0);
|
||||
} else {
|
||||
rrdlabel = callocz(1, sizeof(*rrdlabel));
|
||||
rrdlabel->label.index = label_index;
|
||||
*PValue = rrdlabel;
|
||||
STATS_PLUS_MEMORY(&dictionary_stats_category_rrdlabels, sizeof(LABEL_REGISTRY_IDX), sizeof(RRDLABEL_IDX), 0);
|
||||
STATS_PLUS_MEMORY(&dictionary_stats_category_rrdlabels, judy_mem, sizeof(RRDLABEL_IDX), 0);
|
||||
}
|
||||
__atomic_add_fetch(&rrdlabel->refcount, 1, __ATOMIC_RELAXED);
|
||||
|
||||
|
@ -160,11 +166,16 @@ static void delete_label(RRDLABEL *label)
|
|||
RRDLABEL_IDX *rrdlabel = *PValue;
|
||||
size_t refcount = __atomic_sub_fetch(&rrdlabel->refcount, 1, __ATOMIC_RELAXED);
|
||||
if (refcount == 0) {
|
||||
JudyAllocThreadPulseReset();
|
||||
|
||||
int ret = JudyHSDel(&global_labels.JudyHS, (void *)label, sizeof(*label), PJE0);
|
||||
|
||||
int64_t judy_mem = JudyAllocThreadPulseGetAndReset();
|
||||
|
||||
if (unlikely(ret == JERR))
|
||||
STATS_MINUS_MEMORY(&dictionary_stats_category_rrdlabels, 0, sizeof(*rrdlabel), 0);
|
||||
STATS_MINUS_MEMORY(&dictionary_stats_category_rrdlabels, judy_mem, sizeof(*rrdlabel), 0);
|
||||
else
|
||||
STATS_MINUS_MEMORY(&dictionary_stats_category_rrdlabels, sizeof(LABEL_REGISTRY_IDX), sizeof(*rrdlabel), 0);
|
||||
STATS_MINUS_MEMORY(&dictionary_stats_category_rrdlabels, judy_mem, sizeof(*rrdlabel), 0);
|
||||
string_freez(label->index.key);
|
||||
string_freez(label->index.value);
|
||||
freez(rrdlabel);
|
||||
|
|
|
@ -1281,7 +1281,7 @@ void rrddim_store_metric(RRDDIM *rd, usec_t point_end_time_ut, NETDATA_DOUBLE n,
|
|||
if(!rrddim_option_check(rd, RRDDIM_OPTION_BACKFILLED_HIGH_TIERS)) {
|
||||
// we have not collected this tier before
|
||||
// let's fill any gap that may exist
|
||||
rrdr_fill_tier_gap_from_smaller_tiers(rd, tier, now_s);
|
||||
backfill_tier_from_smaller_tiers(rd, tier, now_s);
|
||||
}
|
||||
|
||||
store_metric_at_tier(rd, tier, t, sp, point_end_time_ut);
|
||||
|
|
|
@ -213,6 +213,12 @@ static void health_event_loop(void) {
|
|||
unsigned int loop = 0;
|
||||
|
||||
while(service_running(SERVICE_HEALTH)) {
|
||||
if(!stream_control_health_should_be_running()) {
|
||||
worker_is_idle();
|
||||
stream_control_throttle();
|
||||
continue;
|
||||
}
|
||||
|
||||
loop++;
|
||||
netdata_log_debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
|
||||
|
||||
|
|
|
@ -11,6 +11,12 @@
|
|||
#define TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS
|
||||
#endif
|
||||
|
||||
#if ENV32BIT
|
||||
#define SYSTEM_REQUIRED_ALIGNMENT (sizeof(uintptr_t) * 2)
|
||||
#else
|
||||
#define SYSTEM_REQUIRED_ALIGNMENT (alignof(uintptr_t))
|
||||
#endif
|
||||
|
||||
// max mapped file size
|
||||
#define ARAL_MAX_PAGE_SIZE_MMAP (1ULL * 1024 * 1024 * 1024)
|
||||
|
||||
|
@ -61,13 +67,17 @@ typedef enum {
|
|||
|
||||
struct aral_ops {
|
||||
struct {
|
||||
alignas(64) size_t allocators; // the number of threads currently trying to allocate memory
|
||||
alignas(64) size_t deallocators; // the number of threads currently trying to deallocate memory
|
||||
alignas(64) bool last_allocated_or_deallocated; // stability detector, true when was last allocated
|
||||
CACHE_LINE_PADDING();
|
||||
size_t allocators; // the number of threads currently trying to allocate memory
|
||||
CACHE_LINE_PADDING();
|
||||
size_t deallocators; // the number of threads currently trying to deallocate memory
|
||||
CACHE_LINE_PADDING();
|
||||
bool last_allocated_or_deallocated; // stability detector, true when was last allocated
|
||||
} atomic;
|
||||
|
||||
struct {
|
||||
alignas(64) SPINLOCK spinlock;
|
||||
CACHE_LINE_PADDING();
|
||||
SPINLOCK spinlock;
|
||||
size_t allocating_elements; // currently allocating elements
|
||||
size_t allocation_size; // current / next allocation size
|
||||
} adders;
|
||||
|
@ -97,7 +107,7 @@ struct aral {
|
|||
} config;
|
||||
|
||||
struct {
|
||||
alignas(64) SPINLOCK spinlock;
|
||||
SPINLOCK spinlock;
|
||||
size_t file_number; // for mmap
|
||||
|
||||
ARAL_PAGE *pages_free; // pages with free items
|
||||
|
@ -125,12 +135,12 @@ const char *aral_name(ARAL *ar) {
|
|||
return ar->config.name;
|
||||
}
|
||||
|
||||
size_t aral_structures_from_stats(struct aral_statistics *stats) {
|
||||
size_t aral_structures_bytes_from_stats(struct aral_statistics *stats) {
|
||||
if(!stats) return 0;
|
||||
return __atomic_load_n(&stats->structures.allocated_bytes, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
size_t aral_overhead_from_stats(struct aral_statistics *stats) {
|
||||
size_t aral_free_bytes_from_stats(struct aral_statistics *stats) {
|
||||
if(!stats) return 0;
|
||||
|
||||
size_t allocated = __atomic_load_n(&stats->malloc.allocated_bytes, __ATOMIC_RELAXED) +
|
||||
|
@ -139,23 +149,39 @@ size_t aral_overhead_from_stats(struct aral_statistics *stats) {
|
|||
size_t used = __atomic_load_n(&stats->malloc.used_bytes, __ATOMIC_RELAXED) +
|
||||
__atomic_load_n(&stats->mmap.used_bytes, __ATOMIC_RELAXED);
|
||||
|
||||
if(allocated > used) return allocated - used;
|
||||
return allocated;
|
||||
return (allocated > used) ? allocated - used : 0;
|
||||
}
|
||||
|
||||
size_t aral_used_bytes_from_stats(struct aral_statistics *stats) {
|
||||
size_t used = __atomic_load_n(&stats->malloc.used_bytes, __ATOMIC_RELAXED) +
|
||||
__atomic_load_n(&stats->mmap.used_bytes, __ATOMIC_RELAXED);
|
||||
|
||||
return used;
|
||||
}
|
||||
|
||||
size_t aral_overhead(ARAL *ar) {
|
||||
return aral_overhead_from_stats(ar->stats);
|
||||
size_t aral_padding_bytes_from_stats(struct aral_statistics *stats) {
|
||||
size_t padding = __atomic_load_n(&stats->malloc.padding_bytes, __ATOMIC_RELAXED) +
|
||||
__atomic_load_n(&stats->mmap.padding_bytes, __ATOMIC_RELAXED);
|
||||
return padding;
|
||||
}
|
||||
|
||||
size_t aral_structures(ARAL *ar) {
|
||||
return aral_structures_from_stats(ar->stats);
|
||||
size_t aral_used_bytes(ARAL *ar) {
|
||||
return aral_used_bytes_from_stats(ar->stats);
|
||||
}
|
||||
|
||||
size_t aral_free_bytes(ARAL *ar) {
|
||||
return aral_free_bytes_from_stats(ar->stats);
|
||||
}
|
||||
|
||||
size_t aral_structures_bytes(ARAL *ar) {
|
||||
return aral_structures_bytes_from_stats(ar->stats);
|
||||
}
|
||||
|
||||
size_t aral_padding_bytes(ARAL *ar) {
|
||||
return aral_padding_bytes_from_stats(ar->stats);
|
||||
}
|
||||
|
||||
size_t aral_free_structures_padding_from_stats(struct aral_statistics *stats) {
|
||||
return aral_free_bytes_from_stats(stats) + aral_structures_bytes_from_stats(stats) + aral_padding_bytes_from_stats(stats);
|
||||
}
|
||||
|
||||
struct aral_statistics *aral_get_statistics(ARAL *ar) {
|
||||
|
@ -343,6 +369,8 @@ static ARAL_PAGE *aral_get_page_pointer_after_element___do_NOT_have_aral_lock(AR
|
|||
}
|
||||
#endif
|
||||
|
||||
internal_fatal((uintptr_t)page % SYSTEM_REQUIRED_ALIGNMENT != 0, "Pointer is not aligned properly");
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
|
@ -387,11 +415,6 @@ static size_t aral_get_system_page_size(void) {
|
|||
return page_size;
|
||||
}
|
||||
|
||||
// we don't need alignof(max_align_t) for normal C structures
|
||||
// alignof(uintptr_r) is sufficient for our use cases
|
||||
// #define SYSTEM_REQUIRED_ALIGNMENT (alignof(max_align_t))
|
||||
#define SYSTEM_REQUIRED_ALIGNMENT (alignof(uintptr_t))
|
||||
|
||||
static size_t aral_element_slot_size(size_t requested_element_size, bool usable) {
|
||||
// we need to add a page pointer after the element
|
||||
// so, first align the element size to the pointer size
|
||||
|
@ -453,8 +476,11 @@ static size_t aral_next_allocation_size___adders_lock_needed(ARAL *ar, bool mark
|
|||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
static ARAL_PAGE *aral_create_page___no_lock_needed(ARAL *ar, size_t size TRACE_ALLOCATIONS_FUNCTION_DEFINITION_PARAMS) {
|
||||
size_t data_size, structures_size;
|
||||
struct aral_page_type_stats *stats;
|
||||
ARAL_PAGE *page;
|
||||
|
||||
size_t total_size = size;
|
||||
|
||||
if(ar->config.mmap.enabled) {
|
||||
page = callocz(1, sizeof(ARAL_PAGE));
|
||||
ar->aral_lock.file_number++;
|
||||
|
@ -469,10 +495,8 @@ static ARAL_PAGE *aral_create_page___no_lock_needed(ARAL *ar, size_t size TRACE_
|
|||
fatal("ARAL: '%s' cannot allocate aral buffer of size %zu on filename '%s'",
|
||||
ar->config.name, size, page->filename);
|
||||
|
||||
__atomic_add_fetch(&ar->stats->mmap.allocations, 1, __ATOMIC_RELAXED);
|
||||
__atomic_add_fetch(&ar->stats->mmap.allocated_bytes, size, __ATOMIC_RELAXED);
|
||||
data_size = size;
|
||||
structures_size = sizeof(ARAL_PAGE);
|
||||
total_size = size + sizeof(ARAL_PAGE);
|
||||
stats = &ar->stats->mmap;
|
||||
}
|
||||
#ifdef NETDATA_TRACE_ALLOCATIONS
|
||||
else {
|
||||
|
@ -485,23 +509,18 @@ static ARAL_PAGE *aral_create_page___no_lock_needed(ARAL *ar, size_t size TRACE_
|
|||
#else
|
||||
else {
|
||||
size_t ARAL_PAGE_size = memory_alignment(sizeof(ARAL_PAGE), SYSTEM_REQUIRED_ALIGNMENT);
|
||||
size_t max_elements = aral_elements_in_page_size(ar, size);
|
||||
data_size = max_elements * ar->config.element_size;
|
||||
structures_size = size - data_size;
|
||||
|
||||
if (size >= ARAL_MMAP_PAGES_ABOVE) {
|
||||
bool mapped;
|
||||
uint8_t *ptr = netdata_mmap(NULL, size, MAP_PRIVATE, 1, false, NULL);
|
||||
if (ptr) {
|
||||
mapped = true;
|
||||
__atomic_add_fetch(&ar->stats->mmap.allocations, 1, __ATOMIC_RELAXED);
|
||||
__atomic_add_fetch(&ar->stats->mmap.allocated_bytes, data_size, __ATOMIC_RELAXED);
|
||||
stats = &ar->stats->mmap;
|
||||
}
|
||||
else {
|
||||
ptr = mallocz(size);
|
||||
mapped = false;
|
||||
__atomic_add_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED);
|
||||
__atomic_add_fetch(&ar->stats->malloc.allocated_bytes, data_size, __ATOMIC_RELAXED);
|
||||
stats = &ar->stats->malloc;
|
||||
}
|
||||
page = (ARAL_PAGE *)ptr;
|
||||
memset(page, 0, ARAL_PAGE_size);
|
||||
|
@ -515,8 +534,7 @@ static ARAL_PAGE *aral_create_page___no_lock_needed(ARAL *ar, size_t size TRACE_
|
|||
page->data = &ptr[ARAL_PAGE_size];
|
||||
page->mapped = false;
|
||||
|
||||
__atomic_add_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED);
|
||||
__atomic_add_fetch(&ar->stats->malloc.allocated_bytes, data_size, __ATOMIC_RELAXED);
|
||||
stats = &ar->stats->malloc;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -526,13 +544,21 @@ static ARAL_PAGE *aral_create_page___no_lock_needed(ARAL *ar, size_t size TRACE_
|
|||
page->max_elements = aral_elements_in_page_size(ar, page->size);
|
||||
page->aral_lock.free_elements = page->max_elements;
|
||||
|
||||
size_t structures_size = sizeof(ARAL_PAGE) + page->max_elements * sizeof(void *);
|
||||
size_t data_size = page->max_elements * ar->config.requested_element_size;
|
||||
size_t padding_size = total_size - data_size - structures_size;
|
||||
|
||||
__atomic_add_fetch(&stats->allocations, 1, __ATOMIC_RELAXED);
|
||||
__atomic_add_fetch(&stats->allocated_bytes, data_size, __ATOMIC_RELAXED);
|
||||
__atomic_add_fetch(&stats->padding_bytes, padding_size, __ATOMIC_RELAXED);
|
||||
|
||||
__atomic_add_fetch(&ar->stats->structures.allocations, 1, __ATOMIC_RELAXED);
|
||||
__atomic_add_fetch(&ar->stats->structures.allocated_bytes, structures_size, __ATOMIC_RELAXED);
|
||||
|
||||
// link the free space to its page
|
||||
ARAL_FREE *fr = (ARAL_FREE *)page->data;
|
||||
|
||||
fr->size = data_size;
|
||||
fr->size = page->max_elements * ar->config.element_size;
|
||||
fr->next = NULL;
|
||||
page->free.list = fr;
|
||||
|
||||
|
@ -545,15 +571,15 @@ void aral_del_page___no_lock_needed(ARAL *ar, ARAL_PAGE *page TRACE_ALLOCATIONS_
|
|||
size_t idx = mark_to_idx(page->started_marked);
|
||||
__atomic_store_n(&ar->ops[idx].atomic.last_allocated_or_deallocated, true, __ATOMIC_RELAXED);
|
||||
|
||||
size_t data_size, structures_size;
|
||||
struct aral_page_type_stats *stats;
|
||||
size_t max_elements = page->max_elements;
|
||||
size_t size = page->size;
|
||||
size_t total_size = size;
|
||||
|
||||
// free it
|
||||
if (ar->config.mmap.enabled) {
|
||||
data_size = page->size;
|
||||
structures_size = sizeof(ARAL_PAGE);
|
||||
|
||||
__atomic_sub_fetch(&ar->stats->mmap.allocations, 1, __ATOMIC_RELAXED);
|
||||
__atomic_sub_fetch(&ar->stats->mmap.allocated_bytes, page->size, __ATOMIC_RELAXED);
|
||||
stats = &ar->stats->mmap;
|
||||
total_size = size + sizeof(ARAL_PAGE);
|
||||
|
||||
netdata_munmap(page->data, page->size);
|
||||
|
||||
|
@ -571,24 +597,25 @@ void aral_del_page___no_lock_needed(ARAL *ar, ARAL_PAGE *page TRACE_ALLOCATIONS_
|
|||
freez_int(page->data TRACE_ALLOCATIONS_FUNCTION_CALL_PARAMS);
|
||||
freez(page);
|
||||
#else
|
||||
data_size = page->max_elements * ar->config.element_size;
|
||||
structures_size = page->size - data_size;
|
||||
|
||||
if(page->mapped) {
|
||||
__atomic_sub_fetch(&ar->stats->mmap.allocations, 1, __ATOMIC_RELAXED);
|
||||
__atomic_sub_fetch(&ar->stats->mmap.allocated_bytes, data_size, __ATOMIC_RELAXED);
|
||||
|
||||
stats = &ar->stats->mmap;
|
||||
netdata_munmap(page, page->size);
|
||||
}
|
||||
else {
|
||||
__atomic_sub_fetch(&ar->stats->malloc.allocations, 1, __ATOMIC_RELAXED);
|
||||
__atomic_sub_fetch(&ar->stats->malloc.allocated_bytes, data_size, __ATOMIC_RELAXED);
|
||||
|
||||
stats = &ar->stats->malloc;
|
||||
freez(page);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
size_t structures_size = sizeof(ARAL_PAGE) + max_elements * sizeof(void *);
|
||||
size_t data_size = max_elements * ar->config.requested_element_size;
|
||||
size_t padding_size = total_size - data_size - structures_size;
|
||||
|
||||
__atomic_sub_fetch(&stats->allocations, 1, __ATOMIC_RELAXED);
|
||||
__atomic_sub_fetch(&stats->allocated_bytes, data_size, __ATOMIC_RELAXED);
|
||||
__atomic_sub_fetch(&stats->padding_bytes, padding_size, __ATOMIC_RELAXED);
|
||||
|
||||
__atomic_sub_fetch(&ar->stats->structures.allocations, 1, __ATOMIC_RELAXED);
|
||||
__atomic_sub_fetch(&ar->stats->structures.allocated_bytes, structures_size, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
@ -766,10 +793,12 @@ void *aral_mallocz_internal(ARAL *ar, bool marked TRACE_ALLOCATIONS_FUNCTION_DEF
|
|||
// put the page pointer after the element
|
||||
aral_set_page_pointer_after_element___do_NOT_have_aral_lock(ar, page, found_fr, marked);
|
||||
|
||||
if(unlikely(ar->config.mmap.enabled))
|
||||
__atomic_add_fetch(&ar->stats->mmap.used_bytes, ar->config.element_size, __ATOMIC_RELAXED);
|
||||
if(unlikely(ar->config.mmap.enabled || page->mapped))
|
||||
__atomic_add_fetch(&ar->stats->mmap.used_bytes, ar->config.requested_element_size, __ATOMIC_RELAXED);
|
||||
else
|
||||
__atomic_add_fetch(&ar->stats->malloc.used_bytes, ar->config.element_size, __ATOMIC_RELAXED);
|
||||
__atomic_add_fetch(&ar->stats->malloc.used_bytes, ar->config.requested_element_size, __ATOMIC_RELAXED);
|
||||
|
||||
internal_fatal((uintptr_t)found_fr % SYSTEM_REQUIRED_ALIGNMENT != 0, "Pointer is not aligned properly");
|
||||
|
||||
return (void *)found_fr;
|
||||
}
|
||||
|
@ -827,11 +856,6 @@ void aral_freez_internal(ARAL *ar, void *ptr TRACE_ALLOCATIONS_FUNCTION_DEFINITI
|
|||
|
||||
if(unlikely(!ptr)) return;
|
||||
|
||||
if(unlikely(ar->config.mmap.enabled))
|
||||
__atomic_sub_fetch(&ar->stats->mmap.used_bytes, ar->config.element_size, __ATOMIC_RELAXED);
|
||||
else
|
||||
__atomic_sub_fetch(&ar->stats->malloc.used_bytes, ar->config.element_size, __ATOMIC_RELAXED);
|
||||
|
||||
// get the page pointer
|
||||
bool marked;
|
||||
ARAL_PAGE *page = aral_get_page_pointer_after_element___do_NOT_have_aral_lock(ar, ptr, &marked);
|
||||
|
@ -839,6 +863,11 @@ void aral_freez_internal(ARAL *ar, void *ptr TRACE_ALLOCATIONS_FUNCTION_DEFINITI
|
|||
size_t idx = mark_to_idx(marked);
|
||||
__atomic_add_fetch(&ar->ops[idx].atomic.deallocators, 1, __ATOMIC_RELAXED);
|
||||
|
||||
if(unlikely(ar->config.mmap.enabled || page->mapped))
|
||||
__atomic_sub_fetch(&ar->stats->mmap.used_bytes, ar->config.requested_element_size, __ATOMIC_RELAXED);
|
||||
else
|
||||
__atomic_sub_fetch(&ar->stats->malloc.used_bytes, ar->config.requested_element_size, __ATOMIC_RELAXED);
|
||||
|
||||
// make this element available
|
||||
ARAL_FREE *fr = (ARAL_FREE *)ptr;
|
||||
fr->size = ar->config.element_size;
|
||||
|
@ -1093,18 +1122,22 @@ struct aral_statistics *aral_by_size_statistics(void) {
|
|||
return &aral_by_size_globals.shared_statistics;
|
||||
}
|
||||
|
||||
size_t aral_by_size_structures(void) {
|
||||
return aral_structures_from_stats(&aral_by_size_globals.shared_statistics);
|
||||
size_t aral_by_size_structures_bytes(void) {
|
||||
return aral_structures_bytes_from_stats(&aral_by_size_globals.shared_statistics);
|
||||
}
|
||||
|
||||
size_t aral_by_size_overhead(void) {
|
||||
return aral_overhead_from_stats(&aral_by_size_globals.shared_statistics);
|
||||
size_t aral_by_size_free_bytes(void) {
|
||||
return aral_free_bytes_from_stats(&aral_by_size_globals.shared_statistics);
|
||||
}
|
||||
|
||||
size_t aral_by_size_used_bytes(void) {
|
||||
return aral_used_bytes_from_stats(&aral_by_size_globals.shared_statistics);
|
||||
}
|
||||
|
||||
size_t aral_by_size_padding_bytes(void) {
|
||||
return aral_padding_bytes_from_stats(&aral_by_size_globals.shared_statistics);
|
||||
}
|
||||
|
||||
ARAL *aral_by_size_acquire(size_t size) {
|
||||
spinlock_lock(&aral_by_size_globals.spinlock);
|
||||
|
||||
|
|
|
@ -8,53 +8,96 @@
|
|||
|
||||
typedef struct aral ARAL;
|
||||
|
||||
struct aral_page_type_stats {
|
||||
CACHE_LINE_PADDING();
|
||||
size_t allocations;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t allocated_bytes;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t used_bytes;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t padding_bytes;
|
||||
};
|
||||
|
||||
struct aral_statistics {
|
||||
struct {
|
||||
alignas(64) size_t allocations;
|
||||
alignas(64) size_t allocated_bytes;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t allocations;
|
||||
CACHE_LINE_PADDING();
|
||||
size_t allocated_bytes;
|
||||
} structures;
|
||||
|
||||
struct {
|
||||
alignas(64) size_t allocations;
|
||||
alignas(64) size_t allocated_bytes;
|
||||
alignas(64) size_t used_bytes;
|
||||
} malloc;
|
||||
|
||||
struct {
|
||||
alignas(64) size_t allocations;
|
||||
alignas(64) size_t allocated_bytes;
|
||||
alignas(64) size_t used_bytes;
|
||||
} mmap;
|
||||
struct aral_page_type_stats malloc;
|
||||
struct aral_page_type_stats mmap;
|
||||
};
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
const char *aral_name(ARAL *ar);
|
||||
|
||||
ARAL *aral_create(const char *name, size_t element_size, size_t initial_page_elements, size_t max_page_size,
|
||||
struct aral_statistics *stats, const char *filename, const char **cache_dir, bool mmap, bool lockless);
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
// return the size of the element, as requested
|
||||
size_t aral_requested_element_size(ARAL *ar);
|
||||
|
||||
// return the exact memory footprint of the elements
|
||||
size_t aral_actual_element_size(ARAL *ar);
|
||||
|
||||
const char *aral_name(ARAL *ar);
|
||||
size_t aral_overhead(ARAL *ar);
|
||||
size_t aral_structures(ARAL *ar);
|
||||
struct aral_statistics *aral_get_statistics(ARAL *ar);
|
||||
size_t aral_structures_from_stats(struct aral_statistics *stats);
|
||||
size_t aral_overhead_from_stats(struct aral_statistics *stats);
|
||||
|
||||
ARAL *aral_by_size_acquire(size_t size);
|
||||
void aral_by_size_release(ARAL *ar);
|
||||
size_t aral_by_size_structures(void);
|
||||
size_t aral_by_size_overhead(void);
|
||||
struct aral_statistics *aral_by_size_statistics(void);
|
||||
|
||||
size_t aral_by_size_used_bytes(void);
|
||||
size_t aral_used_bytes_from_stats(struct aral_statistics *stats);
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
size_t aral_optimal_malloc_page_size(void);
|
||||
|
||||
int aral_unittest(size_t elements);
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
/*
|
||||
*
|
||||
* The total memory used by ARAL is:
|
||||
*
|
||||
* total = structures + used + free + padding
|
||||
*
|
||||
* or
|
||||
*
|
||||
* total = structures + allocated + padding
|
||||
*
|
||||
* always:
|
||||
*
|
||||
* allocated = used + free
|
||||
*
|
||||
* Hints:
|
||||
* - allocated, used and free are about the requested element size.
|
||||
* - structures includes the extension of the elements for the metadata aral needs.
|
||||
* - padding is lost due to alignment requirements
|
||||
*
|
||||
*/
|
||||
|
||||
size_t aral_structures_bytes(ARAL *ar);
|
||||
size_t aral_free_bytes(ARAL *ar);
|
||||
size_t aral_used_bytes(ARAL *ar);
|
||||
size_t aral_padding_bytes(ARAL *ar);
|
||||
|
||||
struct aral_statistics *aral_get_statistics(ARAL *ar);
|
||||
|
||||
size_t aral_structures_bytes_from_stats(struct aral_statistics *stats);
|
||||
size_t aral_free_bytes_from_stats(struct aral_statistics *stats);
|
||||
size_t aral_used_bytes_from_stats(struct aral_statistics *stats);
|
||||
size_t aral_padding_bytes_from_stats(struct aral_statistics *stats);
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
ARAL *aral_by_size_acquire(size_t size);
|
||||
void aral_by_size_release(ARAL *ar);
|
||||
|
||||
size_t aral_by_size_structures_bytes(void);
|
||||
size_t aral_by_size_free_bytes(void);
|
||||
size_t aral_by_size_used_bytes(void);
|
||||
size_t aral_by_size_padding_bytes(void);
|
||||
|
||||
struct aral_statistics *aral_by_size_statistics(void);
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
#ifdef NETDATA_TRACE_ALLOCATIONS
|
||||
|
||||
|
@ -87,6 +130,10 @@ void aral_destroy_internal(ARAL *ar);
|
|||
|
||||
void aral_unmark_allocation(ARAL *ar, void *ptr);
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
int aral_unittest(size_t elements);
|
||||
|
||||
#endif // NETDATA_TRACE_ALLOCATIONS
|
||||
|
||||
#endif // ARAL_H
|
||||
|
|
|
@ -394,6 +394,12 @@ typedef uint32_t uid_t;
|
|||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
#define CONCAT_INDIRECT(a, b) a##b
|
||||
#define CONCAT(a, b) CONCAT_INDIRECT(a, b)
|
||||
#define CACHE_LINE_PADDING() uint8_t CONCAT(padding, __COUNTER__)[64 - sizeof(size_t)];
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
#if defined(OS_WINDOWS)
|
||||
#include <windows.h>
|
||||
#include <wctype.h>
|
||||
|
|
|
@ -112,8 +112,13 @@ static inline size_t hashtable_destroy_judy(DICTIONARY *dict) {
|
|||
|
||||
pointer_destroy_index(dict);
|
||||
|
||||
JudyAllocThreadPulseReset();
|
||||
|
||||
JError_t J_Error;
|
||||
Word_t ret = JudyHSFreeArray(&dict->index.JudyHSArray, &J_Error);
|
||||
|
||||
__atomic_add_fetch(&dict->stats->memory.index, JudyAllocThreadPulseGetAndReset(), __ATOMIC_RELAXED);
|
||||
|
||||
if(unlikely(ret == (Word_t) JERR)) {
|
||||
netdata_log_error("DICTIONARY: Cannot destroy JudyHS, JU_ERRNO_* == %u, ID == %d",
|
||||
JU_ERRNO(&J_Error), JU_ERRID(&J_Error));
|
||||
|
@ -126,8 +131,13 @@ static inline size_t hashtable_destroy_judy(DICTIONARY *dict) {
|
|||
}
|
||||
|
||||
static inline void *hashtable_insert_judy(DICTIONARY *dict, const char *name, size_t name_len) {
|
||||
JudyAllocThreadPulseReset();
|
||||
|
||||
JError_t J_Error;
|
||||
Pvoid_t *Rc = JudyHSIns(&dict->index.JudyHSArray, (void *)name, name_len, &J_Error);
|
||||
|
||||
__atomic_add_fetch(&dict->stats->memory.index, JudyAllocThreadPulseGetAndReset(), __ATOMIC_RELAXED);
|
||||
|
||||
if (unlikely(Rc == PJERR)) {
|
||||
netdata_log_error("DICTIONARY: Cannot insert entry with name '%s' to JudyHS, JU_ERRNO_* == %u, ID == %d",
|
||||
name, JU_ERRNO(&J_Error), JU_ERRID(&J_Error));
|
||||
|
@ -159,8 +169,13 @@ static inline int hashtable_delete_judy(DICTIONARY *dict, const char *name, size
|
|||
(void)item;
|
||||
if(unlikely(!dict->index.JudyHSArray)) return 0;
|
||||
|
||||
JudyAllocThreadPulseReset();
|
||||
|
||||
JError_t J_Error;
|
||||
int ret = JudyHSDel(&dict->index.JudyHSArray, (void *)name, name_len, &J_Error);
|
||||
|
||||
__atomic_add_fetch(&dict->stats->memory.index, JudyAllocThreadPulseGetAndReset(), __ATOMIC_RELAXED);
|
||||
|
||||
if(unlikely(ret == JERR)) {
|
||||
netdata_log_error("DICTIONARY: Cannot delete entry with name '%s' from JudyHS, JU_ERRNO_* == %u, ID == %d",
|
||||
name,
|
||||
|
|
|
@ -9,10 +9,7 @@
|
|||
// memory statistics
|
||||
|
||||
#ifdef DICT_WITH_STATS
|
||||
static inline void DICTIONARY_STATS_PLUS_MEMORY(DICTIONARY *dict, size_t key_size, size_t item_size, size_t value_size) {
|
||||
if(key_size)
|
||||
__atomic_fetch_add(&dict->stats->memory.index, (long)JUDYHS_INDEX_SIZE_ESTIMATE(key_size), __ATOMIC_RELAXED);
|
||||
|
||||
static inline void DICTIONARY_STATS_PLUS_MEMORY(DICTIONARY *dict, size_t key_size __maybe_unused, size_t item_size, size_t value_size) {
|
||||
if(item_size)
|
||||
__atomic_fetch_add(&dict->stats->memory.dict, (long)item_size, __ATOMIC_RELAXED);
|
||||
|
||||
|
@ -20,10 +17,7 @@ static inline void DICTIONARY_STATS_PLUS_MEMORY(DICTIONARY *dict, size_t key_siz
|
|||
__atomic_fetch_add(&dict->stats->memory.values, (long)value_size, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
static inline void DICTIONARY_STATS_MINUS_MEMORY(DICTIONARY *dict, size_t key_size, size_t item_size, size_t value_size) {
|
||||
if(key_size)
|
||||
__atomic_fetch_sub(&dict->stats->memory.index, (long)JUDYHS_INDEX_SIZE_ESTIMATE(key_size), __ATOMIC_RELAXED);
|
||||
|
||||
static inline void DICTIONARY_STATS_MINUS_MEMORY(DICTIONARY *dict, size_t key_size __maybe_unused, size_t item_size, size_t value_size) {
|
||||
if(item_size)
|
||||
__atomic_fetch_sub(&dict->stats->memory.dict, (long)item_size, __ATOMIC_RELAXED);
|
||||
|
||||
|
|
|
@ -66,48 +66,74 @@ struct dictionary_stats {
|
|||
const char *name; // the name of the category
|
||||
|
||||
struct {
|
||||
CACHE_LINE_PADDING();
|
||||
size_t active; // the number of active dictionaries
|
||||
CACHE_LINE_PADDING();
|
||||
size_t deleted; // the number of dictionaries queued for destruction
|
||||
} dictionaries;
|
||||
|
||||
struct {
|
||||
CACHE_LINE_PADDING();
|
||||
long entries; // active items in the dictionary
|
||||
CACHE_LINE_PADDING();
|
||||
long pending_deletion; // pending deletion items in the dictionary
|
||||
CACHE_LINE_PADDING();
|
||||
long referenced; // referenced items in the dictionary
|
||||
} items;
|
||||
|
||||
struct {
|
||||
CACHE_LINE_PADDING();
|
||||
size_t creations; // dictionary creations
|
||||
CACHE_LINE_PADDING();
|
||||
size_t destructions; // dictionary destructions
|
||||
CACHE_LINE_PADDING();
|
||||
size_t flushes; // dictionary flushes
|
||||
CACHE_LINE_PADDING();
|
||||
size_t traversals; // dictionary foreach
|
||||
CACHE_LINE_PADDING();
|
||||
size_t walkthroughs; // dictionary walkthrough
|
||||
CACHE_LINE_PADDING();
|
||||
size_t garbage_collections; // dictionary garbage collections
|
||||
CACHE_LINE_PADDING();
|
||||
size_t searches; // item searches
|
||||
CACHE_LINE_PADDING();
|
||||
size_t inserts; // item inserts
|
||||
CACHE_LINE_PADDING();
|
||||
size_t resets; // item resets
|
||||
CACHE_LINE_PADDING();
|
||||
size_t deletes; // item deletes
|
||||
} ops;
|
||||
|
||||
struct {
|
||||
CACHE_LINE_PADDING();
|
||||
size_t inserts; // number of times the insert callback is called
|
||||
CACHE_LINE_PADDING();
|
||||
size_t conflicts; // number of times the conflict callback is called
|
||||
CACHE_LINE_PADDING();
|
||||
size_t reacts; // number of times the react callback is called
|
||||
CACHE_LINE_PADDING();
|
||||
size_t deletes; // number of times the delete callback is called
|
||||
} callbacks;
|
||||
|
||||
// memory
|
||||
struct {
|
||||
CACHE_LINE_PADDING();
|
||||
ssize_t index; // bytes of keys indexed (indication of the index size)
|
||||
CACHE_LINE_PADDING();
|
||||
ssize_t values; // bytes of caller structures
|
||||
CACHE_LINE_PADDING();
|
||||
ssize_t dict; // bytes of the structures dictionary needs
|
||||
} memory;
|
||||
|
||||
// spin locks
|
||||
struct {
|
||||
CACHE_LINE_PADDING();
|
||||
size_t use_spins; // number of times a reference to item had to spin to acquire it or ignore it
|
||||
CACHE_LINE_PADDING();
|
||||
size_t search_spins; // number of times a successful search result had to be thrown away
|
||||
CACHE_LINE_PADDING();
|
||||
size_t insert_spins; // number of times an insertion to the hash table had to be repeated
|
||||
CACHE_LINE_PADDING();
|
||||
size_t delete_spins; // number of times a deletion had to spin to get a decision
|
||||
} spin_locks;
|
||||
};
|
||||
|
|
|
@ -34,12 +34,12 @@ __attribute__((constructor)) void aral_judy_init(void) {
|
|||
}
|
||||
}
|
||||
|
||||
size_t judy_aral_overhead(void) {
|
||||
return aral_overhead_from_stats(&judy_sizes_aral_statistics);
|
||||
size_t judy_aral_free_bytes(void) {
|
||||
return aral_free_bytes_from_stats(&judy_sizes_aral_statistics);
|
||||
}
|
||||
|
||||
size_t judy_aral_structures(void) {
|
||||
return aral_structures_from_stats(&judy_sizes_aral_statistics);
|
||||
return aral_structures_bytes_from_stats(&judy_sizes_aral_statistics);
|
||||
}
|
||||
|
||||
static ARAL *judy_size_aral(Word_t Words) {
|
||||
|
@ -51,11 +51,11 @@ static ARAL *judy_size_aral(Word_t Words) {
|
|||
|
||||
static __thread int64_t judy_allocated = 0;
|
||||
|
||||
void JudyAllocThreadTelemetryReset(void) {
|
||||
void JudyAllocThreadPulseReset(void) {
|
||||
judy_allocated = 0;
|
||||
}
|
||||
|
||||
int64_t JudyAllocThreadTelemetryGetAndReset(void) {
|
||||
int64_t JudyAllocThreadPulseGetAndReset(void) {
|
||||
int64_t rc = judy_allocated;
|
||||
judy_allocated = 0;
|
||||
return rc;
|
||||
|
|
|
@ -5,10 +5,10 @@
|
|||
|
||||
#include "../libnetdata.h"
|
||||
|
||||
size_t judy_aral_overhead(void);
|
||||
size_t judy_aral_free_bytes(void);
|
||||
size_t judy_aral_structures(void);
|
||||
|
||||
void JudyAllocThreadTelemetryReset(void);
|
||||
int64_t JudyAllocThreadTelemetryGetAndReset(void);
|
||||
void JudyAllocThreadPulseReset(void);
|
||||
int64_t JudyAllocThreadPulseGetAndReset(void);
|
||||
|
||||
#endif //NETDATA_JUDY_MALLOC_H
|
||||
|
|
|
@ -213,7 +213,7 @@ Leaf |< 3 > | 3 | 2 | 3 | 1 | 2 | 3 | 3
|
|||
typedef int bool_t;
|
||||
#endif
|
||||
|
||||
#define FUNCTION // null; easy to find functions.
|
||||
#define FUNCTION __attribute__((no_sanitize("shift"))) // null; easy to find functions.
|
||||
|
||||
#ifndef TRUE
|
||||
#define TRUE 1
|
||||
|
|
|
@ -311,7 +311,6 @@ static int j__udyStageJBBtoJBB(
|
|||
//
|
||||
// NOTE: Caller must release the Leaf2 that was passed in.
|
||||
|
||||
__attribute__((no_sanitize("shift")))
|
||||
FUNCTION static Pjlb_t j__udyJLL2toJLB1(
|
||||
uint16_t * Pjll, // array of 16-bit indexes.
|
||||
#ifdef JUDYL
|
||||
|
|
|
@ -345,7 +345,6 @@ FUNCTION int j__udyBranchUToBranchB(
|
|||
// allocation and free, in order to allow the caller to continue with a LeafB1
|
||||
// if allocation fails.
|
||||
|
||||
__attribute__((no_sanitize("shift")))
|
||||
FUNCTION int j__udyLeafB1ToLeaf1(
|
||||
Pjp_t Pjp, // points to LeafB1 to shrink.
|
||||
Pvoid_t Pjpm) // for global accounting.
|
||||
|
@ -432,7 +431,6 @@ FUNCTION int j__udyLeafB1ToLeaf1(
|
|||
// TBD: In this and all following functions, the caller should already be able
|
||||
// to compute the Pop1 return value, so why return it?
|
||||
|
||||
__attribute__((no_sanitize("shift")))
|
||||
FUNCTION Word_t j__udyLeaf1ToLeaf2(
|
||||
uint16_t * PLeaf2, // destination uint16_t * Index portion of leaf.
|
||||
#ifdef JUDYL
|
||||
|
|
|
@ -147,7 +147,6 @@ extern Word_t j__udyLLeaf7ToLeafW(Pjlw_t, Pjv_t, Pjp_t, Word_t, Pvoid_t);
|
|||
|
||||
DBGCODE(uint8_t parentJPtype;) // parent branch JP type.
|
||||
|
||||
__attribute__((no_sanitize("shift")))
|
||||
FUNCTION static int j__udyDelWalk(
|
||||
Pjp_t Pjp, // current JP under which to delete.
|
||||
Word_t Index, // to delete.
|
||||
|
|
|
@ -44,8 +44,6 @@
|
|||
// See the manual entry for details. Note support for "shortcut" entries to
|
||||
// trees known to start with a JPM.
|
||||
|
||||
__attribute__((no_sanitize("shift")))
|
||||
|
||||
#ifdef JUDY1
|
||||
|
||||
#ifdef JUDYGETINLINE
|
||||
|
|
|
@ -152,7 +152,6 @@ extern int j__udyLInsertBranch(Pjp_t Pjp, Word_t Index, Word_t Btype, Pjpm_t);
|
|||
// Return -1 for error (details in JPM), 0 for Index already inserted, 1 for
|
||||
// new Index inserted.
|
||||
|
||||
__attribute__((no_sanitize("shift")))
|
||||
FUNCTION static int j__udyInsWalk(
|
||||
Pjp_t Pjp, // current JP to descend.
|
||||
Word_t Index, // to insert.
|
||||
|
|
|
@ -454,7 +454,8 @@ void mallocz_release_as_much_memory_to_the_system(void) {
|
|||
spinlock_lock(&spinlock);
|
||||
|
||||
#ifdef HAVE_C_MALLOPT
|
||||
size_t trim_threshold = aral_optimal_malloc_page_size();
|
||||
// the default is 128KiB
|
||||
size_t trim_threshold = 65ULL * 1024;
|
||||
mallopt(M_TRIM_THRESHOLD, (int)trim_threshold);
|
||||
#endif
|
||||
|
||||
|
|
|
@ -69,9 +69,13 @@ static inline bool SIMPLE_HASHTABLE_COMPARE_KEYS_FUNCTION(SIMPLE_HASHTABLE_KEY_T
|
|||
#endif
|
||||
|
||||
// First layer of macro for token concatenation
|
||||
#define CONCAT_INTERNAL(a, b) a ## b
|
||||
#ifndef CONCAT_INDIRECT
|
||||
#define CONCAT_INDIRECT(a, b) a ## b
|
||||
#endif
|
||||
// Second layer of macro, which ensures proper expansion
|
||||
#define CONCAT(a, b) CONCAT_INTERNAL(a, b)
|
||||
#ifndef CONCAT
|
||||
#define CONCAT(a, b) CONCAT_INDIRECT(a, b)
|
||||
#endif
|
||||
|
||||
// define names for all structures and structures
|
||||
#define simple_hashtable_init_named CONCAT(simple_hashtable_init, SIMPLE_HASHTABLE_NAME)
|
||||
|
|
|
@ -32,7 +32,7 @@ static struct string_partition {
|
|||
size_t deletes; // the number of successful deleted from the index
|
||||
|
||||
long int entries; // the number of entries in the index
|
||||
long int memory; // the memory used, without the JudyHS index
|
||||
long int memory; // the memory used, with JudyHS (accurate)
|
||||
|
||||
#ifdef NETDATA_INTERNAL_CHECKS
|
||||
// internal statistics
|
||||
|
@ -196,10 +196,18 @@ static inline STRING *string_index_insert(const char *str, size_t length) {
|
|||
|
||||
rw_spinlock_write_lock(&string_base[partition].spinlock);
|
||||
|
||||
int64_t mem = 0;
|
||||
|
||||
STRING **ptr;
|
||||
{
|
||||
JError_t J_Error;
|
||||
|
||||
JudyAllocThreadPulseReset();
|
||||
|
||||
Pvoid_t *Rc = JudyHSIns(&string_base[partition].JudyHSArray, (void *)str, length - 1, &J_Error);
|
||||
|
||||
mem = JudyAllocThreadPulseGetAndReset();
|
||||
|
||||
if (unlikely(Rc == PJERR)) {
|
||||
fatal(
|
||||
"STRING: Cannot insert entry with name '%s' to JudyHS, JU_ERRNO_* == %u, ID == %d",
|
||||
|
@ -220,7 +228,7 @@ static inline STRING *string_index_insert(const char *str, size_t length) {
|
|||
*ptr = string;
|
||||
string_base[partition].inserts++;
|
||||
string_base[partition].entries++;
|
||||
string_base[partition].memory += (long)(mem_size + JUDYHS_INDEX_SIZE_ESTIMATE(length));
|
||||
string_base[partition].memory += (long)(mem_size + mem);
|
||||
}
|
||||
else {
|
||||
// the item is already in the index
|
||||
|
@ -256,10 +264,17 @@ static inline void string_index_delete(STRING *string) {
|
|||
#endif
|
||||
|
||||
bool deleted = false;
|
||||
int64_t mem = 0;
|
||||
|
||||
if (likely(string_base[partition].JudyHSArray)) {
|
||||
JError_t J_Error;
|
||||
|
||||
JudyAllocThreadPulseReset();
|
||||
|
||||
int ret = JudyHSDel(&string_base[partition].JudyHSArray, (void *)string->str, string->length - 1, &J_Error);
|
||||
|
||||
mem = JudyAllocThreadPulseGetAndReset();
|
||||
|
||||
if (unlikely(ret == JERR)) {
|
||||
netdata_log_error(
|
||||
"STRING: Cannot delete entry with name '%s' from JudyHS, JU_ERRNO_* == %u, ID == %d",
|
||||
|
@ -276,7 +291,7 @@ static inline void string_index_delete(STRING *string) {
|
|||
size_t mem_size = sizeof(STRING) + string->length;
|
||||
string_base[partition].deletes++;
|
||||
string_base[partition].entries--;
|
||||
string_base[partition].memory -= (long)(mem_size + JUDYHS_INDEX_SIZE_ESTIMATE(string->length));
|
||||
string_base[partition].memory -= (long)(mem_size + mem);
|
||||
freez(string);
|
||||
}
|
||||
|
||||
|
|
|
@ -19,7 +19,7 @@ char to_hex(char code) {
|
|||
|
||||
/* Returns an url-encoded version of str */
|
||||
/* IMPORTANT: be sure to free() the returned string after use */
|
||||
char *url_encode(char *str) {
|
||||
char *url_encode(const char *str) {
|
||||
char *buf, *pbuf;
|
||||
|
||||
pbuf = buf = mallocz(strlen(str) * 3 + 1);
|
||||
|
|
|
@ -17,7 +17,7 @@ char to_hex(char code);
|
|||
|
||||
/* Returns a url-encoded version of str */
|
||||
/* IMPORTANT: be sure to free() the returned string after use */
|
||||
char *url_encode(char *str);
|
||||
char *url_encode(const char *str);
|
||||
|
||||
/* Returns a url-decoded version of str */
|
||||
/* IMPORTANT: be sure to free() the returned string after use */
|
||||
|
|
145
src/ml/ml.cc
145
src/ml/ml.cc
|
@ -6,6 +6,7 @@
|
|||
|
||||
#include "ad_charts.h"
|
||||
#include "database/sqlite/vendored/sqlite3.h"
|
||||
#include "streaming/stream-control.h"
|
||||
|
||||
#define WORKER_TRAIN_QUEUE_POP 0
|
||||
#define WORKER_TRAIN_ACQUIRE_DIMENSION 1
|
||||
|
@ -20,13 +21,6 @@ sqlite3 *ml_db = NULL;
|
|||
static netdata_mutex_t db_mutex = NETDATA_MUTEX_INITIALIZER;
|
||||
|
||||
typedef struct {
|
||||
// Time when the request for this response was made
|
||||
time_t request_time;
|
||||
|
||||
// First/last entry of the dimension in DB when generating the request
|
||||
time_t first_entry_on_request;
|
||||
time_t last_entry_on_request;
|
||||
|
||||
// First/last entry of the dimension in DB when generating the response
|
||||
time_t first_entry_on_response;
|
||||
time_t last_entry_on_response;
|
||||
|
@ -47,14 +41,10 @@ typedef struct {
|
|||
} ml_training_response_t;
|
||||
|
||||
static std::pair<enum ml_worker_result, ml_training_response_t>
|
||||
ml_dimension_calculated_numbers(ml_worker_t *worker, ml_dimension_t *dim, const ml_request_create_new_model_t &req)
|
||||
ml_dimension_calculated_numbers(ml_worker_t *worker, ml_dimension_t *dim)
|
||||
{
|
||||
ml_training_response_t training_response = {};
|
||||
|
||||
training_response.request_time = req.request_time;
|
||||
training_response.first_entry_on_request = req.first_entry_on_request;
|
||||
training_response.last_entry_on_request = req.last_entry_on_request;
|
||||
|
||||
training_response.first_entry_on_response = rrddim_first_entry_s_of_tier(dim->rd, 0);
|
||||
training_response.last_entry_on_response = rrddim_last_entry_s_of_tier(dim->rd, 0);
|
||||
|
||||
|
@ -83,7 +73,7 @@ ml_dimension_calculated_numbers(ml_worker_t *worker, ml_dimension_t *dim, const
|
|||
|
||||
storage_engine_query_init(dim->rd->tiers[0].seb, dim->rd->tiers[0].smh, &handle,
|
||||
training_response.query_after_t, training_response.query_before_t,
|
||||
STORAGE_PRIORITY_BEST_EFFORT);
|
||||
STORAGE_PRIORITY_SYNCHRONOUS);
|
||||
|
||||
size_t idx = 0;
|
||||
memset(worker->training_cns, 0, sizeof(calculated_number_t) * max_n * (Cfg.lag_n + 1));
|
||||
|
@ -637,10 +627,18 @@ static void ml_dimension_update_models(ml_worker_t *worker, ml_dimension_t *dim)
|
|||
}
|
||||
|
||||
static enum ml_worker_result
|
||||
ml_dimension_train_model(ml_worker_t *worker, ml_dimension_t *dim, const ml_request_create_new_model_t &req)
|
||||
ml_dimension_train_model(ml_worker_t *worker, ml_dimension_t *dim)
|
||||
{
|
||||
worker_is_busy(WORKER_TRAIN_QUERY);
|
||||
auto P = ml_dimension_calculated_numbers(worker, dim, req);
|
||||
|
||||
spinlock_lock(&dim->slock);
|
||||
if (dim->mt == METRIC_TYPE_CONSTANT) {
|
||||
spinlock_unlock(&dim->slock);
|
||||
return ML_WORKER_RESULT_OK;
|
||||
}
|
||||
spinlock_unlock(&dim->slock);
|
||||
|
||||
auto P = ml_dimension_calculated_numbers(worker, dim);
|
||||
ml_worker_result worker_result = P.first;
|
||||
ml_training_response_t training_response = P.second;
|
||||
|
||||
|
@ -648,21 +646,8 @@ ml_dimension_train_model(ml_worker_t *worker, ml_dimension_t *dim, const ml_requ
|
|||
spinlock_lock(&dim->slock);
|
||||
|
||||
dim->mt = METRIC_TYPE_CONSTANT;
|
||||
|
||||
switch (dim->ts) {
|
||||
case TRAINING_STATUS_PENDING_WITH_MODEL:
|
||||
dim->ts = TRAINING_STATUS_TRAINED;
|
||||
break;
|
||||
case TRAINING_STATUS_PENDING_WITHOUT_MODEL:
|
||||
dim->ts = TRAINING_STATUS_UNTRAINED;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
dim->suppression_anomaly_counter = 0;
|
||||
dim->suppression_window_counter = 0;
|
||||
|
||||
dim->last_training_time = training_response.last_entry_on_response;
|
||||
|
||||
spinlock_unlock(&dim->slock);
|
||||
|
@ -694,59 +679,8 @@ ml_dimension_train_model(ml_worker_t *worker, ml_dimension_t *dim, const ml_requ
|
|||
return worker_result;
|
||||
}
|
||||
|
||||
static void
|
||||
ml_dimension_schedule_for_training(ml_dimension_t *dim, time_t curr_time)
|
||||
{
|
||||
switch (dim->mt) {
|
||||
case METRIC_TYPE_CONSTANT:
|
||||
return;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
bool schedule_for_training = false;
|
||||
|
||||
switch (dim->ts) {
|
||||
case TRAINING_STATUS_PENDING_WITH_MODEL:
|
||||
case TRAINING_STATUS_PENDING_WITHOUT_MODEL:
|
||||
schedule_for_training = false;
|
||||
break;
|
||||
case TRAINING_STATUS_UNTRAINED:
|
||||
schedule_for_training = true;
|
||||
dim->ts = TRAINING_STATUS_PENDING_WITHOUT_MODEL;
|
||||
break;
|
||||
case TRAINING_STATUS_SILENCED:
|
||||
case TRAINING_STATUS_TRAINED:
|
||||
if ((dim->last_training_time + (Cfg.train_every * dim->rd->rrdset->update_every)) < curr_time) {
|
||||
schedule_for_training = true;
|
||||
dim->ts = TRAINING_STATUS_PENDING_WITH_MODEL;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (schedule_for_training) {
|
||||
ml_request_create_new_model_t req;
|
||||
|
||||
req.DLI = DimensionLookupInfo(
|
||||
&dim->rd->rrdset->rrdhost->machine_guid[0],
|
||||
dim->rd->rrdset->id,
|
||||
dim->rd->id
|
||||
);
|
||||
req.request_time = curr_time;
|
||||
req.first_entry_on_request = rrddim_first_entry_s(dim->rd);
|
||||
req.last_entry_on_request = rrddim_last_entry_s(dim->rd);
|
||||
|
||||
ml_host_t *host = (ml_host_t *) dim->rd->rrdset->rrdhost->ml_host;
|
||||
|
||||
ml_queue_item_t item;
|
||||
item.type = ML_QUEUE_ITEM_TYPE_CREATE_NEW_MODEL;
|
||||
item.create_new_model = req;
|
||||
ml_queue_push(host->queue, item);
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t value, bool exists)
|
||||
ml_dimension_predict(ml_dimension_t *dim, calculated_number_t value, bool exists)
|
||||
{
|
||||
// Nothing to do if ML is disabled for this dimension
|
||||
if (dim->mls != MACHINE_LEARNING_STATUS_ENABLED)
|
||||
|
@ -791,7 +725,7 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t
|
|||
ml_features_preprocess(&features);
|
||||
|
||||
/*
|
||||
* Lock to predict and possibly schedule the dimension for training
|
||||
* Lock to predict
|
||||
*/
|
||||
if (spinlock_trylock(&dim->slock) == 0)
|
||||
return false;
|
||||
|
@ -800,19 +734,10 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t
|
|||
if (!same_value)
|
||||
dim->mt = METRIC_TYPE_VARIABLE;
|
||||
|
||||
// Decide if the dimension needs to be scheduled for training
|
||||
ml_dimension_schedule_for_training(dim, curr_time);
|
||||
|
||||
// Nothing to do if we don't have a model
|
||||
switch (dim->ts) {
|
||||
case TRAINING_STATUS_UNTRAINED:
|
||||
case TRAINING_STATUS_PENDING_WITHOUT_MODEL: {
|
||||
case TRAINING_STATUS_SILENCED:
|
||||
spinlock_unlock(&dim->slock);
|
||||
return false;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
// Ignore silenced dimensions
|
||||
if (dim->ts == TRAINING_STATUS_SILENCED) {
|
||||
spinlock_unlock(&dim->slock);
|
||||
return false;
|
||||
}
|
||||
|
||||
dim->suppression_window_counter++;
|
||||
|
@ -888,18 +813,9 @@ ml_chart_update_dimension(ml_chart_t *chart, ml_dimension_t *dim, bool is_anomal
|
|||
case TRAINING_STATUS_UNTRAINED:
|
||||
chart->mls.num_training_status_untrained++;
|
||||
return;
|
||||
case TRAINING_STATUS_PENDING_WITHOUT_MODEL:
|
||||
chart->mls.num_training_status_pending_without_model++;
|
||||
return;
|
||||
case TRAINING_STATUS_TRAINED:
|
||||
chart->mls.num_training_status_trained++;
|
||||
|
||||
chart->mls.num_anomalous_dimensions += is_anomalous;
|
||||
chart->mls.num_normal_dimensions += !is_anomalous;
|
||||
return;
|
||||
case TRAINING_STATUS_PENDING_WITH_MODEL:
|
||||
chart->mls.num_training_status_pending_with_model++;
|
||||
|
||||
chart->mls.num_anomalous_dimensions += is_anomalous;
|
||||
chart->mls.num_normal_dimensions += !is_anomalous;
|
||||
return;
|
||||
|
@ -997,6 +913,12 @@ ml_host_detect_once(ml_host_t *host)
|
|||
mls_copy = host->mls;
|
||||
|
||||
netdata_mutex_unlock(&host->mutex);
|
||||
|
||||
worker_is_busy(WORKER_JOB_DETECTION_DIM_CHART);
|
||||
ml_update_dimensions_chart(host, mls_copy);
|
||||
|
||||
worker_is_busy(WORKER_JOB_DETECTION_HOST_CHART);
|
||||
ml_update_host_and_detection_rate_charts(host, host->host_anomaly_rate * 10000.0);
|
||||
} else {
|
||||
host->host_anomaly_rate = 0.0;
|
||||
|
||||
|
@ -1009,12 +931,6 @@ ml_host_detect_once(ml_host_t *host)
|
|||
};
|
||||
}
|
||||
}
|
||||
|
||||
worker_is_busy(WORKER_JOB_DETECTION_DIM_CHART);
|
||||
ml_update_dimensions_chart(host, mls_copy);
|
||||
|
||||
worker_is_busy(WORKER_JOB_DETECTION_HOST_CHART);
|
||||
ml_update_host_and_detection_rate_charts(host, host->host_anomaly_rate * 10000.0);
|
||||
}
|
||||
|
||||
void *
|
||||
|
@ -1129,7 +1045,7 @@ static enum ml_worker_result ml_worker_create_new_model(ml_worker_t *worker, ml_
|
|||
}
|
||||
|
||||
ml_dimension_t *Dim = reinterpret_cast<ml_dimension_t *>(AcqDim.dimension());
|
||||
return ml_dimension_train_model(worker, Dim, req);
|
||||
return ml_dimension_train_model(worker, Dim);
|
||||
}
|
||||
|
||||
static enum ml_worker_result ml_worker_add_existing_model(ml_worker_t *worker, ml_request_add_existing_model_t req) {
|
||||
|
@ -1173,6 +1089,12 @@ void *ml_train_main(void *arg) {
|
|||
worker_register_job_name(WORKER_TRAIN_FLUSH_MODELS, "flush models");
|
||||
|
||||
while (!Cfg.training_stop) {
|
||||
if(!stream_control_ml_should_be_running()) {
|
||||
worker_is_idle();
|
||||
stream_control_throttle();
|
||||
continue;
|
||||
}
|
||||
|
||||
worker_is_busy(WORKER_TRAIN_QUEUE_POP);
|
||||
|
||||
ml_queue_stats_t loop_stats{};
|
||||
|
@ -1195,6 +1117,9 @@ void *ml_train_main(void *arg) {
|
|||
switch (item.type) {
|
||||
case ML_QUEUE_ITEM_TYPE_CREATE_NEW_MODEL: {
|
||||
worker_res = ml_worker_create_new_model(worker, item.create_new_model);
|
||||
if (worker_res != ML_WORKER_RESULT_NULL_ACQUIRED_DIMENSION) {
|
||||
ml_queue_push(worker->queue, item);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ML_QUEUE_ITEM_TYPE_ADD_EXISTING_MODEL: {
|
||||
|
|
|
@ -46,7 +46,7 @@ void ml_config_load(ml_config_t *cfg) {
|
|||
time_t anomaly_detection_query_duration = config_get_duration_seconds(config_section_ml, "anomaly detection grouping duration", 5 * 60);
|
||||
|
||||
size_t num_worker_threads = config_get_number(config_section_ml, "num training threads", os_get_system_cpus() / 4);
|
||||
size_t flush_models_batch_size = config_get_number(config_section_ml, "flush models batch size", 128);
|
||||
size_t flush_models_batch_size = config_get_number(config_section_ml, "flush models batch size", 256);
|
||||
|
||||
size_t suppression_window =
|
||||
config_get_duration_seconds(config_section_ml, "dimension anomaly rate suppression window", 900);
|
||||
|
|
|
@ -29,7 +29,7 @@ struct ml_dimension_t {
|
|||
};
|
||||
|
||||
bool
|
||||
ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t value, bool exists);
|
||||
ml_dimension_predict(ml_dimension_t *dim, calculated_number_t value, bool exists);
|
||||
|
||||
bool ml_dimension_deserialize_kmeans(const char *json_str);
|
||||
|
||||
|
|
|
@ -32,10 +32,6 @@ const char *
|
|||
ml_training_status_to_string(enum ml_training_status ts)
|
||||
{
|
||||
switch (ts) {
|
||||
case TRAINING_STATUS_PENDING_WITH_MODEL:
|
||||
return "pending-with-model";
|
||||
case TRAINING_STATUS_PENDING_WITHOUT_MODEL:
|
||||
return "pending-without-model";
|
||||
case TRAINING_STATUS_TRAINED:
|
||||
return "trained";
|
||||
case TRAINING_STATUS_UNTRAINED:
|
||||
|
|
|
@ -27,12 +27,6 @@ enum ml_training_status {
|
|||
// We don't have a model for this dimension
|
||||
TRAINING_STATUS_UNTRAINED,
|
||||
|
||||
// Request for training sent, but we don't have any models yet
|
||||
TRAINING_STATUS_PENDING_WITHOUT_MODEL,
|
||||
|
||||
// Request to update existing models sent
|
||||
TRAINING_STATUS_PENDING_WITH_MODEL,
|
||||
|
||||
// Have a valid, up-to-date model
|
||||
TRAINING_STATUS_TRAINED,
|
||||
|
||||
|
|
|
@ -48,7 +48,7 @@ void ml_host_new(RRDHOST *rh)
|
|||
netdata_mutex_init(&host->mutex);
|
||||
spinlock_init(&host->type_anomaly_rate_spinlock);
|
||||
|
||||
host->ml_running = true;
|
||||
host->ml_running = false;
|
||||
rh->ml_host = (rrd_ml_host_t *) host;
|
||||
}
|
||||
|
||||
|
@ -104,13 +104,12 @@ void ml_host_stop(RRDHOST *rh) {
|
|||
|
||||
spinlock_lock(&dim->slock);
|
||||
|
||||
// reset dim
|
||||
// TODO: should we drop in-mem models, or mark them as stale? Is it
|
||||
// okay to resume training straight away?
|
||||
|
||||
dim->mt = METRIC_TYPE_CONSTANT;
|
||||
dim->ts = TRAINING_STATUS_UNTRAINED;
|
||||
|
||||
// TODO: Check if we can remove this field.
|
||||
dim->last_training_time = 0;
|
||||
|
||||
dim->suppression_anomaly_counter = 0;
|
||||
dim->suppression_window_counter = 0;
|
||||
dim->cns.clear();
|
||||
|
@ -290,6 +289,25 @@ void ml_dimension_new(RRDDIM *rd)
|
|||
rd->ml_dimension = (rrd_ml_dimension_t *) dim;
|
||||
|
||||
metaqueue_ml_load_models(rd);
|
||||
|
||||
// add to worker queue
|
||||
{
|
||||
RRDHOST *rh = rd->rrdset->rrdhost;
|
||||
ml_host_t *host = (ml_host_t *) rh->ml_host;
|
||||
|
||||
ml_queue_item_t item;
|
||||
item.type = ML_QUEUE_ITEM_TYPE_CREATE_NEW_MODEL;
|
||||
|
||||
ml_request_create_new_model_t req;
|
||||
req.DLI = DimensionLookupInfo(
|
||||
&rh->machine_guid[0],
|
||||
rd->rrdset->id,
|
||||
rd->id
|
||||
);
|
||||
item.create_new_model = req;
|
||||
|
||||
ml_queue_push(host->queue, item);
|
||||
}
|
||||
}
|
||||
|
||||
void ml_dimension_delete(RRDDIM *rd)
|
||||
|
@ -318,6 +336,8 @@ void ml_dimension_received_anomaly(RRDDIM *rd, bool is_anomalous) {
|
|||
|
||||
bool ml_dimension_is_anomalous(RRDDIM *rd, time_t curr_time, double value, bool exists)
|
||||
{
|
||||
UNUSED(curr_time);
|
||||
|
||||
ml_dimension_t *dim = (ml_dimension_t *) rd->ml_dimension;
|
||||
if (!dim)
|
||||
return false;
|
||||
|
@ -328,7 +348,7 @@ bool ml_dimension_is_anomalous(RRDDIM *rd, time_t curr_time, double value, bool
|
|||
|
||||
ml_chart_t *chart = (ml_chart_t *) rd->rrdset->ml_chart;
|
||||
|
||||
bool is_anomalous = ml_dimension_predict(dim, curr_time, value, exists);
|
||||
bool is_anomalous = ml_dimension_predict(dim, value, exists);
|
||||
ml_chart_update_dimension(chart, dim, is_anomalous);
|
||||
|
||||
return is_anomalous;
|
||||
|
|
|
@ -10,14 +10,6 @@
|
|||
|
||||
typedef struct ml_request_create_new_model {
|
||||
DimensionLookupInfo DLI;
|
||||
|
||||
// Creation time of request
|
||||
time_t request_time;
|
||||
|
||||
// First/last entry of this dimension in DB
|
||||
// at the point the request was made
|
||||
time_t first_entry_on_request;
|
||||
time_t last_entry_on_request;
|
||||
} ml_request_create_new_model_t;
|
||||
|
||||
typedef struct ml_request_add_existing_model {
|
||||
|
|
|
@ -202,6 +202,8 @@ static inline PARSER_RC pluginsd_host_define_end(char **words __maybe_unused, si
|
|||
false);
|
||||
|
||||
rrdhost_option_set(host, RRDHOST_OPTION_VIRTUAL_HOST);
|
||||
rrdhost_flag_set(host, RRDHOST_FLAG_COLLECTOR_ONLINE);
|
||||
ml_host_start(host);
|
||||
dyncfg_host_init(host);
|
||||
|
||||
if(host->rrdlabels) {
|
||||
|
|
|
@ -30,7 +30,7 @@ stream_send_rrdset_metrics_v1_internal(BUFFER *wb, RRDSET *st, struct sender_sta
|
|||
buffer_fast_strcat(wb, "\n", 1);
|
||||
}
|
||||
else {
|
||||
internal_error(true, "STREAM: 'host:%s/chart:%s/dim:%s' flag 'exposed' is updated but not exposed",
|
||||
internal_error(true, "STREAM SEND '%s': 'chart:%s/dim:%s' flag 'exposed' is updated but not exposed",
|
||||
rrdhost_hostname(st->rrdhost), rrdset_id(st), rrddim_id(rd));
|
||||
// we will include it in the next iteration
|
||||
rrddim_metadata_updated(rd);
|
||||
|
|
|
@ -51,7 +51,7 @@ void stream_sender_get_node_and_claim_id_from_parent(struct sender_state *s) {
|
|||
ND_UUID claim_id;
|
||||
if (uuid_parse(claim_id_str ? claim_id_str : "", claim_id.uuid) != 0) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM %s [send to %s] received invalid claim id '%s'",
|
||||
"STREAM SEND '%s' [to %s] received invalid claim id '%s'",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
claim_id_str ? claim_id_str : "(unset)");
|
||||
return;
|
||||
|
@ -60,7 +60,7 @@ void stream_sender_get_node_and_claim_id_from_parent(struct sender_state *s) {
|
|||
ND_UUID node_id;
|
||||
if(uuid_parse(node_id_str ? node_id_str : "", node_id.uuid) != 0) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM %s [send to %s] received an invalid node id '%s'",
|
||||
"STREAM SEND '%s' [to %s] received an invalid node id '%s'",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
node_id_str ? node_id_str : "(unset)");
|
||||
return;
|
||||
|
@ -68,14 +68,14 @@ void stream_sender_get_node_and_claim_id_from_parent(struct sender_state *s) {
|
|||
|
||||
if (!UUIDiszero(s->host->aclk.claim_id_of_parent) && !UUIDeq(s->host->aclk.claim_id_of_parent, claim_id))
|
||||
nd_log(NDLS_DAEMON, NDLP_INFO,
|
||||
"STREAM %s [send to %s] changed parent's claim id to %s",
|
||||
"STREAM SEND '%s' [to %s] changed parent's claim id to %s",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
claim_id_str ? claim_id_str : "(unset)");
|
||||
|
||||
if(!UUIDiszero(s->host->node_id) && !UUIDeq(s->host->node_id, node_id)) {
|
||||
if(claimed) {
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM %s [send to %s] parent reports different node id '%s', but we are claimed. Ignoring it.",
|
||||
"STREAM SEND '%s' [to %s] parent reports different node id '%s', but we are claimed. Ignoring it.",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
node_id_str ? node_id_str : "(unset)");
|
||||
return;
|
||||
|
@ -83,7 +83,7 @@ void stream_sender_get_node_and_claim_id_from_parent(struct sender_state *s) {
|
|||
else {
|
||||
update_node_id = true;
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM %s [send to %s] changed node id to %s",
|
||||
"STREAM SEND '%s' [to %s] changed node id to %s",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
node_id_str ? node_id_str : "(unset)");
|
||||
}
|
||||
|
@ -91,7 +91,7 @@ void stream_sender_get_node_and_claim_id_from_parent(struct sender_state *s) {
|
|||
|
||||
if(!url || !*url) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM %s [send to %s] received an invalid cloud URL '%s'",
|
||||
"STREAM SEND '%s' [to %s] received an invalid cloud URL '%s'",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
url ? url : "(unset)");
|
||||
return;
|
||||
|
|
|
@ -19,13 +19,19 @@ RRDSET_STREAM_BUFFER stream_send_metrics_init(RRDSET *st, time_t wall_clock_time
|
|||
// check if we are not connected
|
||||
if(unlikely(!(host_flags & RRDHOST_FLAG_STREAM_SENDER_READY_4_METRICS))) {
|
||||
|
||||
if(unlikely(!(host_flags & (RRDHOST_FLAG_STREAM_SENDER_ADDED | RRDHOST_FLAG_STREAM_RECEIVER_DISCONNECTED))))
|
||||
if(unlikely((host_flags & RRDHOST_FLAG_COLLECTOR_ONLINE) &&
|
||||
!(host_flags & RRDHOST_FLAG_STREAM_SENDER_ADDED)))
|
||||
stream_sender_start_host(host);
|
||||
|
||||
if(unlikely(!(host_flags & RRDHOST_FLAG_STREAM_SENDER_LOGGED_STATUS))) {
|
||||
rrdhost_flag_set(host, RRDHOST_FLAG_STREAM_SENDER_LOGGED_STATUS);
|
||||
|
||||
// this message is logged in 2 cases:
|
||||
// - the parent is connected, but not yet available for streaming data
|
||||
// - the parent just disconnected, so local data are not streamed to parent
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_INFO,
|
||||
"STREAM SEND %s: connected but streaming is not ready yet...",
|
||||
"STREAM SEND '%s': streaming is not ready, not sending data to a parent...",
|
||||
rrdhost_hostname(host));
|
||||
}
|
||||
|
||||
|
@ -33,7 +39,7 @@ RRDSET_STREAM_BUFFER stream_send_metrics_init(RRDSET *st, time_t wall_clock_time
|
|||
}
|
||||
else if(unlikely(host_flags & RRDHOST_FLAG_STREAM_SENDER_LOGGED_STATUS)) {
|
||||
nd_log(NDLS_DAEMON, NDLP_INFO,
|
||||
"STREAM SEND %s: streaming is ready, sending metrics to parent...",
|
||||
"STREAM SEND '%s': streaming is ready, sending metrics to parent...",
|
||||
rrdhost_hostname(host));
|
||||
rrdhost_flag_clear(host, RRDHOST_FLAG_STREAM_SENDER_LOGGED_STATUS);
|
||||
}
|
||||
|
|
|
@ -50,7 +50,7 @@ struct replication_query_statistics replication_get_query_statistics(void) {
|
|||
return ret;
|
||||
}
|
||||
|
||||
size_t replication_buffers_allocated = 0;
|
||||
static size_t replication_buffers_allocated = 0;
|
||||
|
||||
size_t replication_allocated_buffers(void) {
|
||||
return __atomic_load_n(&replication_buffers_allocated, __ATOMIC_RELAXED);
|
||||
|
@ -155,7 +155,7 @@ static struct replication_query *replication_query_prepare(
|
|||
if (st->last_updated.tv_sec > q->query.before) {
|
||||
#ifdef NETDATA_LOG_REPLICATION_REQUESTS
|
||||
internal_error(true,
|
||||
"STREAM_SENDER REPLAY: 'host:%s/chart:%s' "
|
||||
"STREAM SEND REPLAY: 'host:%s/chart:%s' "
|
||||
"has start_streaming = true, "
|
||||
"adjusting replication before timestamp from %llu to %llu",
|
||||
rrdhost_hostname(st->rrdhost), rrdset_id(st),
|
||||
|
@ -178,7 +178,7 @@ static struct replication_query *replication_query_prepare(
|
|||
|
||||
if (unlikely(rd_dfe.counter >= q->dimensions)) {
|
||||
internal_error(true,
|
||||
"STREAM_SENDER REPLAY ERROR: 'host:%s/chart:%s' has more dimensions than the replicated ones",
|
||||
"STREAM SEND REPLAY ERROR: 'host:%s/chart:%s' has more dimensions than the replicated ones",
|
||||
rrdhost_hostname(st->rrdhost), rrdset_id(st));
|
||||
break;
|
||||
}
|
||||
|
@ -192,6 +192,7 @@ static struct replication_query *replication_query_prepare(
|
|||
STORAGE_PRIORITY priority = q->query.locked_data_collection ? STORAGE_PRIORITY_HIGH : STORAGE_PRIORITY_LOW;
|
||||
if(synchronous) priority = STORAGE_PRIORITY_SYNCHRONOUS;
|
||||
|
||||
stream_control_replication_query_started();
|
||||
storage_engine_query_init(q->backend, rd->tiers[0].smh, &d->handle,
|
||||
q->query.after, q->query.before, priority);
|
||||
d->enabled = true;
|
||||
|
@ -276,6 +277,7 @@ static void replication_query_finalize(BUFFER *wb, struct replication_query *q,
|
|||
if (unlikely(!d->enabled)) continue;
|
||||
|
||||
storage_engine_query_finalize(&d->handle);
|
||||
stream_control_replication_query_finished();
|
||||
|
||||
dictionary_acquired_item_release(d->dict, d->rda);
|
||||
|
||||
|
@ -362,7 +364,7 @@ static bool replication_query_execute(BUFFER *wb, struct replication_query *q, s
|
|||
|
||||
nd_log_limit_static_global_var(erl, 1, 0);
|
||||
nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM_SENDER REPLAY ERROR: 'host:%s/chart:%s/dim:%s': db does not advance the query "
|
||||
"STREAM SEND REPLAY: 'host:%s/chart:%s/dim:%s': db does not advance the query "
|
||||
"beyond time %llu (tried 1000 times to get the next point and always got back a point in the past)",
|
||||
rrdhost_hostname(q->st->rrdhost), rrdset_id(q->st), rrddim_id(d->rd),
|
||||
(unsigned long long) now);
|
||||
|
@ -412,8 +414,7 @@ static bool replication_query_execute(BUFFER *wb, struct replication_query *q, s
|
|||
#ifdef NETDATA_INTERNAL_CHECKS
|
||||
nd_log_limit_static_global_var(erl, 1, 0);
|
||||
nd_log_limit(&erl, NDLS_DAEMON, NDLP_WARNING,
|
||||
"REPLAY WARNING: 'host:%s/chart:%s' "
|
||||
"misaligned dimensions, "
|
||||
"STREAM SEND REPLAY WARNING: 'host:%s/chart:%s' misaligned dimensions, "
|
||||
"update every (min: %ld, max: %ld), "
|
||||
"start time (min: %ld, max: %ld), "
|
||||
"end time (min %ld, max %ld), "
|
||||
|
@ -448,9 +449,10 @@ static bool replication_query_execute(BUFFER *wb, struct replication_query *q, s
|
|||
q->query.before = last_end_time_in_buffer;
|
||||
q->query.enable_streaming = false;
|
||||
|
||||
internal_error(true, "REPLICATION: current buffer size %zu is more than the "
|
||||
"max message size %zu for chart '%s' of host '%s'. "
|
||||
"Interrupting replication request (%ld to %ld, %s) at %ld to %ld, %s.",
|
||||
internal_error(true,
|
||||
"STREAM SEND REPLAY: current buffer size %zu is more than the "
|
||||
"max message size %zu for chart '%s' of host '%s'. "
|
||||
"Interrupting replication request (%ld to %ld, %s) at %ld to %ld, %s.",
|
||||
buffer_strlen(wb), max_msg_size, rrdset_id(q->st), rrdhost_hostname(q->st->rrdhost),
|
||||
q->request.after, q->request.before, q->request.enable_streaming?"true":"false",
|
||||
q->query.after, q->query.before, q->query.enable_streaming?"true":"false");
|
||||
|
@ -528,14 +530,14 @@ static bool replication_query_execute(BUFFER *wb, struct replication_query *q, s
|
|||
log_date(actual_after_buf, LOG_DATE_LENGTH, actual_after);
|
||||
log_date(actual_before_buf, LOG_DATE_LENGTH, actual_before);
|
||||
internal_error(true,
|
||||
"STREAM_SENDER REPLAY: 'host:%s/chart:%s': sending data %llu [%s] to %llu [%s] (requested %llu [delta %lld] to %llu [delta %lld])",
|
||||
"STREAM SEND REPLAY: 'host:%s/chart:%s': sending data %llu [%s] to %llu [%s] (requested %llu [delta %lld] to %llu [delta %lld])",
|
||||
rrdhost_hostname(q->st->rrdhost), rrdset_id(q->st),
|
||||
(unsigned long long)actual_after, actual_after_buf, (unsigned long long)actual_before, actual_before_buf,
|
||||
(unsigned long long)after, (long long)(actual_after - after), (unsigned long long)before, (long long)(actual_before - before));
|
||||
}
|
||||
else
|
||||
internal_error(true,
|
||||
"STREAM_SENDER REPLAY: 'host:%s/chart:%s': nothing to send (requested %llu to %llu)",
|
||||
"STREAM SEND REPLAY: 'host:%s/chart:%s': nothing to send (requested %llu to %llu)",
|
||||
rrdhost_hostname(q->st->rrdhost), rrdset_id(q->st),
|
||||
(unsigned long long)after, (unsigned long long)before);
|
||||
#endif // NETDATA_LOG_REPLICATION_REQUESTS
|
||||
|
@ -706,12 +708,14 @@ bool replication_response_execute_and_finalize(struct replication_query *q, size
|
|||
st->stream.snd.resync_time_s = 0;
|
||||
|
||||
#ifdef NETDATA_LOG_REPLICATION_REQUESTS
|
||||
internal_error(true, "STREAM_SENDER REPLAY: 'host:%s/chart:%s' streaming starts",
|
||||
internal_error(true, "STREAM SEND REPLAY: 'host:%s/chart:%s' streaming starts",
|
||||
rrdhost_hostname(st->rrdhost), rrdset_id(st));
|
||||
#endif
|
||||
}
|
||||
else
|
||||
internal_error(true, "REPLAY ERROR: 'host:%s/chart:%s' received start streaming command, but the chart is not in progress replicating",
|
||||
internal_error(true,
|
||||
"STREAM SEND REPLAY ERROR: 'host:%s/chart:%s' "
|
||||
"received start streaming command, but the chart is not in progress replicating",
|
||||
rrdhost_hostname(st->rrdhost), rrdset_id(st));
|
||||
}
|
||||
}
|
||||
|
@ -771,7 +775,7 @@ static void replicate_log_request(struct replication_request_details *r, const c
|
|||
nd_log_limit_static_global_var(erl, 1, 0);
|
||||
nd_log_limit(&erl, NDLS_DAEMON, NDLP_NOTICE,
|
||||
#endif
|
||||
"REPLAY ERROR: 'host:%s/chart:%s' child sent: "
|
||||
"STREAM SEND REPLAY ERROR: 'host:%s/chart:%s' child sent: "
|
||||
"db from %ld to %ld%s, wall clock time %ld, "
|
||||
"last request from %ld to %ld, "
|
||||
"issue: %s - "
|
||||
|
@ -809,7 +813,7 @@ static bool send_replay_chart_cmd(struct replication_request_details *r, const c
|
|||
log_date(wanted_before_buf, LOG_DATE_LENGTH, r->wanted.before);
|
||||
|
||||
internal_error(true,
|
||||
"REPLAY: 'host:%s/chart:%s' sending replication request %ld [%s] to %ld [%s], start streaming '%s': %s: "
|
||||
"STREAM SEND REPLAY: 'host:%s/chart:%s' sending replication request %ld [%s] to %ld [%s], start streaming '%s': %s: "
|
||||
"last[%ld - %ld] child[%ld - %ld, now %ld %s] local[%ld - %ld, now %ld] gap[%ld - %ld %s] %s"
|
||||
, rrdhost_hostname(r->host), rrdset_id(r->st)
|
||||
, r->wanted.after, wanted_after_buf
|
||||
|
@ -838,7 +842,7 @@ static bool send_replay_chart_cmd(struct replication_request_details *r, const c
|
|||
|
||||
ssize_t ret = r->caller.callback(buffer, r->caller.parser, STREAM_TRAFFIC_TYPE_REPLICATION);
|
||||
if (ret < 0) {
|
||||
netdata_log_error("REPLAY ERROR: 'host:%s/chart:%s' failed to send replication request to child (error %zd)",
|
||||
netdata_log_error("STREAM SEND REPLAY ERROR: 'host:%s/chart:%s' failed to send replication request to child (error %zd)",
|
||||
rrdhost_hostname(r->host), rrdset_id(r->st), ret);
|
||||
return false;
|
||||
}
|
||||
|
@ -1277,7 +1281,7 @@ static void replication_sort_entry_del(struct replication_request *rq, bool buff
|
|||
}
|
||||
|
||||
if (!rse_to_delete)
|
||||
fatal("REPLAY: 'host:%s/chart:%s' Cannot find sort entry to delete for time %ld.",
|
||||
fatal("STREAM SEND REPLAY: 'host:%s/chart:%s' Cannot find sort entry to delete for time %ld.",
|
||||
rrdhost_hostname(rq->sender->host), string2str(rq->chart_id), rq->after);
|
||||
|
||||
}
|
||||
|
@ -1380,7 +1384,7 @@ static bool replication_request_conflict_callback(const DICTIONARY_ITEM *item __
|
|||
// we can replace this command
|
||||
internal_error(
|
||||
true,
|
||||
"STREAM %s [send to %s]: REPLAY: 'host:%s/chart:%s' replacing duplicate replication command received (existing from %llu to %llu [%s], new from %llu to %llu [%s])",
|
||||
"STREAM SEND '%s' [to %s]: REPLAY: 'host:%s/chart:%s' replacing duplicate replication command received (existing from %llu to %llu [%s], new from %llu to %llu [%s])",
|
||||
rrdhost_hostname(s->host), s->connected_to, rrdhost_hostname(s->host), dictionary_acquired_item_name(item),
|
||||
(unsigned long long)rq->after, (unsigned long long)rq->before, rq->start_streaming ? "true" : "false",
|
||||
(unsigned long long)rq_new->after, (unsigned long long)rq_new->before, rq_new->start_streaming ? "true" : "false");
|
||||
|
@ -1393,7 +1397,7 @@ static bool replication_request_conflict_callback(const DICTIONARY_ITEM *item __
|
|||
replication_sort_entry_add(rq);
|
||||
internal_error(
|
||||
true,
|
||||
"STREAM %s [send to %s]: REPLAY: 'host:%s/chart:%s' adding duplicate replication command received (existing from %llu to %llu [%s], new from %llu to %llu [%s])",
|
||||
"STREAM SEND '%s' [to %s]: REPLAY: 'host:%s/chart:%s' adding duplicate replication command received (existing from %llu to %llu [%s], new from %llu to %llu [%s])",
|
||||
rrdhost_hostname(s->host), s->connected_to, rrdhost_hostname(s->host), dictionary_acquired_item_name(item),
|
||||
(unsigned long long)rq->after, (unsigned long long)rq->before, rq->start_streaming ? "true" : "false",
|
||||
(unsigned long long)rq_new->after, (unsigned long long)rq_new->before, rq_new->start_streaming ? "true" : "false");
|
||||
|
@ -1401,7 +1405,7 @@ static bool replication_request_conflict_callback(const DICTIONARY_ITEM *item __
|
|||
else {
|
||||
internal_error(
|
||||
true,
|
||||
"STREAM %s [send to %s]: REPLAY: 'host:%s/chart:%s' ignoring duplicate replication command received (existing from %llu to %llu [%s], new from %llu to %llu [%s])",
|
||||
"STREAM SEND '%s' [to %s]: REPLAY: 'host:%s/chart:%s' ignoring duplicate replication command received (existing from %llu to %llu [%s], new from %llu to %llu [%s])",
|
||||
rrdhost_hostname(s->host), s->connected_to, rrdhost_hostname(s->host),
|
||||
dictionary_acquired_item_name(item),
|
||||
(unsigned long long) rq->after, (unsigned long long) rq->before, rq->start_streaming ? "true" : "false",
|
||||
|
@ -1445,7 +1449,7 @@ static bool replication_execute_request(struct replication_request *rq, bool wor
|
|||
}
|
||||
|
||||
if(!rq->st) {
|
||||
internal_error(true, "REPLAY ERROR: 'host:%s/chart:%s' not found",
|
||||
internal_error(true, "STREAM SEND REPLAY ERROR: 'host:%s/chart:%s' not found",
|
||||
rrdhost_hostname(rq->sender->host), string2str(rq->chart_id));
|
||||
|
||||
goto cleanup;
|
||||
|
@ -1573,7 +1577,8 @@ static size_t verify_host_charts_are_streaming_now(RRDHOST *host) {
|
|||
host->sender &&
|
||||
!stream_sender_pending_replication_requests(host->sender) &&
|
||||
dictionary_entries(host->sender->replication.requests) != 0,
|
||||
"REPLICATION SUMMARY: 'host:%s' reports %zu pending replication requests, but its chart replication index says there are %zu charts pending replication",
|
||||
"STREAM SEND REPLAY SUMMARY: 'host:%s' reports %zu pending replication requests, "
|
||||
"but its chart replication index says there are %zu charts pending replication",
|
||||
rrdhost_hostname(host),
|
||||
stream_sender_pending_replication_requests(host->sender),
|
||||
dictionary_entries(host->sender->replication.requests)
|
||||
|
@ -1591,7 +1596,7 @@ static size_t verify_host_charts_are_streaming_now(RRDHOST *host) {
|
|||
if(!flags) {
|
||||
internal_error(
|
||||
true,
|
||||
"REPLICATION SUMMARY: 'host:%s/chart:%s' is neither IN PROGRESS nor FINISHED",
|
||||
"STREAM SEND REPLAY SUMMARY: 'host:%s/chart:%s' is neither IN PROGRESS nor FINISHED",
|
||||
rrdhost_hostname(host), rrdset_id(st)
|
||||
);
|
||||
is_error = true;
|
||||
|
@ -1600,7 +1605,7 @@ static size_t verify_host_charts_are_streaming_now(RRDHOST *host) {
|
|||
if(!(flags & RRDSET_FLAG_SENDER_REPLICATION_FINISHED) || (flags & RRDSET_FLAG_SENDER_REPLICATION_IN_PROGRESS)) {
|
||||
internal_error(
|
||||
true,
|
||||
"REPLICATION SUMMARY: 'host:%s/chart:%s' is IN PROGRESS although replication is finished",
|
||||
"STREAM SEND REPLAY SUMMARY: 'host:%s/chart:%s' is IN PROGRESS although replication is finished",
|
||||
rrdhost_hostname(host), rrdset_id(st)
|
||||
);
|
||||
is_error = true;
|
||||
|
@ -1614,7 +1619,7 @@ static size_t verify_host_charts_are_streaming_now(RRDHOST *host) {
|
|||
rrdset_foreach_done(st);
|
||||
|
||||
internal_error(errors,
|
||||
"REPLICATION SUMMARY: 'host:%s' finished replicating %zu charts, but %zu charts are still in progress although replication finished",
|
||||
"STREAM SEND REPLAY SUMMARY: 'host:%s' finished replicating %zu charts, but %zu charts are still in progress although replication finished",
|
||||
rrdhost_hostname(host), ok, errors);
|
||||
|
||||
return errors;
|
||||
|
@ -1830,6 +1835,12 @@ static void *replication_worker_thread(void *ptr __maybe_unused) {
|
|||
replication_initialize_workers(false);
|
||||
|
||||
while (service_running(SERVICE_REPLICATION)) {
|
||||
if(!stream_control_replication_should_be_running()) {
|
||||
worker_is_idle();
|
||||
stream_control_throttle();
|
||||
continue;
|
||||
}
|
||||
|
||||
if (unlikely(replication_pipeline_execute_next() == REQUEST_QUEUE_EMPTY)) {
|
||||
sender_commit_thread_buffer_free();
|
||||
worker_is_busy(WORKER_JOB_WAIT);
|
||||
|
@ -1880,7 +1891,7 @@ void *replication_thread_main(void *ptr) {
|
|||
|
||||
int nodes = (int)dictionary_entries(rrdhost_root_index);
|
||||
int cpus = (int)get_netdata_cpus();
|
||||
int threads = MIN(cpus * 1 / 3, nodes / 10);
|
||||
int threads = cpus / 2;
|
||||
if (threads < 1) threads = 1;
|
||||
else if (threads > MAX_REPLICATION_THREADS) threads = MAX_REPLICATION_THREADS;
|
||||
|
||||
|
@ -1926,6 +1937,12 @@ void *replication_thread_main(void *ptr) {
|
|||
|
||||
while(service_running(SERVICE_REPLICATION)) {
|
||||
|
||||
if(!stream_control_replication_should_be_running()) {
|
||||
worker_is_idle();
|
||||
stream_control_throttle();
|
||||
continue;
|
||||
}
|
||||
|
||||
// statistics
|
||||
usec_t now_mono_ut = now_monotonic_usec();
|
||||
if(unlikely(now_mono_ut - last_now_mono_ut > default_rrd_update_every * USEC_PER_SEC)) {
|
||||
|
|
|
@ -6,6 +6,10 @@
|
|||
#include "daemon/common.h"
|
||||
#include "stream-circular-buffer.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct parser;
|
||||
|
||||
struct replication_query_statistics {
|
||||
|
@ -36,4 +40,8 @@ void replication_recalculate_buffer_used_ratio_unsafe(struct sender_state *s);
|
|||
size_t replication_allocated_memory(void);
|
||||
size_t replication_allocated_buffers(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* REPLICATION_H */
|
||||
|
|
|
@ -132,7 +132,7 @@ void rrdhost_status(RRDHOST *host, time_t now, RRDHOST_STATUS *s) {
|
|||
rrdhost_receiver_lock(host);
|
||||
s->ingest.hops = (int16_t)(host->system_info ? host->system_info->hops : (host == localhost) ? 0 : 1);
|
||||
bool has_receiver = false;
|
||||
if (host->receiver && !rrdhost_flag_check(host, RRDHOST_FLAG_STREAM_RECEIVER_DISCONNECTED)) {
|
||||
if (host->receiver && rrdhost_flag_check(host, RRDHOST_FLAG_COLLECTOR_ONLINE)) {
|
||||
has_receiver = true;
|
||||
s->ingest.replication.instances = rrdhost_receiver_replicating_charts(host);
|
||||
s->ingest.replication.completion = host->stream.rcv.status.replication.percent;
|
||||
|
|
|
@ -80,7 +80,7 @@ void log_receiver_capabilities(struct receiver_state *rpt) {
|
|||
BUFFER *wb = buffer_create(100, NULL);
|
||||
stream_capabilities_to_string(wb, rpt->capabilities);
|
||||
|
||||
nd_log_daemon(NDLP_INFO, "STREAM %s [receive from [%s]:%s]: established link with negotiated capabilities: %s",
|
||||
nd_log_daemon(NDLP_INFO, "STREAM RECEIVE '%s' [from [%s]:%s]: established link with negotiated capabilities: %s",
|
||||
rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port, buffer_tostring(wb));
|
||||
|
||||
buffer_free(wb);
|
||||
|
@ -90,7 +90,7 @@ void log_sender_capabilities(struct sender_state *s) {
|
|||
BUFFER *wb = buffer_create(100, NULL);
|
||||
stream_capabilities_to_string(wb, s->capabilities);
|
||||
|
||||
nd_log_daemon(NDLP_INFO, "STREAM %s [send to %s]: established link with negotiated capabilities: %s",
|
||||
nd_log_daemon(NDLP_INFO, "STREAM SEND '%s' [to %s]: established link with negotiated capabilities: %s",
|
||||
rrdhost_hostname(s->host), s->connected_to, buffer_tostring(wb));
|
||||
|
||||
buffer_free(wb);
|
||||
|
|
|
@ -3,8 +3,6 @@
|
|||
#include "stream.h"
|
||||
#include "stream-sender-internals.h"
|
||||
|
||||
#define STREAM_CIRCULAR_BUFFER_ADAPT_TO_TIMES_MAX_SIZE 3
|
||||
|
||||
struct stream_circular_buffer {
|
||||
struct circular_buffer *cb;
|
||||
STREAM_CIRCULAR_BUFFER_STATS stats;
|
||||
|
@ -41,10 +39,9 @@ STREAM_CIRCULAR_BUFFER *stream_circular_buffer_create(void) {
|
|||
}
|
||||
|
||||
// returns true if it increased the buffer size
|
||||
bool stream_circular_buffer_set_max_size_unsafe(STREAM_CIRCULAR_BUFFER *scb, size_t uncompressed_msg_size, bool force) {
|
||||
size_t wanted = uncompressed_msg_size * STREAM_CIRCULAR_BUFFER_ADAPT_TO_TIMES_MAX_SIZE;
|
||||
if(force || scb->cb->max_size < wanted) {
|
||||
scb->cb->max_size = wanted;
|
||||
bool stream_circular_buffer_set_max_size_unsafe(STREAM_CIRCULAR_BUFFER *scb, size_t max_size, bool force) {
|
||||
if(force || scb->cb->max_size < max_size) {
|
||||
scb->cb->max_size = max_size;
|
||||
scb->stats.bytes_max_size = scb->cb->max_size;
|
||||
__atomic_store_n(&scb->atomic.max_size, scb->cb->max_size, __ATOMIC_RELAXED);
|
||||
stream_circular_buffer_stats_update_unsafe(scb);
|
||||
|
@ -81,8 +78,9 @@ void stream_circular_buffer_recreate_timed_unsafe(STREAM_CIRCULAR_BUFFER *scb, u
|
|||
scb->stats.recreates++; // we increase even if we don't do it, to have sender_start() recreate its buffers
|
||||
|
||||
if(scb->cb && scb->cb->size > CBUFFER_INITIAL_SIZE) {
|
||||
size_t max_size = scb->cb->max_size;
|
||||
cbuffer_free(scb->cb);
|
||||
scb->cb = cbuffer_new(CBUFFER_INITIAL_SIZE, stream_send.buffer_max_size, &netdata_buffers_statistics.cbuffers_streaming);
|
||||
scb->cb = cbuffer_new(CBUFFER_INITIAL_SIZE, max_size, &netdata_buffers_statistics.cbuffers_streaming);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -96,15 +94,22 @@ void stream_circular_buffer_destroy(STREAM_CIRCULAR_BUFFER *scb) {
|
|||
}
|
||||
|
||||
// adds data to the circular buffer, returns false when it can't (buffer is full)
|
||||
bool stream_circular_buffer_add_unsafe(STREAM_CIRCULAR_BUFFER *scb, const char *data, size_t bytes_actual, size_t bytes_uncompressed, STREAM_TRAFFIC_TYPE type) {
|
||||
bool stream_circular_buffer_add_unsafe(
|
||||
STREAM_CIRCULAR_BUFFER *scb, const char *data,
|
||||
size_t bytes_actual, size_t bytes_uncompressed, STREAM_TRAFFIC_TYPE type, bool autoscale) {
|
||||
scb->stats.adds++;
|
||||
scb->stats.bytes_added += bytes_actual;
|
||||
scb->stats.bytes_uncompressed += bytes_uncompressed;
|
||||
scb->stats.bytes_sent_by_type[type] += bytes_actual;
|
||||
bool rc = cbuffer_add_unsafe(scb->cb, data, bytes_actual) == 0;
|
||||
if(rc)
|
||||
stream_circular_buffer_stats_update_unsafe(scb);
|
||||
return rc;
|
||||
|
||||
if(unlikely(autoscale && cbuffer_available_size_unsafe(scb->cb) < bytes_actual))
|
||||
stream_circular_buffer_set_max_size_unsafe(scb, scb->cb->max_size * 2, true);
|
||||
|
||||
if(unlikely(cbuffer_add_unsafe(scb->cb, data, bytes_actual) != 0))
|
||||
return false;
|
||||
|
||||
stream_circular_buffer_stats_update_unsafe(scb);
|
||||
return true;
|
||||
}
|
||||
|
||||
// return the first available chunk at the beginning of the buffer
|
||||
|
|
|
@ -6,10 +6,16 @@
|
|||
#include "libnetdata/libnetdata.h"
|
||||
#include "stream-traffic-types.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define CBUFFER_INITIAL_SIZE (16 * 1024)
|
||||
#define CBUFFER_INITIAL_MAX_SIZE (10 * 1024 * 1024)
|
||||
#define THREAD_BUFFER_INITIAL_SIZE (8192)
|
||||
|
||||
#define STREAM_CIRCULAR_BUFFER_ADAPT_TO_TIMES_MAX_SIZE 3
|
||||
|
||||
typedef struct stream_circular_buffer_stats {
|
||||
size_t adds;
|
||||
size_t sends;
|
||||
|
@ -48,7 +54,7 @@ void stream_circular_buffer_recreate_timed_unsafe(STREAM_CIRCULAR_BUFFER *scb, u
|
|||
|
||||
// returns true if it increased the buffer size
|
||||
// if it changes the size, it updates the statistics
|
||||
bool stream_circular_buffer_set_max_size_unsafe(STREAM_CIRCULAR_BUFFER *scb, size_t uncompressed_msg_size, bool force);
|
||||
bool stream_circular_buffer_set_max_size_unsafe(STREAM_CIRCULAR_BUFFER *scb, size_t max_size, bool force);
|
||||
|
||||
// returns a pointer to the current circular buffer statistics
|
||||
// copy it if you plan to use it without a lock
|
||||
|
@ -71,7 +77,9 @@ usec_t stream_circular_buffer_get_since_ut(STREAM_CIRCULAR_BUFFER *scb);
|
|||
|
||||
// adds data to the end of the circular buffer, returns false when it can't (buffer is full)
|
||||
// it updates the statistics
|
||||
bool stream_circular_buffer_add_unsafe(STREAM_CIRCULAR_BUFFER *scb, const char *data, size_t bytes_actual, size_t bytes_uncompressed, STREAM_TRAFFIC_TYPE type);
|
||||
bool stream_circular_buffer_add_unsafe(
|
||||
STREAM_CIRCULAR_BUFFER *scb, const char *data, size_t bytes_actual, size_t bytes_uncompressed,
|
||||
STREAM_TRAFFIC_TYPE type, bool autoscale);
|
||||
|
||||
// returns a pointer to the beginning of the buffer, and its size in bytes
|
||||
size_t stream_circular_buffer_get_unsafe(STREAM_CIRCULAR_BUFFER *scb, char **chunk);
|
||||
|
@ -80,4 +88,8 @@ size_t stream_circular_buffer_get_unsafe(STREAM_CIRCULAR_BUFFER *scb, char **chu
|
|||
// it updates the statistics
|
||||
void stream_circular_buffer_del_unsafe(STREAM_CIRCULAR_BUFFER *scb, size_t bytes);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //NETDATA_STREAM_CIRCULAR_BUFFER_H
|
||||
|
|
|
@ -124,7 +124,10 @@ static inline size_t stream_decompress_decode_signature(const char *data, size_t
|
|||
if (unlikely(data_size != STREAM_COMPRESSION_SIGNATURE_SIZE))
|
||||
return 0;
|
||||
|
||||
stream_compression_signature_t sign = *(stream_compression_signature_t *)data;
|
||||
stream_compression_signature_t sign;
|
||||
memcpy(&sign, data, sizeof(stream_compression_signature_t)); // Safe copy to aligned variable
|
||||
// stream_compression_signature_t sign = *(stream_compression_signature_t *)data;
|
||||
|
||||
if (unlikely((sign & STREAM_COMPRESSION_SIGNATURE_MASK) != STREAM_COMPRESSION_SIGNATURE))
|
||||
return 0;
|
||||
|
||||
|
|
|
@ -194,7 +194,7 @@ void stream_conf_receiver_config(struct receiver_state *rpt, struct stream_recei
|
|||
rrd_memory_mode_name(default_rrd_memory_mode))));
|
||||
|
||||
if (unlikely(config->mode == RRD_MEMORY_MODE_DBENGINE && !dbengine_enabled)) {
|
||||
netdata_log_error("STREAM '%s' [receive from %s:%s]: "
|
||||
netdata_log_error("STREAM RECEIVE '%s' [from [%s]:%s]: "
|
||||
"dbengine is not enabled, falling back to default."
|
||||
, rpt->hostname
|
||||
, rpt->client_ip, rpt->client_port
|
||||
|
@ -270,11 +270,8 @@ void stream_conf_receiver_config(struct receiver_state *rpt, struct stream_recei
|
|||
stream_parse_compression_order(
|
||||
config,
|
||||
appconfig_get(
|
||||
&stream_config,
|
||||
machine_guid,
|
||||
"compression algorithms order",
|
||||
appconfig_get(
|
||||
&stream_config, api_key, "compression algorithms order", STREAM_COMPRESSION_ALGORITHMS_ORDER)));
|
||||
&stream_config, machine_guid, "compression algorithms order",
|
||||
appconfig_get(&stream_config, api_key, "compression algorithms order", STREAM_COMPRESSION_ALGORITHMS_ORDER)));
|
||||
}
|
||||
|
||||
config->ephemeral =
|
||||
|
|
|
@ -2,49 +2,6 @@
|
|||
|
||||
#include "stream-sender-internals.h"
|
||||
|
||||
typedef struct {
|
||||
char *os_name;
|
||||
char *os_id;
|
||||
char *os_version;
|
||||
char *kernel_name;
|
||||
char *kernel_version;
|
||||
} stream_encoded_t;
|
||||
|
||||
static void rrdpush_encode_variable(stream_encoded_t *se, RRDHOST *host) {
|
||||
se->os_name = (host->system_info->host_os_name)?url_encode(host->system_info->host_os_name):strdupz("");
|
||||
se->os_id = (host->system_info->host_os_id)?url_encode(host->system_info->host_os_id):strdupz("");
|
||||
se->os_version = (host->system_info->host_os_version)?url_encode(host->system_info->host_os_version):strdupz("");
|
||||
se->kernel_name = (host->system_info->kernel_name)?url_encode(host->system_info->kernel_name):strdupz("");
|
||||
se->kernel_version = (host->system_info->kernel_version)?url_encode(host->system_info->kernel_version):strdupz("");
|
||||
}
|
||||
|
||||
static void rrdpush_clean_encoded(stream_encoded_t *se) {
|
||||
if (se->os_name) {
|
||||
freez(se->os_name);
|
||||
se->os_name = NULL;
|
||||
}
|
||||
|
||||
if (se->os_id) {
|
||||
freez(se->os_id);
|
||||
se->os_id = NULL;
|
||||
}
|
||||
|
||||
if (se->os_version) {
|
||||
freez(se->os_version);
|
||||
se->os_version = NULL;
|
||||
}
|
||||
|
||||
if (se->kernel_name) {
|
||||
freez(se->kernel_name);
|
||||
se->kernel_name = NULL;
|
||||
}
|
||||
|
||||
if (se->kernel_version) {
|
||||
freez(se->kernel_version);
|
||||
se->kernel_version = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
static struct {
|
||||
const char *response;
|
||||
const char *status;
|
||||
|
@ -152,7 +109,7 @@ static struct {
|
|||
.dynamic = false,
|
||||
.error = "remote server is initializing, we should try later",
|
||||
.worker_job_id = WORKER_SENDER_CONNECTOR_JOB_DISCONNECT_BAD_HANDSHAKE,
|
||||
.postpone_reconnect_seconds = 2 * 60, // 2 minute
|
||||
.postpone_reconnect_seconds = 30, // 30 seconds
|
||||
.priority = NDLP_NOTICE,
|
||||
},
|
||||
|
||||
|
@ -303,12 +260,23 @@ stream_connect_validate_first_response(RRDHOST *host, struct sender_state *s, ch
|
|||
rfc3339_datetime_ut(buf, sizeof(buf), stream_parent_get_reconnection_ut(host->stream.snd.parents.current), 0, false);
|
||||
|
||||
nd_log(NDLS_DAEMON, priority,
|
||||
"STREAM %s [send to %s]: %s - will retry in %d secs, at %s",
|
||||
"STREAM CONNECT '%s' [to %s]: %s - will retry in %d secs, at %s",
|
||||
rrdhost_hostname(host), s->connected_to, error, delay, buf);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void buffer_key_value_urlencode(BUFFER *wb, const char *key, const char *value) {
|
||||
char *encoded = NULL;
|
||||
|
||||
if(value && *value)
|
||||
encoded = url_encode(value);
|
||||
|
||||
buffer_sprintf(wb, "%s=%s", key, encoded ? encoded : "");
|
||||
|
||||
freez(encoded);
|
||||
}
|
||||
|
||||
bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeout) {
|
||||
worker_is_busy(WORKER_SENDER_CONNECTOR_JOB_CONNECTING);
|
||||
|
||||
|
@ -342,104 +310,53 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou
|
|||
/* TODO: During the implementation of #7265 switch the set of variables to HOST_* and CONTAINER_* if the
|
||||
version negotiation resulted in a high enough version.
|
||||
*/
|
||||
stream_encoded_t se;
|
||||
rrdpush_encode_variable(&se, host);
|
||||
|
||||
char http[HTTP_HEADER_SIZE + 1];
|
||||
int eol = snprintfz(http, HTTP_HEADER_SIZE,
|
||||
"STREAM "
|
||||
"key=%s"
|
||||
"&hostname=%s"
|
||||
"®istry_hostname=%s"
|
||||
"&machine_guid=%s"
|
||||
"&update_every=%d"
|
||||
"&os=%s"
|
||||
"&timezone=%s"
|
||||
"&abbrev_timezone=%s"
|
||||
"&utc_offset=%d"
|
||||
"&hops=%d"
|
||||
"&ml_capable=%d"
|
||||
"&ml_enabled=%d"
|
||||
"&mc_version=%d"
|
||||
"&ver=%u"
|
||||
"&NETDATA_INSTANCE_CLOUD_TYPE=%s"
|
||||
"&NETDATA_INSTANCE_CLOUD_INSTANCE_TYPE=%s"
|
||||
"&NETDATA_INSTANCE_CLOUD_INSTANCE_REGION=%s"
|
||||
"&NETDATA_SYSTEM_OS_NAME=%s"
|
||||
"&NETDATA_SYSTEM_OS_ID=%s"
|
||||
"&NETDATA_SYSTEM_OS_ID_LIKE=%s"
|
||||
"&NETDATA_SYSTEM_OS_VERSION=%s"
|
||||
"&NETDATA_SYSTEM_OS_VERSION_ID=%s"
|
||||
"&NETDATA_SYSTEM_OS_DETECTION=%s"
|
||||
"&NETDATA_HOST_IS_K8S_NODE=%s"
|
||||
"&NETDATA_SYSTEM_KERNEL_NAME=%s"
|
||||
"&NETDATA_SYSTEM_KERNEL_VERSION=%s"
|
||||
"&NETDATA_SYSTEM_ARCHITECTURE=%s"
|
||||
"&NETDATA_SYSTEM_VIRTUALIZATION=%s"
|
||||
"&NETDATA_SYSTEM_VIRT_DETECTION=%s"
|
||||
"&NETDATA_SYSTEM_CONTAINER=%s"
|
||||
"&NETDATA_SYSTEM_CONTAINER_DETECTION=%s"
|
||||
"&NETDATA_CONTAINER_OS_NAME=%s"
|
||||
"&NETDATA_CONTAINER_OS_ID=%s"
|
||||
"&NETDATA_CONTAINER_OS_ID_LIKE=%s"
|
||||
"&NETDATA_CONTAINER_OS_VERSION=%s"
|
||||
"&NETDATA_CONTAINER_OS_VERSION_ID=%s"
|
||||
"&NETDATA_CONTAINER_OS_DETECTION=%s"
|
||||
"&NETDATA_SYSTEM_CPU_LOGICAL_CPU_COUNT=%s"
|
||||
"&NETDATA_SYSTEM_CPU_FREQ=%s"
|
||||
"&NETDATA_SYSTEM_TOTAL_RAM=%s"
|
||||
"&NETDATA_SYSTEM_TOTAL_DISK_SIZE=%s"
|
||||
"&NETDATA_PROTOCOL_VERSION=%s"
|
||||
HTTP_1_1 HTTP_ENDL
|
||||
"User-Agent: %s/%s" HTTP_ENDL
|
||||
"Accept: */*" HTTP_HDR_END
|
||||
, string2str(host->stream.snd.api_key)
|
||||
, rrdhost_hostname(host)
|
||||
, rrdhost_registry_hostname(host)
|
||||
, host->machine_guid
|
||||
, default_rrd_update_every
|
||||
, rrdhost_os(host)
|
||||
, rrdhost_timezone(host)
|
||||
, rrdhost_abbrev_timezone(host)
|
||||
, host->utc_offset
|
||||
, s->hops
|
||||
, host->system_info->ml_capable
|
||||
, host->system_info->ml_enabled
|
||||
, host->system_info->mc_version
|
||||
, s->capabilities
|
||||
, (host->system_info->cloud_provider_type) ? host->system_info->cloud_provider_type : ""
|
||||
, (host->system_info->cloud_instance_type) ? host->system_info->cloud_instance_type : ""
|
||||
, (host->system_info->cloud_instance_region) ? host->system_info->cloud_instance_region : ""
|
||||
, se.os_name
|
||||
, se.os_id
|
||||
, (host->system_info->host_os_id_like) ? host->system_info->host_os_id_like : ""
|
||||
, se.os_version
|
||||
, (host->system_info->host_os_version_id) ? host->system_info->host_os_version_id : ""
|
||||
, (host->system_info->host_os_detection) ? host->system_info->host_os_detection : ""
|
||||
, (host->system_info->is_k8s_node) ? host->system_info->is_k8s_node : ""
|
||||
, se.kernel_name
|
||||
, se.kernel_version
|
||||
, (host->system_info->architecture) ? host->system_info->architecture : ""
|
||||
, (host->system_info->virtualization) ? host->system_info->virtualization : ""
|
||||
, (host->system_info->virt_detection) ? host->system_info->virt_detection : ""
|
||||
, (host->system_info->container) ? host->system_info->container : ""
|
||||
, (host->system_info->container_detection) ? host->system_info->container_detection : ""
|
||||
, (host->system_info->container_os_name) ? host->system_info->container_os_name : ""
|
||||
, (host->system_info->container_os_id) ? host->system_info->container_os_id : ""
|
||||
, (host->system_info->container_os_id_like) ? host->system_info->container_os_id_like : ""
|
||||
, (host->system_info->container_os_version) ? host->system_info->container_os_version : ""
|
||||
, (host->system_info->container_os_version_id) ? host->system_info->container_os_version_id : ""
|
||||
, (host->system_info->container_os_detection) ? host->system_info->container_os_detection : ""
|
||||
, (host->system_info->host_cores) ? host->system_info->host_cores : ""
|
||||
, (host->system_info->host_cpu_freq) ? host->system_info->host_cpu_freq : ""
|
||||
, (host->system_info->host_ram_total) ? host->system_info->host_ram_total : ""
|
||||
, (host->system_info->host_disk_space) ? host->system_info->host_disk_space : ""
|
||||
, STREAMING_PROTOCOL_VERSION
|
||||
, rrdhost_program_name(host)
|
||||
, rrdhost_program_version(host)
|
||||
);
|
||||
http[eol] = 0x00;
|
||||
rrdpush_clean_encoded(&se);
|
||||
CLEAN_BUFFER *wb = buffer_create(0, NULL);
|
||||
buffer_strcat(wb, "STREAM ");
|
||||
buffer_key_value_urlencode(wb, "key", string2str(host->stream.snd.api_key));
|
||||
buffer_key_value_urlencode(wb, "&hostname", rrdhost_hostname(host));
|
||||
buffer_key_value_urlencode(wb, "®istry_hostname", rrdhost_registry_hostname(host));
|
||||
buffer_key_value_urlencode(wb, "&machine_guid", host->machine_guid);
|
||||
buffer_sprintf(wb, "&update_every=%d", default_rrd_update_every);
|
||||
buffer_key_value_urlencode(wb, "&os", rrdhost_os(host));
|
||||
buffer_key_value_urlencode(wb, "&timezone", rrdhost_timezone(host));
|
||||
buffer_key_value_urlencode(wb, "&abbrev_timezone", rrdhost_abbrev_timezone(host));
|
||||
buffer_sprintf(wb, "&utc_offset=%d", host->utc_offset);
|
||||
buffer_sprintf(wb, "&hops=%d", s->hops);
|
||||
buffer_sprintf(wb, "&ml_capable=%d", host->system_info->ml_capable);
|
||||
buffer_sprintf(wb, "&ml_enabled=%d", host->system_info->ml_enabled);
|
||||
buffer_sprintf(wb, "&mc_version=%d", host->system_info->mc_version);
|
||||
buffer_sprintf(wb, "&ver=%u", s->capabilities);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_INSTANCE_CLOUD_TYPE", host->system_info->cloud_provider_type);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_INSTANCE_CLOUD_INSTANCE_TYPE", host->system_info->cloud_instance_type);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_INSTANCE_CLOUD_INSTANCE_REGION", host->system_info->cloud_instance_region);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_OS_NAME", host->system_info->host_os_name);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_OS_ID", host->system_info->host_os_id);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_OS_ID_LIKE", host->system_info->host_os_id_like);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_OS_VERSION", host->system_info->host_os_version);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_OS_VERSION_ID", host->system_info->host_os_version_id);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_OS_DETECTION", host->system_info->host_os_detection);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_HOST_IS_K8S_NODE", host->system_info->is_k8s_node);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_KERNEL_NAME", host->system_info->kernel_name);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_KERNEL_VERSION", host->system_info->kernel_version);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_ARCHITECTURE", host->system_info->architecture);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_VIRTUALIZATION", host->system_info->virtualization);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_VIRT_DETECTION", host->system_info->virt_detection);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_CONTAINER", host->system_info->container);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_CONTAINER_DETECTION", host->system_info->container_detection);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_CONTAINER_OS_NAME", host->system_info->container_os_name);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_CONTAINER_OS_ID", host->system_info->container_os_id);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_CONTAINER_OS_ID_LIKE", host->system_info->container_os_id_like);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_CONTAINER_OS_VERSION", host->system_info->container_os_version);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_CONTAINER_OS_VERSION_ID", host->system_info->container_os_version_id);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_CONTAINER_OS_DETECTION", host->system_info->container_os_detection);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_CPU_LOGICAL_CPU_COUNT", host->system_info->host_cores);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_CPU_FREQ", host->system_info->host_cpu_freq);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_TOTAL_RAM", host->system_info->host_ram_total);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_SYSTEM_TOTAL_DISK_SIZE", host->system_info->host_disk_space);
|
||||
buffer_key_value_urlencode(wb, "&NETDATA_PROTOCOL_VERSION", STREAMING_PROTOCOL_VERSION);
|
||||
buffer_strcat(wb, HTTP_1_1 HTTP_ENDL);
|
||||
buffer_sprintf(wb, "User-Agent: %s/%s" HTTP_ENDL, rrdhost_program_name(host), rrdhost_program_version(host));
|
||||
buffer_strcat(wb, "Accept: */*" HTTP_HDR_END);
|
||||
|
||||
if (s->parent_using_h2o && stream_connect_upgrade_prelude(host, s)) {
|
||||
ND_LOG_STACK lgs[] = {
|
||||
|
@ -455,8 +372,8 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou
|
|||
return false;
|
||||
}
|
||||
|
||||
ssize_t len = (ssize_t)strlen(http);
|
||||
ssize_t bytes = nd_sock_send_timeout(&s->sock, http, len, 0, timeout);
|
||||
ssize_t len = (ssize_t)buffer_strlen(wb);
|
||||
ssize_t bytes = nd_sock_send_timeout(&s->sock, (void *)buffer_tostring(wb), len, 0, timeout);
|
||||
if(bytes <= 0) { // timeout is 0
|
||||
ND_LOG_STACK lgs[] = {
|
||||
ND_LOG_FIELD_TXT(NDF_RESPONSE_CODE, STREAM_STATUS_TIMEOUT),
|
||||
|
@ -468,7 +385,7 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou
|
|||
nd_sock_close(&s->sock);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM %s [send to %s]: failed to send HTTP header to remote netdata.",
|
||||
"STREAM CONNECT '%s' [to %s]: failed to send HTTP header to remote netdata.",
|
||||
rrdhost_hostname(host), s->connected_to);
|
||||
|
||||
stream_parent_set_reconnect_delay(
|
||||
|
@ -476,7 +393,8 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou
|
|||
return false;
|
||||
}
|
||||
|
||||
bytes = nd_sock_recv_timeout(&s->sock, http, HTTP_HEADER_SIZE, 0, timeout);
|
||||
char response[4096];
|
||||
bytes = nd_sock_recv_timeout(&s->sock, response, sizeof(response) - 1, 0, timeout);
|
||||
if(bytes <= 0) { // timeout is 0
|
||||
nd_sock_close(&s->sock);
|
||||
|
||||
|
@ -489,7 +407,7 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou
|
|||
worker_is_busy(WORKER_SENDER_CONNECTOR_JOB_DISCONNECT_TIMEOUT);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM %s [send to %s]: remote netdata does not respond.",
|
||||
"STREAM CONNECT '%s' [to %s]: remote netdata does not respond.",
|
||||
rrdhost_hostname(host), s->connected_to);
|
||||
|
||||
stream_parent_set_reconnect_delay(
|
||||
|
@ -497,21 +415,21 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou
|
|||
|
||||
return false;
|
||||
}
|
||||
http[bytes] = '\0';
|
||||
response[bytes] = '\0';
|
||||
|
||||
if(sock_setnonblock(s->sock.fd) < 0)
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM %s [send to %s]: cannot set non-blocking mode for socket.",
|
||||
"STREAM CONNECT '%s' [to %s]: cannot set non-blocking mode for socket.",
|
||||
rrdhost_hostname(host), s->connected_to);
|
||||
|
||||
sock_setcloexec(s->sock.fd);
|
||||
|
||||
if(sock_enlarge_out(s->sock.fd) < 0)
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM %s [send to %s]: cannot enlarge the socket buffer.",
|
||||
"STREAM CONNECT '%s' [to %s]: cannot enlarge the socket buffer.",
|
||||
rrdhost_hostname(host), s->connected_to);
|
||||
|
||||
if(!stream_connect_validate_first_response(host, s, http, bytes)) {
|
||||
if(!stream_connect_validate_first_response(host, s, response, bytes)) {
|
||||
nd_sock_close(&s->sock);
|
||||
return false;
|
||||
}
|
||||
|
@ -527,7 +445,7 @@ bool stream_connect(struct sender_state *s, uint16_t default_port, time_t timeou
|
|||
ND_LOG_STACK_PUSH(lgs);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM [connector] %s: connected to %s...",
|
||||
"STREAM CONNECT '%s' [to %s]: connected to parent...",
|
||||
rrdhost_hostname(host), s->connected_to);
|
||||
|
||||
return true;
|
||||
|
@ -592,7 +510,7 @@ void stream_connector_requeue(struct sender_state *s) {
|
|||
struct connector *sc = stream_connector_get(s);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM [connector] [%s]: adding host in connector queue...",
|
||||
"STREAM CONNECT '%s' [to parent]: adding host in connector queue...",
|
||||
rrdhost_hostname(s->host));
|
||||
|
||||
spinlock_lock(&sc->queue.spinlock);
|
||||
|
@ -608,13 +526,13 @@ void stream_connector_add(struct sender_state *s) {
|
|||
// multiple threads may come here - only one should be able to pass through
|
||||
stream_sender_lock(s);
|
||||
if(!rrdhost_has_stream_sender_enabled(s->host) || !s->host->stream.snd.destination || !s->host->stream.snd.api_key) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM %s [send]: host has streaming disabled - not sending data to a parent.",
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM CONNECT '%s' [disabled]: host has streaming disabled - not sending data to a parent.",
|
||||
rrdhost_hostname(s->host));
|
||||
stream_sender_unlock(s);
|
||||
return;
|
||||
}
|
||||
if(rrdhost_flag_check(s->host, RRDHOST_FLAG_STREAM_SENDER_ADDED)) {
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG, "STREAM %s [send]: host has already added to sender - ignoring request",
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG, "STREAM CONNECT '%s' [duplicate]: host has already added to sender - ignoring request.",
|
||||
rrdhost_hostname(s->host));
|
||||
stream_sender_unlock(s);
|
||||
return;
|
||||
|
@ -632,7 +550,7 @@ void stream_connector_add(struct sender_state *s) {
|
|||
|
||||
static void stream_connector_remove(struct sender_state *s) {
|
||||
nd_log(NDLS_DAEMON, NDLP_NOTICE,
|
||||
"STREAM [connector] [%s]: stopped streaming connector for host: %s",
|
||||
"STREAM CONNECT '%s' [stopped]: stopped streaming connector for host, reason: %s",
|
||||
rrdhost_hostname(s->host), stream_handshake_error_to_string(s->exit.reason));
|
||||
|
||||
struct connector *sc = stream_connector_get(s);
|
||||
|
@ -658,8 +576,8 @@ static void *stream_connector_thread(void *ptr) {
|
|||
worker_register_job_custom_metric(WORKER_SENDER_CONNECTOR_JOB_CANCELLED_NODES, "cancelled nodes", "nodes", WORKER_METRIC_ABSOLUTE);
|
||||
|
||||
unsigned job_id = 0;
|
||||
|
||||
while(!nd_thread_signaled_to_cancel() && service_running(SERVICE_STREAMING)) {
|
||||
|
||||
worker_is_idle();
|
||||
job_id = completion_wait_for_a_job_with_timeout(&sc->completion, job_id, 1000);
|
||||
size_t nodes = 0, connected_nodes = 0, failed_nodes = 0, cancelled_nodes = 0;
|
||||
|
@ -730,7 +648,7 @@ bool stream_connector_init(struct sender_state *s) {
|
|||
if(!sc->thread) {
|
||||
sc->id = (int8_t)(sc - connector_globals.connectors); // find the slot number
|
||||
if(&connector_globals.connectors[sc->id] != sc)
|
||||
fatal("Connector ID and slot do not match!");
|
||||
fatal("STREAM CONNECT '%s': connector ID and slot do not match!", rrdhost_hostname(s->host));
|
||||
|
||||
spinlock_init(&sc->queue.spinlock);
|
||||
completion_init(&sc->completion);
|
||||
|
@ -741,7 +659,9 @@ bool stream_connector_init(struct sender_state *s) {
|
|||
|
||||
sc->thread = nd_thread_create(tag, NETDATA_THREAD_OPTION_DEFAULT, stream_connector_thread, sc);
|
||||
if (!sc->thread)
|
||||
nd_log_daemon(NDLP_ERR, "STREAM connector: failed to create new thread for client.");
|
||||
nd_log_daemon(NDLP_ERR,
|
||||
"STREAM CONNECT '%s': failed to create new thread for client.",
|
||||
rrdhost_hostname(s->host));
|
||||
}
|
||||
|
||||
spinlock_unlock(&spinlock);
|
||||
|
|
116
src/streaming/stream-control.c
Normal file
116
src/streaming/stream-control.c
Normal file
|
@ -0,0 +1,116 @@
|
|||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#include "stream-control.h"
|
||||
#include "stream.h"
|
||||
#include "replication.h"
|
||||
|
||||
static struct {
|
||||
CACHE_LINE_PADDING();
|
||||
|
||||
uint32_t backfill_runners;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
|
||||
uint32_t replication_runners;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
|
||||
uint32_t user_data_queries_runners;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
|
||||
uint32_t user_weights_queries_runners;
|
||||
|
||||
CACHE_LINE_PADDING();
|
||||
} sc;
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
// backfilling
|
||||
|
||||
static uint32_t backfill_runners(void) {
|
||||
return __atomic_load_n(&sc.backfill_runners, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
void stream_control_backfill_query_started(void) {
|
||||
__atomic_add_fetch(&sc.backfill_runners, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
void stream_control_backfill_query_finished(void) {
|
||||
__atomic_sub_fetch(&sc.backfill_runners, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
// replication
|
||||
|
||||
static uint32_t replication_runners(void) {
|
||||
return __atomic_load_n(&sc.replication_runners, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
void stream_control_replication_query_started(void) {
|
||||
__atomic_add_fetch(&sc.replication_runners, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
void stream_control_replication_query_finished(void) {
|
||||
__atomic_sub_fetch(&sc.replication_runners, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
// user data queries
|
||||
|
||||
static uint32_t user_data_query_runners(void) {
|
||||
return __atomic_load_n(&sc.user_data_queries_runners, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
void stream_control_user_data_query_started(void) {
|
||||
__atomic_add_fetch(&sc.user_data_queries_runners, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
void stream_control_user_data_query_finished(void) {
|
||||
__atomic_sub_fetch(&sc.user_data_queries_runners, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
// user weights queries
|
||||
|
||||
static uint32_t user_weights_query_runners(void) {
|
||||
return __atomic_load_n(&sc.user_weights_queries_runners, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
void stream_control_user_weights_query_started(void) {
|
||||
__atomic_add_fetch(&sc.user_weights_queries_runners, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
void stream_control_user_weights_query_finished(void) {
|
||||
__atomic_sub_fetch(&sc.user_weights_queries_runners, 1, __ATOMIC_RELAXED);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
// consumer API
|
||||
|
||||
bool stream_control_ml_should_be_running(void) {
|
||||
return backfill_runners() == 0 &&
|
||||
replication_runners() == 0 &&
|
||||
user_data_query_runners() == 0 &&
|
||||
user_weights_query_runners() == 0;
|
||||
}
|
||||
|
||||
bool stream_control_children_should_be_accepted(void) {
|
||||
// we should not check for replication here.
|
||||
// replication benefits from multiple nodes (merges the extents)
|
||||
// and also the nodes should be close in time in the db
|
||||
// - checking for replication leaves the last few nodes locked-out (since all the others are replicating)
|
||||
|
||||
return backfill_runners() == 0;
|
||||
}
|
||||
|
||||
bool stream_control_replication_should_be_running(void) {
|
||||
return backfill_runners() == 0 &&
|
||||
user_data_query_runners() == 0 &&
|
||||
user_weights_query_runners() == 0;
|
||||
}
|
||||
|
||||
bool stream_control_health_should_be_running(void) {
|
||||
return backfill_runners() == 0 &&
|
||||
replication_runners() == 0 &&
|
||||
(user_data_query_runners() + user_weights_query_runners()) <= 1;
|
||||
}
|
29
src/streaming/stream-control.h
Normal file
29
src/streaming/stream-control.h
Normal file
|
@ -0,0 +1,29 @@
|
|||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#ifndef NETDATA_STREAM_CONTROL_H
|
||||
#define NETDATA_STREAM_CONTROL_H
|
||||
|
||||
#include "libnetdata/libnetdata.h"
|
||||
|
||||
#define STREAM_CONTROL_SLEEP_UT (10 * USEC_PER_MS + os_random(10 * USEC_PER_MS))
|
||||
|
||||
#define stream_control_throttle() microsleep(STREAM_CONTROL_SLEEP_UT)
|
||||
|
||||
void stream_control_backfill_query_started(void);
|
||||
void stream_control_backfill_query_finished(void);
|
||||
|
||||
void stream_control_replication_query_started(void);
|
||||
void stream_control_replication_query_finished(void);
|
||||
|
||||
void stream_control_user_weights_query_started(void);
|
||||
void stream_control_user_weights_query_finished(void);
|
||||
|
||||
void stream_control_user_data_query_started(void);
|
||||
void stream_control_user_data_query_finished(void);
|
||||
|
||||
bool stream_control_ml_should_be_running(void);
|
||||
bool stream_control_children_should_be_accepted(void);
|
||||
bool stream_control_replication_should_be_running(void);
|
||||
bool stream_control_health_should_be_running(void);
|
||||
|
||||
#endif //NETDATA_STREAM_CONTROL_H
|
|
@ -1,6 +1,7 @@
|
|||
// SPDX-License-Identifier: GPL-3.0-or-later
|
||||
|
||||
#include "stream-sender-internals.h"
|
||||
#include "replication.h"
|
||||
|
||||
#define TIME_TO_CONSIDER_PARENTS_SIMILAR 120
|
||||
|
||||
|
@ -150,7 +151,7 @@ void rrdhost_stream_parents_to_json(BUFFER *wb, RRDHOST_STATUS *s) {
|
|||
STREAM_PARENT *d;
|
||||
for (d = s->host->stream.snd.parents.all; d; d = d->next) {
|
||||
buffer_json_add_array_item_object(wb);
|
||||
buffer_json_member_add_uint64(wb, "attempts", d->attempts);
|
||||
buffer_json_member_add_uint64(wb, "attempts", d->attempts + 1);
|
||||
{
|
||||
if (d->ssl) {
|
||||
snprintfz(buf, sizeof(buf) - 1, "%s:SSL", string2str(d->destination));
|
||||
|
@ -305,6 +306,10 @@ int stream_info_to_json_v1(BUFFER *wb, const char *machine_guid) {
|
|||
buffer_json_member_add_uint64(wb, "nonce", os_random32());
|
||||
|
||||
if(ret == HTTP_RESP_OK) {
|
||||
if((status.ingest.status == RRDHOST_INGEST_STATUS_ARCHIVED || status.ingest.status == RRDHOST_INGEST_STATUS_OFFLINE) &&
|
||||
!stream_control_children_should_be_accepted())
|
||||
status.ingest.status = RRDHOST_INGEST_STATUS_INITIALIZING;
|
||||
|
||||
buffer_json_member_add_string(wb, "db_status", rrdhost_db_status_to_string(status.db.status));
|
||||
buffer_json_member_add_string(wb, "db_liveness", rrdhost_db_liveness_to_string(status.db.liveness));
|
||||
buffer_json_member_add_string(wb, "ingest_type", rrdhost_ingest_type_to_string(status.ingest.type));
|
||||
|
@ -375,7 +380,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po
|
|||
rrdhost_program_version(localhost));
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: fetching stream info from '%s'...",
|
||||
"STREAM PARENTS '%s': fetching stream info from '%s'...",
|
||||
hostname, string2str(d->destination));
|
||||
|
||||
// Establish connection
|
||||
|
@ -384,7 +389,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po
|
|||
d->selection.info = false;
|
||||
stream_parent_nd_sock_error_to_reason(d, &sock);
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM PARENTS of %s: failed to connect for stream info to '%s': %s",
|
||||
"STREAM PARENTS '%s': failed to connect for stream info to '%s': %s",
|
||||
hostname, string2str(d->destination),
|
||||
ND_SOCK_ERROR_2str(sock.error));
|
||||
return false;
|
||||
|
@ -396,7 +401,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po
|
|||
d->selection.info = false;
|
||||
stream_parent_nd_sock_error_to_reason(d, &sock);
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM PARENTS of %s: failed to send stream info request to '%s': %s",
|
||||
"STREAM PARENTS '%s': failed to send stream info request to '%s': %s",
|
||||
hostname, string2str(d->destination),
|
||||
ND_SOCK_ERROR_2str(sock.error));
|
||||
return false;
|
||||
|
@ -413,7 +418,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po
|
|||
|
||||
if (remaining <= 1) {
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM PARENTS of %s: stream info receive buffer is full while receiving response from '%s'",
|
||||
"STREAM PARENTS '%s': stream info receive buffer is full while receiving response from '%s'",
|
||||
hostname, string2str(d->destination));
|
||||
d->selection.info = false;
|
||||
d->reason = STREAM_HANDSHAKE_INTERNAL_ERROR;
|
||||
|
@ -423,7 +428,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po
|
|||
ssize_t received = nd_sock_recv_timeout(&sock, buf + total_received, remaining - 1, 0, 5);
|
||||
if (received <= 0) {
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM PARENTS of %s: socket receive error while querying stream info on '%s' "
|
||||
"STREAM PARENTS '%s': socket receive error while querying stream info on '%s' "
|
||||
"(total received %zu, payload received %zu, content length %zu): %s",
|
||||
hostname, string2str(d->destination),
|
||||
total_received, payload_received, content_length,
|
||||
|
@ -453,7 +458,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po
|
|||
char *content_length_ptr = strstr(buf, "Content-Length: ");
|
||||
if (!content_length_ptr) {
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM PARENTS of %s: stream info response from '%s' does not have a Content-Length",
|
||||
"STREAM PARENTS '%s': stream info response from '%s' does not have a Content-Length",
|
||||
hostname, string2str(d->destination));
|
||||
|
||||
d->selection.info = false;
|
||||
|
@ -463,7 +468,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po
|
|||
content_length = strtoul(content_length_ptr + strlen("Content-Length: "), NULL, 10);
|
||||
if (!content_length) {
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM PARENTS of %s: stream info response from '%s' has invalid Content-Length",
|
||||
"STREAM PARENTS '%s': stream info response from '%s' has invalid Content-Length",
|
||||
hostname, string2str(d->destination));
|
||||
|
||||
d->selection.info = false;
|
||||
|
@ -479,7 +484,7 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po
|
|||
d->selection.info = false;
|
||||
d->reason = STREAM_HANDSHAKE_NO_STREAM_INFO;
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM PARENTS of %s: failed to parse stream info response from '%s', JSON data: %s",
|
||||
"STREAM PARENTS '%s': failed to parse stream info response from '%s', JSON data: %s",
|
||||
hostname, string2str(d->destination), payload_start);
|
||||
return false;
|
||||
}
|
||||
|
@ -490,14 +495,14 @@ static bool stream_info_fetch(STREAM_PARENT *d, const char *uuid, int default_po
|
|||
d->selection.info = false;
|
||||
d->reason = STREAM_HANDSHAKE_NO_STREAM_INFO;
|
||||
nd_log(NDLS_DAEMON, NDLP_WARNING,
|
||||
"STREAM PARENTS of %s: failed to extract fields from JSON stream info response from '%s': %s",
|
||||
"STREAM PARENTS '%s': failed to extract fields from JSON stream info response from '%s': %s",
|
||||
hostname, string2str(d->destination),
|
||||
buffer_tostring(error));
|
||||
return false;
|
||||
}
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: received stream_info data from '%s': "
|
||||
"STREAM PARENTS '%s': received stream_info data from '%s': "
|
||||
"status: %d, nodes: %zu, receivers: %zu, first_time_s: %ld, last_time_s: %ld, "
|
||||
"db status: %s, db liveness: %s, ingest type: %s, ingest status: %s",
|
||||
hostname, string2str(d->destination),
|
||||
|
@ -554,7 +559,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
|
||||
// do we have any parents?
|
||||
if(!size) {
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG, "STREAM PARENTS of %s: no parents configured", rrdhost_hostname(host));
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG, "STREAM PARENTS '%s': no parents configured", rrdhost_hostname(host));
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -581,7 +586,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
if (d->postpone_until_ut > now_ut) {
|
||||
skipped_but_useful++;
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: skipping useful parent '%s': POSTPONED FOR %ld SECS MORE: %s",
|
||||
"STREAM PARENTS '%s': skipping useful parent '%s': POSTPONED FOR %ld SECS MORE: %s",
|
||||
rrdhost_hostname(host),
|
||||
string2str(d->destination),
|
||||
(time_t)((d->postpone_until_ut - now_ut) / USEC_PER_SEC),
|
||||
|
@ -603,7 +608,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
d->banned_permanently = true;
|
||||
skipped_not_useful++;
|
||||
nd_log(NDLS_DAEMON, NDLP_NOTICE,
|
||||
"STREAM PARENTS of %s: destination '%s' is banned permanently because it is the origin server",
|
||||
"STREAM PARENTS '%s': destination '%s' is banned permanently because it is the origin server",
|
||||
rrdhost_hostname(host), string2str(d->destination));
|
||||
continue;
|
||||
}
|
||||
|
@ -631,7 +636,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
d->banned_for_this_session = true;
|
||||
skipped_not_useful++;
|
||||
nd_log(NDLS_DAEMON, NDLP_NOTICE,
|
||||
"STREAM PARENTS of %s: destination '%s' is banned for this session, because it is in our path before us.",
|
||||
"STREAM PARENTS '%s': destination '%s' is banned for this session, because it is in our path before us.",
|
||||
rrdhost_hostname(host), string2str(d->destination));
|
||||
continue;
|
||||
}
|
||||
|
@ -648,7 +653,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
if(skip) {
|
||||
skipped_but_useful++;
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: skipping useful parent '%s': %s",
|
||||
"STREAM PARENTS '%s': skipping useful parent '%s': %s",
|
||||
rrdhost_hostname(host),
|
||||
string2str(d->destination),
|
||||
stream_handshake_error_to_string(d->reason));
|
||||
|
@ -664,7 +669,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
// can we use any parent?
|
||||
if(!count) {
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: no parents available (%zu skipped but useful, %zu skipped not useful)",
|
||||
"STREAM PARENTS '%s': no parents available (%zu skipped but useful, %zu skipped not useful)",
|
||||
rrdhost_hostname(host),
|
||||
skipped_but_useful, skipped_not_useful);
|
||||
return false;
|
||||
|
@ -692,7 +697,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
// if we have only 1 similar, move on
|
||||
if (similar == 1) {
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: reordering keeps parent No %zu, '%s'",
|
||||
"STREAM PARENTS '%s': reordering keeps parent No %zu, '%s'",
|
||||
rrdhost_hostname(host), base, string2str(array[base]->destination));
|
||||
array[base]->selection.order = base + 1;
|
||||
array[base]->selection.batch = batch + 1;
|
||||
|
@ -716,7 +721,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
SWAP(array[base], array[chosen]);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: random reordering of %zu similar parents (slots %zu to %zu), No %zu is '%s'",
|
||||
"STREAM PARENTS '%s': random reordering of %zu similar parents (slots %zu to %zu), No %zu is '%s'",
|
||||
rrdhost_hostname(host),
|
||||
similar, base, base + similar,
|
||||
base, string2str(array[base]->destination));
|
||||
|
@ -743,7 +748,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
array[0]->selection.random = false;
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: only 1 parent is available: '%s'",
|
||||
"STREAM PARENTS '%s': only 1 parent is available: '%s'",
|
||||
rrdhost_hostname(host), string2str(array[0]->destination));
|
||||
}
|
||||
|
||||
|
@ -760,7 +765,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
}
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: connecting to '%s' (default port: %d, parent %zu of %zu)...",
|
||||
"STREAM PARENTS '%s': connecting to '%s' (default port: %d, parent %zu of %zu)...",
|
||||
rrdhost_hostname(host), string2str(d->destination), default_port,
|
||||
i + 1, count);
|
||||
|
||||
|
@ -788,7 +793,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(host->stream.snd.parents.all, d, prev, next);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: connected to '%s' (default port: %d, fd %d)...",
|
||||
"STREAM PARENTS '%s': connected to '%s' (default port: %d, fd %d)...",
|
||||
rrdhost_hostname(host), string2str(d->destination), default_port,
|
||||
sender_sock->fd);
|
||||
|
||||
|
@ -798,7 +803,7 @@ bool stream_parent_connect_to_one_unsafe(
|
|||
else {
|
||||
stream_parent_nd_sock_error_to_reason(d, sender_sock);
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: stream connection to '%s' failed (default port: %d): %s",
|
||||
"STREAM PARENTS '%s': stream connection to '%s' failed (default port: %d): %s",
|
||||
rrdhost_hostname(host),
|
||||
string2str(d->destination), default_port,
|
||||
ND_SOCK_ERROR_2str(sender_sock->error));
|
||||
|
@ -854,7 +859,7 @@ static bool stream_parent_add_one_unsafe(char *entry, void *data) {
|
|||
|
||||
t->count++;
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM PARENTS of %s: added streaming destination No %d: '%s'",
|
||||
"STREAM PARENTS '%s': added streaming destination No %d: '%s'",
|
||||
rrdhost_hostname(t->host), t->count, string2str(d->destination));
|
||||
|
||||
return false; // we return false, so that we will get all defined destinations
|
||||
|
|
|
@ -237,7 +237,7 @@ void stream_path_send_to_child(RRDHOST *host) {
|
|||
|
||||
rrdhost_receiver_lock(host);
|
||||
if(stream_has_capability(host->receiver, STREAM_CAP_PATHS) &&
|
||||
!rrdhost_flag_check(host, RRDHOST_FLAG_STREAM_RECEIVER_DISCONNECTED)) {
|
||||
rrdhost_flag_check(host, RRDHOST_FLAG_COLLECTOR_ONLINE)) {
|
||||
|
||||
CLEAN_BUFFER *wb = buffer_create(0, NULL);
|
||||
buffer_sprintf(wb, PLUGINSD_KEYWORD_JSON " " PLUGINSD_KEYWORD_JSON_CMD_STREAM_PATH "\n%s\n" PLUGINSD_KEYWORD_JSON_END "\n", buffer_tostring(payload));
|
||||
|
@ -317,7 +317,7 @@ static bool parse_single_path(json_object *jobj, const char *path, STREAM_PATH *
|
|||
}
|
||||
|
||||
if(p->hops < 0) {
|
||||
buffer_strcat(error, "hops cannot be negative");
|
||||
buffer_strcat(error, "hops cannot be negative (probably the child disconnected from the Netdata before us");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -360,7 +360,8 @@ bool stream_path_set_from_json(RRDHOST *host, const char *json, bool from_parent
|
|||
CLEAN_JSON_OBJECT *jobj = json_tokener_parse(json);
|
||||
if(!jobj) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM PATH: Cannot parse json: %s", json);
|
||||
"STREAM PATH '%s': Cannot parse json: %s",
|
||||
rrdhost_hostname(host), json);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -381,14 +382,16 @@ bool stream_path_set_from_json(RRDHOST *host, const char *json, bool from_parent
|
|||
json_object *joption = json_object_array_get_idx(_jarray, i);
|
||||
if (!json_object_is_type(joption, json_type_object)) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM PATH: Array item No %zu is not an object: %s", i, json);
|
||||
"STREAM PATH '%s': Array item No %zu is not an object: %s",
|
||||
rrdhost_hostname(host), i, json);
|
||||
continue;
|
||||
}
|
||||
|
||||
if(!parse_single_path(joption, "", &host->stream.path.array[host->stream.path.used], error)) {
|
||||
stream_path_cleanup(&host->stream.path.array[host->stream.path.used]);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM PATH: Array item No %zu cannot be parsed: %s: %s", i, buffer_tostring(error), json);
|
||||
"STREAM PATH '%s': Array item No %zu cannot be parsed: %s: %s",
|
||||
rrdhost_hostname(host), i, buffer_tostring(error), json);
|
||||
}
|
||||
else
|
||||
host->stream.path.used++;
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#include "stream-thread.h"
|
||||
#include "stream-receiver-internals.h"
|
||||
#include "web/server/h2o/http_server.h"
|
||||
#include "replication.h"
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
|
@ -25,8 +26,9 @@ void stream_receiver_log_status(struct receiver_state *rpt, const char *msg, con
|
|||
, (rpt->machine_guid && *rpt->machine_guid) ? rpt->machine_guid : ""
|
||||
, msg);
|
||||
|
||||
nd_log(NDLS_DAEMON, priority, "STREAM RECEIVE '%s': %s %s%s%s"
|
||||
nd_log(NDLS_DAEMON, priority, "STREAM RECEIVE '%s' [from [%s]:%s]: %s %s%s%s"
|
||||
, (rpt->hostname && *rpt->hostname) ? rpt->hostname : ""
|
||||
, rpt->client_ip, rpt->client_port
|
||||
, msg
|
||||
, rpt->exit.reason != STREAM_HANDSHAKE_NEVER?" (":""
|
||||
, stream_handshake_error_to_string(rpt->exit.reason)
|
||||
|
@ -142,30 +144,41 @@ static bool stream_receiver_send_first_response(struct receiver_state *rpt) {
|
|||
if(!host) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"failed to find/create host structure, rejecting connection",
|
||||
"rejecting streaming connection; failed to find or create the required host structure",
|
||||
STREAM_STATUS_INTERNAL_SERVER_ERROR, NDLP_ERR);
|
||||
|
||||
stream_send_error_on_taken_over_connection(rpt, START_STREAMING_ERROR_INTERNAL_ERROR);
|
||||
return false;
|
||||
}
|
||||
// IMPORTANT: KEEP THIS FIRST AFTER CHECKING host RESPONSE!
|
||||
// THIS IS HOW WE KNOW THE system_info IS GONE NOW...
|
||||
// system_info has been consumed by the host structure
|
||||
rpt->system_info = NULL;
|
||||
|
||||
if (unlikely(rrdhost_flag_check(host, RRDHOST_FLAG_PENDING_CONTEXT_LOAD))) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"host is initializing, retry later",
|
||||
"rejecting streaming connection; host is initializing, retry later",
|
||||
STREAM_STATUS_INITIALIZATION_IN_PROGRESS, NDLP_NOTICE);
|
||||
|
||||
stream_send_error_on_taken_over_connection(rpt, START_STREAMING_ERROR_INITIALIZATION);
|
||||
return false;
|
||||
}
|
||||
|
||||
// system_info has been consumed by the host structure
|
||||
rpt->system_info = NULL;
|
||||
if (unlikely(!stream_control_children_should_be_accepted())) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"rejecting streaming connection; the system is backfilling higher tiers with high-resolution data, retry later",
|
||||
STREAM_STATUS_INITIALIZATION_IN_PROGRESS, NDLP_NOTICE);
|
||||
|
||||
stream_send_error_on_taken_over_connection(rpt, START_STREAMING_ERROR_INITIALIZATION);
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!rrdhost_set_receiver(host, rpt)) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"host is already served by another receiver",
|
||||
"rejecting streaming connection; host is already served by another receiver",
|
||||
STREAM_STATUS_DUPLICATE_RECEIVER, NDLP_INFO);
|
||||
|
||||
stream_send_error_on_taken_over_connection(rpt, START_STREAMING_ERROR_ALREADY_STREAMING);
|
||||
|
@ -174,7 +187,7 @@ static bool stream_receiver_send_first_response(struct receiver_state *rpt) {
|
|||
}
|
||||
|
||||
#ifdef NETDATA_INTERNAL_CHECKS
|
||||
netdata_log_info("STREAM '%s' [receive from [%s]:%s]: "
|
||||
netdata_log_info("STREAM RECEIVE '%s' [from [%s]:%s]: "
|
||||
"client willing to stream metrics for host '%s' with machine_guid '%s': "
|
||||
"update every = %d, history = %d, memory mode = %s, health %s,%s"
|
||||
, rpt->hostname
|
||||
|
@ -395,7 +408,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
if(!rpt->key || !*rpt->key) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"request without an API key, rejecting connection",
|
||||
"rejecting streaming connection; request without an API key",
|
||||
STREAM_STATUS_NO_API_KEY, NDLP_WARNING);
|
||||
|
||||
stream_receiver_free(rpt);
|
||||
|
@ -405,7 +418,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
if(!rpt->hostname || !*rpt->hostname) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"request without a hostname, rejecting connection",
|
||||
"rejecting streaming connection; request without a hostname",
|
||||
STREAM_STATUS_NO_HOSTNAME, NDLP_WARNING);
|
||||
|
||||
stream_receiver_free(rpt);
|
||||
|
@ -418,7 +431,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
if(!rpt->machine_guid || !*rpt->machine_guid) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"request without a machine GUID, rejecting connection",
|
||||
"rejecting streaming connection; request without a machine UUID",
|
||||
STREAM_STATUS_NO_MACHINE_GUID, NDLP_WARNING);
|
||||
|
||||
stream_receiver_free(rpt);
|
||||
|
@ -431,7 +444,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
if (regenerate_guid(rpt->key, buf) == -1) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"API key is not a valid UUID (use the command uuidgen to generate one)",
|
||||
"rejecting streaming connection; API key is not a valid UUID (use the command uuidgen to generate one)",
|
||||
STREAM_STATUS_INVALID_API_KEY, NDLP_WARNING);
|
||||
|
||||
stream_receiver_free(rpt);
|
||||
|
@ -441,7 +454,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
if (regenerate_guid(rpt->machine_guid, buf) == -1) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"machine GUID is not a valid UUID",
|
||||
"rejecting streaming connection; machine UUID is not a valid UUID",
|
||||
STREAM_STATUS_INVALID_MACHINE_GUID, NDLP_WARNING);
|
||||
|
||||
stream_receiver_free(rpt);
|
||||
|
@ -452,7 +465,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
if(!stream_conf_is_key_type(rpt->key, "api")) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"API key is a machine GUID",
|
||||
"rejecting streaming connection; API key provided is a machine UUID (did you mix them up?)",
|
||||
STREAM_STATUS_INVALID_API_KEY, NDLP_WARNING);
|
||||
|
||||
stream_receiver_free(rpt);
|
||||
|
@ -464,7 +477,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
if(!stream_conf_api_key_is_enabled(rpt->key, false)) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"API key is not enabled",
|
||||
"rejecting streaming connection; API key is not enabled in stream.conf",
|
||||
STREAM_STATUS_API_KEY_DISABLED, NDLP_WARNING);
|
||||
|
||||
stream_receiver_free(rpt);
|
||||
|
@ -474,7 +487,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
if(!stream_conf_api_key_allows_client(rpt->key, w->client_ip)) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"API key is not allowed from this IP",
|
||||
"rejecting streaming connection; API key is not allowed from this IP",
|
||||
STREAM_STATUS_NOT_ALLOWED_IP, NDLP_WARNING);
|
||||
|
||||
stream_receiver_free(rpt);
|
||||
|
@ -484,7 +497,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
if (!stream_conf_is_key_type(rpt->machine_guid, "machine")) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"machine GUID is an API key",
|
||||
"rejecting streaming connection; machine UUID is an API key (did you mix them up?)",
|
||||
STREAM_STATUS_INVALID_MACHINE_GUID, NDLP_WARNING);
|
||||
|
||||
stream_receiver_free(rpt);
|
||||
|
@ -496,7 +509,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
if(!stream_conf_api_key_is_enabled(rpt->machine_guid, true)) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"machine GUID is not enabled",
|
||||
"rejecting streaming connection; machine UUID is not enabled in stream.conf",
|
||||
STREAM_STATUS_MACHINE_GUID_DISABLED, NDLP_WARNING);
|
||||
|
||||
stream_receiver_free(rpt);
|
||||
|
@ -506,7 +519,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
if(!stream_conf_api_key_allows_client(rpt->machine_guid, w->client_ip)) {
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"machine GUID is not allowed from this IP",
|
||||
"rejecting streaming connection; machine UUID is not allowed from this IP",
|
||||
STREAM_STATUS_NOT_ALLOWED_IP, NDLP_WARNING);
|
||||
|
||||
stream_receiver_free(rpt);
|
||||
|
@ -518,7 +531,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
|
||||
stream_receiver_log_status(
|
||||
rpt,
|
||||
"machine GUID is my own",
|
||||
"rejecting streaming connection; machine UUID is my own",
|
||||
STREAM_STATUS_LOCALHOST, NDLP_DEBUG);
|
||||
|
||||
char initial_response[HTTP_HEADER_SIZE + 1];
|
||||
|
@ -551,7 +564,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
|
||||
char msg[100 + 1];
|
||||
snprintfz(msg, sizeof(msg) - 1,
|
||||
"rate limit, will accept new connection in %ld secs",
|
||||
"rejecting streaming connection; rate limit, will accept new connection in %ld secs",
|
||||
(long)(web_client_streaming_rate_t - (now - last_stream_accepted_t)));
|
||||
|
||||
stream_receiver_log_status(rpt, msg, STREAM_STATUS_RATE_LIMIT, NDLP_NOTICE);
|
||||
|
@ -616,7 +629,7 @@ int stream_receiver_accept_connection(struct web_client *w, char *decoded_query_
|
|||
|
||||
char msg[200 + 1];
|
||||
snprintfz(msg, sizeof(msg) - 1,
|
||||
"multiple connections for same host, "
|
||||
"rejecting streaming connection; multiple connections for same host, "
|
||||
"old connection was last used %ld secs ago%s",
|
||||
age, receiver_stale ? " (signaled old receiver to stop)" : " (new connection not accepted)");
|
||||
|
||||
|
|
|
@ -35,6 +35,8 @@ struct receiver_state {
|
|||
struct buffered_reader reader;
|
||||
|
||||
struct {
|
||||
bool draining_input; // used exclusively by the stream thread
|
||||
|
||||
// The parser pointer is safe to read and use, only when having the host receiver lock.
|
||||
// Without this lock, the data pointed by the pointer may vanish randomly.
|
||||
// Also, since the receiver sets it when it starts, it should be read with
|
||||
|
@ -88,7 +90,6 @@ void stream_receiver_log_status(struct receiver_state *rpt, const char *msg, con
|
|||
void stream_receiver_free(struct receiver_state *rpt);
|
||||
bool stream_receiver_signal_to_stop_and_wait(RRDHOST *host, STREAM_HANDSHAKE reason);
|
||||
|
||||
ssize_t send_to_child(const char *txt, void *data, STREAM_TRAFFIC_TYPE type);
|
||||
void stream_receiver_send_opcode(struct receiver_state *rpt, struct stream_opcode msg);
|
||||
void stream_receiver_handle_op(struct stream_thread *sth, struct receiver_state *rpt, struct stream_opcode *msg);
|
||||
|
||||
|
|
|
@ -147,14 +147,18 @@ static inline decompressor_status_t receiver_feed_decompressor(struct receiver_s
|
|||
stream_decompressor_start(&r->thread.compressed.decompressor, buf + start, signature_size);
|
||||
|
||||
if (unlikely(!compressed_message_size)) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "multiplexed uncompressed data in compressed stream!");
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE[x] '%s' [from [%s]:%s]: multiplexed uncompressed data in compressed stream!",
|
||||
rrdhost_hostname(r->host), r->client_ip, r->client_port);
|
||||
return DECOMPRESS_FAILED;
|
||||
}
|
||||
|
||||
if(unlikely(compressed_message_size > COMPRESSION_MAX_MSG_SIZE)) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"received a compressed message of %zu bytes, which is bigger than the max compressed message "
|
||||
"STREAM RECEIVE[x] '%s' [from [%s]:%s]: received a compressed message of %zu bytes, "
|
||||
"which is bigger than the max compressed message "
|
||||
"size supported of %zu. Ignoring message.",
|
||||
rrdhost_hostname(r->host), r->client_ip, r->client_port,
|
||||
compressed_message_size, (size_t)COMPRESSION_MAX_MSG_SIZE);
|
||||
return DECOMPRESS_FAILED;
|
||||
}
|
||||
|
@ -169,7 +173,9 @@ static inline decompressor_status_t receiver_feed_decompressor(struct receiver_s
|
|||
stream_decompress(&r->thread.compressed.decompressor, buf + start + signature_size, compressed_message_size);
|
||||
|
||||
if (unlikely(!bytes_to_parse)) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "no bytes to parse.");
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE[x] '%s' [from [%s]:%s]: no bytes to decompress.",
|
||||
rrdhost_hostname(r->host), r->client_ip, r->client_port);
|
||||
return DECOMPRESS_FAILED;
|
||||
}
|
||||
|
||||
|
@ -259,9 +265,9 @@ void stream_receiver_handle_op(struct stream_thread *sth, struct receiver_state
|
|||
STREAM_CIRCULAR_BUFFER_STATS stats = *stream_circular_buffer_stats_unsafe(rpt->thread.send_to_child.scb);
|
||||
spinlock_unlock(&rpt->thread.send_to_child.spinlock);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE[%zu] %s [from %s]: send buffer is full (buffer size %u, max %u, used %u, available %u). "
|
||||
"STREAM RECEIVE[%zu] '%s' [from [%s]:%s]: send buffer is full (buffer size %u, max %u, used %u, available %u). "
|
||||
"Restarting connection.",
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip,
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port,
|
||||
stats.bytes_size, stats.bytes_max_size, stats.bytes_outstanding, stats.bytes_available);
|
||||
|
||||
stream_receiver_remove(sth, rpt, "receiver send buffer overflow");
|
||||
|
@ -272,7 +278,7 @@ void stream_receiver_handle_op(struct stream_thread *sth, struct receiver_state
|
|||
"STREAM RECEIVE[%zu]: invalid msg id %u", sth->id, (unsigned)msg->opcode);
|
||||
}
|
||||
|
||||
ssize_t send_to_child(const char *txt, void *data, STREAM_TRAFFIC_TYPE type) {
|
||||
static ssize_t send_to_child(const char *txt, void *data, STREAM_TRAFFIC_TYPE type) {
|
||||
struct receiver_state *rpt = data;
|
||||
if(!rpt || rpt->thread.meta.type != POLLFD_TYPE_RECEIVER || !rpt->thread.send_to_child.scb)
|
||||
return 0;
|
||||
|
@ -286,7 +292,8 @@ ssize_t send_to_child(const char *txt, void *data, STREAM_TRAFFIC_TYPE type) {
|
|||
|
||||
size_t size = strlen(txt);
|
||||
ssize_t rc = (ssize_t)size;
|
||||
if(!stream_circular_buffer_add_unsafe(scb, txt, size, size, type)) {
|
||||
if(!stream_circular_buffer_add_unsafe(scb, txt, size, size, type, true)) {
|
||||
// should never happen, because of autoscaling
|
||||
msg.opcode = STREAM_OPCODE_RECEIVER_BUFFER_OVERFLOW;
|
||||
rc = -1;
|
||||
}
|
||||
|
@ -315,14 +322,10 @@ static void streaming_parser_init(struct receiver_state *rpt) {
|
|||
// put the client IP and port into the buffers used by plugins.d
|
||||
{
|
||||
char buf[CONFIG_MAX_NAME];
|
||||
snprintfz(buf, sizeof(buf), "%s:%s", rpt->client_ip, rpt->client_port);
|
||||
snprintfz(buf, sizeof(buf), "[%s]:%s", rpt->client_ip, rpt->client_port);
|
||||
string_freez(rpt->thread.cd.id);
|
||||
rpt->thread.cd.id = string_strdupz(buf);
|
||||
}
|
||||
|
||||
{
|
||||
char buf[FILENAME_MAX + 1];
|
||||
snprintfz(buf, sizeof(buf), "%s:%s", rpt->client_ip, rpt->client_port);
|
||||
string_freez(rpt->thread.cd.filename);
|
||||
rpt->thread.cd.filename = string_strdupz(buf);
|
||||
|
||||
|
@ -384,6 +387,23 @@ static void streaming_parser_init(struct receiver_state *rpt) {
|
|||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
static void stream_receive_log_database_gap(struct receiver_state *rpt) {
|
||||
RRDHOST *host = rpt->host;
|
||||
|
||||
time_t now = now_realtime_sec();
|
||||
time_t last_db_entry = 0;
|
||||
rrdhost_retention(host, now, false, NULL, &last_db_entry);
|
||||
|
||||
if(now < last_db_entry)
|
||||
last_db_entry = now;
|
||||
|
||||
char buf[128];
|
||||
duration_snprintf(buf, sizeof(buf), now - last_db_entry, "s", true);
|
||||
nd_log(NDLS_DAEMON, NDLP_NOTICE,
|
||||
"STREAM RECEIVE '%s' [from [%s]:%s]: node connected; last sample in the database %s ago",
|
||||
rrdhost_hostname(host), rpt->client_ip, rpt->client_port, buf);
|
||||
}
|
||||
|
||||
void stream_receiver_move_queue_to_running_unsafe(struct stream_thread *sth) {
|
||||
internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ );
|
||||
|
||||
|
@ -404,8 +424,8 @@ void stream_receiver_move_queue_to_running_unsafe(struct stream_thread *sth) {
|
|||
ND_LOG_STACK_PUSH(lgs);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM RECEIVE[%zu] [%s]: moving host from receiver queue to receiver running...",
|
||||
sth->id, rrdhost_hostname(rpt->host));
|
||||
"STREAM RECEIVE[%zu] '%s' [from [%s]:%s]: moving host from receiver queue to receiver running...",
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port);
|
||||
|
||||
rpt->host->stream.rcv.status.tid = gettid_cached();
|
||||
rpt->thread.meta.type = POLLFD_TYPE_RECEIVER;
|
||||
|
@ -413,9 +433,6 @@ void stream_receiver_move_queue_to_running_unsafe(struct stream_thread *sth) {
|
|||
|
||||
spinlock_lock(&rpt->thread.send_to_child.spinlock);
|
||||
rpt->thread.send_to_child.scb = stream_circular_buffer_create();
|
||||
|
||||
// this should be big enough to fit all the replies to the replication requests we may receive in a batch
|
||||
stream_circular_buffer_set_max_size_unsafe(rpt->thread.send_to_child.scb, 100 * 1024 * 1024, true);
|
||||
rpt->thread.send_to_child.msg.thread_slot = (int32_t)sth->id;
|
||||
rpt->thread.send_to_child.msg.session = os_random32();
|
||||
rpt->thread.send_to_child.msg.meta = &rpt->thread.meta;
|
||||
|
@ -430,7 +447,12 @@ void stream_receiver_move_queue_to_running_unsafe(struct stream_thread *sth) {
|
|||
rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port, rpt->sock.fd);
|
||||
|
||||
if(!nd_poll_add(sth->run.ndpl, rpt->sock.fd, ND_POLL_READ, &rpt->thread.meta))
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "Failed to add receiver socket to nd_poll()");
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE[%zu] '%s' [from [%s]:%s]:"
|
||||
"Failed to add receiver socket to nd_poll()",
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port);
|
||||
|
||||
stream_receive_log_database_gap(rpt);
|
||||
|
||||
// keep this last, since it sends commands back to the child
|
||||
streaming_parser_init(rpt);
|
||||
|
@ -508,6 +530,8 @@ static void stream_receiver_remove(struct stream_thread *sth, struct receiver_st
|
|||
|
||||
static ssize_t
|
||||
stream_receive_and_process(struct stream_thread *sth, struct receiver_state *rpt, PARSER *parser, bool *removed) {
|
||||
internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__);
|
||||
|
||||
ssize_t rc;
|
||||
if(rpt->thread.compressed.enabled) {
|
||||
rc = receiver_read_compressed(rpt);
|
||||
|
@ -589,10 +613,10 @@ stream_receive_and_process(struct stream_thread *sth, struct receiver_state *rpt
|
|||
}
|
||||
|
||||
// process poll() events for streaming receivers
|
||||
void stream_receive_process_poll_events(struct stream_thread *sth, struct receiver_state *rpt, nd_poll_event_t events, usec_t now_ut)
|
||||
// returns true when the receiver is still there, false if it removed it
|
||||
bool stream_receive_process_poll_events(struct stream_thread *sth, struct receiver_state *rpt, nd_poll_event_t events, usec_t now_ut)
|
||||
{
|
||||
internal_fatal(
|
||||
sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__);
|
||||
internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__);
|
||||
|
||||
PARSER *parser = __atomic_load_n(&rpt->thread.parser, __ATOMIC_RELAXED);
|
||||
ND_LOG_STACK lgs[] = {
|
||||
|
@ -612,7 +636,7 @@ void stream_receive_process_poll_events(struct stream_thread *sth, struct receiv
|
|||
if (receiver_should_stop(rpt)) {
|
||||
receiver_set_exit_reason(rpt, rpt->exit.reason, false);
|
||||
stream_receiver_remove(sth, rpt, "received stop signal");
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (unlikely(events & (ND_POLL_ERROR | ND_POLL_HUP | ND_POLL_INVALID))) {
|
||||
|
@ -631,109 +655,122 @@ void stream_receive_process_poll_events(struct stream_thread *sth, struct receiv
|
|||
|
||||
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SOCKET_ERROR);
|
||||
|
||||
nd_log(
|
||||
NDLS_DAEMON,
|
||||
NDLP_ERR,
|
||||
"STREAM RECEIVE[%zu] %s [from %s]: %s - closing connection",
|
||||
sth->id,
|
||||
rrdhost_hostname(rpt->host),
|
||||
rpt->client_ip,
|
||||
error);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE[%zu] '%s' [from [%s]:%s]: %s - closing connection",
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port, error);
|
||||
|
||||
receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR, false);
|
||||
stream_receiver_remove(sth, rpt, error);
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (events & ND_POLL_WRITE) {
|
||||
worker_is_busy(WORKER_STREAM_JOB_SOCKET_SEND);
|
||||
|
||||
if (spinlock_trylock(&rpt->thread.send_to_child.spinlock)) {
|
||||
const char *disconnect_reason = NULL;
|
||||
STREAM_HANDSHAKE reason;
|
||||
bool stop = false;
|
||||
while(!stop) {
|
||||
if (spinlock_trylock(&rpt->thread.send_to_child.spinlock)) {
|
||||
const char *disconnect_reason = NULL;
|
||||
STREAM_HANDSHAKE reason;
|
||||
|
||||
char *chunk;
|
||||
STREAM_CIRCULAR_BUFFER *scb = rpt->thread.send_to_child.scb;
|
||||
STREAM_CIRCULAR_BUFFER_STATS *stats = stream_circular_buffer_stats_unsafe(scb);
|
||||
size_t outstanding = stream_circular_buffer_get_unsafe(scb, &chunk);
|
||||
ssize_t rc = write_stream(rpt, chunk, outstanding);
|
||||
if (likely(rc > 0)) {
|
||||
stream_circular_buffer_del_unsafe(scb, rc);
|
||||
if (!stats->bytes_outstanding) {
|
||||
if (!nd_poll_upd(sth->run.ndpl, rpt->sock.fd, ND_POLL_READ, &rpt->thread.meta))
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM RECEIVE: cannot update nd_poll()");
|
||||
char *chunk;
|
||||
STREAM_CIRCULAR_BUFFER *scb = rpt->thread.send_to_child.scb;
|
||||
STREAM_CIRCULAR_BUFFER_STATS *stats = stream_circular_buffer_stats_unsafe(scb);
|
||||
size_t outstanding = stream_circular_buffer_get_unsafe(scb, &chunk);
|
||||
ssize_t rc = write_stream(rpt, chunk, outstanding);
|
||||
if (likely(rc > 0)) {
|
||||
stream_circular_buffer_del_unsafe(scb, rc);
|
||||
if (!stats->bytes_outstanding) {
|
||||
if (!nd_poll_upd(sth->run.ndpl, rpt->sock.fd, ND_POLL_READ, &rpt->thread.meta))
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE[%zu] '%s' [from [%s]:%s]: cannot update nd_poll()",
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port);
|
||||
|
||||
// recreate the circular buffer if we have to
|
||||
stream_circular_buffer_recreate_timed_unsafe(rpt->thread.send_to_child.scb, now_ut, false);
|
||||
// recreate the circular buffer if we have to
|
||||
stream_circular_buffer_recreate_timed_unsafe(rpt->thread.send_to_child.scb, now_ut, false);
|
||||
stop = true;
|
||||
}
|
||||
else if(stream_thread_process_opcodes(sth, &rpt->thread.meta))
|
||||
stop = true;
|
||||
}
|
||||
} else if (rc == 0 || errno == ECONNRESET) {
|
||||
disconnect_reason = "socket reports EOF (closed by child)";
|
||||
reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END;
|
||||
} else if (rc < 0) {
|
||||
if (errno == EWOULDBLOCK || errno == EAGAIN || errno == EINTR)
|
||||
// will try later
|
||||
;
|
||||
else {
|
||||
disconnect_reason = "socket reports error while writing";
|
||||
reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED;
|
||||
else if (rc == 0 || errno == ECONNRESET) {
|
||||
disconnect_reason = "socket reports EOF (closed by child)";
|
||||
reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END;
|
||||
}
|
||||
else if (rc < 0) {
|
||||
if (errno == EWOULDBLOCK || errno == EAGAIN || errno == EINTR)
|
||||
// will try later
|
||||
stop = true;
|
||||
else {
|
||||
disconnect_reason = "socket reports error while writing";
|
||||
reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED;
|
||||
}
|
||||
}
|
||||
spinlock_unlock(&rpt->thread.send_to_child.spinlock);
|
||||
|
||||
if (disconnect_reason) {
|
||||
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE[%zu] '%s' [from [%s]:%s]: %s (%zd, on fd %d) - closing connection - "
|
||||
"we have sent %zu bytes in %zu operations.",
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port,
|
||||
disconnect_reason, rc, rpt->sock.fd, stats->bytes_sent, stats->sends);
|
||||
|
||||
receiver_set_exit_reason(rpt, reason, false);
|
||||
stream_receiver_remove(sth, rpt, disconnect_reason);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
spinlock_unlock(&rpt->thread.send_to_child.spinlock);
|
||||
|
||||
if (disconnect_reason) {
|
||||
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE[%zu] %s [from %s]: %s (%zd, on fd %d) - closing connection - "
|
||||
"we have sent %zu bytes in %zu operations.",
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, disconnect_reason, rc, rpt->sock.fd,
|
||||
stats->bytes_sent, stats->sends);
|
||||
|
||||
receiver_set_exit_reason(rpt, reason, false);
|
||||
stream_receiver_remove(sth, rpt, disconnect_reason);
|
||||
return;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!(events & ND_POLL_READ))
|
||||
return;
|
||||
return true;
|
||||
|
||||
// we can receive data from this socket
|
||||
|
||||
worker_is_busy(WORKER_STREAM_JOB_SOCKET_RECEIVE);
|
||||
bool removed = false;
|
||||
while(!removed) {
|
||||
bool removed = false, stop = false;
|
||||
size_t iterations = 0;
|
||||
while(!removed && !stop && iterations++ < MAX_IO_ITERATIONS_PER_EVENT) {
|
||||
ssize_t rc = stream_receive_and_process(sth, rpt, parser, &removed);
|
||||
if (likely(rc > 0)) {
|
||||
rpt->last_msg_t = (time_t)(now_ut / USEC_PER_SEC);
|
||||
|
||||
if(stream_thread_process_opcodes(sth, &rpt->thread.meta))
|
||||
stop = true;
|
||||
}
|
||||
else if (rc == 0 || errno == ECONNRESET) {
|
||||
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_REMOTE_CLOSED);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE[%zu] %s [from %s]: socket %d reports EOF (closed by child).",
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rpt->sock.fd);
|
||||
"STREAM RECEIVE[%zu] '%s' [from [%s]:%s]: socket %d reports EOF (closed by child).",
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port, rpt->sock.fd);
|
||||
receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END, false);
|
||||
stream_receiver_remove(sth, rpt, "socket reports EOF (closed by child)");
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
else if (rc < 0) {
|
||||
if(removed)
|
||||
return;
|
||||
return false;
|
||||
|
||||
else if ((errno == EWOULDBLOCK || errno == EAGAIN || errno == EINTR))
|
||||
// will try later
|
||||
break;
|
||||
stop = true;
|
||||
else {
|
||||
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE[%zu] %s [from %s]: error during receive (%zd, on fd %d) - closing connection.",
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rc, rpt->sock.fd);
|
||||
"STREAM RECEIVE[%zu] '%s' [from [%s]:%s]: error during receive (%zd, on fd %d) - closing connection.",
|
||||
sth->id, rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port, rc, rpt->sock.fd);
|
||||
receiver_set_exit_reason(rpt, STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED, false);
|
||||
stream_receiver_remove(sth, rpt, "error during receive");
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return !removed;
|
||||
}
|
||||
|
||||
void stream_receiver_cleanup(struct stream_thread *sth) {
|
||||
|
@ -782,8 +819,9 @@ bool rrdhost_set_receiver(RRDHOST *host, struct receiver_state *rpt) {
|
|||
if (rpt->config.health.delay > 0) {
|
||||
host->health.delay_up_to = now_realtime_sec() + rpt->config.health.delay;
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"[%s]: Postponing health checks for %" PRId64 " seconds, because it was just connected.",
|
||||
rrdhost_hostname(host),
|
||||
"STREAM RECEIVE '%s' [from [%s]:%s]: "
|
||||
"Postponing health checks for %" PRId64 " seconds, because it was just connected.",
|
||||
rrdhost_hostname(host), rpt->client_ip, rpt->client_port,
|
||||
(int64_t) rpt->config.health.delay);
|
||||
}
|
||||
}
|
||||
|
@ -797,7 +835,7 @@ bool rrdhost_set_receiver(RRDHOST *host, struct receiver_state *rpt) {
|
|||
signal_rrdcontext = true;
|
||||
stream_receiver_replication_reset(host);
|
||||
|
||||
rrdhost_flag_clear(rpt->host, RRDHOST_FLAG_STREAM_RECEIVER_DISCONNECTED);
|
||||
rrdhost_flag_set(rpt->host, RRDHOST_FLAG_COLLECTOR_ONLINE);
|
||||
aclk_queue_node_info(rpt->host, true);
|
||||
|
||||
rrdhost_stream_parents_reset(host, STREAM_HANDSHAKE_PREPARING);
|
||||
|
@ -810,6 +848,9 @@ bool rrdhost_set_receiver(RRDHOST *host, struct receiver_state *rpt) {
|
|||
if(signal_rrdcontext)
|
||||
rrdcontext_host_child_connected(host);
|
||||
|
||||
if(set_this)
|
||||
ml_host_start(host);
|
||||
|
||||
return set_this;
|
||||
}
|
||||
|
||||
|
@ -822,11 +863,12 @@ void rrdhost_clear_receiver(struct receiver_state *rpt) {
|
|||
// Make sure that we detach this thread and don't kill a freshly arriving receiver
|
||||
|
||||
if (host->receiver == rpt) {
|
||||
rrdhost_flag_set(host, RRDHOST_FLAG_STREAM_RECEIVER_DISCONNECTED);
|
||||
rrdhost_flag_clear(host, RRDHOST_FLAG_COLLECTOR_ONLINE);
|
||||
rrdhost_receiver_unlock(host);
|
||||
{
|
||||
// run all these without having the receiver lock
|
||||
|
||||
ml_host_stop(host);
|
||||
stream_path_child_disconnected(host);
|
||||
stream_sender_signal_to_stop_and_wait(host, STREAM_HANDSHAKE_DISCONNECT_RECEIVER_LEFT, false);
|
||||
stream_receiver_replication_reset(host);
|
||||
|
|
|
@ -21,14 +21,16 @@ void sender_commit_thread_buffer_free(void) {
|
|||
// Collector thread starting a transmission
|
||||
BUFFER *sender_commit_start_with_trace(struct sender_state *s __maybe_unused, struct sender_buffer *commit, const char *func) {
|
||||
if(unlikely(commit->used))
|
||||
fatal("STREAMING: thread buffer is used multiple times concurrently (%u). "
|
||||
fatal("STREAM SEND '%s' [to %s]: thread buffer is used multiple times concurrently (%u). "
|
||||
"It is already being used by '%s()', and now is called by '%s()'",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
(unsigned)commit->used,
|
||||
commit->last_function ? commit->last_function : "(null)",
|
||||
func ? func : "(null)");
|
||||
|
||||
if(unlikely(commit->receiver_tid && commit->receiver_tid != gettid_cached()))
|
||||
fatal("STREAMING: thread buffer is reserved for tid %d, but it used by thread %d function '%s()'.",
|
||||
fatal("STREAM SEND '%s' [to %s]: thread buffer is reserved for tid %d, but it used by thread %d function '%s()'.",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
commit->receiver_tid, gettid_cached(), func ? func : "(null)");
|
||||
|
||||
if(unlikely(commit->wb &&
|
||||
|
@ -81,11 +83,12 @@ void sender_buffer_commit(struct sender_state *s, BUFFER *wb, struct sender_buff
|
|||
return;
|
||||
}
|
||||
|
||||
if (unlikely(stream_circular_buffer_set_max_size_unsafe(s->scb, src_len, false))) {
|
||||
if (unlikely(stream_circular_buffer_set_max_size_unsafe(
|
||||
s->scb, src_len * STREAM_CIRCULAR_BUFFER_ADAPT_TO_TIMES_MAX_SIZE, false))) {
|
||||
// adaptive sizing of the circular buffer
|
||||
nd_log(NDLS_DAEMON, NDLP_NOTICE,
|
||||
"STREAM SEND %s [to %s]: Increased max buffer size to %u (message size %zu).",
|
||||
rrdhost_hostname(s->host), s->connected_to, stats->bytes_max_size, buffer_strlen(wb) + 1);
|
||||
"STREAM SEND '%s' [to %s]: Increased max buffer size to %u (message size %zu).",
|
||||
rrdhost_hostname(s->host), s->connected_to, stats->bytes_max_size, src_len + 1);
|
||||
}
|
||||
|
||||
stream_sender_log_payload(s, wb, type, false);
|
||||
|
@ -123,7 +126,7 @@ void sender_buffer_commit(struct sender_state *s, BUFFER *wb, struct sender_buff
|
|||
size_t dst_len = stream_compress(&s->compressor, src, size_to_compress, &dst);
|
||||
if (!dst_len) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM %s [send to %s]: COMPRESSION failed. Resetting compressor and re-trying",
|
||||
"STREAM SEND '%s' [to %s]: COMPRESSION failed. Resetting compressor and re-trying",
|
||||
rrdhost_hostname(s->host), s->connected_to);
|
||||
|
||||
stream_compression_initialize(s);
|
||||
|
@ -139,13 +142,16 @@ void sender_buffer_commit(struct sender_state *s, BUFFER *wb, struct sender_buff
|
|||
size_t decoded_dst_len = stream_decompress_decode_signature((const char *)&signature, sizeof(signature));
|
||||
if (decoded_dst_len != dst_len)
|
||||
fatal(
|
||||
"STREAM COMPRESSION: invalid signature, original payload %zu bytes, "
|
||||
"STREAM SEND '%s' [to %s]: invalid signature, original payload %zu bytes, "
|
||||
"compressed payload length %zu bytes, but signature says payload is %zu bytes",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
size_to_compress, dst_len, decoded_dst_len);
|
||||
#endif
|
||||
|
||||
if (!stream_circular_buffer_add_unsafe(s->scb, (const char *)&signature, sizeof(signature), sizeof(signature), type) ||
|
||||
!stream_circular_buffer_add_unsafe(s->scb, dst, dst_len, size_to_compress, type))
|
||||
if (!stream_circular_buffer_add_unsafe(s->scb, (const char *)&signature, sizeof(signature),
|
||||
sizeof(signature), type, false) ||
|
||||
!stream_circular_buffer_add_unsafe(s->scb, dst, dst_len,
|
||||
size_to_compress, type, false))
|
||||
goto overflow_with_lock;
|
||||
|
||||
src = src + size_to_compress;
|
||||
|
@ -155,7 +161,8 @@ void sender_buffer_commit(struct sender_state *s, BUFFER *wb, struct sender_buff
|
|||
else {
|
||||
// uncompressed traffic
|
||||
|
||||
if (!stream_circular_buffer_add_unsafe(s->scb, src, src_len, src_len, type))
|
||||
if (!stream_circular_buffer_add_unsafe(s->scb, src, src_len,
|
||||
src_len, type, false))
|
||||
goto overflow_with_lock;
|
||||
}
|
||||
|
||||
|
@ -179,11 +186,12 @@ overflow_with_lock: {
|
|||
stream_sender_unlock(s);
|
||||
msg.opcode = STREAM_OPCODE_SENDER_BUFFER_OVERFLOW;
|
||||
stream_sender_send_opcode(s, msg);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM %s [send to %s]: buffer overflow (buffer size %u, max size %u, used %u, available %u). "
|
||||
"Restarting connection.",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
stats->bytes_size, stats->bytes_max_size, stats->bytes_outstanding, stats->bytes_available);
|
||||
nd_log_limit_static_global_var(erl, 1, 0);
|
||||
nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND '%s' [to %s]: buffer overflow (buffer size %u, max size %u, used %u, available %u). "
|
||||
"Restarting connection.",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
stats->bytes_size, stats->bytes_max_size, stats->bytes_outstanding, stats->bytes_available);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -193,9 +201,11 @@ compression_failed_with_lock: {
|
|||
stream_sender_unlock(s);
|
||||
msg.opcode = STREAM_OPCODE_SENDER_RECONNECT_WITHOUT_COMPRESSION;
|
||||
stream_sender_send_opcode(s, msg);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM %s [send to %s]: COMPRESSION failed (twice). Deactivating compression and restarting connection.",
|
||||
rrdhost_hostname(s->host), s->connected_to);
|
||||
nd_log_limit_static_global_var(erl, 1, 0);
|
||||
nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND '%s' [to %s]: COMPRESSION failed (twice). "
|
||||
"Deactivating compression and restarting connection.",
|
||||
rrdhost_hostname(s->host), s->connected_to);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -203,10 +213,12 @@ void sender_thread_commit(struct sender_state *s, BUFFER *wb, STREAM_TRAFFIC_TYP
|
|||
struct sender_buffer *commit = (wb == commit___thread.wb) ? & commit___thread : &s->host->stream.snd.commit;
|
||||
|
||||
if (unlikely(wb != commit->wb))
|
||||
fatal("STREAMING: function '%s()' is trying to commit an unknown commit buffer.", func);
|
||||
fatal("STREAM SEND '%s' [to %s]: function '%s()' is trying to commit an unknown commit buffer.",
|
||||
rrdhost_hostname(s->host), s->connected_to, func);
|
||||
|
||||
if (unlikely(!commit->used))
|
||||
fatal("STREAMING: function '%s()' is committing a sender buffer twice.", func);
|
||||
fatal("STREAM SEND '%s' [to %s]: function '%s()' is committing a sender buffer twice.",
|
||||
rrdhost_hostname(s->host), s->connected_to, func);
|
||||
|
||||
commit->used = false;
|
||||
commit->last_function = NULL;
|
||||
|
|
|
@ -26,7 +26,7 @@ static void stream_execute_function_callback(BUFFER *func_wb, int code, void *da
|
|||
|
||||
sender_commit_clean_buffer(s, wb, STREAM_TRAFFIC_TYPE_FUNCTIONS);
|
||||
|
||||
internal_error(true, "STREAM %s [send to %s] FUNCTION transaction %s sending back response (%zu bytes, %"PRIu64" usec).",
|
||||
internal_error(true, "STREAM SEND '%s' [to %s]: FUNCTION transaction %s sending back response (%zu bytes, %"PRIu64" usec).",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
string2str(tmp->transaction),
|
||||
buffer_strlen(func_wb),
|
||||
|
@ -57,7 +57,7 @@ static void execute_commands_function(struct sender_state *s, const char *comman
|
|||
nd_log(NDLS_ACCESS, NDLP_INFO, NULL);
|
||||
|
||||
if(!transaction || !*transaction || !timeout_s || !*timeout_s || !function || !*function) {
|
||||
netdata_log_error("STREAM %s [send to %s] %s execution command is incomplete (transaction = '%s', timeout = '%s', function = '%s'). Ignoring it.",
|
||||
netdata_log_error("STREAM SEND '%s' [to %s]: %s execution command is incomplete (transaction = '%s', timeout = '%s', function = '%s'). Ignoring it.",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
command,
|
||||
transaction?transaction:"(unset)",
|
||||
|
@ -110,7 +110,10 @@ static void execute_deferred_json(struct sender_state *s, void *data) {
|
|||
if(strcmp(keyword, PLUGINSD_KEYWORD_JSON_CMD_STREAM_PATH) == 0)
|
||||
stream_path_set_from_json(s->host, buffer_tostring(s->defer.payload), true);
|
||||
else
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM: unknown JSON keyword '%s' with payload: %s", keyword, buffer_tostring(s->defer.payload));
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND '%s' [to %s]: unknown JSON keyword '%s' with payload: %s",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
keyword, buffer_tostring(s->defer.payload));
|
||||
}
|
||||
|
||||
static void cleanup_deferred_json(struct sender_state *s __maybe_unused, void *data) {
|
||||
|
@ -274,7 +277,7 @@ void stream_sender_execute_commands(struct sender_state *s) {
|
|||
const char *before = get_word(s->rbuf.line.words, s->rbuf.line.num_words, 4);
|
||||
|
||||
if (!chart_id || !start_streaming || !after || !before) {
|
||||
netdata_log_error("STREAM %s [send to %s] %s command is incomplete"
|
||||
netdata_log_error("STREAM SEND '%s' [to %s] %s command is incomplete"
|
||||
" (chart=%s, start_streaming=%s, after=%s, before=%s)",
|
||||
rrdhost_hostname(s->host), s->connected_to,
|
||||
command,
|
||||
|
@ -310,7 +313,7 @@ void stream_sender_execute_commands(struct sender_state *s) {
|
|||
s->defer.action_data = strdupz(keyword);
|
||||
}
|
||||
else {
|
||||
netdata_log_error("STREAM %s [send to %s] received unknown command over connection: %s",
|
||||
netdata_log_error("STREAM SEND '%s' [to %s] received unknown command over connection: %s",
|
||||
rrdhost_hostname(s->host), s->connected_to, s->rbuf.line.words[0]?s->rbuf.line.words[0]:"(unset)");
|
||||
}
|
||||
|
||||
|
|
|
@ -40,6 +40,8 @@ struct sender_state {
|
|||
ND_SOCK sock;
|
||||
|
||||
struct {
|
||||
bool draining_input; // used exclusively by the stream thread
|
||||
|
||||
struct stream_opcode msg; // the template for sending a message to the dispatcher - protected by sender_lock()
|
||||
|
||||
// this is a property of stream_sender_send_msg_to_dispatcher()
|
||||
|
|
|
@ -89,7 +89,7 @@ void stream_sender_on_connect(struct sender_state *s) {
|
|||
|
||||
static void stream_sender_on_ready_to_dispatch(struct sender_state *s) {
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM SEND [%s]: running ready-to-dispatch hooks...",
|
||||
"STREAM SEND '%s': running ready-to-dispatch hooks...",
|
||||
rrdhost_hostname(s->host));
|
||||
|
||||
// set this flag before sending any data, or the data will not be sent
|
||||
|
@ -105,7 +105,7 @@ static void stream_sender_on_ready_to_dispatch(struct sender_state *s) {
|
|||
|
||||
static void stream_sender_on_disconnect(struct sender_state *s) {
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM SEND [%s]: running on-disconnect hooks...",
|
||||
"STREAM SEND '%s': running on-disconnect hooks...",
|
||||
rrdhost_hostname(s->host));
|
||||
|
||||
stream_sender_lock(s);
|
||||
|
@ -182,7 +182,7 @@ void stream_sender_handle_op(struct stream_thread *sth, struct sender_state *s,
|
|||
STREAM_CIRCULAR_BUFFER_STATS stats = *stream_circular_buffer_stats_unsafe(s->scb);
|
||||
stream_sender_unlock(s);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] %s [to %s]: send buffer is full (buffer size %u, max %u, used %u, available %u). "
|
||||
"STREAM SEND[%zu] '%s' [to %s]: send buffer is full (buffer size %u, max %u, used %u, available %u). "
|
||||
"Restarting connection.",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to,
|
||||
stats.bytes_size, stats.bytes_max_size, stats.bytes_outstanding, stats.bytes_available);
|
||||
|
@ -203,7 +203,7 @@ void stream_sender_handle_op(struct stream_thread *sth, struct sender_state *s,
|
|||
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_COMPRESSION_ERROR);
|
||||
errno_clear();
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] %s [send to %s]: restarting connection without compression.",
|
||||
"STREAM SEND[%zu] '%s' [to %s]: restarting connection without compression.",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to);
|
||||
|
||||
stream_sender_move_running_to_connector_or_remove(
|
||||
|
@ -245,8 +245,8 @@ void stream_sender_move_queue_to_running_unsafe(struct stream_thread *sth) {
|
|||
ND_LOG_STACK_PUSH(lgs);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM SEND[%zu] [%s]: moving host from dispatcher queue to dispatcher running...",
|
||||
sth->id, rrdhost_hostname(s->host));
|
||||
"STREAM SEND[%zu] '%s' [to %s]: moving host from dispatcher queue to dispatcher running...",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to);
|
||||
|
||||
stream_sender_lock(s);
|
||||
s->thread.meta.type = POLLFD_TYPE_SENDER;
|
||||
|
@ -268,7 +268,9 @@ void stream_sender_move_queue_to_running_unsafe(struct stream_thread *sth) {
|
|||
META_SET(&sth->run.meta, (Word_t)&s->thread.meta, &s->thread.meta);
|
||||
|
||||
if(!nd_poll_add(sth->run.ndpl, s->sock.fd, ND_POLL_READ, &s->thread.meta))
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "Failed to add sender socket to nd_poll()");
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] '%s' [to %s]: failed to add sender socket to nd_poll()",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to);
|
||||
|
||||
stream_sender_on_ready_to_dispatch(s);
|
||||
}
|
||||
|
@ -279,8 +281,8 @@ void stream_sender_remove(struct sender_state *s) {
|
|||
// when it gives up on a certain node
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_NOTICE,
|
||||
"STREAM SEND [%s]: streaming sender removed host: %s",
|
||||
rrdhost_hostname(s->host), stream_handshake_error_to_string(s->exit.reason));
|
||||
"STREAM SEND '%s' [to %s]: streaming sender removed host: %s",
|
||||
rrdhost_hostname(s->host), s->connected_to, stream_handshake_error_to_string(s->exit.reason));
|
||||
|
||||
stream_sender_lock(s);
|
||||
|
||||
|
@ -316,7 +318,9 @@ static void stream_sender_move_running_to_connector_or_remove(struct stream_thre
|
|||
META_DEL(&sth->run.meta, (Word_t)&s->thread.meta);
|
||||
|
||||
if(!nd_poll_del(sth->run.ndpl, s->sock.fd))
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "Failed to delete sender socket from nd_poll()");
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] '%s' [to %s]: failed to delete sender socket from nd_poll()",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to);
|
||||
|
||||
// clear this flag asap, to stop other threads from pushing metrics for this node
|
||||
rrdhost_flag_clear(s->host, RRDHOST_FLAG_STREAM_SENDER_CONNECTED | RRDHOST_FLAG_STREAM_SENDER_READY_4_METRICS);
|
||||
|
@ -331,8 +335,8 @@ static void stream_sender_move_running_to_connector_or_remove(struct stream_thre
|
|||
stream_sender_unlock(s);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_NOTICE,
|
||||
"STREAM SEND [%s]: sender disconnected from parent, reason: %s",
|
||||
rrdhost_hostname(s->host), stream_handshake_error_to_string(reason));
|
||||
"STREAM SEND[%zu] '%s' [to %s]: sender disconnected from parent, reason: %s",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to, stream_handshake_error_to_string(reason));
|
||||
|
||||
nd_sock_close(&s->sock);
|
||||
|
||||
|
@ -398,7 +402,7 @@ void stream_sender_check_all_nodes_from_poll(struct stream_thread *sth, usec_t n
|
|||
size_snprintf(pending, sizeof(pending), stats.bytes_outstanding, "B", false);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] %s [send to %s]: could not send data for %ld seconds - closing connection - "
|
||||
"STREAM SEND[%zu] '%s' [to %s]: could not send data for %ld seconds - closing connection - "
|
||||
"we have sent %zu bytes in %zu operations, it is idle for %s, and we have %s pending to send "
|
||||
"(buffer is used %.2f%%).",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to, stream_send.parents.timeout_s,
|
||||
|
@ -414,7 +418,7 @@ void stream_sender_check_all_nodes_from_poll(struct stream_thread *sth, usec_t n
|
|||
|
||||
if(!nd_poll_upd(sth->run.ndpl, s->sock.fd, ND_POLL_READ | (stats.bytes_outstanding ? ND_POLL_WRITE : 0), &s->thread.meta))
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] %s [send to %s]: failed to update nd_poll().",
|
||||
"STREAM SEND[%zu] '%s' [to %s]: failed to update nd_poll().",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to);
|
||||
}
|
||||
|
||||
|
@ -428,7 +432,9 @@ void stream_sender_check_all_nodes_from_poll(struct stream_thread *sth, usec_t n
|
|||
worker_set_metric(WORKER_SENDER_JOB_BUFFER_RATIO, overall_buffer_ratio);
|
||||
}
|
||||
|
||||
void stream_sender_process_poll_events(struct stream_thread *sth, struct sender_state *s, nd_poll_event_t events, usec_t now_ut) {
|
||||
// process poll() events for streaming senders
|
||||
// returns true when the sender is still there, false if it removed it
|
||||
bool stream_sender_process_poll_events(struct stream_thread *sth, struct sender_state *s, nd_poll_event_t events, usec_t now_ut) {
|
||||
internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ );
|
||||
|
||||
ND_LOG_STACK lgs[] = {
|
||||
|
@ -464,80 +470,90 @@ void stream_sender_process_poll_events(struct stream_thread *sth, struct sender_
|
|||
stream_sender_unlock(s);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] %s [to %s]: %s restarting connection - %zu bytes transmitted in %zu operations.",
|
||||
"STREAM SEND[%zu] '%s' [to %s]: %s restarting connection - %zu bytes transmitted in %zu operations.",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to, error, stats.bytes_sent, stats.sends);
|
||||
|
||||
stream_sender_move_running_to_connector_or_remove(sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_ERROR, true);
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
|
||||
if(events & ND_POLL_WRITE) {
|
||||
// we can send data on this socket
|
||||
|
||||
if(stream_sender_trylock(s)) {
|
||||
worker_is_busy(WORKER_STREAM_JOB_SOCKET_SEND);
|
||||
bool stop = false;
|
||||
while(!stop) {
|
||||
if(stream_sender_trylock(s)) {
|
||||
worker_is_busy(WORKER_STREAM_JOB_SOCKET_SEND);
|
||||
|
||||
const char *disconnect_reason = NULL;
|
||||
STREAM_HANDSHAKE reason;
|
||||
const char *disconnect_reason = NULL;
|
||||
STREAM_HANDSHAKE reason;
|
||||
|
||||
STREAM_CIRCULAR_BUFFER_STATS *stats = stream_circular_buffer_stats_unsafe(s->scb);
|
||||
char *chunk;
|
||||
size_t outstanding = stream_circular_buffer_get_unsafe(s->scb, &chunk);
|
||||
ssize_t rc = nd_sock_send_nowait(&s->sock, chunk, outstanding);
|
||||
if (likely(rc > 0)) {
|
||||
stream_circular_buffer_del_unsafe(s->scb, rc);
|
||||
replication_recalculate_buffer_used_ratio_unsafe(s);
|
||||
s->thread.last_traffic_ut = now_ut;
|
||||
sth->snd.bytes_sent += rc;
|
||||
STREAM_CIRCULAR_BUFFER_STATS *stats = stream_circular_buffer_stats_unsafe(s->scb);
|
||||
char *chunk;
|
||||
size_t outstanding = stream_circular_buffer_get_unsafe(s->scb, &chunk);
|
||||
ssize_t rc = nd_sock_send_nowait(&s->sock, chunk, outstanding);
|
||||
if (likely(rc > 0)) {
|
||||
stream_circular_buffer_del_unsafe(s->scb, rc);
|
||||
replication_recalculate_buffer_used_ratio_unsafe(s);
|
||||
s->thread.last_traffic_ut = now_ut;
|
||||
sth->snd.bytes_sent += rc;
|
||||
|
||||
if (!stats->bytes_outstanding) {
|
||||
// we sent them all - remove ND_POLL_WRITE
|
||||
if (!nd_poll_upd(sth->run.ndpl, s->sock.fd, ND_POLL_READ, &s->thread.meta))
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] %s [send to %s]: failed to update nd_poll().",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to);
|
||||
if (!stats->bytes_outstanding) {
|
||||
// we sent them all - remove ND_POLL_WRITE
|
||||
if (!nd_poll_upd(sth->run.ndpl, s->sock.fd, ND_POLL_READ, &s->thread.meta))
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] '%s' [to %s]: failed to update nd_poll().",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to);
|
||||
|
||||
// recreate the circular buffer if we have to
|
||||
stream_circular_buffer_recreate_timed_unsafe(s->scb, now_ut, false);
|
||||
// recreate the circular buffer if we have to
|
||||
stream_circular_buffer_recreate_timed_unsafe(s->scb, now_ut, false);
|
||||
stop = true;
|
||||
}
|
||||
else if(stream_thread_process_opcodes(sth, &s->thread.meta))
|
||||
stop = true;
|
||||
}
|
||||
else if (rc == 0 || errno == ECONNRESET) {
|
||||
disconnect_reason = "socket reports EOF (closed by parent)";
|
||||
reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END;
|
||||
}
|
||||
else if (rc < 0) {
|
||||
if(errno == EWOULDBLOCK || errno == EAGAIN || errno == EINTR) {
|
||||
// will try later
|
||||
stop = true;
|
||||
}
|
||||
else {
|
||||
disconnect_reason = "socket reports error while writing";
|
||||
reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED;
|
||||
}
|
||||
}
|
||||
stream_sender_unlock(s);
|
||||
|
||||
if (disconnect_reason) {
|
||||
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] '%s' [to %s]: %s (%zd, on fd %d) - restarting connection - "
|
||||
"we have sent %zu bytes in %zu operations.",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to, disconnect_reason, rc, s->sock.fd,
|
||||
stats->bytes_sent, stats->sends);
|
||||
|
||||
stream_sender_move_running_to_connector_or_remove(sth, s, reason, true);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
else if (rc == 0 || errno == ECONNRESET) {
|
||||
disconnect_reason = "socket reports EOF (closed by parent)";
|
||||
reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END;
|
||||
}
|
||||
else if (rc < 0) {
|
||||
if(errno == EWOULDBLOCK || errno == EAGAIN || errno == EINTR)
|
||||
// will try later
|
||||
;
|
||||
else {
|
||||
disconnect_reason = "socket reports error while writing";
|
||||
reason = STREAM_HANDSHAKE_DISCONNECT_SOCKET_WRITE_FAILED;
|
||||
}
|
||||
}
|
||||
stream_sender_unlock(s);
|
||||
|
||||
if (disconnect_reason) {
|
||||
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_SEND_ERROR);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] %s [to %s]: %s (%zd, on fd %d) - restarting connection - "
|
||||
"we have sent %zu bytes in %zu operations.",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to, disconnect_reason, rc, s->sock.fd,
|
||||
stats->bytes_sent, stats->sends);
|
||||
|
||||
stream_sender_move_running_to_connector_or_remove(sth, s, reason, true);
|
||||
|
||||
return;
|
||||
}
|
||||
else
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if(!(events & ND_POLL_READ))
|
||||
return;
|
||||
return true;
|
||||
|
||||
// we can receive data from this socket
|
||||
|
||||
worker_is_busy(WORKER_STREAM_JOB_SOCKET_RECEIVE);
|
||||
while(true) {
|
||||
bool stop = false;
|
||||
size_t iterations = 0;
|
||||
while(!stop && iterations++ < MAX_IO_ITERATIONS_PER_EVENT) {
|
||||
// we have to drain the socket!
|
||||
|
||||
ssize_t rc = nd_sock_revc_nowait(&s->sock, s->rbuf.b + s->rbuf.read_len, sizeof(s->rbuf.b) - s->rbuf.read_len - 1);
|
||||
|
@ -549,31 +565,36 @@ void stream_sender_process_poll_events(struct stream_thread *sth, struct sender_
|
|||
|
||||
worker_is_busy(WORKER_SENDER_JOB_EXECUTE);
|
||||
stream_sender_execute_commands(s);
|
||||
|
||||
if(stream_thread_process_opcodes(sth, &s->thread.meta))
|
||||
stop = true;
|
||||
}
|
||||
else if (rc == 0 || errno == ECONNRESET) {
|
||||
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_REMOTE_CLOSED);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] %s [to %s]: socket %d reports EOF (closed by parent).",
|
||||
"STREAM SEND[%zu] '%s' [to %s]: socket %d reports EOF (closed by parent).",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to, s->sock.fd);
|
||||
stream_sender_move_running_to_connector_or_remove(
|
||||
sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_CLOSED_BY_REMOTE_END, true);
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
else if (rc < 0) {
|
||||
if(errno == EWOULDBLOCK || errno == EAGAIN || errno == EINTR)
|
||||
// will try later
|
||||
break;
|
||||
stop = true;
|
||||
else {
|
||||
worker_is_busy(WORKER_SENDER_JOB_DISCONNECT_RECEIVE_ERROR);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[%zu] %s [to %s]: error during receive (%zd, on fd %d) - restarting connection.",
|
||||
"STREAM SEND[%zu] '%s' [to %s]: error during receive (%zd, on fd %d) - restarting connection.",
|
||||
sth->id, rrdhost_hostname(s->host), s->connected_to, rc, s->sock.fd);
|
||||
stream_sender_move_running_to_connector_or_remove(
|
||||
sth, s, STREAM_HANDSHAKE_DISCONNECT_SOCKET_READ_FAILED, true);
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void stream_sender_cleanup(struct stream_thread *sth) {
|
||||
|
|
|
@ -27,8 +27,12 @@ static void stream_thread_handle_op(struct stream_thread *sth, struct stream_opc
|
|||
{
|
||||
if(m->type == POLLFD_TYPE_SENDER) {
|
||||
if(msg->opcode & STREAM_OPCODE_SENDER_POLLOUT) {
|
||||
if(!nd_poll_upd(sth->run.ndpl, m->s->sock.fd, ND_POLL_READ|ND_POLL_WRITE, m))
|
||||
internal_fatal(true, "Failed to update sender socket in nd_poll()");
|
||||
if(!nd_poll_upd(sth->run.ndpl, m->s->sock.fd, ND_POLL_READ|ND_POLL_WRITE, m)) {
|
||||
nd_log_limit_static_global_var(erl, 1, 0);
|
||||
nd_log_limit(&erl, NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM SEND[%zu] '%s' [to %s]: cannot enable output on sender socket %d.",
|
||||
sth->id, rrdhost_hostname(m->s->host), m->s->connected_to, m->s->sock.fd);
|
||||
}
|
||||
msg->opcode &= ~(STREAM_OPCODE_SENDER_POLLOUT);
|
||||
}
|
||||
|
||||
|
@ -37,8 +41,12 @@ static void stream_thread_handle_op(struct stream_thread *sth, struct stream_opc
|
|||
}
|
||||
else if(m->type == POLLFD_TYPE_RECEIVER) {
|
||||
if (msg->opcode & STREAM_OPCODE_RECEIVER_POLLOUT) {
|
||||
if (!nd_poll_upd(sth->run.ndpl, m->rpt->sock.fd, ND_POLL_READ | ND_POLL_WRITE, m))
|
||||
internal_fatal(true, "Failed to update receiver socket in nd_poll()");
|
||||
if (!nd_poll_upd(sth->run.ndpl, m->rpt->sock.fd, ND_POLL_READ | ND_POLL_WRITE, m)) {
|
||||
nd_log_limit_static_global_var(erl, 1, 0);
|
||||
nd_log_limit(&erl, NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM RECEIVE[%zu] '%s' [from [%s]:%s]: cannot enable output on receiver socket %d.",
|
||||
sth->id, rrdhost_hostname(m->rpt->host), m->rpt->client_ip, m->rpt->client_port, m->rpt->sock.fd);
|
||||
}
|
||||
msg->opcode &= ~(STREAM_OPCODE_RECEIVER_POLLOUT);
|
||||
}
|
||||
|
||||
|
@ -48,7 +56,8 @@ static void stream_thread_handle_op(struct stream_thread *sth, struct stream_opc
|
|||
}
|
||||
else {
|
||||
// this may happen if we receive a POLLOUT opcode, but the sender has been disconnected
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG, "STREAM THREAD[%zu]: OPCODE %u ignored.", sth->id, (unsigned)msg->opcode);
|
||||
nd_log_limit_static_global_var(erl, 1, 0);
|
||||
nd_log_limit(&erl, NDLS_DAEMON, NDLP_DEBUG, "STREAM THREAD[%zu]: OPCODE %u ignored.", sth->id, (unsigned)msg->opcode);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -70,17 +79,22 @@ void stream_receiver_send_opcode(struct receiver_state *rpt, struct stream_opcod
|
|||
if (!msg.session || !msg.meta || !rpt)
|
||||
return;
|
||||
|
||||
internal_fatal(msg.meta != &rpt->thread.meta, "the receiver pointer in the message does not match this receiver");
|
||||
if(msg.meta != &rpt->thread.meta) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE '%s' [from [%s]:%s]: the receiver in the opcode the message does not match this receiver. "
|
||||
"Ignoring opcode.", rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port);
|
||||
return;
|
||||
}
|
||||
struct stream_thread *sth = stream_thread_by_slot_id(msg.thread_slot);
|
||||
if(!sth) {
|
||||
internal_fatal(true,
|
||||
"STREAM RECEIVE[x] [%s] thread pointer in the opcode message does not match the expected",
|
||||
rrdhost_hostname(rpt->host));
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM RECEIVE '%s' [from [%s]:%s]: the opcode (%u) message cannot be verified. Ignoring it.",
|
||||
rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port, msg.opcode);
|
||||
return;
|
||||
}
|
||||
|
||||
// check if we can execute the message now
|
||||
if(msg.opcode == STREAM_OPCODE_RECEIVER_POLLOUT && sth->tid == gettid_cached()) {
|
||||
if(sth->tid == gettid_cached() && (!rpt->thread.draining_input || msg.opcode == STREAM_OPCODE_RECEIVER_POLLOUT)) {
|
||||
// we are running at the stream thread, and the request is about enabling POLLOUT,
|
||||
// we can do this synchronously.
|
||||
// IMPORTANT: DO NOT HANDLE FAILURES THAT REMOVE THE RECEIVER OR THE SENDER THIS WAY
|
||||
|
@ -108,6 +122,7 @@ void stream_receiver_send_opcode(struct receiver_state *rpt, struct stream_opcod
|
|||
return;
|
||||
}
|
||||
|
||||
#ifdef NETDATA_INTERNAL_CHECKS
|
||||
// try to find us in the list
|
||||
for (size_t i = 0; i < sth->messages.size; i++) {
|
||||
if (sth->messages.array[i].meta == &rpt->thread.meta) {
|
||||
|
@ -118,8 +133,10 @@ void stream_receiver_send_opcode(struct receiver_state *rpt, struct stream_opcod
|
|||
return;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
fatal("The streaming opcode queue is full, but this should never happen");
|
||||
fatal("STREAM RECEIVE '%s' [from [%s]:%s]: The streaming opcode queue is full, but this should never happen...",
|
||||
rrdhost_hostname(rpt->host), rpt->client_ip, rpt->client_port);
|
||||
}
|
||||
|
||||
// let's use a new slot
|
||||
|
@ -142,17 +159,23 @@ void stream_sender_send_opcode(struct sender_state *s, struct stream_opcode msg)
|
|||
if (!msg.session || !msg.meta || !s)
|
||||
return;
|
||||
|
||||
internal_fatal(msg.meta != &s->thread.meta, "the sender pointer in the message does not match this sender");
|
||||
if(msg.meta != &s->thread.meta) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND '%s' [to %s]: the opcode message does not match this sender. "
|
||||
"Ignoring opcode.", rrdhost_hostname(s->host), s->connected_to);
|
||||
return;
|
||||
}
|
||||
|
||||
struct stream_thread *sth = stream_thread_by_slot_id(msg.thread_slot);
|
||||
if(!sth) {
|
||||
internal_fatal(true,
|
||||
"STREAM SEND[x] [%s] thread pointer in the opcode message does not match the expected",
|
||||
rrdhost_hostname(s->host));
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM SEND[x] '%s' [to %s] the opcode (%u) message cannot be verified. Ignoring it.",
|
||||
rrdhost_hostname(s->host), s->connected_to, msg.opcode);
|
||||
return;
|
||||
}
|
||||
|
||||
// check if we can execute the message now
|
||||
if(msg.opcode == STREAM_OPCODE_SENDER_POLLOUT && sth->tid == gettid_cached()) {
|
||||
if(sth->tid == gettid_cached() && (!s->thread.draining_input || msg.opcode == STREAM_OPCODE_SENDER_POLLOUT)) {
|
||||
// we are running at the stream thread, and the request is about enabling POLLOUT,
|
||||
// we can do this synchronously.
|
||||
// IMPORTANT: DO NOT HANDLE FAILURES THAT REMOVE THE RECEIVER OR THE SENDER THIS WAY
|
||||
|
@ -180,6 +203,7 @@ void stream_sender_send_opcode(struct sender_state *s, struct stream_opcode msg)
|
|||
return;
|
||||
}
|
||||
|
||||
#ifdef NETDATA_INTERNAL_CHECKS
|
||||
// try to find us in the list
|
||||
for (size_t i = 0; i < sth->messages.size; i++) {
|
||||
if (sth->messages.array[i].meta == &s->thread.meta) {
|
||||
|
@ -190,8 +214,10 @@ void stream_sender_send_opcode(struct sender_state *s, struct stream_opcode msg)
|
|||
return;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
fatal("the streaming opcode queue is full, but this should never happen");
|
||||
fatal("STREAM SEND '%s' [to %s]: The streaming opcode queue is full, but this should never happen...",
|
||||
rrdhost_hostname(s->host), s->connected_to);
|
||||
}
|
||||
|
||||
// let's use a new slot
|
||||
|
@ -210,12 +236,9 @@ void stream_sender_send_opcode(struct sender_state *s, struct stream_opcode msg)
|
|||
stream_thread_send_pipe_signal(sth);
|
||||
}
|
||||
|
||||
static void stream_thread_read_pipe_messages(struct stream_thread *sth) {
|
||||
bool stream_thread_process_opcodes(struct stream_thread *sth, struct pollfd_meta *my_meta) {
|
||||
internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ );
|
||||
|
||||
if(read(sth->pipe.fds[PIPE_READ], sth->pipe.buffer, sth->pipe.size * sizeof(*sth->pipe.buffer)) <= 0)
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM THREAD[%zu]: signal pipe read error", sth->id);
|
||||
|
||||
size_t used = 0;
|
||||
spinlock_lock(&sth->messages.spinlock);
|
||||
if(sth->messages.used) {
|
||||
|
@ -225,10 +248,23 @@ static void stream_thread_read_pipe_messages(struct stream_thread *sth) {
|
|||
}
|
||||
spinlock_unlock(&sth->messages.spinlock);
|
||||
|
||||
bool rc = false;
|
||||
for(size_t i = 0; i < used ;i++) {
|
||||
struct stream_opcode *msg = &sth->messages.copy[i];
|
||||
if(msg->meta == my_meta) rc = true;
|
||||
stream_thread_handle_op(sth, msg);
|
||||
}
|
||||
|
||||
return rc;
|
||||
}
|
||||
|
||||
static void stream_thread_read_pipe_messages(struct stream_thread *sth) {
|
||||
internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ );
|
||||
|
||||
if(read(sth->pipe.fds[PIPE_READ], sth->pipe.buffer, sth->pipe.size * sizeof(*sth->pipe.buffer)) <= 0)
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM THREAD[%zu]: signal pipe read error", sth->id);
|
||||
|
||||
stream_thread_process_opcodes(sth, NULL);
|
||||
}
|
||||
|
||||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
@ -265,8 +301,8 @@ static int set_pipe_size(int pipe_fd, int new_size) {
|
|||
static void stream_thread_messages_resize_unsafe(struct stream_thread *sth) {
|
||||
internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ );
|
||||
|
||||
if(sth->nodes_count >= sth->messages.size) {
|
||||
size_t new_size = sth->messages.size ? sth->messages.size * 2 : 2;
|
||||
if(sth->nodes_count * 2 >= sth->messages.size) {
|
||||
size_t new_size = MAX(sth->messages.size * 2, sth->nodes_count * 2);
|
||||
sth->messages.array = reallocz(sth->messages.array, new_size * sizeof(*sth->messages.array));
|
||||
sth->messages.copy = reallocz(sth->messages.copy, new_size * sizeof(*sth->messages.copy));
|
||||
sth->messages.size = new_size;
|
||||
|
@ -276,20 +312,30 @@ static void stream_thread_messages_resize_unsafe(struct stream_thread *sth) {
|
|||
// --------------------------------------------------------------------------------------------------------------------
|
||||
|
||||
static bool stream_thread_process_poll_slot(struct stream_thread *sth, nd_poll_result_t *ev, usec_t now_ut, size_t *replay_entries) {
|
||||
internal_fatal(sth->tid != gettid_cached(), "Function %s() should only be used by the dispatcher thread", __FUNCTION__ );
|
||||
|
||||
struct pollfd_meta *m = ev->data;
|
||||
internal_fatal(!m, "Failed to get meta from event");
|
||||
if(!m) {
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR,
|
||||
"STREAM THREAD[%zu]: cannot get meta from nd_poll() event. Ignoring event.", sth->id);
|
||||
return false;
|
||||
}
|
||||
|
||||
switch(m->type) {
|
||||
case POLLFD_TYPE_SENDER: {
|
||||
struct sender_state *s = m->s;
|
||||
stream_sender_process_poll_events(sth, s, ev->events, now_ut);
|
||||
s->thread.draining_input = true;
|
||||
if(stream_sender_process_poll_events(sth, s, ev->events, now_ut))
|
||||
s->thread.draining_input = false;
|
||||
*replay_entries += dictionary_entries(s->replication.requests);
|
||||
break;
|
||||
}
|
||||
|
||||
case POLLFD_TYPE_RECEIVER: {
|
||||
struct receiver_state *rpt = m->rpt;
|
||||
stream_receive_process_poll_events(sth, rpt, ev->events, now_ut);
|
||||
rpt->thread.draining_input = true;
|
||||
if(stream_receive_process_poll_events(sth, rpt, ev->events, now_ut))
|
||||
rpt->thread.draining_input = false;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -427,7 +473,7 @@ void *stream_thread(void *ptr) {
|
|||
META_SET(&sth->run.meta, (Word_t)&sth->run.pipe, &sth->run.pipe);
|
||||
|
||||
if(!nd_poll_add(sth->run.ndpl, sth->pipe.fds[PIPE_READ], ND_POLL_READ, &sth->run.pipe))
|
||||
internal_fatal(true, "Failed to add pipe to nd_poll()");
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM THREAD[%zu]: failed to add pipe to nd_poll()", sth->id);
|
||||
|
||||
bool exit_thread = false;
|
||||
size_t replay_entries = 0;
|
||||
|
@ -484,7 +530,7 @@ void *stream_thread(void *ptr) {
|
|||
internal_fatal(true, "nd_poll() failed");
|
||||
worker_is_busy(WORKER_STREAM_JOB_POLL_ERROR);
|
||||
nd_log_limit_static_thread_var(erl, 1, 1 * USEC_PER_MS);
|
||||
nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, "STREAM THREAD[%zu] poll() returned error", sth->id);
|
||||
nd_log_limit(&erl, NDLS_DAEMON, NDLP_ERR, "STREAM THREAD[%zu] nd_poll() returned error", sth->id);
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -597,7 +643,8 @@ static struct stream_thread * stream_thread_assign_and_start(RRDHOST *host) {
|
|||
if(!sth->thread) {
|
||||
sth->id = (sth - stream_thread_globals.threads); // find the slot number
|
||||
if(&stream_thread_globals.threads[sth->id] != sth)
|
||||
fatal("STREAM THREAD[x] [%s]: thread id and slot do not match!", rrdhost_hostname(host));
|
||||
fatal("STREAM THREAD[x] [%s]: thread and slot owner do not match!",
|
||||
rrdhost_hostname(host));
|
||||
|
||||
sth->pipe.fds[PIPE_READ] = -1;
|
||||
sth->pipe.fds[PIPE_WRITE] = -1;
|
||||
|
@ -611,7 +658,7 @@ static struct stream_thread * stream_thread_assign_and_start(RRDHOST *host) {
|
|||
|
||||
sth->thread = nd_thread_create(tag, NETDATA_THREAD_OPTION_DEFAULT, stream_thread, sth);
|
||||
if (!sth->thread)
|
||||
nd_log_daemon(NDLP_ERR, "STREAM THREAD[%zu]: failed to create new thread for client.", sth->id);
|
||||
nd_log(NDLS_DAEMON, NDLP_ERR, "STREAM THREAD[%zu]: failed to create new thread for client.", sth->id);
|
||||
}
|
||||
|
||||
spinlock_unlock(&stream_thread_globals.assign.spinlock);
|
||||
|
@ -638,7 +685,7 @@ void stream_receiver_add_to_queue(struct receiver_state *rpt) {
|
|||
stream_thread_node_queued(rpt->host);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM RECEIVE[%zu] [%s]: moving host to receiver queue...",
|
||||
"STREAM RECEIVE[%zu] '%s': moving host to receiver queue...",
|
||||
sth->id, rrdhost_hostname(rpt->host));
|
||||
|
||||
spinlock_lock(&sth->queue.spinlock);
|
||||
|
@ -653,7 +700,7 @@ void stream_sender_add_to_queue(struct sender_state *s) {
|
|||
stream_thread_node_queued(s->host);
|
||||
|
||||
nd_log(NDLS_DAEMON, NDLP_DEBUG,
|
||||
"STREAM THREAD[%zu] [%s]: moving host to dispatcher queue...",
|
||||
"STREAM THREAD[%zu] '%s': moving host to sender queue...",
|
||||
sth->id, rrdhost_hostname(s->host));
|
||||
|
||||
spinlock_lock(&sth->queue.spinlock);
|
||||
|
|
|
@ -91,6 +91,8 @@ struct stream_opcode {
|
|||
#define STREAM_MAX_THREADS 2048
|
||||
#define THREAD_TAG_STREAM "STREAM"
|
||||
|
||||
#define MAX_IO_ITERATIONS_PER_EVENT 65536 // drain the input, take it all
|
||||
|
||||
typedef enum {
|
||||
POLLFD_TYPE_EMPTY,
|
||||
POLLFD_TYPE_SENDER,
|
||||
|
@ -181,8 +183,8 @@ void stream_sender_check_all_nodes_from_poll(struct stream_thread *sth, usec_t n
|
|||
void stream_receiver_add_to_queue(struct receiver_state *rpt);
|
||||
void stream_sender_add_to_connector_queue(struct rrdhost *host);
|
||||
|
||||
void stream_sender_process_poll_events(struct stream_thread *sth, struct sender_state *s, nd_poll_event_t events, usec_t now_ut);
|
||||
void stream_receive_process_poll_events(struct stream_thread *sth, struct receiver_state *rpt, nd_poll_event_t events, usec_t now_ut);
|
||||
bool stream_sender_process_poll_events(struct stream_thread *sth, struct sender_state *s, nd_poll_event_t events, usec_t now_ut);
|
||||
bool stream_receive_process_poll_events(struct stream_thread *sth, struct receiver_state *rpt, nd_poll_event_t events, usec_t now_ut);
|
||||
|
||||
void stream_sender_cleanup(struct stream_thread *sth);
|
||||
void stream_receiver_cleanup(struct stream_thread *sth);
|
||||
|
@ -193,6 +195,9 @@ struct stream_thread *stream_thread_by_slot_id(size_t thread_slot);
|
|||
void stream_thread_node_queued(struct rrdhost *host);
|
||||
void stream_thread_node_removed(struct rrdhost *host);
|
||||
|
||||
// returns true if my_meta has received a message
|
||||
bool stream_thread_process_opcodes(struct stream_thread *sth, struct pollfd_meta *my_meta);
|
||||
|
||||
#include "stream-sender-internals.h"
|
||||
#include "stream-receiver-internals.h"
|
||||
#include "plugins.d/pluginsd_parser.h"
|
||||
|
|
|
@ -3,6 +3,10 @@
|
|||
#ifndef NETDATA_STREAM_TRAFFIC_TYPES_H
|
||||
#define NETDATA_STREAM_TRAFFIC_TYPES_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef enum __attribute__((packed)) {
|
||||
STREAM_TRAFFIC_TYPE_REPLICATION = 0,
|
||||
STREAM_TRAFFIC_TYPE_FUNCTIONS,
|
||||
|
@ -13,4 +17,8 @@ typedef enum __attribute__((packed)) {
|
|||
STREAM_TRAFFIC_TYPE_MAX,
|
||||
} STREAM_TRAFFIC_TYPE;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //NETDATA_STREAM_TRAFFIC_TYPES_H
|
||||
|
|
|
@ -41,6 +41,7 @@ char *stream_receiver_program_version_strdupz(struct rrdhost *host);
|
|||
#include "rrdhost-status.h"
|
||||
#include "protocol/commands.h"
|
||||
#include "stream-path.h"
|
||||
#include "stream-control.h"
|
||||
|
||||
void stream_threads_cancel(void);
|
||||
|
||||
|
|
|
@ -124,7 +124,9 @@ int data_query_execute(ONEWAYALLOC *owa, BUFFER *wb, QUERY_TARGET *qt, time_t *l
|
|||
wrapper_end = rrdr_json_wrapper_end2;
|
||||
}
|
||||
|
||||
stream_control_user_data_query_started();
|
||||
RRDR *r = rrd2rrdr(owa, qt);
|
||||
stream_control_user_data_query_finished();
|
||||
|
||||
if(!r) {
|
||||
buffer_strcat(wb, "Cannot generate output with these parameters on this chart.");
|
||||
|
|
|
@ -1964,7 +1964,7 @@ static void rrd2rrdr_query_execute(RRDR *r, size_t dim_id_in_rrdr, QUERY_ENGINE_
|
|||
|
||||
void store_metric_at_tier(RRDDIM *rd, size_t tier, struct rrddim_tier *t, STORAGE_POINT sp, usec_t now_ut);
|
||||
|
||||
void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now_s) {
|
||||
void backfill_tier_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now_s) {
|
||||
if(unlikely(tier >= storage_tiers)) return;
|
||||
#ifdef ENABLE_DBENGINE
|
||||
if(default_backfill == RRD_BACKFILL_NONE) return;
|
||||
|
@ -1989,9 +1989,10 @@ void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now_s
|
|||
// there is really nothing we can do
|
||||
if(now_s <= latest_time_s || time_diff < granularity) return;
|
||||
|
||||
struct storage_engine_query_handle seqh;
|
||||
stream_control_backfill_query_started();
|
||||
|
||||
// for each lower tier
|
||||
struct storage_engine_query_handle seqh;
|
||||
for(int read_tier = (int)tier - 1; read_tier >= 0 ; read_tier--){
|
||||
time_t smaller_tier_first_time = storage_engine_oldest_time_s(rd->tiers[read_tier].seb, rd->tiers[read_tier].smh);
|
||||
time_t smaller_tier_last_time = storage_engine_latest_time_s(rd->tiers[read_tier].seb, rd->tiers[read_tier].smh);
|
||||
|
@ -2023,6 +2024,8 @@ void rrdr_fill_tier_gap_from_smaller_tiers(RRDDIM *rd, size_t tier, time_t now_s
|
|||
//internal_error(true, "DBENGINE: backfilled chart '%s', dimension '%s', tier %d, from %ld to %ld, with %zu points from tier %d",
|
||||
// rd->rrdset->name, rd->name, tier, after_wanted, before_wanted, points, tr);
|
||||
}
|
||||
|
||||
stream_control_backfill_query_finished();
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
|
|
@ -3,6 +3,8 @@
|
|||
#ifndef NETDATA_API_DATA_QUERY_H
|
||||
#define NETDATA_API_DATA_QUERY_H
|
||||
|
||||
#include "libnetdata/common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
|
|
@ -1285,7 +1285,10 @@ NETDATA_DOUBLE *rrd2rrdr_ks2(
|
|||
};
|
||||
|
||||
QUERY_TARGET *qt = query_target_create(&qtr);
|
||||
stream_control_user_weights_query_started();
|
||||
RRDR *r = rrd2rrdr(owa, qt);
|
||||
stream_control_user_weights_query_finished();
|
||||
|
||||
if(!r)
|
||||
goto cleanup;
|
||||
|
||||
|
@ -1524,7 +1527,9 @@ static void rrdset_weights_multi_dimensional_value(struct query_weights_data *qw
|
|||
|
||||
ONEWAYALLOC *owa = onewayalloc_create(16 * 1024);
|
||||
QUERY_TARGET *qt = query_target_create(&qtr);
|
||||
stream_control_user_weights_query_started();
|
||||
RRDR *r = rrd2rrdr(owa, qt);
|
||||
stream_control_user_weights_query_finished();
|
||||
|
||||
if(!r || rrdr_rows(r) != 1 || !r->d || r->d != r->internal.qt->query.used)
|
||||
goto cleanup;
|
||||
|
|
Loading…
Add table
Reference in a new issue