0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-11 00:20:15 +00:00
netdata_netdata/src/ml/ml.cc
Costa Tsaousis 5f72d4279b
Streaming improvements No 3 ()
* ML uses synchronous queries

* do not call malloc_trim() to free memory, since it locks everything

* Reschedule dimensions for training from worker threads.

* when we collect or read from the database, it is SAMPLES. When we generate points for a chart is POINTS

* keep the receiver send buffer 10x the default

* support autoscaling stream circular buffers

* nd_poll() prefers sending data vs receiving data - in an attempt to dequeue as soon as possible

* fix last commit

* allow removing receiver and senders inline, if the stream thread is not working on them

* fix logs

* Revert "nd_poll() prefers sending data vs receiving data - in an attempt to dequeue as soon as possible"

This reverts commit 51539a97da.

* do not access receiver or sender after it has been removed

* open cache hot2clean

* open cache hot2clean does not need flushing

* use aral for extent pages up to 65k

* track aral malloc and mmap allocations separately; add 8192 as a possible value to PGD

* do not evict too frequently if not needed

* fix aral metrics

* fix aral metrics again

* accurate accounting of memory for dictionaries, strings, labels and MRG

* log during shutdown the progress of dbengine flushing

* move metasync shutdown after dbengine

* max iterations per I/O events

* max iterations per I/O events - break the loop

* max iterations per I/O events - break the loop - again

* disable inline evictions for all caches

* when writing to sockets, send everything that can be sent

* cleanup code to trigger evictions

* fix calculation of eviction size

* fix calculation of eviction size once more

* fix calculation of eviction size once more - again

* ml and replication stop while backfilling is running

* process opcodes while draining the sockets; log with limit when asking to disconnect a node

* fix log

* ml stops when replication queries are running

* report pgd_padding to pulse

* aral precise memory accounting

* removed all alignas() and fix the 2 issues that resulted in unaligned memory accesses (one in mqtt and another in streaming)

* remove the bigger sizes from PGD, but keep multiples of gorilla buffers

* exclude judy from sanitizers

* use 16 bytes alignment on 32 bit machines

* internal check about memory alignment

* experiment: do not allow more children to connect while there is backfilling or replication queries running

* when the node is initializing, retry in 30 seconds

* connector cleanup and isolation of control logic about enabling/disabling various parts

* stop also health queries while backfilling is running

* tuning

* drain the input

* improve interactivity when suspending

* more interactive stream_control

* debug logs to find the connection issue

* abstracted everything about stream control

* Add ml_host_{start,stop} again.

* Do not create/update anomaly-detection charts when ML is not running for a host.

* rrdhost flag RECEIVER_DISCONNECTED has been reversed to COLLECTOR_ONLINE and has been used for localhost and virtual hosts too, to have a single point of truth about the availability of collected data or not

* ml_host_start() and ml_host_stop() are used by streaming receivers; ml_host_start() is used for localhost and virtual hosts

* fixed typo

* allow up to 3 backfills at a time

* add throttling based on user queries

* restore cache line paddings

* unify streaming logs to make it easier to grep logs

* tuning of stream_control

* more logs unification

* use mallocz_release_as_much_memory_to_the_system() under extreme conditions

* do not rely on the response code of evict_pages()

* log the gap of the database every time a node is connected

* updated ram requirements

---------

Co-authored-by: vkalintiris <vasilis@netdata.cloud>
2024-12-11 18:02:17 +02:00

1215 lines
40 KiB
C++

// SPDX-License-Identifier: GPL-3.0-or-later
#include "ml_private.h"
#include <array>
#include "ad_charts.h"
#include "database/sqlite/vendored/sqlite3.h"
#include "streaming/stream-control.h"
// Job identifiers of the ML training worker threads. They are the values passed
// to worker_register_job_name() and worker_is_busy() in this file.
#define WORKER_TRAIN_QUEUE_POP 0
#define WORKER_TRAIN_ACQUIRE_DIMENSION 1
#define WORKER_TRAIN_QUERY 2
#define WORKER_TRAIN_KMEANS 3
#define WORKER_TRAIN_UPDATE_MODELS 4
#define WORKER_TRAIN_RELEASE_DIMENSION 5
#define WORKER_TRAIN_UPDATE_HOST 6
#define WORKER_TRAIN_FLUSH_MODELS 7
// Handle of the SQLite database holding the `models` table; the functions below
// refuse to run while it is NULL.
sqlite3 *ml_db = NULL;
// NOTE(review): presumably serializes access to ml_db; its users are not
// visible in this part of the file - confirm.
static netdata_mutex_t db_mutex = NETDATA_MUTEX_INITIALIZER;
// Bookkeeping of a single training query: the time window that was requested,
// the window covered by the numeric points the query returned, and how many
// values were gathered.
typedef struct {
// First/last entry of the dimension in tier 0 of the DB when the query is prepared
time_t first_entry_on_response;
time_t last_entry_on_response;
// After/Before timestamps of our DB query
time_t query_after_t;
time_t query_before_t;
// End timestamps of the first/last numeric point returned by the DB query
time_t db_after_t;
time_t db_before_t;
// Number of numeric values returned by the DB query (gaps are not counted)
size_t collected_values;
// Number of values we return to the caller (leading NaNs excluded)
size_t total_values;
} ml_training_response_t;
/*
 * Query tier 0 of the database for the most recent samples of a dimension and
 * store them in worker->training_cns.
 *
 * The query window ends at the last entry of the dimension and spans at most
 * Cfg.max_train_samples collection intervals, never starting before the first
 * entry. Points without a numeric value are filled with the last numeric value
 * seen; leading points that have no numeric predecessor are dropped.
 *
 * Returns the outcome of the operation together with the bookkeeping of the
 * query. On ML_WORKER_RESULT_OK the first `total_values` entries of
 * worker->training_cns hold the samples.
 */
static std::pair<enum ml_worker_result, ml_training_response_t>
ml_dimension_calculated_numbers(ml_worker_t *worker, ml_dimension_t *dim)
{
    ml_training_response_t training_response = {};

    training_response.first_entry_on_response = rrddim_first_entry_s_of_tier(dim->rd, 0);
    training_response.last_entry_on_response = rrddim_last_entry_s_of_tier(dim->rd, 0);

    size_t min_n = Cfg.min_train_samples;
    size_t max_n = Cfg.max_train_samples;

    // Figure out what our time window should be.
    training_response.query_before_t = training_response.last_entry_on_response;
    training_response.query_after_t = std::max(
        training_response.query_before_t - static_cast<time_t>((max_n - 1) * dim->rd->rrdset->update_every),
        training_response.first_entry_on_response
    );

    if (training_response.query_after_t >= training_response.query_before_t) {
        return { ML_WORKER_RESULT_INVALID_QUERY_TIME_RANGE, training_response };
    }

    if (rrdset_is_replicating(dim->rd->rrdset)) {
        return { ML_WORKER_RESULT_CHART_UNDER_REPLICATION, training_response };
    }

    /*
     * Execute the query
     */
    struct storage_engine_query_handle handle;

    storage_engine_query_init(dim->rd->tiers[0].seb, dim->rd->tiers[0].smh, &handle,
                              training_response.query_after_t, training_response.query_before_t,
                              STORAGE_PRIORITY_SYNCHRONOUS);

    size_t idx = 0;
    memset(worker->training_cns, 0, sizeof(calculated_number_t) * max_n * (Cfg.lag_n + 1));
    calculated_number_t last_value = std::numeric_limits<calculated_number_t>::quiet_NaN();

    while (!storage_engine_query_is_finished(&handle)) {
        if (idx == max_n)
            break;

        STORAGE_POINT sp = storage_engine_query_next_metric(&handle);

        time_t timestamp = sp.end_time_s;
        calculated_number_t value = sp.sum / sp.count;

        if (netdata_double_isnumber(value)) {
            if (!training_response.db_after_t)
                training_response.db_after_t = timestamp;
            training_response.db_before_t = timestamp;

            worker->training_cns[idx] = value;
            last_value = worker->training_cns[idx];
            training_response.collected_values++;
        } else
            worker->training_cns[idx] = last_value;

        idx++;
    }
    storage_engine_query_finalize(&handle);

    pulse_queries_ml_query_completed(/* points_read */ idx);

    training_response.total_values = idx;
    if (training_response.collected_values < min_n) {
        return { ML_WORKER_RESULT_NOT_ENOUGH_COLLECTED_VALUES, training_response };
    }

    // Find the first non-NaN value. The scan is bounded by the number of points
    // read, so it cannot run past the data produced by the query even when no
    // numeric value was collected at all.
    size_t first_valid = 0;
    while (first_valid < idx && std::isnan(worker->training_cns[first_valid]))
        first_valid++;

    training_response.total_values = idx - first_valid;

    // Overwrite the leading NaN values.
    if (first_valid != 0)
        memmove(worker->training_cns, &worker->training_cns[first_valid],
                sizeof(calculated_number_t) * training_response.total_values);

    return { ML_WORKER_RESULT_OK, training_response };
}
// Schema of the table that stores the trained models: the time window and the
// min/max distances of a model, followed by two cluster centers of six
// coordinates each (c00..c05 and c10..c15). Rows are keyed by (dim_id, after).
const char *db_models_create_table =
"CREATE TABLE IF NOT EXISTS models("
" dim_id BLOB, after INT, before INT,"
" min_dist REAL, max_dist REAL,"
" c00 REAL, c01 REAL, c02 REAL, c03 REAL, c04 REAL, c05 REAL,"
" c10 REAL, c11 REAL, c12 REAL, c13 REAL, c14 REAL, c15 REAL,"
" PRIMARY KEY(dim_id, after)"
");";
// Inserts a model, replacing any row with the same (dim_id, after) key.
const char *db_models_add_model =
"INSERT OR REPLACE INTO models("
" dim_id, after, before,"
" min_dist, max_dist,"
" c00, c01, c02, c03, c04, c05,"
" c10, c11, c12, c13, c14, c15)"
"VALUES("
" @dim_id, @after, @before,"
" @min_dist, @max_dist,"
" @c00, @c01, @c02, @c03, @c04, @c05,"
" @c10, @c11, @c12, @c13, @c14, @c15);";
// Loads every column of the models of a dimension that start at or after
// @after, oldest `before` first. Columns come back in table order.
const char *db_models_load =
"SELECT * FROM models "
"WHERE dim_id = @dim_id AND after >= @after ORDER BY before ASC;";
// Deletes the models of a dimension that end before @before.
const char *db_models_delete =
"DELETE FROM models "
"WHERE dim_id = @dim_id AND before < @before;";
// Deletes up to @n models, of any dimension, that start before @after.
const char *db_models_prune =
"DELETE FROM models "
"WHERE after < @after LIMIT @n;";
/*
 * Store one trained model of a metric in the ML database, replacing any model
 * of the same metric that has the same `after` timestamp.
 *
 * The prepared statement is cached per thread and reused across calls.
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int
ml_dimension_add_model(const nd_uuid_t *metric_uuid, const ml_kmeans_inlined_t *inlined_km)
{
    static __thread sqlite3_stmt *res = NULL;
    int param = 0;
    int rc = 0;

    if (unlikely(!ml_db)) {
        error_report("Database has not been initialized");
        return 1;
    }

    if (unlikely(!res)) {
        rc = prepare_statement(ml_db, db_models_add_model, &res);
        if (unlikely(rc != SQLITE_OK)) {
            error_report("Failed to prepare statement to store model, rc = %d", rc);
            return 1;
        }
    }

    rc = sqlite3_bind_blob(res, ++param, metric_uuid, sizeof(*metric_uuid), SQLITE_STATIC);
    if (unlikely(rc != SQLITE_OK))
        goto bind_fail;

    rc = sqlite3_bind_int(res, ++param, (int) inlined_km->after);
    if (unlikely(rc != SQLITE_OK))
        goto bind_fail;

    rc = sqlite3_bind_int(res, ++param, (int) inlined_km->before);
    if (unlikely(rc != SQLITE_OK))
        goto bind_fail;

    rc = sqlite3_bind_double(res, ++param, inlined_km->min_dist);
    if (unlikely(rc != SQLITE_OK))
        goto bind_fail;

    rc = sqlite3_bind_double(res, ++param, inlined_km->max_dist);
    if (unlikely(rc != SQLITE_OK))
        goto bind_fail;

    // The table has room for exactly 6 coordinates per cluster center.
    for (const DSample &ds : inlined_km->cluster_centers) {
        if (ds.size() != 6)
            fatal("Expected dsample with 6 dimensions, got %ld", ds.size());

        for (long idx = 0; idx != ds.size(); idx++) {
            calculated_number_t cn = ds(idx);

            // Assign to the function-level rc (no shadowing), so that the
            // failure path reports and returns the real error code.
            rc = sqlite3_bind_double(res, ++param, cn);
            if (unlikely(rc != SQLITE_OK))
                goto bind_fail;
        }
    }

    rc = execute_insert(res);
    if (unlikely(rc != SQLITE_DONE)) {
        error_report("Failed to store model, rc = %d", rc);

        // Leave the cached statement reusable for the next call; binding on a
        // statement that was stepped but not reset is a misuse.
        (void) sqlite3_reset(res);
        return rc;
    }

    rc = sqlite3_reset(res);
    if (unlikely(rc != SQLITE_OK)) {
        error_report("Failed to reset statement when storing model, rc = %d", rc);
        return rc;
    }

    return 0;

bind_fail:
    error_report("Failed to bind parameter %d to store model, rc = %d", param, rc);

    const int reset_rc = sqlite3_reset(res);
    if (unlikely(reset_rc != SQLITE_OK))
        error_report("Failed to reset statement to store model, rc = %d", reset_rc);

    // Return the bind error: a successful reset must not turn the failure
    // into a success for the caller.
    return rc;
}
/*
 * Delete the stored models of a metric that end before the given timestamp.
 *
 * The prepared statement is cached per thread and reused across calls.
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int
ml_dimension_delete_models(const nd_uuid_t *metric_uuid, time_t before)
{
    static __thread sqlite3_stmt *res = NULL;
    int rc = 0;
    int param = 0;

    if (unlikely(!ml_db)) {
        error_report("Database has not been initialized");
        return 1;
    }

    if (unlikely(!res)) {
        rc = prepare_statement(ml_db, db_models_delete, &res);
        if (unlikely(rc != SQLITE_OK)) {
            error_report("Failed to prepare statement to delete models, rc = %d", rc);
            return rc;
        }
    }

    rc = sqlite3_bind_blob(res, ++param, metric_uuid, sizeof(*metric_uuid), SQLITE_STATIC);
    if (unlikely(rc != SQLITE_OK))
        goto bind_fail;

    rc = sqlite3_bind_int(res, ++param, (int) before);
    if (unlikely(rc != SQLITE_OK))
        goto bind_fail;

    rc = execute_insert(res);
    if (unlikely(rc != SQLITE_DONE)) {
        error_report("Failed to delete models, rc = %d", rc);

        // Leave the cached statement reusable for the next call.
        (void) sqlite3_reset(res);
        return rc;
    }

    rc = sqlite3_reset(res);
    if (unlikely(rc != SQLITE_OK)) {
        error_report("Failed to reset statement when deleting models, rc = %d", rc);
        return rc;
    }

    return 0;

bind_fail:
    error_report("Failed to bind parameter %d to delete models, rc = %d", param, rc);

    const int reset_rc = sqlite3_reset(res);
    if (unlikely(reset_rc != SQLITE_OK))
        error_report("Failed to reset statement to delete models, rc = %d", reset_rc);

    // Return the bind error: a successful reset must not turn the failure
    // into a success for the caller.
    return rc;
}
/*
 * Delete up to `num_models_to_prune` models, of any metric, that start earlier
 * than Cfg.delete_models_older_than seconds before now.
 *
 * The prepared statement is cached per thread and reused across calls.
 *
 * Returns 0 on success and a non-zero value on failure.
 */
static int
ml_prune_old_models(size_t num_models_to_prune)
{
    static __thread sqlite3_stmt *res = NULL;
    int rc = 0;
    int param = 0;

    if (unlikely(!ml_db)) {
        error_report("Database has not been initialized");
        return 1;
    }

    if (unlikely(!res)) {
        rc = prepare_statement(ml_db, db_models_prune, &res);
        if (unlikely(rc != SQLITE_OK)) {
            error_report("Failed to prepare statement to prune models, rc = %d", rc);
            return rc;
        }
    }

    int after = (int) (now_realtime_sec() - Cfg.delete_models_older_than);

    rc = sqlite3_bind_int(res, ++param, after);
    if (unlikely(rc != SQLITE_OK))
        goto bind_fail;

    rc = sqlite3_bind_int(res, ++param, num_models_to_prune);
    if (unlikely(rc != SQLITE_OK))
        goto bind_fail;

    rc = execute_insert(res);
    if (unlikely(rc != SQLITE_DONE)) {
        error_report("Failed to prune old models, rc = %d", rc);

        // Leave the cached statement reusable for the next call.
        (void) sqlite3_reset(res);
        return rc;
    }

    rc = sqlite3_reset(res);
    if (unlikely(rc != SQLITE_OK)) {
        error_report("Failed to reset statement when pruning old models, rc = %d", rc);
        return rc;
    }

    return 0;

bind_fail:
    error_report("Failed to bind parameter %d to prune old models, rc = %d", param, rc);

    const int reset_rc = sqlite3_reset(res);
    if (unlikely(reset_rc != SQLITE_OK))
        error_report("Failed to reset statement to prune old models, rc = %d", reset_rc);

    // Return the bind error: a successful reset must not turn the failure
    // into a success for the caller.
    return rc;
}
/*
 * Load the stored models of a dimension from the ML database, unless the
 * dimension already has models in memory.
 *
 * rd          - the dimension; nothing happens when it has no ML state.
 * active_stmt - optional cache for the prepared statement. When given, the
 *               statement found there is reused (or a new one is prepared and
 *               stored there) and it is only reset on exit. When NULL, a
 *               statement is prepared for this call and finalized before
 *               returning.
 *
 * Returns 1 when the database is not initialized or the statement could not be
 * prepared/bound, 0 otherwise (including when the query itself fails, which is
 * only logged).
 */
int ml_dimension_load_models(RRDDIM *rd, sqlite3_stmt **active_stmt) {
    // 0-based positions of the columns returned by "SELECT * FROM models",
    // following the order of the table definition (column 0 is dim_id).
    enum {
        COL_AFTER = 1,
        COL_BEFORE = 2,
        COL_MIN_DIST = 3,
        COL_MAX_DIST = 4,
        COL_FIRST_CENTER = 5,
        NUM_CENTERS = 2,
        DIMS_PER_CENTER = 6
    };

    ml_dimension_t *dim = (ml_dimension_t *) rd->ml_dimension;
    if (!dim)
        return 0;

    spinlock_lock(&dim->slock);
    bool is_empty = dim->km_contexts.empty();
    spinlock_unlock(&dim->slock);

    if (!is_empty)
        return 0;

    sqlite3_stmt *res = active_stmt ? *active_stmt : NULL;
    int rc = 0;
    int param = 0;

    if (unlikely(!ml_db)) {
        error_report("Database has not been initialized");
        return 1;
    }

    if (unlikely(!res)) {
        rc = sqlite3_prepare_v2(ml_db, db_models_load, -1, &res, NULL);
        if (unlikely(rc != SQLITE_OK)) {
            error_report("Failed to prepare statement to load models, rc = %d", rc);
            return 1;
        }

        if (active_stmt)
            *active_stmt = res;
    }

    rc = sqlite3_bind_blob(res, ++param, &dim->rd->metric_uuid, sizeof(dim->rd->metric_uuid), SQLITE_STATIC);
    if (unlikely(rc != SQLITE_OK))
        goto bind_fail;

    rc = sqlite3_bind_int64(res, ++param, now_realtime_sec() - (Cfg.num_models_to_use * Cfg.max_train_samples));
    if (unlikely(rc != SQLITE_OK))
        goto bind_fail;

    spinlock_lock(&dim->slock);

    dim->km_contexts.reserve(Cfg.num_models_to_use);
    while ((rc = sqlite3_step_monitored(res)) == SQLITE_ROW) {
        ml_kmeans_t km;

        km.after = sqlite3_column_int(res, COL_AFTER);
        km.before = sqlite3_column_int(res, COL_BEFORE);

        // The distances are stored as REAL, read them without truncation.
        km.min_dist = sqlite3_column_double(res, COL_MIN_DIST);
        km.max_dist = sqlite3_column_double(res, COL_MAX_DIST);

        km.cluster_centers.resize(NUM_CENTERS);

        int column = COL_FIRST_CENTER;
        for (int center = 0; center != NUM_CENTERS; center++) {
            km.cluster_centers[center].set_size(Cfg.lag_n + 1);

            for (long coord = 0; coord != DIMS_PER_CENTER; coord++)
                km.cluster_centers[center](coord) = sqlite3_column_double(res, column++);
        }

        dim->km_contexts.emplace_back(km);
    }

    if (!dim->km_contexts.empty()) {
        dim->ts = TRAINING_STATUS_TRAINED;
    }

    spinlock_unlock(&dim->slock);

    if (unlikely(rc != SQLITE_DONE))
        error_report("Failed to load models, rc = %d", rc);

    if (active_stmt)
        rc = sqlite3_reset(res);
    else
        rc = sqlite3_finalize(res);

    if (unlikely(rc != SQLITE_OK))
        error_report("Failed to %s statement when loading models, rc = %d", active_stmt ? "reset" : "finalize", rc);

    return 0;

bind_fail:
    error_report("Failed to bind parameter %d to load models, rc = %d", param, rc);

    // A statement prepared just for this call has no other owner: finalize it
    // instead of resetting it, otherwise it is leaked.
    if (active_stmt)
        rc = sqlite3_reset(res);
    else
        rc = sqlite3_finalize(res);

    if (unlikely(rc != SQLITE_OK))
        error_report("Failed to %s statement to load models, rc = %d", active_stmt ? "reset" : "finalize", rc);

    return 1;
}
static void ml_dimension_serialize_kmeans(const ml_dimension_t *dim, BUFFER *wb)
{
RRDDIM *rd = dim->rd;
buffer_json_initialize(wb, "\"", "\"", 0, true, BUFFER_JSON_OPTIONS_MINIFY);
buffer_json_member_add_string(wb, "version", "1");
buffer_json_member_add_string(wb, "machine-guid", rd->rrdset->rrdhost->machine_guid);
buffer_json_member_add_string(wb, "chart", rrdset_id(rd->rrdset));
buffer_json_member_add_string(wb, "dimension", rrddim_id(rd));
buffer_json_member_add_object(wb, "model");
ml_kmeans_serialize(&dim->km_contexts.back(), wb);
buffer_json_object_close(wb);
buffer_json_finalize(wb);
}
bool
ml_dimension_deserialize_kmeans(const char *json_str)
{
if (!json_str) {
netdata_log_error("Failed to deserialize kmeans: json string is null");
return false;
}
struct json_object *root = json_tokener_parse(json_str);
if (!root) {
netdata_log_error("Failed to deserialize kmeans: json parsing failed");
return false;
}
// Check the version
{
struct json_object *tmp_obj;
if (!json_object_object_get_ex(root, "version", &tmp_obj)) {
netdata_log_error("Failed to deserialize kmeans: missing key 'version'");
json_object_put(root);
return false;
}
if (!json_object_is_type(tmp_obj, json_type_string)) {
netdata_log_error("Failed to deserialize kmeans: failed to parse string for 'version'");
json_object_put(root);
return false;
}
const char *version = json_object_get_string(tmp_obj);
if (strcmp(version, "1")) {
netdata_log_error("Failed to deserialize kmeans: expected version 1");
json_object_put(root);
return false;
}
}
// Get the value of each key
std::array<const char *, 3> values;
{
std::array<const char *, 3> keys = {
"machine-guid",
"chart",
"dimension",
};
struct json_object *tmp_obj;
for (size_t i = 0; i != keys.size(); i++) {
if (!json_object_object_get_ex(root, keys[i], &tmp_obj)) {
netdata_log_error("Failed to deserialize kmeans: missing key '%s'", keys[i]);
json_object_put(root);
return false;
}
if (!json_object_is_type(tmp_obj, json_type_string)) {
netdata_log_error("Failed to deserialize kmeans: missing string value for key '%s'", keys[i]);
json_object_put(root);
return false;
}
values[i] = json_object_get_string(tmp_obj);
}
}
DimensionLookupInfo DLI(values[0], values[1], values[2]);
// Parse the kmeans model
ml_kmeans_inlined_t inlined_km;
{
struct json_object *kmeans_obj;
if (!json_object_object_get_ex(root, "model", &kmeans_obj)) {
netdata_log_error("Failed to deserialize kmeans: missing key 'model'");
json_object_put(root);
return false;
}
if (!json_object_is_type(kmeans_obj, json_type_object)) {
netdata_log_error("Failed to deserialize kmeans: failed to parse object for 'model'");
json_object_put(root);
return false;
}
if (!ml_kmeans_deserialize(&inlined_km, kmeans_obj)) {
json_object_put(root);
return false;
}
}
AcquiredDimension AcqDim(DLI);
if (!AcqDim.acquired()) {
netdata_log_error("Failed to deserialize kmeans: could not acquire dimension (machine-guid: %s, dimension: '%s.%s', reason: %s)",
DLI.machineGuid(), DLI.chartId(), DLI.dimensionId(), AcqDim.acquire_failure());
json_object_put(root);
return false;
}
ml_dimension_t *Dim = reinterpret_cast<ml_dimension_t *>(AcqDim.dimension());
if (!Dim) {
pulse_ml_models_ignored();
return true;
}
ml_queue_item_t item;
item.type = ML_QUEUE_ITEM_TYPE_ADD_EXISTING_MODEL;
item.add_existing_model = {
DLI, inlined_km
};
ml_queue_push(AcqDim.queue(), item);
json_object_put(root);
return true;
}
static void ml_dimension_stream_kmeans(const ml_dimension_t *dim)
{
struct sender_state *s = dim->rd->rrdset->rrdhost->sender;
if (!s)
return;
if(!stream_sender_has_capabilities(dim->rd->rrdset->rrdhost, STREAM_CAP_ML_MODELS) ||
!rrdset_check_upstream_exposed(dim->rd->rrdset) ||
!rrddim_check_upstream_exposed(dim->rd))
return;
CLEAN_BUFFER *payload = buffer_create(0, NULL);
ml_dimension_serialize_kmeans(dim, payload);
CLEAN_BUFFER *wb = buffer_create(0, NULL);
buffer_sprintf(
wb, PLUGINSD_KEYWORD_JSON " " PLUGINSD_KEYWORD_JSON_CMD_ML_MODEL "\n%s\n" PLUGINSD_KEYWORD_JSON_END "\n",
buffer_tostring(payload));
sender_commit_clean_buffer(s, wb, STREAM_TRAFFIC_TYPE_METADATA);
pulse_ml_models_sent();
}
/*
 * Install dim->kmeans as the newest model of the dimension.
 *
 * While fewer than Cfg.num_models_to_use models are kept, the model is
 * appended. Otherwise it either replaces the last model (when more than two
 * models are in use and the time windows of the last two models and of the new
 * one overlap as checked below), or the oldest model is rotated out to make
 * room for it.
 *
 * The dimension is then marked as trained, its metric type and suppression
 * counters are reset, and the model is both added to the worker's list of
 * models pending to be flushed and streamed. All of this happens with the
 * dimension's spinlock held.
 */
static void ml_dimension_update_models(ml_worker_t *worker, ml_dimension_t *dim)
{
    worker_is_busy(WORKER_TRAIN_UPDATE_MODELS);

    spinlock_lock(&dim->slock);

    auto &models = dim->km_contexts;

    if (models.size() < Cfg.num_models_to_use) {
        models.emplace_back(dim->kmeans);
    } else {
        bool replace_last = false;

        if (Cfg.num_models_to_use > 2) {
            const ml_kmeans_inlined_t &last = models[models.size() - 1];
            const ml_kmeans_inlined_t &second_to_last = models[models.size() - 2];

            replace_last = (second_to_last.after < last.before) &&
                           (second_to_last.before > dim->kmeans.after);
        }

        // when the last model is not replaced, the oldest one gives its slot
        if (!replace_last)
            std::rotate(models.begin(), models.begin() + 1, models.end());

        models.back() = dim->kmeans;
    }

    dim->mt = METRIC_TYPE_CONSTANT;
    dim->ts = TRAINING_STATUS_TRAINED;

    dim->suppression_anomaly_counter = 0;
    dim->suppression_window_counter = 0;

    dim->last_training_time = rrddim_last_entry_s(dim->rd);

    // Queue the newly installed model, so that it gets flushed to the database
    ml_model_info_t pending;
    uuid_copy(pending.metric_uuid, dim->rd->metric_uuid);
    pending.inlined_kmeans = models.back();
    worker->pending_model_info.push_back(pending);

    ml_dimension_stream_kmeans(dim);

    spinlock_unlock(&dim->slock);
}
/*
 * Train a new model for the dimension and install it.
 *
 * Dimensions whose metric type is constant are skipped. When the training data
 * cannot be gathered, the dimension is marked as constant, its suppression
 * counters are reset and the failure is returned. Otherwise the gathered
 * values are preprocessed into features, a kmeans model is trained on them and
 * the model is installed with ml_dimension_update_models().
 */
static enum ml_worker_result
ml_dimension_train_model(ml_worker_t *worker, ml_dimension_t *dim)
{
    worker_is_busy(WORKER_TRAIN_QUERY);

    spinlock_lock(&dim->slock);
    const bool is_constant = (dim->mt == METRIC_TYPE_CONSTANT);
    spinlock_unlock(&dim->slock);

    if (is_constant)
        return ML_WORKER_RESULT_OK;

    auto outcome = ml_dimension_calculated_numbers(worker, dim);
    ml_worker_result status = outcome.first;
    ml_training_response_t response = outcome.second;

    if (status != ML_WORKER_RESULT_OK) {
        spinlock_lock(&dim->slock);

        dim->mt = METRIC_TYPE_CONSTANT;
        dim->suppression_anomaly_counter = 0;
        dim->suppression_window_counter = 0;
        dim->last_training_time = response.last_entry_on_response;

        spinlock_unlock(&dim->slock);
        return status;
    }

    // compute kmeans
    worker_is_busy(WORKER_TRAIN_KMEANS);
    {
        const size_t num_values = response.total_values;

        // the features are computed from a scratch copy of the samples
        memcpy(worker->scratch_training_cns, worker->training_cns,
               num_values * sizeof(calculated_number_t));

        ml_features_t features = {
            Cfg.diff_n, Cfg.smooth_n, Cfg.lag_n,
            worker->scratch_training_cns, num_values,
            worker->training_cns, num_values,
            worker->training_samples
        };
        ml_features_preprocess(&features);

        ml_kmeans_init(&dim->kmeans);
        ml_kmeans_train(&dim->kmeans, &features, Cfg.max_kmeans_iters, response.query_after_t, response.query_before_t);
    }

    // update models
    ml_dimension_update_models(worker, dim);

    return status;
}
/*
 * Feed a newly collected value to the dimension and tell whether it is
 * anomalous according to the models of the dimension.
 *
 * dim    - the dimension.
 * value  - the collected value.
 * exists - false when no value was collected; the window of recent values is
 *          cleared in that case.
 *
 * Returns true only when at least one model produced a score and none of the
 * scores was below the anomaly threshold. Returns false when ML is not enabled
 * for the dimension, when there are not enough values yet to build a sample,
 * when the dimension's spinlock cannot be taken without waiting, or when the
 * dimension is silenced.
 */
bool
ml_dimension_predict(ml_dimension_t *dim, calculated_number_t value, bool exists)
{
    // Nothing to do if ML is disabled for this dimension
    if (dim->mls != MACHINE_LEARNING_STATUS_ENABLED)
        return false;

    // Don't treat values that don't exist as anomalous
    if (!exists) {
        dim->cns.clear();
        return false;
    }

    // Save the value and return if we don't have enough values for a sample
    unsigned n = Cfg.diff_n + Cfg.smooth_n + Cfg.lag_n;
    if (dim->cns.size() < n) {
        dim->cns.push_back(value);
        return false;
    }

    // Push the value and check if it's different from the one it replaces.
    // NOTE(review): after the rotation, cns[n - 1] holds the oldest value of
    // the window, not the most recently pushed one - confirm this is intended.
    bool same_value = true;
    std::rotate(std::begin(dim->cns), std::begin(dim->cns) + 1, std::end(dim->cns));
    if (dim->cns[n - 1] != value)
        same_value = false;
    dim->cns[n - 1] = value;

    // Create the sample
    assert((n * (Cfg.lag_n + 1) <= 128) &&
           "Static buffers too small to perform prediction. "
           "This should not be possible with the default clamping of feature extraction options");
    calculated_number_t src_cns[128];
    calculated_number_t dst_cns[128];

    memset(src_cns, 0, n * (Cfg.lag_n + 1) * sizeof(calculated_number_t));
    memcpy(src_cns, dim->cns.data(), n * sizeof(calculated_number_t));
    memcpy(dst_cns, dim->cns.data(), n * sizeof(calculated_number_t));

    ml_features_t features = {
        Cfg.diff_n, Cfg.smooth_n, Cfg.lag_n,
        dst_cns, n, src_cns, n,
        dim->feature
    };
    ml_features_preprocess(&features);

    /*
     * Lock to predict
     */
    if (spinlock_trylock(&dim->slock) == 0)
        return false;

    // Mark the metric time as variable if we received different values
    if (!same_value)
        dim->mt = METRIC_TYPE_VARIABLE;

    // Ignore silenced dimensions
    if (dim->ts == TRAINING_STATUS_SILENCED) {
        spinlock_unlock(&dim->slock);
        return false;
    }

    dim->suppression_window_counter++;

    /*
     * Use the KMeans models to check if the value is anomalous
     */
    size_t sum = 0;
    size_t models_consulted = 0;

    for (const auto &km_ctx : dim->km_contexts) {
        models_consulted++;

        calculated_number_t anomaly_score = ml_kmeans_anomaly_score(&km_ctx, features.preprocessed_features[0]);

        // A NaN never compares equal to anything, itself included, so it has
        // to be detected with std::isnan(). Models without a usable score are
        // skipped instead of being counted as votes for an anomaly.
        if (std::isnan(anomaly_score))
            continue;

        // A single model that finds the value normal is enough
        if (anomaly_score < (100 * Cfg.dimension_anomaly_score_threshold)) {
            pulse_ml_models_consulted(models_consulted);
            spinlock_unlock(&dim->slock);
            return false;
        }

        sum += 1;
    }

    dim->suppression_anomaly_counter += sum ? 1 : 0;

    // Silence the dimension once both suppression counters reach their limits
    if ((dim->suppression_anomaly_counter >= Cfg.suppression_threshold) &&
        (dim->suppression_window_counter >= Cfg.suppression_window)) {
        dim->ts = TRAINING_STATUS_SILENCED;
    }

    spinlock_unlock(&dim->slock);

    pulse_ml_models_consulted(models_consulted);

    return sum;
}
/*
* Chart
*/
// A chart takes part in ML when its RRDSET passes the check used for exporting and alarms.
static bool
ml_chart_is_available_for_ml(ml_chart_t *chart)
{
    auto *rs = chart->rs;
    return rrdset_is_available_for_exporting_and_alarms(rs);
}
/*
 * Account one dimension in the ML statistics of its chart.
 *
 * Dimensions disabled because their chart is excluded are only counted as
 * such. Enabled dimensions are counted by metric type and training status;
 * constant metrics always count as trained and normal, while for trained or
 * silenced variable metrics `is_anomalous` decides whether the dimension is
 * counted as anomalous or normal. Any other ML status is not counted at all.
 */
void
ml_chart_update_dimension(ml_chart_t *chart, ml_dimension_t *dim, bool is_anomalous)
{
    auto &stats = chart->mls;

    if (dim->mls == MACHINE_LEARNING_STATUS_DISABLED_DUE_TO_EXCLUDED_CHART) {
        stats.num_machine_learning_status_disabled_sp++;
        return;
    }

    if (dim->mls != MACHINE_LEARNING_STATUS_ENABLED)
        return;

    stats.num_machine_learning_status_enabled++;

    // metric type
    if (dim->mt == METRIC_TYPE_CONSTANT) {
        stats.num_metric_type_constant++;
        stats.num_training_status_trained++;
        stats.num_normal_dimensions++;
        return;
    }

    if (dim->mt == METRIC_TYPE_VARIABLE)
        stats.num_metric_type_variable++;

    // training status
    if (dim->ts == TRAINING_STATUS_UNTRAINED) {
        stats.num_training_status_untrained++;
        return;
    }

    const bool silenced = (dim->ts == TRAINING_STATUS_SILENCED);
    if (!silenced && dim->ts != TRAINING_STATUS_TRAINED)
        return;

    // silenced dimensions are counted as trained too
    if (silenced)
        stats.num_training_status_silenced++;

    stats.num_training_status_trained++;
    stats.num_anomalous_dimensions += is_anomalous;
    stats.num_normal_dimensions += !is_anomalous;
}
/*
* Host detection & training functions
*/
// Job identifiers of the detection thread. They are the values passed to
// worker_register_job_name() in ml_detect_main() and to worker_is_busy() in
// ml_host_detect_once().
#define WORKER_JOB_DETECTION_COLLECT_STATS 0
#define WORKER_JOB_DETECTION_DIM_CHART 1
#define WORKER_JOB_DETECTION_HOST_CHART 2
#define WORKER_JOB_DETECTION_STATS 3
/*
 * Run one detection pass for a host.
 *
 * When ML is running for the host, the ML statistics of all its available
 * charts are summed into host->mls, the per chart-type anomaly counters are
 * updated (only if their spinlock can be taken without waiting), the anomaly
 * rate of the host is computed and the anomaly detection charts are updated.
 *
 * When ML is not running, the anomaly rate of the host and all the per
 * chart-type counters are reset and no chart is touched.
 */
static void
ml_host_detect_once(ml_host_t *host)
{
    worker_is_busy(WORKER_JOB_DETECTION_COLLECT_STATS);

    host->mls = {};

    if (!host->ml_running) {
        host->host_anomaly_rate = 0.0;

        for (auto &entry : host->type_anomaly_rate) {
            entry.second = ml_type_anomaly_rate_t {
                .rd = NULL,
                .normal_dimensions = 0,
                .anomalous_dimensions = 0
            };
        }

        return;
    }

    ml_machine_learning_stats_t totals = {};

    netdata_mutex_lock(&host->mutex);

    /*
     * prediction/detection stats
     */
    void *item = NULL;
    rrdset_foreach_read(item, host->rh) {
        RRDSET *rs = static_cast<RRDSET *>(item);

        ml_chart_t *chart = (ml_chart_t *) rs->ml_chart;
        if (!chart || !ml_chart_is_available_for_ml(chart))
            continue;

        // work on a snapshot of the statistics of the chart
        const ml_machine_learning_stats_t chart_stats = chart->mls;
        auto &host_stats = host->mls;

        host_stats.num_machine_learning_status_enabled += chart_stats.num_machine_learning_status_enabled;
        host_stats.num_machine_learning_status_disabled_sp += chart_stats.num_machine_learning_status_disabled_sp;

        host_stats.num_metric_type_constant += chart_stats.num_metric_type_constant;
        host_stats.num_metric_type_variable += chart_stats.num_metric_type_variable;

        host_stats.num_training_status_untrained += chart_stats.num_training_status_untrained;
        host_stats.num_training_status_pending_without_model += chart_stats.num_training_status_pending_without_model;
        host_stats.num_training_status_trained += chart_stats.num_training_status_trained;
        host_stats.num_training_status_pending_with_model += chart_stats.num_training_status_pending_with_model;
        host_stats.num_training_status_silenced += chart_stats.num_training_status_silenced;

        host_stats.num_anomalous_dimensions += chart_stats.num_anomalous_dimensions;
        host_stats.num_normal_dimensions += chart_stats.num_normal_dimensions;

        // the per chart-type counters are skipped when their lock is busy
        if (spinlock_trylock(&host->type_anomaly_rate_spinlock)) {
            STRING *type = rs->parts.type;
            auto &rates = host->type_anomaly_rate;

            auto pos = rates.find(type);
            if (pos == rates.end()) {
                rates[type] = ml_type_anomaly_rate_t {
                    .rd = NULL,
                    .normal_dimensions = 0,
                    .anomalous_dimensions = 0
                };
                pos = rates.find(type);
            }

            pos->second.anomalous_dimensions += chart_stats.num_anomalous_dimensions;
            pos->second.normal_dimensions += chart_stats.num_normal_dimensions;

            spinlock_unlock(&host->type_anomaly_rate_spinlock);
        }
    }
    rrdset_foreach_done(item);

    // anomaly rate of the host: the fraction of active dimensions that are anomalous
    host->host_anomaly_rate = 0.0;
    size_t active_dimensions = host->mls.num_anomalous_dimensions + host->mls.num_normal_dimensions;
    if (active_dimensions)
        host->host_anomaly_rate = static_cast<double>(host->mls.num_anomalous_dimensions) / active_dimensions;

    totals = host->mls;

    netdata_mutex_unlock(&host->mutex);

    worker_is_busy(WORKER_JOB_DETECTION_DIM_CHART);
    ml_update_dimensions_chart(host, totals);

    worker_is_busy(WORKER_JOB_DETECTION_HOST_CHART);
    ml_update_host_and_detection_rate_charts(host, host->host_anomaly_rate * 10000.0);
}
void *
ml_detect_main(void *arg)
{
UNUSED(arg);
worker_register("MLDETECT");
worker_register_job_name(WORKER_JOB_DETECTION_COLLECT_STATS, "collect stats");
worker_register_job_name(WORKER_JOB_DETECTION_DIM_CHART, "dim chart");
worker_register_job_name(WORKER_JOB_DETECTION_HOST_CHART, "host chart");
worker_register_job_name(WORKER_JOB_DETECTION_STATS, "training stats");
heartbeat_t hb;
heartbeat_init(&hb, USEC_PER_SEC);
while (!Cfg.detection_stop && service_running(SERVICE_COLLECTORS)) {
worker_is_idle();
heartbeat_next(&hb);
RRDHOST *rh;
rrd_rdlock();
rrdhost_foreach_read(rh) {
if (!rh->ml_host)
continue;
if (!service_running(SERVICE_COLLECTORS))
break;
ml_host_detect_once((ml_host_t *) rh->ml_host);
}
rrd_rdunlock();
if (Cfg.enable_statistics_charts) {
// collect and update training thread stats
for (size_t idx = 0; idx != Cfg.num_worker_threads; idx++) {
ml_worker_t *worker = &Cfg.workers[idx];
netdata_mutex_lock(&worker->nd_mutex);
ml_queue_stats_t queue_stats = worker->queue_stats;
netdata_mutex_unlock(&worker->nd_mutex);
ml_update_training_statistics_chart(worker, queue_stats);
}
}
}
Cfg.training_stop = true;
return NULL;
}
/*
 * Write the pending models of a worker to the ML database in one transaction.
 *
 * For every pending model the model is stored and the models of the same
 * metric that are older than Cfg.num_models_to_use training periods are
 * deleted. Once every 64 committed transactions, old models are pruned too.
 * On any failure the transaction is rolled back.
 *
 * The bookkeeping of the worker (number of transactions, number of models to
 * prune) is updated only when the transaction is committed: a transaction that
 * was rolled back has stored nothing. The pending models are dropped in both
 * cases.
 */
static void ml_flush_pending_models(ml_worker_t *worker) {
    // step of the transaction that is running, reported when it fails
    int op_no = 1;

    // begin transaction
    int rc = db_execute(ml_db, "BEGIN TRANSACTION;");

    // add/delete models
    if (!rc) {
        op_no++;

        for (const auto &pending_model: worker->pending_model_info) {
            rc = ml_dimension_add_model(&pending_model.metric_uuid, &pending_model.inlined_kmeans);
            if (!rc)
                rc = ml_dimension_delete_models(&pending_model.metric_uuid, pending_model.inlined_kmeans.before - (Cfg.num_models_to_use * Cfg.train_every));

            // the transaction will be rolled back, there is no point in going on
            if (rc)
                break;
        }
    }

    // prune old models
    bool pruned = false;
    if (!rc && (worker->num_db_transactions % 64) == 0) {
        rc = ml_prune_old_models(worker->num_models_to_prune);
        pruned = !rc;
    }

    // commit transaction
    if (!rc) {
        op_no++;
        rc = db_execute(ml_db, "COMMIT TRANSACTION;");
    }

    if (rc) {
        // rollback transaction on failure
        netdata_log_error("Trying to rollback ML transaction because it failed with rc=%d, op_no=%d", rc, op_no);
        rc = db_execute(ml_db, "ROLLBACK;");
        if (rc)
            netdata_log_error("ML transaction rollback failed with rc=%d", rc);
    } else {
        // the prune is part of the transaction, so it counts only now
        if (pruned)
            worker->num_models_to_prune = 0;

        worker->num_db_transactions++;
        worker->num_models_to_prune += worker->pending_model_info.size();
    }

    vacuum_database(ml_db, "ML", 0, 0);

    worker->pending_model_info.clear();
}
/**
 * Handle a "create new model" request by training the requested dimension.
 *
 * The dimension named by req.DLI is acquired for the duration of the call.
 *
 * @param worker worker thread state, forwarded to ml_dimension_train_model().
 * @param req    request carrying the dimension lookup info (DLI).
 * @return ML_WORKER_RESULT_NULL_ACQUIRED_DIMENSION when the dimension could
 *         not be acquired (an error is logged), otherwise whatever
 *         ml_dimension_train_model() returns.
 */
static enum ml_worker_result ml_worker_create_new_model(ml_worker_t *worker, ml_request_create_new_model_t req) {
    AcquiredDimension acquired_dim(req.DLI);

    if (acquired_dim.acquired()) {
        auto *dimension = reinterpret_cast<ml_dimension_t *>(acquired_dim.dimension());
        return ml_dimension_train_model(worker, dimension);
    }

    netdata_log_error("Failed to create new model: could not acquire dimension (machine-guid: %s, dimension: '%s.%s', reason: %s)",
                      req.DLI.machineGuid(), req.DLI.chartId(), req.DLI.dimensionId(), acquired_dim.acquire_failure());

    return ML_WORKER_RESULT_NULL_ACQUIRED_DIMENSION;
}
/**
 * Handle an "add existing model" request: install a model that came with the
 * request (req.inlined_km) on the requested dimension instead of training one.
 *
 * The dimension named by req.DLI is acquired for the duration of the call.
 *
 * @param worker worker thread state, forwarded to ml_dimension_update_models().
 * @param req    request carrying the dimension lookup info (DLI) and the
 *               model to install.
 * @return ML_WORKER_RESULT_NULL_ACQUIRED_DIMENSION when the dimension could
 *         not be acquired (an error is logged); ML_WORKER_RESULT_OK otherwise,
 *         including when the acquired dimension has no ML dimension attached
 *         and the model is ignored.
 */
static enum ml_worker_result ml_worker_add_existing_model(ml_worker_t *worker, ml_request_add_existing_model_t req) {
    AcquiredDimension AcqDim(req.DLI);

    if (!AcqDim.acquired()) {
        netdata_log_error("Failed to add existing model: could not acquire dimension (machine-guid: %s, dimension: '%s.%s', reason: %s)",
                          req.DLI.machineGuid(), req.DLI.chartId(), req.DLI.dimensionId(), AcqDim.acquire_failure());
        return ML_WORKER_RESULT_NULL_ACQUIRED_DIMENSION;
    }

    ml_dimension_t *Dim = reinterpret_cast<ml_dimension_t *>(AcqDim.dimension());
    if (!Dim) {
        // Acquired, but there is nothing to attach the model to: count it as
        // ignored and report success so the caller does not treat it as an error.
        pulse_ml_models_ignored();
        return ML_WORKER_RESULT_OK;
    }

    Dim->kmeans = req.inlined_km;
    ml_dimension_update_models(worker, Dim);
    pulse_ml_models_received();

    return ML_WORKER_RESULT_OK;
}
void *ml_train_main(void *arg) {
ml_worker_t *worker = (ml_worker_t *) arg;
char worker_name[1024];
snprintfz(worker_name, 1024, "ml_worker_%zu", worker->id);
worker_register("MLTRAIN");
worker_register_job_name(WORKER_TRAIN_QUEUE_POP, "pop queue");
worker_register_job_name(WORKER_TRAIN_ACQUIRE_DIMENSION, "acquire");
worker_register_job_name(WORKER_TRAIN_QUERY, "query");
worker_register_job_name(WORKER_TRAIN_KMEANS, "kmeans");
worker_register_job_name(WORKER_TRAIN_UPDATE_MODELS, "update models");
worker_register_job_name(WORKER_TRAIN_RELEASE_DIMENSION, "release");
worker_register_job_name(WORKER_TRAIN_UPDATE_HOST, "update host");
worker_register_job_name(WORKER_TRAIN_FLUSH_MODELS, "flush models");
while (!Cfg.training_stop) {
if(!stream_control_ml_should_be_running()) {
worker_is_idle();
stream_control_throttle();
continue;
}
worker_is_busy(WORKER_TRAIN_QUEUE_POP);
ml_queue_stats_t loop_stats{};
ml_queue_item_t item = ml_queue_pop(worker->queue);
if (item.type == ML_QUEUE_ITEM_STOP_REQUEST) {
break;
}
ml_queue_size_t queue_size = ml_queue_size(worker->queue);
usec_t allotted_ut = (Cfg.train_every * USEC_PER_SEC) / (queue_size.create_new_model + 1);
if (allotted_ut > USEC_PER_SEC)
allotted_ut = USEC_PER_SEC;
usec_t start_ut = now_monotonic_usec();
enum ml_worker_result worker_res;
switch (item.type) {
case ML_QUEUE_ITEM_TYPE_CREATE_NEW_MODEL: {
worker_res = ml_worker_create_new_model(worker, item.create_new_model);
if (worker_res != ML_WORKER_RESULT_NULL_ACQUIRED_DIMENSION) {
ml_queue_push(worker->queue, item);
}
break;
}
case ML_QUEUE_ITEM_TYPE_ADD_EXISTING_MODEL: {
worker_res = ml_worker_add_existing_model(worker, item.add_existing_model);
break;
}
default: {
fatal("Unknown queue item type");
}
}
usec_t consumed_ut = now_monotonic_usec() - start_ut;
usec_t remaining_ut = 0;
if (consumed_ut < allotted_ut)
remaining_ut = allotted_ut - consumed_ut;
if (Cfg.enable_statistics_charts) {
worker_is_busy(WORKER_TRAIN_UPDATE_HOST);
ml_queue_stats_t queue_stats = ml_queue_stats(worker->queue);
loop_stats.total_add_existing_model_requests_pushed = queue_stats.total_add_existing_model_requests_pushed;
loop_stats.total_add_existing_model_requests_popped = queue_stats.total_add_existing_model_requests_popped;
loop_stats.total_create_new_model_requests_pushed = queue_stats.total_create_new_model_requests_pushed;
loop_stats.total_create_new_model_requests_popped = queue_stats.total_create_new_model_requests_popped;
loop_stats.allotted_ut = allotted_ut;
loop_stats.consumed_ut = consumed_ut;
loop_stats.remaining_ut = remaining_ut;
switch (worker_res) {
case ML_WORKER_RESULT_OK:
loop_stats.item_result_ok = 1;
break;
case ML_WORKER_RESULT_INVALID_QUERY_TIME_RANGE:
loop_stats.item_result_invalid_query_time_range = 1;
break;
case ML_WORKER_RESULT_NOT_ENOUGH_COLLECTED_VALUES:
loop_stats.item_result_not_enough_collected_values = 1;
break;
case ML_WORKER_RESULT_NULL_ACQUIRED_DIMENSION:
loop_stats.item_result_null_acquired_dimension = 1;
break;
case ML_WORKER_RESULT_CHART_UNDER_REPLICATION:
loop_stats.item_result_chart_under_replication = 1;
break;
}
netdata_mutex_lock(&worker->nd_mutex);
worker->queue_stats.total_add_existing_model_requests_pushed = loop_stats.total_add_existing_model_requests_pushed;
worker->queue_stats.total_add_existing_model_requests_popped = loop_stats.total_add_existing_model_requests_popped;
worker->queue_stats.total_create_new_model_requests_pushed = loop_stats.total_create_new_model_requests_pushed;
worker->queue_stats.total_create_new_model_requests_popped = loop_stats.total_create_new_model_requests_popped;
worker->queue_stats.allotted_ut += loop_stats.allotted_ut;
worker->queue_stats.consumed_ut += loop_stats.consumed_ut;
worker->queue_stats.remaining_ut += loop_stats.remaining_ut;
worker->queue_stats.item_result_ok += loop_stats.item_result_ok;
worker->queue_stats.item_result_invalid_query_time_range += loop_stats.item_result_invalid_query_time_range;
worker->queue_stats.item_result_not_enough_collected_values += loop_stats.item_result_not_enough_collected_values;
worker->queue_stats.item_result_null_acquired_dimension += loop_stats.item_result_null_acquired_dimension;
worker->queue_stats.item_result_chart_under_replication += loop_stats.item_result_chart_under_replication;
netdata_mutex_unlock(&worker->nd_mutex);
}
bool should_sleep = true;
if (worker->pending_model_info.size() >= Cfg.flush_models_batch_size) {
worker_is_busy(WORKER_TRAIN_FLUSH_MODELS);
netdata_mutex_lock(&db_mutex);
ml_flush_pending_models(worker);
netdata_mutex_unlock(&db_mutex);
should_sleep = false;
}
if (item.type == ML_QUEUE_ITEM_TYPE_ADD_EXISTING_MODEL) {
should_sleep = false;
}
if (!should_sleep)
continue;
worker_is_idle();
std::this_thread::sleep_for(std::chrono::microseconds{remaining_ut});
}
return NULL;
}