0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-14 01:29:11 +00:00
netdata_netdata/collectors/cgroups.plugin/cgroup-internals.h
Costa Tsaousis f466b8aef5
DYNCFG: dynamically configured alerts ()
* cleanup alerts

* fix references

* fix references

* fix references

* load alerts once and apply them to each node

* simplify health_create_alarm_entry()

* Compile without warnings with compiler flags:

   -Wall -Wextra -Wformat=2 -Wshadow -Wno-format-nonliteral -Winit-self

* code re-organization and cleanup

* generate patterns when applying prototypes; give unique dyncfg names to all alerts

* eval expressions keep the source and the parsed_as as STRING pointers

* renamed host to node in dyncfg ids

* renamed host to node in dyncfg ids

* add all cloud roles to the list of parsed X-Netdata-Role header and also default to member access level

* working functionality

* code re-organization: moved health event-loop to a new file, moved health globals to health.c

* rrdcalctemplate is removed; alert_cfg is removed; foreach dimension is removed; RRDCALCs are now instantiated only when they are linked to RRDSETs

* dyncfg alert prototypes initialization for alerts

* health dyncfg split to separate file

* cleanup not-needed code

* normalize matches between parsing and json

* also detect !* for disabled alerts

* dyncfg capability disabled

* Store alert config part1

* Add rrdlabels_common_count

* wip health variables lookup without indexes

* Improve rrdlabels_common_count by reusing rrdlabels_find_label_with_key_unsafe with an additional parameter

* working variables with runtime lookup

* working variables with runtime lookup

* delete rrddimvar and rrdfamily index

* remove rrdsetvar; now all variables are in RRDVARs inside hosts and charts

* added /api/v1/variable that resolves a variable the same way alerts do

* remove rrdcalc from eval

* remove debug code

* remove duplicate assignment

* Fix memory leak

* all alert variables are now handled by alert_variable_lookup() and EVAL is now independent of alerts

* hide all internal structures of EVAL

* Enable -Wformat flag

Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>

* Adjust binding for calculation, warning, critical

* Remove unused macro

* Update config hash id

* use the right info and summary in alerts log

* use synchronous queries for alerts

* Handle cases when config_hash_id is missing from health_log

* remove deadlock from health worker

* parsing to json payload for health alert prototypes

* cleaner parsing and avoiding memory leaks in case of duplicate members in json

* fix left-over rename of function

* Keep original lookup field to send to the cloud
Cleanup / rename function to store config
Remove unused DEFINEs, functions

* Use ac->lookup

* link jobs to the host when the template is registered; do not accept running a function without a host

* full dyncfg support for health alerts, except action TEST

* working dyncfg additions, updates, removals

* fixed missing source, wrong status updates

* add alerts by type, component, classification, recipient and module at the /api/v2/alerts endpoint

* fix dyncfg unittest

* rename functions

* generalize the json-c parser macros and move them to libnetdata

* report progress when enabling and disabling dyncfg templates

* moved rrdcalc and rrdvar to health

* update alarms

* added schema for alerts; separated alert_action_options from rrdr_options; restructured the json payload for alerts

* enable parsed json alerts; allow sending back accepted but disabled

* added format_version for alerts payload; enables/disables status now is also inherited by the status of the rules; fixed variable names in json output

* remove the RRDHOST pointer from DYNCFG

* Fix command field submitted to the cloud

* do not send updates to creation requests, for DYNCFG jobs

---------

Signed-off-by: Tasos Katsoulas <tasos@netdata.cloud>
Co-authored-by: Stelios Fragkakis <52996999+stelfrag@users.noreply.github.com>
Co-authored-by: Tasos Katsoulas <tasos@netdata.cloud>
Co-authored-by: ilyam8 <ilya@netdata.cloud>
2024-01-23 20:20:41 +02:00

509 lines
No EOL
16 KiB
C

#include "sys_fs_cgroup.h"
#ifndef NETDATA_CGROUP_INTERNALS_H
#define NETDATA_CGROUP_INTERNALS_H 1
// CGROUP_PROCFILE_FLAG: procfile flag selected at compile time.
// Builds with NETDATA_INTERNAL_CHECKS defined use PROCFILE_FLAG_DEFAULT;
// all other builds use PROCFILE_FLAG_NO_ERROR_ON_FILE_IO.
// NOTE(review): presumably passed to procfile when opening cgroup files - confirm at the call sites.
#ifdef NETDATA_INTERNAL_CHECKS
#define CGROUP_PROCFILE_FLAG PROCFILE_FLAG_DEFAULT
#else
#define CGROUP_PROCFILE_FLAG PROCFILE_FLAG_NO_ERROR_ON_FILE_IO
#endif
// Per-file state of one blkio metric of a cgroup.
// 'struct cgroup' embeds one instance per blkio file; depending on the instance,
// Read/Write hold bytes or operations (see the member comments in 'struct cgroup').
struct blkio {
int updated;
int enabled; // CONFIG_BOOLEAN_YES or CONFIG_BOOLEAN_AUTO
int delay_counter; // NOTE(review): presumably counts iterations until the file is re-checked - confirm in the reader
char *filename;
unsigned long long Read;
unsigned long long Write;
/*
unsigned long long Sync;
unsigned long long Async;
unsigned long long Total;
*/
};
// State of the "pids current" metric of a cgroup: the file it is read from,
// an 'updated' flag and the value itself.
// NOTE(review): presumably read from the pids controller's pids.current file - confirm in the reader.
struct pids {
char *pids_current_filename;
int pids_current_updated;
unsigned long long pids_current;
};
// Memory metrics of a cgroup.
// https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt
//
// The struct tracks four separate sources (detailed, usage_in_bytes,
// msw_usage_in_bytes, failcnt), each with its own filename, 'updated' flag
// and 'enabled' setting, followed by the values collected from them.
struct memory {
// NOTE(review): ARL handles, presumably used to parse the detailed memory file and
// to locate its 'dirty' and 'swap' entries - confirm in the reader.
ARL_BASE *arl_base;
ARL_ENTRY *arl_dirty;
ARL_ENTRY *arl_swap;
int updated_detailed;
int updated_usage_in_bytes;
int updated_msw_usage_in_bytes;
int updated_failcnt;
int enabled_detailed; // CONFIG_BOOLEAN_YES or CONFIG_BOOLEAN_AUTO
int enabled_usage_in_bytes; // CONFIG_BOOLEAN_YES or CONFIG_BOOLEAN_AUTO
int enabled_msw_usage_in_bytes; // CONFIG_BOOLEAN_YES or CONFIG_BOOLEAN_AUTO
int enabled_failcnt; // CONFIG_BOOLEAN_YES or CONFIG_BOOLEAN_AUTO
int delay_counter_detailed;
int delay_counter_failcnt;
char *filename_detailed;
char *filename_usage_in_bytes;
char *filename_msw_usage_in_bytes;
char *filename_failcnt;
int detailed_has_dirty;
int detailed_has_swap;
// detailed metrics
/*
unsigned long long cache;
unsigned long long rss;
unsigned long long rss_huge;
unsigned long long mapped_file;
unsigned long long writeback;
unsigned long long dirty;
unsigned long long swap;
unsigned long long pgpgin;
unsigned long long pgpgout;
unsigned long long pgfault;
unsigned long long pgmajfault;
unsigned long long inactive_anon;
unsigned long long active_anon;
unsigned long long inactive_file;
unsigned long long active_file;
unsigned long long unevictable;
unsigned long long hierarchical_memory_limit;
*/
//unified cgroups metrics
unsigned long long anon;
unsigned long long kernel_stack;
unsigned long long slab;
unsigned long long sock;
// unsigned long long shmem;
unsigned long long anon_thp;
//unsigned long long file_writeback;
//unsigned long long file_dirty;
//unsigned long long file;
unsigned long long total_cache;
unsigned long long total_rss;
unsigned long long total_rss_huge;
unsigned long long total_mapped_file;
unsigned long long total_writeback;
unsigned long long total_dirty;
unsigned long long total_swap;
unsigned long long total_pgpgin;
unsigned long long total_pgpgout;
unsigned long long total_pgfault;
unsigned long long total_pgmajfault;
/*
unsigned long long total_inactive_anon;
unsigned long long total_active_anon;
*/
unsigned long long total_inactive_file;
/*
unsigned long long total_active_file;
unsigned long long total_unevictable;
*/
// single file metrics
unsigned long long usage_in_bytes;
unsigned long long msw_usage_in_bytes;
unsigned long long failcnt;
};
// CPU time of a cgroup split into user and system.
// For cgroups v2 the same fields carry 'user_usec' and 'system_usec'.
// https://www.kernel.org/doc/Documentation/cgroup-v1/cpuacct.txt
struct cpuacct_stat {
int updated;
int enabled; // CONFIG_BOOLEAN_YES or CONFIG_BOOLEAN_AUTO
char *filename;
unsigned long long user; // v1, v2(user_usec)
unsigned long long system; // v1, v2(system_usec)
};
// Per-CPU usage of a cgroup.
// https://www.kernel.org/doc/Documentation/cgroup-v1/cpuacct.txt
struct cpuacct_usage {
int updated;
int enabled; // CONFIG_BOOLEAN_YES or CONFIG_BOOLEAN_AUTO
char *filename;
unsigned int cpus; // NOTE(review): presumably the number of entries in cpu_percpu - confirm in the reader
unsigned long long *cpu_percpu;
};
// CPU throttling counters of a cgroup.
// represents cpuacct/cpu.stat, for v2 'cpuacct_stat' is used for 'user_usec', 'system_usec'
struct cpuacct_cpu_throttling {
int updated;
int enabled; // CONFIG_BOOLEAN_YES or CONFIG_BOOLEAN_AUTO
char *filename;
unsigned long long nr_periods;
unsigned long long nr_throttled;
unsigned long long throttled_time;
unsigned long long nr_throttled_perc; // NOTE(review): presumably derived from nr_throttled and nr_periods - confirm where it is computed
};
// CPU shares of a cgroup (see the linked documents for cgroups v1 shares and v2 weight).
// https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/6/html/resource_management_guide/sec-cpu#sect-cfs
// https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/managing_monitoring_and_updating_the_kernel/using-cgroups-v2-to-control-distribution-of-cpu-time-for-applications_managing-monitoring-and-updating-the-kernel#proc_controlling-distribution-of-cpu-time-for-applications-by-adjusting-cpu-weight_using-cgroups-v2-to-control-distribution-of-cpu-time-for-applications
struct cpuacct_cpu_shares {
int updated;
int enabled; // CONFIG_BOOLEAN_YES or CONFIG_BOOLEAN_AUTO
char *filename;
unsigned long long shares;
};
// Node of a singly linked list of network interfaces attached to a cgroup
// (the list head is 'interfaces' in 'struct cgroup'). Each node pairs the
// device name on the host with the device name inside the container.
struct cgroup_network_interface {
const char *host_device;
const char *container_device;
struct cgroup_network_interface *next;
};
// Container orchestrator of a cgroup; stored in 'container_orchestrator' of
// 'struct cgroup' and tested by k8s_is_kubepod().
enum cgroups_container_orchestrator {
CGROUPS_ORCHESTRATOR_UNSET,
CGROUPS_ORCHESTRATOR_UNKNOWN,
CGROUPS_ORCHESTRATOR_K8S
};
// Everything tracked for one cgroup: identity, discovery state, the
// collected metrics, the charts/dimensions they are published to, and the
// per-cgroup limits. Instances are chained through 'next' and 'discovered_next'.
// *** WARNING *** The fields are not thread safe. Take care of safe usage.
struct cgroup {
uint32_t options; // bit flags; is_cgroup_systemd_service() tests CGROUP_OPTIONS_SYSTEM_SLICE_SERVICE
int first_time_seen; // first time seen by the discoverer
int processed; // the discoverer is done processing a cgroup (resolved name, set 'enabled' option)
char available; // found in the filesystem
char enabled; // enabled in the config
bool function_ready; // true after the first iteration of chart creation/update
char pending_renames;
char *id;
uint32_t hash; // NOTE(review): presumably the hash of 'id' - confirm where it is set
char *intermediate_id; // TODO: remove it when the renaming script is fixed
char *chart_id; // suffix of the chart type built by cgroup_chart_type()
uint32_t hash_chart_id;
// 'cgroup_name' label value.
// by default this is the *id (path), later changed to the resolved name (cgroup-name.sh) or systemd service name.
char *name;
RRDLABELS *chart_labels;
int container_orchestrator; // compared against enum cgroups_container_orchestrator values
struct cpuacct_stat cpuacct_stat;
struct cpuacct_usage cpuacct_usage;
struct cpuacct_cpu_throttling cpuacct_cpu_throttling;
struct cpuacct_cpu_shares cpuacct_cpu_shares;
struct memory memory;
struct blkio io_service_bytes; // bytes
struct blkio io_serviced; // operations
struct blkio throttle_io_service_bytes; // bytes
struct blkio throttle_io_serviced; // operations
struct blkio io_merged; // operations
struct blkio io_queued; // operations
struct pids pids;
struct cgroup_network_interface *interfaces; // head of the linked list of network interfaces
struct pressure cpu_pressure;
struct pressure io_pressure;
struct pressure memory_pressure;
struct pressure irq_pressure;
// Cpu
RRDSET *st_cpu;
RRDDIM *st_cpu_rd_user;
RRDDIM *st_cpu_rd_system;
RRDSET *st_cpu_limit;
RRDSET *st_cpu_per_core;
RRDSET *st_cpu_nr_throttled;
RRDSET *st_cpu_throttled_time;
RRDSET *st_cpu_shares;
// Memory
RRDSET *st_mem;
RRDDIM *st_mem_rd_ram;
RRDDIM *st_mem_rd_swap;
RRDSET *st_mem_utilization;
RRDSET *st_writeback;
RRDSET *st_mem_activity;
RRDSET *st_pgfaults;
RRDSET *st_mem_usage;
RRDSET *st_mem_usage_limit;
RRDSET *st_mem_failcnt;
// Blkio
RRDSET *st_io;
RRDDIM *st_io_rd_read;
RRDDIM *st_io_rd_written;
RRDSET *st_serviced_ops;
RRDSET *st_throttle_io;
RRDDIM *st_throttle_io_rd_read;
RRDDIM *st_throttle_io_rd_written;
RRDSET *st_throttle_serviced_ops;
RRDSET *st_queued_ops;
RRDSET *st_merged_ops;
// Pids
RRDSET *st_pids;
RRDDIM *st_pids_rd_pids_current;
// per cgroup chart variables
// (each limit is kept as: the file it is read from, its value, and - where present - an acquired chart variable)
char *filename_cpuset_cpus;
unsigned long long cpuset_cpus;
char *filename_cpu_cfs_period;
unsigned long long cpu_cfs_period;
char *filename_cpu_cfs_quota;
unsigned long long cpu_cfs_quota;
const RRDVAR_ACQUIRED *chart_var_cpu_limit;
NETDATA_DOUBLE prev_cpu_usage;
char *filename_memory_limit;
unsigned long long memory_limit;
const RRDVAR_ACQUIRED *chart_var_memory_limit;
char *filename_memoryswap_limit;
unsigned long long memoryswap_limit;
const RRDVAR_ACQUIRED *chart_var_memoryswap_limit;
const DICTIONARY_ITEM *cgroup_netdev_link;
struct cgroup *next; // NOTE(review): presumably links the list rooted at 'cgroup_root' - confirm
struct cgroup *discovered_next; // NOTE(review): presumably links the list built by the discovery thread - confirm
};
// libuv thread handle, mutex and condition variable of the discovery thread,
// plus an 'exited' flag.
// NOTE(review): presumably 'exited' is set by the worker when it terminates - confirm in cgroup_discovery_worker().
struct discovery_thread {
uv_thread_t thread;
uv_mutex_t mutex;
uv_cond_t cond_var;
int exited;
};
// Globals shared across the plugin's source files (declarations only; the
// definitions live elsewhere) and the entry point of the discovery thread.
extern struct discovery_thread discovery_thread;
extern char *cgroups_rename_script;
// chart id prefixes, prepended to cg->chart_id by cgroup_chart_type()
extern char cgroup_chart_id_prefix[];
extern char services_chart_id_prefix[];
extern uv_mutex_t cgroup_root_mutex; // NOTE(review): presumably protects the 'cgroup_root' list - confirm at the lock sites
void cgroup_discovery_worker(void *ptr);
extern int is_inside_k8s;
extern long system_page_size;
// Plugin-wide settings (declarations only; the definitions live elsewhere).
// NOTE(review): the cgroup_enable_* switches presumably hold CONFIG_BOOLEAN_* values,
// like the per-cgroup 'enabled' fields - confirm against their definitions.
extern int cgroup_enable_cpuacct_stat;
extern int cgroup_enable_cpuacct_usage;
extern int cgroup_enable_cpuacct_cpu_throttling;
extern int cgroup_enable_cpuacct_cpu_shares;
extern int cgroup_enable_memory;
extern int cgroup_enable_detailed_memory;
extern int cgroup_enable_memory_failcnt;
extern int cgroup_enable_swap;
extern int cgroup_enable_blkio_io;
extern int cgroup_enable_blkio_ops;
extern int cgroup_enable_blkio_throttle_io;
extern int cgroup_enable_blkio_throttle_ops;
extern int cgroup_enable_blkio_merged_ops;
extern int cgroup_enable_blkio_queued_ops;
extern int cgroup_enable_pressure_cpu;
extern int cgroup_enable_pressure_io_some;
extern int cgroup_enable_pressure_io_full;
extern int cgroup_enable_pressure_memory_some;
extern int cgroup_enable_pressure_memory_full;
extern int cgroup_enable_pressure_irq_some;
extern int cgroup_enable_pressure_irq_full;
extern int cgroup_enable_systemd_services;
extern int cgroup_enable_systemd_services_detailed_memory;
extern int cgroup_used_memory;
extern int cgroup_use_unified_cgroups;
extern int cgroup_unified_exist;
extern int cgroup_search_in_devices;
extern int cgroup_check_for_new_every;
extern int cgroup_update_every;
extern int cgroup_containers_chart_priority;
// NOTE(review): presumably the iteration intervals behind the 'delay_counter*' fields - confirm in the readers
extern int cgroup_recheck_zero_blkio_every_iterations;
extern int cgroup_recheck_zero_mem_failcnt_every_iterations;
extern int cgroup_recheck_zero_mem_detailed_every_iterations;
// Base paths, one per cgroup controller plus the unified hierarchy
// (declarations only; the definitions live elsewhere).
extern char *cgroup_cpuacct_base;
extern char *cgroup_cpuset_base;
extern char *cgroup_blkio_base;
extern char *cgroup_memory_base;
extern char *cgroup_pids_base;
extern char *cgroup_devices_base;
extern char *cgroup_unified_base;
extern int cgroup_root_count;
extern int cgroup_root_max;
extern int cgroup_max_depth;
// simple patterns evaluated by the matches_*() helpers of this header
extern SIMPLE_PATTERN *enabled_cgroup_paths;
extern SIMPLE_PATTERN *enabled_cgroup_names;
extern SIMPLE_PATTERN *search_cgroup_paths;
extern SIMPLE_PATTERN *enabled_cgroup_renames;
extern SIMPLE_PATTERN *systemd_services_cgroups;
extern SIMPLE_PATTERN *entrypoint_parent_process_comm;
extern char *cgroups_network_interface_script;
extern int cgroups_check;
// NOTE(review): presumably precomputed hashes of the key names they are named after
// (e.g. "Read", "user_usec"), used while parsing cgroup files - confirm where they are initialized.
extern uint32_t Read_hash;
extern uint32_t Write_hash;
extern uint32_t user_hash;
extern uint32_t system_hash;
extern uint32_t user_usec_hash;
extern uint32_t system_usec_hash;
extern uint32_t nr_periods_hash;
extern uint32_t nr_throttled_hash;
extern uint32_t throttled_time_hash;
extern uint32_t throttled_usec_hash;
extern struct cgroup *cgroup_root; // head of the cgroups list (see 'next' in struct cgroup)
// shared memory object, its file descriptor and its semaphore
// NOTE(review): presumably used to share cgroup data with the eBPF plugin - confirm
extern netdata_ebpf_cgroup_shm_t shm_cgroup_ebpf;
extern int shm_fd_cgroup_ebpf;
extern sem_t *shm_mutex_cgroup_ebpf;
// Detected cgroups version; CGROUPS_AUTODETECT_FAIL (value 0) means the detection failed.
enum cgroups_type { CGROUPS_AUTODETECT_FAIL, CGROUPS_V1, CGROUPS_V2 };
// systemd cgroup hierarchy setting; SYSTEMD_CGROUP_ERR (value 0) is the error value.
enum cgroups_systemd_setting {
SYSTEMD_CGROUP_ERR,
SYSTEMD_CGROUP_LEGACY,
SYSTEMD_CGROUP_HYBRID,
SYSTEMD_CGROUP_UNIFIED
};
// Maps a name to a cgroups_systemd_setting value.
struct cgroups_systemd_config_setting {
char *name;
enum cgroups_systemd_setting setting;
};
// Table of the known name/setting pairs, defined elsewhere.
// NOTE(review): the array has no declared size here; check its definition for how the end is marked.
extern struct cgroups_systemd_config_setting cgroups_systemd_options[];
/**
 * Match a cgroup id against the 'enabled_cgroup_paths' simple pattern.
 *
 * The parameter is const-qualified because the string is only read, matching
 * the other pattern helpers of this header that already take 'const char *'.
 *
 * @param id the cgroup id to test; it is not modified
 * @return the result of simple_pattern_matches() for this pattern and id
 */
static inline int matches_enabled_cgroup_paths(const char *id) {
    return simple_pattern_matches(enabled_cgroup_paths, id);
}
/**
 * Match a cgroup name against the 'enabled_cgroup_names' simple pattern.
 *
 * The parameter is const-qualified because the string is only read, matching
 * the other pattern helpers of this header that already take 'const char *'.
 *
 * @param name the cgroup name to test; it is not modified
 * @return the result of simple_pattern_matches() for this pattern and name
 */
static inline int matches_enabled_cgroup_names(const char *name) {
    return simple_pattern_matches(enabled_cgroup_names, name);
}
/**
 * Match a cgroup id against the 'enabled_cgroup_renames' simple pattern.
 *
 * The parameter is const-qualified because the string is only read, matching
 * the other pattern helpers of this header that already take 'const char *'.
 *
 * @param id the cgroup id to test; it is not modified
 * @return the result of simple_pattern_matches() for this pattern and id
 */
static inline int matches_enabled_cgroup_renames(const char *id) {
    return simple_pattern_matches(enabled_cgroup_renames, id);
}
/**
 * Match a cgroup id against the 'systemd_services_cgroups' simple pattern.
 *
 * The parameter is const-qualified because the string is only read, matching
 * the other pattern helpers of this header that already take 'const char *'.
 *
 * @param id the cgroup id to test; it is not modified
 * @return the result of simple_pattern_matches() for this pattern and id
 */
static inline int matches_systemd_services_cgroups(const char *id) {
    return simple_pattern_matches(systemd_services_cgroups, id);
}
/**
 * Match a directory against the 'search_cgroup_paths' simple pattern.
 *
 * @param dir the directory to test; it is not modified
 * @return the result of simple_pattern_matches() for this pattern and dir
 */
static inline int matches_search_cgroup_paths(const char *dir) {
    const int matched = simple_pattern_matches(search_cgroup_paths, dir);

    return matched;
}
/**
 * Match a process comm against the 'entrypoint_parent_process_comm' simple pattern.
 *
 * @param comm the comm string to test; it is not modified
 * @return the result of simple_pattern_matches() for this pattern and comm
 */
static inline int matches_entrypoint_parent_process_comm(const char *comm) {
    const int matched = simple_pattern_matches(entrypoint_parent_process_comm, comm);

    return matched;
}
/**
 * Tell whether a cgroup carries the CGROUP_OPTIONS_SYSTEM_SLICE_SERVICE option.
 *
 * @param cg the cgroup to inspect; must not be NULL
 * @return the masked option bits cast to int: zero when the option is not set,
 *         non-zero when it is (not normalized to 1)
 */
static inline int is_cgroup_systemd_service(struct cgroup *cg) {
    const uint32_t service_bits = cg->options & CGROUP_OPTIONS_SYSTEM_SLICE_SERVICE;

    return (int)service_bits;
}
/**
 * Tell whether the container orchestrator of a cgroup is Kubernetes.
 *
 * @param cg the cgroup to inspect; must not be NULL
 * @return 1 when cg->container_orchestrator is CGROUPS_ORCHESTRATOR_K8S, 0 otherwise
 */
static inline int k8s_is_kubepod(struct cgroup *cg) {
    if (cg->container_orchestrator == CGROUPS_ORCHESTRATOR_K8S)
        return 1;

    return 0;
}
/**
 * Build the chart type string of a cgroup into a caller supplied buffer.
 *
 *  - empty chart_id, or exactly "/"  ->  "cgroup_root"
 *  - systemd service cgroup          ->  services_chart_id_prefix + chart_id
 *  - any other cgroup                ->  cgroup_chart_id_prefix + chart_id
 *
 * All three branches format with snprintfz(), so they share the same size
 * limit. The root case used to call strncpy(), which does not terminate the
 * destination when the source fills it and always zero-fills the rest of the
 * buffer.
 *
 * @param buffer destination; every branch passes RRD_ID_LENGTH_MAX as its size
 *               to snprintfz(), so the caller must provide a buffer that is
 *               large enough for that call
 * @param cg     the cgroup; cg->chart_id is dereferenced unconditionally and
 *               must not be NULL
 * @return buffer
 */
static inline char *cgroup_chart_type(char *buffer, struct cgroup *cg) {
    buffer[0] = '\0';

    if (cg->chart_id[0] == '\0' || (cg->chart_id[0] == '/' && cg->chart_id[1] == '\0'))
        snprintfz(buffer, RRD_ID_LENGTH_MAX, "%s", "cgroup_root");
    else if (is_cgroup_systemd_service(cg))
        snprintfz(buffer, RRD_ID_LENGTH_MAX, "%s%s", services_chart_id_prefix, cg->chart_id);
    else
        snprintfz(buffer, RRD_ID_LENGTH_MAX, "%s%s", cgroup_chart_id_prefix, cg->chart_id);

    return buffer;
}
// Help strings and entry points of the two "top" functions of the plugin.
// Both entry points receive an output BUFFER and the function string, and return an int.
// NOTE(review): the return value is presumably an HTTP-style response code - confirm in the definitions.
#define RRDFUNCTIONS_CGTOP_HELP "View running containers"
#define RRDFUNCTIONS_SYSTEMD_SERVICES_HELP "View systemd services"
int cgroup_function_cgroup_top(BUFFER *wb, const char *function);
int cgroup_function_systemd_top(BUFFER *wb, const char *function);
// cgroup <-> network device link (see 'cgroup_netdev_link' in struct cgroup).
// Declarations only; the definitions live elsewhere.
void cgroup_netdev_link_init(void);
const DICTIONARY_ITEM *cgroup_netdev_get(struct cgroup *cg);
void cgroup_netdev_delete(struct cgroup *cg);
// Chart update functions, one per chart; each takes the cgroup whose chart
// is updated. Declarations only; the definitions live elsewhere.

// cpu (the limit chart additionally receives the cpu limit)
void update_cpu_utilization_chart(struct cgroup *cg);
void update_cpu_utilization_limit_chart(struct cgroup *cg, NETDATA_DOUBLE cpu_limit);
void update_cpu_throttled_chart(struct cgroup *cg);
void update_cpu_throttled_duration_chart(struct cgroup *cg);
void update_cpu_shares_chart(struct cgroup *cg);
void update_cpu_per_core_usage_chart(struct cgroup *cg);
// memory (the limit and utilization charts additionally receive the memory limit)
void update_mem_usage_limit_chart(struct cgroup *cg, unsigned long long memory_limit);
void update_mem_utilization_chart(struct cgroup *cg, unsigned long long memory_limit);
void update_mem_usage_detailed_chart(struct cgroup *cg);
void update_mem_writeback_chart(struct cgroup *cg);
void update_mem_activity_chart(struct cgroup *cg);
void update_mem_pgfaults_chart(struct cgroup *cg);
void update_mem_failcnt_chart(struct cgroup *cg);
void update_mem_usage_chart(struct cgroup *cg);
// blkio
void update_io_serviced_bytes_chart(struct cgroup *cg);
void update_io_serviced_ops_chart(struct cgroup *cg);
void update_throttle_io_serviced_bytes_chart(struct cgroup *cg);
void update_throttle_io_serviced_ops_chart(struct cgroup *cg);
void update_io_queued_ops_chart(struct cgroup *cg);
void update_io_merged_ops_chart(struct cgroup *cg);
// pids
void update_pids_current_chart(struct cgroup *cg);
// pressure: cpu, memory, irq and io, each with some/full and a stall time variant
void update_cpu_some_pressure_chart(struct cgroup *cg);
void update_cpu_some_pressure_stall_time_chart(struct cgroup *cg);
void update_cpu_full_pressure_chart(struct cgroup *cg);
void update_cpu_full_pressure_stall_time_chart(struct cgroup *cg);
void update_mem_some_pressure_chart(struct cgroup *cg);
void update_mem_some_pressure_stall_time_chart(struct cgroup *cg);
void update_mem_full_pressure_chart(struct cgroup *cg);
void update_mem_full_pressure_stall_time_chart(struct cgroup *cg);
void update_irq_some_pressure_chart(struct cgroup *cg);
void update_irq_some_pressure_stall_time_chart(struct cgroup *cg);
void update_irq_full_pressure_chart(struct cgroup *cg);
void update_irq_full_pressure_stall_time_chart(struct cgroup *cg);
void update_io_some_pressure_chart(struct cgroup *cg);
void update_io_some_pressure_stall_time_chart(struct cgroup *cg);
void update_io_full_pressure_chart(struct cgroup *cg);
void update_io_full_pressure_stall_time_chart(struct cgroup *cg);
#endif // NETDATA_CGROUP_INTERNALS_H