mirror of
https://github.com/netdata/netdata.git
synced 2025-04-17 11:12:42 +00:00
Detect deadlock in dbengine page cache (#6911)
* Detect deadlock in dbengine page cache when there are too many metrics and print error message
* Resolve dbengine deadlock by dropping metrics when page cache is too small and define relevant alarms
* Changed printing deadlock errors to only happen once per dbengine instance
This commit is contained in:
parent
7977137cee
commit
2728be8b06
7 changed files with 97 additions and 32 deletions
|
@ -381,7 +381,7 @@ declare -A configs_signatures=(
|
|||
['7deb236ec68a512b9bdd18e6a51d76f7']='python.d/mysql.conf'
|
||||
['7e5fc1644aa7a54f9dbb1bd102521b09']='health.d/memcached.conf'
|
||||
['7f13631183fbdf79c21c8e5a171e9b34']='health.d/zfs.conf'
|
||||
['93674f3206872ae9c43ecbc54988413b']='health.d/dbengine.conf'
|
||||
['0fca55fc770c243ebfd8387c89059dd2']='health.d/dbengine.conf'
|
||||
['7fb8184d56a27040e73261ed9c6fc76f']='health_alarm_notify.conf'
|
||||
['80266bddd3df374923c750a6de91d120']='health.d/apache.conf'
|
||||
['803a7f9dcb942eeac0fd764b9e3e38ca']='fping.conf'
|
||||
|
|
|
@ -538,7 +538,7 @@ void global_statistics_charts(void) {
|
|||
unsigned long long stats_array[RRDENG_NR_STATS];
|
||||
|
||||
/* get localhost's DB engine's statistics */
|
||||
rrdeng_get_33_statistics(localhost->rrdeng_ctx, stats_array);
|
||||
rrdeng_get_35_statistics(localhost->rrdeng_ctx, stats_array);
|
||||
|
||||
// ----------------------------------------------------------------
|
||||
|
||||
|
@ -756,6 +756,8 @@ void global_statistics_charts(void) {
|
|||
static RRDSET *st_errors = NULL;
|
||||
static RRDDIM *rd_fs_errors = NULL;
|
||||
static RRDDIM *rd_io_errors = NULL;
|
||||
static RRDDIM *rd_pg_cache_warnings = NULL;
|
||||
static RRDDIM *rd_pg_cache_errors = NULL;
|
||||
|
||||
if (unlikely(!st_errors)) {
|
||||
st_errors = rrdset_create_localhost(
|
||||
|
@ -775,12 +777,17 @@ void global_statistics_charts(void) {
|
|||
|
||||
rd_io_errors = rrddim_add(st_errors, "I/O errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
|
||||
rd_fs_errors = rrddim_add(st_errors, "FS errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
|
||||
rd_pg_cache_warnings = rrddim_add(st_errors, "Page-Cache warnings", NULL, 1, 1,
|
||||
RRD_ALGORITHM_INCREMENTAL);
|
||||
rd_pg_cache_errors = rrddim_add(st_errors, "Page-Cache errors", NULL, 1, 1, RRD_ALGORITHM_INCREMENTAL);
|
||||
}
|
||||
else
|
||||
rrdset_next(st_errors);
|
||||
|
||||
rrddim_set_by_pointer(st_errors, rd_io_errors, (collected_number)stats_array[30]);
|
||||
rrddim_set_by_pointer(st_errors, rd_fs_errors, (collected_number)stats_array[31]);
|
||||
rrddim_set_by_pointer(st_errors, rd_pg_cache_warnings, (collected_number)stats_array[33]);
|
||||
rrddim_set_by_pointer(st_errors, rd_pg_cache_errors, (collected_number)stats_array[34]);
|
||||
rrdset_done(st_errors);
|
||||
}
|
||||
|
||||
|
|
|
@ -5,6 +5,8 @@
|
|||
|
||||
rrdeng_stats_t global_io_errors = 0;
|
||||
rrdeng_stats_t global_fs_errors = 0;
|
||||
rrdeng_stats_t global_pg_cache_warnings = 0;
|
||||
rrdeng_stats_t global_pg_cache_errors = 0;
|
||||
rrdeng_stats_t rrdeng_reserved_file_descriptors = 0;
|
||||
|
||||
void sanity_check(void)
|
||||
|
|
|
@ -148,12 +148,25 @@ struct rrdengine_statistics {
|
|||
rrdeng_stats_t page_cache_descriptors;
|
||||
rrdeng_stats_t io_errors;
|
||||
rrdeng_stats_t fs_errors;
|
||||
rrdeng_stats_t pg_cache_warnings;
|
||||
rrdeng_stats_t pg_cache_errors;
|
||||
};
|
||||
|
||||
/* I/O errors global counter */
|
||||
extern rrdeng_stats_t global_io_errors;
|
||||
/* File-System errors global counter */
|
||||
extern rrdeng_stats_t global_fs_errors;
|
||||
/*
|
||||
* Page cache warnings global counter.
|
||||
* Some page cache instance is near critical utilization where metrics will fail to be stored.
|
||||
*/
|
||||
extern rrdeng_stats_t global_pg_cache_warnings;
|
||||
/*
|
||||
* Page cache errors global counter.
|
||||
* Some page cache instance has hit critical utilization where metrics failed to be stored as a deadlock resolution
|
||||
* measure.
|
||||
*/
|
||||
extern rrdeng_stats_t global_pg_cache_errors;
|
||||
/* number of File-Descriptors that have been reserved by dbengine */
|
||||
extern rrdeng_stats_t rrdeng_reserved_file_descriptors;
|
||||
|
||||
|
|
|
@ -95,9 +95,8 @@ void rrdeng_store_metric_flush_current_page(RRDDIM *rd)
|
|||
if (likely(descr->page_length)) {
|
||||
int ret, page_is_empty;
|
||||
|
||||
#ifdef NETDATA_INTERNAL_CHECKS
|
||||
rrd_stat_atomic_add(&ctx->stats.metric_API_producers, -1);
|
||||
#endif
|
||||
|
||||
if (handle->prev_descr) {
|
||||
/* unpin old second page */
|
||||
pg_cache_put(ctx, handle->prev_descr);
|
||||
|
@ -192,9 +191,26 @@ void rrdeng_store_metric_next(RRDDIM *rd, usec_t point_in_time, storage_number n
|
|||
if (unlikely(INVALID_TIME == descr->start_time)) {
|
||||
descr->start_time = point_in_time;
|
||||
|
||||
#ifdef NETDATA_INTERNAL_CHECKS
|
||||
rrd_stat_atomic_add(&ctx->stats.metric_API_producers, 1);
|
||||
#endif
|
||||
|
||||
if (unlikely(((unsigned long)ctx->stats.metric_API_producers) >= ctx->max_cache_pages)) {
|
||||
if (0 == (unsigned long)ctx->stats.pg_cache_errors) {
|
||||
/* only print the first time */
|
||||
error("Deadlock detected in dbengine instance \"%s\", metric data will not be stored in the database"
|
||||
", please increase page cache size.", ctx->dbfiles_path);
|
||||
}
|
||||
rrd_stat_atomic_add(&ctx->stats.pg_cache_errors, 1);
|
||||
rrd_stat_atomic_add(&global_pg_cache_errors, 1);
|
||||
/* Resolve deadlock */
|
||||
descr->page_length = 0; /* make sure the page descriptor is deconstructed */
|
||||
rrdeng_store_metric_flush_current_page(rd);
|
||||
rrd_stat_atomic_add(&ctx->stats.metric_API_producers, -1);
|
||||
return;
|
||||
} else if (unlikely(((unsigned long)ctx->stats.metric_API_producers) >= ctx->cache_pages_low_watermark)) {
|
||||
rrd_stat_atomic_add(&ctx->stats.pg_cache_warnings, 1);
|
||||
rrd_stat_atomic_add(&global_pg_cache_warnings, 1);
|
||||
}
|
||||
|
||||
pg_cache_insert(ctx, handle->page_index, descr);
|
||||
} else {
|
||||
pg_cache_add_new_metric_time(handle->page_index, descr);
|
||||
|
@ -672,7 +688,7 @@ void *rrdeng_get_page(struct rrdengine_instance *ctx, uuid_t *id, usec_t point_i
|
|||
* You must not change the indices of the statistics or user code will break.
|
||||
* You must not exceed RRDENG_NR_STATS or it will crash.
|
||||
*/
|
||||
void rrdeng_get_33_statistics(struct rrdengine_instance *ctx, unsigned long long *array)
|
||||
void rrdeng_get_35_statistics(struct rrdengine_instance *ctx, unsigned long long *array)
|
||||
{
|
||||
struct page_cache *pg_cache = &ctx->pg_cache;
|
||||
|
||||
|
@ -709,7 +725,9 @@ void rrdeng_get_33_statistics(struct rrdengine_instance *ctx, unsigned long long
|
|||
array[30] = (uint64_t)global_io_errors;
|
||||
array[31] = (uint64_t)global_fs_errors;
|
||||
array[32] = (uint64_t)rrdeng_reserved_file_descriptors;
|
||||
assert(RRDENG_NR_STATS == 33);
|
||||
array[33] = (uint64_t)global_pg_cache_warnings;
|
||||
array[34] = (uint64_t)global_pg_cache_errors;
|
||||
assert(RRDENG_NR_STATS == 35);
|
||||
}
|
||||
|
||||
/* Releases reference to page */
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
#define RRDENG_MIN_PAGE_CACHE_SIZE_MB (32)
|
||||
#define RRDENG_MIN_DISK_SPACE_MB (256)
|
||||
|
||||
#define RRDENG_NR_STATS (33)
|
||||
#define RRDENG_NR_STATS (35)
|
||||
|
||||
#define RRDENG_FD_BUDGET_PER_INSTANCE (50)
|
||||
|
||||
|
@ -41,7 +41,7 @@ extern int rrdeng_load_metric_is_finished(struct rrddim_query_handle *rrdimm_han
|
|||
extern void rrdeng_load_metric_finalize(struct rrddim_query_handle *rrdimm_handle);
|
||||
extern time_t rrdeng_metric_latest_time(RRDDIM *rd);
|
||||
extern time_t rrdeng_metric_oldest_time(RRDDIM *rd);
|
||||
extern void rrdeng_get_33_statistics(struct rrdengine_instance *ctx, unsigned long long *array);
|
||||
extern void rrdeng_get_35_statistics(struct rrdengine_instance *ctx, unsigned long long *array);
|
||||
|
||||
/* must call once before using anything */
|
||||
extern int rrdeng_init(struct rrdengine_instance **ctxp, char *dbfiles_path, unsigned page_cache_mb,
|
||||
|
|
|
@ -1,26 +1,51 @@
|
|||
|
||||
# you can disable an alarm notification by setting the 'to' line to: silent
|
||||
|
||||
alarm: 10min_dbengine_global_fs_errors
|
||||
on: netdata.dbengine_global_errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of FS errors
|
||||
units: errors
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
|
||||
to: sysadmin
|
||||
alarm: 10min_dbengine_global_fs_errors
|
||||
on: netdata.dbengine_global_errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of FS errors
|
||||
units: errors
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 15m multiplier 1.5 max 1h
|
||||
info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
|
||||
to: sysadmin
|
||||
|
||||
alarm: 10min_dbengine_global_io_errors
|
||||
on: netdata.dbengine_global_errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of I/O errors
|
||||
units: errors
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
|
||||
to: sysadmin
|
||||
alarm: 10min_dbengine_global_io_errors
|
||||
on: netdata.dbengine_global_errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
lookup: sum -10m unaligned of I/O errors
|
||||
units: errors
|
||||
every: 10s
|
||||
crit: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
|
||||
to: sysadmin
|
||||
|
||||
alarm: 10min_dbengine_global_page_cache_errors
|
||||
on: netdata.dbengine_global_errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
units: errors
|
||||
every: 10s
|
||||
lookup: sum -10m unaligned of Page-Cache errors
|
||||
crit: $this > 0
|
||||
repeat: warning 120s critical 10s
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of deadlocks dbengine resolved the last 10 minutes due to insufficient page cache size, metrics have been lost
|
||||
to: sysadmin
|
||||
|
||||
alarm: 10min_dbengine_global_page_cache_warnings
|
||||
on: netdata.dbengine_global_errors
|
||||
os: linux freebsd macos
|
||||
hosts: *
|
||||
units: errors
|
||||
every: 10s
|
||||
lookup: sum -10m unaligned of Page-Cache warnings
|
||||
warn: $this > 0
|
||||
delay: down 1h multiplier 1.5 max 3h
|
||||
info: number of times dbengine almost deadlocked the last 10 minutes due to insufficient page cache size
|
||||
to: sysadmin
|
||||
|
|
Loading…
Add table
Reference in a new issue