mirror of
https://github.com/netdata/netdata.git
synced 2025-04-27 06:10:43 +00:00
Fix CRC and I/O error handling in dbengine so that netdata is not halted and relevant error messages are printed and alarms are raised (#6452)
This commit is contained in:
parent
b548214e35
commit
192a868b03
4 changed files with 39 additions and 21 deletions
|
@ -381,7 +381,7 @@ declare -A configs_signatures=(
|
||||||
['7deb236ec68a512b9bdd18e6a51d76f7']='python.d/mysql.conf'
|
['7deb236ec68a512b9bdd18e6a51d76f7']='python.d/mysql.conf'
|
||||||
['7e5fc1644aa7a54f9dbb1bd102521b09']='health.d/memcached.conf'
|
['7e5fc1644aa7a54f9dbb1bd102521b09']='health.d/memcached.conf'
|
||||||
['7f13631183fbdf79c21c8e5a171e9b34']='health.d/zfs.conf'
|
['7f13631183fbdf79c21c8e5a171e9b34']='health.d/zfs.conf'
|
||||||
['ce285c90747428ee5da4efb547418dda']='health.d/dbengine.conf'
|
['93674f3206872ae9c43ecbc54988413b']='health.d/dbengine.conf'
|
||||||
['7fb8184d56a27040e73261ed9c6fc76f']='health_alarm_notify.conf'
|
['7fb8184d56a27040e73261ed9c6fc76f']='health_alarm_notify.conf'
|
||||||
['80266bddd3df374923c750a6de91d120']='health.d/apache.conf'
|
['80266bddd3df374923c750a6de91d120']='health.d/apache.conf'
|
||||||
['803a7f9dcb942eeac0fd764b9e3e38ca']='fping.conf'
|
['803a7f9dcb942eeac0fd764b9e3e38ca']='fping.conf'
|
||||||
|
|
|
@ -3,13 +3,18 @@
|
||||||
|
|
||||||
static void flush_transaction_buffer_cb(uv_fs_t* req)
|
static void flush_transaction_buffer_cb(uv_fs_t* req)
|
||||||
{
|
{
|
||||||
struct generic_io_descriptor *io_descr;
|
struct generic_io_descriptor *io_descr = req->data;
|
||||||
|
struct rrdengine_worker_config* wc = req->loop->data;
|
||||||
|
struct rrdengine_instance *ctx = wc->ctx;
|
||||||
|
|
||||||
debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__);
|
debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__);
|
||||||
if (req->result < 0) {
|
if (req->result < 0) {
|
||||||
fatal("%s: uv_fs_write: %s", __func__, uv_strerror((int)req->result));
|
++ctx->stats.io_errors;
|
||||||
|
rrd_stat_atomic_add(&global_io_errors, 1);
|
||||||
|
error("%s: uv_fs_write: %s", __func__, uv_strerror((int)req->result));
|
||||||
|
} else {
|
||||||
|
debug(D_RRDENGINE, "%s: Journal block was written to disk.", __func__);
|
||||||
}
|
}
|
||||||
io_descr = req->data;
|
|
||||||
|
|
||||||
uv_fs_req_cleanup(req);
|
uv_fs_req_cleanup(req);
|
||||||
free(io_descr->buf);
|
free(io_descr->buf);
|
||||||
|
@ -348,6 +353,7 @@ static unsigned replay_transaction(struct rrdengine_instance *ctx, struct rrdeng
|
||||||
ret = crc32cmp(jf_trailer->checksum, crc);
|
ret = crc32cmp(jf_trailer->checksum, crc);
|
||||||
debug(D_RRDENGINE, "Transaction %"PRIu64" was read from disk. CRC32 check: %s", *id, ret ? "FAILED" : "SUCCEEDED");
|
debug(D_RRDENGINE, "Transaction %"PRIu64" was read from disk. CRC32 check: %s", *id, ret ? "FAILED" : "SUCCEEDED");
|
||||||
if (unlikely(ret)) {
|
if (unlikely(ret)) {
|
||||||
|
error("Transaction %"PRIu64" was read from disk. CRC32 check: FAILED", *id);
|
||||||
return size_bytes;
|
return size_bytes;
|
||||||
}
|
}
|
||||||
switch (jf_header->type) {
|
switch (jf_header->type) {
|
||||||
|
|
|
@ -37,24 +37,29 @@ void read_extent_cb(uv_fs_t* req)
|
||||||
unsigned i, j, count;
|
unsigned i, j, count;
|
||||||
void *page, *uncompressed_buf = NULL;
|
void *page, *uncompressed_buf = NULL;
|
||||||
uint32_t payload_length, payload_offset, page_offset, uncompressed_payload_length;
|
uint32_t payload_length, payload_offset, page_offset, uncompressed_payload_length;
|
||||||
|
uint8_t have_read_error = 0;
|
||||||
/* persistent structures */
|
/* persistent structures */
|
||||||
struct rrdeng_df_extent_header *header;
|
struct rrdeng_df_extent_header *header;
|
||||||
struct rrdeng_df_extent_trailer *trailer;
|
struct rrdeng_df_extent_trailer *trailer;
|
||||||
uLong crc;
|
uLong crc;
|
||||||
|
|
||||||
xt_io_descr = req->data;
|
xt_io_descr = req->data;
|
||||||
if (req->result < 0) {
|
|
||||||
error("%s: uv_fs_read: %s", __func__, uv_strerror((int)req->result));
|
|
||||||
goto cleanup;
|
|
||||||
}
|
|
||||||
|
|
||||||
header = xt_io_descr->buf;
|
header = xt_io_descr->buf;
|
||||||
payload_length = header->payload_length;
|
payload_length = header->payload_length;
|
||||||
count = header->number_of_pages;
|
count = header->number_of_pages;
|
||||||
|
|
||||||
payload_offset = sizeof(*header) + sizeof(header->descr[0]) * count;
|
payload_offset = sizeof(*header) + sizeof(header->descr[0]) * count;
|
||||||
|
|
||||||
trailer = xt_io_descr->buf + xt_io_descr->bytes - sizeof(*trailer);
|
trailer = xt_io_descr->buf + xt_io_descr->bytes - sizeof(*trailer);
|
||||||
|
|
||||||
|
if (req->result < 0) {
|
||||||
|
struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile;
|
||||||
|
|
||||||
|
++ctx->stats.io_errors;
|
||||||
|
rrd_stat_atomic_add(&global_io_errors, 1);
|
||||||
|
have_read_error = 1;
|
||||||
|
error("%s: uv_fs_read - %s - extent at offset %"PRIu64"(%u) in datafile %u-%u.", __func__,
|
||||||
|
uv_strerror((int)req->result), xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno);
|
||||||
|
goto after_crc_check;
|
||||||
|
}
|
||||||
crc = crc32(0L, Z_NULL, 0);
|
crc = crc32(0L, Z_NULL, 0);
|
||||||
crc = crc32(crc, xt_io_descr->buf, xt_io_descr->bytes - sizeof(*trailer));
|
crc = crc32(crc, xt_io_descr->buf, xt_io_descr->bytes - sizeof(*trailer));
|
||||||
ret = crc32cmp(trailer->checksum, crc);
|
ret = crc32cmp(trailer->checksum, crc);
|
||||||
|
@ -66,12 +71,17 @@ void read_extent_cb(uv_fs_t* req)
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
if (unlikely(ret)) {
|
if (unlikely(ret)) {
|
||||||
/* TODO: handle errors */
|
struct rrdengine_datafile *datafile = xt_io_descr->descr_array[0]->extent->datafile;
|
||||||
exit(UV_EIO);
|
|
||||||
goto cleanup;
|
++ctx->stats.io_errors;
|
||||||
|
rrd_stat_atomic_add(&global_io_errors, 1);
|
||||||
|
have_read_error = 1;
|
||||||
|
error("%s: Extent at offset %"PRIu64"(%u) was read from datafile %u-%u. CRC32 check: FAILED", __func__,
|
||||||
|
xt_io_descr->pos, xt_io_descr->bytes, datafile->tier, datafile->fileno);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (RRD_NO_COMPRESSION != header->compression_algorithm) {
|
after_crc_check:
|
||||||
|
if (!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm) {
|
||||||
uncompressed_payload_length = 0;
|
uncompressed_payload_length = 0;
|
||||||
for (i = 0 ; i < count ; ++i) {
|
for (i = 0 ; i < count ; ++i) {
|
||||||
uncompressed_payload_length += header->descr[i].page_length;
|
uncompressed_payload_length += header->descr[i].page_length;
|
||||||
|
@ -99,7 +109,10 @@ void read_extent_cb(uv_fs_t* req)
|
||||||
page_offset += header->descr[j].page_length;
|
page_offset += header->descr[j].page_length;
|
||||||
}
|
}
|
||||||
/* care, we don't hold the descriptor mutex */
|
/* care, we don't hold the descriptor mutex */
|
||||||
if (RRD_NO_COMPRESSION == header->compression_algorithm) {
|
if (have_read_error) {
|
||||||
|
/* Applications should make sure NULL values match 0 as does SN_EMPTY_SLOT */
|
||||||
|
memset(page, 0, descr->page_length);
|
||||||
|
} else if (RRD_NO_COMPRESSION == header->compression_algorithm) {
|
||||||
(void) memcpy(page, xt_io_descr->buf + payload_offset + page_offset, descr->page_length);
|
(void) memcpy(page, xt_io_descr->buf + payload_offset + page_offset, descr->page_length);
|
||||||
} else {
|
} else {
|
||||||
(void) memcpy(page, uncompressed_buf + page_offset, descr->page_length);
|
(void) memcpy(page, uncompressed_buf + page_offset, descr->page_length);
|
||||||
|
@ -118,12 +131,11 @@ void read_extent_cb(uv_fs_t* req)
|
||||||
}
|
}
|
||||||
rrdeng_page_descr_mutex_unlock(ctx, descr);
|
rrdeng_page_descr_mutex_unlock(ctx, descr);
|
||||||
}
|
}
|
||||||
if (RRD_NO_COMPRESSION != header->compression_algorithm) {
|
if (!have_read_error && RRD_NO_COMPRESSION != header->compression_algorithm) {
|
||||||
freez(uncompressed_buf);
|
freez(uncompressed_buf);
|
||||||
}
|
}
|
||||||
if (xt_io_descr->completion)
|
if (xt_io_descr->completion)
|
||||||
complete(xt_io_descr->completion);
|
complete(xt_io_descr->completion);
|
||||||
cleanup:
|
|
||||||
uv_fs_req_cleanup(req);
|
uv_fs_req_cleanup(req);
|
||||||
free(xt_io_descr->buf);
|
free(xt_io_descr->buf);
|
||||||
freez(xt_io_descr);
|
freez(xt_io_descr);
|
||||||
|
@ -246,8 +258,9 @@ void flush_pages_cb(uv_fs_t* req)
|
||||||
|
|
||||||
xt_io_descr = req->data;
|
xt_io_descr = req->data;
|
||||||
if (req->result < 0) {
|
if (req->result < 0) {
|
||||||
|
++ctx->stats.io_errors;
|
||||||
|
rrd_stat_atomic_add(&global_io_errors, 1);
|
||||||
error("%s: uv_fs_write: %s", __func__, uv_strerror((int)req->result));
|
error("%s: uv_fs_write: %s", __func__, uv_strerror((int)req->result));
|
||||||
goto cleanup;
|
|
||||||
}
|
}
|
||||||
#ifdef NETDATA_INTERNAL_CHECKS
|
#ifdef NETDATA_INTERNAL_CHECKS
|
||||||
{
|
{
|
||||||
|
@ -279,7 +292,6 @@ void flush_pages_cb(uv_fs_t* req)
|
||||||
}
|
}
|
||||||
if (xt_io_descr->completion)
|
if (xt_io_descr->completion)
|
||||||
complete(xt_io_descr->completion);
|
complete(xt_io_descr->completion);
|
||||||
cleanup:
|
|
||||||
uv_fs_req_cleanup(req);
|
uv_fs_req_cleanup(req);
|
||||||
free(xt_io_descr->buf);
|
free(xt_io_descr->buf);
|
||||||
freez(xt_io_descr);
|
freez(xt_io_descr);
|
||||||
|
|
|
@ -22,5 +22,5 @@
|
||||||
every: 10s
|
every: 10s
|
||||||
crit: $this > 0
|
crit: $this > 0
|
||||||
delay: down 1h multiplier 1.5 max 3h
|
delay: down 1h multiplier 1.5 max 3h
|
||||||
info: number of IO errors dbengine came across the last 10 minutes (out of space, bad disk etc)
|
info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
|
||||||
to: sysadmin
|
to: sysadmin
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue