Provide new attributes in health conf files (#10961)

* read and store new attributes (class, component, type) from health conf files. Replace family variable in info strings * provide the attributes to jsons * remove extra semicolon * populate conf files with new attributes * added newline * remove extra defines from health.h * remove empty line * remove realloc * use helper variables for find_and_replace. Adjust position for next strstr * remove comments * Add type to mysql.conf and vcsa.conf * fix formatting * add parentheses * remove extra assignment * changes to mysql_galera_cluster_state from master * add type Errors to unbound_request_list_overwritten * fix indentation for info strings spanning more than one line * check for null, replace with empty string if true * add class, component, type to systemdunits.conf
2025-04-17 03:02:41 +00:00 · 2021-04-20 16:24:41 +03:00 · 2021-04-20 16:24:41 +03:00 · f5bd20e60a
commit f5bd20e60a
parent 0a6a14e323
93 changed files with 4230 additions and 3101 deletions
--- a/database/rrd.h
+++ b/database/rrd.h
@ -666,6 +666,10 @@ struct alarm_entry {

    char *family;

+    char *class;
+    char *component;
+    char *type;
+
    char *exec;
    char *recipient;
    time_t exec_run_timestamp;
--- a/database/rrdcalc.c
+++ b/database/rrdcalc.c
@ -91,6 +91,9 @@ static void rrdsetcalc_link(RRDSET *st, RRDCALC *rc) {
                rc->name,
                rc->rrdset->id,
                rc->rrdset->family,
+                rc->class,
+                rc->component,
+                rc->type,
                rc->exec,
                rc->recipient,
                now - rc->last_status_change,
@ -165,6 +168,9 @@ inline void rrdsetcalc_unlink(RRDCALC *rc) {
                rc->name,
                rc->rrdset->id,
                rc->rrdset->family,
+                rc->class,
+                rc->component,
+                rc->type,
                rc->exec,
                rc->recipient,
                now - rc->last_status_change,
@ -428,6 +434,10 @@ inline RRDCALC *rrdcalc_create_from_template(RRDHOST *host, RRDCALCTEMPLATE *rt,
    if(rt->units) rc->units = strdupz(rt->units);
    if(rt->info) rc->info = strdupz(rt->info);

+    if (rt->class) rc->class = strdupz(rt->class);
+    if (rt->component) rc->component = strdupz(rt->component);
+    if (rt->type) rc->type = strdupz(rt->type);
+
    if(rt->calculation) {
        rc->calculation = expression_parse(rt->calculation->source, NULL, NULL);
        if(!rc->calculation)
@ -535,6 +545,10 @@ inline RRDCALC *rrdcalc_create_from_rrdcalc(RRDCALC *rc, RRDHOST *host, const ch
    if(rc->units) newrc->units = strdupz(rc->units);
    if(rc->info) newrc->info = strdupz(rc->info);

+    if (rc->class) newrc->class = strdupz(rc->class);
+    if (rc->component) newrc->component = strdupz(rc->component);
+    if (rc->type) newrc->type = strdupz(rc->type);
+
    if(rc->calculation) {
        newrc->calculation = expression_parse(rc->calculation->source, NULL, NULL);
        if(!newrc->calculation)
@ -573,6 +587,9 @@ void rrdcalc_free(RRDCALC *rc) {
    freez(rc->source);
    freez(rc->units);
    freez(rc->info);
+    freez(rc->class);
+    freez(rc->component);
+    freez(rc->type);
    simple_pattern_free(rc->spdim);
    freez(rc->labels);
    simple_pattern_free(rc->splabels);
--- a/database/rrdcalc.h
+++ b/database/rrdcalc.h
@ -42,10 +42,13 @@ struct rrdcalc {
    char *exec;                     // the command to execute when this alarm switches state
    char *recipient;                // the recipient of the alarm (the first parameter to exec)

+    char *class;                    // the class that this alarm belongs to
+    char *component;                // the component that this alarm refers to
+    char *type;                     // type of the alarm
+
    char *chart;                    // the chart id this should be linked to
    uint32_t hash_chart;

-
    char *plugin_match;             //the plugin name that should be linked to
    SIMPLE_PATTERN *plugin_pattern;

--- a/database/rrdcalctemplate.h
+++ b/database/rrdcalctemplate.h
@ -15,6 +15,10 @@ struct rrdcalctemplate {
    char *exec;
    char *recipient;

+    char *class;
+    char *component;
+    char *type;
+
    char *context;
    uint32_t hash_context;

--- a/health/health.c
+++ b/health/health.c
@ -930,7 +930,7 @@ void *health_main(void *ptr) {
                        if(likely(!rrdcalc_isrepeating(rc))) {
                            ALARM_ENTRY *ae = health_create_alarm_entry(
                                    host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
-                                    rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
+                                    rc->rrdset->family, rc->class, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
                                    rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
                                    rc->delay_last,
                                    (
@ -980,7 +980,7 @@ void *health_main(void *ptr) {
                        rc->last_repeat = now;
                        ALARM_ENTRY *ae = health_create_alarm_entry(
                                host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
-                                rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
+                                rc->rrdset->family, rc->class, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
                                rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
                                rc->delay_last,
                                (
--- a/health/health.d/adaptec_raid.conf
+++ b/health/health.d/adaptec_raid.conf
@ -1,24 +1,30 @@

 # logical device status check

-template: adaptec_raid_ld_status
-      on: adaptec_raid.ld_status
-  lookup: max -10s foreach *
-   units: bool
-   every: 10s
-    crit: $this > 0
-   delay: down 5m multiplier 1.5 max 1h
-    info: logical device status is failed or degraded
-      to: sysadmin
+ template: adaptec_raid_ld_status
+       on: adaptec_raid.ld_status
+    class: System
+component: RAID
+     type: Errors
+   lookup: max -10s foreach *
+    units: bool
+    every: 10s
+     crit: $this > 0
+    delay: down 5m multiplier 1.5 max 1h
+     info: logical device status is failed or degraded
+       to: sysadmin

 # physical device state check

-template: adaptec_raid_pd_state
-      on: adaptec_raid.pd_state
-  lookup: max -10s foreach *
-   units: bool
-   every: 10s
-    crit: $this > 0
-   delay: down 5m multiplier 1.5 max 1h
-    info: physical device state is not online
-      to: sysadmin
+ template: adaptec_raid_pd_state
+       on: adaptec_raid.pd_state
+    class: System
+component: RAID
+     type: Errors
+   lookup: max -10s foreach *
+    units: bool
+    every: 10s
+     crit: $this > 0
+    delay: down 5m multiplier 1.5 max 1h
+     info: physical device state is not online
+       to: sysadmin
--- a/health/health.d/am2320.conf
+++ b/health/health.d/am2320.conf
@ -1,12 +1,15 @@
 # make sure am2320 is sending stats

-template: am2320_last_collected_secs
-      on: am2320.temperature
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: am2320_last_collected_secs
+       on: am2320.temperature
+    class: Other
+component: Sensors
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster
--- a/health/health.d/anomalies.conf
+++ b/health/health.d/anomalies.conf
@ -1,17 +1,23 @@
 # raise a warning alarm if an anomaly probability is consistently above 50%

-template: anomalies_anomaly_probabilities
-      on: anomalies.probability
-  lookup: average -2m foreach *
-   every: 1m
-    warn: $this > 50
-    info: average anomaly probability over the last 2 minutes
+ template: anomalies_anomaly_probabilities
+       on: anomalies.probability
+    class: Netdata
+component: ML
+     type: Errors
+   lookup: average -2m foreach *
+    every: 1m
+     warn: $this > 50
+     info: average anomaly probability over the last 2 minutes

 # raise a warning alarm if an anomaly flag is consistently firing

-template: anomalies_anomaly_flags
-      on: anomalies.anomaly
-  lookup: sum -2m foreach *
-   every: 1m
-    warn: $this > 10
-    info: number of anomalies in the last 2 minutes
+ template: anomalies_anomaly_flags
+       on: anomalies.anomaly
+    class: Netdata
+component: ML
+     type: Errors
+   lookup: sum -2m foreach *
+    every: 1m
+     warn: $this > 10
+     info: number of anomalies in the last 2 minutes
--- a/health/health.d/apache.conf
+++ b/health/health.d/apache.conf
@ -1,14 +1,17 @@

 # make sure apache is running

-template: apache_last_collected_secs
-      on: apache.requests
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: apache_last_collected_secs
+       on: apache.requests
+    class: Web Server
+component: Apache
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster

--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@ -1,40 +1,49 @@
 # you can disable an alarm notification by setting the 'to' line to: silent

-template: apcupsd_10min_ups_load
-      on: apcupsd.load
-      os: *
-   hosts: *
-  lookup: average -10m unaligned of percentage
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
-   delay: down 10m multiplier 1.5 max 1h
-    info: average UPS load over the last 10 minutes
-      to: sitemgr
+ template: apcupsd_10min_ups_load
+       on: apcupsd.load
+    class: Power Supply
+component: UPS
+     type: Utilization
+       os: *
+    hosts: *
+   lookup: average -10m unaligned of percentage
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 10m multiplier 1.5 max 1h
+     info: average UPS load over the last 10 minutes
+       to: sitemgr

 # Discussion in https://github.com/netdata/netdata/pull/3928:
 # Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
-template: apcupsd_ups_charge
-      on: apcupsd.charge
-      os: *
-   hosts: *
-  lookup: average -60s unaligned of charge
-   units: %
-   every: 60s
-    warn: $this < 100
-    crit: $this < (($status == $CRITICAL) ? (60) : (50))
-   delay: down 10m multiplier 1.5 max 1h
-    info: average UPS charge over the last minute
-      to: sitemgr
+ template: apcupsd_ups_charge
+       on: apcupsd.charge
+    class: Power Supply
+component: UPS
+     type: Errors
+       os: *
+    hosts: *
+   lookup: average -60s unaligned of charge
+    units: %
+    every: 60s
+     warn: $this < 100
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 10m multiplier 1.5 max 1h
+     info: average UPS charge over the last minute
+       to: sitemgr

-template: apcupsd_last_collected_secs
-      on: apcupsd.load
-    calc: $now - $last_collected_t
-   every: 10s
-   units: seconds ago
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sitemgr
+ template: apcupsd_last_collected_secs
+       on: apcupsd.load
+    class: Power Supply
+component: UPS device
+     type: Latency
+     calc: $now - $last_collected_t
+    every: 10s
+    units: seconds ago
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sitemgr
--- a/health/health.d/backend.conf
+++ b/health/health.d/backend.conf
@ -1,33 +1,42 @@
 # Alert that backends subsystem will be disabled soon
-   alarm: backend_metrics_eol
-      on: netdata.backend_metrics
-   units: boolean
-    calc: $now - $last_collected_t 
-   every: 1m
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 1h
-    info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
-      to: sysadmin
+    alarm: backend_metrics_eol
+       on: netdata.backend_metrics
+    class: Netdata
+component: Exporting engine
+     type: Errors
+    units: boolean
+     calc: $now - $last_collected_t
+    every: 1m
+     warn: $this > 0
+    delay: down 5m multiplier 1.5 max 1h
+     info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
+       to: sysadmin

 # make sure we are sending data to backend

-   alarm: backend_last_buffering
-      on: netdata.backend_metrics
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful buffering of backend data
-      to: dba
+    alarm: backend_last_buffering
+       on: netdata.backend_metrics
+    class: Netdata
+component: Exporting engine
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful buffering of backend data
+       to: dba

-   alarm: backend_metrics_sent
-      on: netdata.backend_metrics
-   units: %
-    calc: abs($sent) * 100 / abs($buffered)
-   every: 10s
-    warn: $this != 100
-   delay: down 5m multiplier 1.5 max 1h
-    info: percentage of metrics sent to the backend server
-      to: dba
+    alarm: backend_metrics_sent
+       on: netdata.backend_metrics
+    class: Netdata
+component: Exporting engine
+     type: Workload
+    units: %
+     calc: abs($sent) * 100 / abs($buffered)
+    every: 10s
+     warn: $this != 100
+    delay: down 5m multiplier 1.5 max 1h
+     info: percentage of metrics sent to the backend server
+       to: dba
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@ -1,24 +1,30 @@

-template: bcache_cache_errors
-      on: disk.bcache_cache_read_races
-  lookup: sum -1m unaligned absolute
-   units: errors
-   every: 1m
-    warn: $this > 0
-   delay: up 2m down 1h multiplier 1.5 max 2h
-    info: number of times data was read from the cache, \
-          the bucket was reused and invalidated in the last 10 minutes \
-          (when this occurs the data is reread from the backing device)
-      to: sysadmin
+ template: bcache_cache_errors
+       on: disk.bcache_cache_read_races
+    class: System
+component: Disk
+     type: Errors
+   lookup: sum -1m unaligned absolute
+    units: errors
+    every: 1m
+     warn: $this > 0
+    delay: up 2m down 1h multiplier 1.5 max 2h
+     info: number of times data was read from the cache, \
+           the bucket was reused and invalidated in the last 10 minutes \
+           (when this occurs the data is reread from the backing device)
+       to: sysadmin

-template: bcache_cache_dirty
-      on: disk.bcache_cache_alloc
-    calc: $dirty + $metadata + $undefined
-   units: %
-   every: 1m
-    warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
-    crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
-   delay: up 1m down 1h multiplier 1.5 max 2h
-    info: percentage of cache space used for dirty data and metadata \
-          (this usually means your SSD cache is too small)
-      to: sysadmin
+ template: bcache_cache_dirty
+       on: disk.bcache_cache_alloc
+    class: System
+component: Disk
+     type: Utilization
+     calc: $dirty + $metadata + $undefined
+    units: %
+    every: 1m
+     warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
+     crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: percentage of cache space used for dirty data and metadata \
+           (this usually means your SSD cache is too small)
+       to: sysadmin
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@ -1,17 +1,20 @@
 # get the number of buried jobs in all queues

-template: beanstalk_server_buried_jobs
-      on: beanstalk.current_jobs
-    calc: $buried
-   units: jobs
-   every: 10s
-    warn: $this > 0
-    crit: $this > 10
-   delay: up 0 down 5m multiplier 1.2 max 1h
-    info: number of buried jobs across all tubes. \
-          You need to manually kick them so they can be processed. \
-          Presence of buried jobs in a tube does not affect new jobs.
-      to: sysadmin
+ template: beanstalk_server_buried_jobs
+       on: beanstalk.current_jobs
+    class: Messaging
+component: Beanstalk
+     type: Workload
+     calc: $buried
+    units: jobs
+    every: 10s
+     warn: $this > 0
+     crit: $this > 10
+    delay: up 0 down 5m multiplier 1.2 max 1h
+     info: number of buried jobs across all tubes. \
+           You need to manually kick them so they can be processed. \
+           Presence of buried jobs in a tube does not affect new jobs.
+       to: sysadmin
      
 # get the number of buried jobs per queue

--- a/health/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
@ -1,9 +1,12 @@
-template: bind_rndc_stats_file_size
-      on: bind_rndc.stats_size
-   units: megabytes
-   every: 60
-    calc: $stats_size
-    warn: $this > 512
-    crit: $this > 1024
-    info: BIND statistics-file size
-      to: sysadmin
+ template: bind_rndc_stats_file_size
+       on: bind_rndc.stats_size
+    class: DNS
+component: BIND
+     type: Utilization
+    units: megabytes
+    every: 60
+     calc: $stats_size
+     warn: $this > 512
+     crit: $this > 1024
+     info: BIND statistics-file size
+       to: sysadmin
--- a/health/health.d/boinc.conf
+++ b/health/health.d/boinc.conf
@ -1,62 +1,74 @@
 # Alarms for various BOINC issues.

 # Warn on any compute errors encountered.
-template: boinc_compute_errors
-      on: boinc.states
-      os: *
-   hosts: *
-families: *
-  lookup: average -10m unaligned of comperror
-   units: tasks
-   every: 1m
-    warn: $this > 0
-    crit: $this > 1
-   delay: up 1m down 5m multiplier 1.5 max 1h
-    info: average number of compute errors over the last 10 minutes
-      to: sysadmin
+ template: boinc_compute_errors
+       on: boinc.states
+    class: Computing
+component: BOINC
+     type: Errors
+       os: *
+    hosts: *
+ families: *
+   lookup: average -10m unaligned of comperror
+    units: tasks
+    every: 1m
+     warn: $this > 0
+     crit: $this > 1
+    delay: up 1m down 5m multiplier 1.5 max 1h
+     info: average number of compute errors over the last 10 minutes
+       to: sysadmin

 # Warn on lots of upload errors
-template: boinc_upload_errors
-      on: boinc.states
-      os: *
-   hosts: *
-families: *
-  lookup: average -10m unaligned of upload_failed
-   units: tasks
-   every: 1m
-    warn: $this > 0
-    crit: $this > 1
-   delay: up 1m down 5m multiplier 1.5 max 1h
-    info: average number of failed uploads over the last 10 minutes
-      to: sysadmin
+ template: boinc_upload_errors
+       on: boinc.states
+    class: Computing
+component: BOINC
+     type: Errors
+       os: *
+    hosts: *
+ families: *
+   lookup: average -10m unaligned of upload_failed
+    units: tasks
+    every: 1m
+     warn: $this > 0
+     crit: $this > 1
+    delay: up 1m down 5m multiplier 1.5 max 1h
+     info: average number of failed uploads over the last 10 minutes
+       to: sysadmin

 # Warn on the task queue being empty
-template: boinc_total_tasks
-      on: boinc.tasks
-      os: *
-   hosts: *
-families: *
-  lookup: average -10m unaligned of total
-   units: tasks
-   every: 1m
-    warn: $this < 1
-    crit: $this < 0.1
-   delay: up 5m down 10m multiplier 1.5 max 1h
-    info: average number of total tasks over the last 10 minutes
-      to: sysadmin
+ template: boinc_total_tasks
+       on: boinc.tasks
+    class: Computing
+component: BOINC
+     type: Utilization
+       os: *
+    hosts: *
+ families: *
+   lookup: average -10m unaligned of total
+    units: tasks
+    every: 1m
+     warn: $this < 1
+     crit: $this < 0.1
+    delay: up 5m down 10m multiplier 1.5 max 1h
+     info: average number of total tasks over the last 10 minutes
+       to: sysadmin

 # Warn on no active tasks with a non-empty queue
-template: boinc_active_tasks
-      on: boinc.tasks
-      os: *
-   hosts: *
-families: *
-  lookup: average -10m unaligned of active
-    calc: ($boinc_total_tasks >= 1) ? ($this) : (inf)
-   units: tasks
-   every: 1m
-    warn: $this < 1
-    crit: $this < 0.1
-   delay: up 5m down 10m multiplier 1.5 max 1h
-    info: average number of active tasks over the last 10 minutes
-      to: sysadmin
+ template: boinc_active_tasks
+       on: boinc.tasks
+    class: Computing
+component: BOINC
+     type: Utilization
+       os: *
+    hosts: *
+ families: *
+   lookup: average -10m unaligned of active
+     calc: ($boinc_total_tasks >= 1) ? ($this) : (inf)
+    units: tasks
+    every: 1m
+     warn: $this < 1
+     crit: $this < 0.1
+    delay: up 5m down 10m multiplier 1.5 max 1h
+     info: average number of active tasks over the last 10 minutes
+       to: sysadmin
--- a/health/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
@ -1,56 +1,68 @@

-template: btrfs_allocated
-      on: btrfs.disk
-      os: *
-   hosts: *
-families: *
-    calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (90) : (95))
-    crit: $this > (($status == $CRITICAL) ? (95) : (98))
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: percentage of allocated BTRFS physical disk space
-      to: sysadmin
+ template: btrfs_allocated
+       on: btrfs.disk
+    class: System
+component: File system
+     type: Utilization
+       os: *
+    hosts: *
+ families: *
+     calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (90) : (95))
+     crit: $this > (($status == $CRITICAL) ? (95) : (98))
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: percentage of allocated BTRFS physical disk space
+       to: sysadmin

-template: btrfs_data
-      on: btrfs.data
-      os: *
-   hosts: *
-families: *
-    calc: $used * 100 / ($used + $free)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
-    crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: utilization of BTRFS data space
-      to: sysadmin
+ template: btrfs_data
+       on: btrfs.data
+    class: System
+component: File system
+     type: Utilization
+       os: *
+    hosts: *
+ families: *
+     calc: $used * 100 / ($used + $free)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
+     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: utilization of BTRFS data space
+       to: sysadmin

-template: btrfs_metadata
-      on: btrfs.metadata
-      os: *
-   hosts: *
-families: *
-    calc: ($used + $reserved) * 100 / ($used + $free + $reserved)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
-    crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: utilization of BTRFS metadata space
-      to: sysadmin
+ template: btrfs_metadata
+       on: btrfs.metadata
+    class: System
+component: File system
+     type: Utilization
+       os: *
+    hosts: *
+ families: *
+     calc: ($used + $reserved) * 100 / ($used + $free + $reserved)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
+     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: utilization of BTRFS metadata space
+       to: sysadmin

-template: btrfs_system
-      on: btrfs.system
-      os: *
-   hosts: *
-families: *
-    calc: $used * 100 / ($used + $free)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
-    crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: utilization of BTRFS system space
-      to: sysadmin
+ template: btrfs_system
+       on: btrfs.system
+    class: System
+component: File system
+     type: Utilization
+       os: *
+    hosts: *
+ families: *
+     calc: $used * 100 / ($used + $free)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
+     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: utilization of BTRFS system space
+       to: sysadmin
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@ -1,12 +1,15 @@
 # low ceph disk available

-template: ceph_cluster_space_usage
-      on: ceph.general_usage
-    calc: $used * 100 / ($used + $avail)
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING ) ? (85) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 5m multiplier 1.2 max 1h
-    info: cluster disk space utilization
-      to: sysadmin
+ template: ceph_cluster_space_usage
+       on: ceph.general_usage
+    class: Storage
+component: Ceph
+     type: Utilization
+     calc: $used * 100 / ($used + $avail)
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING ) ? (85) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 5m multiplier 1.2 max 1h
+     info: cluster disk space utilization
+       to: sysadmin
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@ -1,28 +1,34 @@

 # you can disable an alarm notification by setting the 'to' line to: silent

-template: cgroup_10min_cpu_usage
-      on: cgroup.cpu_limit
-      os: linux
-   hosts: *
-  lookup: average -10m unaligned
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (75) : (85))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
-   delay: down 15m multiplier 1.5 max 1h
-    info: average cgroup CPU utilization over the last 10 minutes
-      to: sysadmin
+ template: cgroup_10min_cpu_usage
+       on: cgroup.cpu_limit
+    class: Cgroups
+component: CPU
+     type: Utilization
+       os: linux
+    hosts: *
+   lookup: average -10m unaligned
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average cgroup CPU utilization over the last 10 minutes
+       to: sysadmin

-template: cgroup_ram_in_use
-      on: cgroup.mem_usage
-      os: linux
-   hosts: *
-    calc: ($ram) * 100 / $memory_limit
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: cgroup memory utilization
-      to: sysadmin
+ template: cgroup_ram_in_use
+       on: cgroup.mem_usage
+    class: Cgroups
+component: Memory
+     type: Utilization
+       os: linux
+    hosts: *
+     calc: ($ram) * 100 / $memory_limit
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: cgroup memory utilization
+       to: sysadmin
--- a/health/health.d/cockroachdb.conf
+++ b/health/health.d/cockroachdb.conf
@ -1,91 +1,115 @@

 # Availability

-template: cockroachdb_last_collected_secs
-      on: cockroachdb.live_nodes
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
+ template: cockroachdb_last_collected_secs
+       on: cockroachdb.live_nodes
+    class: Database
+component: CockroachDB
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: dba

 # Capacity

-template: cockroachdb_used_storage_capacity
-      on: cockroachdb.storage_used_capacity_percentage
-    calc: $capacity_used_percent
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (85))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
-   delay: down 15m multiplier 1.5 max 1h
-    info: storage capacity utilization
-      to: dba
+ template: cockroachdb_used_storage_capacity
+       on: cockroachdb.storage_used_capacity_percentage
+    class: Database
+component: CockroachDB
+     type: Utilization
+     calc: $capacity_used_percent
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: storage capacity utilization
+       to: dba

-template: cockroachdb_used_usable_storage_capacity
-      on: cockroachdb.storage_used_capacity_percentage
-    calc: $capacity_usable_used_percent
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (85))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
-   delay: down 15m multiplier 1.5 max 1h
-    info: storage usable space utilization
-      to: dba
+ template: cockroachdb_used_usable_storage_capacity
+       on: cockroachdb.storage_used_capacity_percentage
+    class: Database
+component: CockroachDB
+     type: Utilization
+     calc: $capacity_usable_used_percent
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: storage usable space utilization
+       to: dba

 # Replication

-template: cockroachdb_unavailable_ranges
-      on: cockroachdb.ranges_replication_problem
-    calc: $ranges_unavailable
-   units: num
-   every: 10s
-    warn: $this > 0
-   delay: down 15m multiplier 1.5 max 1h
-    info: number of ranges with fewer live replicas than the replication target
-      to: dba
+ template: cockroachdb_unavailable_ranges
+       on: cockroachdb.ranges_replication_problem
+    class: Database
+component: CockroachDB
+     type: Utilization
+     calc: $ranges_unavailable
+    units: num
+    every: 10s
+     warn: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of ranges with fewer live replicas than the replication target
+       to: dba

-template: cockroachdb_replicas_leaders_not_leaseholders
-      on: cockroachdb.replicas_leaders
-    calc: $replicas_leaders_not_leaseholders
-   units: num
-   every: 10s
-    warn: $this > 0
-   delay: down 15m multiplier 1.5 max 1h
-    info: number of replicas that are Raft leaders whose range lease is held by another store
-      to: dba
+ template: cockroachdb_replicas_leaders_not_leaseholders
+       on: cockroachdb.replicas_leaders
+    class: Database
+component: CockroachDB
+     type: Utilization
+     calc: $replicas_leaders_not_leaseholders
+    units: num
+    every: 10s
+     warn: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of replicas that are Raft leaders whose range lease is held by another store
+       to: dba

 # FD

-template: cockroachdb_open_file_descriptors_limit
-      on: cockroachdb.process_file_descriptors
-    calc: $sys_fd_open/$sys_fd_softlimit * 100
-   units: %
-   every: 10s
-    warn: $this > 80
-   delay: down 15m multiplier 1.5 max 1h
-    info: open file descriptors utilization (against softlimit)
-      to: dba
+ template: cockroachdb_open_file_descriptors_limit
+       on: cockroachdb.process_file_descriptors
+    class: Database
+component: CockroachDB
+     type: Utilization
+     calc: $sys_fd_open/$sys_fd_softlimit * 100
+    units: %
+    every: 10s
+     warn: $this > 80
+    delay: down 15m multiplier 1.5 max 1h
+     info: open file descriptors utilization (against softlimit)
+       to: dba

 # SQL

-template: cockroachdb_sql_active_connections
-      on: cockroachdb.sql_connections
-    calc: $sql_conns
-   units: active connections
-   every: 10s
-    info: number of active SQL connections
-      to: dba
+ template: cockroachdb_sql_active_connections
+       on: cockroachdb.sql_connections
+    class: Database
+component: CockroachDB
+     type: Utilization
+     calc: $sql_conns
+    units: active connections
+    every: 10s
+     info: number of active SQL connections
+       to: dba

-template: cockroachdb_sql_executed_statements_total_last_5m
-      on: cockroachdb.sql_statements_total
-  lookup: sum -5m absolute of sql_query_count
-   units: statements
-   every: 10s
-   warn: $this == 0 AND $cockroachdb_sql_active_connections != 0
-   delay: down 15m up 30s multiplier 1.5 max 1h
-    info: number of executed SQL statements in the last 5 minutes
-      to: dba
+ template: cockroachdb_sql_executed_statements_total_last_5m
+       on: cockroachdb.sql_statements_total
+    class: Database
+component: CockroachDB
+     type: Workload
+   lookup: sum -5m absolute of sql_query_count
+    units: statements
+    every: 10s
+     warn: $this == 0 AND $cockroachdb_sql_active_connections != 0
+    delay: down 15m up 30s multiplier 1.5 max 1h
+     info: number of executed SQL statements in the last 5 minutes
+       to: dba
--- a/health/health.d/couchdb.conf
+++ b/health/health.d/couchdb.conf
@ -1,13 +1,16 @@

 # make sure couchdb is running

-template: couchdb_last_collected_secs
-      on: couchdb.request_methods
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
+ template: couchdb_last_collected_secs
+       on: couchdb.request_methods
+    class: Database
+component: CouchDB
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: dba
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@ -1,55 +1,67 @@

 # you can disable an alarm notification by setting the 'to' line to: silent

-template: 10min_cpu_usage
-      on: system.cpu
-      os: linux
-   hosts: *
-  lookup: average -10m unaligned of user,system,softirq,irq,guest
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (75) : (85))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
-   delay: down 15m multiplier 1.5 max 1h
-    info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
-      to: sysadmin
+ template: 10min_cpu_usage
+       on: system.cpu
+    class: System
+component: CPU
+     type: Utilization
+       os: linux
+    hosts: *
+   lookup: average -10m unaligned of user,system,softirq,irq,guest
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
+       to: sysadmin

-template: 10min_cpu_iowait
-      on: system.cpu
-      os: linux
-   hosts: *
-  lookup: average -10m unaligned of iowait
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (20) : (40))
-    crit: $this > (($status == $CRITICAL) ? (40) : (50))
-   delay: down 15m multiplier 1.5 max 1h
-    info: average CPU iowait time over the last 10 minutes
-      to: sysadmin
+ template: 10min_cpu_iowait
+       on: system.cpu
+    class: System
+component: CPU
+     type: Utilization
+       os: linux
+    hosts: *
+   lookup: average -10m unaligned of iowait
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (20) : (40))
+     crit: $this > (($status == $CRITICAL) ? (40) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average CPU iowait time over the last 10 minutes
+       to: sysadmin

-template: 20min_steal_cpu
-      on: system.cpu
-      os: linux
-   hosts: *
-  lookup: average -20m unaligned of steal
-   units: %
-   every: 5m
-    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
-    crit: $this > (($status == $CRITICAL) ? (20) : (30))
-   delay: down 1h multiplier 1.5 max 2h
-    info: average CPU steal time over the last 20 minutes
-      to: sysadmin
+ template: 20min_steal_cpu
+       on: system.cpu
+    class: System
+component: CPU
+     type: Latency
+       os: linux
+    hosts: *
+   lookup: average -20m unaligned of steal
+    units: %
+    every: 5m
+     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+     crit: $this > (($status == $CRITICAL) ? (20) : (30))
+    delay: down 1h multiplier 1.5 max 2h
+     info: average CPU steal time over the last 20 minutes
+       to: sysadmin

 ## FreeBSD
-template: 10min_cpu_usage
-      on: system.cpu
-      os: freebsd
-   hosts: *
-  lookup: average -10m unaligned of user,system,interrupt
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (75) : (85))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
-   delay: down 15m multiplier 1.5 max 1h
-    info: average CPU utilization over the last 10 minutes (excluding nice)
-      to: sysadmin
+ template: 10min_cpu_usage
+       on: system.cpu
+    class: System
+component: CPU
+     type: Utilization
+       os: freebsd
+    hosts: *
+   lookup: average -10m unaligned of user,system,interrupt
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average CPU utilization over the last 10 minutes (excluding nice)
+       to: sysadmin
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@ -1,52 +1,64 @@

 # you can disable an alarm notification by setting the 'to' line to: silent

- alarm: 10min_dbengine_global_fs_errors
-    on: netdata.dbengine_global_errors
-    os: linux freebsd macos
- hosts: *
-lookup: sum -10m unaligned of fs_errors
- units: errors
- every: 10s
-  crit: $this > 0
- delay: down 15m multiplier 1.5 max 1h
-  info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
-    to: sysadmin
+    alarm: 10min_dbengine_global_fs_errors
+       on: netdata.dbengine_global_errors
+    class: Netdata
+component: DB engine
+     type: Errors
+       os: linux freebsd macos
+    hosts: *
+   lookup: sum -10m unaligned of fs_errors
+    units: errors
+    every: 10s
+     crit: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
+       to: sysadmin

- alarm: 10min_dbengine_global_io_errors
-    on: netdata.dbengine_global_errors
-    os: linux freebsd macos
- hosts: *
-lookup: sum -10m unaligned of io_errors
- units: errors
- every: 10s
-  crit: $this > 0
- delay: down 1h multiplier 1.5 max 3h
-  info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
-    to: sysadmin
+    alarm: 10min_dbengine_global_io_errors
+       on: netdata.dbengine_global_errors
+    class: Netdata
+component: DB engine
+     type: Errors
+       os: linux freebsd macos
+    hosts: *
+   lookup: sum -10m unaligned of io_errors
+    units: errors
+    every: 10s
+     crit: $this > 0
+    delay: down 1h multiplier 1.5 max 3h
+     info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
+       to: sysadmin

- alarm: 10min_dbengine_global_flushing_warnings
-    on: netdata.dbengine_global_errors
-    os: linux freebsd macos
- hosts: *
-lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
- units: errors
- every: 10s
-  warn: $this > 0
- delay: down 1h multiplier 1.5 max 3h
-  info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
-        Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
-    to: sysadmin
+    alarm: 10min_dbengine_global_flushing_warnings
+       on: netdata.dbengine_global_errors
+    class: Netdata
+component: DB engine
+     type: Errors
+       os: linux freebsd macos
+    hosts: *
+   lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
+    units: errors
+    every: 10s
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 3h
+     info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
+           Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
+       to: sysadmin

- alarm: 10min_dbengine_global_flushing_errors
-    on: netdata.dbengine_long_term_page_stats
-    os: linux freebsd macos
- hosts: *
-lookup: sum -10m unaligned of flushing_pressure_deletions
- units: pages
- every: 10s
-  crit: $this != 0
- delay: down 1h multiplier 1.5 max 3h
-  info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
-        Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
-    to: sysadmin
+    alarm: 10min_dbengine_global_flushing_errors
+       on: netdata.dbengine_long_term_page_stats
+    class: Netdata
+component: DB engine
+     type: Errors
+       os: linux freebsd macos
+    hosts: *
+   lookup: sum -10m unaligned of flushing_pressure_deletions
+    units: pages
+    every: 10s
+     crit: $this != 0
+    delay: down 1h multiplier 1.5 max 3h
+     info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
+           Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
+       to: sysadmin
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@ -9,33 +9,39 @@
 # raise an alarm if the disk is low on
 # available disk space

-template: disk_space_usage
-      on: disk.space
-      os: linux freebsd
-   hosts: *
-families: !/dev !/dev/* !/run !/run/* *
-    calc: $used * 100 / ($avail + $used)
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING ) ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: disk space utilization
-      to: sysadmin
+ template: disk_space_usage
+       on: disk.space
+    class: System
+component: Disk
+     type: Utilization
+       os: linux freebsd
+    hosts: *
+ families: !/dev !/dev/* !/run !/run/* *
+     calc: $used * 100 / ($avail + $used)
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING ) ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: disk $family space utilization
+       to: sysadmin

-template: disk_inode_usage
-      on: disk.inodes
-      os: linux freebsd
-   hosts: *
-families: !/dev !/dev/* !/run !/run/* *
-    calc: $used * 100 / ($avail + $used)
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: up 1m down 15m multiplier 1.5 max 1h
-    info: disk inode utilization
-      to: sysadmin
+ template: disk_inode_usage
+       on: disk.inodes
+    class: System
+component: Disk
+     type: Utilization
+       os: linux freebsd
+    hosts: *
+ families: !/dev !/dev/* !/run !/run/* *
+     calc: $used * 100 / ($avail + $used)
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: disk $family inode utilization
+       to: sysadmin


 # -----------------------------------------------------------------------------
@ -128,21 +134,24 @@ families: !/dev !/dev/* !/run !/run/* *
 # by calculating the average disk utilization
 # for the last 10 minutes

-template: 10min_disk_utilization
-      on: disk.util
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: average -10m unaligned
-   units: %
-   every: 1m
-   green: 90
-     red: 98
-    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
-    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
-   delay: down 15m multiplier 1.2 max 1h
-    info: average percentage of time the disk was busy over the last 10 minutes
-      to: silent
+ template: 10min_disk_utilization
+       on: disk.util
+    class: System
+component: Disk
+     type: Utilization
+       os: linux freebsd
+    hosts: *
+ families: *
+   lookup: average -10m unaligned
+    units: %
+    every: 1m
+    green: 90
+      red: 98
+     warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
+     crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
+    delay: down 15m multiplier 1.2 max 1h
+     info: average percentage of time $family disk was busy over the last 10 minutes
+       to: silent


 # raise an alarm if the disk backlog
@ -150,18 +159,21 @@ families: *
 # for 10 minutes
 # (i.e. the disk cannot catch up)

-template: 10min_disk_backlog
-      on: disk.backlog
-      os: linux
-   hosts: *
-families: *
-  lookup: average -10m unaligned
-   units: ms
-   every: 1m
-   green: 2000
-     red: 5000
-    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
-    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
-   delay: down 15m multiplier 1.2 max 1h
-    info: average disk backlog size over the last 10 minutes
-      to: silent
+ template: 10min_disk_backlog
+       on: disk.backlog
+    class: System
+component: Disk
+     type: Latency
+       os: linux
+    hosts: *
+ families: *
+   lookup: average -10m unaligned
+    units: ms
+    every: 1m
+    green: 2000
+      red: 5000
+     warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
+     crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
+    delay: down 15m multiplier 1.2 max 1h
+     info: average backlog size of the $family disk over the last 10 minutes
+       to: silent
--- a/health/health.d/dns_query.conf
+++ b/health/health.d/dns_query.conf
@ -1,12 +1,15 @@

 # detect dns query failure

-template: dns_query_time_query_time
-      on: dns_query_time.query_time
-  lookup: average -10s unaligned foreach *
-   units: ms
-   every: 10s
-    warn: $this == nan
-   delay: up 20s down 5m multiplier 1.5 max 1h
-    info: average DNS query round trip time over the last 10 seconds
-      to: sysadmin
+ template: dns_query_time_query_time
+       on: dns_query_time.query_time
+    class: DNS
+component: DNS
+     type: Latency
+   lookup: average -10s unaligned foreach *
+    units: ms
+    every: 10s
+     warn: $this == nan
+    delay: up 20s down 5m multiplier 1.5 max 1h
+     info: average DNS query round trip time over the last 10 seconds
+       to: sysadmin
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@ -1,12 +1,15 @@
 # dhcp-range utilization

-template: dnsmasq_dhcp_dhcp_range_utilization
-      on: dnsmasq_dhcp.dhcp_range_utilization
-   every: 10s
-   units: %
-    calc: $used
-    warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
-    crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
-   delay: down 5m
-    info: DHCP range utilization
-      to: sysadmin
+ template: dnsmasq_dhcp_dhcp_range_utilization
+       on: dnsmasq_dhcp.dhcp_range_utilization
+    class: DHCP
+component: Dnsmasq
+     type: Utilization
+    every: 10s
+    units: %
+     calc: $used
+     warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
+     crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
+    delay: down 5m
+     info: DHCP range utilization
+       to: sysadmin
--- a/health/health.d/dockerd.conf
+++ b/health/health.d/dockerd.conf
@ -1,8 +1,11 @@
-template: docker_unhealthy_containers
-      on: docker.unhealthy_containers
-   units: unhealthy containers
-   every: 10s
-  lookup: average -10s
-    crit: $this > 0
-    info: average number of unhealthy docker containers over the last 10 seconds
-      to: sysadmin
+ template: docker_unhealthy_containers
+       on: docker.unhealthy_containers
+    class: Containers
+component: Docker
+     type: Errors
+    units: unhealthy containers
+    every: 10s
+   lookup: average -10s
+     crit: $this > 0
+     info: average number of unhealthy docker containers over the last 10 seconds
+       to: sysadmin
--- a/health/health.d/elasticsearch.conf
+++ b/health/health.d/elasticsearch.conf
@ -1,12 +1,15 @@

 # make sure elasticsearch is running

-template: elasticsearch_last_collected
-      on: elasticsearch.cluster_health_status
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: elasticsearch_last_collected
+       on: elasticsearch.cluster_health_status
+    class: Search engine
+component: Elasticsearch
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+     info: number of seconds since the last successful data collection
+       to: sysadmin
--- a/health/health.d/entropy.conf
+++ b/health/health.d/entropy.conf
@ -3,14 +3,17 @@
 # the alarm is checked every 5 minutes
 # and examines the last 5 minutes of data

-   alarm: lowest_entropy
-      on: system.entropy
-      os: linux
-   hosts: *
-  lookup: min -5m unaligned
-   units: entries
-   every: 5m
-    warn: $this < (($status >= $WARNING) ? (200) : (100))
-   delay: down 1h multiplier 1.5 max 2h
-    info: minimum number of entries in the random numbers pool in the last 5 minutes
-      to: silent
+    alarm: lowest_entropy
+       on: system.entropy
+    class: System
+component: Cryptography
+     type: Utilization
+       os: linux
+    hosts: *
+   lookup: min -5m unaligned
+    units: entries
+    every: 5m
+     warn: $this < (($status >= $WARNING) ? (200) : (100))
+    delay: down 1h multiplier 1.5 max 2h
+     info: minimum number of entries in the random numbers pool in the last 5 minutes
+       to: silent
--- a/health/health.d/exporting.conf
+++ b/health/health.d/exporting.conf
@ -11,13 +11,16 @@ families: *
    info: number of seconds since the last successful buffering of exporting data
      to: dba

-template: exporting_metrics_sent
-families: *
-      on: exporting_data_size
-   units: %
-    calc: abs($sent) * 100 / abs($buffered)
-   every: 10s
-    warn: $this != 100
-   delay: down 5m multiplier 1.5 max 1h
-    info: percentage of metrics sent to the external database server
-      to: dba
+ template: exporting_metrics_sent
+ families: *
+       on: exporting_data_size
+    class: Netdata
+component: Exporting engine
+     type: Workload
+    units: %
+     calc: abs($sent) * 100 / abs($buffered)
+    every: 10s
+     warn: $this != 100
+    delay: down 5m multiplier 1.5 max 1h
+     info: percentage of metrics sent to the external database server
+       to: dba
--- a/health/health.d/fping.conf
+++ b/health/health.d/fping.conf
@ -1,52 +1,64 @@

-template: fping_last_collected_secs
-families: *
-      on: fping.latency
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: fping_last_collected_secs
+ families: *
+       on: fping.latency
+    class: Other
+component: Network
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin

-template: fping_host_reachable
-families: *
-      on: fping.latency
-    calc: $average != nan
-   units: up/down
-   every: 10s
-    crit: $this == 0
-   delay: down 30m multiplier 1.5 max 2h
-    info: reachability status of the network host (0: unreachable, 1: reachable)
-      to: sysadmin
+ template: fping_host_reachable
+ families: *
+       on: fping.latency
+    class: Other
+component: Network
+     type: Errors
+     calc: $average != nan
+    units: up/down
+    every: 10s
+     crit: $this == 0
+    delay: down 30m multiplier 1.5 max 2h
+     info: reachability status of the network host (0: unreachable, 1: reachable)
+       to: sysadmin

-template: fping_host_latency
-families: *
-      on: fping.latency
-  lookup: average -10s unaligned of average
-   units: ms
-   every: 10s
-   green: 500
-     red: 1000
-    warn: $this > $green OR $max > $red
-    crit: $this > $red
-   delay: down 30m multiplier 1.5 max 2h
-    info: average latency to the network host over the last 10 seconds
-      to: sysadmin
+ template: fping_host_latency
+ families: *
+       on: fping.latency
+    class: Other
+component: Network
+     type: Latency
+   lookup: average -10s unaligned of average
+    units: ms
+    every: 10s
+    green: 500
+      red: 1000
+     warn: $this > $green OR $max > $red
+     crit: $this > $red
+    delay: down 30m multiplier 1.5 max 2h
+     info: average latency to the network host over the last 10 seconds
+       to: sysadmin

-template: fping_packet_loss
-families: *
-      on: fping.quality
-  lookup: average -10m unaligned of returned
-    calc: 100 - $this
-   green: 1
-     red: 10
-   units: %
-   every: 10s
-    warn: $this > $green
-    crit: $this > $red
-   delay: down 30m multiplier 1.5 max 2h
-    info: packet loss ratio to the network host over the last 10 minutes
-      to: sysadmin
+ template: fping_packet_loss
+ families: *
+       on: fping.quality
+    class: Other
+component: Network
+     type: Errors
+   lookup: average -10m unaligned of returned
+     calc: 100 - $this
+    green: 1
+      red: 10
+    units: %
+    every: 10s
+     warn: $this > $green
+     crit: $this > $red
+    delay: down 30m multiplier 1.5 max 2h
+     info: packet loss ratio to the network host over the last 10 minutes
+       to: sysadmin
--- a/health/health.d/fronius.conf
+++ b/health/health.d/fronius.conf
@ -1,11 +1,14 @@
-template: fronius_last_collected_secs
-families: *
-      on: fronius.power
-    calc: $now - $last_collected_t
-   every: 10s
-   units: seconds ago
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sitemgr
+ template: fronius_last_collected_secs
+ families: *
+       on: fronius.power
+    class: Power Supply
+component: Solar
+     type: Latency
+     calc: $now - $last_collected_t
+    every: 10s
+    units: seconds ago
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sitemgr
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@ -1,22 +1,28 @@
 # make sure Gearman is running
-template: gearman_last_collected_secs
-      on: gearman.total_jobs
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: gearman_last_collected_secs
+       on: gearman.total_jobs
+    class: Computing
+component: Gearman
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin

-template: gearman_workers_queued
-      on: gearman.single_job
-  lookup: average -10m unaligned match-names of Queued
-   units: workers
-   every: 10s
-    warn: $this > 30000
-    crit: $this > 100000
-   delay: down 5m multiplier 1.5 max 1h
-    info: average number of queued jobs over the last 10 minutes
-      to: sysadmin
+ template: gearman_workers_queued
+       on: gearman.single_job
+    class: Computing
+component: Gearman
+     type: Latency
+   lookup: average -10m unaligned match-names of Queued
+    units: workers
+    every: 10s
+     warn: $this > 30000
+     crit: $this > 100000
+    delay: down 5m multiplier 1.5 max 1h
+     info: average number of queued jobs over the last 10 minutes
+       to: sysadmin
--- a/health/health.d/haproxy.conf
+++ b/health/health.d/haproxy.conf
@ -1,27 +1,36 @@
-template: haproxy_backend_server_status
-      on: haproxy_hs.down
-   units: failed servers
-   every: 10s
-  lookup: average -10s
-    crit: $this > 0
-    info: average number of failed haproxy backend servers over the last 10 seconds
-      to: sysadmin
+ template: haproxy_backend_server_status
+       on: haproxy_hs.down
+    class: Web Proxy
+component: HAProxy
+     type: Errors
+    units: failed servers
+    every: 10s
+   lookup: average -10s
+     crit: $this > 0
+     info: average number of failed haproxy backend servers over the last 10 seconds
+       to: sysadmin

-template: haproxy_backend_status
-      on: haproxy_hb.down
-   units: failed backend
-   every: 10s
-  lookup: average -10s
-    crit: $this > 0
-    info: average number of failed haproxy backends over the last 10 seconds
-      to: sysadmin
+ template: haproxy_backend_status
+       on: haproxy_hb.down
+    class: Web Proxy
+component: HAProxy
+     type: Errors
+    units: failed backends
+    every: 10s
+   lookup: average -10s
+     crit: $this > 0
+     info: average number of failed haproxy backends over the last 10 seconds
+       to: sysadmin

-template: haproxy_last_collected
-      on: haproxy_hb.down
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: haproxy_last_collected
+       on: haproxy_hb.down
+    class: Web Proxy
+component: HAProxy
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+     info: number of seconds since the last successful data collection
+       to: sysadmin
--- a/health/health.d/hdfs.conf
+++ b/health/health.d/hdfs.conf
@ -1,75 +1,93 @@

 # make sure hdfs is running

-template: hdfs_last_collected_secs
-      on: hdfs.heap_memory
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: hdfs_last_collected_secs
+       on: hdfs.heap_memory
+    class: Storage
+component: HDFS
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster


 # Common

-template: hdfs_capacity_usage
-      on: hdfs.capacity
-    calc: ($used) * 100 / ($used + $remaining)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (80) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: summary datanodes space capacity utilization
-      to: sysadmin
+ template: hdfs_capacity_usage
+       on: hdfs.capacity
+    class: Storage
+component: HDFS
+     type: Utilization
+     calc: ($used) * 100 / ($used + $remaining)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (80) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: summary datanodes space capacity utilization
+       to: sysadmin


 # NameNode

-template: hdfs_missing_blocks
-      on: hdfs.blocks
-    calc: $missing
-   units: missing blocks
-   every: 10s
-    warn: $this > 0
-   delay: down 15m multiplier 1.5 max 1h
-    info: number of missing blocks
-      to: sysadmin
+ template: hdfs_missing_blocks
+       on: hdfs.blocks
+    class: Storage
+component: HDFS
+     type: Errors
+     calc: $missing
+    units: missing blocks
+    every: 10s
+     warn: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of missing blocks
+       to: sysadmin


-template: hdfs_stale_nodes
-      on: hdfs.data_nodes
-    calc: $stale
-   units: dead nodes
-   every: 10s
-    warn: $this > 0
-   delay: down 15m multiplier 1.5 max 1h
-    info: number of datanodes marked stale due to delayed heartbeat
-      to: sysadmin
+ template: hdfs_stale_nodes
+       on: hdfs.data_nodes
+    class: Storage
+component: HDFS
+     type: Errors
+     calc: $stale
+    units: stale nodes
+    every: 10s
+     warn: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of datanodes marked stale due to delayed heartbeat
+       to: sysadmin


-template: hdfs_dead_nodes
-      on: hdfs.data_nodes
-    calc: $dead
-   units: dead nodes
-   every: 10s
-    crit: $this > 0
-   delay: down 15m multiplier 1.5 max 1h
-    info: number of datanodes which are currently dead
-      to: sysadmin
+ template: hdfs_dead_nodes
+       on: hdfs.data_nodes
+    class: Storage
+component: HDFS
+     type: Errors
+     calc: $dead
+    units: dead nodes
+    every: 10s
+     crit: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of datanodes which are currently dead
+       to: sysadmin


 # DataNode

-template: hdfs_num_failed_volumes
-      on: hdfs.num_failed_volumes
-    calc: $fsds_num_failed_volumes
-   units: failed volumes
-   every: 10s
-    warn: $this > 0
-   delay: down 15m multiplier 1.5 max 1h
-    info: number of failed volumes
-      to: sysadmin
+ template: hdfs_num_failed_volumes
+       on: hdfs.num_failed_volumes
+    class: Storage
+component: HDFS
+     type: Errors
+     calc: $fsds_num_failed_volumes
+    units: failed volumes
+    every: 10s
+     warn: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of failed volumes
+       to: sysadmin
--- a/health/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
@ -1,99 +1,126 @@
-template: httpcheck_last_collected_secs
-families: *
-      on: httpcheck.status
-    calc: $now - $last_collected_t
-   every: 10s
-   units: seconds ago
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: httpcheck_last_collected_secs
+ families: *
+       on: httpcheck.status
+    class: Other
+component: HTTP endpoint
+     type: Latency
+     calc: $now - $last_collected_t
+    every: 10s
+    units: seconds ago
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin

 # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: httpcheck_web_service_up
-families: *
-      on: httpcheck.status
-  lookup: average -1m unaligned percentage of success
-    calc: ($this < 75) ? (0) : ($this)
-   every: 5s
-   units: up/down
-    info: average ratio of successful HTTP requests over the last minute (at least 75%)
-      to: silent
+ template: httpcheck_web_service_up
+ families: *
+       on: httpcheck.status
+    class: Web Server
+component: HTTP endpoint
+     type: Utilization
+   lookup: average -1m unaligned percentage of success
+     calc: ($this < 75) ? (0) : ($this)
+    every: 5s
+    units: up/down
+     info: average ratio of successful HTTP requests over the last minute (at least 75%)
+       to: silent

-template: httpcheck_web_service_bad_content
-families: *
-      on: httpcheck.status
-  lookup: average -5m unaligned percentage of bad_content
-   every: 10s
-   units: %
-    warn: $this >= 10 AND $this < 40
-    crit: $this >= 40
-   delay: down 5m multiplier 1.5 max 1h
-    info: average ratio of HTTP responses with unexpected content over the last 5 minutes
- options: no-clear-notification
-      to: webmaster
+ template: httpcheck_web_service_bad_content
+ families: *
+       on: httpcheck.status
+    class: Web Server
+component: HTTP endpoint
+     type: Workload
+   lookup: average -5m unaligned percentage of bad_content
+    every: 10s
+    units: %
+     warn: $this >= 10 AND $this < 40
+     crit: $this >= 40
+    delay: down 5m multiplier 1.5 max 1h
+     info: average ratio of HTTP responses with unexpected content over the last 5 minutes
+  options: no-clear-notification
+       to: webmaster

-template: httpcheck_web_service_bad_status
-families: *
-      on: httpcheck.status
-  lookup: average -5m unaligned percentage of bad_status
-   every: 10s
-   units: %
-    warn: $this >= 10 AND $this < 40
-    crit: $this >= 40
-   delay: down 5m multiplier 1.5 max 1h
-    info: average ratio of HTTP responses with unexpected status over the last 5 minutes
- options: no-clear-notification
-      to: webmaster
+ template: httpcheck_web_service_bad_status
+ families: *
+       on: httpcheck.status
+    class: Web Server
+component: HTTP endpoint
+     type: Workload
+   lookup: average -5m unaligned percentage of bad_status
+    every: 10s
+    units: %
+     warn: $this >= 10 AND $this < 40
+     crit: $this >= 40
+    delay: down 5m multiplier 1.5 max 1h
+     info: average ratio of HTTP responses with unexpected status over the last 5 minutes
+  options: no-clear-notification
+       to: webmaster

-template: httpcheck_web_service_timeouts
-families: *
-      on: httpcheck.status
-  lookup: average -5m unaligned percentage of timeout
-   every: 10s
-   units: %
-    info: average ratio of HTTP request timeouts over the last 5 minutes
+ template: httpcheck_web_service_timeouts
+ families: *
+       on: httpcheck.status
+    class: Web Server
+component: HTTP endpoint
+     type: Latency
+   lookup: average -5m unaligned percentage of timeout
+    every: 10s
+    units: %
+     info: average ratio of HTTP request timeouts over the last 5 minutes

-template: httpcheck_no_web_service_connections
-families: *
-      on: httpcheck.status
-  lookup: average -5m unaligned percentage of no_connection
-   every: 10s
-   units: %
-    info: average ratio of failed requests during the last 5 minutes
+ template: httpcheck_no_web_service_connections
+ families: *
+       on: httpcheck.status
+    class: Other
+component: HTTP endpoint
+     type: Errors
+   lookup: average -5m unaligned percentage of no_connection
+    every: 10s
+    units: %
+     info: average ratio of failed requests during the last 5 minutes

 # combined timeout & no connection alarm
-template: httpcheck_web_service_unreachable
-families: *
-      on: httpcheck.status
-    calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
-   units: %
-   every: 10s
-    warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40)
-    crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40
-   delay: down 5m multiplier 1.5 max 1h
-    info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes
- options: no-clear-notification
-      to: webmaster
+ template: httpcheck_web_service_unreachable
+ families: *
+       on: httpcheck.status
+    class: Web Server
+component: HTTP endpoint
+     type: Errors
+     calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
+    units: %
+    every: 10s
+     warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40)
+     crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40
+    delay: down 5m multiplier 1.5 max 1h
+     info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes
+  options: no-clear-notification
+       to: webmaster

-template: httpcheck_1h_web_service_response_time
-families: *
-      on: httpcheck.responsetime
-  lookup: average -1h unaligned of time
-   every: 30s
-   units: ms
-    info: average HTTP response time over the last hour
+ template: httpcheck_1h_web_service_response_time
+ families: *
+       on: httpcheck.responsetime
+    class: Other
+component: HTTP endpoint
+     type: Latency
+   lookup: average -1h unaligned of time
+    every: 30s
+    units: ms
+     info: average HTTP response time over the last hour

-template: httpcheck_web_service_slow
-families: *
-      on: httpcheck.responsetime
-  lookup: average -3m unaligned of time
-   units: ms
-   every: 10s
-    warn: ($this > ($httpcheck_1h_web_service_response_time * 2) )
-    crit: ($this > ($httpcheck_1h_web_service_response_time * 3) )
-   delay: down 5m multiplier 1.5 max 1h
-    info: average HTTP response time over the last 3 minutes, compared to the average over the last hour
- options: no-clear-notification
-      to: webmaster
+ template: httpcheck_web_service_slow
+ families: *
+       on: httpcheck.responsetime
+    class: Web Server
+component: HTTP endpoint
+     type: Latency
+   lookup: average -3m unaligned of time
+    units: ms
+    every: 10s
+     warn: ($this > ($httpcheck_1h_web_service_response_time * 2) )
+     crit: ($this > ($httpcheck_1h_web_service_response_time * 3) )
+    delay: down 5m multiplier 1.5 max 1h
+     info: average HTTP response time over the last 3 minutes, compared to the average over the last hour
+  options: no-clear-notification
+       to: webmaster
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@ -1,13 +1,16 @@
-template: ioping_disk_latency
-families: *
-      on: ioping.latency
-  lookup: average -10s unaligned of average
-   units: ms
-   every: 10s
-   green: 500
-     red: 1000
-    warn: $this > $green OR $max > $red
-    crit: $this > $red
-   delay: down 30m multiplier 1.5 max 2h
-    info: average I/O latency over the last 10 seconds
-      to: sysadmin
+ template: ioping_disk_latency
+ families: *
+       on: ioping.latency
+    class: System
+component: Disk
+     type: Latency
+   lookup: average -10s unaligned of average
+    units: ms
+    every: 10s
+    green: 500
+      red: 1000
+     warn: $this > $green OR $max > $red
+     crit: $this > $red
+    delay: down 30m multiplier 1.5 max 2h
+     info: average I/O latency over the last 10 seconds
+       to: sysadmin
--- a/health/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@ -1,28 +1,34 @@

 # you can disable an alarm notification by setting the 'to' line to: silent

-   alarm: semaphores_used
-      on: system.ipc_semaphores
-      os: linux
-   hosts: *
-    calc: $semaphores * 100 / $ipc_semaphores_max
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (70) : (90))
-   delay: down 5m multiplier 1.5 max 1h
-    info: IPC semaphore utilization
-      to: sysadmin
+    alarm: semaphores_used
+       on: system.ipc_semaphores
+    class: System
+component: IPC
+     type: Utilization
+       os: linux
+    hosts: *
+     calc: $semaphores * 100 / $ipc_semaphores_max
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (70) : (90))
+    delay: down 5m multiplier 1.5 max 1h
+     info: IPC semaphore utilization
+       to: sysadmin

-   alarm: semaphore_arrays_used
-      on: system.ipc_semaphore_arrays
-      os: linux
-   hosts: *
-    calc: $arrays * 100 / $ipc_semaphores_arrays_max
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (70) : (90))
-   delay: down 5m multiplier 1.5 max 1h
-    info: IPC semaphore arrays utilization
-      to: sysadmin
+    alarm: semaphore_arrays_used
+       on: system.ipc_semaphore_arrays
+    class: System
+component: IPC
+     type: Utilization
+       os: linux
+    hosts: *
+     calc: $arrays * 100 / $ipc_semaphores_arrays_max
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (70) : (90))
+    delay: down 5m multiplier 1.5 max 1h
+     info: IPC semaphore arrays utilization
+       to: sysadmin
--- a/health/health.d/ipfs.conf
+++ b/health/health.d/ipfs.conf
@ -1,11 +1,14 @@

-template: ipfs_datastore_usage
-      on: ipfs.repo_size
-    calc: $size * 100 / $avail
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: IPFS datastore utilization
-      to: sysadmin
+ template: ipfs_datastore_usage
+       on: ipfs.repo_size
+    class: Data Sharing
+component: IPFS
+     type: Utilization
+     calc: $size * 100 / $avail
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: IPFS datastore utilization
+       to: sysadmin
--- a/health/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
@ -1,20 +1,26 @@
-   alarm: ipmi_sensors_states
-      on: ipmi.sensors_states
-    calc: $warning + $critical
-   units: sensors
-   every: 10s
-    warn: $this > 0
-    crit: $critical > 0
-   delay: up 5m down 15m multiplier 1.5 max 1h
-    info: number of IPMI sensors in non-nominal state
-      to: sysadmin
+    alarm: ipmi_sensors_states
+       on: ipmi.sensors_states
+    class: System
+component: IPMI
+     type: Errors
+     calc: $warning + $critical
+    units: sensors
+    every: 10s
+     warn: $this > 0
+     crit: $critical > 0
+    delay: up 5m down 15m multiplier 1.5 max 1h
+     info: number of IPMI sensors in non-nominal state
+       to: sysadmin

-   alarm: ipmi_events
-      on: ipmi.events
-    calc: $events
-   units: events
-   every: 10s
-    warn: $this > 0
-   delay: up 5m down 15m multiplier 1.5 max 1h
-    info: number of events in the IPMI System Event Log (SEL)
-      to: sysadmin
+    alarm: ipmi_events
+       on: ipmi.events
+    class: System
+component: IPMI
+     type: Utilization
+     calc: $events
+    units: events
+    every: 10s
+     warn: $this > 0
+    delay: up 5m down 15m multiplier 1.5 max 1h
+     info: number of events in the IPMI System Event Log (SEL)
+       to: sysadmin
--- a/health/health.d/kubelet.conf
+++ b/health/health.d/kubelet.conf
@ -4,39 +4,48 @@

 # True (1) if the node is experiencing a configuration-related error, false (0) otherwise.

-   template: kubelet_node_config_error
-         on: k8s_kubelet.kubelet_node_config_error
-       calc: $kubelet_node_config_error
-      units: bool
-      every: 10s
-       warn: $this == 1
-      delay: down 1m multiplier 1.5 max 2h
-       info: the node is experiencing a configuration-related error (0: false, 1: true)
-         to: sysadmin
+ template: kubelet_node_config_error
+       on: k8s_kubelet.kubelet_node_config_error
+    class: Kubernetes
+component: Kubelet
+     type: Errors
+     calc: $kubelet_node_config_error
+    units: bool
+    every: 10s
+     warn: $this == 1
+    delay: down 1m multiplier 1.5 max 2h
+     info: the node is experiencing a configuration-related error (0: false, 1: true)
+       to: sysadmin

 # Failed Token() requests to the alternate token source

-   template: kubelet_token_requests
-     lookup: sum -10s of token_fail_count
-         on: k8s_kubelet.kubelet_token_requests
-      units: failed requests
-      every: 10s
-       warn: $this > 0
-      delay: down 1m multiplier 1.5 max 2h
-       info: number of failed Token() requests to the alternate token source
-         to: sysadmin
+ template: kubelet_token_requests
+   lookup: sum -10s of token_fail_count
+       on: k8s_kubelet.kubelet_token_requests
+    class: Kubernetes
+component: Kubelet
+     type: Errors
+    units: failed requests
+    every: 10s
+     warn: $this > 0
+    delay: down 1m multiplier 1.5 max 2h
+     info: number of failed Token() requests to the alternate token source
+       to: sysadmin

 # Docker and runtime operation errors

-   template: kubelet_operations_error
-     lookup: sum -1m
-         on: k8s_kubelet.kubelet_operations_errors
-      units: errors
-      every: 10s
-       warn: $this > (($status >= $WARNING)  ? (0) : (20))
-      delay: up 30s down 1m multiplier 1.5 max 2h
-       info: number of Docker or runtime operation errors
-         to: sysadmin
+ template: kubelet_operations_error
+   lookup: sum -1m
+       on: k8s_kubelet.kubelet_operations_errors
+    class: Kubernetes
+component: Kubelet
+     type: Errors
+    units: errors
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (0) : (20))
+    delay: up 30s down 1m multiplier 1.5 max 2h
+     info: number of Docker or runtime operation errors
+       to: sysadmin

 # -----------------------------------------------------------------------------

@ -53,66 +62,84 @@

 # quantile 0.5

-template: kubelet_1m_pleg_relist_latency_quantile_05
-      on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
-  lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
-   units: microseconds
-   every: 10s
-    info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
+ template: kubelet_1m_pleg_relist_latency_quantile_05
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Kubernetes
+component: Kubelet
+     type: Latency
+   lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
+    units: microseconds
+    every: 10s
+     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)

-template: kubelet_10s_pleg_relist_latency_quantile_05
-      on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
-  lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
-    calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
-   every: 10s
-   units: %
-   warn: $this > (($status >= $WARNING)?(100):(200))
-   crit: $this > (($status >= $WARNING)?(200):(400))
-  delay: down 1m multiplier 1.5 max 2h
-   info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
-         compared to the last minute (quantile 0.5)
-     to: sysadmin
+ template: kubelet_10s_pleg_relist_latency_quantile_05
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Kubernetes
+component: Kubelet
+     type: Latency
+   lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
+     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(100):(200))
+     crit: $this > (($status >= $WARNING)?(200):(400))
+    delay: down 1m multiplier 1.5 max 2h
+     info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+           compared to the last minute (quantile 0.5)
+       to: sysadmin

 # quantile 0.9

-template: kubelet_1m_pleg_relist_latency_quantile_09
-      on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
-  lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
-   units: microseconds
-   every: 10s
-    info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
+ template: kubelet_1m_pleg_relist_latency_quantile_09
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Kubernetes
+component: Kubelet
+     type: Latency
+   lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
+    units: microseconds
+    every: 10s
+     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)

-template: kubelet_10s_pleg_relist_latency_quantile_09
-      on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
-  lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
-    calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
-   every: 10s
-   units: %
-   warn: $this > (($status >= $WARNING)?(200):(400))
-   crit: $this > (($status >= $WARNING)?(400):(800))
-  delay: down 1m multiplier 1.5 max 2h
-   info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
-         compared to the last minute (quantile 0.9)
-     to: sysadmin
+ template: kubelet_10s_pleg_relist_latency_quantile_09
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Kubernetes
+component: Kubelet
+     type: Latency
+   lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
+     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(200):(400))
+     crit: $this > (($status >= $WARNING)?(400):(800))
+    delay: down 1m multiplier 1.5 max 2h
+     info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+           compared to the last minute (quantile 0.9)
+       to: sysadmin

 # quantile 0.99

-template: kubelet_1m_pleg_relist_latency_quantile_099
-      on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
-  lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
-   units: microseconds
-   every: 10s
-    info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
+ template: kubelet_1m_pleg_relist_latency_quantile_099
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Kubernetes
+component: Kubelet
+     type: Latency
+   lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
+    units: microseconds
+    every: 10s
+     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)

-template: kubelet_10s_pleg_relist_latency_quantile_099
-      on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
-  lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
-    calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
-   every: 10s
-   units: %
-   warn: $this > (($status >= $WARNING)?(400):(800))
-   crit: $this > (($status >= $WARNING)?(800):(1200))
-  delay: down 1m multiplier 1.5 max 2h
-   info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
-         compared to the last minute (quantile 0.99)
-     to: sysadmin
+ template: kubelet_10s_pleg_relist_latency_quantile_099
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Kubernetes
+component: Kubelet
+     type: Latency
+   lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
+     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(400):(800))
+     crit: $this > (($status >= $WARNING)?(800):(1200))
+    delay: down 1m multiplier 1.5 max 2h
+     info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+           compared to the last minute (quantile 0.99)
+       to: sysadmin
--- a/health/health.d/lighttpd.conf
+++ b/health/health.d/lighttpd.conf
@ -1,14 +1,17 @@

 # make sure lighttpd is running

-template: lighttpd_last_collected_secs
-      on: lighttpd.requests
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: lighttpd_last_collected_secs
+       on: lighttpd.requests
+    class: Web Server
+component: Lighttpd
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster

--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@ -1,12 +1,15 @@
 # Alert on low battery capacity.

-template: linux_power_supply_capacity
-      on: powersupply.capacity
-    calc: $capacity
-   units: %
-   every: 10s
-    warn: $this < 10
-    crit: $this < 5
-   delay: up 30s down 5m multiplier 1.2 max 1h
-    info: percentage of remaining power supply capacity
-      to: sysadmin
+ template: linux_power_supply_capacity
+       on: powersupply.capacity
+    class: Power Supply
+component: Battery
+     type: Utilization
+     calc: $capacity
+    units: %
+    every: 10s
+     warn: $this < 10
+     crit: $this < 5
+    delay: up 30s down 5m multiplier 1.2 max 1h
+     info: percentage of remaining power supply capacity
+       to: sysadmin
--- a/health/health.d/load.conf
+++ b/health/health.d/load.conf
@ -4,51 +4,63 @@
 # Calculate the base trigger point for the load average alarms.
 # This is the maximum number of CPUs in the system over the past 1
 # minute, with a special case for a single CPU of setting the trigger at 2.
-   alarm: load_cpu_number
-      on: system.load
-      os: linux
-   hosts: *
-    calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
-   units: cpus
-   every: 1m
-    info: number of active CPU cores in the system
+    alarm: load_cpu_number
+       on: system.load
+    class: System
+component: Load
+     type: Utilization
+       os: linux
+    hosts: *
+     calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
+    units: cpus
+    every: 1m
+     info: number of active CPU cores in the system

 # Send alarms if the load average is unusually high.
 # These intentionally _do not_ calculate the average over the sampled
 # time period because the values being checked already are averages.

-   alarm: load_average_15
-      on: system.load
-      os: linux
-   hosts: *
-  lookup: max -1m unaligned of load15
-   units: load
-   every: 1m
-    warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
-   delay: down 15m multiplier 1.5 max 1h
-    info: system fifteen-minute load average
-      to: sysadmin
+    alarm: load_average_15
+       on: system.load
+    class: System
+component: Load
+     type: Utilization
+       os: linux
+    hosts: *
+   lookup: max -1m unaligned of load15
+    units: load
+    every: 1m
+     warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
+    delay: down 15m multiplier 1.5 max 1h
+     info: system fifteen-minute load average
+       to: sysadmin

-   alarm: load_average_5
-      on: system.load
-      os: linux
-   hosts: *
-  lookup: max -1m unaligned of load5
-   units: load
-   every: 1m
-    warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
-   delay: down 15m multiplier 1.5 max 1h
-    info: system five-minute load average
-      to: sysadmin
+    alarm: load_average_5
+       on: system.load
+    class: System
+component: Load
+     type: Utilization
+       os: linux
+    hosts: *
+   lookup: max -1m unaligned of load5
+    units: load
+    every: 1m
+     warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
+    delay: down 15m multiplier 1.5 max 1h
+     info: system five-minute load average
+       to: sysadmin

-   alarm: load_average_1
-      on: system.load
-      os: linux
-   hosts: *
-  lookup: max -1m unaligned of load1
-   units: load
-   every: 1m
-    warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
-   delay: down 15m multiplier 1.5 max 1h
-    info: system one-minute load average
-      to: sysadmin
+    alarm: load_average_1
+       on: system.load
+    class: System
+component: Load
+     type: Utilization
+       os: linux
+    hosts: *
+   lookup: max -1m unaligned of load1
+    units: load
+    every: 1m
+     warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
+    delay: down 15m multiplier 1.5 max 1h
+     info: system one-minute load average
+       to: sysadmin
--- a/health/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@ -1,39 +1,51 @@
-template: mdstat_last_collected
-      on: md.disks
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: mdstat_last_collected
+       on: md.disks
+    class: System
+component: RAID
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+     info: number of seconds since the last successful data collection
+       to: sysadmin

-template: mdstat_disks
-      on: md.disks
-   units: failed devices
-   every: 10s
-    calc: $down
-    crit: $this > 0
-    info: number of devices in the down state. \
-          Any number > 0 indicates that the array is degraded.
-      to: sysadmin
+ template: mdstat_disks
+       on: md.disks
+    class: System
+component: RAID
+     type: Errors
+    units: failed devices
+    every: 10s
+     calc: $down
+     crit: $this > 0
+     info: number of devices in the down state for the $family array. \
+           Any number > 0 indicates that the array is degraded.
+       to: sysadmin

-template: mdstat_mismatch_cnt
-      on: md.mismatch_cnt
-   units: unsynchronized blocks
-    calc: $count
-   every: 60s
-    warn: $this > 1024
-   delay: up 30m
-    info: number of unsynchronized blocks
-      to: sysadmin
+ template: mdstat_mismatch_cnt
+       on: md.mismatch_cnt
+    class: System
+component: RAID
+     type: Errors
+    units: unsynchronized blocks
+     calc: $count
+    every: 60s
+     warn: $this > 1024
+    delay: up 30m
+     info: number of unsynchronized blocks for the $family array
+       to: sysadmin

-template: mdstat_nonredundant_last_collected
-      on: md.nonredundant
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: mdstat_nonredundant_last_collected
+       on: md.nonredundant
+    class: System
+component: RAID
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+     info: number of seconds since the last successful data collection
+       to: sysadmin
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@ -1,56 +1,71 @@

 ## Adapters (controllers)

-template: megacli_adapter_state
-      on: megacli.adapter_degraded
-  lookup: max -10s foreach *
-   units: boolean
-   every: 10s
-    crit: $this > 0
-   delay: down 5m multiplier 2 max 10m
-    info: adapter is in the degraded state (0: false, 1: true)
-      to: sysadmin
+ template: megacli_adapter_state
+       on: megacli.adapter_degraded
+    class: System
+component: RAID
+     type: Errors
+   lookup: max -10s foreach *
+    units: boolean
+    every: 10s
+     crit: $this > 0
+    delay: down 5m multiplier 2 max 10m
+     info: adapter is in the degraded state (0: false, 1: true)
+       to: sysadmin

 ## Physical Disks

-template: megacli_pd_predictive_failures
-      on: megacli.pd_predictive_failure
-  lookup: sum -10s foreach *
-   units: predictive failures
-   every: 10s
-    warn: $this > 0
-   delay: up 1m down 5m multiplier 2 max 10m
-    info: number of physical drive predictive failures
-      to: sysadmin
+ template: megacli_pd_predictive_failures
+       on: megacli.pd_predictive_failure
+    class: System
+component: RAID
+     type: Errors
+   lookup: sum -10s foreach *
+    units: predictive failures
+    every: 10s
+     warn: $this > 0
+    delay: up 1m down 5m multiplier 2 max 10m
+     info: number of physical drive predictive failures
+       to: sysadmin

-template: megacli_pd_media_errors
-      on: megacli.pd_media_error
-  lookup: sum -10s foreach *
-   units: media errors
-   every: 10s
-    warn: $this > 0
-   delay: up 1m down 5m multiplier 2 max 10m
-    info: number of physical drive media errors
-      to: sysadmin
+ template: megacli_pd_media_errors
+       on: megacli.pd_media_error
+    class: System
+component: RAID
+     type: Errors
+   lookup: sum -10s foreach *
+    units: media errors
+    every: 10s
+     warn: $this > 0
+    delay: up 1m down 5m multiplier 2 max 10m
+     info: number of physical drive media errors
+       to: sysadmin

 ## Battery Backup Units (BBU)

-template: megacli_bbu_relative_charge
-      on: megacli.bbu_relative_charge
-  lookup: average -10s
-   units: percent
-   every: 10s
-    warn: $this <= (($status >= $WARNING)  ? (85) : (80))
-    crit: $this <= (($status == $CRITICAL)  ? (50) : (40))
-    info: average battery backup unit (BBU) relative state of charge over the last 10 seconds
-      to: sysadmin
+ template: megacli_bbu_relative_charge
+       on: megacli.bbu_relative_charge
+    class: System
+component: RAID
+     type: Workload
+   lookup: average -10s
+    units: percent
+    every: 10s
+     warn: $this <= (($status >= $WARNING)  ? (85) : (80))
+     crit: $this <= (($status == $CRITICAL)  ? (50) : (40))
+     info: average battery backup unit (BBU) relative state of charge over the last 10 seconds
+       to: sysadmin

-template: megacli_bbu_cycle_count
-      on: megacli.bbu_cycle_count
-  lookup: average -10s
-   units: cycles
-   every: 10s
-    warn: $this >= 100
-    crit: $this >= 500
-    info: average battery backup unit (BBU) charge cycles count over the last 10 seconds
-      to: sysadmin
+ template: megacli_bbu_cycle_count
+       on: megacli.bbu_cycle_count
+    class: System
+component: RAID
+     type: Workload
+   lookup: average -10s
+    units: cycles
+    every: 10s
+     warn: $this >= 100
+     crit: $this >= 500
+     info: average battery backup unit (BBU) charge cycles count over the last 10 seconds
+       to: sysadmin
--- a/health/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
@ -1,53 +1,65 @@

 # make sure memcached is running

-template: memcached_last_collected_secs
-      on: memcached.cache
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
+ template: memcached_last_collected_secs
+       on: memcached.cache
+    class: KV Storage
+component: Memcached
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: dba


 # detect if memcached cache is full

-template: memcached_cache_memory_usage
-      on: memcached.cache
-    calc: $used * 100 / ($used + $available)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (80) : (90))
-   delay: up 0 down 15m multiplier 1.5 max 1h
-    info: cache memory utilization
-      to: dba
+ template: memcached_cache_memory_usage
+       on: memcached.cache
+    class: KV Storage
+component: Memcached
+     type: Utilization
+     calc: $used * 100 / ($used + $available)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (80) : (90))
+    delay: up 0 down 15m multiplier 1.5 max 1h
+     info: cache memory utilization
+       to: dba


 # find the rate memcached cache is filling

-template: memcached_cache_fill_rate
-      on: memcached.cache
-  lookup: min -10m at -50m unaligned of available
-    calc: ($this - $available) / (($now - $after) / 3600)
-   units: KB/hour
-   every: 1m
-    info: average rate the cache fills up (positive), or frees up (negative) space over the last hour
+ template: memcached_cache_fill_rate
+       on: memcached.cache
+    class: KV Storage
+component: Memcached
+     type: Utilization
+   lookup: min -10m at -50m unaligned of available
+     calc: ($this - $available) / (($now - $after) / 3600)
+    units: KB/hour
+    every: 1m
+     info: average rate the cache fills up (positive), or frees up (negative) space over the last hour


 # find the hours remaining until memcached cache is full

-template: memcached_out_of_cache_space_time
-      on: memcached.cache
-    calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
-   units: hours
-   every: 10s
-    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
-    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-   delay: down 15m multiplier 1.5 max 1h
-    info: estimated time the cache will run out of space \
-          if the system continues to add data at the same rate as the past hour
-      to: dba
+ template: memcached_out_of_cache_space_time
+       on: memcached.cache
+    class: KV Storage
+component: Memcached
+     type: Utilization
+     calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
+    units: hours
+    every: 10s
+     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+    delay: down 15m multiplier 1.5 max 1h
+     info: estimated time the cache will run out of space \
+           if the system continues to add data at the same rate as the past hour
+       to: dba
--- a/health/health.d/memory.conf
+++ b/health/health.d/memory.conf
@ -1,38 +1,47 @@

 # you can disable an alarm notification by setting the 'to' line to: silent

-   alarm: 1hour_ecc_memory_correctable
-      on: mem.ecc_ce
-      os: linux
-   hosts: *
-  lookup: sum -10m unaligned
-   units: errors
-   every: 1m
-    warn: $this > 0
-   delay: down 1h multiplier 1.5 max 1h
-    info: number of ECC correctable errors in the last 10 minutes
-      to: sysadmin
+    alarm: 1hour_ecc_memory_correctable
+       on: mem.ecc_ce
+    class: System
+component: Memory
+     type: Errors
+       os: linux
+    hosts: *
+   lookup: sum -10m unaligned
+    units: errors
+    every: 1m
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 1h
+     info: number of ECC correctable errors in the last 10 minutes
+       to: sysadmin

-   alarm: 1hour_ecc_memory_uncorrectable
-      on: mem.ecc_ue
-      os: linux
-   hosts: *
-  lookup: sum -10m unaligned
-   units: errors
-   every: 1m
-    crit: $this > 0
-   delay: down 1h multiplier 1.5 max 1h
-    info: number of ECC uncorrectable errors in the last 10 minutes
-      to: sysadmin
+    alarm: 1hour_ecc_memory_uncorrectable
+       on: mem.ecc_ue
+    class: System
+component: Memory
+     type: Errors
+       os: linux
+    hosts: *
+   lookup: sum -10m unaligned
+    units: errors
+    every: 1m
+     crit: $this > 0
+    delay: down 1h multiplier 1.5 max 1h
+     info: number of ECC uncorrectable errors in the last 10 minutes
+       to: sysadmin

-   alarm: 1hour_memory_hw_corrupted
-      on: mem.hwcorrupt
-      os: linux
-   hosts: *
-    calc: $HardwareCorrupted
-   units: MB
-   every: 10s
-    warn: $this > 0
-   delay: down 1h multiplier 1.5 max 1h
-    info: amount of memory corrupted due to a hardware failure
-      to: sysadmin
+    alarm: 1hour_memory_hw_corrupted
+       on: mem.hwcorrupt
+    class: System
+component: Memory
+     type: Errors
+       os: linux
+    hosts: *
+     calc: $HardwareCorrupted
+    units: MB
+    every: 10s
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 1h
+     info: amount of memory corrupted due to a hardware failure
+       to: sysadmin
--- a/health/health.d/mongodb.conf
+++ b/health/health.d/mongodb.conf
@ -1,13 +1,16 @@

 # make sure mongodb is running

-template: mongodb_last_collected_secs
-      on: mongodb.read_operations
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
+ template: mongodb_last_collected_secs
+       on: mongodb.read_operations
+    class: Database
+component: MongoDB
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: dba
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@ -1,150 +1,186 @@

 # make sure mysql is running

-template: mysql_last_collected_secs
-      on: mysql.queries
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
+ template: mysql_last_collected_secs
+       on: mysql.queries
+    class: Database
+component: MySQL
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: dba


 # -----------------------------------------------------------------------------
 # slow queries

-template: mysql_10s_slow_queries
-      on: mysql.queries
-  lookup: sum -10s of slow_queries
-   units: slow queries
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
-    crit: $this > (($status == $CRITICAL) ? (10) : (20))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of slow queries in the last 10 seconds
-      to: dba
+ template: mysql_10s_slow_queries
+       on: mysql.queries
+    class: Database
+component: MySQL
+     type: Latency
+   lookup: sum -10s of slow_queries
+    units: slow queries
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+     crit: $this > (($status == $CRITICAL) ? (10) : (20))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of slow queries in the last 10 seconds
+       to: dba


 # -----------------------------------------------------------------------------
 # lock waits

-template: mysql_10s_table_locks_immediate
-      on: mysql.table_locks
-  lookup: sum -10s absolute of immediate
-   units: immediate locks
-   every: 10s
-    info: number of table immediate locks in the last 10 seconds
-      to: dba
+ template: mysql_10s_table_locks_immediate
+       on: mysql.table_locks
+    class: Database
+component: MySQL
+     type: Utilization
+   lookup: sum -10s absolute of immediate
+    units: immediate locks
+    every: 10s
+     info: number of table immediate locks in the last 10 seconds
+       to: dba

-template: mysql_10s_table_locks_waited
-      on: mysql.table_locks
-  lookup: sum -10s absolute of waited
-   units: waited locks
-   every: 10s
-    info: number of table waited locks in the last 10 seconds
-      to: dba
+ template: mysql_10s_table_locks_waited
+       on: mysql.table_locks
+    class: Database
+component: MySQL
+     type: Latency
+   lookup: sum -10s absolute of waited
+    units: waited locks
+    every: 10s
+     info: number of table waited locks in the last 10 seconds
+       to: dba

-template: mysql_10s_waited_locks_ratio
-      on: mysql.table_locks
-    calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (10) : (25))
-    crit: $this > (($status == $CRITICAL) ? (25) : (50))
-   delay: down 30m multiplier 1.5 max 1h
-    info: ratio of waited table locks over the last 10 seconds
-      to: dba
+ template: mysql_10s_waited_locks_ratio
+       on: mysql.table_locks
+    class: Database
+component: MySQL
+     type: Latency
+     calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (10) : (25))
+     crit: $this > (($status == $CRITICAL) ? (25) : (50))
+    delay: down 30m multiplier 1.5 max 1h
+     info: ratio of waited table locks over the last 10 seconds
+       to: dba


 # -----------------------------------------------------------------------------
 # connections

-template: mysql_connections
-      on: mysql.connections_active
-    calc: $active * 100 / $limit
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (60) : (70))
-    crit: $this > (($status == $CRITICAL) ? (80) : (90))
-   delay: down 15m multiplier 1.5 max 1h
-    info: client connections utilization
-      to: dba
+ template: mysql_connections
+       on: mysql.connections_active
+    class: Database
+component: MySQL
+     type: Utilization
+     calc: $active * 100 / $limit
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (60) : (70))
+     crit: $this > (($status == $CRITICAL) ? (80) : (90))
+    delay: down 15m multiplier 1.5 max 1h
+     info: client connections utilization
+       to: dba


 # -----------------------------------------------------------------------------
 # replication

-template: mysql_replication
-      on: mysql.slave_status
-    calc: ($sql_running <= 0 OR $io_running <= 0)?0:1
-   units: ok/failed
-   every: 10s
-    crit: $this == 0
-   delay: down 5m multiplier 1.5 max 1h
-    info: replication status (0: stopped, 1: working)
-      to: dba
+ template: mysql_replication
+       on: mysql.slave_status
+    class: Database
+component: MySQL
+     type: Errors
+     calc: ($sql_running <= 0 OR $io_running <= 0)?0:1
+    units: ok/failed
+    every: 10s
+     crit: $this == 0
+    delay: down 5m multiplier 1.5 max 1h
+     info: replication status (0: stopped, 1: working)
+       to: dba

-template: mysql_replication_lag
-      on: mysql.slave_behind
-    calc: $seconds
-   units: seconds
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
-    crit: $this > (($status == $CRITICAL) ? (10) : (30))
-   delay: down 15m multiplier 1.5 max 1h
-    info: difference between the timestamp of the latest transaction processed by the SQL thread and \
-          the timestamp of the same transaction when it was processed on the master
-      to: dba
+ template: mysql_replication_lag
+       on: mysql.slave_behind
+    class: Database
+component: MySQL
+     type: Errors
+     calc: $seconds
+    units: seconds
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+     crit: $this > (($status == $CRITICAL) ? (10) : (30))
+    delay: down 15m multiplier 1.5 max 1h
+     info: difference between the timestamp of the latest transaction processed by the SQL thread and \
+           the timestamp of the same transaction when it was processed on the master
+       to: dba


 # -----------------------------------------------------------------------------
 # galera cluster size

-template: mysql_galera_cluster_size_max_2m
-      on: mysql.galera_cluster_size
-  lookup: max -2m absolute
-   units: nodes
-   every: 10s
-    info: maximum galera cluster size in the last 2 minutes
-      to: dba
+ template: mysql_galera_cluster_size_max_2m
+       on: mysql.galera_cluster_size
+    class: Database
+component: MySQL
+     type: Utilization
+   lookup: max -2m absolute
+    units: nodes
+    every: 10s
+     info: maximum galera cluster size in the last 2 minutes
+       to: dba

-template: mysql_galera_cluster_size
-      on: mysql.galera_cluster_size
-    calc: $nodes
-   units: nodes
-   every: 10s
-    warn: $this > $mysql_galera_cluster_size_max_2m
-    crit: $this < $mysql_galera_cluster_size_max_2m
-   delay: up 20s down 5m multiplier 1.5 max 1h
-    info: current galera cluster size, compared to the maximum size in the last 2 minutes
-      to: dba
+ template: mysql_galera_cluster_size
+       on: mysql.galera_cluster_size
+    class: Database
+component: MySQL
+     type: Utilization
+     calc: $nodes
+    units: nodes
+    every: 10s
+     warn: $this > $mysql_galera_cluster_size_max_2m
+     crit: $this < $mysql_galera_cluster_size_max_2m
+    delay: up 20s down 5m multiplier 1.5 max 1h
+     info: current galera cluster size, compared to the maximum size in the last 2 minutes
+       to: dba

 # galera node state

-template: mysql_galera_cluster_state
-      on: mysql.galera_cluster_state
-    calc: $state
-   every: 10s
-    warn: $this == 2 OR $this == 3
-    crit: $this == 0 OR $this == 1 OR $this >= 5
-   delay: up 30s down 5m multiplier 1.5 max 1h
-    info: galera node state \
-          (0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent)
-      to: dba
+ template: mysql_galera_cluster_state
+       on: mysql.galera_cluster_state
+    class: Database
+component: MySQL
+     type: Errors
+     calc: $state
+    every: 10s
+     warn: $this == 2 OR $this == 3
+     crit: $this == 0 OR $this == 1 OR $this >= 5
+    delay: up 30s down 5m multiplier 1.5 max 1h
+     info: galera node state \
+           (0: Undefined, 1: Joining, 2: Donor/Desynced, 3: Joined, 4: Synced, 5: Inconsistent)
+       to: dba


 # galera node status

-template: mysql_galera_cluster_status
-      on: mysql.galera_cluster_status
-    calc: $wsrep_cluster_status
-   every: 10s
-    crit: $mysql_galera_cluster_state != nan AND $this != 0
-   delay: up 30s down 5m multiplier 1.5 max 1h
-    info: galera node cluster component status \
-          (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \
-          Any other value than primary indicates that the node is part of a nonoperational component.
-      to: dba
+ template: mysql_galera_cluster_status
+       on: mysql.galera_cluster_status
+    class: Database
+component: MySQL
+     type: Errors
+     calc: $wsrep_cluster_status
+    every: 10s
+     crit: $mysql_galera_cluster_state != nan AND $this != 0
+    delay: up 30s down 5m multiplier 1.5 max 1h
+     info: galera node cluster component status \
+           (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \
+           Any other value than primary indicates that the node is part of a nonoperational component.
+       to: dba
--- a/health/health.d/named.conf
+++ b/health/health.d/named.conf
@ -1,14 +1,17 @@

 # make sure named is running

-template: named_last_collected_secs
-      on: named.global_queries
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: domainadmin
+ template: named_last_collected_secs
+       on: named.global_queries
+    class: DNS
+component: BIND
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: domainadmin

--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@ -6,16 +6,22 @@

 template: interface_speed
       on: net.net
+    class: System
+component: Network
+     type: Latency
       os: *
    hosts: *
 families: *
     calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max) : ( nan )
    units: Mbit
    every: 10s
-     info: network interface current speed
+     info: network interface $family current speed

 template: 1m_received_traffic_overflow
       on: net.net
+    class: System
+component: Network
+     type: Workload
       os: linux
    hosts: *
 families: *
@ -25,11 +31,14 @@
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (85) : (90))
    delay: up 1m down 1m multiplier 1.5 max 1h
-     info: average inbound utilization for the network interface over the last minute
+     info: average inbound utilization for the network interface $family over the last minute
       to: sysadmin

 template: 1m_sent_traffic_overflow
       on: net.net
+    class: System
+component: Network
+     type: Workload
       os: linux
    hosts: *
 families: *
@ -39,7 +48,7 @@
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (85) : (90))
    delay: up 1m down 1m multiplier 1.5 max 1h
-     info: average outbound utilization for the network interface over the last minute
+     info: average outbound utilization for the network interface $family over the last minute
       to: sysadmin

 # -----------------------------------------------------------------------------
@ -52,110 +61,134 @@
 # it is possible to have expected packet drops on an interface for some network configurations
 # look at the Monitoring Network Interfaces section in the proc.plugin documentation for more information

-template: inbound_packets_dropped
-      on: net.drops
-      os: linux
-   hosts: *
-families: !net* *
-  lookup: sum -10m unaligned absolute of inbound
-   units: packets
-   every: 1m
-    info: number of inbound dropped packets for the network interface in the last 10 minutes
+ template: inbound_packets_dropped
+       on: net.drops
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+ families: !net* *
+   lookup: sum -10m unaligned absolute of inbound
+    units: packets
+    every: 1m
+     info: number of inbound dropped packets for the network interface $family in the last 10 minutes

-template: outbound_packets_dropped
-      on: net.drops
-      os: linux
-   hosts: *
-families: !net* *
-  lookup: sum -10m unaligned absolute of outbound
-   units: packets
-   every: 1m
-    info: number of outbound dropped packets for the network interface in the last 10 minutes
+ template: outbound_packets_dropped
+       on: net.drops
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+ families: !net* *
+   lookup: sum -10m unaligned absolute of outbound
+    units: packets
+    every: 1m
+     info: number of outbound dropped packets for the network interface $family in the last 10 minutes

-template: inbound_packets_dropped_ratio
-      on: net.packets
-      os: linux
-   hosts: *
-families: !net* !wl* *
-  lookup: sum -10m unaligned absolute of received
-    calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
-   units: %
-   every: 1m
-    warn: $this >= 2
-   delay: up 1m down 1h multiplier 1.5 max 2h
-    info: ratio of inbound dropped packets for the network interface over the last 10 minutes
-      to: sysadmin
+ template: inbound_packets_dropped_ratio
+       on: net.packets
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+ families: !net* !wl* *
+   lookup: sum -10m unaligned absolute of received
+     calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of inbound dropped packets for the network interface $family over the last 10 minutes
+       to: sysadmin

-template: outbound_packets_dropped_ratio
-      on: net.packets
-      os: linux
-   hosts: *
-families: !net* !wl* *
-  lookup: sum -10m unaligned absolute of sent
-    calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
-   units: %
-   every: 1m
-    warn: $this >= 2
-   delay: up 1m down 1h multiplier 1.5 max 2h
-    info: ratio of outbound dropped packets for the network interface over the last 10 minutes
-      to: sysadmin
+ template: outbound_packets_dropped_ratio
+       on: net.packets
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+ families: !net* !wl* *
+   lookup: sum -10m unaligned absolute of sent
+     calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes
+       to: sysadmin

-template: wifi_inbound_packets_dropped_ratio
-      on: net.packets
-      os: linux
-   hosts: *
-families: wl*
-  lookup: sum -10m unaligned absolute of received
-    calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
-   units: %
-   every: 1m
-    warn: $this >= 10
-   delay: up 1m down 1h multiplier 1.5 max 2h
-    info: ratio of inbound dropped packets for the network interface over the last 10 minutes
-      to: sysadmin
+ template: wifi_inbound_packets_dropped_ratio
+       on: net.packets
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+ families: wl*
+   lookup: sum -10m unaligned absolute of received
+     calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 10
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of inbound dropped packets for the network interface $family over the last 10 minutes
+       to: sysadmin

-template: wifi_outbound_packets_dropped_ratio
-      on: net.packets
-      os: linux
-   hosts: *
-families: wl*
-  lookup: sum -10m unaligned absolute of sent
-    calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
-   units: %
-   every: 1m
-    warn: $this >= 10
-   delay: up 1m down 1h multiplier 1.5 max 2h
-    info: ratio of outbound dropped packets for the network interface over the last 10 minutes
-      to: sysadmin
+ template: wifi_outbound_packets_dropped_ratio
+       on: net.packets
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+ families: wl*
+   lookup: sum -10m unaligned absolute of sent
+     calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 10
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes
+       to: sysadmin

 # -----------------------------------------------------------------------------
 # interface errors

-template: interface_inbound_errors
-      on: net.errors
-      os: freebsd
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute of inbound
-   units: errors
-   every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: number of inbound errors for the network interface in the last 10 minutes
-      to: sysadmin
+ template: interface_inbound_errors
+       on: net.errors
+    class: System
+component: Network
+     type: Errors
+       os: freebsd
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute of inbound
+    units: errors
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of inbound errors for the network interface $family in the last 10 minutes
+       to: sysadmin

-template: interface_outbound_errors
-      on: net.errors
-      os: freebsd
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute of outbound
-   units: errors
-   every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: number of outbound errors for the network interface in the last 10 minutes
-      to: sysadmin
+ template: interface_outbound_errors
+       on: net.errors
+    class: System
+component: Network
+     type: Errors
+       os: freebsd
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute of outbound
+    units: errors
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of outbound errors for the network interface $family in the last 10 minutes
+       to: sysadmin

 # -----------------------------------------------------------------------------
 # FIFO errors
@ -165,18 +198,21 @@ families: *
 # the alarm is checked every 1 minute
 # and examines the last 10 minutes of data

-template: 10min_fifo_errors
-      on: net.fifo
-      os: linux
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute
-   units: errors
-   every: 1m
-    warn: $this > 0
-   delay: down 1h multiplier 1.5 max 2h
-    info: number of FIFO errors for the network interface in the last 10 minutes
-      to: sysadmin
+ template: 10min_fifo_errors
+       on: net.fifo
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute
+    units: errors
+    every: 1m
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of FIFO errors for the network interface $family in the last 10 minutes
+       to: sysadmin

 # -----------------------------------------------------------------------------
 # check for packet storms
@ -187,28 +223,34 @@ families: *
 # we assume the minimum packet storm should at least have
 # 10000 packets/s, average of the last 10 seconds

-template: 1m_received_packets_rate
-      on: net.packets
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: average -1m unaligned of received
-   units: packets
-   every: 10s
-    info: average number of packets received by the network interface over the last minute
+ template: 1m_received_packets_rate
+       on: net.packets
+    class: System
+component: Network
+     type: Workload
+       os: linux freebsd
+    hosts: *
+ families: *
+   lookup: average -1m unaligned of received
+    units: packets
+    every: 10s
+     info: average number of packets received by the network interface $family over the last minute

-template: 10s_received_packets_storm
-      on: net.packets
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: average -10s unaligned of received
-    calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
-   every: 10s
-   units: %
-    warn: $this > (($status >= $WARNING)?(200):(5000))
-    crit: $this > (($status == $CRITICAL)?(5000):(6000))
- options: no-clear-notification
-    info: ratio of average number of received packets for the network interface over the last 10 seconds, \
-          compared to the rate over the last minute
-      to: sysadmin
+ template: 10s_received_packets_storm
+       on: net.packets
+    class: System
+component: Network
+     type: Workload
+       os: linux freebsd
+    hosts: *
+ families: *
+   lookup: average -10s unaligned of received
+     calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(200):(5000))
+     crit: $this > (($status == $CRITICAL)?(5000):(6000))
+  options: no-clear-notification
+     info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
+           compared to the rate over the last minute
+       to: sysadmin
--- a/health/health.d/netfilter.conf
+++ b/health/health.d/netfilter.conf
@ -1,16 +1,19 @@

 # you can disable an alarm notification by setting the 'to' line to: silent

-   alarm: netfilter_conntrack_full
-      on: netfilter.conntrack_sockets
-      os: linux
-   hosts: *
-  lookup: max -10s unaligned of connections
-    calc: $this * 100 / $netfilter_conntrack_max
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (85) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (95))
-   delay: down 5m multiplier 1.5 max 1h
-    info: netfilter connection tracker table size utilization
-      to: sysadmin
+    alarm: netfilter_conntrack_full
+       on: netfilter.conntrack_sockets
+    class: System
+component: Network
+     type: Workload
+       os: linux
+    hosts: *
+   lookup: max -10s unaligned of connections
+     calc: $this * 100 / $netfilter_conntrack_max
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (95))
+    delay: down 5m multiplier 1.5 max 1h
+     info: netfilter connection tracker table size utilization
+       to: sysadmin
--- a/health/health.d/nginx.conf
+++ b/health/health.d/nginx.conf
@ -1,14 +1,17 @@

 # make sure nginx is running

-template: nginx_last_collected_secs
-      on: nginx.requests
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: nginx_last_collected_secs
+       on: nginx.requests
+    class: Web Server
+component: NGINX
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster

--- a/health/health.d/nginx_plus.conf
+++ b/health/health.d/nginx_plus.conf
@ -1,14 +1,17 @@

 # make sure nginx_plus is running

-template: nginx_plus_last_collected_secs
-      on: nginx_plus.requests_total
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: nginx_plus_last_collected_secs
+       on: nginx_plus.requests_total
+    class: Web Server
+component: NGINX Plus
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster

--- a/health/health.d/phpfpm.conf
+++ b/health/health.d/phpfpm.conf
@ -1,14 +1,17 @@

 # make sure phpfpm is running

-template: phpfpm_last_collected_secs
-      on: phpfpm.requests
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: phpfpm_last_collected_secs
+       on: phpfpm.requests
+    class: Web Server
+component: PHP-FPM
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster

--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@ -1,65 +1,80 @@

 # Make sure Pi-hole is responding.

-template: pihole_last_collected_secs
-      on: pihole.dns_queries_total
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: pihole_last_collected_secs
+       on: pihole.dns_queries_total
+    class: Ad Filtering
+component: Pi-hole
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster

 # Blocked DNS queries.

-template: pihole_blocked_queries
-      on: pihole.dns_queries_percentage
-   every: 10s
-   units: %
-    calc: $blocked
-    warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
-    crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) )
-   delay: up 2m down 5m
-    info: percentage of blocked dns queries over the last 24 hour
-      to: sysadmin
+ template: pihole_blocked_queries
+       on: pihole.dns_queries_percentage
+    class: Ad Filtering
+component: Pi-hole
+     type: Errors
+    every: 10s
+    units: %
+     calc: $blocked
+     warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
+     crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) )
+    delay: up 2m down 5m
+     info: percentage of blocked dns queries over the last 24 hours
+       to: sysadmin


 # Blocklist last update time.
 # Default update interval is a week.

-template: pihole_blocklist_last_update
-      on: pihole.blocklist_last_update
-   every: 10s
-   units: seconds
-    calc: $ago
-    warn: $this > 60 * 60 * 24 * 8
-    crit: $this > 60 * 60 * 24 * 8 * 2
-    info: gravity.list (blocklist) file last update time
-      to: sysadmin
+ template: pihole_blocklist_last_update
+       on: pihole.blocklist_last_update
+    class: Ad Filtering
+component: Pi-hole
+     type: Errors
+    every: 10s
+    units: seconds
+     calc: $ago
+     warn: $this > 60 * 60 * 24 * 8
+     crit: $this > 60 * 60 * 24 * 8 * 2
+     info: gravity.list (blocklist) file last update time
+       to: sysadmin

 # Gravity file check (gravity.list).

-template: pihole_blocklist_gravity_file
-      on: pihole.blocklist_last_update
-   every: 10s
-   units: boolean
-    calc: $file_exists
-    crit: $this != 1
-   delay: up 2m down 5m
-    info: gravity.list (blocklist) file existence state (0: exists, 1: not-exists)
-      to: sysadmin
+ template: pihole_blocklist_gravity_file
+       on: pihole.blocklist_last_update
+    class: Ad Filtering
+component: Pi-hole
+     type: Errors
+    every: 10s
+    units: boolean
+     calc: $file_exists
+     crit: $this != 1
+    delay: up 2m down 5m
+     info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists)
+       to: sysadmin

 # Pi-hole's ability to block unwanted domains.
 # Should be enabled. The whole point of Pi-hole!

-template: pihole_status
-      on: pihole.unwanted_domains_blocking_status
-   every: 10s
-   units: boolean
-    calc: $enabled
-    warn: $this != 1
-   delay: up 2m down 5m
-    info: unwanted domains blocking status (0: enabled, 1: disabled)
-      to: sysadmin
+ template: pihole_status
+       on: pihole.unwanted_domains_blocking_status
+    class: Ad Filtering
+component: Pi-hole
+     type: Errors
+    every: 10s
+    units: boolean
+     calc: $enabled
+     warn: $this != 1
+    delay: up 2m down 5m
+     info: unwanted domains blocking status (0: disabled, 1: enabled)
+       to: sysadmin
--- a/health/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
@ -1,46 +1,58 @@
-template: portcheck_last_collected_secs
-families: *
-      on: portcheck.status
-    calc: $now - $last_collected_t
-   every: 10s
-   units: seconds ago
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: portcheck_last_collected_secs
+ families: *
+       on: portcheck.status
+    class: Other
+component: TCP endpoint
+     type: Latency
+     calc: $now - $last_collected_t
+    every: 10s
+    units: seconds ago
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin

 # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: portcheck_service_reachable
-families: *
-      on: portcheck.status
-  lookup: average -1m unaligned percentage of success
-    calc: ($this < 75) ? (0) : ($this)
-   every: 5s
-   units: up/down
-    info: average ratio of successful connections over the last minute (at least 75%)
-      to: silent
+ template: portcheck_service_reachable
+ families: *
+       on: portcheck.status
+    class: Other
+component: TCP endpoint
+     type: Workload
+   lookup: average -1m unaligned percentage of success
+     calc: ($this < 75) ? (0) : ($this)
+    every: 5s
+    units: up/down
+     info: average ratio of successful connections over the last minute (at least 75%)
+       to: silent

-template: portcheck_connection_timeouts
-families: *
-      on: portcheck.status
-  lookup: average -5m unaligned percentage of timeout
-   every: 10s
-   units: %
-    warn: $this >= 10 AND $this < 40
-    crit: $this >= 40
-   delay: down 5m multiplier 1.5 max 1h
-    info: average ratio of timeouts over the last 5 minutes
-      to: sysadmin
+ template: portcheck_connection_timeouts
+ families: *
+       on: portcheck.status
+    class: Other
+component: TCP endpoint
+     type: Errors
+   lookup: average -5m unaligned percentage of timeout
+    every: 10s
+    units: %
+     warn: $this >= 10 AND $this < 40
+     crit: $this >= 40
+    delay: down 5m multiplier 1.5 max 1h
+     info: average ratio of timeouts over the last 5 minutes
+       to: sysadmin

-template: portcheck_connection_fails
-families: *
-      on: portcheck.status
-  lookup: average -5m unaligned percentage of no_connection,failed
-   every: 10s
-   units: %
-    warn: $this >= 10 AND $this < 40
-    crit: $this >= 40
-   delay: down 5m multiplier 1.5 max 1h
-    info: average ratio of failed connections over the last 5 minutes
-      to: sysadmin
+ template: portcheck_connection_fails
+ families: *
+       on: portcheck.status
+    class: Other
+component: TCP endpoint
+     type: Errors
+   lookup: average -5m unaligned percentage of no_connection,failed
+    every: 10s
+    units: %
+     warn: $this >= 10 AND $this < 40
+     crit: $this >= 40
+    delay: down 5m multiplier 1.5 max 1h
+     info: average ratio of failed connections over the last 5 minutes
+       to: sysadmin
--- a/health/health.d/postgres.conf
+++ b/health/health.d/postgres.conf
@ -1,13 +1,16 @@

 # make sure postgres is running

-template: postgres_last_collected_secs
-      on: postgres.db_stat_transactions
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
+ template: postgres_last_collected_secs
+       on: postgres.db_stat_transactions
+    class: Database
+component: PostgreSQL
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: dba
--- a/health/health.d/processes.conf
+++ b/health/health.d/processes.conf
@ -1,13 +1,16 @@
 # you can disable an alarm notification by setting the 'to' line to: silent

-   alarm: active_processes
-      on: system.active_processes
-   hosts: *
-    calc: $active * 100 / $pidmax
-   units: %
-   every: 5s
-    warn: $this > (($status >= $WARNING)  ? (85) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (95))
-   delay: down 5m multiplier 1.5 max 1h
-    info: system process IDs (PID) space utilization
-      to: sysadmin
+    alarm: active_processes
+       on: system.active_processes
+    class: System
+component: Processes
+     type: Workload
+    hosts: *
+     calc: $active * 100 / $pidmax
+    units: %
+    every: 5s
+     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (95))
+    delay: down 5m multiplier 1.5 max 1h
+     info: system process IDs (PID) space utilization
+       to: sysadmin
--- a/health/health.d/pulsar.conf
+++ b/health/health.d/pulsar.conf
@ -1,13 +1,16 @@

 # Availability

-template: pulsar_last_collected_secs
-      on: pulsar.broker_components
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: pulsar_last_collected_secs
+       on: pulsar.broker_components
+    class: Messaging
+component: Pulsar
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@ -1,41 +1,50 @@

 # you can disable an alarm notification by setting the 'to' line to: silent

-   alarm: used_ram_to_ignore
-      on: system.ram
-      os: linux freebsd
-   hosts: *
-    calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min)
-   every: 10s
-    info: amount of memory reported as used, \
-          but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
+    alarm: used_ram_to_ignore
+       on: system.ram
+    class: System
+component: Memory
+     type: Utilization
+       os: linux freebsd
+    hosts: *
+     calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min)
+    every: 10s
+     info: amount of memory reported as used, \
+           but it is actually capable of resizing itself based on the system needs (e.g. ZFS ARC)

-   alarm: ram_in_use
-      on: system.ram
-      os: linux
-   hosts: *
-#   calc: $used * 100 / ($used + $cached + $free)
-    calc: ($used - $used_ram_to_ignore) * 100 / ($used  + $cached + $free)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: system memory utilization
-      to: sysadmin
+    alarm: ram_in_use
+       on: system.ram
+    class: System
+component: Memory
+     type: Utilization
+       os: linux
+    hosts: *
+#    calc: $used * 100 / ($used + $cached + $free)
+     calc: ($used - $used_ram_to_ignore) * 100 / ($used  + $cached + $free)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: system memory utilization
+       to: sysadmin

-   alarm: ram_available
-      on: mem.available
-      os: linux
-   hosts: *
-    calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
-   units: %
-   every: 10s
-    warn: $this < (($status >= $WARNING)  ? (15) : (10))
-    crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
-   delay: down 15m multiplier 1.5 max 1h
-    info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
-      to: sysadmin
+    alarm: ram_available
+       on: mem.available
+    class: System
+component: Memory
+     type: Utilization
+       os: linux
+    hosts: *
+     calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+    units: %
+    every: 10s
+     warn: $this < (($status >= $WARNING)  ? (15) : (10))
+     crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+    delay: down 15m multiplier 1.5 max 1h
+     info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
+       to: sysadmin

   alarm: oom_kill
      on: mem.oom_kill
@ -50,28 +59,34 @@
      to: sysadmin

 ## FreeBSD
-   alarm: ram_in_use
-      on: system.ram
-      os: freebsd
-   hosts: *
-    calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: system memory utilization
-      to: sysadmin
+    alarm: ram_in_use
+       on: system.ram
+    class: System
+component: Memory
+     type: Utilization
+       os: freebsd
+    hosts: *
+     calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: system memory utilization
+       to: sysadmin

-   alarm: ram_available
-      on: system.ram
-      os: freebsd
-   hosts: *
-    calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
-   units: %
-   every: 10s
-    warn: $this < (($status >= $WARNING)  ? (15) : (10))
-    crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
-   delay: down 15m multiplier 1.5 max 1h
-    info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
-      to: sysadmin
+    alarm: ram_available
+       on: system.ram
+    class: System
+component: Memory
+     type: Utilization
+       os: freebsd
+    hosts: *
+     calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+    units: %
+    every: 10s
+     warn: $this < (($status >= $WARNING)  ? (15) : (10))
+     crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+    delay: down 15m multiplier 1.5 max 1h
+     info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
+       to: sysadmin
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@ -1,34 +1,43 @@

 # make sure redis is running

-template: redis_last_collected_secs
-      on: redis.operations
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
+ template: redis_last_collected_secs
+       on: redis.operations
+    class: KV Storage
+component: Redis
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: dba

-template: redis_bgsave_broken
-families: *
-      on: redis.bgsave_health
-   every: 10s
-    crit: $rdb_last_bgsave_status != 0
-   units: ok/failed
-    info: status of the last RDB save operation (0: ok, 1: error)
-   delay: down 5m multiplier 1.5 max 1h
-      to: dba
+ template: redis_bgsave_broken
+ families: *
+       on: redis.bgsave_health
+    class: KV Storage
+component: Redis
+     type: Errors
+    every: 10s
+     crit: $rdb_last_bgsave_status != 0
+    units: ok/failed
+     info: status of the last RDB save operation (0: ok, 1: error)
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba

-template: redis_bgsave_slow
-families: *
-      on: redis.bgsave_now
-   every: 10s
-    warn: $rdb_bgsave_in_progress > 600
-    crit: $rdb_bgsave_in_progress > 1200
-   units: seconds
-    info: duration of the on-going RDB save operation
-   delay: down 5m multiplier 1.5 max 1h
-      to: dba
+ template: redis_bgsave_slow
+ families: *
+       on: redis.bgsave_now
+    class: KV Storage
+component: Redis
+     type: Latency
+    every: 10s
+     warn: $rdb_bgsave_in_progress > 600
+     crit: $rdb_bgsave_in_progress > 1200
+    units: seconds
+     info: duration of the on-going RDB save operation
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba
--- a/health/health.d/retroshare.conf
+++ b/health/health.d/retroshare.conf
@ -1,25 +1,31 @@
 # make sure RetroShare is running

-template: retroshare_last_collected_secs
-      on: retroshare.peers
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: retroshare_last_collected_secs
+       on: retroshare.peers
+    class: Data Sharing
+component: Retroshare
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin

 # make sure the DHT is fine when active

-template: retroshare_dht_working
-      on: retroshare.dht
-    calc: $dht_size_all
-   units: peers
-   every: 1m
-    warn: $this < (($status >= $WARNING)  ? (120) : (100))
-    crit: $this < (($status == $CRITICAL) ? (10)  : (1))
-   delay: up 0 down 15m multiplier 1.5 max 1h
-    info: number of DHT peers
-      to: sysadmin
+ template: retroshare_dht_working
+       on: retroshare.dht
+    class: Data Sharing
+component: Retroshare
+     type: Utilization
+     calc: $dht_size_all
+    units: peers
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (120) : (100))
+     crit: $this < (($status == $CRITICAL) ? (10)  : (1))
+    delay: up 0 down 15m multiplier 1.5 max 1h
+     info: number of DHT peers
+       to: sysadmin
--- a/health/health.d/riakkv.conf
+++ b/health/health.d/riakkv.conf
@ -1,86 +1,107 @@
 # Ensure that Riak is running.  template: riak_last_collected_secs
-template: riakkv_last_collected_secs
-      on: riak.kv.throughput
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: dba
+ template: riakkv_last_collected_secs
+       on: riak.kv.throughput
+    class: Database
+component: Riak KV
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: dba

 # Warn if a list keys operation is running.
-template: riakkv_list_keys_active
-      on: riak.core.fsm_active
-    calc: $list_fsm_active
-   units: state machines
-   every: 10s
-    warn: $list_fsm_active > 0
-    info: number of currently running list keys finite state machines
-      to: dba
+ template: riakkv_list_keys_active
+       on: riak.core.fsm_active
+    class: Database
+component: Riak KV
+     type: Utilization
+     calc: $list_fsm_active
+    units: state machines
+    every: 10s
+     warn: $list_fsm_active > 0
+     info: number of currently running list keys finite state machines
+       to: dba


 ## Timing healthchecks
 # KV GET
-template: riakkv_1h_kv_get_mean_latency
-      on: riak.kv.latency.get
-    calc: $node_get_fsm_time_mean
-  lookup: average -1h unaligned of time
-   every: 30s
-   units: ms
-    info: average time between reception of client GET request and \
-          subsequent response to client over the last hour
+ template: riakkv_1h_kv_get_mean_latency
+       on: riak.kv.latency.get
+    class: Database
+component: Riak KV
+     type: Latency
+     calc: $node_get_fsm_time_mean
+   lookup: average -1h unaligned of time
+    every: 30s
+    units: ms
+     info: average time between reception of client GET request and \
+           subsequent response to client over the last hour

-template: riakkv_kv_get_slow
-      on: riak.kv.latency.get
-    calc: $mean
-  lookup: average -3m unaligned of time
-   units: ms
-   every: 10s
-    warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
-    crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
-    info: average time between reception of client GET request and \
-          subsequent response to the client over the last 3 minutes, \
-          compared to the average over the last hour
-   delay: down 5m multiplier 1.5 max 1h
-      to: dba
+ template: riakkv_kv_get_slow
+       on: riak.kv.latency.get
+    class: Database
+component: Riak KV
+     type: Latency
+     calc: $mean
+   lookup: average -3m unaligned of time
+    units: ms
+    every: 10s
+     warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
+     crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
+     info: average time between reception of client GET request and \
+           subsequent response to the client over the last 3 minutes, \
+           compared to the average over the last hour
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba

 # KV PUT
-template: riakkv_1h_kv_put_mean_latency
-      on: riak.kv.latency.put
-    calc: $node_put_fsm_time_mean
-  lookup: average -1h unaligned of time
-   every: 30s
-   units: ms
-    info: average time between reception of client PUT request and \
-          subsequent response to the client over the last hour
+ template: riakkv_1h_kv_put_mean_latency
+       on: riak.kv.latency.put
+    class: Database
+component: Riak KV
+     type: Latency
+     calc: $node_put_fsm_time_mean
+   lookup: average -1h unaligned of time
+    every: 30s
+    units: ms
+     info: average time between reception of client PUT request and \
+           subsequent response to the client over the last hour

-template: riakkv_kv_put_slow
-      on: riak.kv.latency.put
-    calc: $mean
-  lookup: average -3m unaligned of time
-   units: ms
-   every: 10s
-    warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
-    crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
-    info: average time between reception of client PUT request and \
-          subsequent response to the client over the last 3 minutes, \
-          compared to the average over the last hour
-   delay: down 5m multiplier 1.5 max 1h
-      to: dba
+ template: riakkv_kv_put_slow
+       on: riak.kv.latency.put
+    class: Database
+component: Riak KV
+     type: Latency
+     calc: $mean
+   lookup: average -3m unaligned of time
+    units: ms
+    every: 10s
+     warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
+     crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
+     info: average time between reception of client PUT request and \
+           subsequent response to the client over the last 3 minutes, \
+           compared to the average over the last hour
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba


 ## VM healthchecks

 # Default Erlang VM process limit: 262144
 # On systems observed, this is < 2000, but may grow depending on load.
-template: riakkv_vm_high_process_count
-      on: riak.vm
-    calc: $sys_process_count
-   units: processes
-   every: 10s
-    warn: $this > 10000
-    crit: $this > 100000
-    info: number of processes running in the Erlang VM
-      to: dba
+ template: riakkv_vm_high_process_count
+       on: riak.vm
+    class: Database
+component: Riak KV
+     type: Utilization
+     calc: $sys_process_count
+    units: processes
+    every: 10s
+     warn: $this > 10000
+     crit: $this > 100000
+     info: number of processes running in the Erlang VM
+       to: dba
--- a/health/health.d/scaleio.conf
+++ b/health/health.d/scaleio.conf
@ -1,38 +1,47 @@

 # make sure scaleio is running

-template: scaleio_last_collected_secs
-      on: scaleio.system_capacity_total
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: scaleio_last_collected_secs
+       on: scaleio.system_capacity_total
+    class: Storage
+component: ScaleIO
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin

 # make sure Storage Pool capacity utilization is under limit

-template: scaleio_storage_pool_capacity_utilization
-      on: scaleio.storage_pool_capacity_utilization
-    calc: $used
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: storage pool capacity utilization
-      to: sysadmin
+ template: scaleio_storage_pool_capacity_utilization
+       on: scaleio.storage_pool_capacity_utilization
+    class: Storage
+component: ScaleIO
+     type: Utilization
+     calc: $used
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: storage pool capacity utilization
+       to: sysadmin


 # make sure Sdc is connected to MDM

-template: scaleio_sdc_mdm_connection_state
-      on: scaleio.sdc_mdm_connection_state
-    calc: $connected
-   every: 10s
-    warn: $this != 1
-   delay: up 30s down 5m multiplier 1.5 max 1h
-    info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected)
-      to: sysadmin
+ template: scaleio_sdc_mdm_connection_state
+       on: scaleio.sdc_mdm_connection_state
+    class: Storage
+component: ScaleIO
+     type: Utilization
+     calc: $connected
+    every: 10s
+     warn: $this != 1
+    delay: up 30s down 5m multiplier 1.5 max 1h
+     info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected)
+       to: sysadmin
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@ -3,43 +3,52 @@

 # check for common /proc/net/softnet_stat errors

-   alarm: 1min_netdev_backlog_exceeded
-      on: system.softnet_stat
-      os: linux
-   hosts: *
-  lookup: average -1m unaligned absolute of dropped
-   units: packets
-   every: 10s
-    warn: $this > (($status >= $WARNING) ? (0) : (10))
-   delay: down 1h multiplier 1.5 max 2h
-    info: average number of dropped packets in the last minute \
-          due to exceeded net.core.netdev_max_backlog
-      to: sysadmin
+    alarm: 1min_netdev_backlog_exceeded
+       on: system.softnet_stat
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+   lookup: average -1m unaligned absolute of dropped
+    units: packets
+    every: 10s
+     warn: $this > (($status >= $WARNING) ? (0) : (10))
+    delay: down 1h multiplier 1.5 max 2h
+     info: average number of dropped packets in the last minute \
+           due to exceeded net.core.netdev_max_backlog
+       to: sysadmin

-   alarm: 1min_netdev_budget_ran_outs
-      on: system.softnet_stat
-      os: linux
-   hosts: *
-  lookup: average -1m unaligned absolute of squeezed
-   units: events
-   every: 10s
-    warn: $this > (($status >= $WARNING) ? (0) : (10))
-   delay: down 1h multiplier 1.5 max 2h
-    info: average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
-          net.core.netdev_budget_usecs with work remaining over the last minute \
-          (this can be a cause for dropped packets)
-      to: silent
+    alarm: 1min_netdev_budget_ran_outs
+       on: system.softnet_stat
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+   lookup: average -1m unaligned absolute of squeezed
+    units: events
+    every: 10s
+     warn: $this > (($status >= $WARNING) ? (0) : (10))
+    delay: down 1h multiplier 1.5 max 2h
+     info: average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
+           net.core.netdev_budget_usecs with work remaining over the last minute \
+           (this can be a cause for dropped packets)
+       to: silent

-   alarm: 10min_netisr_backlog_exceeded
-      on: system.softnet_stat
-      os: freebsd
-   hosts: *
-  lookup: average -1m unaligned absolute of qdrops
-   units: packets
-   every: 10s
-    warn: $this > (($status >= $WARNING) ? (0) : (10))
-   delay: down 1h multiplier 1.5 max 2h
-    info: average number of drops in the last minute \
-          due to exceeded sysctl net.route.netisr_maxqlen \
-          (this can be a cause for dropped packets)
-      to: sysadmin
+    alarm: 10min_netisr_backlog_exceeded
+       on: system.softnet_stat
+    class: System
+component: Network
+     type: Errors
+       os: freebsd
+    hosts: *
+   lookup: average -1m unaligned absolute of qdrops
+    units: packets
+    every: 10s
+     warn: $this > (($status >= $WARNING) ? (0) : (10))
+    delay: down 1h multiplier 1.5 max 2h
+     info: average number of drops in the last minute \
+           due to exceeded sysctl net.route.netisr_maxqlen \
+           (this can be a cause for dropped packets)
+       to: sysadmin
--- a/health/health.d/squid.conf
+++ b/health/health.d/squid.conf
@ -1,14 +1,17 @@

 # make sure squid is running

-template: squid_last_collected_secs
-      on: squid.clients_requests
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: proxyadmin
+ template: squid_last_collected_secs
+       on: squid.clients_requests
+    class: Web Proxy
+component: Squid
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: proxyadmin

--- a/health/health.d/stiebeleltron.conf
+++ b/health/health.d/stiebeleltron.conf
@ -1,11 +1,14 @@
-template: stiebeleltron_last_collected_secs
-families: *
-      on: stiebeleltron.heating.hc1
-    calc: $now - $last_collected_t
-   every: 10s
-   units: seconds ago
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sitemgr
+ template: stiebeleltron_last_collected_secs
+ families: *
+       on: stiebeleltron.heating.hc1
+    class: Other
+component: Sensors
+     type: Latency
+     calc: $now - $last_collected_t
+    every: 10s
+    units: seconds ago
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sitemgr
--- a/health/health.d/swap.conf
+++ b/health/health.d/swap.conf
@ -1,29 +1,35 @@

 # you can disable an alarm notification by setting the 'to' line to: silent

-   alarm: 30min_ram_swapped_out
-      on: system.swapio
-      os: linux freebsd
-   hosts: *
-  lookup: sum -30m unaligned absolute of out
-          # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024
-    calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
-   units: % of RAM
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (20) : (30))
-   delay: down 15m multiplier 1.5 max 1h
-    info: percentage of the system RAM swapped in the last 30 minutes
-      to: sysadmin
+    alarm: 30min_ram_swapped_out
+       on: system.swapio
+    class: System
+component: Memory
+     type: Workload
+       os: linux freebsd
+    hosts: *
+   lookup: sum -30m unaligned absolute of out
+           # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) by 1024
+     calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
+    units: % of RAM
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (20) : (30))
+    delay: down 15m multiplier 1.5 max 1h
+     info: percentage of the system RAM swapped in the last 30 minutes
+       to: sysadmin

-   alarm: used_swap
-      on: system.swap
-      os: linux freebsd
-   hosts: *
-    calc: $used * 100 / ( $used + $free )
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: up 30s down 15m multiplier 1.5 max 1h
-    info: swap memory utilization
-      to: sysadmin
+    alarm: used_swap
+       on: system.swap
+    class: System
+component: Memory
+     type: Utilization
+       os: linux freebsd
+    hosts: *
+     calc: $used * 100 / ( $used + $free )
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: up 30s down 15m multiplier 1.5 max 1h
+     info: swap memory utilization
+       to: sysadmin
--- a/health/health.d/systemdunits.conf
+++ b/health/health.d/systemdunits.conf
@ -2,111 +2,141 @@
 ## States: 1 - active, 2 - inactive, 3 - activating, 4 - deactivating, 5 - failed.

 ## Service units
-template: systemd_service_units_state
-      on: systemd.service_units_state
-  lookup: max -1s min2max
-   units: ok/failed
-   every: 10s
-    warn: $this != nan AND $this == 5
-   delay: down 5m multiplier 1.5 max 1h
-    info: one or more systemd service units are in the failed state
-      to: sysadmin
+ template: systemd_service_units_state
+       on: systemd.service_units_state
+    class: Linux
+component: Systemd units
+     type: Errors
+   lookup: max -1s min2max
+    units: ok/failed
+    every: 10s
+     warn: $this != nan AND $this == 5
+    delay: down 5m multiplier 1.5 max 1h
+     info: one or more systemd service units are in the failed state
+       to: sysadmin

 ## Socket units
-template: systemd_socket_units_state
-      on: systemd.socket_unit_state
-  lookup: max -1s min2max
-   units: ok/failed
-   every: 10s
-    warn: $this != nan AND $this == 5
-   delay: down 5m multiplier 1.5 max 1h
-    info: one or more systemd socket units are in the failed state
-      to: sysadmin
+ template: systemd_socket_units_state
+       on: systemd.socket_unit_state
+    class: Linux
+component: Systemd units
+     type: Errors
+   lookup: max -1s min2max
+    units: ok/failed
+    every: 10s
+     warn: $this != nan AND $this == 5
+    delay: down 5m multiplier 1.5 max 1h
+     info: one or more systemd socket units are in the failed state
+       to: sysadmin

 ## Target units
-template: systemd_target_units_state
-      on: systemd.target_unit_state
-  lookup: max -1s min2max
-   units: ok/failed
-   every: 10s
-    warn: $this != nan AND $this == 5
-   delay: down 5m multiplier 1.5 max 1h
-    info: one or more systemd target units are in the failed state
-      to: sysadmin
+ template: systemd_target_units_state
+       on: systemd.target_unit_state
+    class: Linux
+component: Systemd units
+     type: Errors
+   lookup: max -1s min2max
+    units: ok/failed
+    every: 10s
+     warn: $this != nan AND $this == 5
+    delay: down 5m multiplier 1.5 max 1h
+     info: one or more systemd target units are in the failed state
+       to: sysadmin

 ## Path units
-template: systemd_path_units_state
-      on: systemd.path_unit_state
-  lookup: max -1s min2max
-   units: ok/failed
-   every: 10s
-    warn: $this != nan AND $this == 5
-   delay: down 5m multiplier 1.5 max 1h
-    info: one or more systemd path units are in the failed state
-      to: sysadmin
+ template: systemd_path_units_state
+       on: systemd.path_unit_state
+    class: Linux
+component: Systemd units
+     type: Errors
+   lookup: max -1s min2max
+    units: ok/failed
+    every: 10s
+     warn: $this != nan AND $this == 5
+    delay: down 5m multiplier 1.5 max 1h
+     info: one or more systemd path units are in the failed state
+       to: sysadmin

 ## Device units
-template: systemd_device_units_state
-      on: systemd.device_unit_state
-  lookup: max -1s min2max
-   units: ok/failed
-   every: 10s
-    warn: $this != nan AND $this == 5
-   delay: down 5m multiplier 1.5 max 1h
-    info: one or more the systemd device units are in the failed state
-      to: sysadmin
+ template: systemd_device_units_state
+       on: systemd.device_unit_state
+    class: Linux
+component: Systemd units
+     type: Errors
+   lookup: max -1s min2max
+    units: ok/failed
+    every: 10s
+     warn: $this != nan AND $this == 5
+    delay: down 5m multiplier 1.5 max 1h
+     info: one or more systemd device units are in the failed state
+       to: sysadmin

 ## Mount units
-template: systemd_mount_units_state
-      on: systemd.mount_unit_state
-  lookup: max -1s min2max
-   units: ok/failed
-   every: 10s
-    warn: $this != nan AND $this == 5
-   delay: down 5m multiplier 1.5 max 1h
-    info: one or more the systemd mount units are in the failed state
-      to: sysadmin
+ template: systemd_mount_units_state
+       on: systemd.mount_unit_state
+    class: Linux
+component: Systemd units
+     type: Errors
+   lookup: max -1s min2max
+    units: ok/failed
+    every: 10s
+     warn: $this != nan AND $this == 5
+    delay: down 5m multiplier 1.5 max 1h
+     info: one or more systemd mount units are in the failed state
+       to: sysadmin

 ## Automount units
-template: systemd_automount_units_state
-      on: systemd.automount_unit_state
-  lookup: max -1s min2max
-   units: ok/failed
-   every: 10s
-    warn: $this != nan AND $this == 5
-   delay: down 5m multiplier 1.5 max 1h
-    info: one or more systemd automount units are in the failed state
-      to: sysadmin
+ template: systemd_automount_units_state
+       on: systemd.automount_unit_state
+    class: Linux
+component: Systemd units
+     type: Errors
+   lookup: max -1s min2max
+    units: ok/failed
+    every: 10s
+     warn: $this != nan AND $this == 5
+    delay: down 5m multiplier 1.5 max 1h
+     info: one or more systemd automount units are in the failed state
+       to: sysadmin

 ## Swap units
-template: systemd_swap_units_state
-      on: systemd.swap_unit_state
-  lookup: max -1s min2max
-   units: ok/failed
-   every: 10s
-    warn: $this != nan AND $this == 5
-   delay: down 5m multiplier 1.5 max 1h
-    info: one or more systemd swap units are in the failed state
-      to: sysadmin
+ template: systemd_swap_units_state
+       on: systemd.swap_unit_state
+    class: Linux
+component: Systemd units
+     type: Errors
+   lookup: max -1s min2max
+    units: ok/failed
+    every: 10s
+     warn: $this != nan AND $this == 5
+    delay: down 5m multiplier 1.5 max 1h
+     info: one or more systemd swap units are in the failed state
+       to: sysadmin

 ## Scope units
-template: systemd_scope_units_state
-      on: systemd.scope_unit_state
-  lookup: max -1s min2max
-   units: ok/failed
-   every: 10s
-    warn: $this != nan AND $this == 5
-   delay: down 5m multiplier 1.5 max 1h
-    info: one or more systemd scope units are in the failed state
-      to: sysadmin
+ template: systemd_scope_units_state
+       on: systemd.scope_unit_state
+    class: Linux
+component: Systemd units
+     type: Errors
+   lookup: max -1s min2max
+    units: ok/failed
+    every: 10s
+     warn: $this != nan AND $this == 5
+    delay: down 5m multiplier 1.5 max 1h
+     info: one or more systemd scope units are in the failed state
+       to: sysadmin

 ## Slice units
-template: systemd_slice_units_state
-      on: systemd.slice_unit_state
-  lookup: max -1s min2max
-   units: ok/failed
-   every: 10s
-    warn: $this != nan AND $this == 5
-   delay: down 5m multiplier 1.5 max 1h
-    info: one or more systemd slice units are in the failed state
-      to: sysadmin
+ template: systemd_slice_units_state
+       on: systemd.slice_unit_state
+    class: Linux
+component: Systemd units
+     type: Errors
+   lookup: max -1s min2max
+    units: ok/failed
+    every: 10s
+     warn: $this != nan AND $this == 5
+    delay: down 5m multiplier 1.5 max 1h
+     info: one or more systemd slice units are in the failed state
+       to: sysadmin
--- a/health/health.d/tcp_conn.conf
+++ b/health/health.d/tcp_conn.conf
@ -5,15 +5,18 @@
 # In this case, the alarm will always be zero.
 #

-   alarm: tcp_connections
-      on: ipv4.tcpsock
-      os: linux
-   hosts: *
-    calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
-    crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: IPv4 TCP connections utilization
-      to: sysadmin
+    alarm: tcp_connections
+       on: ipv4.tcpsock
+    class: System
+component: Network
+     type: Workload
+       os: linux
+    hosts: *
+     calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
+     crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
+    delay: up 0 down 5m multiplier 1.5 max 1h
+     info: IPv4 TCP connections utilization
+       to: sysadmin
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@ -18,33 +18,39 @@
 # -----------------------------------------------------------------------------
 # tcp accept queue (at the kernel)

-   alarm: 1m_tcp_accept_queue_overflows
-      on: ip.tcp_accept_queue
-      os: linux
-   hosts: *
-  lookup: average -60s unaligned absolute of ListenOverflows
-   units: overflows
-   every: 10s
-    warn: $this > 1
-    crit: $this > (($status == $CRITICAL) ? (1) : (5))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: average number of overflows in the TCP accept queue over the last minute
-      to: sysadmin
+    alarm: 1m_tcp_accept_queue_overflows
+       on: ip.tcp_accept_queue
+    class: System
+component: Network
+     type: Workload
+       os: linux
+    hosts: *
+   lookup: average -60s unaligned absolute of ListenOverflows
+    units: overflows
+    every: 10s
+     warn: $this > 1
+     crit: $this > (($status == $CRITICAL) ? (1) : (5))
+    delay: up 0 down 5m multiplier 1.5 max 1h
+     info: average number of overflows in the TCP accept queue over the last minute
+       to: sysadmin

 # THIS IS TOO GENERIC
 # CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
-   alarm: 1m_tcp_accept_queue_drops
-      on: ip.tcp_accept_queue
-      os: linux
-   hosts: *
-  lookup: average -60s unaligned absolute of ListenDrops
-   units: drops
-   every: 10s
-    warn: $this > 1
-    crit: $this > (($status == $CRITICAL) ? (1) : (5))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: average number of dropped packets in the TCP accept queue over the last minute
-      to: sysadmin
+    alarm: 1m_tcp_accept_queue_drops
+       on: ip.tcp_accept_queue
+    class: System
+component: Network
+     type: Workload
+       os: linux
+    hosts: *
+   lookup: average -60s unaligned absolute of ListenDrops
+    units: drops
+    every: 10s
+     warn: $this > 1
+     crit: $this > (($status == $CRITICAL) ? (1) : (5))
+    delay: up 0 down 5m multiplier 1.5 max 1h
+     info: average number of dropped packets in the TCP accept queue over the last minute
+       to: sysadmin


 # -----------------------------------------------------------------------------
@ -55,30 +61,36 @@
 # enabled or not. In both cases this probably indicates a SYN flood attack,
 # so i guess a notification should be sent.

-   alarm: 1m_tcp_syn_queue_drops
-      on: ip.tcp_syn_queue
-      os: linux
-   hosts: *
-  lookup: average -60s unaligned absolute of TCPReqQFullDrop
-   units: drops
-   every: 10s
-    warn: $this > 1
-    crit: $this > (($status == $CRITICAL) ? (0) : (5))
-   delay: up 10 down 5m multiplier 1.5 max 1h
-    info: average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \
-          (SYN cookies were not enabled)
-      to: sysadmin
+    alarm: 1m_tcp_syn_queue_drops
+       on: ip.tcp_syn_queue
+    class: System
+component: Network
+     type: Workload
+       os: linux
+    hosts: *
+   lookup: average -60s unaligned absolute of TCPReqQFullDrop
+    units: drops
+    every: 10s
+     warn: $this > 1
+     crit: $this > (($status == $CRITICAL) ? (0) : (5))
+    delay: up 10 down 5m multiplier 1.5 max 1h
+     info: average number of SYN requests dropped due to the full TCP SYN queue over the last minute \
+           (SYN cookies were not enabled)
+       to: sysadmin

-   alarm: 1m_tcp_syn_queue_cookies
-      on: ip.tcp_syn_queue
-      os: linux
-   hosts: *
-  lookup: average -60s unaligned absolute of TCPReqQFullDoCookies
-   units: cookies
-   every: 10s
-    warn: $this > 1
-    crit: $this > (($status == $CRITICAL) ? (0) : (5))
-   delay: up 10 down 5m multiplier 1.5 max 1h
-    info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
-      to: sysadmin
+    alarm: 1m_tcp_syn_queue_cookies
+       on: ip.tcp_syn_queue
+    class: System
+component: Network
+     type: Workload
+       os: linux
+    hosts: *
+   lookup: average -60s unaligned absolute of TCPReqQFullDoCookies
+    units: cookies
+    every: 10s
+     warn: $this > 1
+     crit: $this > (($status == $CRITICAL) ? (0) : (5))
+    delay: up 10 down 5m multiplier 1.5 max 1h
+     info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
+       to: sysadmin

--- a/health/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
@ -6,15 +6,18 @@
 # and a critical when TCP is 90% of its upper memory limit
 #

-   alarm: tcp_memory
-      on: ipv4.sockstat_tcp_mem
-      os: linux
-   hosts: *
-    calc: ${mem} * 100 / ${tcp_mem_high}
-   units: %
-   every: 10s
-    warn: ${mem} > (($status >= $WARNING  ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure}   ))
-    crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure}       ) : ( ${tcp_mem_high} * 0.9 ))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: TCP memory utilization
-      to: sysadmin
+    alarm: tcp_memory
+       on: ipv4.sockstat_tcp_mem
+    class: System
+component: Network
+     type: Utilization
+       os: linux
+    hosts: *
+     calc: ${mem} * 100 / ${tcp_mem_high}
+    units: %
+    every: 10s
+     warn: ${mem} > (($status >= $WARNING  ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure}   ))
+     crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure}       ) : ( ${tcp_mem_high} * 0.9 ))
+    delay: up 0 down 5m multiplier 1.5 max 1h
+     info: TCP memory utilization
+       to: sysadmin
--- a/health/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
@ -7,15 +7,18 @@
 # so we alarm warning at 25% and critical at 50%
 #

-   alarm: tcp_orphans
-      on: ipv4.sockstat_tcp_sockets
-      os: linux
-   hosts: *
-    calc: ${orphan} * 100 / ${tcp_max_orphans}
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
-    crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: orphan IPv4 TCP sockets utilization
-      to: sysadmin
+    alarm: tcp_orphans
+       on: ipv4.sockstat_tcp_sockets
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+     calc: ${orphan} * 100 / ${tcp_max_orphans}
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
+     crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
+    delay: up 0 down 5m multiplier 1.5 max 1h
+     info: orphan IPv4 TCP sockets utilization
+       to: sysadmin
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@ -4,54 +4,66 @@
 # -----------------------------------------------------------------------------
 # tcp resets this host sends

-   alarm: 1m_ipv4_tcp_resets_sent
-      on: ipv4.tcphandshake
-      os: linux
-   hosts: *
-  lookup: average -1m at -10s unaligned absolute of OutRsts
-   units: tcp resets/s
-   every: 10s
-    info: average number of sent TCP RESETS over the last minute
+    alarm: 1m_ipv4_tcp_resets_sent
+       on: ipv4.tcphandshake
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+   lookup: average -1m at -10s unaligned absolute of OutRsts
+    units: tcp resets/s
+    every: 10s
+     info: average number of sent TCP RESETS over the last minute

-   alarm: 10s_ipv4_tcp_resets_sent
-      on: ipv4.tcphandshake
-      os: linux
-   hosts: *
-  lookup: average -10s unaligned absolute of OutRsts
-   units: tcp resets/s
-   every: 10s
-    warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (20)))
-   delay: up 20s down 60m multiplier 1.2 max 2h
- options: no-clear-notification
-    info: average number of sent TCP RESETS over the last 10 seconds. \
-          This can indicate a port scan, \
-          or that a service running on this host has crashed. \
-          Netdata will not send a clear notification for this alarm.
-      to: sysadmin
+    alarm: 10s_ipv4_tcp_resets_sent
+       on: ipv4.tcphandshake
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+   lookup: average -10s unaligned absolute of OutRsts
+    units: tcp resets/s
+    every: 10s
+     warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (20)))
+    delay: up 20s down 60m multiplier 1.2 max 2h
+  options: no-clear-notification
+     info: average number of sent TCP RESETS over the last 10 seconds. \
+           This can indicate a port scan, \
+           or that a service running on this host has crashed. \
+           Netdata will not send a clear notification for this alarm.
+       to: sysadmin

 # -----------------------------------------------------------------------------
 # tcp resets this host receives

-   alarm: 1m_ipv4_tcp_resets_received
-      on: ipv4.tcphandshake
-      os: linux freebsd
-   hosts: *
-  lookup: average -1m at -10s unaligned absolute of AttemptFails
-   units: tcp resets/s
-   every: 10s
-    info: average number of received TCP RESETS over the last minute
+    alarm: 1m_ipv4_tcp_resets_received
+       on: ipv4.tcphandshake
+    class: System
+component: Network
+     type: Errors
+       os: linux freebsd
+    hosts: *
+   lookup: average -1m at -10s unaligned absolute of AttemptFails
+    units: tcp resets/s
+    every: 10s
+     info: average number of received TCP RESETS over the last minute

-   alarm: 10s_ipv4_tcp_resets_received
-      on: ipv4.tcphandshake
-      os: linux freebsd
-   hosts: *
-  lookup: average -10s unaligned absolute of AttemptFails
-   units: tcp resets/s
-   every: 10s
-    warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (10)))
-   delay: up 20s down 60m multiplier 1.2 max 2h
- options: no-clear-notification
-    info: average number of received TCP RESETS over the last 10 seconds. \
-          This can be an indication that a service this host needs has crashed. \
-          Netdata will not send a clear notification for this alarm.
-      to: sysadmin
+    alarm: 10s_ipv4_tcp_resets_received
+       on: ipv4.tcphandshake
+    class: System
+component: Network
+     type: Errors
+       os: linux freebsd
+    hosts: *
+   lookup: average -10s unaligned absolute of AttemptFails
+    units: tcp resets/s
+    every: 10s
+     warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (10)))
+    delay: up 20s down 60m multiplier 1.2 max 2h
+  options: no-clear-notification
+     info: average number of received TCP RESETS over the last 10 seconds. \
+           This can be an indication that a service this host needs has crashed. \
+           Netdata will not send a clear notification for this alarm.
+       to: sysadmin
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@ -4,29 +4,35 @@
 # -----------------------------------------------------------------------------
 # UDP receive buffer errors

-   alarm: 1m_ipv4_udp_receive_buffer_errors
-      on: ipv4.udperrors
-      os: linux freebsd
-   hosts: *
-  lookup: average -1m unaligned absolute of RcvbufErrors
-   units: errors
-   every: 10s
-    warn: $this > (($status >= $WARNING) ? (0) : (10))
-    info: average number of UDP receive buffer errors over the last minute
-   delay: up 1m down 60m multiplier 1.2 max 2h
-      to: sysadmin
+    alarm: 1m_ipv4_udp_receive_buffer_errors
+       on: ipv4.udperrors
+    class: System
+component: Network
+     type: Errors
+       os: linux freebsd
+    hosts: *
+   lookup: average -1m unaligned absolute of RcvbufErrors
+    units: errors
+    every: 10s
+     warn: $this > (($status >= $WARNING) ? (0) : (10))
+     info: average number of UDP receive buffer errors over the last minute
+    delay: up 1m down 60m multiplier 1.2 max 2h
+       to: sysadmin

 # -----------------------------------------------------------------------------
 # UDP send buffer errors

-   alarm: 1m_ipv4_udp_send_buffer_errors
-      on: ipv4.udperrors
-      os: linux
-   hosts: *
-  lookup: average -1m unaligned absolute of SndbufErrors
-   units: errors
-   every: 10s
-    warn: $this > (($status >= $WARNING) ? (0) : (10))
-    info: average number of UDP send buffer errors over the last minute
-   delay: up 1m down 60m multiplier 1.2 max 2h
-      to: sysadmin
+    alarm: 1m_ipv4_udp_send_buffer_errors
+       on: ipv4.udperrors
+    class: System
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+   lookup: average -1m unaligned absolute of SndbufErrors
+    units: errors
+    every: 10s
+     warn: $this > (($status >= $WARNING) ? (0) : (10))
+     info: average number of UDP send buffer errors over the last minute
+    delay: up 1m down 60m multiplier 1.2 max 2h
+       to: sysadmin
--- a/health/health.d/unbound.conf
+++ b/health/health.d/unbound.conf
@ -1,35 +1,44 @@

 # make sure unbound is running

-template: unbound_last_collected_secs
-      on: unbound.queries
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: unbound_last_collected_secs
+       on: unbound.queries
+    class: DNS
+component: Unbound
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin

 # make sure there is no overwritten/dropped queries in the request-list

-template: unbound_request_list_overwritten
-      on: unbound.request_list_jostle_list
-  lookup: average -60s unaligned absolute match-names of overwritten
-   units: queries
-   every: 10s
-    warn: $this > 5
-   delay: up 10 down 5m multiplier 1.5 max 1h
-    info: number of overwritten queries in the request-list
-      to: sysadmin
+ template: unbound_request_list_overwritten
+       on: unbound.request_list_jostle_list
+    class: DNS
+component: Unbound
+     type: Errors
+   lookup: average -60s unaligned absolute match-names of overwritten
+    units: queries
+    every: 10s
+     warn: $this > 5
+    delay: up 10 down 5m multiplier 1.5 max 1h
+     info: number of overwritten queries in the request-list
+       to: sysadmin

-template: unbound_request_list_dropped
-      on: unbound.request_list_jostle_list
-  lookup: average -60s unaligned absolute match-names of dropped
-   units: queries
-   every: 10s
-    warn: $this > 0
-   delay: up 10 down 5m multiplier 1.5 max 1h
-    info: number of dropped queries in the request-list
-      to: sysadmin
+ template: unbound_request_list_dropped
+       on: unbound.request_list_jostle_list
+    class: DNS
+component: Unbound
+     type: Errors
+   lookup: average -60s unaligned absolute match-names of dropped
+    units: queries
+    every: 10s
+     warn: $this > 0
+    delay: up 10 down 5m multiplier 1.5 max 1h
+     info: number of dropped queries in the request-list
+       to: sysadmin
--- a/health/health.d/varnish.conf
+++ b/health/health.d/varnish.conf
@ -1,9 +1,12 @@
-   alarm: varnish_last_collected
-      on: varnish.uptime
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+    alarm: varnish_last_collected
+       on: varnish.uptime
+    class: Web Proxy
+component: Varnish
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+     info: number of seconds since the last successful data collection
+       to: sysadmin
--- a/health/health.d/vcsa.conf
+++ b/health/health.d/vcsa.conf
@ -1,16 +1,19 @@

 # make sure vcsa is running and responding

-template: vcsa_last_collected_secs
-      on: vcsa.system_health
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: vcsa_last_collected_secs
+       on: vcsa.system_health
+    class: Virtual Machine
+component: VMware vCenter
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin

 # Overall system health:
 #  - 0: all components are healthy.
@ -19,17 +22,20 @@ template: vcsa_last_collected_secs
 #  - 3: one or more components might be in an unusable status and the appliance might become unresponsive soon.
 #  - 4: no health data is available.

-template: vcsa_system_health
-      on: vcsa.system_health
-  lookup: max -10s unaligned of system
-   units: status
-   every: 10s
-    warn: ($this == 1) || ($this == 2)
-    crit: $this == 3
-   delay: down 1m multiplier 1.5 max 1h
-    info: overall system health status \
-          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
-      to: sysadmin
+ template: vcsa_system_health
+       on: vcsa.system_health
+    class: Virtual Machine
+component: VMware vCenter
+     type: Errors
+   lookup: max -10s unaligned of system
+    units: status
+    every: 10s
+     warn: ($this == 1) || ($this == 2)
+     crit: $this == 3
+    delay: down 1m multiplier 1.5 max 1h
+     info: overall system health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin

 # Components health:
 #  - 0: healthy.
@ -38,77 +44,95 @@ template: vcsa_system_health
 #  - 3: unavailable, or will stop functioning soon.
 #  - 4: no health data is available.

-template: vcsa_swap_health
-      on: vcsa.components_health
-  lookup: max -10s unaligned of swap
-   units: status
-   every: 10s
-    warn: $this == 1
-    crit: ($this == 2) || ($this == 3)
-   delay: down 1m multiplier 1.5 max 1h
-    info: swap health status \
-          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
-      to: sysadmin
+ template: vcsa_swap_health
+       on: vcsa.components_health
+    class: Virtual Machine
+component: VMware vCenter
+     type: Errors
+   lookup: max -10s unaligned of swap
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: swap health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin

-template: vcsa_storage_health
-      on: vcsa.components_health
-  lookup: max -10s unaligned of storage
-   units: status
-   every: 10s
-    warn: $this == 1
-    crit: ($this == 2) || ($this == 3)
-   delay: down 1m multiplier 1.5 max 1h
-    info: storage health status \
-          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
-      to: sysadmin
+ template: vcsa_storage_health
+       on: vcsa.components_health
+    class: Virtual Machine
+component: VMware vCenter
+     type: Errors
+   lookup: max -10s unaligned of storage
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: storage health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin

-template: vcsa_mem_health
-      on: vcsa.components_health
-  lookup: max -10s unaligned of mem
-   units: status
-   every: 10s
-    warn: $this == 1
-    crit: ($this == 2) || ($this == 3)
-   delay: down 1m multiplier 1.5 max 1h
-    info: memory health status \
-          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
-      to: sysadmin
+ template: vcsa_mem_health
+       on: vcsa.components_health
+    class: Virtual Machine
+component: VMware vCenter
+     type: Errors
+   lookup: max -10s unaligned of mem
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: memory health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin

-template: vcsa_load_health
-      on: vcsa.components_health
-  lookup: max -10s unaligned of load
-   units: status
-   every: 10s
-    warn: $this == 1
-    crit: ($this == 2) || ($this == 3)
-   delay: down 1m multiplier 1.5 max 1h
-    info: load health status \
-          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
-      to: sysadmin
+ template: vcsa_load_health
+       on: vcsa.components_health
+    class: Virtual Machine
+component: VMware vCenter
+     type: Utilization
+   lookup: max -10s unaligned of load
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: load health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin

-template: vcsa_database_storage_health
-      on: vcsa.components_health
-  lookup: max -10s unaligned of database_storage
-   units: status
-   every: 10s
-    warn: $this == 1
-    crit: ($this == 2) || ($this == 3)
-   delay: down 1m multiplier 1.5 max 1h
-    info: database storage health status \
-          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
-      to: sysadmin
+ template: vcsa_database_storage_health
+       on: vcsa.components_health
+    class: Virtual Machine
+component: VMware vCenter
+     type: Errors
+   lookup: max -10s unaligned of database_storage
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: database storage health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin

-template: vcsa_applmgmt_health
-      on: vcsa.components_health
-  lookup: max -10s unaligned of applmgmt
-   units: status
-   every: 10s
-    warn: $this == 1
-    crit: ($this == 2) || ($this == 3)
-   delay: down 1m multiplier 1.5 max 1h
-    info: applmgmt health status \
-          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
-      to: sysadmin
+ template: vcsa_applmgmt_health
+       on: vcsa.components_health
+    class: Virtual Machine
+component: VMware vCenter
+     type: Errors
+   lookup: max -10s unaligned of applmgmt
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: applmgmt health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin


 # Software updates health:
@ -117,14 +141,17 @@ template: vcsa_applmgmt_health
 #  - 3: security updates are available.
 #  - 4: an error retrieving information on software updates.

-template: vcsa_software_updates_health
-      on: vcsa.software_updates_health
-  lookup: max -10s unaligned of software_packages
-   units: status
-   every: 10s
-    warn: $this == 4
-    crit: $this == 3
-   delay: down 1m multiplier 1.5 max 1h
-    info: software updates availability status \
-          (-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
-      to: sysadmin
+ template: vcsa_software_updates_health
+       on: vcsa.software_updates_health
+    class: Virtual Machine
+component: VMware vCenter
+     type: Errors
+   lookup: max -10s unaligned of software_packages
+    units: status
+    every: 10s
+     warn: $this == 4
+     crit: $this == 3
+    delay: down 1m multiplier 1.5 max 1h
+     info: software updates availability status \
+           (-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
+       to: sysadmin
--- a/health/health.d/vernemq.conf
+++ b/health/health.d/vernemq.conf
@ -1,300 +1,381 @@

 # Availability

-template: vernemq_last_collected_secs
-      on: vernemq.node_uptime
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: vernemq_last_collected_secs
+       on: vernemq.node_uptime
+    class: Messaging
+component: VerneMQ
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin

 # Socket errors

-template: vernemq_socket_errors
-      on: vernemq.socket_errors
-  lookup: sum -1m unaligned absolute of socket_error
-   units: errors
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 2m down 5m multiplier 1.5 max 2h
-    info: number of socket errors in the last minute
-      to: sysadmin
+ template: vernemq_socket_errors
+       on: vernemq.socket_errors
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute of socket_error
+    units: errors
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of socket errors in the last minute
+       to: sysadmin

 # Queues dropped/expired/unhandled PUBLISH messages

-template: vernemq_queue_message_drop
-      on: vernemq.queue_undelivered_messages
-  lookup: sum -1m unaligned absolute of queue_message_drop
-   units: dropped messages
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of dropped messaged due to full queues in the last minute
-      to: sysadmin
+ template: vernemq_queue_message_drop
+       on: vernemq.queue_undelivered_messages
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute of queue_message_drop
+    units: dropped messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of dropped messages due to full queues in the last minute
+       to: sysadmin

-template: vernemq_queue_message_expired
-      on: vernemq.queue_undelivered_messages
-  lookup: sum -1m unaligned absolute of queue_message_expired
-   units: expired messages
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (15))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of messages which expired before delivery in the last minute
-      to: sysadmin
+ template: vernemq_queue_message_expired
+       on: vernemq.queue_undelivered_messages
+    class: Messaging
+component: VerneMQ
+     type: Latency
+   lookup: sum -1m unaligned absolute of queue_message_expired
+    units: expired messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (15))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of messages which expired before delivery in the last minute
+       to: sysadmin

-template: vernemq_queue_message_unhandled
-      on: vernemq.queue_undelivered_messages
-  lookup: sum -1m unaligned absolute of queue_message_unhandled
-   units: unhandled messages
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of unhandled messages (connections with clean session=true) in the last minute
-      to: sysadmin
+ template: vernemq_queue_message_unhandled
+       on: vernemq.queue_undelivered_messages
+    class: Messaging
+component: VerneMQ
+     type: Latency
+   lookup: sum -1m unaligned absolute of queue_message_unhandled
+    units: unhandled messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of unhandled messages (connections with clean session=true) in the last minute
+       to: sysadmin

 # Erlang VM

-template: vernemq_average_scheduler_utilization
-      on: vernemq.average_scheduler_utilization
-  lookup: average -10m unaligned
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (75) : (85))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
-   delay: down 15m multiplier 1.5 max 1h
-    info: average scheduler utilization over the last 10 minutes
-      to: sysadmin
+ template: vernemq_average_scheduler_utilization
+       on: vernemq.average_scheduler_utilization
+    class: Messaging
+component: VerneMQ
+     type: Utilization
+   lookup: average -10m unaligned
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average scheduler utilization over the last 10 minutes
+       to: sysadmin

 # Cluster communication and netsplits

-template: vernemq_cluster_dropped
-      on: vernemq.cluster_dropped
-  lookup: sum -1m unaligned
-   units: KiB
-   every: 1m
-    warn: $this > 0
-   delay: up 5m down 5m multiplier 1.5 max 1h
-    info: amount of traffic dropped during communication with the cluster nodes in the last minute
-      to: sysadmin
+ template: vernemq_cluster_dropped
+       on: vernemq.cluster_dropped
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned
+    units: KiB
+    every: 1m
+     warn: $this > 0
+    delay: up 5m down 5m multiplier 1.5 max 1h
+     info: amount of traffic dropped during communication with the cluster nodes in the last minute
+       to: sysadmin

-template: vernemq_netsplits
-      on: vernemq.netsplits
-  lookup: sum -1m unaligned absolute of netsplit_detected
-   units: netsplits
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: number of detected netsplits (split brain situation) in the last minute
-      to: sysadmin
+ template: vernemq_netsplits
+       on: vernemq.netsplits
+    class: Messaging
+component: VerneMQ
+     type: Workload
+   lookup: sum -1m unaligned absolute of netsplit_detected
+    units: netsplits
+    every: 10s
+     warn: $this > 0
+    delay: down 5m multiplier 1.5 max 2h
+     info: number of detected netsplits (split brain situation) in the last minute
+       to: sysadmin

 # Unsuccessful CONNACK

-template: vernemq_mqtt_connack_sent_reason_unsuccessful
-      on: vernemq.mqtt_connack_sent_reason
-  lookup: sum -1m unaligned absolute match-names of !success,*
-   units: packets
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of sent unsuccessful v3/v5 CONNACK packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_connack_sent_reason_unsuccessful
+       on: vernemq.mqtt_connack_sent_reason
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of sent unsuccessful v3/v5 CONNACK packets in the last minute
+       to: sysadmin

 # Not normal DISCONNECT

-template: vernemq_mqtt_disconnect_received_reason_not_normal
-      on: vernemq.mqtt_disconnect_received_reason
-  lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
-   units: packets
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of received not normal v5 DISCONNECT packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_disconnect_received_reason_not_normal
+       on: vernemq.mqtt_disconnect_received_reason
+    class: Messaging
+component: VerneMQ
+     type: Workload
+   lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of received not normal v5 DISCONNECT packets in the last minute
+       to: sysadmin

-template: vernemq_mqtt_disconnect_sent_reason_not_normal
-      on: vernemq.mqtt_disconnect_sent_reason
-  lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
-   units: packets
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of sent not normal v5 DISCONNECT packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_disconnect_sent_reason_not_normal
+       on: vernemq.mqtt_disconnect_sent_reason
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of sent not normal v5 DISCONNECT packets in the last minute
+       to: sysadmin

 # SUBSCRIBE errors and unauthorized attempts

-template: vernemq_mqtt_subscribe_error
-      on: vernemq.mqtt_subscribe_error
-  lookup: sum -1m unaligned absolute
-   units: failed ops
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of failed v3/v5 SUBSCRIBE operations in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_subscribe_error
+       on: vernemq.mqtt_subscribe_error
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute
+    units: failed ops
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of failed v3/v5 SUBSCRIBE operations in the last minute
+       to: sysadmin

-template: vernemq_mqtt_subscribe_auth_error
-      on: vernemq.mqtt_subscribe_auth_error
-  lookup: sum -1m unaligned absolute
-   units: attempts
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_subscribe_auth_error
+       on: vernemq.mqtt_subscribe_auth_error
+    class: Messaging
+component: VerneMQ
+     type: Workload
+   lookup: sum -1m unaligned absolute
+    units: attempts
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute
+       to: sysadmin

 # UNSUBSCRIBE errors

-template: vernemq_mqtt_unsubscribe_error
-      on: vernemq.mqtt_unsubscribe_error
-  lookup: sum -1m unaligned absolute
-   units: failed ops
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of failed v3/v5 UNSUBSCRIBE operations in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_unsubscribe_error
+       on: vernemq.mqtt_unsubscribe_error
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute
+    units: failed ops
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of failed v3/v5 UNSUBSCRIBE operations in the last minute
+       to: sysadmin

 # PUBLISH errors and unauthorized attempts

-template: vernemq_mqtt_publish_errors
-      on: vernemq.mqtt_publish_errors
-  lookup: sum -1m unaligned absolute
-   units: failed ops
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of failed v3/v5 PUBLISH operations in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_publish_errors
+       on: vernemq.mqtt_publish_errors
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute
+    units: failed ops
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of failed v3/v5 PUBLISH operations in the last minute
+       to: sysadmin

-template: vernemq_mqtt_publish_auth_errors
-      on: vernemq.mqtt_publish_auth_errors
-  lookup: sum -1m unaligned absolute
-   units: attempts
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of unauthorized v3/v5 PUBLISH attempts in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_publish_auth_errors
+       on: vernemq.mqtt_publish_auth_errors
+    class: Messaging
+component: VerneMQ
+     type: Workload
+   lookup: sum -1m unaligned absolute
+    units: attempts
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of unauthorized v3/v5 PUBLISH attempts in the last minute
+       to: sysadmin

 # Unsuccessful and unexpected PUBACK

-template: vernemq_mqtt_puback_received_reason_unsuccessful
-      on: vernemq.mqtt_puback_received_reason
-  lookup: sum -1m unaligned absolute match-names of !success,*
-   units: packets
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of received unsuccessful v5 PUBACK packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_puback_received_reason_unsuccessful
+       on: vernemq.mqtt_puback_received_reason
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of received unsuccessful v5 PUBACK packets in the last minute
+       to: sysadmin

-template: vernemq_mqtt_puback_sent_reason_unsuccessful
-      on: vernemq.mqtt_puback_sent_reason
-  lookup: sum -1m unaligned absolute match-names of !success,*
-   units: packets
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of sent unsuccessful v5 PUBACK packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_puback_sent_reason_unsuccessful
+       on: vernemq.mqtt_puback_sent_reason
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of sent unsuccessful v5 PUBACK packets in the last minute
+       to: sysadmin

-template: vernemq_mqtt_puback_unexpected
-      on: vernemq.mqtt_puback_invalid_error
-  lookup: sum -1m unaligned absolute
-   units: messages
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of received unexpected v3/v5 PUBACK packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_puback_unexpected
+       on: vernemq.mqtt_puback_invalid_error
+    class: Messaging
+component: VerneMQ
+     type: Workload
+   lookup: sum -1m unaligned absolute
+    units: messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of received unexpected v3/v5 PUBACK packets in the last minute
+       to: sysadmin

 # Unsuccessful and unexpected PUBREC

-template: vernemq_mqtt_pubrec_received_reason_unsuccessful
-      on: vernemq.mqtt_pubrec_received_reason
-  lookup: sum -1m unaligned absolute match-names of !success,*
-   units: packets
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of received unsuccessful v5 PUBREC packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_pubrec_received_reason_unsuccessful
+       on: vernemq.mqtt_pubrec_received_reason
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of received unsuccessful v5 PUBREC packets in the last minute
+       to: sysadmin

-template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
-      on: vernemq.mqtt_pubrec_sent_reason
-  lookup: sum -1m unaligned absolute match-names of !success,*
-   units: packets
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of sent unsuccessful v5 PUBREC packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
+       on: vernemq.mqtt_pubrec_sent_reason
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of sent unsuccessful v5 PUBREC packets in the last minute
+       to: sysadmin

-template: vernemq_mqtt_pubrec_invalid_error
-      on: vernemq.mqtt_pubrec_invalid_error
-  lookup: sum -1m unaligned absolute
-   units: messages
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of received unexpected v3 PUBREC packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_pubrec_invalid_error
+       on: vernemq.mqtt_pubrec_invalid_error
+    class: Messaging
+component: VerneMQ
+     type: Workload
+   lookup: sum -1m unaligned absolute
+    units: messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of received unexpected v3 PUBREC packets in the last minute
+       to: sysadmin

 # Unsuccessful PUBREL

-template: vernemq_mqtt_pubrel_received_reason_unsuccessful
-      on: vernemq.mqtt_pubrel_received_reason
-  lookup: sum -1m unaligned absolute match-names of !success,*
-   units: packets
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of received unsuccessful v5 PUBREL packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_pubrel_received_reason_unsuccessful
+       on: vernemq.mqtt_pubrel_received_reason
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of received unsuccessful v5 PUBREL packets in the last minute
+       to: sysadmin

-template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
-      on: vernemq.mqtt_pubrel_sent_reason
-  lookup: sum -1m unaligned absolute match-names of !success,*
-   units: packets
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of sent unsuccessful v5 PUBREL packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
+       on: vernemq.mqtt_pubrel_sent_reason
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of sent unsuccessful v5 PUBREL packets in the last minute
+       to: sysadmin

 # Unsuccessful and unexpected PUBCOMP

-template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
-      on: vernemq.mqtt_pubcomp_received_reason
-  lookup: sum -1m unaligned absolute match-names of !success,*
-   units: packets
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of received unsuccessful v5 PUBCOMP packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
+       on: vernemq.mqtt_pubcomp_received_reason
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of received unsuccessful v5 PUBCOMP packets in the last minute
+       to: sysadmin

-template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
-      on: vernemq.mqtt_pubcomp_sent_reason
-  lookup: sum -1m unaligned absolute match-names of !success,*
-   units: packets
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of sent unsuccessful v5 PUBCOMP packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
+       on: vernemq.mqtt_pubcomp_sent_reason
+    class: Messaging
+component: VerneMQ
+     type: Errors
+   lookup: sum -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of sent unsuccessful v5 PUBCOMP packets in the last minute
+       to: sysadmin

-template: vernemq_mqtt_pubcomp_unexpected
-      on: vernemq.mqtt_pubcomp_invalid_error
-  lookup: sum -1m unaligned absolute
-   units: messages
-   every: 1m
-    warn: $this > (($status >= $WARNING) ? (0) : (5))
-   delay: up 5m down 5m multiplier 1.5 max 2h
-    info: number of received unexpected v3/v5 PUBCOMP packets in the last minute
-      to: sysadmin
+ template: vernemq_mqtt_pubcomp_unexpected
+       on: vernemq.mqtt_pubcomp_invalid_error
+    class: Messaging
+component: VerneMQ
+     type: Workload
+   lookup: sum -1m unaligned absolute
+    units: messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 5m down 5m multiplier 1.5 max 2h
+     info: number of received unexpected v3/v5 PUBCOMP packets in the last minute
+       to: sysadmin
--- a/health/health.d/vsphere.conf
+++ b/health/health.d/vsphere.conf
@ -4,138 +4,171 @@
 # -----------------------------------------------VM Specific------------------------------------------------------------
 # Memory

-template: vsphere_vm_mem_usage
-      on: vsphere.vm_mem_usage_percentage
-   hosts: *
-    calc: $used
-   units: %
-   every: 20s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: virtual machine memory utilization
+ template: vsphere_vm_mem_usage
+       on: vsphere.vm_mem_usage_percentage
+    class: Virtual Machine
+component: Memory
+     type: Utilization
+    hosts: *
+     calc: $used
+    units: %
+    every: 20s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: virtual machine memory utilization

 # -----------------------------------------------HOST Specific----------------------------------------------------------
 # Memory

-template: vsphere_host_mem_usage
-      on: vsphere.host_mem_usage_percentage
-   hosts: *
-    calc: $used
-   units: %
-   every: 20s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: host memory utilization
+ template: vsphere_host_mem_usage
+       on: vsphere.host_mem_usage_percentage
+    class: Virtual Machine
+component: Memory
+     type: Utilization
+    hosts: *
+     calc: $used
+    units: %
+    every: 20s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: host memory utilization

 # Network errors

-template: vsphere_inbound_packets_errors
-      on: vsphere.net_errors_total
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of rx
-   units: packets
-   every: 1m
-    info: number of inbound errors for the network interface in the last 10 minutes
+ template: vsphere_inbound_packets_errors
+       on: vsphere.net_errors_total
+    class: Virtual Machine
+component: Network
+     type: Errors
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of rx
+    units: packets
+    every: 1m
+     info: number of inbound errors for the network interface in the last 10 minutes

-template: vsphere_outbound_packets_errors
-      on: vsphere.net_errors_total
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of tx
-   units: packets
-   every: 1m
-    info: number of outbound errors for the network interface in the last 10 minutes
+ template: vsphere_outbound_packets_errors
+       on: vsphere.net_errors_total
+    class: Virtual Machine
+component: Network
+     type: Errors
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of tx
+    units: packets
+    every: 1m
+     info: number of outbound errors for the network interface in the last 10 minutes

 # Network errors ratio

-template: vsphere_inbound_packets_errors_ratio
-      on: vsphere.net_packets_total
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of rx
-    calc: (($vsphere_inbound_packets_errors != nan AND $this > 1000) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0))
-   units: %
-   every: 1m
-    warn: $this >= 2
-   delay: up 1m down 1h multiplier 1.5 max 2h
-    info: ratio of inbound errors for the network interface over the last 10 minutes
-      to: sysadmin
+ template: vsphere_inbound_packets_errors_ratio
+       on: vsphere.net_packets_total
+    class: Virtual Machine
+component: Network
+     type: Errors
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of rx
+     calc: (($vsphere_inbound_packets_errors != nan AND $this > 1000) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of inbound errors for the network interface over the last 10 minutes
+       to: sysadmin

-template: vsphere_outbound_packets_errors_ratio
-      on: vsphere.net_packets_total
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of tx
-    calc: (($vsphere_outbound_packets_errors != nan AND $this > 1000) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0))
-   units: %
-   every: 1m
-    warn: $this >= 2
-   delay: up 1m down 1h multiplier 1.5 max 2h
-    info: ratio of outbound errors for the network interface over the last 10 minutes
-      to: sysadmin
+ template: vsphere_outbound_packets_errors_ratio
+       on: vsphere.net_packets_total
+    class: Virtual Machine
+component: Network
+     type: Errors
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of tx
+     calc: (($vsphere_outbound_packets_errors != nan AND $this > 1000) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of outbound errors for the network interface over the last 10 minutes
+       to: sysadmin

 # -----------------------------------------------Common-------------------------------------------------------------------
 # CPU

-template: vsphere_cpu_usage
-      on: vsphere.cpu_usage_total
-   hosts: *
-  lookup: average -10m unaligned match-names of used
-   units: %
-   every: 20s
-    warn: $this > (($status >= $WARNING)  ? (75) : (85))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
-   delay: down 15m multiplier 1.5 max 1h
-    info: average CPU utilization
-      to: sysadmin
+ template: vsphere_cpu_usage
+       on: vsphere.cpu_usage_total
+    class: Virtual Machine
+component: CPU
+     type: Utilization
+    hosts: *
+   lookup: average -10m unaligned match-names of used
+    units: %
+    every: 20s
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average CPU utilization
+       to: sysadmin

 # Network drops

-template: vsphere_inbound_packets_dropped
-      on: vsphere.net_drops_total
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of rx
-   units: packets
-   every: 1m
-    info: number of inbound dropped packets for the network interface in the last 10 minutes
+ template: vsphere_inbound_packets_dropped
+       on: vsphere.net_drops_total
+    class: Virtual Machine
+component: Network
+     type: Errors
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of rx
+    units: packets
+    every: 1m
+     info: number of inbound dropped packets for the network interface in the last 10 minutes

-template: vsphere_outbound_packets_dropped
-      on: vsphere.net_drops_total
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of tx
-   units: packets
-   every: 1m
-    info: number of outbound dropped packets for the network interface in the last 10 minutes
+ template: vsphere_outbound_packets_dropped
+       on: vsphere.net_drops_total
+    class: Virtual Machine
+component: Network
+     type: Errors
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of tx
+    units: packets
+    every: 1m
+     info: number of outbound dropped packets for the network interface in the last 10 minutes

 # Network drops ratio

-template: vsphere_inbound_packets_dropped_ratio
-      on: vsphere.net_packets_total
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of rx
-    calc: (($vsphere_inbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0))
-   units: %
-   every: 1m
-    warn: $this >= 2
-   delay: up 1m down 1h multiplier 1.5 max 2h
-    info: ratio of inbound dropped packets for the network interface over the last 10 minutes
-      to: sysadmin
+ template: vsphere_inbound_packets_dropped_ratio
+       on: vsphere.net_packets_total
+    class: Virtual Machine
+component: Network
+     type: Errors
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of rx
+     calc: (($vsphere_inbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of inbound dropped packets for the network interface over the last 10 minutes
+       to: sysadmin

-template: vsphere_outbound_packets_dropped_ratio
-      on: vsphere.net_packets_total
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of tx
-    calc: (($vsphere_outbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0))
-   units: %
-   every: 1m
-    warn: $this >= 2
-   delay: up 1m down 1h multiplier 1.5 max 2h
-    info: ratio of outbound dropped packets for the network interface over the last 10 minutes
-      to: sysadmin
+ template: vsphere_outbound_packets_dropped_ratio
+       on: vsphere.net_packets_total
+    class: Virtual Machine
+component: Network
+     type: Errors
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of tx
+     calc: (($vsphere_outbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of outbound dropped packets for the network interface over the last 10 minutes
+       to: sysadmin
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@ -1,17 +1,20 @@

 # make sure we can collect web log data

-template: last_collected_secs
-      on: web_log.response_codes
-families: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: last_collected_secs
+       on: web_log.response_codes
+    class: Web Server
+component: Web log
+     type: Latency
+ families: *
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster


 # -----------------------------------------------------------------------------
@ -24,66 +27,81 @@ families: *
 #
 # i.e. when there are at least 120 requests during the last minute

-template: 1m_requests
-      on: web_log.response_statuses
-families: *
-  lookup: sum -1m unaligned
-    calc: ($this == 0)?(1):($this)
-   units: requests
-   every: 10s
-    info: number of HTTP requests in the last minute
+ template: 1m_requests
+       on: web_log.response_statuses
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: sum -1m unaligned
+     calc: ($this == 0)?(1):($this)
+    units: requests
+    every: 10s
+     info: number of HTTP requests in the last minute

-template: 1m_successful
-      on: web_log.response_statuses
-families: *
-  lookup: sum -1m unaligned of successful_requests
-    calc: $this * 100 / $1m_requests
-   units: %
-   every: 10s
-    warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
-    crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
-      to: webmaster
+ template: 1m_successful
+       on: web_log.response_statuses
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: sum -1m unaligned of successful_requests
+     calc: $this * 100 / $1m_requests
+    units: %
+    every: 10s
+     warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
+     crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
+       to: webmaster

-template: 1m_redirects
-      on: web_log.response_statuses
-families: *
-  lookup: sum -1m unaligned of redirects
-    calc: $this * 100 / $1m_requests
-   units: %
-   every: 10s
-    warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
-    crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: ratio of redirection HTTP requests over the last minute (3xx except 304)
-      to: webmaster
+ template: 1m_redirects
+       on: web_log.response_statuses
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: sum -1m unaligned of redirects
+     calc: $this * 100 / $1m_requests
+    units: %
+    every: 10s
+     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
+     crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of redirection HTTP requests over the last minute (3xx except 304)
+       to: webmaster

-template: 1m_bad_requests
-      on: web_log.response_statuses
-families: *
-  lookup: sum -1m unaligned of bad_requests
-    calc: $this * 100 / $1m_requests
-   units: %
-   every: 10s
-    warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
-    crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: ratio of client error HTTP requests over the last minute (4xx except 401)
-      to: webmaster
+ template: 1m_bad_requests
+       on: web_log.response_statuses
+    class: Web Server
+component: Web log
+     type: Errors
+ families: *
+   lookup: sum -1m unaligned of bad_requests
+     calc: $this * 100 / $1m_requests
+    units: %
+    every: 10s
+     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
+     crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of client error HTTP requests over the last minute (4xx except 401)
+       to: webmaster

-template: 1m_internal_errors
-      on: web_log.response_statuses
-families: *
-  lookup: sum -1m unaligned of server_errors
-    calc: $this * 100 / $1m_requests
-   units: %
-   every: 10s
-    warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
-    crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: ratio of server error HTTP requests over the last minute (5xx)
-      to: webmaster
+ template: 1m_internal_errors
+       on: web_log.response_statuses
+    class: Web Server
+component: Web log
+     type: Errors
+ families: *
+   lookup: sum -1m unaligned of server_errors
+     calc: $this * 100 / $1m_requests
+    units: %
+    every: 10s
+     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
+     crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of server error HTTP requests over the last minute (5xx)
+       to: webmaster

 # unmatched lines

@ -94,26 +112,32 @@ families: *
 #
 # i.e. when there are at least 120 requests during the last minute

-template: 1m_total_requests
-      on: web_log.response_codes
-families: *
-  lookup: sum -1m unaligned
-    calc: ($this == 0)?(1):($this)
-   units: requests
-   every: 10s
-    info: number of HTTP requests over the last minute
+ template: 1m_total_requests
+       on: web_log.response_codes
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: sum -1m unaligned
+     calc: ($this == 0)?(1):($this)
+    units: requests
+    every: 10s
+     info: number of HTTP requests over the last minute

-template: 1m_unmatched
-      on: web_log.response_codes
-families: *
-  lookup: sum -1m unaligned of unmatched
-    calc: $this * 100 / $1m_total_requests
-   units: %
-   every: 10s
-    warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
-   delay: up 1m down 5m multiplier 1.5 max 1h
-    info: percentage of unparsed log lines over the last minute
-      to: webmaster
+ template: 1m_unmatched
+       on: web_log.response_codes
+    class: Web Server
+component: Web log
+     type: Errors
+ families: *
+   lookup: sum -1m unaligned of unmatched
+     calc: $this * 100 / $1m_total_requests
+    units: %
+    every: 10s
+     warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
+    delay: up 1m down 5m multiplier 1.5 max 1h
+     info: percentage of unparsed log lines over the last minute
+       to: webmaster

 # -----------------------------------------------------------------------------
 # web slow
@ -125,28 +149,34 @@ families: *
 #
 # i.e. when there are at least 120 requests during the last minute

-template: 10m_response_time
-      on: web_log.response_time
-families: *
-  lookup: average -10m unaligned of avg
-   units: ms
-   every: 30s
-    info: average HTTP response time over the last 10 minutes
+ template: 10m_response_time
+       on: web_log.response_time
+    class: System
+component: Web log
+     type: Latency
+ families: *
+   lookup: average -10m unaligned of avg
+    units: ms
+    every: 30s
+     info: average HTTP response time over the last 10 minutes

-template: web_slow
-      on: web_log.response_time
-families: *
-  lookup: average -1m unaligned of avg
-   units: ms
-   every: 10s
-   green: 500
-     red: 1000
-    warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
-    crit: ($1m_requests > 120) ? ($this > $red   && $this > ($10m_response_time * 4) ) : ( 0 )
-   delay: down 15m multiplier 1.5 max 1h
-    info: average HTTP response time over the last minute
- options: no-clear-notification
-      to: webmaster
+ template: web_slow
+       on: web_log.response_time
+    class: Web Server
+component: Web log
+     type: Latency
+ families: *
+   lookup: average -1m unaligned of avg
+    units: ms
+    every: 10s
+    green: 500
+      red: 1000
+     warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
+     crit: ($1m_requests > 120) ? ($this > $red   && $this > ($10m_response_time * 4) ) : ( 0 )
+    delay: down 15m multiplier 1.5 max 1h
+     info: average HTTP response time over the last minute
+  options: no-clear-notification
+       to: webmaster

 # -----------------------------------------------------------------------------
 # web too many or too few requests
@ -159,36 +189,45 @@ families: *
 # i.e. when there were at least 120 requests during the 5 minutes starting
 #      at -10m and ending at -5m

-template: 5m_successful_old
-      on: web_log.response_statuses
-families: *
-  lookup: average -5m at -5m unaligned of successful_requests
-   units: requests/s
-   every: 30s
-    info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
+ template: 5m_successful_old
+       on: web_log.response_statuses
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: average -5m at -5m unaligned of successful_requests
+    units: requests/s
+    every: 30s
+     info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago

-template: 5m_successful
-      on: web_log.response_statuses
-families: *
-  lookup: average -5m unaligned of successful_requests
-   units: requests/s
-   every: 30s
-    info: average number of successful HTTP requests over the last 5 minutes
+ template: 5m_successful
+       on: web_log.response_statuses
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: average -5m unaligned of successful_requests
+    units: requests/s
+    every: 30s
+     info: average number of successful HTTP requests over the last 5 minutes

-template: 5m_requests_ratio
-      on: web_log.response_codes
-families: *
-    calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
-   units: %
-   every: 30s
-    warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
-    crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
-   delay: down 15m multiplier 1.5 max 1h
-options: no-clear-notification
-    info: ratio of successful HTTP requests over the last 5 minutes, \
-          compared with the previous 5 minutes \
-          (clear notification for this alarm will not be sent)
-      to: webmaster
+ template: 5m_requests_ratio
+       on: web_log.response_codes
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+     calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
+    units: %
+    every: 30s
+     warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
+     crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
+    delay: down 15m multiplier 1.5 max 1h
+  options: no-clear-notification
+     info: ratio of successful HTTP requests over the last 5 minutes, \
+           compared with the previous 5 minutes \
+           (clear notification for this alarm will not be sent)
+       to: webmaster



@ -196,17 +235,20 @@ options: no-clear-notification

 # make sure we can collect web log data

-template: web_log_last_collected_secs
-      on: web_log.requests
-families: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: web_log_last_collected_secs
+       on: web_log.requests
+    class: Web Server
+component: Web log
+     type: Latency
+ families: *
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster

 # unmatched lines

@ -217,26 +259,32 @@ families: *
 #
 # i.e. when there are at least 120 requests during the last minute

-template: web_log_1m_total_requests
-      on: web_log.requests
-families: *
-  lookup: sum -1m unaligned
-    calc: ($this == 0)?(1):($this)
-   units: requests
-   every: 10s
-    info: number of HTTP requests in the last minute
+ template: web_log_1m_total_requests
+       on: web_log.requests
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: sum -1m unaligned
+     calc: ($this == 0)?(1):($this)
+    units: requests
+    every: 10s
+     info: number of HTTP requests in the last minute

-template: web_log_1m_unmatched
-      on: web_log.excluded_requests
-families: *
-  lookup: sum -1m unaligned of unmatched
-    calc: $this * 100 / $web_log_1m_total_requests
-   units: %
-   every: 10s
-    warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
-   delay: up 1m down 5m multiplier 1.5 max 1h
-    info: percentage of unparsed log lines over the last minute
-      to: webmaster
+ template: web_log_1m_unmatched
+       on: web_log.excluded_requests
+    class: Web Server
+component: Web log
+     type: Errors
+ families: *
+   lookup: sum -1m unaligned of unmatched
+     calc: $this * 100 / $web_log_1m_total_requests
+    units: %
+    every: 10s
+     warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
+    delay: up 1m down 5m multiplier 1.5 max 1h
+     info: percentage of unparsed log lines over the last minute
+       to: webmaster

 # -----------------------------------------------------------------------------
 # high level response code alarms
@ -248,66 +296,81 @@ families: *
 #
 # i.e. when there are at least 120 requests during the last minute

-template: web_log_1m_requests
-      on: web_log.type_requests
-families: *
-  lookup: sum -1m unaligned
-    calc: ($this == 0)?(1):($this)
-   units: requests
-   every: 10s
-    info: number of HTTP requests in the last minute
+ template: web_log_1m_requests
+       on: web_log.type_requests
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: sum -1m unaligned
+     calc: ($this == 0)?(1):($this)
+    units: requests
+    every: 10s
+     info: number of HTTP requests in the last minute

-template: web_log_1m_successful
-      on: web_log.type_requests
-families: *
-  lookup: sum -1m unaligned of success
-    calc: $this * 100 / $web_log_1m_requests
-   units: %
-   every: 10s
-    warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
-    crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
-      to: webmaster
+ template: web_log_1m_successful
+       on: web_log.type_requests
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: sum -1m unaligned of success
+     calc: $this * 100 / $web_log_1m_requests
+    units: %
+    every: 10s
+     warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
+     crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
+       to: webmaster

-template: web_log_1m_redirects
-      on: web_log.type_requests
-families: *
-  lookup: sum -1m unaligned of redirect
-    calc: $this * 100 / $web_log_1m_requests
-   units: %
-   every: 10s
-    warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
-    crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: ratio of redirection HTTP requests over the last minute (3xx except 304)
-      to: webmaster
+ template: web_log_1m_redirects
+       on: web_log.type_requests
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: sum -1m unaligned of redirect
+     calc: $this * 100 / $web_log_1m_requests
+    units: %
+    every: 10s
+     warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
+     crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of redirection HTTP requests over the last minute (3xx except 304)
+       to: webmaster

-template: web_log_1m_bad_requests
-      on: web_log.type_requests
-families: *
-  lookup: sum -1m unaligned of bad
-    calc: $this * 100 / $web_log_1m_requests
-   units: %
-   every: 10s
-    warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
-    crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: ratio of client error HTTP requests over the last minute (4xx except 401)
-      to: webmaster
+ template: web_log_1m_bad_requests
+       on: web_log.type_requests
+    class: Web Server
+component: Web log
+     type: Errors
+ families: *
+   lookup: sum -1m unaligned of bad
+     calc: $this * 100 / $web_log_1m_requests
+    units: %
+    every: 10s
+     warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
+     crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of client error HTTP requests over the last minute (4xx except 401)
+       to: webmaster

-template: web_log_1m_internal_errors
-      on: web_log.type_requests
-families: *
-  lookup: sum -1m unaligned of error
-    calc: $this * 100 / $web_log_1m_requests
-   units: %
-   every: 10s
-    warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
-    crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
-   delay: up 2m down 15m multiplier 1.5 max 1h
-    info: ratio of server error HTTP requests over the last minute (5xx)
-      to: webmaster
+ template: web_log_1m_internal_errors
+       on: web_log.type_requests
+    class: Web Server
+component: Web log
+     type: Errors
+ families: *
+   lookup: sum -1m unaligned of error
+     calc: $this * 100 / $web_log_1m_requests
+    units: %
+    every: 10s
+     warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
+     crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of server error HTTP requests over the last minute (5xx)
+       to: webmaster

 # -----------------------------------------------------------------------------
 # web slow
@ -319,28 +382,34 @@ families: *
 #
 # i.e. when there are at least 120 requests during the last minute

-template: web_log_10m_response_time
-      on: web_log.request_processing_time
-families: *
-  lookup: average -10m unaligned of avg
-   units: ms
-   every: 30s
-    info: average HTTP response time over the last 10 minutes
+ template: web_log_10m_response_time
+       on: web_log.request_processing_time
+    class: System
+component: Web log
+     type: Latency
+ families: *
+   lookup: average -10m unaligned of avg
+    units: ms
+    every: 30s
+     info: average HTTP response time over the last 10 minutes

-template: web_log_web_slow
-      on: web_log.request_processing_time
-families: *
-  lookup: average -1m unaligned of avg
-   units: ms
-   every: 10s
-   green: 500
-     red: 1000
-    warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
-    crit: ($web_log_1m_requests > 120) ? ($this > $red   && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
-   delay: down 15m multiplier 1.5 max 1h
-    info: average HTTP response time over the last 1 minute
- options: no-clear-notification
-      to: webmaster
+ template: web_log_web_slow
+       on: web_log.request_processing_time
+    class: Web Server
+component: Web log
+     type: Latency
+ families: *
+   lookup: average -1m unaligned of avg
+    units: ms
+    every: 10s
+    green: 500
+      red: 1000
+     warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
+     crit: ($web_log_1m_requests > 120) ? ($this > $red   && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
+    delay: down 15m multiplier 1.5 max 1h
+     info: average HTTP response time over the last 1 minute
+  options: no-clear-notification
+       to: webmaster

 # -----------------------------------------------------------------------------
 # web too many or too few requests
@ -353,33 +422,42 @@ families: *
 # i.e. when there were at least 120 requests during the 5 minutes starting
 #      at -10m and ending at -5m

-template: web_log_5m_successful_old
-      on: web_log.type_requests
-families: *
-  lookup: average -5m at -5m unaligned of success
-   units: requests/s
-   every: 30s
-    info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
+ template: web_log_5m_successful_old
+       on: web_log.type_requests
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: average -5m at -5m unaligned of success
+    units: requests/s
+    every: 30s
+     info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago

-template: web_log_5m_successful
-      on: web_log.type_requests
-families: *
-  lookup: average -5m unaligned of success
-   units: requests/s
-   every: 30s
-    info: average number of successful HTTP requests over the last 5 minutes
+ template: web_log_5m_successful
+       on: web_log.type_requests
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+   lookup: average -5m unaligned of success
+    units: requests/s
+    every: 30s
+     info: average number of successful HTTP requests over the last 5 minutes

-template: web_log_5m_requests_ratio
-      on: web_log.type_requests
-families: *
-    calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
-   units: %
-   every: 30s
-    warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
-    crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
-   delay: down 15m multiplier 1.5 max 1h
-options: no-clear-notification
-    info: ratio of successful HTTP requests over over the last 5 minutes, \
-          compared with the previous 5 minutes \
-          (clear notification for this alarm will not be sent)
-      to: webmaster
+ template: web_log_5m_requests_ratio
+       on: web_log.type_requests
+    class: Web Server
+component: Web log
+     type: Workload
+ families: *
+     calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
+    units: %
+    every: 30s
+     warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
+     crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
+    delay: down 15m multiplier 1.5 max 1h
+  options: no-clear-notification
+     info: ratio of successful HTTP requests over the last 5 minutes, \
+           compared with the previous 5 minutes \
+           (clear notification for this alarm will not be sent)
+       to: webmaster
--- a/health/health.d/whoisquery.conf
+++ b/health/health.d/whoisquery.conf
@ -1,24 +1,30 @@

 # make sure whoisquery is running

-template: whoisquery_last_collected_secs
-      on: whoisquery.time_until_expiration
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 60s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: whoisquery_last_collected_secs
+       on: whoisquery.time_until_expiration
+    class: Other
+component: WHOIS
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 60s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster


-template: whoisquery_days_until_expiration
-      on: whoisquery.time_until_expiration
-   calc:  $expiry
-   units: seconds
-   every: 60s
-    warn: $this < $days_until_expiration_warning*24*60*60
-    crit: $this < $days_until_expiration_critical*24*60*60
-    info: time until the domain name registration expires
-      to: webmaster
+ template: whoisquery_days_until_expiration
+       on: whoisquery.time_until_expiration
+    class: Other
+component: WHOIS
+     type: Utilization
+     calc: $expiry
+    units: seconds
+    every: 60s
+     warn: $this < $days_until_expiration_warning*24*60*60
+     crit: $this < $days_until_expiration_critical*24*60*60
+     info: time until the domain name registration expires
+       to: webmaster
--- a/health/health.d/wmi.conf
+++ b/health/health.d/wmi.conf
@ -3,128 +3,155 @@

 ## Availability

-template: wmi_last_collected_secs
-      on: cpu.collector_duration
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
+ template: wmi_last_collected_secs
+       on: cpu.collector_duration
+    class: Windows
+component: Availability
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin

 ## CPU

-template: wmi_10min_cpu_usage
-      on: wmi.cpu_utilization_total
-      os: linux
-   hosts: *
-  lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
-   units: %
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (75) : (85))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
-   delay: down 15m multiplier 1.5 max 1h
-    info: average CPU utilization over the last 10 minutes
-      to: sysadmin
+ template: wmi_10min_cpu_usage
+       on: wmi.cpu_utilization_total
+    class: Windows
+component: CPU
+     type: Utilization
+       os: linux
+    hosts: *
+   lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average CPU utilization over the last 10 minutes
+       to: sysadmin


 ## Memory

-template: wmi_ram_in_use
-      on: wmi.memory_utilization
-      os: linux
-   hosts: *
-    calc: ($used) * 100 / ($used + $available)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: memory utilization
-      to: sysadmin
+ template: wmi_ram_in_use
+       on: wmi.memory_utilization
+    class: Windows
+component: Memory
+     type: Utilization
+       os: linux
+    hosts: *
+     calc: ($used) * 100 / ($used + $available)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: memory utilization
+       to: sysadmin

-template: wmi_swap_in_use
-      on: wmi.memory_swap_utilization
-      os: linux
-   hosts: *
-    calc: ($used) * 100 / ($used + $available)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: swap memory utilization
-      to: sysadmin
+ template: wmi_swap_in_use
+       on: wmi.memory_swap_utilization
+    class: Windows
+component: Memory
+     type: Utilization
+       os: linux
+    hosts: *
+     calc: ($used) * 100 / ($used + $available)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: swap memory utilization
+       to: sysadmin


 ## Network

-template: wmi_inbound_packets_discarded
-      on: wmi.net_discarded
-      os: linux
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of inbound
-   units: packets
-   every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: number of inbound discarded packets for the network interface in the last 10 minutes
-      to: sysadmin
+ template: wmi_inbound_packets_discarded
+       on: wmi.net_discarded
+    class: Windows
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of inbound
+    units: packets
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of inbound discarded packets for the network interface in the last 10 minutes
+       to: sysadmin

-template: wmi_outbound_packets_discarded
-      on: wmi.net_discarded
-      os: linux
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of outbound
-   units: packets
-   every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: number of outbound discarded packets for the network interface in the last 10 minutes
-      to: sysadmin
+ template: wmi_outbound_packets_discarded
+       on: wmi.net_discarded
+    class: Windows
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of outbound
+    units: packets
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of outbound discarded packets for the network interface in the last 10 minutes
+       to: sysadmin

-template: wmi_inbound_packets_errors
-      on: wmi.net_errors
-      os: linux
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of inbound
-   units: packets
-   every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: number of inbound errors for the network interface in the last 10 minutes
-      to: sysadmin
+ template: wmi_inbound_packets_errors
+       on: wmi.net_errors
+    class: Windows
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of inbound
+    units: packets
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of inbound errors for the network interface in the last 10 minutes
+       to: sysadmin

-template: wmi_outbound_packets_errors
-      on: wmi.net_errors
-      os: linux
-   hosts: *
-families: *
-  lookup: sum -10m unaligned absolute match-names of outbound
-   units: packets
-   every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: number of outbound errors for the network interface in the last 10 minutes
-      to: sysadmin
+ template: wmi_outbound_packets_errors
+       on: wmi.net_errors
+    class: Windows
+component: Network
+     type: Errors
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of outbound
+    units: packets
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of outbound errors for the network interface in the last 10 minutes
+       to: sysadmin


 ## Disk

-template: wmi_disk_in_use
-      on: wmi.logical_disk_utilization
-      os: linux
-   hosts: *
-    calc: ($used) * 100 / ($used + $free)
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: disk space utilization
-      to: sysadmin
+ template: wmi_disk_in_use
+       on: wmi.logical_disk_utilization
+    class: Windows
+component: Disk
+     type: Utilization
+       os: linux
+    hosts: *
+     calc: ($used) * 100 / ($used + $free)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: disk space utilization
+       to: sysadmin
--- a/health/health.d/x509check.conf
+++ b/health/health.d/x509check.conf
@ -1,32 +1,41 @@

 # make sure x509check is running

-template: x509check_last_collected_secs
-      on: x509check.time_until_expiration
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 60s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: x509check_last_collected_secs
+       on: x509check.time_until_expiration
+    class: Certificates
+component: x509 certificates
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 60s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster


-template: x509check_days_until_expiration
-      on: x509check.time_until_expiration
-   calc:  $expiry
-   units: seconds
-   every: 60s
-    warn: $this < $days_until_expiration_warning*24*60*60
-    crit: $this < $days_until_expiration_critical*24*60*60
-    info: time until x509 certificate expires
-      to: webmaster
+ template: x509check_days_until_expiration
+       on: x509check.time_until_expiration
+    class: Certificates
+component: x509 certificates
+     type: Latency
+     calc: $expiry
+    units: seconds
+    every: 60s
+     warn: $this < $days_until_expiration_warning*24*60*60
+     crit: $this < $days_until_expiration_critical*24*60*60
+     info: time until x509 certificate expires
+       to: webmaster
      
-template: x509check_revocation_status
-      on: x509check.revocation_status
-    calc: $revoked
-   every: 60s
-    crit: $this != nan AND $this != 0
-    info: x509 certificate revocation status (0: revoked, 1: valid)
-      to: webmaster
+ template: x509check_revocation_status
+       on: x509check.revocation_status
+    class: Certificates
+component: x509 certificates
+     type: Errors
+     calc: $revoked
+    every: 60s
+     crit: $this != nan AND $this != 0
+     info: x509 certificate revocation status (0: valid, 1: revoked)
+       to: webmaster
--- a/health/health.d/zfs.conf
+++ b/health/health.d/zfs.conf
@ -1,10 +1,13 @@

-   alarm: zfs_memory_throttle
-      on: zfs.memory_ops
-  lookup: sum -10m unaligned absolute of throttled
-   units: events
-   every: 1m
-    warn: $this > 0
-   delay: down 1h multiplier 1.5 max 2h
-    info: number of times ZFS had to limit the ARC growth in the last 10 minutes
-      to: sysadmin
+    alarm: zfs_memory_throttle
+       on: zfs.memory_ops
+    class: System
+component: File system
+     type: Utilization
+   lookup: sum -10m unaligned absolute of throttled
+    units: events
+    every: 1m
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of times ZFS had to limit the ARC growth in the last 10 minutes
+       to: sysadmin
--- a/health/health.d/zookeeper.conf
+++ b/health/health.d/zookeeper.conf
@ -1,14 +1,17 @@

 # make sure zookeeper is running

-template: zookeeper_last_collected_secs
-      on: zookeeper.requests
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: webmaster
+ template: zookeeper_last_collected_secs
+       on: zookeeper.requests
+    class: KV Storage
+component: ZooKeeper
+     type: Latency
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster

--- a/health/health.h
+++ b/health/health.h
@ -37,20 +37,6 @@ extern unsigned int default_health_enabled;
 #define HEALTH_LISTEN_BACKLOG 4096
 #endif

-#define HEALTH_ON_KEY "on"
-#define HEALTH_EVERY_KEY "every"
-#define HEALTH_GREEN_KEY "green"
-#define HEALTH_RED_KEY "red"
-#define HEALTH_WARN_KEY "warn"
-#define HEALTH_CRIT_KEY "crit"
-#define HEALTH_EXEC_KEY "exec"
-#define HEALTH_RECIPIENT_KEY "to"
-#define HEALTH_UNITS_KEY "units"
-#define HEALTH_INFO_KEY "info"
-#define HEALTH_DELAY_KEY "delay"
-#define HEALTH_OPTIONS_KEY "options"
-#define HEALTH_FOREACH_KEY "foreach"
-
 #define HEALTH_SILENCERS_MAX_FILE_LEN 10000

 extern char *silencers_filename;
@ -81,6 +67,9 @@ extern ALARM_ENTRY* health_create_alarm_entry(
        const char *name,
        const char *chart,
        const char *family,
+        const char *class,
+        const char *component,
+        const char *type,
        const char *exec,
        const char *recipient,
        time_t duration,
--- a/health/health_config.c
+++ b/health/health_config.c
@ -23,10 +23,14 @@
 #define HEALTH_RECIPIENT_KEY "to"
 #define HEALTH_UNITS_KEY "units"
 #define HEALTH_INFO_KEY "info"
+#define HEALTH_CLASS_KEY "class"
+#define HEALTH_COMPONENT_KEY "component"
+#define HEALTH_TYPE_KEY "type"
 #define HEALTH_DELAY_KEY "delay"
 #define HEALTH_OPTIONS_KEY "options"
 #define HEALTH_REPEAT_KEY "repeat"
 #define HEALTH_HOST_LABEL_KEY "host labels"
+#define HEALTH_FOREACH_KEY "foreach"

 static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
    if(!rc->chart) {
@ -499,6 +503,9 @@ static int health_readfile(const char *filename, void *data) {
            hash_lookup = 0,
            hash_units = 0,
            hash_info = 0,
+            hash_class = 0,
+            hash_component = 0,
+            hash_type = 0,
            hash_recipient = 0,
            hash_delay = 0,
            hash_options = 0,
@ -526,6 +533,9 @@ static int health_readfile(const char *filename, void *data) {
        hash_every = simple_uhash(HEALTH_EVERY_KEY);
        hash_units = simple_hash(HEALTH_UNITS_KEY);
        hash_info = simple_hash(HEALTH_INFO_KEY);
+        hash_class = simple_uhash(HEALTH_CLASS_KEY);
+        hash_component = simple_uhash(HEALTH_COMPONENT_KEY);
+        hash_type = simple_uhash(HEALTH_TYPE_KEY);
        hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
        hash_delay = simple_uhash(HEALTH_DELAY_KEY);
        hash_options = simple_uhash(HEALTH_OPTIONS_KEY);
@ -696,6 +706,39 @@ static int health_readfile(const char *filename, void *data) {
                rc->chart = strdupz(value);
                rc->hash_chart = simple_hash(rc->chart);
            }
+            else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) {
+                if(rc->class) {
+                    if(strcmp(rc->class, value) != 0)
+                        error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+                                line, filename, rc->name, key, rc->class, value, value);
+
+                    freez(rc->class);
+                }
+                rc->class = strdupz(value);
+                strip_quotes(rc->class);
+            }
+            else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) {
+                if(rc->component) {
+                    if(strcmp(rc->component, value) != 0)
+                        error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+                                line, filename, rc->name, key, rc->component, value, value);
+
+                    freez(rc->component);
+                }
+                rc->component = strdupz(value);
+                strip_quotes(rc->component);
+            }
+            else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) {
+                if(rc->type) {
+                    if(strcmp(rc->type, value) != 0)
+                        error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+                                line, filename, rc->name, key, rc->type, value, value);
+
+                    freez(rc->type);
+                }
+                rc->type = strdupz(value);
+                strip_quotes(rc->type);
+            }
            else if(hash == hash_lookup && !strcasecmp(key, HEALTH_LOOKUP_KEY)) {
                health_parse_db_lookup(line, filename, value, &rc->group, &rc->after, &rc->before,
                        &rc->update_every, &rc->options, &rc->dimensions, &rc->foreachdim);
@ -848,6 +891,39 @@ static int health_readfile(const char *filename, void *data) {
                rt->context = strdupz(value);
                rt->hash_context = simple_hash(rt->context);
            }
+            else if(hash == hash_class && !strcasecmp(key, HEALTH_CLASS_KEY)) {
+                if(rt->class) {
+                    if(strcmp(rt->class, value) != 0)
+                        error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+                              line, filename, rt->name, key, rt->class, value, value);
+
+                    freez(rt->class);
+                }
+                rt->class = strdupz(value);
+                strip_quotes(rt->class);
+            }
+            else if(hash == hash_component && !strcasecmp(key, HEALTH_COMPONENT_KEY)) {
+                if(rt->component) {
+                    if(strcmp(rt->component, value) != 0)
+                        error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+                              line, filename, rt->name, key, rt->component, value, value);
+
+                    freez(rt->component);
+                }
+                rt->component = strdupz(value);
+                strip_quotes(rt->component);
+            }
+            else if(hash == hash_type && !strcasecmp(key, HEALTH_TYPE_KEY)) {
+                if(rt->type) {
+                    if(strcmp(rt->type, value) != 0)
+                        error("Health configuration at line %zu of file '%s' for alarm '%s' has key '%s' twice, once with value '%s' and later with value '%s'. Using ('%s').",
+                              line, filename, rt->name, key, rt->type, value, value);
+
+                    freez(rt->type);
+                }
+                rt->type = strdupz(value);
+                strip_quotes(rt->type);
+            }
            else if(hash == hash_families && !strcasecmp(key, HEALTH_FAMILIES_KEY)) {
                freez(rt->family_match);
                simple_pattern_free(rt->family_pattern);
--- a/health/health_json.c
+++ b/health/health_json.c
@ -23,6 +23,9 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
                    "\t\t\"name\": \"%s\",\n"
                    "\t\t\"chart\": \"%s\",\n"
                    "\t\t\"family\": \"%s\",\n"
+                    "\t\t\"class\": \"%s\",\n"
+                    "\t\t\"component\": \"%s\",\n"
+                    "\t\t\"type\": \"%s\",\n"
                    "\t\t\"processed\": %s,\n"
                    "\t\t\"updated\": %s,\n"
                    "\t\t\"exec_run\": %lu,\n"
@ -52,6 +55,9 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
                   , ae->name
                   , ae->chart
                   , ae->family
+                   , ae->class?ae->class:"Unknown"
+                   , ae->component?ae->component:"Unknown"
+                   , ae->type?ae->type:"Unknown"
                   , (ae->flags & HEALTH_ENTRY_FLAG_PROCESSED)?"true":"false"
                   , (ae->flags & HEALTH_ENTRY_FLAG_UPDATED)?"true":"false"
                   , (unsigned long)ae->exec_run_timestamp
@ -76,7 +82,22 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
                   , (ae->flags & HEALTH_ENTRY_FLAG_SILENCED)?"true":"false"
    );

-    health_string2json(wb, "\t\t", "info", ae->info?ae->info:"", ",\n");
+    // Expand every "$family" placeholder of the info text with the family of the alarm entry.
+    // replaced_info stays NULL when the entry carries no info text.
+    char *replaced_info = NULL;
+    if (likely(ae->info)) {
+        const char *family_text = ae->family ? ae->family : "";
+        size_t family_len = strlen(family_text);
+        size_t pos = 0;
+        char *m;
+
+        replaced_info = strdupz(ae->info);
+        while ((m = strstr(replaced_info + pos, "$family"))) {
+            // find_and_replace() hands back a fresh allocation, which is kept as-is
+            char *buf = find_and_replace(replaced_info, "$family", family_text, m);
+
+            // resume the search right after the inserted text, so that a family which itself
+            // contains "$family" is not expanded over and over again (endless loop)
+            pos = (size_t)(m - replaced_info) + family_len;
+
+            freez(replaced_info);
+            replaced_info = buf;
+        }
+    }
+
+    health_string2json(wb, "\t\t", "info", replaced_info?replaced_info:"", ",\n");

    if(unlikely(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION)) {
        buffer_strcat(wb, "\t\t\"no_clear_notification\": true,\n");
@ -91,6 +112,8 @@ void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host)
    buffer_strcat(wb, "\n");

    buffer_strcat(wb, "\t}");
+
+    freez(replaced_info);
 }

 void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) {
@ -140,12 +163,30 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
    char value_string[100 + 1];
    format_value_and_unit(value_string, 100, rc->value, rc->units, -1);

+    // Expand every "$family" placeholder of the info text with the family of the linked chart
+    // (empty string when the alarm is not linked to a chart or the chart has no family).
+    // replaced_info stays NULL when the alarm has no info text.
+    char *replaced_info = NULL;
+    if (likely(rc->info)) {
+        const char *family_text = (rc->rrdset && rc->rrdset->family) ? rc->rrdset->family : "";
+        size_t family_len = strlen(family_text);
+        size_t pos = 0;
+        char *m;
+
+        replaced_info = strdupz(rc->info);
+        while ((m = strstr(replaced_info + pos, "$family"))) {
+            // find_and_replace() hands back a fresh allocation, which is kept as-is
+            char *buf = find_and_replace(replaced_info, "$family", family_text, m);
+
+            // resume the search right after the inserted text, so that a family which itself
+            // contains "$family" is not expanded over and over again (endless loop)
+            pos = (size_t)(m - replaced_info) + family_len;
+
+            freez(replaced_info);
+            replaced_info = buf;
+        }
+    }
+
    buffer_sprintf(wb,
            "\t\t\"%s.%s\": {\n"
                    "\t\t\t\"id\": %lu,\n"
                    "\t\t\t\"name\": \"%s\",\n"
                    "\t\t\t\"chart\": \"%s\",\n"
                    "\t\t\t\"family\": \"%s\",\n"
+                    "\t\t\t\"class\": \"%s\",\n"
+                    "\t\t\t\"component\": \"%s\",\n"
+                    "\t\t\t\"type\": \"%s\",\n"
                    "\t\t\t\"active\": %s,\n"
                    "\t\t\t\"disabled\": %s,\n"
                    "\t\t\t\"silenced\": %s,\n"
@ -174,6 +215,9 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
                   , rc->name
                   , rc->chart
                   , (rc->rrdset && rc->rrdset->family)?rc->rrdset->family:""
+                   , rc->class?rc->class:"Unknown"
+                   , rc->component?rc->component:"Unknown"
+                   , rc->type?rc->type:"Unknown"
                   , (rc->rrdset)?"true":"false"
                   , (rc->rrdcalc_flags & RRDCALC_FLAG_DISABLED)?"true":"false"
                   , (rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)?"true":"false"
@ -181,7 +225,7 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
                   , rc->recipient?rc->recipient:host->health_default_recipient
                   , rc->source
                   , rc->units?rc->units:""
-                   , rc->info?rc->info:""
+                   , replaced_info?replaced_info:""
                   , rrdcalc_status2string(rc->status)
                   , (unsigned long)rc->last_status_change
                   , (unsigned long)rc->last_updated
@ -252,6 +296,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
    buffer_strcat(wb, "\n");

    buffer_strcat(wb, "\t\t}");
+
+    freez(replaced_info);
 }

 //void health_rrdcalctemplate2json_nolock(BUFFER *wb, RRDCALCTEMPLATE *rt) {
--- a/health/health_log.c
+++ b/health/health_log.c
@ -111,6 +111,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
                        "\t%d\t%d\t%d\t%d"
                        "\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO
                        "\t%016lx"
+                        "\t%s\t%s\t%s"
                        "\n"
                            , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
                            , host->hostname
@ -145,6 +146,9 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
                            , ae->new_value
                            , ae->old_value
                            , (uint64_t)ae->last_repeat
+                            , (ae->class)?ae->class:"Unknown"
+                            , (ae->component)?ae->component:"Unknown"
+                            , (ae->type)?ae->type:"Unknown"
        ) < 0))
            error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename);
        else {
@ -191,7 +195,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
        host->health_log_entries_written++;
        line++;

-        int max_entries = 30, entries = 0;
+        int max_entries = 33, entries = 0;
        char *pointers[max_entries];

        pointers[entries++] = s++;
@ -364,6 +368,20 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char

            ae->last_repeat = last_repeat;

+            // class, component and type are stored in fields 28, 29 and 30 of a log line.
+            // All three must have been parsed before any of them is used: with only 29 or 30
+            // fields present, pointers[29] / pointers[30] were never set.
+            if (likely(entries > 30)) {
+                // an empty field means "not set" and is kept as NULL
+                freez(ae->class);
+                ae->class = strdupz(pointers[28]);
+                if(!*ae->class) { freez(ae->class); ae->class = NULL; }
+
+                freez(ae->component);
+                ae->component = strdupz(pointers[29]);
+                if(!*ae->component) { freez(ae->component); ae->component = NULL; }
+
+                freez(ae->type);
+                ae->type = strdupz(pointers[30]);
+                if(!*ae->type) { freez(ae->type); ae->type = NULL; }
+            }
+
            char value_string[100 + 1];
            freez(ae->old_value_string);
            freez(ae->new_value_string);
@ -442,6 +460,9 @@ inline ALARM_ENTRY* health_create_alarm_entry(
        const char *name,
        const char *chart,
        const char *family,
+        const char *class,
+        const char *component,
+        const char *type,
        const char *exec,
        const char *recipient,
        time_t duration,
@ -469,11 +490,19 @@ inline ALARM_ENTRY* health_create_alarm_entry(
    if(family)
        ae->family = strdupz(family);

+    if (class)
+        ae->class = strdupz(class);
+
+    if (component)
+        ae->component = strdupz(component);
+
+    if (type)
+        ae->type = strdupz(type);
+
    if(exec) ae->exec = strdupz(exec);
    if(recipient) ae->recipient = strdupz(recipient);
    if(source) ae->source = strdupz(source);
    if(units) ae->units = strdupz(units);
-    if(info) ae->info = strdupz(info);

    ae->unique_id = host->health_log.next_log_id++;
    ae->alarm_id = alarm_id;
@ -486,6 +515,24 @@ inline ALARM_ENTRY* health_create_alarm_entry(
    ae->old_value_string = strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae->units, -1));
    ae->new_value_string = strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae->units, -1));

+    // Store the info text with every "$family" placeholder expanded to the family of the entry.
+    // ae->info is left untouched when no info text was given.
+    if (likely(info)) {
+        const char *family_text = (ae->family) ? ae->family : "";
+        size_t family_len = strlen(family_text);
+        size_t pos = 0;
+        char *m;
+
+        char *replaced_info = strdupz(info);
+        while ((m = strstr(replaced_info + pos, "$family"))) {
+            // find_and_replace() hands back a fresh allocation, which is kept as-is
+            char *buf = find_and_replace(replaced_info, "$family", family_text, m);
+
+            // resume the search right after the inserted text, so that a family which itself
+            // contains "$family" is not expanded over and over again (endless loop)
+            pos = (size_t)(m - replaced_info) + family_len;
+
+            freez(replaced_info);
+            replaced_info = buf;
+        }
+
+        // the entry takes ownership of the expanded copy
+        ae->info = replaced_info;
+    }
+
+
    ae->old_status = old_status;
    ae->new_status = new_status;
    ae->duration = duration;
@ -548,6 +595,9 @@ inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) {
    freez(ae->name);
    freez(ae->chart);
    freez(ae->family);
+    freez(ae->class);
+    freez(ae->component);
+    freez(ae->type);
    freez(ae->exec);
    freez(ae->recipient);
    freez(ae->source);
--- a/libnetdata/libnetdata.c
+++ b/libnetdata/libnetdata.c
@ -1492,3 +1492,33 @@ char *read_by_filename(char *filename, long *file_size)
        *file_size = size;
    return contents;
 }
+
+/**
+ * Build a copy of 'src' in which the occurrence of 'find' located at 'where' is
+ * substituted with 'replace'.
+ *
+ * @param src     NUL-terminated source string; it is not modified.
+ * @param find    the text to be replaced; it is expected to start at 'where'.
+ * @param replace the text to put in its place.
+ * @param where   pointer into 'src' at the start of the occurrence (e.g. the result of
+ *                strstr(src, find)), or NULL to get a plain copy of 'src'.
+ *
+ * @return a newly allocated string; the caller releases it with freez().
+ */
+char *find_and_replace(const char *src, const char *find, const char *replace, const char *where)
+{
+    size_t src_len = strlen(src);
+    size_t find_len = strlen(find);
+    size_t repl_len = strlen(replace);
+
+    // Substitute only when 'where' really marks an occurrence of 'find' inside 'src';
+    // otherwise skipping find_len bytes could step past the terminating NUL of 'src'.
+    int substitute = (where && where >= src && (size_t)(where - src) <= src_len &&
+                      !strncmp(where, find, find_len));
+
+    size_t prefix_len = substitute ? (size_t)(where - src) : src_len;
+    size_t suffix_len = substitute ? (src_len - prefix_len - find_len) : 0;
+
+    char *value = mallocz(prefix_len + (substitute ? repl_len : 0) + suffix_len + 1);
+    char *dst = value;
+
+    // the destination is a fresh allocation, so it never overlaps any of the inputs
+    memcpy(dst, src, prefix_len);
+    dst += prefix_len;
+
+    if (likely(substitute)) {
+        memcpy(dst, replace, repl_len);
+        dst += repl_len;
+
+        // the rest of 'src' that follows the replaced occurrence
+        memcpy(dst, where + find_len, suffix_len);
+        dst += suffix_len;
+    }
+
+    *dst = '\0';
+
+    return value;
+}
--- a/libnetdata/libnetdata.h
+++ b/libnetdata/libnetdata.h
@ -279,6 +279,7 @@ extern void recursive_config_double_dir_load(
        , size_t depth
 );
 extern char *read_by_filename(char *filename, long *file_size);
+extern char *find_and_replace(const char *src, const char *find, const char *replace, const char *where);

 /* fix for alpine linux */
 #ifndef RUSAGE_THREAD