0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-29 07:00:01 +00:00

remove python.d/smartd_log ()

This commit is contained in:
Ilya Mashchenko 2024-05-06 18:04:30 +03:00 committed by GitHub
parent 8747d882b0
commit 1b549b1dd7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 0 additions and 1521 deletions

View file

@ -2750,7 +2750,6 @@ install(FILES
src/collectors/python.d.plugin/retroshare/retroshare.conf
src/collectors/python.d.plugin/riakkv/riakkv.conf
src/collectors/python.d.plugin/samba/samba.conf
src/collectors/python.d.plugin/smartd_log/smartd_log.conf
src/collectors/python.d.plugin/spigotmc/spigotmc.conf
src/collectors/python.d.plugin/squid/squid.conf
src/collectors/python.d.plugin/tomcat/tomcat.conf
@ -2795,7 +2794,6 @@ install(FILES
src/collectors/python.d.plugin/retroshare/retroshare.chart.py
src/collectors/python.d.plugin/riakkv/riakkv.chart.py
src/collectors/python.d.plugin/samba/samba.chart.py
src/collectors/python.d.plugin/smartd_log/smartd_log.chart.py
src/collectors/python.d.plugin/spigotmc/spigotmc.chart.py
src/collectors/python.d.plugin/squid/squid.chart.py
src/collectors/python.d.plugin/tomcat/tomcat.chart.py

View file

@ -1 +0,0 @@
integrations/s.m.a.r.t..md

View file

@ -1,223 +0,0 @@
<!--startmeta
custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/python.d.plugin/smartd_log/README.md"
meta_yaml: "https://github.com/netdata/netdata/edit/master/src/collectors/python.d.plugin/smartd_log/metadata.yaml"
sidebar_label: "S.M.A.R.T."
learn_status: "Published"
learn_rel_path: "Collecting Metrics/Hardware Devices and Sensors"
most_popular: False
message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
endmeta-->
# S.M.A.R.T.
<img src="https://netdata.cloud/img/smart.png" width="150"/>
Plugin: python.d.plugin
Module: smartd_log
<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" />
## Overview
This collector monitors HDD/SSD S.M.A.R.T. metrics about drive health and performance.
It reads `smartd` log files to collect the metrics.
This collector is supported on all platforms.
This collector only supports collecting metrics from a single instance of this integration.
### Default Behavior
#### Auto-Detection
Once the prerequisites are satisfied, the collector auto-detects the `smartd` attribute logs if they are written to either `/var/log/smartd/` or `/var/lib/smartmontools/`.
#### Limits
The default configuration for this integration does not impose any limits on data collection.
#### Performance Impact
The default configuration for this integration is not expected to impose a significant performance impact on the system.
## Metrics
Metrics grouped by *scope*.
The scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.
The metrics listed below are split in terms of availability on device type, SCSI or ATA.
### Per S.M.A.R.T. instance
These metrics refer to the entire monitored application.
This scope has no labels.
Metrics:
| Metric | Dimensions | Unit | SCSI | ATA |
|:------|:----------|:----|:---:|:---:|
| smartd_log.read_error_rate | a dimension per device | value | | • |
| smartd_log.seek_error_rate | a dimension per device | value | | • |
| smartd_log.soft_read_error_rate | a dimension per device | errors | | • |
| smartd_log.write_error_rate | a dimension per device | value | | • |
| smartd_log.read_total_err_corrected | a dimension per device | errors | • | |
| smartd_log.read_total_unc_errors | a dimension per device | errors | • | |
| smartd_log.write_total_err_corrected | a dimension per device | errors | • | |
| smartd_log.write_total_unc_errors | a dimension per device | errors | • | |
| smartd_log.verify_total_err_corrected | a dimension per device | errors | • | |
| smartd_log.verify_total_unc_errors | a dimension per device | errors | • | |
| smartd_log.sata_interface_downshift | a dimension per device | events | | • |
| smartd_log.udma_crc_error_count | a dimension per device | errors | | • |
| smartd_log.throughput_performance | a dimension per device | value | | • |
| smartd_log.seek_time_performance | a dimension per device | value | | • |
| smartd_log.start_stop_count | a dimension per device | events | | • |
| smartd_log.power_on_hours_count | a dimension per device | hours | | • |
| smartd_log.power_cycle_count | a dimension per device | events | | • |
| smartd_log.unexpected_power_loss | a dimension per device | events | | • |
| smartd_log.spin_up_time | a dimension per device | ms | | • |
| smartd_log.spin_up_retries | a dimension per device | retries | | • |
| smartd_log.calibration_retries | a dimension per device | retries | | • |
| smartd_log.airflow_temperature_celsius | a dimension per device | celsius | | • |
| smartd_log.temperature_celsius | a dimension per device | celsius | • | • |
| smartd_log.reallocated_sectors_count | a dimension per device | sectors | | • |
| smartd_log.reserved_block_count | a dimension per device | percentage | | • |
| smartd_log.program_fail_count | a dimension per device | errors | | • |
| smartd_log.erase_fail_count | a dimension per device | failures | | • |
| smartd_log.wear_leveller_worst_case_erase_count | a dimension per device | erases | | • |
| smartd_log.unused_reserved_nand_blocks | a dimension per device | blocks | | • |
| smartd_log.reallocation_event_count | a dimension per device | events | | • |
| smartd_log.current_pending_sector_count | a dimension per device | sectors | | • |
| smartd_log.offline_uncorrectable_sector_count | a dimension per device | sectors | | • |
| smartd_log.percent_lifetime_used | a dimension per device | percentage | | • |
| smartd_log.media_wearout_indicator | a dimension per device | percentage | | • |
| smartd_log.nand_writes_1gib | a dimension per device | GiB | | • |
## Alerts
There are no alerts configured by default for this integration.
## Setup
### Prerequisites
#### Configure `smartd` to write attribute information to files.
`smartd` must be running with `-A` option to write `smartd` attribute information to files.
For this you need to set `smartd_opts` (or `SMARTD_ARGS`, check _smartd.service_ content) in `/etc/default/smartmontools`:
```
# dump smartd attrs info every 600 seconds
smartd_opts="-A /var/log/smartd/ -i 600"
```
You may need to create the smartd directory before smartd will write to it:
```sh
mkdir -p /var/log/smartd
```
Otherwise, all the smartd `.csv` files may get written to `/var/lib/smartmontools` (default location). See also <https://linux.die.net/man/8/smartd> for more info on the `-A --attributelog=PREFIX` option.
`smartd` appends logs at every run. It's strongly recommended to use `logrotate` for smartd files.
### Configuration
#### File
The configuration file name for this integration is `python.d/smartd_log.conf`.
You can edit the configuration file using the `edit-config` script from the
Netdata [config directory](https://github.com/netdata/netdata/blob/master/docs/netdata-agent/configuration.md#the-netdata-config-directory).
```bash
cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
sudo ./edit-config python.d/smartd_log.conf
```
#### Options
This particular collector does not need further configuration to work if permissions are satisfied, but you can always customize its data collection behavior.
There are 2 sections:
* Global variables
* One or more JOBS that can define multiple different instances to monitor.
The following options can be defined globally: priority, penalty, autodetection_retry, update_every, but can also be defined per JOB to override the global values.
Additionally, the following collapsed table contains all the options that can be configured inside a JOB definition.
Every configuration JOB starts with a `job_name` value which will appear in the dashboard, unless a `name` parameter is specified.
<details><summary>Config options</summary>
| Name | Description | Default | Required |
|:----|:-----------|:-------|:--------:|
| log_path | path to smartd log files. | /var/log/smartd | yes |
| exclude_disks | Space-separated patterns. If the pattern is in the drive name, the module will not collect data for it. | | no |
| age | Time in minutes since the last dump to file. | 30 | no |
| update_every | Sets the default data collection frequency. | 1 | no |
| priority | Controls the order of charts at the netdata dashboard. | 60000 | no |
| autodetection_retry | Sets the job re-check interval in seconds. | 0 | no |
| penalty | Indicates whether to apply penalty to update_every in case of failures. | yes | no |
| name | Job name. This value will overwrite the `job_name` value. JOBS with the same name are mutually exclusive. Only one of them will be allowed running at any time. This allows autodetection to try several alternatives and pick the one that works. | | no |
</details>
#### Examples
##### Basic
A basic configuration example.
```yaml
custom:
name: smartd_log
log_path: '/var/log/smartd/'
```
## Troubleshooting
### Debug Mode
To troubleshoot issues with the `smartd_log` collector, run the `python.d.plugin` with the debug option enabled. The output
should give you clues as to why the collector isn't working.
- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on
your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.
```bash
cd /usr/libexec/netdata/plugins.d/
```
- Switch to the `netdata` user.
```bash
sudo -u netdata -s
```
- Run the `python.d.plugin` to debug the collector:
```bash
./python.d.plugin smartd_log debug trace
```

View file

@ -1,429 +0,0 @@
plugin_name: python.d.plugin
modules:
- meta:
plugin_name: python.d.plugin
module_name: smartd_log
monitored_instance:
name: S.M.A.R.T.
link: "https://linux.die.net/man/8/smartd"
categories:
- data-collection.hardware-devices-and-sensors
icon_filename: "smart.png"
related_resources:
integrations:
list: []
info_provided_to_referring_integrations:
description: ""
keywords:
- smart
- S.M.A.R.T.
- SCSI devices
- ATA devices
most_popular: false
overview:
data_collection:
metrics_description: |
This collector monitors HDD/SSD S.M.A.R.T. metrics about drive health and performance.
method_description: |
It reads `smartd` log files to collect the metrics.
supported_platforms:
include: []
exclude: []
multi_instance: false
additional_permissions:
description: ""
default_behavior:
auto_detection:
description: Once the prerequisites are satisfied, the collector auto-detects the `smartd` attribute logs if they are written to either `/var/log/smartd/` or `/var/lib/smartmontools/`.
limits:
description: ""
performance_impact:
description: ""
setup:
prerequisites:
list:
- title: Configure `smartd` to write attribute information to files.
description: |
`smartd` must be running with `-A` option to write `smartd` attribute information to files.
For this you need to set `smartd_opts` (or `SMARTD_ARGS`, check _smartd.service_ content) in `/etc/default/smartmontools`:
```
# dump smartd attrs info every 600 seconds
smartd_opts="-A /var/log/smartd/ -i 600"
```
You may need to create the smartd directory before smartd will write to it:
```sh
mkdir -p /var/log/smartd
```
Otherwise, all the smartd `.csv` files may get written to `/var/lib/smartmontools` (default location). See also <https://linux.die.net/man/8/smartd> for more info on the `-A --attributelog=PREFIX` option.
`smartd` appends logs at every run. It's strongly recommended to use `logrotate` for smartd files.
configuration:
file:
name: "python.d/smartd_log.conf"
options:
description: |
This particular collector does not need further configuration to work if permissions are satisfied, but you can always customize its data collection behavior.
There are 2 sections:
* Global variables
* One or more JOBS that can define multiple different instances to monitor.
The following options can be defined globally: priority, penalty, autodetection_retry, update_every, but can also be defined per JOB to override the global values.
Additionally, the following collapsed table contains all the options that can be configured inside a JOB definition.
Every configuration JOB starts with a `job_name` value which will appear in the dashboard, unless a `name` parameter is specified.
folding:
title: "Config options"
enabled: true
list:
- name: log_path
description: path to smartd log files.
default_value: /var/log/smartd
required: true
- name: exclude_disks
description: Space-separated patterns. If the pattern is in the drive name, the module will not collect data for it.
default_value: ""
required: false
- name: age
description: Time in minutes since the last dump to file.
default_value: 30
required: false
- name: update_every
description: Sets the default data collection frequency.
default_value: 1
required: false
- name: priority
description: Controls the order of charts at the netdata dashboard.
default_value: 60000
required: false
- name: autodetection_retry
description: Sets the job re-check interval in seconds.
default_value: 0
required: false
# The default is the literal text "yes"; quoted so YAML 1.1 loaders do not
# turn it into a boolean.
- name: penalty
  description: Indicates whether to apply penalty to update_every in case of failures.
  default_value: "yes"
  required: false
- name: name
description: >
Job name. This value will overwrite the `job_name` value. JOBS with the same name are mutually exclusive. Only one of them will be allowed running at any time. This allows autodetection to try several alternatives and pick the one that works.
default_value: ""
required: false
examples:
folding:
enabled: true
title: "Config"
list:
- name: Basic
description: A basic configuration example.
folding:
enabled: false
config: |
custom:
name: smartd_log
log_path: '/var/log/smartd/'
troubleshooting:
problems:
list: []
alerts: []
metrics:
folding:
title: Metrics
enabled: false
description: "The metrics listed below are split in terms of availability on device type, SCSI or ATA."
availability:
- "SCSI"
- "ATA"
scopes:
- name: global
description: "These metrics refer to the entire monitored application."
labels: []
metrics:
- name: smartd_log.read_error_rate
description: Read Error Rate
availability:
- ATA
unit: "value"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.seek_error_rate
description: Seek Error Rate
availability:
- ATA
unit: "value"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.soft_read_error_rate
description: Soft Read Error Rate
availability:
- ATA
unit: "errors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.write_error_rate
description: Write Error Rate
availability:
- ATA
unit: "value"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.read_total_err_corrected
description: Read Error Corrected
availability:
- SCSI
unit: "errors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.read_total_unc_errors
description: Read Error Uncorrected
availability:
- SCSI
unit: "errors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.write_total_err_corrected
description: Write Error Corrected
availability:
- SCSI
unit: "errors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.write_total_unc_errors
description: Write Error Uncorrected
availability:
- SCSI
unit: "errors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.verify_total_err_corrected
description: Verify Error Corrected
availability:
- SCSI
unit: "errors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.verify_total_unc_errors
description: Verify Error Uncorrected
availability:
- SCSI
unit: "errors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.sata_interface_downshift
description: SATA Interface Downshift
availability:
- ATA
unit: "events"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.udma_crc_error_count
description: UDMA CRC Error Count
availability:
- ATA
unit: "errors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.throughput_performance
description: Throughput Performance
availability:
- ATA
unit: "value"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.seek_time_performance
description: Seek Time Performance
availability:
- ATA
unit: "value"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.start_stop_count
description: Start/Stop Count
availability:
- ATA
unit: "events"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.power_on_hours_count
description: Power-On Hours Count
availability:
- ATA
unit: "hours"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.power_cycle_count
description: Power Cycle Count
availability:
- ATA
unit: "events"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.unexpected_power_loss
description: Unexpected Power Loss
availability:
- ATA
unit: "events"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.spin_up_time
description: Spin-Up Time
availability:
- ATA
unit: "ms"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.spin_up_retries
description: Spin-up Retries
availability:
- ATA
unit: "retries"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.calibration_retries
description: Calibration Retries
availability:
- ATA
unit: "retries"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.airflow_temperature_celsius
description: Airflow Temperature Celsius
availability:
- ATA
unit: "celsius"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.temperature_celsius
description: Temperature
availability:
- SCSI
- ATA
unit: "celsius"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.reallocated_sectors_count
description: Reallocated Sectors Count
availability:
- ATA
unit: "sectors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.reserved_block_count
description: Reserved Block Count
availability:
- ATA
unit: "percentage"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.program_fail_count
description: Program Fail Count
availability:
- ATA
unit: "errors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.erase_fail_count
description: Erase Fail Count
availability:
- ATA
unit: "failures"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.wear_leveller_worst_case_erase_count
description: Wear Leveller Worst Case Erase Count
availability:
- ATA
unit: "erases"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.unused_reserved_nand_blocks
description: Unused Reserved NAND Blocks
availability:
- ATA
unit: "blocks"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.reallocation_event_count
description: Reallocation Event Count
availability:
- ATA
unit: "events"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.current_pending_sector_count
description: Current Pending Sector Count
availability:
- ATA
unit: "sectors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.offline_uncorrectable_sector_count
description: Offline Uncorrectable Sector Count
availability:
- ATA
unit: "sectors"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.percent_lifetime_used
description: Percent Lifetime Used
availability:
- ATA
unit: "percentage"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.media_wearout_indicator
description: Media Wearout Indicator
availability:
- ATA
unit: "percentage"
chart_type: line
dimensions:
- name: a dimension per device
- name: smartd_log.nand_writes_1gib
description: NAND Writes
availability:
- ATA
unit: "GiB"
chart_type: line
dimensions:
- name: a dimension per device

View file

@ -1,790 +0,0 @@
# -*- coding: utf-8 -*-
# Description: smart netdata python.d module
# Author: ilyam8, vorph1
# SPDX-License-Identifier: GPL-3.0-or-later
import os
import re
from copy import deepcopy
from time import time
from bases.FrameworkServices.SimpleService import SimpleService
from bases.collection import read_last_line
# Values used for the 'algo' field of each CHARTS entry.
INCREMENTAL = 'incremental'
ABSOLUTE = 'absolute'
# Device-type tags. NOTE(review): their consumers are outside this excerpt.
ATA = 'ata'
SCSI = 'scsi'
# File extension; the README describes smartd writing `.csv` attribute files.
CSV = '.csv'
# NOTE(review): presumably seconds between rescans of the log directory - confirm in Service.
DEF_RESCAN_INTERVAL = 60
# Default for the `age` option, documented as minutes since the last dump to file.
DEF_AGE = 30
# Default for the `log_path` option.
DEF_PATH = '/var/log/smartd'
# ATA attribute ids as they appear in the smartd log (strings, not ints).
# The trailing comment is the title of the chart that plots the attribute (see CHARTS).
ATTR1 = '1'  # Read Error Rate
ATTR2 = '2'  # Throughput Performance
ATTR3 = '3'  # Spin-Up Time
ATTR4 = '4'  # Start/Stop Count
ATTR5 = '5'  # Reallocated Sectors Count
ATTR7 = '7'  # Seek Error Rate
ATTR8 = '8'  # Seek Time Performance
ATTR9 = '9'  # Power-On Hours Count
ATTR10 = '10'  # Spin-up Retries
ATTR11 = '11'  # Calibration Retries
ATTR12 = '12'  # Power Cycle Count
ATTR13 = '13'  # Soft Read Error Rate
ATTR170 = '170'  # Reserved Block Count
ATTR171 = '171'  # Program Fail Count
ATTR172 = '172'  # Erase Fail Count
ATTR173 = '173'  # Wear Leveller Worst Case Erase Count
ATTR174 = '174'  # Unexpected Power Loss
ATTR177 = '177'  # Media Wearout Indicator (shares the chart with 233)
ATTR180 = '180'  # Unused Reserved NAND Blocks
ATTR183 = '183'  # SATA Interface Downshift
ATTR190 = '190'  # Airflow Temperature Celsius
ATTR194 = '194'  # Temperature
ATTR196 = '196'  # Reallocation Event Count
ATTR197 = '197'  # Current Pending Sector Count
ATTR198 = '198'  # Offline Uncorrectable Sector Count
ATTR199 = '199'  # UDMA CRC Error Count
ATTR202 = '202'  # Percent Lifetime Used
ATTR206 = '206'  # Write Error Rate
ATTR233 = '233'  # Media Wearout Indicator (shares the chart with 177)
ATTR241 = '241'  # Total LBAs Written
ATTR242 = '242'  # Total LBAs Read
ATTR249 = '249'  # NAND Writes
# SCSI attribute names; these match what RE_SCSI captures as the attribute field.
ATTR_READ_ERR_COR = 'read-total-err-corrected'
ATTR_READ_ERR_UNC = 'read-total-unc-errors'
ATTR_WRITE_ERR_COR = 'write-total-err-corrected'
ATTR_WRITE_ERR_UNC = 'write-total-unc-errors'
ATTR_VERIFY_ERR_COR = 'verify-total-err-corrected'
ATTR_VERIFY_ERR_UNC = 'verify-total-unc-errors'
ATTR_TEMPERATURE = 'temperature'
# Patterns for the ';'-separated fields of a smartd attribute-log line.
# Raw string literals are used so that '\d' reaches the regex engine as-is
# instead of being an invalid string escape (which newer Pythons warn about).
# The compiled patterns are unchanged.

# ATA entry: "<attribute id>;<normalized value>;<raw value>"
RE_ATA = re.compile(
    r'(\d+);'  # attribute
    r'(\d+);'  # normalized value
    r'(\d+)',  # raw value
    re.X
)

# SCSI entry: "<attribute name>;<raw value>"
RE_SCSI = re.compile(
    r'([a-z-]+);'  # attribute
    r'([0-9.]+)',  # raw value
    re.X
)
# Chart ids in display order, grouped by chart family. Every id listed here
# has a definition in CHARTS.
#
# 'nand_writes_1gib' is defined in CHARTS (and documented as a metric) but was
# missing from this list; it is added next to the other wear charts.
# NOTE(review): the code that consumes ORDER is outside this excerpt -
# presumably only listed charts get created; confirm against the framework.
ORDER = [
    # errors
    'read_error_rate',
    'seek_error_rate',
    'soft_read_error_rate',
    'write_error_rate',
    'read_total_err_corrected',
    'read_total_unc_errors',
    'write_total_err_corrected',
    'write_total_unc_errors',
    'verify_total_err_corrected',
    'verify_total_unc_errors',
    # external failure
    'sata_interface_downshift',
    'udma_crc_error_count',
    # performance
    'throughput_performance',
    'seek_time_performance',
    # power
    'start_stop_count',
    'power_on_hours_count',
    'power_cycle_count',
    'unexpected_power_loss',
    # spin
    'spin_up_time',
    'spin_up_retries',
    'calibration_retries',
    # temperature
    'airflow_temperature_celsius',
    'temperature_celsius',
    # wear
    'reallocated_sectors_count',
    'reserved_block_count',
    'program_fail_count',
    'erase_fail_count',
    'wear_leveller_worst_case_erase_count',
    'unused_reserved_nand_blocks',
    'reallocation_event_count',
    'current_pending_sector_count',
    'offline_uncorrectable_sector_count',
    'percent_lifetime_used',
    'media_wearout_indicator',
    'nand_writes_1gib',
    'total_lbas_written',
    'total_lbas_read',
]
# One row per chart:
#   (chart id, title, units, family, smartd attributes feeding it, algorithm)
# The chart context is always 'smartd_log.<chart id>' and every chart is a
# 'line' chart, so neither is repeated in the table.
_CHART_SPECS = (
    # errors
    ('read_error_rate', 'Read Error Rate', 'value', 'errors', [ATTR1], ABSOLUTE),
    ('seek_error_rate', 'Seek Error Rate', 'value', 'errors', [ATTR7], ABSOLUTE),
    ('soft_read_error_rate', 'Soft Read Error Rate', 'errors', 'errors', [ATTR13], INCREMENTAL),
    ('write_error_rate', 'Write Error Rate', 'value', 'errors', [ATTR206], ABSOLUTE),
    ('read_total_err_corrected', 'Read Error Corrected', 'errors', 'errors', [ATTR_READ_ERR_COR], INCREMENTAL),
    ('read_total_unc_errors', 'Read Error Uncorrected', 'errors', 'errors', [ATTR_READ_ERR_UNC], INCREMENTAL),
    ('write_total_err_corrected', 'Write Error Corrected', 'errors', 'errors', [ATTR_WRITE_ERR_COR], INCREMENTAL),
    ('write_total_unc_errors', 'Write Error Uncorrected', 'errors', 'errors', [ATTR_WRITE_ERR_UNC], INCREMENTAL),
    ('verify_total_err_corrected', 'Verify Error Corrected', 'errors', 'errors', [ATTR_VERIFY_ERR_COR], INCREMENTAL),
    ('verify_total_unc_errors', 'Verify Error Uncorrected', 'errors', 'errors', [ATTR_VERIFY_ERR_UNC], INCREMENTAL),
    # external failure
    ('sata_interface_downshift', 'SATA Interface Downshift', 'events', 'external failure', [ATTR183], INCREMENTAL),
    ('udma_crc_error_count', 'UDMA CRC Error Count', 'errors', 'external failure', [ATTR199], INCREMENTAL),
    # performance
    ('throughput_performance', 'Throughput Performance', 'value', 'performance', [ATTR2], ABSOLUTE),
    ('seek_time_performance', 'Seek Time Performance', 'value', 'performance', [ATTR8], ABSOLUTE),
    # power
    ('start_stop_count', 'Start/Stop Count', 'events', 'power', [ATTR4], ABSOLUTE),
    ('power_on_hours_count', 'Power-On Hours Count', 'hours', 'power', [ATTR9], ABSOLUTE),
    ('power_cycle_count', 'Power Cycle Count', 'events', 'power', [ATTR12], ABSOLUTE),
    ('unexpected_power_loss', 'Unexpected Power Loss', 'events', 'power', [ATTR174], ABSOLUTE),
    # spin
    ('spin_up_time', 'Spin-Up Time', 'ms', 'spin', [ATTR3], ABSOLUTE),
    ('spin_up_retries', 'Spin-up Retries', 'retries', 'spin', [ATTR10], INCREMENTAL),
    ('calibration_retries', 'Calibration Retries', 'retries', 'spin', [ATTR11], INCREMENTAL),
    # temperature
    ('airflow_temperature_celsius', 'Airflow Temperature Celsius', 'celsius', 'temperature', [ATTR190], ABSOLUTE),
    ('temperature_celsius', 'Temperature', 'celsius', 'temperature', [ATTR194, ATTR_TEMPERATURE], ABSOLUTE),
    # wear
    ('reallocated_sectors_count', 'Reallocated Sectors Count', 'sectors', 'wear', [ATTR5], ABSOLUTE),
    ('reserved_block_count', 'Reserved Block Count', 'percentage', 'wear', [ATTR170], ABSOLUTE),
    ('program_fail_count', 'Program Fail Count', 'errors', 'wear', [ATTR171], INCREMENTAL),
    ('erase_fail_count', 'Erase Fail Count', 'failures', 'wear', [ATTR172], INCREMENTAL),
    ('wear_leveller_worst_case_erase_count', 'Wear Leveller Worst Case Erase Count', 'erases', 'wear',
     [ATTR173], ABSOLUTE),
    ('unused_reserved_nand_blocks', 'Unused Reserved NAND Blocks', 'blocks', 'wear', [ATTR180], ABSOLUTE),
    ('reallocation_event_count', 'Reallocation Event Count', 'events', 'wear', [ATTR196], INCREMENTAL),
    ('current_pending_sector_count', 'Current Pending Sector Count', 'sectors', 'wear', [ATTR197], ABSOLUTE),
    ('offline_uncorrectable_sector_count', 'Offline Uncorrectable Sector Count', 'sectors', 'wear',
     [ATTR198], ABSOLUTE),
    ('percent_lifetime_used', 'Percent Lifetime Used', 'percentage', 'wear', [ATTR202], ABSOLUTE),
    ('media_wearout_indicator', 'Media Wearout Indicator', 'percentage', 'wear', [ATTR233, ATTR177], ABSOLUTE),
    ('nand_writes_1gib', 'NAND Writes', 'GiB', 'wear', [ATTR249], ABSOLUTE),
    ('total_lbas_written', 'Total LBAs Written', 'sectors', 'wear', [ATTR241], ABSOLUTE),
    ('total_lbas_read', 'Total LBAs Read', 'sectors', 'wear', [ATTR242], ABSOLUTE),
)

# Chart definitions keyed by chart id. Each entry carries:
#   'options' - [name, title, units, family, context, chart type]
#   'lines'   - starts empty; a fresh list per chart
#   'attrs'   - smartd attribute names that feed the chart
#   'algo'    - ABSOLUTE or INCREMENTAL
CHARTS = dict(
    (
        chart_id,
        {
            'options': [None, title, units, family, 'smartd_log.' + chart_id, 'line'],
            'lines': [],
            'attrs': attrs,
            'algo': algo,
        },
    )
    for chart_id, title, units, family, attrs, algo in _CHART_SPECS
)
# NOTE: 'parse_temp' decodes ATA 194 raw value. Not heavily tested. Written by @Ferroin
# C code:
# https://github.com/smartmontools/smartmontools/blob/master/smartmontools/atacmds.cpp#L2051
#
# Calling 'parse_temp' on the raw value will return a 4-tuple, containing
# * temperature
# * minimum
# * maximum
# * over-temperature count
# substituting None for values it can't decode.
#
# Example:
# >>> parse_temp(42952491042)
# (34, 10, 43, None)
#
#
# def check_temp_word(i):
# if i <= 0x7F:
# return 0x11
# elif i <= 0xFF:
# return 0x01
# elif 0xFF80 <= i:
# return 0x10
# return 0x00
#
#
# def check_temp_range(t, b0, b1):
# if b0 > b1:
# t0, t1 = b1, b0
# else:
# t0, t1 = b0, b1
#
# if all([
# -60 <= t0,
# t0 <= t,
# t <= t1,
# t1 <= 120,
# not (t0 == -1 and t1 <= 0)
# ]):
# return t0, t1
# return None, None
#
#
# def parse_temp(raw):
# byte = list()
# word = list()
# for i in range(0, 6):
# byte.append(0xFF & (raw >> (i * 8)))
# for i in range(0, 3):
# word.append(0xFFFF & (raw >> (i * 16)))
#
# ctwd = check_temp_word(word[0])
#
# if not word[2]:
# if ctwd and not word[1]:
# # byte[0] is temp, no other data
# return byte[0], None, None, None
#
# if ctwd and all(check_temp_range(byte[0], byte[2], byte[3])):
# # byte[0] is temp, byte[2] is max or min, byte[3] is min or max
# trange = check_temp_range(byte[0], byte[2], byte[3])
# return byte[0], trange[0], trange[1], None
#
# if ctwd and all(check_temp_range(byte[0], byte[1], byte[2])):
# # byte[0] is temp, byte[1] is max or min, byte[2] is min or max
# trange = check_temp_range(byte[0], byte[1], byte[2])
# return byte[0], trange[0], trange[1], None
#
# return None, None, None, None
#
# if ctwd:
# if all(
# [
# ctwd & check_temp_word(word[1]) & check_temp_word(word[2]) != 0x00,
# all(check_temp_range(byte[0], byte[2], byte[4])),
# ]
# ):
# # byte[0] is temp, byte[2] is max or min, byte[4] is min or max
# trange = check_temp_range(byte[0], byte[2], byte[4])
# return byte[0], trange[0], trange[1], None
# else:
# trange = check_temp_range(byte[0], byte[2], byte[3])
# if word[2] < 0x7FFF and all(trange) and trange[1] >= 40:
# # byte[0] is temp, byte[2] is max or min, byte[3] is min or max, word[2] is overtemp count
# return byte[0], trange[0], trange[1], word[2]
# # no data
# return None, None, None, None
# Reverse lookup built from CHARTS: smartd attribute name -> id of the chart
# whose 'attrs' list contains it.
CHARTED_ATTRS = dict(
    (attr_name, chart_id)
    for chart_id, chart in CHARTS.items()
    for attr_name in chart['attrs']
)
class BaseAtaSmartAttribute:
    """Common state of a single ATA S.M.A.R.T. attribute.

    Stores the attribute name together with its normalized and raw values.
    Subclasses choose what is reported by implementing value().
    """

    def __init__(self, name, normalized_value, raw_value):
        self.name, self.normalized_value, self.raw_value = name, normalized_value, raw_value

    def value(self):
        """Return the value to report; concrete attributes must override this."""
        raise NotImplementedError
class AtaRaw(BaseAtaSmartAttribute):
    """ATA attribute reported by its raw value, passed through unmodified."""

    def value(self):
        raw = self.raw_value
        return raw
class AtaNormalized(BaseAtaSmartAttribute):
    """ATA attribute reported by its normalized value, passed through unmodified."""

    def value(self):
        normalized = self.normalized_value
        return normalized
class Ata3(BaseAtaSmartAttribute):
    """ATA attribute 3: raw value, with packed raw values reduced to their low 12 bits."""

    def value(self):
        raw = int(self.raw_value)
        # https://github.com/netdata/netdata/issues/5919
        #
        # Some drives pack several numbers into the raw value, e.g.
        #   3;151;38684000679;  is shown by smartctl as "423 (Average 447)"
        #   38684000679 & 0xFFF              -> 423
        #   (38684000679 & 0xFFF0000) >> 16  -> 447
        # Anything above 1e6 is treated as packed and only the low bits are kept.
        return raw & 0xFFF if raw > 1e6 else raw
class Ata9(BaseAtaSmartAttribute):
    """ATA attribute 9: raw value, with packed raw values reduced to their low 16 bits."""

    def value(self):
        raw = int(self.raw_value)
        # Raw values above 1e6 are treated as packed; keep only the low word.
        return raw & 0xFFFF if raw > 1e6 else raw
class Ata190(BaseAtaSmartAttribute):
    """ATA attribute 190: reported as 100 minus the normalized value."""

    def value(self):
        normalized = int(self.normalized_value)
        return 100 - normalized
class Ata194(BaseAtaSmartAttribute):
    """ATA attribute 194 (temperature).

    https://github.com/netdata/netdata/issues/3041
    https://github.com/netdata/netdata/issues/5919

    In a packed raw value the low byte is the current temperature, the third
    lowest byte is the maximum, and the fifth lowest byte is the minimum.
    """

    def value(self):
        raw = int(self.raw_value)
        if raw <= 1e6:
            # Plain raw value: report the smaller of normalized and raw.
            return min(int(self.normalized_value), raw)
        # Packed raw value: only the low byte (current temperature) is reported.
        return raw & 0xFF
class BaseSCSISmartAttribute:
    """Common state of a single SCSI S.M.A.R.T. attribute: a name and a raw value."""

    def __init__(self, name, raw_value):
        self.name, self.raw_value = name, raw_value

    def value(self):
        """Return the value to report; concrete attributes must override this."""
        raise NotImplementedError
class SCSIRaw(BaseSCSISmartAttribute):
    """SCSI attribute reported by its raw value, passed through unmodified."""

    def value(self):
        raw = self.raw_value
        return raw
def ata_attribute_factory(value):
    """Wrap one parsed ATA attribute record in the class that knows how to report it.

    :param value: sequence whose first item is the attribute id; the whole
                  sequence is unpacked into the chosen class constructor.
    :return: an Ata3/Ata9/Ata190/Ata194 instance for those ids, AtaNormalized
             for the ids reported by normalized value, AtaRaw for everything else.
    """
    attr_id = value[0]

    # Attributes that need dedicated decoding of their value.
    dedicated = (
        (ATTR3, Ata3),
        (ATTR9, Ata9),
        (ATTR190, Ata190),
        (ATTR194, Ata194),
    )
    for known_id, attr_class in dedicated:
        if attr_id == known_id:
            return attr_class(*value)

    # Attributes reported by their normalized value.
    reported_normalized = (
        ATTR1,
        ATTR7,
        ATTR177,
        ATTR202,
        ATTR206,
        ATTR233,
    )
    if attr_id in reported_normalized:
        return AtaNormalized(*value)

    return AtaRaw(*value)
def scsi_attribute_factory(value):
    """Wrap one parsed SCSI attribute record (unpacked into the constructor) in SCSIRaw."""
    attribute = SCSIRaw(*value)
    return attribute
def attribute_factory(value):
    """Build the attribute object for one parsed record.

    Records whose first item (the attribute name) consists only of digits are
    handled as ATA attributes; all others are handled as SCSI attributes.
    """
    factory = ata_attribute_factory if value[0].isdigit() else scsi_attribute_factory
    return factory(value)
def handle_error(*errors):
    """Build a decorator that converts the given exceptions into a None result.

    :param errors: exception classes to swallow; any other exception propagates.
    :return: a decorator; the decorated callable returns its normal result, or
             None when one of `errors` was raised.

    Callers rely on the None result as the failure signal, so the wrapped
    callable should not use None as a regular return value.
    """
    # Imported locally so the block does not depend on the module import list.
    from functools import wraps

    def on_method(method):
        # wraps() keeps the wrapped callable's name and docstring intact.
        @wraps(method)
        def on_call(*args, **kwargs):
            # Keyword arguments are forwarded too, so decorated methods can be
            # called exactly like the undecorated ones.
            try:
                return method(*args, **kwargs)
            except errors:
                return None

        return on_call

    return on_method
class DiskLogFile:
    """A smartd log file on disk, tracked by path and last seen size.

    The three query methods are wrapped with handle_error(OSError), so each of
    them returns None instead of raising when the file cannot be accessed.
    """

    def __init__(self, full_path):
        self.path = full_path
        # Size at the time of the last read; is_changed() compares against it.
        self.size = os.path.getsize(full_path)

    @handle_error(OSError)
    def is_changed(self):
        """Return True when the file size differs from the size seen at the last read."""
        size_now = os.path.getsize(self.path)
        return self.size != size_now

    @handle_error(OSError)
    def is_active(self, current_time, limit):
        """Return True when the file was modified less than `limit` minutes before `current_time`."""
        minutes_since_update = (current_time - os.path.getmtime(self.path)) / 60
        return minutes_since_update < limit

    @handle_error(OSError)
    def read(self):
        """Remember the current file size and return the last line of the file."""
        self.size = os.path.getsize(self.path)
        last_line = read_last_line(self.path)
        return last_line
class BaseDisk:
    """A disk whose S.M.A.R.T. attributes are read from a smartd log file.

    Disks are identified by `raw_name`: two disks with the same raw name are
    equal, and a disk also compares equal to a plain string holding its raw
    name (this is what allows `name in disks` lookups on a list of disks).
    """

    def __init__(self, name, log_file):
        self.raw_name = name
        # Display name: runs of underscores collapsed into a single one.
        self.name = re.sub(r'_+', '_', name)
        self.log_file = log_file
        self.attrs = list()
        self.alive = True
        self.charted = False

    def __eq__(self, other):
        if isinstance(other, BaseDisk):
            return self.raw_name == other.raw_name
        return self.raw_name == other

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # Must agree with __eq__: objects that compare equal need equal hashes.
        # Hashing repr(self) was identity-based and broke that rule; hashing
        # raw_name also makes a disk hash like the string it compares equal to.
        return hash(self.raw_name)

    def parser(self, data):
        """Extract attribute records from one log line; implemented by subclasses."""
        raise NotImplementedError

    @handle_error(TypeError)
    def populate_attrs(self):
        """Re-read the log file and rebuild `attrs` from its last line.

        :return: number of attributes parsed, or None when a TypeError was
                 raised while parsing (presumably when the log file could not
                 be read and the parser was handed None - confirm against the
                 concrete parser implementations).
        """
        self.attrs = list()
        line = self.log_file.read()
        for value in self.parser(line):
            self.attrs.append(attribute_factory(value))
        return len(self.attrs)

    def data(self):
        """Return a dict mapping '<disk name>_<attribute name>' to the attribute value."""
        return dict(
            ('{0}_{1}'.format(self.name, attr.name), attr.value())
            for attr in self.attrs
        )
class ATADisk(BaseDisk):
    """Disk whose log lines are parsed with the ATA pattern (RE_ATA)."""

    def parser(self, data):
        records = RE_ATA.findall(data)
        return records
class SCSIDisk(BaseDisk):
    """Disk whose log lines are parsed with the SCSI pattern (RE_SCSI)."""

    def parser(self, data):
        records = RE_SCSI.findall(data)
        return records
class Service(SimpleService):
    """Collector that reads smartd attribute log files and reports them per disk.

    Every disk found in `log_path` contributes one dimension to each chart
    that covers one of its attributes. The directory is rescanned every
    DEF_RESCAN_INTERVAL runs, or on the next run after a disk failed.
    """

    def __init__(self, configuration=None, name=None):
        SimpleService.__init__(self, configuration=configuration, name=name)
        self.order = ORDER
        self.definitions = deepcopy(CHARTS)
        # Job options, falling back to the module defaults.
        self.log_path = configuration.get('log_path', DEF_PATH)
        self.age = configuration.get('age', DEF_AGE)
        self.exclude = configuration.get('exclude_disks', str()).split()
        self.disks = list()
        self.runs = 0
        self.do_force_rescan = False

    def check(self):
        """Return True when the initial scan found at least one usable disk."""
        return self.scan() > 0

    def get_data(self):
        """Collect the current attribute values of all alive disks.

        :return: dict mapping '<disk name>_<attribute name>' to its value.
        """
        self.runs += 1

        if self.do_force_rescan or self.runs % DEF_RESCAN_INTERVAL == 0:
            self.cleanup()
            self.scan()
            self.do_force_rescan = False

        data = dict()

        for disk in self.disks:
            if not disk.alive:
                continue

            if not disk.charted:
                self.add_disk_to_charts(disk)

            changed = disk.log_file.is_changed()

            # None means the log file could not be accessed: retire the disk
            # and ask for a rescan on the next run.
            if changed is None:
                disk.alive = False
                self.do_force_rescan = True
                continue

            # Re-parse only when the file changed; None means parsing failed.
            if changed and disk.populate_attrs() is None:
                disk.alive = False
                self.do_force_rescan = True
                continue

            data.update(disk.data())

        return data

    def cleanup(self):
        """Drop disks that are dead or whose log file is no longer active."""
        current_time = time()
        # Iterate over a copy because disks are removed from the list in the loop.
        for disk in self.disks[:]:
            # Short-circuit: dead disks are dropped without touching the
            # file system again. is_active() returning None (file not
            # accessible) also counts as inactive.
            if not disk.alive or not disk.log_file.is_active(current_time, self.age):
                self.disks.remove(disk)
                self.remove_disk_from_charts(disk)

    def scan(self):
        """Add every new, usable log file in `log_path` as a disk.

        :return: total number of known disks after the scan.
        """
        self.debug('scanning {0}'.format(self.log_path))
        current_time = time()

        try:
            file_names = os.listdir(self.log_path)
        except OSError as error:
            # Missing or unreadable directory: keep what we have instead of
            # failing the whole job with an unhandled exception.
            self.error('scanning {0} failed: {1}'.format(self.log_path, error))
            return len(self.disks)

        for full_name in file_names:
            disk = self.create_disk_from_file(full_name, current_time)
            if not disk:
                continue
            self.disks.append(disk)

        return len(self.disks)

    def create_disk_from_file(self, full_name, current_time):
        """Build a disk object for one file of the log directory.

        :param full_name: file name as returned by os.listdir(log_path).
        :param current_time: timestamp used to judge how old the file is.
        :return: an ATADisk/SCSIDisk with parsed attributes, or None when the
                 file is skipped (the reason is logged).
        """
        if not full_name.endswith(CSV):
            self.debug('skipping {0}: not a csv file'.format(full_name))
            return None

        # The disk name is the third dot-separated part counted from the end.
        parts = os.path.basename(full_name).split('.')
        if len(parts) < 3:
            self.debug('skipping {0}: unexpected file name format'.format(full_name))
            return None

        name = parts[-3]
        path = os.path.join(self.log_path, full_name)

        if name in self.disks:
            self.debug('skipping {0}: already in disks'.format(full_name))
            return None

        if any(pattern in name for pattern in self.exclude):
            self.debug('skipping {0}: filtered by `exclude` option'.format(full_name))
            return None

        if not os.access(path, os.R_OK):
            self.debug('skipping {0}: not readable'.format(full_name))
            return None

        # The file can disappear between listing and inspecting it.
        try:
            size = os.path.getsize(path)
            modified = os.path.getmtime(path)
        except OSError as error:
            self.debug('skipping {0}: {1}'.format(full_name, error))
            return None

        if size == 0:
            self.debug('skipping {0}: zero size'.format(full_name))
            return None

        if (current_time - modified) / 60 > self.age:
            self.debug('skipping {0}: haven\'t been updated for last {1} minutes'.format(full_name, self.age))
            return None

        if ATA in full_name:
            disk_class = ATADisk
        elif SCSI in full_name:
            disk_class = SCSIDisk
        else:
            self.debug('skipping {0}: unknown type'.format(full_name))
            return None

        try:
            # DiskLogFile reads the file size in its constructor.
            disk = disk_class(name, DiskLogFile(path))
        except OSError as error:
            self.debug('skipping {0}: {1}'.format(full_name, error))
            return None

        disk.populate_attrs()
        if not disk.attrs:
            self.error('skipping {0}: parsing failed'.format(full_name))
            return None

        self.debug('added {0}'.format(full_name))
        return disk

    def add_disk_to_charts(self, disk):
        """Add one dimension per charted attribute of `disk` and mark the disk as charted."""
        if len(self.charts) == 0 or disk.charted:
            return
        disk.charted = True

        for attr in disk.attrs:
            chart_id = CHARTED_ATTRS.get(attr.name)

            if not chart_id or chart_id not in self.charts:
                continue

            chart = self.charts[chart_id]
            dim = [
                '{0}_{1}'.format(disk.name, attr.name),
                disk.name,
                CHARTS[chart_id]['algo'],
            ]

            if dim[0] in chart.dimensions:
                # The dimension already exists; presumably reverse=True makes a
                # previously hidden dimension visible again - TODO confirm.
                chart.hide_dimension(dim[0], reverse=True)
            else:
                chart.add_dimension(dim)

    def remove_disk_from_charts(self, disk):
        """Delete the dimensions of `disk` from every chart that covers its attributes."""
        if len(self.charts) == 0 or not disk.charted:
            return

        for attr in disk.attrs:
            chart_id = CHARTED_ATTRS.get(attr.name)

            if not chart_id or chart_id not in self.charts:
                continue

            self.charts[chart_id].del_dimension('{0}_{1}'.format(disk.name, attr.name))

View file

@ -1,76 +0,0 @@
# netdata python.d.plugin configuration for smartd log
#
# This file is in YAML format. Generally the format is:
#
# name: value
#
# There are 2 sections:
# - global variables
# - one or more JOBS
#
# JOBS allow you to collect values from multiple sources.
# Each source will have its own set of charts.
#
# JOB parameters have to be indented (using spaces only, example below).
# ----------------------------------------------------------------------
# Global Variables
# These variables set the defaults for all JOBs, however each JOB
# may define its own, overriding the defaults.
# update_every sets the default data collection frequency.
# If unset, the python.d.plugin default is used.
# update_every: 1
# priority controls the order of charts at the netdata dashboard.
# Lower numbers move the charts towards the top of the page.
# If unset, the default for python.d.plugin is used.
# priority: 60000
# penalty indicates whether to apply penalty to update_every in case of failures.
# Penalty will increase every 5 failed updates in a row. Maximum penalty is 10 minutes.
# penalty: yes
# autodetection_retry sets the job re-check interval in seconds.
# The job is not deleted if check fails.
# Attempts to start the job are made once every autodetection_retry.
# This feature is disabled by default.
# autodetection_retry: 0
# ----------------------------------------------------------------------
# JOBS (data collection sources)
#
# The default JOBS share the same *name*. JOBS with the same name
# are mutually exclusive. Only one of them will be allowed to run at
# any time. This allows autodetection to try several alternatives and
# pick the one that works.
#
# Any number of jobs is supported.
#
# All python.d.plugin JOBS (for all its modules) support a set of
# predefined parameters. These are:
#
# job_name:
#   name: myname            # the JOB's name as it will appear at the
#                           # dashboard (by default is the job_name)
#                           # JOBs sharing a name are mutually exclusive
#   update_every: 1         # the JOB's data collection frequency
#   priority: 60000         # the JOB's order on the dashboard
#   penalty: yes            # the JOB's penalty
#   autodetection_retry: 0  # the JOB's re-check interval in seconds
#
# In addition to the above, smartd_log also supports the following:
#
#   log_path: '/path/to/smartd_logs'    # path to smartd log files. Default is /var/log/smartd
#   exclude_disks: 'PATTERN1 PATTERN2'  # space-separated patterns. If a pattern is found in the drive name, the module will not collect data for it.
#   age: 30                             # time in minutes since the last dump to file. If smartd has not dumped data within this time, the job exits.
#
# ----------------------------------------------------------------------
# Both jobs use the name smartd_log, so only one of them runs:
# they differ only in the directory searched for smartd log files.
custom:
  name: smartd_log
  log_path: '/var/log/smartd/'

debian:
  name: smartd_log
  log_path: '/var/lib/smartmontools/'