0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-27 22:26:21 +00:00

adaptec_raid python module ()

* adaptec_raid module init version

* adaptec_raid minor

* adaptec_raid minor

* adaptec_raid minor

* adaptec_raid arcconf command fix

* adaptec_raid minor fixes

* adaptec_raid add alarms

* adaptec_raid add link to screenshot to the readme
This commit is contained in:
Ilya Mashchenko 2018-10-23 20:48:56 +09:00 committed by GitHub
parent c0c5318100
commit b85833f081
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 370 additions and 0 deletions

View file

@ -37,6 +37,7 @@ dist_pythonconfig_DATA = \
$(top_srcdir)/installer/.keep \
$(NULL)
include adaptec_raid/Makefile.inc
include apache/Makefile.inc
include beanstalk/Makefile.inc
include bind_rndc/Makefile.inc

View file

@ -0,0 +1,13 @@
# SPDX-License-Identifier: GPL-3.0-or-later
# THIS IS NOT A COMPLETE Makefile
# IT IS INCLUDED BY ITS PARENT'S Makefile.am
# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT
# install these files
dist_python_DATA += adaptec_raid/adaptec_raid.chart.py
dist_pythonconfig_DATA += adaptec_raid/adaptec_raid.conf
# do not install these files, but include them in the distribution
dist_noinst_DATA += adaptec_raid/README.md adaptec_raid/Makefile.inc

View file

@ -0,0 +1,27 @@
# adaptec raid
This module collects health metrics for logical and physical devices.
**Requirements:**
* The `netdata` user must be able to run the `arcconf` program through `sudo` without a password
To grab stats it executes:
* `sudo -n arcconf GETCONFIG 1 LD`
* `sudo -n arcconf GETCONFIG 1 PD`
It produces:
1. **Logical Device Status**
2. **Physical Device State**
3. **Physical Device S.M.A.R.T warnings**
4. **Physical Device Temperature**
Screenshot:
![image](https://user-images.githubusercontent.com/22274335/47278133-6d306680-d601-11e8-87c2-cc9c0f42d686.png)
---

View file

@ -0,0 +1,245 @@
# -*- coding: utf-8 -*-
# Description: adaptec_raid netdata python.d module
# Author: Ilya Mashchenko (l2isbad)
# SPDX-License-Identifier: GPL-3.0-or-later
import re
from copy import deepcopy
from bases.FrameworkServices.ExecutableService import ExecutableService
from bases.collection import find_binary
# Default data collection frequency for this module.
update_every = 5

# Chart ids, in the order they are listed for this module.
ORDER = [
    'ld_status',
    'pd_state',
    'pd_smart_warnings',
    'pd_temperature',
]

# Chart definitions keyed by chart id. Every 'lines' list starts empty and is
# filled by Service.update_charts() once the devices have been discovered.
# NOTE(review): 'options' presumably follows the python.d layout
# [name, title, units, family, context, chart type] -- confirm against the
# framework documentation.
# NOTE: the context prefix is spelled 'adapter_raid' (not 'adaptec_raid'); the
# health.d alarm templates match on exactly this spelling.
CHARTS = {
    'ld_status': {
        'options': [
            None,
            'Status Is Not OK',
            'bool',
            'logical devices',
            'adapter_raid.ld_status',
            'line',
        ],
        'lines': [],
    },
    'pd_state': {
        'options': [
            None,
            'State Is Not OK',
            'bool',
            'physical devices',
            'adapter_raid.pd_state',
            'line',
        ],
        'lines': [],
    },
    'pd_smart_warnings': {
        'options': [
            None,
            'S.M.A.R.T warnings',
            'count',
            'physical devices',
            'adapter_raid.smart_warnings',
            'line',
        ],
        'lines': [],
    },
    'pd_temperature': {
        'options': [
            None,
            'Temperature',
            'celsius',
            'physical devices',
            'adapter_raid.temperature',
            'line',
        ],
        'lines': [],
    },
}

# Names of the binaries looked up by Service.check().
SUDO = 'sudo'
ARCCONF = 'arcconf'

# Logical device statuses that are reported as "not OK".
BAD_LD_STATUS = ('Degraded', 'Failed')

# Physical device states that are reported as OK; anything else is "not OK".
GOOD_PD_STATUS = ('Online',)

# Captures (logical device number, status) pairs. find_lds() applies it to the
# command output after joining all lines with single spaces, which is why the
# non-greedy '.*?' can bridge the text between the two fields.
RE_LD = re.compile(
    r'Logical device number\s+([0-9]+).*?'
    r'Status of logical device\s+: ([a-zA-Z]+)'
)
def find_lds(d):
    """Build LD objects from the lines of the arcconf logical device report.

    :param d: iterable of output lines (strings).
    :return: list of LD instances, one per (number, status) pair that RE_LD
        finds; empty when nothing matches.
    """
    # Collapse the report into a single space-separated line so that RE_LD can
    # match a device number and its status even though they sit on different
    # rows of the original output.
    flattened = ' '.join(line.strip() for line in d)

    devices = []
    for ld_id, status in RE_LD.findall(flattened):
        devices.append(LD(ld_id, status))
    return devices
def find_pds(d):
    """Build PD objects from the lines of the arcconf physical device report.

    Rows are consumed in order. A 'Device #' row starts a new device; rows seen
    before the first such row (or after a device has been closed) are ignored.
    A 'NCQ status' row closes the current device, which is kept only when its
    id, state and S.M.A.R.T. warnings were all found.

    :param d: iterable of output lines (strings).
    :return: list of PD instances; empty when no complete device was found.
    """
    pds = list()
    pd = PD()

    for row in d:
        row = row.strip()
        if row.startswith('Device #'):
            pd = PD()
            pd.id = row.split('#')[-1]
        elif not pd.id:
            # Not inside a device section yet.
            continue

        if row.startswith('State'):
            pd.state = row.split()[-1]
        elif row.startswith('S.M.A.R.T. warnings'):
            pd.smart_warnings = row.split()[-1]
        elif row.startswith('Temperature'):
            # The value is the first token after the last colon. Guard against
            # a row with nothing after the colon: indexing the empty token list
            # would raise IndexError and abort parsing of the whole report.
            fields = row.split(':')[-1].split()
            if fields:
                pd.temperature = fields[0]
        elif row.startswith('NCQ status'):
            # End of the device section: keep the device only if the mandatory
            # fields were seen, then reset so stray rows are skipped again.
            if pd.id and pd.state and pd.smart_warnings:
                pds.append(pd)
            pd = PD()

    return pds
class LD:
    """A logical device as reported by arcconf: its number and status text."""

    def __init__(self, ld_id, status):
        self.status = status
        self.id = ld_id

    def data(self):
        """Return the device's metric as a one-item dict.

        The value is 1 when the status is listed in BAD_LD_STATUS, otherwise 0.
        """
        is_bad = self.status in BAD_LD_STATUS
        return {'ld_{0}_status'.format(self.id): int(is_bad)}
class PD:
    """A physical device; attributes are filled in by find_pds() while parsing."""

    def __init__(self):
        # Every field starts unset; the parser assigns string values.
        self.id = self.state = self.smart_warnings = self.temperature = None

    def data(self):
        """Return the device's metrics keyed by dimension id.

        State is 1 when it is not listed in GOOD_PD_STATUS, otherwise 0.
        S.M.A.R.T. warnings are passed through as parsed. Temperature is
        included only when it was parsed and consists of digits.
        """
        metrics = dict()
        metrics['pd_{0}_state'.format(self.id)] = int(self.state not in GOOD_PD_STATUS)
        metrics['pd_{0}_smart_warnings'.format(self.id)] = self.smart_warnings

        if not self.temperature or not self.temperature.isdigit():
            return metrics

        metrics['pd_{0}_temperature'.format(self.id)] = self.temperature
        return metrics
class Arcconf:
    """Builds arcconf command lines (argument lists) for a given binary path."""

    def __init__(self, arcconf):
        self.arcconf = arcconf

    def ld_info(self):
        """Return the argument list for the logical device report."""
        # NOTE(review): '1' is presumably the controller number -- confirm.
        command = [self.arcconf]
        command += ['GETCONFIG', '1', 'LD']
        return command

    def pd_info(self):
        """Return the argument list for the physical device report."""
        command = [self.arcconf]
        command += ['GETCONFIG', '1', 'PD']
        return command
# TODO: hardcoded sudo...
class SudoArcconf:
    """Same commands as Arcconf, each prefixed with '<sudo> -n'."""

    def __init__(self, arcconf, sudo):
        self.sudo = sudo
        self.arcconf = Arcconf(arcconf)

    def ld_info(self):
        """Return the sudo-prefixed argument list for the logical device report."""
        command = [self.sudo, '-n']
        command.extend(self.arcconf.ld_info())
        return command

    def pd_info(self):
        """Return the sudo-prefixed argument list for the physical device report."""
        command = [self.sudo, '-n']
        command.extend(self.arcconf.pd_info())
        return command
class Service(ExecutableService):
    """Collects logical/physical device health by running arcconf.

    Reads the optional 'use_sudo' job setting (default True) to decide whether
    arcconf is run through 'sudo -n'.
    """

    def __init__(self, configuration=None, name=None):
        ExecutableService.__init__(self, configuration=configuration, name=name)
        self.order = ORDER
        # Deep copy: update_charts() appends dimensions to the 'lines' lists,
        # which must not leak into the module-level CHARTS template.
        self.definitions = deepcopy(CHARTS)
        self.use_sudo = self.configuration.get('use_sudo', True)
        # Command builder (Arcconf or SudoArcconf); set by check().
        self.arcconf = None

    def execute(self, command, stderr=False):
        """Run `command` via the base class and return its raw output.

        NOTE(review): the output is treated below as a list of lines; what
        exactly `_get_raw_data` returns on failure is defined by the base
        class -- the callers here only test it for truthiness.
        """
        return self._get_raw_data(command=command, stderr=stderr)

    def check(self):
        """Locate the binaries, verify sudo access and discover devices.

        :return: True when both logical and physical devices were discovered
            (charts are then populated), False otherwise.
        """
        sudo = find_binary(SUDO)
        if self.use_sudo:
            if not sudo:
                self.error('can\'t locate "{0}" binary'.format(SUDO))
                return False
            # Any output captured with stderr=True is treated as an error
            # message from sudo and aborts the check.
            err = self.execute([sudo, '-n', '-v'], True)
            if err:
                self.error(' '.join(err))
                return False

        arcconf = find_binary(ARCCONF)
        if not arcconf:
            self.error('can\'t locate "{0}" binary'.format(ARCCONF))
            return False

        if self.use_sudo:
            self.arcconf = SudoArcconf(arcconf, sudo)
        else:
            self.arcconf = Arcconf(arcconf)

        lds = self.get_lds()
        if not lds:
            return False
        self.debug('discovered logical devices ids: {0}'.format([ld.id for ld in lds]))

        pds = self.get_pds()
        if not pds:
            return False
        self.debug('discovered physical devices ids: {0}'.format([pd.id for pd in pds]))

        self.update_charts(lds, pds)
        return True

    def get_data(self):
        """Return the current metrics of all devices as one flat dict.

        The dict is empty when neither report could be collected.
        """
        data = dict()

        # get_lds()/get_pds() return None on failure; fall back to an empty
        # sequence so a failed arcconf run does not raise TypeError here.
        for ld in self.get_lds() or ():
            data.update(ld.data())

        for pd in self.get_pds() or ():
            data.update(pd.data())

        return data

    def get_lds(self):
        """Run the logical device report and parse it.

        :return: non-empty list of LD, or None when the command produced no
            output or the output could not be parsed.
        """
        raw_lds = self.execute(self.arcconf.ld_info())
        if not raw_lds:
            return None

        lds = find_lds(raw_lds)
        if not lds:
            self.error('failed to parse "{0}" output'.format(' '.join(self.arcconf.ld_info())))
            self.debug('output: {0}'.format(raw_lds))
            return None
        return lds

    def get_pds(self):
        """Run the physical device report and parse it.

        :return: non-empty list of PD, or None when the command produced no
            output or the output could not be parsed.
        """
        raw_pds = self.execute(self.arcconf.pd_info())
        if not raw_pds:
            return None

        pds = find_pds(raw_pds)
        if not pds:
            self.error('failed to parse "{0}" output'.format(' '.join(self.arcconf.pd_info())))
            self.debug('output: {0}'.format(raw_pds))
            return None
        return pds

    def update_charts(self, lds, pds):
        """Append one dimension per discovered device to the chart definitions.

        Dimension ids match the keys produced by LD.data() and PD.data().
        """
        charts = self.definitions
        for ld in lds:
            dim = ['ld_{0}_status'.format(ld.id), 'ld {0}'.format(ld.id)]
            charts['ld_status']['lines'].append(dim)

        for pd in pds:
            dim = ['pd_{0}_state'.format(pd.id), 'pd {0}'.format(pd.id)]
            charts['pd_state']['lines'].append(dim)

            dim = ['pd_{0}_smart_warnings'.format(pd.id), 'pd {0}'.format(pd.id)]
            charts['pd_smart_warnings']['lines'].append(dim)

            dim = ['pd_{0}_temperature'.format(pd.id), 'pd {0}'.format(pd.id)]
            charts['pd_temperature']['lines'].append(dim)

View file

@ -0,0 +1,59 @@
# netdata python.d.plugin configuration for adaptec raid
#
# This file is in YAML format. Generally the format is:
#
# name: value
#
# ----------------------------------------------------------------------
# Global Variables
# These variables set the defaults for all JOBs, however each JOB
# may define its own, overriding the defaults.
# update_every sets the default data collection frequency.
# If unset, the python.d.plugin default is used.
# update_every: 1
# priority controls the order of charts at the netdata dashboard.
# Lower numbers move the charts towards the top of the page.
# If unset, the default for python.d.plugin is used.
# priority: 60000
# retries sets the number of retries to be made in case of failures.
# If unset, the default for python.d.plugin is used.
# Attempts to restore the service are made once every update_every
# and only if the module has collected values in the past.
# retries: 60
# autodetection_retry sets the job re-check interval in seconds.
# The job is not deleted if check fails.
# Attempts to start the job are made once every autodetection_retry.
# This feature is disabled by default.
# autodetection_retry: 0
# ----------------------------------------------------------------------
# JOBS (data collection sources)
#
# The default JOBS share the same *name*. JOBS with the same name
# are mutually exclusive. Only one of them will be allowed running at
# any time. This allows autodetection to try several alternatives and
# pick the one that works.
#
# Any number of jobs is supported.
#
# All python.d.plugin JOBS (for all its modules) support a set of
# predefined parameters. These are:
#
# job_name:
# name: myname # the JOB's name as it will appear at the
# # dashboard (by default is the job_name)
# # JOBs sharing a name are mutually exclusive
# update_every: 1 # the JOB's data collection frequency
# priority: 60000 # the JOB's order on the dashboard
# retries: 60 # the JOB's number of restoration attempts
# autodetection_retry: 0 # the JOB's re-check interval in seconds
# ----------------------------------------------------------------------
# IMPORTANT
# The netdata user needs to be able to run the arcconf program via sudo without a password:
# netdata ALL=(root) NOPASSWD: /path/to/arcconf

View file

@ -22,6 +22,7 @@ dist_userhealthconfig_DATA = \
healthconfigdir=$(libconfigdir)/health.d
dist_healthconfig_DATA = \
$(top_srcdir)/installer/.keep \
health.d/adaptec_raid.conf \
health.d/apache.conf \
health.d/apcupsd.conf \
health.d/backend.conf \

View file

@ -0,0 +1,24 @@
# logical device status check
# Attached to the adapter_raid.ld_status chart context declared by the
# adaptec_raid python module, whose dimensions are 1 for a logical device in
# Degraded or Failed status and 0 otherwise. Goes critical when the looked-up
# value is above zero.
# NOTE: the 'adapter_raid' spelling must stay in sync with the module's chart
# contexts, otherwise this template matches nothing.
template: adapter_raid_ld_status
on: adapter_raid.ld_status
lookup: max -5s
units: bool
every: 10s
crit: $this > 0
delay: down 5m multiplier 1.5 max 1h
info: at least 1 logical device is failed or degraded
to: sysadmin
# physical device state check
# Attached to the adapter_raid.pd_state chart context declared by the
# adaptec_raid python module, whose dimensions are 1 for a physical device
# whose state is anything other than Online and 0 otherwise. Goes critical
# when the looked-up value is above zero.
# NOTE: the 'adapter_raid' spelling must stay in sync with the module's chart
# contexts, otherwise this template matches nothing.
template: adapter_raid_pd_state
on: adapter_raid.pd_state
lookup: max -5s
units: bool
every: 10s
crit: $this > 0
delay: down 5m multiplier 1.5 max 1h
info: at least 1 physical device is not in online state
to: sysadmin