0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-05-09 19:40:24 +00:00
netdata_netdata/python.d/boinc.chart.py
Austin S. Hemmelgarn dd36c217d4 Add a python.d module for monitoring BOINC clients.
This adds a python.d module for monitoring Berkeley Open Infrastructure
for Network Computing (BOINC) clients.  This is the framework utilized by a vast
majority of publicly run distributed computing projects (including
SETI@Home and World Community Grid).

This only tracks the number of tasks in various task states on the
system, as almost everything else is trivial to track using other
existing plugins (such as apps.plugin).  It utilizes the same RPC
mechanism that the official GUI management tool for BOINC clients uses.

In most cases, the data this tracks is not hugely interesting (it doesn't
change very often, and generally doesn't change very much on any given
system), but there are a handful of situations that it lets us provide
alerts for that are of particular interest to users running BOINC on
headless systems (namely computational errors, reporting failures,
and running out of tasks to run).

Internally, BOINC has 3 state machines that a given task will advance
through: the task state, scheduling state, and process state.  The task
state tracks the high-level status of the task (is it new, ready to run,
finished running and being reported, hit a computational error, etc).
The scheduling state tracks whether the scheduling algorithm says the
task can run or not, and the process state tracks the status of the
associated process for the task that does the actual computing.  This
module provides charts tracking task counts in each state for each of
these state machines, as well as one counting total number of local
tasks, and how many tasks are 'active' (that is, how many are being
considered for processing).

We also provide a set of default alarms to alert on compute errors,
upload/reporting failures, an empty local task queue, and a low number
of active tasks.

Currently, the module only runs on Linux systems because it relies on a
Linux specific hack to detect a disconnected socket, but it can monitor
BOINC clients on any platform remotely (including Windows systems).
2018-06-11 07:51:24 -04:00

178 lines
5.7 KiB
Python

# -*- coding: utf-8 -*-
# Description: BOINC netdata python.d module
# Author: Austin S. Hemmelgarn (Ferroin)
# SPDX-License-Identifier: GPL-3.0+
import platform
import socket
from copy import deepcopy
from bases.FrameworkServices.SimpleService import SimpleService
from third_party import boinc_client
# Order in which the charts defined in CHARTS are listed.
ORDER = ['tasks', 'states', 'sched_states', 'process_states']


def _chart(title, context, dimensions):
    """Build one chart definition from its title, context and dimensions.

    ``dimensions`` is an iterable of ``(dimension_id, label)`` pairs; every
    dimension is emitted as an 'absolute' line with multiplier and divisor 1.
    """
    return {
        'options': [None, title, 'tasks', 'boinc', context, 'line'],
        'lines': [[dim_id, label, 'absolute', 1, 1] for dim_id, label in dimensions],
    }


# Chart definitions, keyed by the chart ids listed in ORDER.
CHARTS = {
    'tasks': _chart('Overall Tasks', 'boinc.tasks', (
        ('total', 'Total'),
        ('active', 'Active'),
    )),
    'states': _chart('Tasks per State', 'boinc.states', (
        ('new', 'New'),
        ('downloading', 'Downloading'),
        ('downloaded', 'Ready to Run'),
        ('comperror', 'Compute Errors'),
        ('uploading', 'Uploading'),
        ('uploaded', 'Uploaded'),
        ('aborted', 'Aborted'),
        ('upload_failed', 'Failed Uploads'),
    )),
    'sched_states': _chart('Tasks per Scheduler State', 'boinc.sched', (
        ('uninit_sched', 'Uninitialized'),
        ('preempted', 'Preempted'),
        ('scheduled', 'Scheduled'),
    )),
    'process_states': _chart('Tasks per Process State', 'boinc.process', (
        ('uninit_proc', 'Uninitialized'),
        ('executing', 'Executing'),
        ('suspended', 'Suspended'),
        ('aborting', 'Aborted'),
        ('quit', 'Quit'),
        ('copy_pending', 'Copy Pending'),
    )),
}
# A simple template used for pre-loading the return dictionary to make
# the _get_data() method simpler.
_DATA_TEMPLATE = {
'total': 0,
'active': 0,
'new': 0,
'downloading': 0,
'downloaded': 0,
'comperror': 0,
'uploading': 0,
'uploaded': 0,
'aborted': 0,
'upload_failed': 0,
'uninit_sched': 0,
'preempted': 0,
'scheduled': 0,
'uninit_proc': 0,
'executing': 0,
'suspended': 0,
'aborting': 0,
'quit': 0,
'copy_pending': 0
}
# Translate a task's BOINC result state into the id of the matching
# dimension on the 'states' chart.
_RESULT_STATE = boinc_client.ResultState
_TASK_MAP = {
    _RESULT_STATE.NEW: 'new',
    _RESULT_STATE.FILES_DOWNLOADING: 'downloading',
    _RESULT_STATE.FILES_DOWNLOADED: 'downloaded',
    _RESULT_STATE.COMPUTE_ERROR: 'comperror',
    _RESULT_STATE.FILES_UPLOADING: 'uploading',
    _RESULT_STATE.FILES_UPLOADED: 'uploaded',
    _RESULT_STATE.ABORTED: 'aborted',
    _RESULT_STATE.UPLOAD_FAILED: 'upload_failed',
}
# Translate a task's BOINC scheduler state into the id of the matching
# dimension on the 'sched_states' chart.
_CPU_SCHED = boinc_client.CpuSched
_SCHED_MAP = {
    _CPU_SCHED.UNINITIALIZED: 'uninit_sched',
    _CPU_SCHED.PREEMPTED: 'preempted',
    _CPU_SCHED.SCHEDULED: 'scheduled',
}
# Map process states to the ids of the dimensions on the
# 'process_states' chart.
_PROC_MAP = {
    boinc_client.Process.UNINITIALIZED: 'uninit_proc',
    boinc_client.Process.EXECUTING: 'executing',
    boinc_client.Process.SUSPENDED: 'suspended',
    # Must be 'aborting' (the process chart dimension), not 'aborted':
    # 'aborted' is the task-state dimension fed by _TASK_MAP, so using it
    # here would double-count there and leave 'aborting' stuck at zero.
    boinc_client.Process.ABORT_PENDING: 'aborting',
    boinc_client.Process.QUIT_PENDING: 'quit',
    boinc_client.Process.COPY_PENDING: 'copy_pending',
}
class Service(SimpleService):
    """Collect per-state task counts from a BOINC client.

    The job configuration may provide ``host`` (default ``'localhost'``),
    ``port`` (default ``0``) and ``password`` (default ``''``); they are
    passed unchanged to ``boinc_client.BoincClient``.
    """

    def __init__(self, configuration=None, name=None):
        SimpleService.__init__(self, configuration=configuration, name=name)
        self.order = ORDER
        self.definitions = CHARTS
        self.host = self.configuration.get('host', 'localhost')
        self.port = self.configuration.get('port', 0)
        self.password = self.configuration.get('password', '')
        self.client = boinc_client.BoincClient(host=self.host, port=self.port, passwd=self.password)
        # Whether the last connection attempt left us connected and authorized.
        self.alive = False
        # State values already reported as unknown, so each is logged only once.
        self._unknown_states = set()

    def check(self):
        """Return True if running on Linux and the client connects and authorizes."""
        if platform.system() != 'Linux':
            self.error('Only supported on Linux.')
            return False
        self.connect()
        return self._refresh_alive()

    def connect(self):
        """Open the connection to the BOINC client."""
        self.client.connect()

    def reconnect(self):
        """Drop any existing connection, connect again and return the new liveness."""
        try:
            self.client.disconnect()
        except socket.error:
            # The old socket may already be broken; we are replacing it anyway.
            pass
        self.client.connect()
        return self._refresh_alive()

    def is_alive(self):
        """Return True if the connection is usable, reconnecting first if it is not."""
        if self.alive and self._socket_established():
            return True
        return self.reconnect()

    def _refresh_alive(self):
        """Update and return ``self.alive`` from the client's connection flags."""
        self.alive = bool(self.client.connected and self.client.authorized)
        return self.alive

    def _socket_established(self):
        """Return True if the RPC socket still reports an established connection.

        Linux-specific: TCP_INFO is read as an integer and compared against 1
        (presumably the kernel's TCP_ESTABLISHED state -- confirm against
        tcp(7)).  A socket error while probing counts as "not established",
        so the caller reconnects instead of failing the whole update.
        """
        try:
            state = self.client.rpc.sock.getsockopt(socket.IPPROTO_TCP, socket.TCP_INFO, 0)
        except socket.error:
            return False
        return state == 1

    def _count_state(self, data, mapping, state):
        """Increment the dimension that ``mapping`` assigns to ``state``.

        A state missing from ``mapping`` is skipped and logged once, so one
        unexpected value cannot abort the whole collection cycle.
        """
        dimension = mapping.get(state)
        if dimension is None:
            if state not in self._unknown_states:
                self._unknown_states.add(state)
                self.error('Ignoring unknown BOINC state: {0!r}'.format(state))
            return
        data[dimension] += 1

    def _get_data(self):
        """Return a dict of task counts per dimension, or None if the client is unreachable."""
        if not self.is_alive():
            return None
        try:
            results = self.client.get_tasks()
        except socket.error:
            self.error('Connection is dead')
            self.alive = False
            return None
        data = deepcopy(_DATA_TEMPLATE)
        for task in results:
            data['total'] += 1
            self._count_state(data, _TASK_MAP, task.state)
            try:
                if task.active_task:
                    data['active'] += 1
                    self._count_state(data, _SCHED_MAP, task.scheduler_state)
                    self._count_state(data, _PROC_MAP, task.active_task_state)
            except AttributeError:
                # Best effort: a task lacking the active-task attributes only
                # contributes to the counters updated before the gap.
                pass
        return data