mirror of
https://github.com/netdata/netdata.git
synced 2025-05-09 19:40:24 +00:00

This adds a python.d module for monitoring Berkeley Open Infrastructure for Network Computing (BOINC) clients. This is the framework utilized by a vast majority of publicly run distributed computing projects (including SETI@Home and World Community Grid). This only tracks the number of tasks in various task states on the system, as almost everything else is trivial to track using other existing plugins (such as apps.plugin). It utilizes the same RPC mechanism that the official GUI management tool for BOINC clients uses. In most cases, the data this tracks is not hugely interesting (it doesn't change very often, and generally doesn't change very much on any given system), but there are a handful of situations that it lets us provide alerts for that are of particular interest to users running BOINC on headless systems (namely computational errors, reporting failures, and running out of tasks to run). Internally, BOINC has 3 state machines that a given task will advance through: the task state, the scheduling state, and the process state. The task state tracks the high-level status of the task (is it new, ready to run, finished running and being reported, hit a computational error, etc.). The scheduling state tracks whether the scheduling algorithm says the task can run or not, and the process state tracks the status of the associated process for the task that does the actual computing. This module provides charts tracking task counts in each state for each of these state machines, as well as one counting the total number of local tasks and how many tasks are 'active' (that is, how many are being considered for processing). We also provide a set of default alarms to alert on compute errors, upload/reporting failures, an empty local task queue, and a low number of active tasks. Currently, the module only runs on Linux systems because it relies on a Linux-specific hack to detect a disconnected socket, but it can monitor BOINC clients on any platform remotely (including Windows systems).
178 lines
5.7 KiB
Python
178 lines
5.7 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Description: BOINC netdata python.d module
|
|
# Author: Austin S. Hemmelgarn (Ferroin)
|
|
# SPDX-License-Identifier: GPL-3.0+
|
|
|
|
import platform
|
|
import socket
|
|
|
|
from copy import deepcopy
|
|
|
|
from bases.FrameworkServices.SimpleService import SimpleService
|
|
|
|
from third_party import boinc_client
|
|
|
|
|
|
# Chart ids, listed in the sequence they are handed to the service as its
# chart order. Every id must have a matching entry in CHARTS.
ORDER = [
    'tasks',
    'states',
    'sched_states',
    'process_states',
]
|
|
|
|
# Chart definitions keyed by chart id. Each chart carries an 'options' list
# and a 'lines' list; the first element of every line is the dimension id,
# which is also the key the service fills in the collected data dictionary.
CHARTS = {
    # Overall number of tasks known to the client and how many are active.
    'tasks': {
        'options': [None, 'Overall Tasks', 'tasks', 'boinc', 'boinc.tasks', 'line'],
        'lines': [
            ['total', 'Total', 'absolute', 1, 1],
            ['active', 'Active', 'absolute', 1, 1],
        ],
    },
    # Task counts broken down by task (result) state.
    'states': {
        'options': [None, 'Tasks per State', 'tasks', 'boinc', 'boinc.states', 'line'],
        'lines': [
            ['new', 'New', 'absolute', 1, 1],
            ['downloading', 'Downloading', 'absolute', 1, 1],
            ['downloaded', 'Ready to Run', 'absolute', 1, 1],
            ['comperror', 'Compute Errors', 'absolute', 1, 1],
            ['uploading', 'Uploading', 'absolute', 1, 1],
            ['uploaded', 'Uploaded', 'absolute', 1, 1],
            ['aborted', 'Aborted', 'absolute', 1, 1],
            ['upload_failed', 'Failed Uploads', 'absolute', 1, 1],
        ],
    },
    # Task counts broken down by scheduler state.
    'sched_states': {
        'options': [None, 'Tasks per Scheduler State', 'tasks', 'boinc', 'boinc.sched', 'line'],
        'lines': [
            ['uninit_sched', 'Uninitialized', 'absolute', 1, 1],
            ['preempted', 'Preempted', 'absolute', 1, 1],
            ['scheduled', 'Scheduled', 'absolute', 1, 1],
        ],
    },
    # Task counts broken down by the state of the task's process.
    'process_states': {
        'options': [None, 'Tasks per Process State', 'tasks', 'boinc', 'boinc.process', 'line'],
        'lines': [
            ['uninit_proc', 'Uninitialized', 'absolute', 1, 1],
            ['executing', 'Executing', 'absolute', 1, 1],
            ['suspended', 'Suspended', 'absolute', 1, 1],
            ['aborting', 'Aborted', 'absolute', 1, 1],
            ['quit', 'Quit', 'absolute', 1, 1],
            ['copy_pending', 'Copy Pending', 'absolute', 1, 1],
        ],
    },
}
|
|
|
|
# A simple template used for pre-loading the return dictionary to make
|
|
# the _get_data() method simpler.
|
|
_DATA_TEMPLATE = {
|
|
'total': 0,
|
|
'active': 0,
|
|
'new': 0,
|
|
'downloading': 0,
|
|
'downloaded': 0,
|
|
'comperror': 0,
|
|
'uploading': 0,
|
|
'uploaded': 0,
|
|
'aborted': 0,
|
|
'upload_failed': 0,
|
|
'uninit_sched': 0,
|
|
'preempted': 0,
|
|
'scheduled': 0,
|
|
'uninit_proc': 0,
|
|
'executing': 0,
|
|
'suspended': 0,
|
|
'aborting': 0,
|
|
'quit': 0,
|
|
'copy_pending': 0
|
|
}
|
|
|
|
# Translation table: BOINC task (result) state -> dimension id on the
# 'states' chart.
_TASK_MAP = {
    boinc_client.ResultState.NEW: 'new',
    boinc_client.ResultState.FILES_DOWNLOADING: 'downloading',
    boinc_client.ResultState.FILES_DOWNLOADED: 'downloaded',
    boinc_client.ResultState.COMPUTE_ERROR: 'comperror',
    boinc_client.ResultState.FILES_UPLOADING: 'uploading',
    boinc_client.ResultState.FILES_UPLOADED: 'uploaded',
    boinc_client.ResultState.ABORTED: 'aborted',
    boinc_client.ResultState.UPLOAD_FAILED: 'upload_failed',
}
|
|
|
|
# Translation table: BOINC scheduler state -> dimension id on the
# 'sched_states' chart.
_SCHED_MAP = {
    boinc_client.CpuSched.UNINITIALIZED: 'uninit_sched',
    boinc_client.CpuSched.PREEMPTED: 'preempted',
    boinc_client.CpuSched.SCHEDULED: 'scheduled',
}
|
|
|
|
# Map process states to dimensions of the 'process_states' chart.
#
# ABORT_PENDING must map to 'aborting' (the process-state dimension), not
# 'aborted': 'aborted' is the *task-state* dimension on the 'states' chart,
# so using it here would inflate that series and leave 'aborting' stuck at 0.
_PROC_MAP = {
    boinc_client.Process.UNINITIALIZED: 'uninit_proc',
    boinc_client.Process.EXECUTING: 'executing',
    boinc_client.Process.SUSPENDED: 'suspended',
    boinc_client.Process.ABORT_PENDING: 'aborting',
    boinc_client.Process.QUIT_PENDING: 'quit',
    boinc_client.Process.COPY_PENDING: 'copy_pending',
}
|
|
|
|
|
|
class Service(SimpleService):
    """python.d service that counts the tasks reported by a BOINC client.

    Configuration keys read from the job configuration:
      host     -- client address (default 'localhost')
      port     -- client RPC port (default 0)
      password -- RPC password (default '')
    """

    def __init__(self, configuration=None, name=None):
        SimpleService.__init__(self, configuration=configuration, name=name)
        self.order = ORDER
        self.definitions = CHARTS
        self.host = self.configuration.get('host', 'localhost')
        self.port = self.configuration.get('port', 0)
        self.password = self.configuration.get('password', '')
        self.client = boinc_client.BoincClient(host=self.host, port=self.port, passwd=self.password)
        # True while the client is believed to be connected and authorized.
        self.alive = False
        # (kind, state) pairs already reported as unmapped, so that each
        # unknown state is logged once rather than on every collection.
        self._unmapped = set()

    def _refresh_alive(self):
        """Recompute self.alive from the client's connection flags and return it."""
        self.alive = bool(self.client.connected and self.client.authorized)
        return self.alive

    def _count(self, data, mapping, state, kind):
        """Increment the dimension that `mapping` assigns to `state`.

        A state with no entry in `mapping` is logged (once per distinct
        state) and otherwise ignored, so a single unexpected value cannot
        discard the whole sample.
        """
        dimension = mapping.get(state)
        if dimension is None:
            marker = (kind, state)
            if marker not in self._unmapped:
                self._unmapped.add(marker)
                self.error('Ignoring unrecognized {0} state: {1}'.format(kind, state))
            return
        data[dimension] += 1

    def check(self):
        """Return True when the module can run: Linux host and a usable client connection."""
        # is_alive() relies on the Linux-only TCP_INFO socket option.
        if platform.system() != 'Linux':
            self.error('Only supported on Linux.')
            return False
        self.connect()
        return self._refresh_alive()

    def connect(self):
        """Open the connection to the BOINC client."""
        self.client.connect()

    def reconnect(self):
        """Drop the current connection, connect again and return the new alive state."""
        try:
            self.client.disconnect()
        except socket.error:
            # Best effort: the old connection may already be gone.
            pass
        self.client.connect()
        return self._refresh_alive()

    def is_alive(self):
        """Return True if the connection is usable, reconnecting first if it is not."""
        if self.alive:
            try:
                # Linux-specific check: read the start of the socket's
                # TCP_INFO data as an integer. Any value other than 1 is
                # treated as "no longer connected".
                tcp_state = self.client.rpc.sock.getsockopt(socket.IPPROTO_TCP, socket.TCP_INFO, 0)
            except socket.error:
                # The socket itself is unusable; fall through to reconnect.
                tcp_state = None
            if tcp_state == 1:
                return True
        return self.reconnect()

    def _get_data(self):
        """Collect one sample.

        Returns a dict with one counter per chart dimension, or None when
        the client is unreachable.
        """
        if not self.is_alive():
            return None

        try:
            results = self.client.get_tasks()
        except socket.error:
            self.error('Connection is dead')
            self.alive = False
            return None

        # The template is a flat dict of ints, so a shallow copy is sufficient.
        data = dict(_DATA_TEMPLATE)
        for task in results:
            data['total'] += 1
            self._count(data, _TASK_MAP, task.state, 'task')
            try:
                if task.active_task:
                    data['active'] += 1
                    self._count(data, _SCHED_MAP, task.scheduler_state, 'scheduler')
                    self._count(data, _PROC_MAP, task.active_task_state, 'process')
            except AttributeError:
                # Deliberate best effort: tasks that lack these attributes
                # (presumably ones the client never started -- TODO confirm)
                # contribute only to the total and task-state counts.
                pass
        return data
|