0
0
Fork 0
mirror of https://github.com/netdata/netdata.git synced 2025-04-16 18:37:50 +00:00

go.d nvidia_smi: add loop mode

This commit is contained in:
Ilya Mashchenko 2024-08-12 13:47:43 +03:00 committed by GitHub
parent 67824a216b
commit 63631b495b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 205 additions and 10 deletions

View file

@ -23,6 +23,12 @@
"type": "number",
"minimum": 0.5,
"default": 10
},
"loop_mode": {
"title": "Loop Mode",
"description": "When enabled, `nvidia-smi` is executed continuously in a separate thread using the `-l` option.",
"type": "boolean",
"default": true
}
},
"required": [
@ -42,6 +48,9 @@
},
"timeout": {
"ui:help": "Accepts decimals for precise control (e.g., type 1.5 for 1.5 seconds)."
},
"loop_mode": {
"ui:help": "In loop mode, `nvidia-smi` will repeatedly query GPU data at specified intervals, defined by the `-l SEC` or `--loop=SEC` parameter, rather than just running the query once. This enables ongoing performance tracking by putting the application to sleep between queries."
}
}
}

View file

@ -3,9 +3,14 @@
package nvidia_smi
import (
"bufio"
"bytes"
"context"
"errors"
"fmt"
"os/exec"
"strconv"
"sync"
"time"
"github.com/netdata/netdata/go/plugins/logger"
@ -13,14 +18,30 @@ import (
// nvidiaSmiBinary abstracts the way GPU information is obtained from the
// nvidia-smi binary, so the collector can use different execution strategies
// (and a mock in tests) behind the same two calls.
type nvidiaSmiBinary interface {
// queryGPUInfo returns the raw nvidia-smi output as bytes, or an error if
// no output can be provided. The loop-mode implementation returns the
// latest complete <nvidia_smi_log> document it has read.
queryGPUInfo() ([]byte, error)
// stop releases whatever the implementation holds. The loop-mode
// implementation kills its child process; nvidiaSmiExec's stop simply
// returns nil.
stop() error
}
func newNvidiaSmiExec(path string, cfg Config, log *logger.Logger) (*nvidiaSmiExec, error) {
return &nvidiaSmiExec{
Logger: log,
binPath: path,
timeout: cfg.Timeout.Duration(),
}, nil
// newNvidiaSmiBinary builds the nvidia-smi runner selected by cfg.LoopMode.
//
// When loop mode is enabled it creates an nvidiaSmiLoopExec, starts it via
// run() and returns it only if run() succeeded; a run() error is returned
// as is, with a nil runner. When loop mode is disabled it returns an
// nvidiaSmiExec configured with the binary path and cfg.Timeout, and never
// fails.
func newNvidiaSmiBinary(path string, cfg Config, log *logger.Logger) (nvidiaSmiBinary, error) {
	if cfg.LoopMode {
		loop := &nvidiaSmiLoopExec{
			Logger:      log,
			binPath:     path,
			updateEvery: cfg.UpdateEvery,
			// How long run() waits for the first complete sample.
			firstSampleTimeout: 3 * time.Second,
		}

		if err := loop.run(); err != nil {
			return nil, err
		}

		return loop, nil
	}

	return &nvidiaSmiExec{
		Logger:  log,
		binPath: path,
		timeout: cfg.Timeout.Duration(),
	}, nil
}
type nvidiaSmiExec struct {
@ -44,3 +65,149 @@ func (e *nvidiaSmiExec) queryGPUInfo() ([]byte, error) {
return bs, nil
}
// stop implements nvidiaSmiBinary. It does nothing and always returns nil.
func (e *nvidiaSmiExec) stop() error {
	return nil
}
// nvidiaSmiLoopExec runs nvidia-smi as a long-lived child process in loop
// mode and keeps the most recent complete sample in memory, so that
// queryGPUInfo can return it without starting a new process.
type nvidiaSmiLoopExec struct {
*logger.Logger
// binPath is the nvidia-smi executable passed to exec.Command.
binPath string
// updateEvery is used to cap the loop interval handed to `-l`: the
// interval is the smaller of updateEvery and 5 seconds.
updateEvery int
// firstSampleTimeout bounds how long run() waits for the first sample.
firstSampleTimeout time.Duration
// cmd is the running child process; set by run() and reset to nil by stop().
cmd *exec.Cmd
// done is closed by the reader goroutine when it stops reading output.
done chan struct{}
// mux guards lastSample, which is written by the reader goroutine and
// read by queryGPUInfo.
mux sync.Mutex
// lastSample holds the latest complete <nvidia_smi_log> document.
lastSample string
}
// queryGPUInfo returns a copy of the latest sample stored by the reader
// goroutine. It never blocks on the child process: if the done channel is
// already closed (the reader goroutine has finished) it returns an error,
// otherwise it returns whatever sample is currently stored.
func (e *nvidiaSmiLoopExec) queryGPUInfo() ([]byte, error) {
	// Non-blocking check: a closed done channel means no new samples will
	// ever arrive.
	select {
	case <-e.done:
		return nil, errors.New("process has already exited")
	default:
	}

	e.mux.Lock()
	sample := e.lastSample
	e.mux.Unlock()

	return []byte(sample), nil
}
// run launches nvidia-smi in loop mode ("-q -x -l SEC") and starts a
// goroutine that reads its stdout line by line, assembling each
// <nvidia_smi_log> ... </nvidia_smi_log> document and publishing the latest
// complete one in e.lastSample (guarded by e.mux).
//
// run blocks until the first complete document has been stored and then
// returns nil. It returns an error when the stdout pipe or the process
// cannot be created, when the reader goroutine ends before any sample was
// stored, or when no sample arrives within e.firstSampleTimeout; in the
// last two cases stop is called before returning.
func (e *nvidiaSmiLoopExec) run() error {
	// The loop interval is the smaller of the collection interval and 5s.
	secs := 5
	if e.updateEvery < secs {
		secs = e.updateEvery
	}
	// A zero or negative update interval must not reach the command line.
	if secs < 1 {
		secs = 1
	}

	cmd := exec.Command(e.binPath, "-q", "-x", "-l", strconv.Itoa(secs))

	e.Debugf("executing '%s'", cmd)

	r, err := cmd.StdoutPipe()
	if err != nil {
		return err
	}

	if err := cmd.Start(); err != nil {
		return err
	}

	// Buffered so the reader never blocks on the notification; later sends
	// are simply dropped by the non-blocking select below.
	firstSample := make(chan struct{}, 1)
	done := make(chan struct{})
	e.cmd = cmd
	e.done = done

	go func() {
		// Closing done tells queryGPUInfo and stop that reading has ended.
		defer close(done)

		var buf bytes.Buffer
		var insideLog bool
		var emptyRows int64
		var outsideLogRows int64

		// Number of consecutive empty lines, or lines outside a document,
		// tolerated before the output is considered garbage.
		const unexpectedRowsLimit = 500

		sc := bufio.NewScanner(r)

		for sc.Scan() {
			line := sc.Text()

			if !insideLog {
				outsideLogRows++
			} else {
				outsideLogRows = 0
			}

			if line == "" {
				emptyRows++
			} else {
				emptyRows = 0
			}

			if outsideLogRows >= unexpectedRowsLimit || emptyRows >= unexpectedRowsLimit {
				e.Errorf("unexpected output from nvidia-smi loop: outside log rows %d, empty rows %d", outsideLogRows, emptyRows)
				break
			}

			switch {
			case line == "<nvidia_smi_log>":
				// Start of a new document: drop anything left over.
				insideLog = true
				buf.Reset()

				buf.WriteString(line)
				buf.WriteByte('\n')
			case line == "</nvidia_smi_log>":
				// End of the document: publish it as the current sample.
				insideLog = false

				buf.WriteString(line)

				e.mux.Lock()
				e.lastSample = buf.String()
				e.mux.Unlock()

				buf.Reset()

				select {
				case firstSample <- struct{}{}:
				default:
				}
			case insideLog:
				buf.WriteString(line)
				buf.WriteByte('\n')
			}
		}

		// Surface the reason the scanner stopped instead of ending sampling
		// silently. An over-long line is genuinely unexpected output; other
		// read errors are typically the pipe being closed during stop(), so
		// they are only logged at debug level.
		if err := sc.Err(); err != nil {
			if errors.Is(err, bufio.ErrTooLong) {
				e.Errorf("reading nvidia-smi loop output: %v", err)
			} else {
				e.Debugf("reading nvidia-smi loop output: %v", err)
			}
		}
	}()

	select {
	case <-e.done:
		_ = e.stop()
		return errors.New("process exited before the first sample was collected")
	case <-time.After(e.firstSampleTimeout):
		_ = e.stop()
		return errors.New("timed out waiting for first sample")
	case <-firstSample:
		return nil
	}
}
// stop terminates the nvidia-smi child process and waits for the reader
// goroutine to finish.
//
// It returns nil immediately when no process is recorded. Otherwise it
// kills the process, waits for it, clears e.cmd and then waits up to two
// seconds for the done channel to be closed, returning an error if that
// does not happen in time.
func (e *nvidiaSmiLoopExec) stop() error {
	cmd := e.cmd
	if cmd == nil || cmd.Process == nil {
		return nil
	}

	// Best effort: errors from Kill and Wait are deliberately ignored.
	_ = cmd.Process.Kill()
	_ = cmd.Wait()
	e.cmd = nil

	timeout := time.After(2 * time.Second)

	select {
	case <-timeout:
		return errors.New("timed out waiting for process to exit")
	case <-e.done:
		return nil
	}
}

View file

@ -18,5 +18,5 @@ func (nv *NvidiaSmi) initNvidiaSmiExec() (nvidiaSmiBinary, error) {
binPath = path
}
return newNvidiaSmiExec(binPath, nv.Config, nv.Logger)
return newNvidiaSmiBinary(binPath, nv.Config, nv.Logger)
}

View file

@ -73,6 +73,10 @@ modules:
description: nvidia_smi binary execution timeout.
default_value: 2
required: false
- name: loop_mode
description: "When enabled, `nvidia-smi` is executed continuously in a separate thread using the `-l` option."
default_value: true
required: false
examples:
folding:
title: Config

View file

@ -29,7 +29,8 @@ func init() {
func New() *NvidiaSmi {
return &NvidiaSmi{
Config: Config{
Timeout: web.Duration(time.Second * 10),
Timeout: web.Duration(time.Second * 10),
LoopMode: true,
},
binName: "nvidia-smi",
charts: &module.Charts{},
@ -43,6 +44,7 @@ type Config struct {
UpdateEvery int `yaml:"update_every,omitempty" json:"update_every"`
Timeout web.Duration `yaml:"timeout,omitempty" json:"timeout"`
BinaryPath string `yaml:"binary_path" json:"binary_path"`
LoopMode bool `yaml:"loop_mode,omitempty" json:"loop_mode"`
}
type NvidiaSmi struct {
@ -103,4 +105,11 @@ func (nv *NvidiaSmi) Collect() map[string]int64 {
return mx
}
// Cleanup has an empty body in this form: it performs no work.
func (nv *NvidiaSmi) Cleanup() {}
// Cleanup stops the nvidia-smi runner, if one is set, and clears the
// reference to it. An error returned by stop is logged, not propagated.
func (nv *NvidiaSmi) Cleanup() {
	if nv.exec == nil {
		return
	}

	if err := nv.exec.stop(); err != nil {
		nv.Errorf("cleanup: %v", err)
	}

	nv.exec = nil
}

View file

@ -418,6 +418,10 @@ func (m *mockNvidiaSmi) queryGPUInfo() ([]byte, error) {
return m.gpuInfo, nil
}
// stop implements nvidiaSmiBinary for the mock; it always returns nil.
func (m *mockNvidiaSmi) stop() error { return nil }
// prepareCaseMIGA100 replaces the collector's runner with a mock that
// serves the dataXMLA100SXM4MIG fixture as GPU info.
func prepareCaseMIGA100(nv *NvidiaSmi) {
	mock := &mockNvidiaSmi{gpuInfo: dataXMLA100SXM4MIG}
	nv.exec = mock
}

View file

@ -1,5 +1,6 @@
{
"update_every": 123,
"timeout": 123.123,
"binary_path": "ok"
"binary_path": "ok",
"loop_mode": true
}

View file

@ -1,3 +1,4 @@
update_every: 123
timeout: 123.123
binary_path: "ok"
loop_mode: true