mirror of
https://github.com/netdata/netdata.git
synced 2025-04-16 18:37:50 +00:00
Display uptime for processes (#6654)
* Get process uptime * Calculate target uptime * Update charts * Show collected data * Fix chart names * Update the documentation * Fix a flag value * Add an explanation note for the 'carried over uptime' chart * Move the functions for getting uptime to libnetdata * Rename the function for getting uptime * Remove redundant code * Fix starttime calculation * More accurate definition for the carried over uptime * fix group starttime calculation * Fix typo
This commit is contained in:
parent
1907c1486f
commit
c79112e853
6 changed files with 195 additions and 68 deletions
collectors
libnetdata/clocks
web/gui
|
@ -47,6 +47,11 @@ Each of these sections provides the same number of charts:
|
|||
- Threads Running
|
||||
- Processes Running
|
||||
- Pipes Open
|
||||
- Carried Over Uptime (since the Netdata restart)
|
||||
- Minimum Uptime
|
||||
- Average Uptime
|
||||
- Maximum Uptime
|
||||
|
||||
- Swap Memory
|
||||
- Swap Memory Used
|
||||
- Major Page Faults (i.e. swap activity)
|
||||
|
|
|
@ -260,6 +260,12 @@ struct target {
|
|||
kernel_uint_t openeventpolls;
|
||||
kernel_uint_t openother;
|
||||
|
||||
kernel_uint_t starttime;
|
||||
kernel_uint_t collected_starttime;
|
||||
kernel_uint_t uptime_min;
|
||||
kernel_uint_t uptime_sum;
|
||||
kernel_uint_t uptime_max;
|
||||
|
||||
unsigned int processes; // how many processes have been merged to this
|
||||
int exposed; // if set, we have sent this to netdata
|
||||
int hidden; // if set, we set the hidden flag on the dimension
|
||||
|
@ -345,7 +351,7 @@ struct pid_stat {
|
|||
// int64_t nice;
|
||||
int32_t num_threads;
|
||||
// int64_t itrealvalue;
|
||||
// kernel_uint_t starttime;
|
||||
kernel_uint_t collected_starttime;
|
||||
// kernel_uint_t vsize;
|
||||
// kernel_uint_t rss;
|
||||
// kernel_uint_t rsslim;
|
||||
|
@ -419,6 +425,8 @@ struct pid_stat {
|
|||
usec_t io_collected_usec;
|
||||
usec_t last_io_collected_usec;
|
||||
|
||||
kernel_uint_t uptime;
|
||||
|
||||
char *fds_dirname; // the full directory name in /proc/PID/fd
|
||||
|
||||
char *stat_filename;
|
||||
|
@ -433,6 +441,8 @@ struct pid_stat {
|
|||
|
||||
size_t pagesize;
|
||||
|
||||
kernel_uint_t global_uptime;
|
||||
|
||||
// log each problem once per process
|
||||
// log flood protection flags (log_thrown)
|
||||
#define PID_LOG_IO 0x00000001
|
||||
|
@ -1421,7 +1431,8 @@ static inline int read_proc_pid_stat(struct pid_stat *p, void *ptr) {
|
|||
// p->nice = str2kernel_uint_t(procfile_lineword(ff, 0, 18));
|
||||
p->num_threads = (int32_t)str2uint32_t(procfile_lineword(ff, 0, 19));
|
||||
// p->itrealvalue = str2kernel_uint_t(procfile_lineword(ff, 0, 20));
|
||||
// p->starttime = str2kernel_uint_t(procfile_lineword(ff, 0, 21));
|
||||
p->collected_starttime = str2kernel_uint_t(procfile_lineword(ff, 0, 21)) / system_hz;
|
||||
p->uptime = (global_uptime > p->collected_starttime)?(global_uptime - p->collected_starttime):0;
|
||||
// p->vsize = str2kernel_uint_t(procfile_lineword(ff, 0, 22));
|
||||
// p->rss = str2kernel_uint_t(procfile_lineword(ff, 0, 23));
|
||||
// p->rsslim = str2kernel_uint_t(procfile_lineword(ff, 0, 24));
|
||||
|
@ -1490,6 +1501,8 @@ cleanup:
|
|||
return 0;
|
||||
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
static inline int read_proc_pid_io(struct pid_stat *p, void *ptr) {
|
||||
(void)ptr;
|
||||
#ifdef __FreeBSD__
|
||||
|
@ -2634,6 +2647,12 @@ static int collect_data_for_all_processes(void) {
|
|||
collect_data_for_pid(pid, &procbase[i]);
|
||||
}
|
||||
#else
|
||||
static char uptime_filename[FILENAME_MAX + 1] = "";
|
||||
if(*uptime_filename == '\0')
|
||||
snprintfz(uptime_filename, FILENAME_MAX, "%s/proc/uptime", netdata_configured_host_prefix);
|
||||
|
||||
global_uptime = (kernel_uint_t)(uptime_msec(uptime_filename) / MSEC_PER_SEC);
|
||||
|
||||
char dirname[FILENAME_MAX + 1];
|
||||
|
||||
snprintfz(dirname, FILENAME_MAX, "%s/proc", netdata_configured_host_prefix);
|
||||
|
@ -2879,6 +2898,11 @@ static size_t zero_all_targets(struct target *root) {
|
|||
w->openother = 0;
|
||||
}
|
||||
|
||||
w->collected_starttime = 0;
|
||||
w->uptime_min = 0;
|
||||
w->uptime_sum = 0;
|
||||
w->uptime_max = 0;
|
||||
|
||||
if(unlikely(w->root_pid)) {
|
||||
struct pid_on_target *pid_on_target_to_free, *pid_on_target = w->root_pid;
|
||||
|
||||
|
@ -3032,6 +3056,11 @@ static inline void aggregate_pid_on_target(struct target *w, struct pid_stat *p,
|
|||
w->processes++;
|
||||
w->num_threads += p->num_threads;
|
||||
|
||||
if(!w->collected_starttime || p->collected_starttime < w->collected_starttime) w->collected_starttime = p->collected_starttime;
|
||||
if(!w->uptime_min || p->uptime < w->uptime_min) w->uptime_min = p->uptime;
|
||||
w->uptime_sum += p->uptime;
|
||||
if(!w->uptime_max || w->uptime_max < p->uptime) w->uptime_max = p->uptime;
|
||||
|
||||
if(unlikely(debug_enabled || w->debug_enabled)) {
|
||||
debug_log_int("aggregating '%s' pid %d on target '%s' utime=" KERNEL_UINT_FORMAT ", stime=" KERNEL_UINT_FORMAT ", gtime=" KERNEL_UINT_FORMAT ", cutime=" KERNEL_UINT_FORMAT ", cstime=" KERNEL_UINT_FORMAT ", cgtime=" KERNEL_UINT_FORMAT ", minflt=" KERNEL_UINT_FORMAT ", majflt=" KERNEL_UINT_FORMAT ", cminflt=" KERNEL_UINT_FORMAT ", cmajflt=" KERNEL_UINT_FORMAT "", p->comm, p->pid, w->name, p->utime, p->stime, p->gtime, p->cutime, p->cstime, p->cgtime, p->minflt, p->majflt, p->cminflt, p->cmajflt);
|
||||
|
||||
|
@ -3042,6 +3071,19 @@ static inline void aggregate_pid_on_target(struct target *w, struct pid_stat *p,
|
|||
}
|
||||
}
|
||||
|
||||
static inline void post_aggregate_targets(struct target *root) {
|
||||
struct target *w;
|
||||
for (w = root; w ; w = w->next) {
|
||||
if(w->collected_starttime) {
|
||||
if (!w->starttime || w->collected_starttime < w->starttime) {
|
||||
w->starttime = w->collected_starttime;
|
||||
}
|
||||
} else {
|
||||
w->starttime = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void calculate_netdata_statistics(void) {
|
||||
|
||||
apply_apps_groups_targets_inheritance();
|
||||
|
@ -3102,6 +3144,10 @@ static void calculate_netdata_statistics(void) {
|
|||
aggregate_pid_fds_on_targets(p);
|
||||
}
|
||||
|
||||
post_aggregate_targets(apps_groups_root_target);
|
||||
post_aggregate_targets(users_root_target);
|
||||
post_aggregate_targets(groups_root_target);
|
||||
|
||||
cleanup_exited_pids();
|
||||
}
|
||||
|
||||
|
@ -3457,6 +3503,36 @@ static void send_collected_data_to_netdata(struct target *root, const char *type
|
|||
}
|
||||
send_END();
|
||||
|
||||
#ifndef __FreeBSD__
|
||||
send_BEGIN(type, "uptime", dt);
|
||||
for (w = root; w ; w = w->next) {
|
||||
if(unlikely(w->exposed && w->processes))
|
||||
send_SET(w->name, (global_uptime > w->starttime)?(global_uptime - w->starttime):0);
|
||||
}
|
||||
send_END();
|
||||
|
||||
send_BEGIN(type, "uptime_min", dt);
|
||||
for (w = root; w ; w = w->next) {
|
||||
if(unlikely(w->exposed && w->processes))
|
||||
send_SET(w->name, w->uptime_min);
|
||||
}
|
||||
send_END();
|
||||
|
||||
send_BEGIN(type, "uptime_avg", dt);
|
||||
for (w = root; w ; w = w->next) {
|
||||
if(unlikely(w->exposed && w->processes))
|
||||
send_SET(w->name, w->processes?(w->uptime_sum / w->processes):0);
|
||||
}
|
||||
send_END();
|
||||
|
||||
send_BEGIN(type, "uptime_max", dt);
|
||||
for (w = root; w ; w = w->next) {
|
||||
if(unlikely(w->exposed && w->processes))
|
||||
send_SET(w->name, w->uptime_max);
|
||||
}
|
||||
send_END();
|
||||
#endif
|
||||
|
||||
send_BEGIN(type, "mem", dt);
|
||||
for (w = root; w ; w = w->next) {
|
||||
if(unlikely(w->exposed && w->processes))
|
||||
|
@ -3615,6 +3691,32 @@ static void send_charts_updates_to_netdata(struct target *root, const char *type
|
|||
fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name);
|
||||
}
|
||||
|
||||
#ifndef __FreeBSD__
|
||||
fprintf(stdout, "CHART %s.uptime '' '%s Carried Over Uptime' 'seconds' processes %s.uptime line 20008 %d\n", type, title, type, update_every);
|
||||
for (w = root; w ; w = w->next) {
|
||||
if(unlikely(w->exposed))
|
||||
fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name);
|
||||
}
|
||||
|
||||
fprintf(stdout, "CHART %s.uptime_min '' '%s Minimum Uptime' 'seconds' processes %s.uptime_min line 20009 %d\n", type, title, type, update_every);
|
||||
for (w = root; w ; w = w->next) {
|
||||
if(unlikely(w->exposed))
|
||||
fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name);
|
||||
}
|
||||
|
||||
fprintf(stdout, "CHART %s.uptime_avg '' '%s Average Uptime' 'seconds' processes %s.uptime_avg line 20010 %d\n", type, title, type, update_every);
|
||||
for (w = root; w ; w = w->next) {
|
||||
if(unlikely(w->exposed))
|
||||
fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name);
|
||||
}
|
||||
|
||||
fprintf(stdout, "CHART %s.uptime_max '' '%s Maximum Uptime' 'seconds' processes %s.uptime_max line 20011 %d\n", type, title, type, update_every);
|
||||
for (w = root; w ; w = w->next) {
|
||||
if(unlikely(w->exposed))
|
||||
fprintf(stdout, "DIMENSION %s '' absolute 1 1\n", w->name);
|
||||
}
|
||||
#endif
|
||||
|
||||
fprintf(stdout, "CHART %s.cpu_user '' '%s CPU User Time (%d%% = %d core%s)' 'percentage' cpu %s.cpu_user stacked 20020 %d\n", type, title, (processors * 100), processors, (processors>1)?"s":"", type, update_every);
|
||||
for (w = root; w ; w = w->next) {
|
||||
if(unlikely(w->exposed))
|
||||
|
|
|
@ -2,76 +2,17 @@
|
|||
|
||||
#include "plugin_proc.h"
|
||||
|
||||
static inline collected_number uptime_from_boottime(void) {
|
||||
#ifdef CLOCK_BOOTTIME_IS_AVAILABLE
|
||||
return now_boottime_usec() / 1000;
|
||||
#else
|
||||
error("uptime cannot be read from CLOCK_BOOTTIME on this system.");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
static procfile *read_proc_uptime_ff = NULL;
|
||||
static inline collected_number read_proc_uptime(void) {
|
||||
if(unlikely(!read_proc_uptime_ff)) {
|
||||
char filename[FILENAME_MAX + 1];
|
||||
snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/uptime");
|
||||
|
||||
read_proc_uptime_ff = procfile_open(config_get("plugin:proc:/proc/uptime", "filename to monitor", filename), " \t", PROCFILE_FLAG_DEFAULT);
|
||||
if(unlikely(!read_proc_uptime_ff)) return 0;
|
||||
}
|
||||
|
||||
read_proc_uptime_ff = procfile_readall(read_proc_uptime_ff);
|
||||
if(unlikely(!read_proc_uptime_ff)) return 0;
|
||||
|
||||
if(unlikely(procfile_lines(read_proc_uptime_ff) < 1)) {
|
||||
error("/proc/uptime has no lines.");
|
||||
return 0;
|
||||
}
|
||||
if(unlikely(procfile_linewords(read_proc_uptime_ff, 0) < 1)) {
|
||||
error("/proc/uptime has less than 1 word in it.");
|
||||
return 0;
|
||||
}
|
||||
|
||||
return (collected_number)(strtold(procfile_lineword(read_proc_uptime_ff, 0, 0), NULL) * 1000.0);
|
||||
}
|
||||
|
||||
int do_proc_uptime(int update_every, usec_t dt) {
|
||||
(void)dt;
|
||||
|
||||
static int use_boottime = -1;
|
||||
static char *uptime_filename = NULL;
|
||||
if(!uptime_filename) {
|
||||
char filename[FILENAME_MAX + 1];
|
||||
snprintfz(filename, FILENAME_MAX, "%s%s", netdata_configured_host_prefix, "/proc/uptime");
|
||||
|
||||
if(unlikely(use_boottime == -1)) {
|
||||
collected_number uptime_boottime = uptime_from_boottime();
|
||||
collected_number uptime_proc = read_proc_uptime();
|
||||
|
||||
long long delta = (long long)uptime_boottime - (long long)uptime_proc;
|
||||
if(delta < 0) delta = -delta;
|
||||
|
||||
if(delta <= 1000 && uptime_boottime != 0) {
|
||||
procfile_close(read_proc_uptime_ff);
|
||||
info("Using now_boottime_usec() for uptime (dt is %lld ms)", delta);
|
||||
use_boottime = 1;
|
||||
}
|
||||
else if(uptime_proc != 0) {
|
||||
info("Using /proc/uptime for uptime (dt is %lld ms)", delta);
|
||||
use_boottime = 0;
|
||||
}
|
||||
else {
|
||||
error("Cannot find any way to read uptime on this system.");
|
||||
return 1;
|
||||
}
|
||||
uptime_filename = config_get("plugin:proc:/proc/uptime", "filename to monitor", filename);
|
||||
}
|
||||
|
||||
collected_number uptime;
|
||||
if(use_boottime)
|
||||
uptime = uptime_from_boottime();
|
||||
else
|
||||
uptime = read_proc_uptime();
|
||||
|
||||
|
||||
// --------------------------------------------------------------------
|
||||
|
||||
static RRDSET *st = NULL;
|
||||
static RRDDIM *rd = NULL;
|
||||
|
||||
|
@ -97,7 +38,7 @@ int do_proc_uptime(int update_every, usec_t dt) {
|
|||
else
|
||||
rrdset_next(st);
|
||||
|
||||
rrddim_set_by_pointer(st, rd, uptime);
|
||||
rrddim_set_by_pointer(st, rd, uptime_msec(uptime_filename));
|
||||
|
||||
rrdset_done(st);
|
||||
|
||||
|
|
|
@ -210,3 +210,68 @@ int sleep_usec(usec_t usec) {
|
|||
return ret;
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline collected_number uptime_from_boottime(void) {
|
||||
#ifdef CLOCK_BOOTTIME_IS_AVAILABLE
|
||||
return now_boottime_usec() / 1000;
|
||||
#else
|
||||
error("uptime cannot be read from CLOCK_BOOTTIME on this system.");
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
static procfile *read_proc_uptime_ff = NULL;
|
||||
static inline collected_number read_proc_uptime(char *filename) {
|
||||
if(unlikely(!read_proc_uptime_ff)) {
|
||||
read_proc_uptime_ff = procfile_open(filename, " \t", PROCFILE_FLAG_DEFAULT);
|
||||
if(unlikely(!read_proc_uptime_ff)) return 0;
|
||||
}
|
||||
|
||||
read_proc_uptime_ff = procfile_readall(read_proc_uptime_ff);
|
||||
if(unlikely(!read_proc_uptime_ff)) return 0;
|
||||
|
||||
if(unlikely(procfile_lines(read_proc_uptime_ff) < 1)) {
|
||||
error("/proc/uptime has no lines.");
|
||||
return 0;
|
||||
}
|
||||
if(unlikely(procfile_linewords(read_proc_uptime_ff, 0) < 1)) {
|
||||
error("/proc/uptime has less than 1 word in it.");
|
||||
return 0;
|
||||
}
|
||||
|
||||
return (collected_number)(strtold(procfile_lineword(read_proc_uptime_ff, 0, 0), NULL) * 1000.0);
|
||||
}
|
||||
|
||||
inline collected_number uptime_msec(char *filename){
|
||||
static int use_boottime = -1;
|
||||
|
||||
if(unlikely(use_boottime == -1)) {
|
||||
collected_number uptime_boottime = uptime_from_boottime();
|
||||
collected_number uptime_proc = read_proc_uptime(filename);
|
||||
|
||||
long long delta = (long long)uptime_boottime - (long long)uptime_proc;
|
||||
if(delta < 0) delta = -delta;
|
||||
|
||||
if(delta <= 1000 && uptime_boottime != 0) {
|
||||
procfile_close(read_proc_uptime_ff);
|
||||
info("Using now_boottime_usec() for uptime (dt is %lld ms)", delta);
|
||||
use_boottime = 1;
|
||||
}
|
||||
else if(uptime_proc != 0) {
|
||||
info("Using /proc/uptime for uptime (dt is %lld ms)", delta);
|
||||
use_boottime = 0;
|
||||
}
|
||||
else {
|
||||
error("Cannot find any way to read uptime on this system.");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
collected_number uptime;
|
||||
if(use_boottime)
|
||||
uptime = uptime_from_boottime();
|
||||
else
|
||||
uptime = read_proc_uptime(filename);
|
||||
|
||||
return uptime;
|
||||
}
|
||||
|
|
|
@ -136,4 +136,6 @@ extern int sleep_usec(usec_t usec);
|
|||
*/
|
||||
void test_clock_boottime(void);
|
||||
|
||||
extern collected_number uptime_msec(char *filename);
|
||||
|
||||
#endif /* NETDATA_CLOCKS_H */
|
||||
|
|
|
@ -985,6 +985,10 @@ netdataDashboard.context = {
|
|||
height: 2.0
|
||||
},
|
||||
|
||||
'apps.uptime': {
|
||||
info: 'Carried over process group uptime since the Netdata restart. The period of time within which at least one process in the group was running.'
|
||||
},
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// USERS
|
||||
|
||||
|
@ -1008,6 +1012,10 @@ netdataDashboard.context = {
|
|||
height: 2.0
|
||||
},
|
||||
|
||||
'users.uptime': {
|
||||
info: 'Carried over process group uptime since the Netdata restart. The period of time within which at least one process in the group was running.'
|
||||
},
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// GROUPS
|
||||
|
||||
|
@ -1020,7 +1028,7 @@ netdataDashboard.context = {
|
|||
},
|
||||
|
||||
'groups.vmem': {
|
||||
info: 'Virtual memory allocated per user group. Please check <a href="https://github.com/netdata/netdata/tree/master/daemon#virtual-memory" target="_blank">this article</a> for more information.'
|
||||
info: 'Virtual memory allocated per user group since the Netdata restart. Please check <a href="https://github.com/netdata/netdata/tree/master/daemon#virtual-memory" target="_blank">this article</a> for more information.'
|
||||
},
|
||||
|
||||
'groups.preads': {
|
||||
|
@ -1031,6 +1039,10 @@ netdataDashboard.context = {
|
|||
height: 2.0
|
||||
},
|
||||
|
||||
'groups.uptime': {
|
||||
info: 'Carried over process group uptime. The period of time within which at least one process in the group was running.'
|
||||
},
|
||||
|
||||
// ------------------------------------------------------------------------
|
||||
// NETWORK QoS
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue