-
Notifications
You must be signed in to change notification settings - Fork 1.2k
cpu plugin: add linux-specific "guest" states. #2439
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
3dda0fa
f00e980
a44d70c
6b95249
092b2d8
ac7a73e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -98,9 +98,11 @@ | |
#define COLLECTD_CPU_STATE_INTERRUPT 5 | ||
#define COLLECTD_CPU_STATE_SOFTIRQ 6 | ||
#define COLLECTD_CPU_STATE_STEAL 7 | ||
#define COLLECTD_CPU_STATE_IDLE 8 | ||
#define COLLECTD_CPU_STATE_ACTIVE 9 /* sum of (!idle) */ | ||
#define COLLECTD_CPU_STATE_MAX 10 /* #states */ | ||
#define COLLECTD_CPU_STATE_GUEST 8 | ||
#define COLLECTD_CPU_STATE_GUEST_NICE 9 | ||
#define COLLECTD_CPU_STATE_IDLE 10 | ||
#define COLLECTD_CPU_STATE_ACTIVE 11 /* sum of (!idle) */ | ||
#define COLLECTD_CPU_STATE_MAX 12 /* #states */ | ||
|
||
#if HAVE_STATGRAB_H | ||
#include <statgrab.h> | ||
|
@@ -119,7 +121,7 @@ | |
|
||
static const char *cpu_state_names[] = {"user", "system", "wait", "nice", | ||
"swap", "interrupt", "softirq", "steal", | ||
"idle", "active"}; | ||
"guest", "guest_nice", "idle", "active"}; | ||
|
||
#ifdef PROCESSOR_CPU_LOAD_INFO | ||
static mach_port_t port_host; | ||
|
@@ -193,9 +195,12 @@ static _Bool report_by_cpu = 1; | |
static _Bool report_by_state = 1; | ||
static _Bool report_percent = 0; | ||
static _Bool report_num_cpu = 0; | ||
static _Bool report_guest = 0; | ||
static _Bool subtract_guest = 1; | ||
|
||
static const char *config_keys[] = {"ReportByCpu", "ReportByState", | ||
"ReportNumCpu", "ValuesPercentage"}; | ||
"ReportNumCpu", "ValuesPercentage", | ||
"ReportGuestState", "SubtractGuestState"}; | ||
static int config_keys_num = STATIC_ARRAY_SIZE(config_keys); | ||
8000 |
|
|
static int cpu_config(char const *key, char const *value) /* {{{ */ | ||
|
@@ -208,6 +213,10 @@ static int cpu_config(char const *key, char const *value) /* {{{ */ | |
report_by_state = IS_TRUE(value) ? 1 : 0; | ||
else if (strcasecmp(key, "ReportNumCpu") == 0) | ||
report_num_cpu = IS_TRUE(value) ? 1 : 0; | ||
else if (strcasecmp(key, "ReportGuestState") == 0) | ||
report_guest = IS_TRUE(value) ? 1 : 0; | ||
else if (strcasecmp(key, "SubtractGuestState") == 0) | ||
subtract_guest = IS_TRUE(value) ? 1 : 0; | ||
else | ||
return -1; | ||
|
||
|
@@ -524,7 +533,7 @@ static void cpu_commit_without_aggregation(void) /* {{{ */ | |
static void cpu_commit(void) /* {{{ */ | ||
{ | ||
gauge_t global_rates[COLLECTD_CPU_STATE_MAX] = { | ||
NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN /* Batman! */ | ||
NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN, NAN /* Batman! */ | ||
}; | ||
|
||
if (report_num_cpu) | ||
|
@@ -545,7 +554,8 @@ static void cpu_commit(void) /* {{{ */ | |
for (size_t cpu_num = 0; cpu_num < global_cpu_num; cpu_num++) { | ||
cpu_state_t *this_cpu_states = get_cpu_state(cpu_num, 0); | ||
gauge_t local_rates[COLLECTD_CPU_STATE_MAX] = {NAN, NAN, NAN, NAN, NAN, | ||
NAN, NAN, NAN, NAN, NAN}; | ||
NAN, NAN, NAN, NAN, NAN, | ||
NAN, NAN }; | ||
|
||
for (size_t state = 0; state < COLLECTD_CPU_STATE_MAX; state++) | ||
if (this_cpu_states[state].has_value) | ||
|
@@ -632,7 +642,7 @@ static int cpu_read(void) { | |
FILE *fh; | ||
char buf[1024]; | ||
|
||
char *fields[9]; | ||
char *fields[11]; | ||
int numfields; | ||
|
||
if ((fh = fopen("/proc/stat", "r")) == NULL) { | ||
|
@@ -648,14 +658,15 @@ static int cpu_read(void) { | |
if ((buf[3] < '0') || (buf[3] > '9')) | ||
continue; | ||
|
||
numfields = strsplit(buf, fields, 9); | ||
numfields = strsplit(buf, fields, STATIC_ARRAY_SIZE(fields)); | ||
if (numfields < 5) | ||
continue; | ||
|
||
cpu = atoi(fields[0] + 3); | ||
|
||
cpu_stage(cpu, COLLECTD_CPU_STATE_USER, (derive_t)atoll(fields[1]), now); | ||
cpu_stage(cpu, COLLECTD_CPU_STATE_NICE, (derive_t)atoll(fields[2]), now); | ||
/* Do not stage User and Nice immediately: we may need to alter them later: */ | ||
long long user_value = atoll(fields[1]); | ||
long long nice_value = atoll(fields[2]); | ||
cpu_stage(cpu, COLLECTD_CPU_STATE_SYSTEM, (derive_t)atoll(fields[3]), now); | ||
cpu_stage(cpu, COLLECTD_CPU_STATE_IDLE, (derive_t)atoll(fields[4]), now); | ||
|
||
|
@@ -665,11 +676,40 @@ static int cpu_read(void) { | |
now); | ||
cpu_stage(cpu, COLLECTD_CPU_STATE_SOFTIRQ, (derive_t)atoll(fields[7]), | ||
now); | ||
} | ||
|
||
if (numfields >= 9) | ||
cpu_stage(cpu, COLLECTD_CPU_STATE_STEAL, (derive_t)atoll(fields[8]), | ||
now); | ||
if (numfields >= 9) { /* Steal (since Linux 2.6.11) */ | ||
cpu_stage(cpu, COLLECTD_CPU_STATE_STEAL, (derive_t)atoll(fields[8]), now); | ||
} | ||
|
||
if (numfields >= 10) { /* Guest (since Linux 2.6.24) */ | ||
if (report_guest) { | ||
long long value = atoll(fields[9]); | ||
cpu_stage(cpu, COLLECTD_CPU_STATE_GUEST, (derive_t)value, now); | ||
/* Guest is included in User; optionally subtract Guest from User: */ | ||
if (subtract_guest) { | ||
user_value -= value; | ||
if (user_value < 0) user_value = 0; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi! Do you think (or, possible, know) is that condition is possible at practice? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Kernel code:
So, that will be possible only on counter overflow, which should be practically impossible. |
||
} | ||
} | ||
} | ||
|
||
if (numfields >= 11) { /* Guest_nice (since Linux 2.6.33) */ | ||
if (report_guest) { | ||
long long value = atoll(fields[10]); | ||
cpu_stage(cpu, COLLECTD_CPU_STATE_GUEST_NICE, (derive_t)value, now); | ||
/* Guest_nice is included in Nice; optionally subtract Guest_nice from | ||
Nice: */ | ||
if (subtract_guest) { | ||
nice_value -= value; | ||
if (nice_value < 0) nice_value = 0; | ||
} | ||
} | ||
} | ||
|
||
/* Eventually stage User and Nice: */ | ||
cpu_stage(cpu, COLLECTD_CPU_STATE_USER, (derive_t)user_value, now); | ||
cpu_stage(cpu, COLLECTD_CPU_STATE_NICE, (derive_t)nice_value, now); | ||
} | ||
fclose(fh); | ||
/* }}} #endif defined(KERNEL_LINUX) */ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Having two options to control this behavior feels a bit like premature optimization. In other words, are users actually going to use those options? (This is an honest question, not a rhetorical one; I know nothing about the new guest counters ;)
I'd prefer to implement the behavior that
ReportGuestState true; SubtractGuestState true
has and implement config options only if (and when) users ask for it.Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have implemented these options to address the following concern:
I believe having these options keeps you safe from any angry user, as they enable end users to revert to whatever old behaviour they were used to. So I would argue that, yes, indeed, some people will use them. Specifically, these counters are notoriously useless within VMs: people with 10 000+ VMs at $RandomCloudCompany may not want to store 20 000+ extra useless metrics. I may even use ReportGuestState myself, depending on the host I deploy collectd to :)
However, we surely can change the default values. The actual question is: what is the policy of the collectd project regarding new metrics within existing plugins?
ReportGuestState false; SubtractGuestState true
ReportGuestState true; SubtractGuestState true
ReportGuestState true; SubtractGuestState false
The choice is yours :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Conservative: New versions need to be backwards compatible (with the exception of new major versions). Backwards compatibility is defined by the user observable behavior, i.e. the previous config file needs to still work and all metrics that were reported previously need to still be reported.
I'd consider this a bit of an edge case, sine the previous metrics are still present. But it sounds like having users make a conscious decision to enable these metrics is reasonable.
You've made a good case for
ReportGuestState
– who would useSubtractGuestState
?Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
End users who only want the raw values straight out from the kernel to graph them the way they want in one or several graphs. Typically,
SubtractGuestState true
is useful for people who want to stack all CPU metrics in a single graph and be done with it. But some prefer to mix stacked and non-stacked values on a single graph. Some prefer a separate graph to reflect the host vs guests CPU consumption.I tend to focus on graph aficionados but alerting is to be considered too: some alerts and thresholds may assume that gues 10000 t is included in user, others may assume that user reflects only the host-specific activity.
That being said, since you aim for a conservative policy, what will most likely happen is that end users will notice the lack of guest and guest_nice (like I did), have a look at the documentation, be glad to find they can enable them easily and hopefully learn at the same time that subtlety about them being part of user and nice (I didn't know).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
SGTM, let's go with both options.