Skip to content

Commit

Permalink
metrics: prevent negative counter from iowait decrease
Browse files Browse the repository at this point in the history
The iowait metric obtained from `/proc/stat` can under some circumstances
decrease. The relevant condition is when an interrupt arrives on a different
core than the one that gets woken up for the IO, and a particular counter in the
kernel for that core gets interrupted. This is documented in the man page for
the `proc(5)` pseudo-filesystem, and considered an unfortunate behavior that
can't be changed for the sake of ABI compatibility.

In Nomad, we get the current "busy" time (everything except for idle) and
compare it to the previous busy time to get the counter incremeent. If the
iowait counter decreases and the idle counter increases more than the increase
in the total busy time, we can get a negative total. This previously caused a
panic in our metrics collection (see #15861) but that is being prevented by
reporting an error message.

Fix the bug by putting a zero floor on the values we return from the host CPU
stats calculator.

Backport-of: #18835
  • Loading branch information
tgross committed Oct 24, 2023
1 parent 7eb5b70 commit 41d2460
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 15 deletions.
10 changes: 5 additions & 5 deletions client/stats/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,7 @@ func (h *HostCpuStatsCalculator) Calculate(times cpu.TimesStat) (idle float64, u
currentIdle := times.Idle
currentUser := times.User
currentSystem := times.System
currentTotal := times.Total()
currentTotal := times.Total() // this is Idle + currentBusy
currentBusy := times.User + times.System + times.Nice + times.Iowait + times.Irq +
times.Softirq + times.Steal + times.Guest + times.GuestNice

Expand All @@ -288,16 +288,16 @@ func (h *HostCpuStatsCalculator) Calculate(times cpu.TimesStat) (idle float64, u
total = ((currentBusy - h.prevBusy) / deltaTotal) * 100

// Protect against any invalid values
if math.IsNaN(idle) || math.IsInf(idle, 0) {
if math.IsNaN(idle) || math.IsInf(idle, 0) || idle < 0.0 {
idle = 100.0
}
if math.IsNaN(user) || math.IsInf(user, 0) {
if math.IsNaN(user) || math.IsInf(user, 0) || user < 0.0 {
user = 0.0
}
if math.IsNaN(system) || math.IsInf(system, 0) {
if math.IsNaN(system) || math.IsInf(system, 0) || system < 0.0 {
system = 0.0
}
if math.IsNaN(total) || math.IsInf(total, 0) {
if math.IsNaN(total) || math.IsInf(total, 0) || total < 0 {
total = 0.0
}

Expand Down
39 changes: 29 additions & 10 deletions client/stats/host_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"testing"

"github.com/shirou/gopsutil/v3/cpu"
"github.com/shoenig/test/must"
)

func TestHostCpuStatsCalculator_Nan(t *testing.T) {
Expand All @@ -20,16 +21,34 @@ func TestHostCpuStatsCalculator_Nan(t *testing.T) {
calculator.Calculate(times)
idle, user, system, total := calculator.Calculate(times)

if idle != 100.0 {
t.Errorf("idle: Expected: %f, Got %f", 100.0, idle)
}
if user != 0.0 {
t.Errorf("user: Expected: %f, Got %f", 0.0, user)
}
if system != 0.0 {
t.Errorf("system: Expected: %f, Got %f", 0.0, system)
must.Eq(t, 100.0, idle, must.Sprint("unexpected idle stats"))
must.Eq(t, 0.0, user, must.Sprint("unexpected user stats"))
must.Eq(t, 0.0, system, must.Sprint("unexpected system stats"))
must.Eq(t, 0.0, total, must.Sprint("unexpected total stats"))
}

func TestHostCpuStatsCalculator_DecreasedIOWait(t *testing.T) {
times := cpu.TimesStat{
CPU: "cpu0",
User: 20000,
Nice: 100,
System: 9000,
Idle: 370000,
Iowait: 700,
}
if total != 0.0 {
t.Errorf("total: Expected: %f, Got %f", 0.0, total)

calculator := NewHostCpuStatsCalculator()
calculator.Calculate(times)

times = cpu.TimesStat{
CPU: "cpu0",
User: 20000,
Nice: 100,
System: 9000,
Idle: 380000,
Iowait: 600,
}

_, _, _, total := calculator.Calculate(times)
must.GreaterEq(t, 0.0, total, must.Sprint("total must never be negative"))
}

0 comments on commit 41d2460

Please sign in to comment.