Skip to content

Commit

Permalink
Merge pull request #2067 from prometheus/superq/idle_jump
Browse files Browse the repository at this point in the history
Handle small backwards jumps in CPU idle
  • Loading branch information
SuperQ authored Jul 7, 2021
2 parents 35a2de2 + 73c9a10 commit 2510378
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 7 deletions.
24 changes: 17 additions & 7 deletions collector/cpu_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,10 +46,14 @@ type cpuCollector struct {
cpuBugsIncludeRegexp *regexp.Regexp
}

// Idle jump back limit in seconds.
const jumpBackSeconds = 3.0

var (
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
enableCPUInfo = kingpin.Flag("collector.cpu.info", "Enables metric cpu_info").Bool()
flagsInclude = kingpin.Flag("collector.cpu.info.flags-include", "Filter the `flags` field in cpuInfo with a value that must be a regular expression").String()
bugsInclude = kingpin.Flag("collector.cpu.info.bugs-include", "Filter the `bugs` field in cpuInfo with a value that must be a regular expression").String()
jumpBackDebugMessage = fmt.Sprintf("CPU Idle counter jumped backwards more than %f seconds, possible hotplug event, resetting CPU stats", jumpBackSeconds)
)

func init() {
Expand Down Expand Up @@ -302,6 +306,7 @@ func (c *cpuCollector) updateStat(ch chan<- prometheus.Metric) error {

// updateCPUStats updates the internal cache of CPU stats.
func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {

// Acquire a lock to update the stats.
c.cpuStatsMutex.Lock()
defer c.cpuStatsMutex.Unlock()
Expand All @@ -312,12 +317,17 @@ func (c *cpuCollector) updateCPUStats(newStats []procfs.CPUStat) {
}

for i, n := range newStats {
// If idle jumps backwards, assume we had a hotplug event and reset the stats for this CPU.
if n.Idle < c.cpuStats[i].Idle {
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards, possible hotplug event, resetting CPU stats", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
// If idle jumps backwards by more than X seconds, assume we had a hotplug event and reset the stats for this CPU.
if (c.cpuStats[i].Idle - n.Idle) >= jumpBackSeconds {
level.Debug(c.logger).Log("msg", jumpBackDebugMessage, "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
c.cpuStats[i] = procfs.CPUStat{}
}
c.cpuStats[i].Idle = n.Idle

if n.Idle >= c.cpuStats[i].Idle {
c.cpuStats[i].Idle = n.Idle
} else {
level.Debug(c.logger).Log("msg", "CPU Idle counter jumped backwards", "cpu", i, "old_value", c.cpuStats[i].Idle, "new_value", n.Idle)
}

if n.User >= c.cpuStats[i].User {
c.cpuStats[i].User = n.User
Expand Down
105 changes: 105 additions & 0 deletions collector/cpu_linux_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright 2021 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// +build !nocpu

package collector

import (
"reflect"
"testing"

"github.com/go-kit/log"
"github.com/prometheus/procfs"
)

func makeTestCPUCollector(s []procfs.CPUStat) *cpuCollector {
dup := make([]procfs.CPUStat, len(s))
copy(dup, s)
return &cpuCollector{
logger: log.NewNopLogger(),
cpuStats: dup,
}
}

func TestCPU(t *testing.T) {
firstCPUStat := []procfs.CPUStat{{
User: 100.0,
Nice: 100.0,
System: 100.0,
Idle: 100.0,
Iowait: 100.0,
IRQ: 100.0,
SoftIRQ: 100.0,
Steal: 100.0,
Guest: 100.0,
GuestNice: 100.0,
}}

c := makeTestCPUCollector(firstCPUStat)
want := []procfs.CPUStat{{
User: 101.0,
Nice: 101.0,
System: 101.0,
Idle: 101.0,
Iowait: 101.0,
IRQ: 101.0,
SoftIRQ: 101.0,
Steal: 101.0,
Guest: 101.0,
GuestNice: 101.0,
}}
c.updateCPUStats(want)
got := c.cpuStats
if !reflect.DeepEqual(want, got) {
t.Fatalf("should have %v CPU Stat: got %v", want, got)
}

c = makeTestCPUCollector(firstCPUStat)
jumpBack := []procfs.CPUStat{{
User: 99.9,
Nice: 99.9,
System: 99.9,
Idle: 99.9,
Iowait: 99.9,
IRQ: 99.9,
SoftIRQ: 99.9,
Steal: 99.9,
Guest: 99.9,
GuestNice: 99.9,
}}
c.updateCPUStats(jumpBack)
got = c.cpuStats
if reflect.DeepEqual(jumpBack, got) {
t.Fatalf("should have %v CPU Stat: got %v", firstCPUStat, got)
}

c = makeTestCPUCollector(firstCPUStat)
resetIdle := []procfs.CPUStat{{
User: 102.0,
Nice: 102.0,
System: 102.0,
Idle: 1.0,
Iowait: 102.0,
IRQ: 102.0,
SoftIRQ: 102.0,
Steal: 102.0,
Guest: 102.0,
GuestNice: 102.0,
}}
c.updateCPUStats(resetIdle)
got = c.cpuStats
if !reflect.DeepEqual(resetIdle, got) {
t.Fatalf("should have %v CPU Stat: got %v", resetIdle, got)
}
}

0 comments on commit 2510378

Please sign in to comment.