Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix num cpu #1561

Merged
merged 12 commits into from
Feb 20, 2020
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

### Changes

* [ENHANCEMENT] Add `--collector.perf.cpus` to allow setting the CPU list for perf stats. #1561
* [CHANGE] Add `--collector.netdev.device-whitelist`. #1279
* [CHANGE] Refactor mdadm collector #1403
* [CHANGE] Add `mountaddr` label to NFS metrics. #1417
Expand Down
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,18 @@ Depending on the configured value different metrics will be available, for most
cases `0` will provide the most complete set. For more information see [`man 2
perf_event_open`](http://man7.org/linux/man-pages/man2/perf_event_open.2.html).

By default, the perf collector will only collect metrics of the CPUs that
`node_exporter` is running on (i.e.
[`runtime.NumCPU`](https://golang.org/pkg/runtime/#NumCPU)). If this is
insufficient (e.g. if you run `node_exporter` with its CPU affinity set to
specific CPUs), you can specify a list of alternate CPUs by using the
`--collector.perf.cpus` flag. For example, to collect metrics on CPUs 2-6, you
would specify: `--collector.perf --collector.perf.cpus=2-6`. The CPU
configuration is zero indexed and can also take a stride value; for example,
`--collector.perf --collector.perf.cpus=1-10:5` would collect on CPUs
1 and 6 (the stride is added arithmetically from the start of the range).


Name | Description | OS
---------|-------------|----
buddyinfo | Exposes statistics of memory fragments as reported by /proc/buddyinfo. | Linux
Expand Down
165 changes: 129 additions & 36 deletions collector/perf_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,25 @@
package collector

import (
"fmt"
"runtime"
"strconv"
"strings"

"github.com/go-kit/kit/log"
"github.com/hodgesds/perf-utils"
"github.com/prometheus/client_golang/prometheus"
kingpin "gopkg.in/alecthomas/kingpin.v2"
)

const (
perfSubsystem = "perf"
)

var (
perfCPUsFlag = kingpin.Flag("collector.perf.cpus", "List of CPUs from which perf metrics should be collected").Default("").String()
)

// init registers the perf collector with the exporter's collector registry.
// It is disabled by default (defaultDisabled) because opening perf events
// requires elevated kernel permissions on most systems.
func init() {
	registerCollector(perfSubsystem, defaultDisabled, NewPerfCollector)
}
Expand All @@ -36,40 +43,123 @@ func init() {
// settings not all profiler values may be exposed on the target system at any
// given time.
type perfCollector struct {
perfHwProfilers map[int]perf.HardwareProfiler
perfSwProfilers map[int]perf.SoftwareProfiler
perfCacheProfilers map[int]perf.CacheProfiler
desc map[string]*prometheus.Desc
logger log.Logger
hwProfilerCPUMap map[*perf.HardwareProfiler]int
swProfilerCPUMap map[*perf.SoftwareProfiler]int
cacheProfilerCPUMap map[*perf.CacheProfiler]int
perfHwProfilers map[int]*perf.HardwareProfiler
perfSwProfilers map[int]*perf.SoftwareProfiler
perfCacheProfilers map[int]*perf.CacheProfiler
desc map[string]*prometheus.Desc
logger log.Logger
}

// perfCPUFlagToCPUs returns a set of CPUs for the perf collectors to monitor.
// The flag is a comma-separated list of entries, each of which is either a
// single CPU ("2"), an inclusive range ("2-6"), or a range with a stride
// ("10-20:5", which yields 10, 15, 20). Note the stride steps arithmetically
// from the start of the range, so "1-10:5" yields 1 and 6.
func perfCPUFlagToCPUs(cpuFlag string) ([]int, error) {
	var err error
	cpus := []int{}
	for _, subset := range strings.Split(cpuFlag, ",") {
		// First parse a single CPU.
		if !strings.Contains(subset, "-") {
			cpu, err := strconv.Atoi(subset)
			if err != nil {
				return nil, err
			}
			cpus = append(cpus, cpu)
			continue
		}

		stride := 1
		// Handle strides, i.e. 1-10:5 should yield 1,6.
		strideSet := strings.Split(subset, ":")
		if len(strideSet) > 2 {
			// More than one ':' in an entry is malformed.
			return nil, fmt.Errorf("invalid flag value %q", cpuFlag)
		}
		if len(strideSet) == 2 {
			stride, err = strconv.Atoi(strideSet[1])
			if err != nil {
				return nil, err
			}
			// A zero or negative stride would make the range loop
			// below never terminate; reject it explicitly.
			if stride < 1 {
				return nil, fmt.Errorf("invalid stride in flag value %q", cpuFlag)
			}
		}

		rangeSet := strings.Split(strideSet[0], "-")
		if len(rangeSet) != 2 {
			return nil, fmt.Errorf("invalid flag value %q", cpuFlag)
		}
		start, err := strconv.Atoi(rangeSet[0])
		if err != nil {
			return nil, err
		}
		end, err := strconv.Atoi(rangeSet[1])
		if err != nil {
			return nil, err
		}
		// An inverted range would silently produce no CPUs; treat it
		// as a configuration error instead.
		if start > end {
			return nil, fmt.Errorf("invalid range in flag value %q", cpuFlag)
		}
		for i := start; i <= end; i += stride {
			cpus = append(cpus, i)
		}
	}

	return cpus, nil
}

// NewPerfCollector returns a new perf based collector, it creates a profiler
// per CPU.
func NewPerfCollector(logger log.Logger) (Collector, error) {
c := &perfCollector{
perfHwProfilers: map[int]perf.HardwareProfiler{},
perfSwProfilers: map[int]perf.SoftwareProfiler{},
perfCacheProfilers: map[int]perf.CacheProfiler{},
logger: logger,
collector := &perfCollector{
perfHwProfilers: map[int]*perf.HardwareProfiler{},
perfSwProfilers: map[int]*perf.SoftwareProfiler{},
perfCacheProfilers: map[int]*perf.CacheProfiler{},
hwProfilerCPUMap: map[*perf.HardwareProfiler]int{},
swProfilerCPUMap: map[*perf.SoftwareProfiler]int{},
cacheProfilerCPUMap: map[*perf.CacheProfiler]int{},
logger: logger,
}
ncpus := runtime.NumCPU()
for i := 0; i < ncpus; i++ {
// Use -1 to profile all processes on the CPU, see:
// man perf_event_open
c.perfHwProfilers[i] = perf.NewHardwareProfiler(-1, i)
if err := c.perfHwProfilers[i].Start(); err != nil {
return c, err
}
c.perfSwProfilers[i] = perf.NewSoftwareProfiler(-1, i)
if err := c.perfSwProfilers[i].Start(); err != nil {
return c, err
}
c.perfCacheProfilers[i] = perf.NewCacheProfiler(-1, i)
if err := c.perfCacheProfilers[i].Start(); err != nil {
return c, err

if perfCPUsFlag != nil && *perfCPUsFlag != "" {
cpus, err := perfCPUFlagToCPUs(*perfCPUsFlag)
if err != nil {
return nil, err
}
for _, cpu := range cpus {
// Use -1 to profile all processes on the CPU, see:
// man perf_event_open
hwProf := perf.NewHardwareProfiler(-1, cpu)
if err := hwProf.Start(); err != nil {
return nil, err
}
collector.perfHwProfilers[cpu] = &hwProf

swProf := perf.NewSoftwareProfiler(-1, cpu)
if err := swProf.Start(); err != nil {
return nil, err
}
collector.perfSwProfilers[cpu] = &swProf

cacheProf := perf.NewCacheProfiler(-1, cpu)
if err := cacheProf.Start(); err != nil {
return nil, err
}
collector.perfCacheProfilers[cpu] = &cacheProf
}
} else {
for i := 0; i < runtime.NumCPU(); i++ {
hwProf := perf.NewHardwareProfiler(-1, i)
if err := hwProf.Start(); err != nil {
return nil, err
}
collector.perfHwProfilers[i] = &hwProf

swProf := perf.NewSoftwareProfiler(-1, i)
if err := swProf.Start(); err != nil {
return nil, err
}
collector.perfSwProfilers[i] = &swProf

cacheProf := perf.NewCacheProfiler(-1, i)
if err := cacheProf.Start(); err != nil {
return nil, err
}
collector.perfCacheProfilers[i] = &cacheProf
}
}
c.desc = map[string]*prometheus.Desc{

collector.desc = map[string]*prometheus.Desc{
"cpucycles_total": prometheus.NewDesc(
prometheus.BuildFQName(
namespace,
Expand Down Expand Up @@ -312,7 +402,7 @@ func NewPerfCollector(logger log.Logger) (Collector, error) {
),
}

return c, nil
return collector, nil
}

// Update implements the Collector interface and will collect metrics per CPU.
Expand All @@ -333,9 +423,10 @@ func (c *perfCollector) Update(ch chan<- prometheus.Metric) error {
}

func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error {
for cpu, profiler := range c.perfHwProfilers {
cpuStr := strconv.Itoa(cpu)
hwProfile, err := profiler.Profile()
for _, profiler := range c.perfHwProfilers {
cpuid := c.hwProfilerCPUMap[profiler]
cpuStr := fmt.Sprintf("%d", cpuid)
SuperQ marked this conversation as resolved.
Show resolved Hide resolved
hwProfile, err := (*profiler).Profile()
if err != nil {
return err
}
Expand Down Expand Up @@ -404,9 +495,10 @@ func (c *perfCollector) updateHardwareStats(ch chan<- prometheus.Metric) error {
}

func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error {
for cpu, profiler := range c.perfSwProfilers {
cpuStr := strconv.Itoa(cpu)
swProfile, err := profiler.Profile()
for _, profiler := range c.perfSwProfilers {
cpuid := c.swProfilerCPUMap[profiler]
cpuStr := fmt.Sprintf("%d", cpuid)
swProfile, err := (*profiler).Profile()
if err != nil {
return err
}
Expand Down Expand Up @@ -459,9 +551,10 @@ func (c *perfCollector) updateSoftwareStats(ch chan<- prometheus.Metric) error {
}

func (c *perfCollector) updateCacheStats(ch chan<- prometheus.Metric) error {
for cpu, profiler := range c.perfCacheProfilers {
cpuStr := strconv.Itoa(cpu)
cacheProfile, err := profiler.Profile()
for _, profiler := range c.perfCacheProfilers {
cpuid := c.cacheProfilerCPUMap[profiler]
cpuStr := fmt.Sprintf("%d", cpuid)
cacheProfile, err := (*profiler).Profile()
if err != nil {
return err
}
Expand Down
73 changes: 73 additions & 0 deletions collector/perf_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,3 +54,76 @@ func TestPerfCollector(t *testing.T) {
t.Fatal(err)
}
}

// TestPerfCPUFlagToCPUs checks that --collector.perf.cpus flag values are
// parsed into the expected CPU lists and that malformed values are rejected.
func TestPerfCPUFlagToCPUs(t *testing.T) {
	tests := []struct {
		name   string
		flag   string
		exCpus []int
		errStr string
	}{
		{
			name:   "valid single cpu",
			flag:   "1",
			exCpus: []int{1},
		},
		{
			name:   "valid range cpus",
			flag:   "1-5",
			exCpus: []int{1, 2, 3, 4, 5},
		},
		{
			name:   "valid double digit",
			flag:   "10",
			exCpus: []int{10},
		},
		{
			name:   "valid double digit range",
			flag:   "10-12",
			exCpus: []int{10, 11, 12},
		},
		{
			name:   "valid double digit stride",
			flag:   "10-20:5",
			exCpus: []int{10, 15, 20},
		},
		{
			name:   "invalid range",
			flag:   "1-2-3",
			errStr: `invalid flag value "1-2-3"`,
		},
	}

	for _, test := range tests {
		t.Run(test.name, func(t *testing.T) {
			cpus, err := perfCPUFlagToCPUs(test.flag)
			if test.errStr != "" {
				// Bug fix: this previously tested err != nil, which
				// failed precisely when the expected error occurred.
				if err == nil {
					t.Fatal("expected error to not be nil")
				}
				if test.errStr != err.Error() {
					t.Fatalf(
						"expected error %q, got %q",
						test.errStr,
						err.Error(),
					)
				}
				return
			}
			if err != nil {
				t.Fatal(err)
			}
			if len(cpus) != len(test.exCpus) {
				t.Fatalf(
					"expected cpus %v, got %v",
					test.exCpus,
					cpus,
				)
			}
			for i := range cpus {
				if test.exCpus[i] != cpus[i] {
					t.Fatalf(
						"expected cpus %v, got %v",
						test.exCpus,
						cpus,
					)
				}
			}
		})
	}
}