Skip to content

Commit

Permalink
Aggregation of perf metrics controlled by the percpu option
Browse files Browse the repository at this point in the history
Signed-off-by: Katarzyna Kujawa <katarzyna.kujawa@intel.com>
  • Loading branch information
katarzyna-z committed Jul 8, 2020
1 parent ab35e42 commit d2eb985
Show file tree
Hide file tree
Showing 3 changed files with 155 additions and 83 deletions.
164 changes: 94 additions & 70 deletions metrics/prometheus.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
package metrics

import (
"flag"
"fmt"
"regexp"
"strconv"
Expand All @@ -30,8 +29,6 @@ import (
"k8s.io/utils/clock"
)

var perfAggregateFlag = flag.Bool("perf_aggregate", false, "Enable core perf metrics aggregation by 'event' and 'id'")

// asFloat64 returns v converted to a float64.
// Values above 2^53 may lose precision, as with any uint64-to-float64 conversion.
func asFloat64(v uint64) float64 {
	converted := float64(v)
	return converted
}

Expand Down Expand Up @@ -1552,25 +1549,48 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
}...)
}
if includedMetrics.Has(container.PerfMetrics) {
if includedMetrics.Has(container.PerCpuUsageMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_perf_events_total",
help: "Perf event metric.",
valueType: prometheus.CounterValue,
extraLabels: []string{"cpu", "event"},
getValues: func(s *info.ContainerStats) metricValues {
return getPerCpuCorePerfEvents(s)
},
},
{
name: "container_perf_events_scaling_ratio",
help: "Perf event metric scaling ratio.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"cpu", "event"},
getValues: func(s *info.ContainerStats) metricValues {
return getPerCpuCoreScalingRatio(s)
},
}}...)
} else {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_perf_events_total",
help: "Perf event metric.",
valueType: prometheus.CounterValue,
extraLabels: []string{"cpu", "event"},
getValues: func(s *info.ContainerStats) metricValues {
return getAggregatedCorePerfEvents(s)
},
},
{
name: "container_perf_events_scaling_ratio",
help: "Perf event metric scaling ratio.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"cpu", "event"},
getValues: func(s *info.ContainerStats) metricValues {
return getAvgCoreScalingRatio(s)
},
}}...)
}
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_perf_events_total",
help: "Perf event metric.",
valueType: prometheus.CounterValue,
extraLabels: []string{"cpu", "event"},
getValues: func(s *info.ContainerStats) metricValues {
return getCorePerfEvents(s)
},
},
{
name: "container_perf_events_scaling_ratio",
help: "Perf event metric scaling ratio.",
valueType: prometheus.GaugeValue,
extraLabels: []string{"cpu", "event"},
getValues: func(s *info.ContainerStats) metricValues {
return getCoreScalingRatio(s)
},
},
{
name: "container_perf_uncore_events_total",
help: "Perf uncore event metric.",
Expand Down Expand Up @@ -1891,64 +1911,68 @@ func sanitizeLabelName(name string) string {
return invalidNameCharRE.ReplaceAllString(name, "_")
}

func getCorePerfEvents(s *info.ContainerStats) metricValues {
// getPerCpuCorePerfEvents builds one metric value for every entry in
// s.PerfStats, in the same order as the entries appear.
//
// Each value carries the entry's counter as a float64, the labels
// {cpu number as decimal string, event name}, and the timestamp of s.
func getPerCpuCorePerfEvents(s *info.ContainerStats) metricValues {
	stats := s.PerfStats
	// Capacity is known up front: exactly one output per perf stat.
	out := make(metricValues, 0, len(stats))
	for i := range stats {
		cpuLabel := strconv.Itoa(stats[i].Cpu)
		out = append(out, metricValue{
			value:     float64(stats[i].Value),
			labels:    []string{cpuLabel, stats[i].Name},
			timestamp: s.Timestamp,
		})
	}
	return out
}

// getPerCpuCoreScalingRatio builds one metric value for every entry in
// s.PerfStats, in the same order as the entries appear.
//
// Each value carries the entry's ScalingRatio unchanged, the labels
// {cpu number as decimal string, event name}, and the timestamp of s.
func getPerCpuCoreScalingRatio(s *info.ContainerStats) metricValues {
	stats := s.PerfStats
	// Capacity is known up front: exactly one output per perf stat.
	out := make(metricValues, 0, len(stats))
	for i := range stats {
		cpuLabel := strconv.Itoa(stats[i].Cpu)
		out = append(out, metricValue{
			value:     stats[i].ScalingRatio,
			labels:    []string{cpuLabel, stats[i].Name},
			timestamp: s.Timestamp,
		})
	}
	return out
}

func getAggregatedCorePerfEvents(s *info.ContainerStats) metricValues {
values := make(metricValues, 0)
if *perfAggregateFlag {
perfEventStatAgg := make(map[string]uint64)
// aggregate by event
for _, perfStat := range s.PerfStats {
perfEventStatAgg[perfStat.Name] += perfStat.Value
}
// create aggregated metrics
for perfEvent, perfValue := range perfEventStatAgg {
values = append(values, metricValue{
value: float64(perfValue),
labels: []string{"", perfEvent},
timestamp: s.Timestamp,
})
}

} else {
for _, metric := range s.PerfStats {
values = append(values, metricValue{
value: float64(metric.Value),
labels: []string{strconv.Itoa(metric.Cpu), metric.Name},
timestamp: s.Timestamp,
})
}
perfEventStatAgg := make(map[string]uint64)
// aggregate by event
for _, perfStat := range s.PerfStats {
perfEventStatAgg[perfStat.Name] += perfStat.Value
}
// create aggregated metrics
for perfEvent, perfValue := range perfEventStatAgg {
values = append(values, metricValue{
value: float64(perfValue),
labels: []string{"", perfEvent},
timestamp: s.Timestamp,
})
}
return values
}

func getCoreScalingRatio(s *info.ContainerStats) metricValues {
func getAvgCoreScalingRatio(s *info.ContainerStats) metricValues {
values := make(metricValues, 0)
if *perfAggregateFlag {
perfEventStatAgg := make(map[string][]float64)
// collect scaling ratios for event
for _, perfStat := range s.PerfStats {
perfEventStatAgg[perfStat.Name] = append(perfEventStatAgg[perfStat.Name], perfStat.ScalingRatio)
perfEventStatAgg := make(map[string][]float64)
// collect scaling ratios for event
for _, perfStat := range s.PerfStats {
perfEventStatAgg[perfStat.Name] = append(perfEventStatAgg[perfStat.Name], perfStat.ScalingRatio)
}
// calculate average scaling ratio
for perfEvent, perfScalingRatio := range perfEventStatAgg {
sumScalingRatio := 0.0
for _, scalingRatio := range perfScalingRatio {
sumScalingRatio += scalingRatio
}
// calculate average scaling ratio
for perfEvent, perfScalingRatio := range perfEventStatAgg {
sumScalingRatio := 0.0
for _, scalingRatio := range perfScalingRatio {
sumScalingRatio += scalingRatio
}

values = append(values, metricValue{
value: sumScalingRatio / float64(len(perfScalingRatio)),
labels: []string{"", perfEvent},
timestamp: s.Timestamp,
})
}
} else {
for _, metric := range s.PerfStats {
values = append(values, metricValue{
value: metric.ScalingRatio,
labels: []string{strconv.Itoa(metric.Cpu), metric.Name},
timestamp: s.Timestamp,
})
}
values = append(values, metricValue{
value: sumScalingRatio / float64(len(perfScalingRatio)),
labels: []string{"", perfEvent},
timestamp: s.Timestamp,
})
}
return values
}
37 changes: 24 additions & 13 deletions metrics/prometheus_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,21 @@ func TestPrometheusCollector(t *testing.T) {
testPrometheusCollector(t, reg, "testdata/prometheus_metrics")
}

func TestPrometheusCollectorWithPerfAggregated(t *testing.T) {
metrics := container.MetricSet{
container.PerfMetrics: struct{}{},
}
c := NewPrometheusCollector(testSubcontainersInfoProvider{}, func(container *info.ContainerInfo) map[string]string {
s := DefaultContainerLabels(container)
s["zone.name"] = "hello"
return s
}, metrics, now, v2.RequestOptions{})
reg := prometheus.NewRegistry()
reg.MustRegister(c)

testPrometheusCollector(t, reg, "testdata/prometheus_metrics_perf_aggregated")
}

func testPrometheusCollector(t *testing.T, gatherer prometheus.Gatherer, metricsFile string) {
wantMetrics, err := os.Open(metricsFile)
if err != nil {
Expand Down Expand Up @@ -123,7 +138,7 @@ func mockLabelFunc(*info.ContainerInfo) map[string]string {
return map[string]string{}
}

func TestGetCorePerfEvents(t *testing.T) {
func TestGetPerCpuCorePerfEvents(t *testing.T) {
containerStats := &info.ContainerStats{
Timestamp: time.Unix(1395066367, 0),
PerfStats: []info.PerfStat{
Expand Down Expand Up @@ -153,12 +168,11 @@ func TestGetCorePerfEvents(t *testing.T) {
},
},
}
metricVals := getCorePerfEvents(containerStats)
metricVals := getPerCpuCorePerfEvents(containerStats)
assert.Equal(t, 4, len(metricVals))
}

func TestGetCorePerfEventsAggregated(t *testing.T) {
*perfAggregateFlag = true
func TestGetPerCpuCoreScalingRatio(t *testing.T) {
containerStats := &info.ContainerStats{
Timestamp: time.Unix(1395066367, 0),
PerfStats: []info.PerfStat{
Expand Down Expand Up @@ -188,12 +202,11 @@ func TestGetCorePerfEventsAggregated(t *testing.T) {
},
},
}
metricVals := getCorePerfEvents(containerStats)
assert.Equal(t, 2, len(metricVals))
*perfAggregateFlag = false
metricVals := getPerCpuCoreScalingRatio(containerStats)
assert.Equal(t, 4, len(metricVals))
}

func TestGetCoreScalingRatio(t *testing.T) {
func TestGetCorePerfEventsAggregated(t *testing.T) {
containerStats := &info.ContainerStats{
Timestamp: time.Unix(1395066367, 0),
PerfStats: []info.PerfStat{
Expand Down Expand Up @@ -223,12 +236,11 @@ func TestGetCoreScalingRatio(t *testing.T) {
},
},
}
metricVals := getCoreScalingRatio(containerStats)
assert.Equal(t, 4, len(metricVals))
metricVals := getAggregatedCorePerfEvents(containerStats)
assert.Equal(t, 2, len(metricVals))
}

func TestGetCoreScalingRatioAverage(t *testing.T) {
*perfAggregateFlag = true
containerStats := &info.ContainerStats{
Timestamp: time.Unix(1395066367, 0),
PerfStats: []info.PerfStat{
Expand Down Expand Up @@ -258,7 +270,6 @@ func TestGetCoreScalingRatioAverage(t *testing.T) {
},
},
}
metricVals := getCoreScalingRatio(containerStats)
metricVals := getAvgCoreScalingRatio(containerStats)
assert.Equal(t, 2, len(metricVals))
*perfAggregateFlag = false
}
37 changes: 37 additions & 0 deletions metrics/testdata/prometheus_metrics_perf_aggregated
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# HELP cadvisor_version_info A metric with a constant '1' value labeled by kernel version, OS version, docker version, cadvisor version & cadvisor revision.
# TYPE cadvisor_version_info gauge
cadvisor_version_info{cadvisorRevision="abcdef",cadvisorVersion="0.16.0",dockerVersion="1.8.1",kernelVersion="4.1.6-200.fc22.x86_64",osVersion="Fedora 22 (Twenty Two)"} 1
# HELP container_last_seen Last time a container was seen by the exporter
# TYPE container_last_seen gauge
container_last_seen{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1.395066363e+09 1395066363000
# HELP container_perf_events_scaling_ratio Perf event metric scaling ratio.
# TYPE container_perf_events_scaling_ratio gauge
container_perf_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.75 1395066363000
container_perf_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions_retired",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 0.499999999995 1395066363000
# HELP container_perf_events_total Perf event metric.
# TYPE container_perf_events_total counter
container_perf_events_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 579 1395066363000
container_perf_events_total{container_env_foo_env="prod",container_label_foo_label="bar",cpu="",event="instructions_retired",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1110 1395066363000
# HELP container_perf_uncore_events_scaling_ratio Perf uncore event metric scaling ratio.
# TYPE container_perf_uncore_events_scaling_ratio gauge
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1 1395066363000
container_perf_uncore_events_scaling_ratio{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1 1395066363000
# HELP container_perf_uncore_events_total Perf uncore event metric.
# TYPE container_perf_uncore_events_total counter
container_perf_uncore_events_total{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="0",zone_name="hello"} 1.231231512e+09 1395066363000
container_perf_uncore_events_total{container_env_foo_env="prod",container_label_foo_label="bar",event="cas_count_read",id="testcontainer",image="test",name="testcontaineralias",pmu="uncore_imc_0",socket="1",zone_name="hello"} 1.111231331e+09 1395066363000
# HELP container_scrape_error 1 if there was an error while getting container metrics, 0 otherwise
# TYPE container_scrape_error gauge
container_scrape_error 0
# HELP container_spec_cpu_period CPU period of the container.
# TYPE container_spec_cpu_period gauge
container_spec_cpu_period{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 100000
# HELP container_spec_cpu_quota CPU quota of the container.
# TYPE container_spec_cpu_quota gauge
container_spec_cpu_quota{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 10000
# HELP container_spec_cpu_shares CPU share of the container.
# TYPE container_spec_cpu_shares gauge
container_spec_cpu_shares{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1000
# HELP container_start_time_seconds Start time of the container since unix epoch in seconds.
# TYPE container_start_time_seconds gauge
container_start_time_seconds{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 1.257894e+09

0 comments on commit d2eb985

Please sign in to comment.