Memory numa stats #2621

Merged: 2 commits, Aug 13, 2020
Changes from 1 commit
4 changes: 3 additions & 1 deletion cmd/cadvisor.go
@@ -81,6 +81,7 @@ var (
// Metrics to be ignored.
// Tcp metrics are ignored by default.
ignoreMetrics metricSetValue = metricSetValue{container.MetricSet{
container.MemoryNumaMetrics: struct{}{},
container.NetworkTcpUsageMetrics: struct{}{},
container.NetworkUdpUsageMetrics: struct{}{},
container.NetworkAdvancedTcpUsageMetrics: struct{}{},
@@ -97,6 +98,7 @@ var (
container.AcceleratorUsageMetrics: struct{}{},
container.DiskUsageMetrics: struct{}{},
container.DiskIOMetrics: struct{}{},
container.MemoryNumaMetrics: struct{}{},
container.NetworkUsageMetrics: struct{}{},
container.NetworkTcpUsageMetrics: struct{}{},
container.NetworkAdvancedTcpUsageMetrics: struct{}{},
@@ -139,7 +141,7 @@ func (ml *metricSetValue) Set(value string) error {
}

func init() {
flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'accelerator', 'cpu_topology','disk', 'diskIO', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process', 'hugetlb', 'referenced_memory', 'resctrl'.")
flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'accelerator', 'cpu_topology','disk', 'diskIO', 'memory_numa', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process', 'hugetlb', 'referenced_memory', 'resctrl'.")

// Default logging verbosity to V(2)
flag.Set("v", "2")
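The -disable_metrics flag value is handled by metricSetValue's flag.Value implementation; the hunk above only shows the help-text change, not the Set body. The following is a rough, self-contained sketch of the comma-list parsing idea using stand-in types — an assumption about the shape of the logic, not cadvisor's actual code:

```go
package main

import (
	"flag"
	"fmt"
	"strings"
)

// Stand-in for cadvisor's metricSetValue: a flag.Value whose Set presumably
// splits the comma-separated -disable_metrics list and records each entry.
// Illustrative only; the real implementation also validates entries against
// the known metric kinds.
type metricSet map[string]struct{}

func (ms *metricSet) String() string {
	keys := make([]string, 0, len(*ms))
	for k := range *ms {
		keys = append(keys, k)
	}
	return strings.Join(keys, ",")
}

func (ms *metricSet) Set(value string) error {
	*ms = metricSet{}
	for _, name := range strings.Split(value, ",") {
		name = strings.TrimSpace(name)
		if name == "" {
			continue
		}
		(*ms)[name] = struct{}{}
	}
	return nil
}

func main() {
	disabled := metricSet{}
	fs := flag.NewFlagSet("cadvisor", flag.ContinueOnError)
	fs.Var(&disabled, "disable_metrics", "comma-separated list of metrics to disable")
	_ = fs.Parse([]string{"-disable_metrics=tcp,udp,memory_numa"})

	_, numaDisabled := disabled["memory_numa"]
	fmt.Println(numaDisabled) // true
}
```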
7 changes: 7 additions & 0 deletions cmd/cadvisor_test.go
@@ -52,6 +52,12 @@ func TestCPUTopologyMetricsAreDisabledByDefault(t *testing.T) {
assert.True(t, ignoreMetrics.Has(container.CPUTopologyMetrics))
}

func TestMemoryNumaMetricsAreDisabledByDefault(t *testing.T) {
assert.True(t, ignoreMetrics.Has(container.MemoryNumaMetrics))
flag.Parse()
assert.True(t, ignoreMetrics.Has(container.MemoryNumaMetrics))
}

func TestIgnoreMetrics(t *testing.T) {
tests := []struct {
value string
@@ -86,6 +92,7 @@ func TestToIncludedMetrics(t *testing.T) {
container.ProcessSchedulerMetrics: struct{}{},
container.PerCpuUsageMetrics: struct{}{},
container.MemoryUsageMetrics: struct{}{},
container.MemoryNumaMetrics: struct{}{},
container.CpuLoadMetrics: struct{}{},
container.DiskIOMetrics: struct{}{},
container.AcceleratorUsageMetrics: struct{}{},
2 changes: 2 additions & 0 deletions container/factory.go
@@ -47,6 +47,7 @@ const (
ProcessSchedulerMetrics MetricKind = "sched"
PerCpuUsageMetrics MetricKind = "percpu"
MemoryUsageMetrics MetricKind = "memory"
MemoryNumaMetrics MetricKind = "memory_numa"
CpuLoadMetrics MetricKind = "cpuLoad"
DiskIOMetrics MetricKind = "diskIO"
DiskUsageMetrics MetricKind = "disk"
@@ -70,6 +71,7 @@ var AllMetrics = MetricSet{
ProcessSchedulerMetrics: struct{}{},
PerCpuUsageMetrics: struct{}{},
MemoryUsageMetrics: struct{}{},
MemoryNumaMetrics: struct{}{},
CpuLoadMetrics: struct{}{},
DiskIOMetrics: struct{}{},
AcceleratorUsageMetrics: struct{}{},
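Taken together with the default ignore list in cmd/cadvisor.go, registering MemoryNumaMetrics in AllMetrics means the metric kind exists everywhere, but it is only collected once the operator removes memory_numa from -disable_metrics. A minimal sketch of that set arithmetic, using a plain stand-in map rather than the real container.MetricSet:

```go
package main

import "fmt"

// Stand-in for cadvisor's container.MetricSet (a map used as a set).
// Illustrative only: the point is that memory_numa is part of AllMetrics
// but also in the default ignore list, so the included set normally
// excludes it.
type metricSet map[string]struct{}

// included returns the metrics in all that are not in ignored.
func included(all, ignored metricSet) metricSet {
	out := metricSet{}
	for kind := range all {
		if _, skip := ignored[kind]; !skip {
			out[kind] = struct{}{}
		}
	}
	return out
}

func main() {
	all := metricSet{"memory": {}, "memory_numa": {}, "diskIO": {}}

	defaultIgnored := metricSet{"memory_numa": {}}
	_, ok := included(all, defaultIgnored)["memory_numa"]
	fmt.Println(ok) // false: NUMA stats are off by default

	_, ok = included(all, metricSet{})["memory_numa"]
	fmt.Println(ok) // true: collected once memory_numa is removed from -disable_metrics
}
```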
23 changes: 23 additions & 0 deletions container/libcontainer/handler.go
@@ -870,6 +870,26 @@ func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Memory.WorkingSet = workingSet
}

func getNumaStats(memoryStats map[uint8]uint64) map[uint8]uint64 {
stats := make(map[uint8]uint64, len(memoryStats))
for node, usage := range memoryStats {
stats[node] = usage
}
return stats
}

func setMemoryNumaStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Memory.ContainerData.NumaStats.Total = getNumaStats(s.MemoryStats.PageUsageByNUMA.Total.Nodes)
ret.Memory.ContainerData.NumaStats.File = getNumaStats(s.MemoryStats.PageUsageByNUMA.File.Nodes)
ret.Memory.ContainerData.NumaStats.Anon = getNumaStats(s.MemoryStats.PageUsageByNUMA.Anon.Nodes)
ret.Memory.ContainerData.NumaStats.Unevictable = getNumaStats(s.MemoryStats.PageUsageByNUMA.Unevictable.Nodes)

ret.Memory.HierarchicalData.NumaStats.Total = getNumaStats(s.MemoryStats.PageUsageByNUMA.Hierarchical.Total.Nodes)
ret.Memory.HierarchicalData.NumaStats.File = getNumaStats(s.MemoryStats.PageUsageByNUMA.Hierarchical.File.Nodes)
ret.Memory.HierarchicalData.NumaStats.Anon = getNumaStats(s.MemoryStats.PageUsageByNUMA.Hierarchical.Anon.Nodes)
ret.Memory.HierarchicalData.NumaStats.Unevictable = getNumaStats(s.MemoryStats.PageUsageByNUMA.Hierarchical.Unevictable.Nodes)
}

func setHugepageStats(s *cgroups.Stats, ret *info.ContainerStats) {
ret.Hugetlb = make(map[string]info.HugetlbStats)
for k, v := range s.HugetlbStats {
@@ -923,6 +943,9 @@ func newContainerStats(libcontainerStats *libcontainer.Stats, includedMetrics co
setDiskIoStats(s, ret)
}
setMemoryStats(s, ret)
if includedMetrics.Has(container.MemoryNumaMetrics) {
setMemoryNumaStats(s, ret)
}
if includedMetrics.Has(container.HugetlbUsageMetrics) {
setHugepageStats(s, ret)
}
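getNumaStats only copies the per-node maps that runc has already parsed out of the cgroup v1 memory.numa_stat file. For orientation, that file (per the kernel's cgroup v1 memory documentation) contains lines of page counts roughly like `anon=10485 N0=10000 N1=485`. The sketch below parses one such line into the map[node]pages shape used above; it is illustrative only and is not code from this PR or from runc:

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseNumaStatLine turns one memory.numa_stat line into its counter name and
// per-node page counts -- the same map[uint8]uint64 shape that getNumaStats
// copies out of runc's cgroups.Stats.
func parseNumaStatLine(line string) (key string, nodes map[uint8]uint64, err error) {
	fields := strings.Fields(line)
	if len(fields) == 0 {
		return "", nil, fmt.Errorf("empty line")
	}
	// First field is e.g. "anon=10485" (the all-node total); keep only the name.
	key = strings.SplitN(fields[0], "=", 2)[0]
	nodes = make(map[uint8]uint64, len(fields)-1)
	for _, f := range fields[1:] { // remaining fields look like "N0=10000"
		kv := strings.SplitN(f, "=", 2)
		if len(kv) != 2 || !strings.HasPrefix(kv[0], "N") {
			return "", nil, fmt.Errorf("unexpected field %q", f)
		}
		node, err := strconv.ParseUint(strings.TrimPrefix(kv[0], "N"), 10, 8)
		if err != nil {
			return "", nil, err
		}
		pages, err := strconv.ParseUint(kv[1], 10, 64)
		if err != nil {
			return "", nil, err
		}
		nodes[uint8(node)] = pages
	}
	return key, nodes, nil
}

func main() {
	key, nodes, _ := parseNumaStatLine("anon=10485 N0=10000 N1=485")
	fmt.Println(key, nodes) // anon map[0:10000 1:485]
}
```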
1 change: 1 addition & 0 deletions docs/storage/prometheus.md
@@ -59,6 +59,7 @@ Metric name | Type | Description | Unit (where applicable) | -disable_metrics parameter
`container_memory_cache` | Gauge | Total page cache memory | bytes | |
`container_memory_failcnt` | Counter | Number of memory usage hits limits | | |
`container_memory_failures_total` | Counter | Cumulative count of memory allocation failures | | |
`container_memory_numa_pages` | Gauge | Memory usage per numa node | pages | memory_numa |
`container_memory_max_usage_bytes` | Gauge | Maximum memory usage recorded | bytes | |
`container_memory_rss` | Gauge | Size of RSS | bytes | |
`container_memory_swap` | Gauge | Container swap usage | bytes | |
12 changes: 10 additions & 2 deletions info/v1/container.go
@@ -399,9 +399,17 @@ type MemoryStats struct {
HierarchicalData MemoryStatsMemoryData `json:"hierarchical_data,omitempty"`
}

type MemoryNumaStats struct {
Total map[uint8]uint64 `json:"total,omitempty"`
File map[uint8]uint64 `json:"file,omitempty"`
Anon map[uint8]uint64 `json:"anon,omitempty"`
Unevictable map[uint8]uint64 `json:"unevictable,omitempty"`
}

type MemoryStatsMemoryData struct {
Pgfault uint64 `json:"pgfault"`
Pgmajfault uint64 `json:"pgmajfault"`
Pgfault uint64 `json:"pgfault"`
Pgmajfault uint64 `json:"pgmajfault"`
NumaStats MemoryNumaStats `json:"numa_stats,omitempty"`
}

type InterfaceStats struct {
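The new MemoryNumaStats type keys every counter by NUMA node ID. A self-contained sketch of how it serializes in the v1 JSON API; the struct is reproduced locally so the example runs on its own, and the output assumes standard encoding/json behaviour (integer map keys are rendered as strings, and omitempty drops nil maps):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Local copy of the new v1 API type from this PR, reproduced here so the
// sketch is self-contained.
type MemoryNumaStats struct {
	Total       map[uint8]uint64 `json:"total,omitempty"`
	File        map[uint8]uint64 `json:"file,omitempty"`
	Anon        map[uint8]uint64 `json:"anon,omitempty"`
	Unevictable map[uint8]uint64 `json:"unevictable,omitempty"`
}

func main() {
	// Populated stats serialize with numeric node IDs as JSON object keys.
	stats := MemoryNumaStats{
		Total: map[uint8]uint64{0: 23758, 1: 20000},
		File:  map[uint8]uint64{0: 16649, 1: 10000},
	}
	out, _ := json.Marshal(stats)
	fmt.Println(string(out))
	// {"total":{"0":23758,"1":20000},"file":{"0":16649,"1":10000}}

	// With memory_numa disabled the maps stay nil, so omitempty drops them
	// and existing consumers of the v1 API see no new fields.
	empty, _ := json.Marshal(MemoryNumaStats{})
	fmt.Println(string(empty)) // {}
}
```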
44 changes: 43 additions & 1 deletion metrics/prometheus.go
@@ -422,7 +422,8 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
getValues: func(s *info.ContainerStats) metricValues {
return metricValues{{value: float64(s.Memory.WorkingSet), timestamp: s.Timestamp}}
},
}, {
},
{
name: "container_memory_failures_total",
help: "Cumulative count of memory allocation failures.",
valueType: prometheus.CounterValue,
@@ -454,6 +455,38 @@
},
}...)
}
if includedMetrics.Has(container.MemoryNumaMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
name: "container_memory_numa_pages",
Review (Collaborator): is this measured in #pages, or bytes? The help text should specify the units, and the suffix of the metric should be _
Reply (Collaborator Author): The linked documentation states that memory.numa_stat contains pages, and in runc the values are only read from that file. I'll improve the help text.

help: "Memory usage per numa node",
valueType: prometheus.GaugeValue,
extraLabels: []string{"type", "scope", "node"},
Review (Collaborator): I think we only add two labels to metrics below?
Review (Collaborator): I think this comment is still relevant.
Review (Collaborator): Ah, I see now.

getValues: func(s *info.ContainerStats) metricValues {
values := make(metricValues, 0)

values = append(values, getNumaStatsPerNode(s.Memory.ContainerData.NumaStats.Total,
[]string{"total", "container"}, s.Timestamp)...)
Review (Collaborator): The general rule of thumb for when to use a label vs. when to add a new metric is that the sum of a metric across all dimensions should be meaningful. "total" isn't a great dimension to have, as we would expect the sum of the dimensions to be the "total". So we can either calculate the "other" portion, or make ...pages_total a separate metric.
Reply (Collaborator Author, @katarzyna-z, Jul 27, 2020): The documentation says the "total" count is the sum of file + anon + unevictable, so I'll remove the metrics with type="total".

values = append(values, getNumaStatsPerNode(s.Memory.ContainerData.NumaStats.File,
[]string{"file", "container"}, s.Timestamp)...)
values = append(values, getNumaStatsPerNode(s.Memory.ContainerData.NumaStats.Anon,
[]string{"anon", "container"}, s.Timestamp)...)
values = append(values, getNumaStatsPerNode(s.Memory.ContainerData.NumaStats.Unevictable,
[]string{"unevictable", "container"}, s.Timestamp)...)

values = append(values, getNumaStatsPerNode(s.Memory.HierarchicalData.NumaStats.Total,
[]string{"total", "hierarchy"}, s.Timestamp)...)
Review (Collaborator): I suspect hierarchy vs. container may need to be separate metrics by the same logic above.
Reply (Collaborator Author): I followed the pattern used for the container_memory_failures_total metric (see the linked code); it has a "scope" label with the values "container" and "hierarchy".

values = append(values, getNumaStatsPerNode(s.Memory.HierarchicalData.NumaStats.File,
[]string{"file", "hierarchy"}, s.Timestamp)...)
values = append(values, getNumaStatsPerNode(s.Memory.HierarchicalData.NumaStats.Anon,
[]string{"anon", "hierarchy"}, s.Timestamp)...)
values = append(values, getNumaStatsPerNode(s.Memory.HierarchicalData.NumaStats.Unevictable,
[]string{"unevictable", "hierarchy"}, s.Timestamp)...)
return values
},
},
}...)
}
if includedMetrics.Has(container.AcceleratorUsageMetrics) {
c.containerMetrics = append(c.containerMetrics, []containerMetric{
{
@@ -1903,3 +1936,12 @@ var invalidNameCharRE = regexp.MustCompile(`[^a-zA-Z0-9_]`)
func sanitizeLabelName(name string) string {
return invalidNameCharRE.ReplaceAllString(name, "_")
}

func getNumaStatsPerNode(nodeStats map[uint8]uint64, labels []string, timestamp time.Time) metricValues {
mValues := make(metricValues, 0, len(nodeStats))
for node, stat := range nodeStats {
nodeLabels := append(labels, strconv.FormatUint(uint64(node), 10))
mValues = append(mValues, metricValue{value: float64(stat), labels: nodeLabels, timestamp: timestamp})
}
return mValues
}
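The review thread above asks whether the metric is reported in pages or bytes; per the author's reply it is pages, as read from memory.numa_stat. A consumer that wants bytes has to multiply by the page size of the machine that produced the stats — a consumer-side assumption, not something this PR does. A minimal sketch:

```go
package main

import (
	"fmt"
	"os"
)

// pagesToBytes converts per-node page counts (the shape of the values that
// getNumaStatsPerNode labels) into bytes, given the page size of the machine
// the stats came from. os.Getpagesize() returns the local page size.
func pagesToBytes(perNodePages map[uint8]uint64, pageSize uint64) map[uint8]uint64 {
	bytes := make(map[uint8]uint64, len(perNodePages))
	for node, pages := range perNodePages {
		bytes[node] = pages * pageSize
	}
	return bytes
}

func main() {
	anonPages := map[uint8]uint64{0: 10000, 1: 7109}
	fmt.Println(pagesToBytes(anonPages, uint64(os.Getpagesize())))
	// e.g. map[0:40960000 1:29118464] on a system with 4 KiB pages
}
```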
12 changes: 12 additions & 0 deletions metrics/prometheus_fake.go
@@ -327,10 +327,22 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
ContainerData: info.MemoryStatsMemoryData{
Pgfault: 10,
Pgmajfault: 11,
NumaStats: info.MemoryNumaStats{
Total: map[uint8]uint64{0: 23758, 1: 20000},
File: map[uint8]uint64{0: 16649, 1: 10000},
Anon: map[uint8]uint64{0: 10000, 1: 7109},
Unevictable: map[uint8]uint64{0: 8900, 1: 10000},
},
},
HierarchicalData: info.MemoryStatsMemoryData{
Pgfault: 12,
Pgmajfault: 13,
NumaStats: info.MemoryNumaStats{
Total: map[uint8]uint64{0: 33758, 1: 20000},
File: map[uint8]uint64{0: 36649, 1: 10000},
Anon: map[uint8]uint64{0: 20000, 1: 7109},
Unevictable: map[uint8]uint64{0: 8900, 1: 20000},
},
},
Cache: 14,
RSS: 15,
18 changes: 18 additions & 0 deletions metrics/testdata/prometheus_metrics
@@ -152,6 +152,24 @@ container_memory_mapped_file{container_env_foo_env="prod",container_label_foo_la
# HELP container_memory_max_usage_bytes Maximum memory usage recorded in bytes
# TYPE container_memory_max_usage_bytes gauge
container_memory_max_usage_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8 1395066363000
# HELP container_memory_numa_pages Memory usage per numa node
# TYPE container_memory_numa_pages gauge
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="container",type="anon",zone_name="hello"} 10000 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="container",type="file",zone_name="hello"} 16649 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="container",type="total",zone_name="hello"} 23758 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="container",type="unevictable",zone_name="hello"} 8900 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="hierarchy",type="anon",zone_name="hello"} 20000 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="hierarchy",type="file",zone_name="hello"} 36649 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="hierarchy",type="total",zone_name="hello"} 33758 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="hierarchy",type="unevictable",zone_name="hello"} 8900 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="container",type="anon",zone_name="hello"} 7109 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="container",type="file",zone_name="hello"} 10000 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="container",type="total",zone_name="hello"} 20000 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="container",type="unevictable",zone_name="hello"} 10000 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="hierarchy",type="anon",zone_name="hello"} 7109 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="hierarchy",type="file",zone_name="hello"} 10000 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="hierarchy",type="total",zone_name="hello"} 20000 1395066363000
container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="hierarchy",type="unevictable",zone_name="hello"} 20000 1395066363000
# HELP container_memory_rss Size of RSS in bytes.
# TYPE container_memory_rss gauge
container_memory_rss{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 15 1395066363000
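As a cross-check on the fixture above: getNumaStatsPerNode emits one sample per (type, scope, node) combination, and the fake stats populate two NUMA nodes, so the expected number of container_memory_numa_pages series in this commit is 4 × 2 × 2 = 16, matching the testdata. A tiny sketch that enumerates them (label values taken from the fixture; the container labels are omitted for brevity):

```go
package main

import "fmt"

func main() {
	types := []string{"total", "file", "anon", "unevictable"}
	scopes := []string{"container", "hierarchy"}
	nodes := []string{"0", "1"} // the fake stats populate two NUMA nodes

	series := 0
	for _, t := range types {
		for _, s := range scopes {
			for _, n := range nodes {
				fmt.Printf("container_memory_numa_pages{type=%q,scope=%q,node=%q}\n", t, s, n)
				series++
			}
		}
	}
	fmt.Println(series, "series") // 16 series, matching the testdata
}
```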