diff --git a/cmd/cadvisor.go b/cmd/cadvisor.go
index 505d091b07..ade2d22829 100644
--- a/cmd/cadvisor.go
+++ b/cmd/cadvisor.go
@@ -81,6 +81,7 @@ var (
 	// Metrics to be ignored.
 	// Tcp metrics are ignored by default.
 	ignoreMetrics metricSetValue = metricSetValue{container.MetricSet{
+		container.MemoryNumaMetrics:              struct{}{},
 		container.NetworkTcpUsageMetrics:         struct{}{},
 		container.NetworkUdpUsageMetrics:         struct{}{},
 		container.NetworkAdvancedTcpUsageMetrics: struct{}{},
@@ -97,6 +98,7 @@ var (
 		container.AcceleratorUsageMetrics:        struct{}{},
 		container.DiskUsageMetrics:               struct{}{},
 		container.DiskIOMetrics:                  struct{}{},
+		container.MemoryNumaMetrics:              struct{}{},
 		container.NetworkUsageMetrics:            struct{}{},
 		container.NetworkTcpUsageMetrics:         struct{}{},
 		container.NetworkAdvancedTcpUsageMetrics: struct{}{},
@@ -139,7 +141,7 @@ func (ml *metricSetValue) Set(value string) error {
 }
 
 func init() {
-	flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'accelerator', 'cpu_topology','disk', 'diskIO', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process', 'hugetlb', 'referenced_memory', 'resctrl'.")
+	flag.Var(&ignoreMetrics, "disable_metrics", "comma-separated list of `metrics` to be disabled. Options are 'accelerator', 'cpu_topology','disk', 'diskIO', 'memory_numa', 'network', 'tcp', 'udp', 'percpu', 'sched', 'process', 'hugetlb', 'referenced_memory', 'resctrl'.")
 
 	// Default logging verbosity to V(2)
 	flag.Set("v", "2")
diff --git a/cmd/cadvisor_test.go b/cmd/cadvisor_test.go
index 88a19afd29..093a348118 100644
--- a/cmd/cadvisor_test.go
+++ b/cmd/cadvisor_test.go
@@ -52,6 +52,12 @@ func TestCPUTopologyMetricsAreDisabledByDefault(t *testing.T) {
 	assert.True(t, ignoreMetrics.Has(container.CPUTopologyMetrics))
 }
 
+func TestMemoryNumaMetricsAreDisabledByDefault(t *testing.T) {
+	assert.True(t, ignoreMetrics.Has(container.MemoryNumaMetrics))
+	flag.Parse()
+	assert.True(t, ignoreMetrics.Has(container.MemoryNumaMetrics))
+}
+
 func TestIgnoreMetrics(t *testing.T) {
 	tests := []struct {
 		value    string
@@ -86,6 +92,7 @@ func TestToIncludedMetrics(t *testing.T) {
 			container.ProcessSchedulerMetrics:        struct{}{},
 			container.PerCpuUsageMetrics:             struct{}{},
 			container.MemoryUsageMetrics:             struct{}{},
+			container.MemoryNumaMetrics:              struct{}{},
 			container.CpuLoadMetrics:                 struct{}{},
 			container.DiskIOMetrics:                  struct{}{},
 			container.AcceleratorUsageMetrics:        struct{}{},
diff --git a/container/factory.go b/container/factory.go
index a972853d09..652070b1b4 100644
--- a/container/factory.go
+++ b/container/factory.go
@@ -47,6 +47,7 @@ const (
 	ProcessSchedulerMetrics MetricKind = "sched"
 	PerCpuUsageMetrics      MetricKind = "percpu"
 	MemoryUsageMetrics      MetricKind = "memory"
+	MemoryNumaMetrics       MetricKind = "memory_numa"
 	CpuLoadMetrics          MetricKind = "cpuLoad"
 	DiskIOMetrics           MetricKind = "diskIO"
 	DiskUsageMetrics        MetricKind = "disk"
@@ -70,6 +71,7 @@ var AllMetrics = MetricSet{
 	ProcessSchedulerMetrics: struct{}{},
 	PerCpuUsageMetrics:      struct{}{},
 	MemoryUsageMetrics:      struct{}{},
+	MemoryNumaMetrics:       struct{}{},
 	CpuLoadMetrics:          struct{}{},
 	DiskIOMetrics:           struct{}{},
 	AcceleratorUsageMetrics: struct{}{},
diff --git a/container/libcontainer/handler.go b/container/libcontainer/handler.go
index 436379b762..1094b10392 100644
--- a/container/libcontainer/handler.go
+++ b/container/libcontainer/handler.go
@@ -870,6 +870,24 @@ func setMemoryStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.Memory.WorkingSet = workingSet
 }
 
+func getNumaStats(memoryStats map[uint8]uint64) map[uint8]uint64 {
+	stats := make(map[uint8]uint64, len(memoryStats))
+	for node, usage := range memoryStats {
+		stats[node] = usage
+	}
+	return stats
+}
+
+func setMemoryNumaStats(s *cgroups.Stats, ret *info.ContainerStats) {
+	ret.Memory.ContainerData.NumaStats.File = getNumaStats(s.MemoryStats.PageUsageByNUMA.File.Nodes)
+	ret.Memory.ContainerData.NumaStats.Anon = getNumaStats(s.MemoryStats.PageUsageByNUMA.Anon.Nodes)
+	ret.Memory.ContainerData.NumaStats.Unevictable = getNumaStats(s.MemoryStats.PageUsageByNUMA.Unevictable.Nodes)
+
+	ret.Memory.HierarchicalData.NumaStats.File = getNumaStats(s.MemoryStats.PageUsageByNUMA.Hierarchical.File.Nodes)
+	ret.Memory.HierarchicalData.NumaStats.Anon = getNumaStats(s.MemoryStats.PageUsageByNUMA.Hierarchical.Anon.Nodes)
+	ret.Memory.HierarchicalData.NumaStats.Unevictable = getNumaStats(s.MemoryStats.PageUsageByNUMA.Hierarchical.Unevictable.Nodes)
+}
+
 func setHugepageStats(s *cgroups.Stats, ret *info.ContainerStats) {
 	ret.Hugetlb = make(map[string]info.HugetlbStats)
 	for k, v := range s.HugetlbStats {
@@ -923,6 +941,9 @@ func newContainerStats(libcontainerStats *libcontainer.Stats, includedMetrics co
 		setDiskIoStats(s, ret)
 	}
 	setMemoryStats(s, ret)
+	if includedMetrics.Has(container.MemoryNumaMetrics) {
+		setMemoryNumaStats(s, ret)
+	}
 	if includedMetrics.Has(container.HugetlbUsageMetrics) {
 		setHugepageStats(s, ret)
 	}
diff --git a/docs/storage/prometheus.md b/docs/storage/prometheus.md
index 64e7e6316a..36a847b228 100644
--- a/docs/storage/prometheus.md
+++ b/docs/storage/prometheus.md
@@ -59,6 +59,7 @@ Metric name | Type | Description | Unit (where applicable) | -disable_metrics pa
 `container_memory_cache` | Gauge | Total page cache memory | bytes | |
 `container_memory_failcnt` | Counter | Number of memory usage hits limits | | |
 `container_memory_failures_total` | Counter | Cumulative count of memory allocation failures | | |
+`container_memory_numa_pages` | Gauge | Number of used pages per NUMA node | | memory_numa |
 `container_memory_max_usage_bytes` | Gauge | Maximum memory usage recorded | bytes | |
 `container_memory_rss` | Gauge | Size of RSS | bytes | |
 `container_memory_swap` | Gauge | Container swap usage | bytes | |
diff --git a/info/v1/container.go b/info/v1/container.go
index 9b75321382..08cff3940f 100644
--- a/info/v1/container.go
+++ b/info/v1/container.go
@@ -399,9 +399,16 @@ type MemoryStats struct {
 	HierarchicalData MemoryStatsMemoryData `json:"hierarchical_data,omitempty"`
 }
 
+type MemoryNumaStats struct {
+	File        map[uint8]uint64 `json:"file,omitempty"`
+	Anon        map[uint8]uint64 `json:"anon,omitempty"`
+	Unevictable map[uint8]uint64 `json:"unevictable,omitempty"`
+}
+
 type MemoryStatsMemoryData struct {
-	Pgfault    uint64 `json:"pgfault"`
-	Pgmajfault uint64 `json:"pgmajfault"`
+	Pgfault    uint64          `json:"pgfault"`
+	Pgmajfault uint64          `json:"pgmajfault"`
+	NumaStats  MemoryNumaStats `json:"numa_stats,omitempty"`
 }
 
 type InterfaceStats struct {
diff --git a/metrics/prometheus.go b/metrics/prometheus.go
index aab19a287a..23457f4e99 100644
--- a/metrics/prometheus.go
+++ b/metrics/prometheus.go
@@ -422,7 +422,8 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
 			getValues: func(s *info.ContainerStats) metricValues {
 				return metricValues{{value: float64(s.Memory.WorkingSet), timestamp: s.Timestamp}}
 			},
-		}, {
+		},
+		{
 			name:      "container_memory_failures_total",
 			help:      "Cumulative count of memory allocation failures.",
 			valueType: prometheus.CounterValue,
@@ -454,6 +455,33 @@ func NewPrometheusCollector(i infoProvider, f ContainerLabelsFunc, includedMetri
 				},
 			},
 		}...)
 	}
+	if includedMetrics.Has(container.MemoryNumaMetrics) {
+		c.containerMetrics = append(c.containerMetrics, []containerMetric{
+			{
+				name:        "container_memory_numa_pages",
+				help:        "Number of used pages per NUMA node",
+				valueType:   prometheus.GaugeValue,
+				extraLabels: []string{"type", "scope", "node"},
+				getValues: func(s *info.ContainerStats) metricValues {
+					values := make(metricValues, 0)
+					values = append(values, getNumaStatsPerNode(s.Memory.ContainerData.NumaStats.File,
+						[]string{"file", "container"}, s.Timestamp)...)
+					values = append(values, getNumaStatsPerNode(s.Memory.ContainerData.NumaStats.Anon,
+						[]string{"anon", "container"}, s.Timestamp)...)
+					values = append(values, getNumaStatsPerNode(s.Memory.ContainerData.NumaStats.Unevictable,
+						[]string{"unevictable", "container"}, s.Timestamp)...)
+
+					values = append(values, getNumaStatsPerNode(s.Memory.HierarchicalData.NumaStats.File,
+						[]string{"file", "hierarchy"}, s.Timestamp)...)
+					values = append(values, getNumaStatsPerNode(s.Memory.HierarchicalData.NumaStats.Anon,
+						[]string{"anon", "hierarchy"}, s.Timestamp)...)
+					values = append(values, getNumaStatsPerNode(s.Memory.HierarchicalData.NumaStats.Unevictable,
+						[]string{"unevictable", "hierarchy"}, s.Timestamp)...)
+					return values
+				},
+			},
+		}...)
+	}
 	if includedMetrics.Has(container.AcceleratorUsageMetrics) {
 		c.containerMetrics = append(c.containerMetrics, []containerMetric{
 			{
@@ -1903,3 +1931,12 @@ var invalidNameCharRE = regexp.MustCompile(`[^a-zA-Z0-9_]`)
 func sanitizeLabelName(name string) string {
 	return invalidNameCharRE.ReplaceAllString(name, "_")
 }
+
+func getNumaStatsPerNode(nodeStats map[uint8]uint64, labels []string, timestamp time.Time) metricValues {
+	mValues := make(metricValues, 0, len(nodeStats))
+	for node, stat := range nodeStats {
+		nodeLabels := append(labels, strconv.FormatUint(uint64(node), 10))
+		mValues = append(mValues, metricValue{value: float64(stat), labels: nodeLabels, timestamp: timestamp})
+	}
+	return mValues
+}
diff --git a/metrics/prometheus_fake.go b/metrics/prometheus_fake.go
index 7e87d63427..6368c0b75e 100644
--- a/metrics/prometheus_fake.go
+++ b/metrics/prometheus_fake.go
@@ -327,10 +327,20 @@ func (p testSubcontainersInfoProvider) GetRequestedContainersInfo(string, v2.Req
 				ContainerData: info.MemoryStatsMemoryData{
 					Pgfault:    10,
 					Pgmajfault: 11,
+					NumaStats: info.MemoryNumaStats{
+						File:        map[uint8]uint64{0: 16649, 1: 10000},
+						Anon:        map[uint8]uint64{0: 10000, 1: 7109},
+						Unevictable: map[uint8]uint64{0: 8900, 1: 10000},
+					},
 				},
 				HierarchicalData: info.MemoryStatsMemoryData{
 					Pgfault:    12,
 					Pgmajfault: 13,
+					NumaStats: info.MemoryNumaStats{
+						File:        map[uint8]uint64{0: 36649, 1: 10000},
+						Anon:        map[uint8]uint64{0: 20000, 1: 7109},
+						Unevictable: map[uint8]uint64{0: 8900, 1: 20000},
+					},
 				},
 				Cache: 14,
 				RSS:   15,
diff --git a/metrics/testdata/prometheus_metrics b/metrics/testdata/prometheus_metrics
index be1993c64d..d8ba128f84 100644
--- a/metrics/testdata/prometheus_metrics
+++ b/metrics/testdata/prometheus_metrics
@@ -152,6 +152,20 @@ container_memory_mapped_file{container_env_foo_env="prod",container_label_foo_la
 # HELP container_memory_max_usage_bytes Maximum memory usage recorded in bytes
 # TYPE container_memory_max_usage_bytes gauge
 container_memory_max_usage_bytes{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 8 1395066363000
+# HELP container_memory_numa_pages Number of used pages per NUMA node
+# TYPE container_memory_numa_pages gauge
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="container",type="anon",zone_name="hello"} 10000 1395066363000
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="container",type="file",zone_name="hello"} 16649 1395066363000
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="container",type="unevictable",zone_name="hello"} 8900 1395066363000
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="hierarchy",type="anon",zone_name="hello"} 20000 1395066363000
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="hierarchy",type="file",zone_name="hello"} 36649 1395066363000
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="0",scope="hierarchy",type="unevictable",zone_name="hello"} 8900 1395066363000
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="container",type="anon",zone_name="hello"} 7109 1395066363000
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="container",type="file",zone_name="hello"} 10000 1395066363000
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="container",type="unevictable",zone_name="hello"} 10000 1395066363000
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="hierarchy",type="anon",zone_name="hello"} 7109 1395066363000
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="hierarchy",type="file",zone_name="hello"} 10000 1395066363000
+container_memory_numa_pages{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",node="1",scope="hierarchy",type="unevictable",zone_name="hello"} 20000 1395066363000
 # HELP container_memory_rss Size of RSS in bytes.
 # TYPE container_memory_rss gauge
 container_memory_rss{container_env_foo_env="prod",container_label_foo_label="bar",id="testcontainer",image="test",name="testcontaineralias",zone_name="hello"} 15 1395066363000