From 28d88b3fb84a53dd17c0a25b090b85762a23c849 Mon Sep 17 00:00:00 2001 From: Sandeep Sukhani Date: Wed, 7 Apr 2021 16:27:40 +0530 Subject: [PATCH] add loki resource usage dashboard for read and write path (#3584) --- production/loki-mixin/config.libsonnet | 12 ++ production/loki-mixin/dashboards.libsonnet | 6 +- .../dashboards/dashboard-utils.libsonnet | 98 ++++++++++++++++ .../dashboards/loki-reads-resources.libsonnet | 106 ++++++++++++++++++ .../loki-writes-resources.libsonnet | 85 ++++++++++++++ 5 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 production/loki-mixin/config.libsonnet create mode 100644 production/loki-mixin/dashboards/dashboard-utils.libsonnet create mode 100644 production/loki-mixin/dashboards/loki-reads-resources.libsonnet create mode 100644 production/loki-mixin/dashboards/loki-writes-resources.libsonnet diff --git a/production/loki-mixin/config.libsonnet b/production/loki-mixin/config.libsonnet new file mode 100644 index 000000000000..220325c80306 --- /dev/null +++ b/production/loki-mixin/config.libsonnet @@ -0,0 +1,12 @@ +{ + _config+:: { + // Tags for dashboards. + tags: ['loki'], + + // The label used to differentiate between different application instances (e.g. 'pod' in a Kubernetes install). + per_instance_label: 'pod', + + // The label used to differentiate between different nodes (i.e. servers). 
+ per_node_label: 'instance', + }, +} diff --git a/production/loki-mixin/dashboards.libsonnet b/production/loki-mixin/dashboards.libsonnet index c524df6a4920..e4d2b7c5310d 100644 --- a/production/loki-mixin/dashboards.libsonnet +++ b/production/loki-mixin/dashboards.libsonnet @@ -1,5 +1,9 @@ +(import 'config.libsonnet') + (import 'dashboards/loki-chunks.libsonnet') + (import 'dashboards/loki-logs.libsonnet') + (import 'dashboards/loki-operational.libsonnet') + (import 'dashboards/loki-reads.libsonnet') + -(import 'dashboards/loki-writes.libsonnet') +(import 'dashboards/loki-writes.libsonnet') + +(import 'dashboards/loki-writes-resources.libsonnet') + +(import 'dashboards/loki-reads-resources.libsonnet') + diff --git a/production/loki-mixin/dashboards/dashboard-utils.libsonnet b/production/loki-mixin/dashboards/dashboard-utils.libsonnet new file mode 100644 index 000000000000..0a04634a6292 --- /dev/null +++ b/production/loki-mixin/dashboards/dashboard-utils.libsonnet @@ -0,0 +1,98 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'grafana-builder/grafana.libsonnet') { + // Override the dashboard constructor to add: + // - default tags, + // - some links that propagate the selected cluster. 
+ dashboard(title):: + super.dashboard(title) + { + addRowIf(condition, row):: + if condition + then self.addRow(row) + else self, + + addClusterSelectorTemplates(multi=true):: + local d = self { + tags: $._config.tags, + links: [ + { + asDropdown: true, + icon: 'external link', + includeVars: true, + keepTime: true, + tags: $._config.tags, + targetBlank: false, + title: 'Loki Dashboards', + type: 'dashboards', + }, + ], + }; + + if multi then + d.addMultiTemplate('cluster', 'loki_build_info', 'cluster') + .addMultiTemplate('namespace', 'loki_build_info', 'namespace') + else + d.addTemplate('cluster', 'loki_build_info', 'cluster') + .addTemplate('namespace', 'loki_build_info', 'namespace'), + + }, + + jobMatcher(job):: + 'cluster=~"$cluster", job=~"($namespace)/%s"' % job, + + namespaceMatcher():: + 'cluster=~"$cluster", namespace=~"$namespace"', + + containerCPUUsagePanel(title, containerName):: + $.panel(title) + + $.queryPanel([ + 'sum by(pod) (rate(container_cpu_usage_seconds_total{%s,container="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), containerName], + 'min(container_spec_cpu_quota{%s,container="%s"} / container_spec_cpu_period{%s,container="%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], + ], ['{{pod}}', 'limit']) + + { + seriesOverrides: [ + { + alias: 'limit', + color: '#E02F44', + fill: 0, + }, + ], + tooltip: { sort: 2 }, // Sort descending. + }, + + containerMemoryWorkingSetPanel(title, containerName):: + $.panel(title) + + $.queryPanel([ + // We use "max" instead of "sum" otherwise during a rolling update of a statefulset we will end up + // summing the memory of the old pod (whose metric will be stale for 5m) to the new pod. 
+ 'max by(pod) (container_memory_working_set_bytes{%s,container="%s"})' % [$.namespaceMatcher(), containerName], + 'min(container_spec_memory_limit_bytes{%s,container="%s"} > 0)' % [$.namespaceMatcher(), containerName], + ], ['{{pod}}', 'limit']) + + { + seriesOverrides: [ + { + alias: 'limit', + color: '#E02F44', + fill: 0, + }, + ], + yaxes: $.yaxes('bytes'), + tooltip: { sort: 2 }, // Sort descending. + }, + + goHeapInUsePanel(title, jobName):: + $.panel(title) + + $.queryPanel( + 'sum by(%s) (go_memstats_heap_inuse_bytes{%s})' % [$._config.per_instance_label, $.jobMatcher(jobName)], + '{{%s}}' % $._config.per_instance_label + ) + + { + yaxes: $.yaxes('bytes'), + tooltip: { sort: 2 }, // Sort descending. + }, + + filterNodeDiskContainer(containerName):: + ||| + ignoring(%s) group_right() (label_replace(count by(%s, %s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0) + ||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName], +} \ No newline at end of file diff --git a/production/loki-mixin/dashboards/loki-reads-resources.libsonnet b/production/loki-mixin/dashboards/loki-reads-resources.libsonnet new file mode 100644 index 000000000000..2f4ae02fee53 --- /dev/null +++ b/production/loki-mixin/dashboards/loki-reads-resources.libsonnet @@ -0,0 +1,106 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + grafanaDashboards+: + { + 'loki-reads-resources.json': + ($.dashboard('Loki / Reads Resources')) + .addClusterSelectorTemplates(false) + .addRow( + $.row('Gateway') + .addPanel( + $.containerCPUUsagePanel('CPU', 'cortex-gw'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'cortex-gw'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'cortex-gw'), + ) + ) + .addRow( + $.row('Query Frontend') + .addPanel( + 
$.containerCPUUsagePanel('CPU', 'query-frontend'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'query-frontend'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'query-frontend'), + ) + ) + .addRow( + $.row('Querier') + .addPanel( + $.containerCPUUsagePanel('CPU', 'querier'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'querier'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'querier'), + ) + ) + .addRow( + $.row('') + .addPanel( + $.panel('Disk Writes') + + $.queryPanel( + 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('querier')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + .addPanel( + $.panel('Disk Reads') + + $.queryPanel( + 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('querier')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + .addPanel( + $.panel('Disk Space Utilization') + + $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name=~"querier.*"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + + { yaxes: $.yaxes('percentunit') }, + ) + ) + .addRow( + $.row('Ingester') + .addPanel( + $.containerCPUUsagePanel('CPU', 'ingester'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ingester'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), + ) + ) + .addRow( + $.row('Ruler') + .addPanel( + $.panel('Rules') + + $.queryPanel( + 'sum by(%s) 
(cortex_prometheus_rule_group_rules{%s})' % [$._config.per_instance_label, $.jobMatcher('ruler')], + '{{%s}}' % $._config.per_instance_label + ), + ) + .addPanel( + $.containerCPUUsagePanel('CPU', 'ruler'), + ) + ) + .addRow( + $.row('') + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ruler'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'ruler'), + ) + ), + }, +} diff --git a/production/loki-mixin/dashboards/loki-writes-resources.libsonnet b/production/loki-mixin/dashboards/loki-writes-resources.libsonnet new file mode 100644 index 000000000000..19c888f6e9e4 --- /dev/null +++ b/production/loki-mixin/dashboards/loki-writes-resources.libsonnet @@ -0,0 +1,85 @@ +local utils = import 'mixin-utils/utils.libsonnet'; + +(import 'dashboard-utils.libsonnet') { + grafanaDashboards+: + { + 'loki-writes-resources.json': + $.dashboard('Loki / Writes Resources') + .addClusterSelectorTemplates(false) + .addRow( + $.row('Gateway') + .addPanel( + $.containerCPUUsagePanel('CPU', 'cortex-gw'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'cortex-gw'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'cortex-gw'), + ) + ) + .addRow( + $.row('Distributor') + .addPanel( + $.containerCPUUsagePanel('CPU', 'distributor'), + ) + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'distributor'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'distributor'), + ) + ) + .addRow( + $.row('Ingester') + .addPanel( + $.panel('In-memory streams') + + $.queryPanel( + 'sum by(%s) (loki_ingester_memory_streams{%s})' % [$._config.per_instance_label, $.jobMatcher('ingester')], + '{{%s}}' % $._config.per_instance_label + ) + + { + tooltip: { sort: 2 }, // Sort descending. 
+ }, + ) + .addPanel( + $.containerCPUUsagePanel('CPU', 'ingester'), + ) + ) + .addRow( + $.row('') + .addPanel( + $.containerMemoryWorkingSetPanel('Memory (workingset)', 'ingester'), + ) + .addPanel( + $.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'), + ) + ) + .addRow( + $.row('') + .addPanel( + $.panel('Disk Writes') + + $.queryPanel( + 'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('ingester')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + .addPanel( + $.panel('Disk Reads') + + $.queryPanel( + 'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('ingester')], + '{{%s}} - {{device}}' % $._config.per_instance_label + ) + + $.stack + + { yaxes: $.yaxes('Bps') }, + ) + .addPanel( + $.panel('Disk Space Utilization') + + $.queryPanel('max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name=~"ingester.*"})' % [$.namespaceMatcher(), $.namespaceMatcher(), $.namespaceMatcher()], '{{persistentvolumeclaim}}') + + { yaxes: $.yaxes('percentunit') }, + ) + ), + }, +}