Skip to content

Commit

Permalink
add loki resource usage dashboard for read and write path (#3584)
Browse files Browse the repository at this point in the history
  • Loading branch information
sandeepsukhani authored Apr 7, 2021
1 parent 0107a11 commit 28d88b3
Show file tree
Hide file tree
Showing 5 changed files with 306 additions and 1 deletion.
12 changes: 12 additions & 0 deletions production/loki-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
// Configuration values read by the Loki mixin dashboards through `$._config`.
{
_config+:: {
// Tags for dashboards.
tags: ['loki'],

// The label used to differentiate between different application instances (e.g. 'pod' in a Kubernetes install).
per_instance_label: 'pod',

// The label used to differentiate between different nodes (e.g. servers).
per_node_label: 'instance',
},
}
6 changes: 5 additions & 1 deletion production/loki-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
// Entry point of the Loki mixin dashboards: the shared config merged with every dashboard file.
// Each import is joined with `+`; the chain must not contain two adjacent parenthesised imports
// without an operator, because Jsonnet would parse the second one as a call on the first.
(import 'config.libsonnet') +
(import 'dashboards/loki-chunks.libsonnet') +
(import 'dashboards/loki-logs.libsonnet') +
(import 'dashboards/loki-operational.libsonnet') +
(import 'dashboards/loki-reads.libsonnet') +
(import 'dashboards/loki-writes.libsonnet') +
(import 'dashboards/loki-writes-resources.libsonnet') +
(import 'dashboards/loki-reads-resources.libsonnet')

98 changes: 98 additions & 0 deletions production/loki-mixin/dashboards/dashboard-utils.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
local utils = import 'mixin-utils/utils.libsonnet';

(import 'grafana-builder/grafana.libsonnet') {
// Override the dashboard constructor so every dashboard gains two helpers:
// - addRowIf: add a row only when a condition holds,
// - addClusterSelectorTemplates: set the default tags, add a dropdown link that
//   propagates the selected variables and time range, and add the cluster and
//   namespace selectors.
dashboard(title)::
  super.dashboard(title) + {
    // Append `row` when `condition` is true; otherwise return the dashboard unchanged.
    addRowIf(condition, row)::
      if !condition then self else self.addRow(row),

    // `multi` selects between multi-value templates (the default) and single-value
    // templates; both variants read their values from `loki_build_info`.
    addClusterSelectorTemplates(multi=true)::
      local dashboardsLink = {
        title: 'Loki Dashboards',
        type: 'dashboards',
        tags: $._config.tags,
        icon: 'external link',
        asDropdown: true,
        includeVars: true,
        keepTime: true,
        targetBlank: false,
      };
      local decorated = self {
        tags: $._config.tags,
        links: [dashboardsLink],
      };

      if !multi then
        decorated.addTemplate('cluster', 'loki_build_info', 'cluster')
        .addTemplate('namespace', 'loki_build_info', 'namespace')
      else
        decorated.addMultiTemplate('cluster', 'loki_build_info', 'cluster')
        .addMultiTemplate('namespace', 'loki_build_info', 'namespace'),
  },

// Label matchers selecting `job` (interpolated as-is into a regex matcher) within the
// cluster and namespace chosen through the dashboard's $cluster and $namespace variables.
jobMatcher(job)::
  std.format('cluster=~"$cluster", job=~"($namespace)/%s"', job),

// Label matchers restricting a query to the dashboard's $cluster and $namespace variables.
namespaceMatcher():: 'cluster=~"$cluster", namespace=~"$namespace"',

// Panel with two queries for the container named `containerName`:
// - the CPU usage rate summed per pod (legend: the pod name),
// - the minimum of CPU quota / CPU period across matching containers (legend: 'limit').
// The 'limit' series is drawn unfilled in colour #E02F44.
containerCPUUsagePanel(title, containerName)::
  local selector = $.namespaceMatcher();
  local usageQuery = 'sum by(pod) (rate(container_cpu_usage_seconds_total{%s,container="%s"}[$__rate_interval]))' % [selector, containerName];
  local limitQuery = 'min(container_spec_cpu_quota{%s,container="%s"} / container_spec_cpu_period{%s,container="%s"})' % [selector, containerName, selector, containerName];
  local limitStyle = { alias: 'limit', color: '#E02F44', fill: 0 };

  $.panel(title) +
  $.queryPanel([usageQuery, limitQuery], ['{{pod}}', 'limit']) +
  {
    seriesOverrides: [limitStyle],
    tooltip: { sort: 2 },  // Sort descending.
  },

// Panel with two queries for the container named `containerName`:
// - the working-set memory per pod (legend: the pod name),
// - the minimum non-zero memory limit across matching containers (legend: 'limit').
// The y axis is in bytes and the 'limit' series is drawn unfilled in colour #E02F44.
containerMemoryWorkingSetPanel(title, containerName)::
  local selector = $.namespaceMatcher();
  // "max" rather than "sum": during a rolling update of a statefulset the old pod's metric
  // stays around (stale for 5m), and summing would add its memory to the new pod's.
  local usageQuery = 'max by(pod) (container_memory_working_set_bytes{%s,container="%s"})' % [selector, containerName];
  local limitQuery = 'min(container_spec_memory_limit_bytes{%s,container="%s"} > 0)' % [selector, containerName];
  local limitStyle = { alias: 'limit', color: '#E02F44', fill: 0 };

  $.panel(title) +
  $.queryPanel([usageQuery, limitQuery], ['{{pod}}', 'limit']) +
  {
    seriesOverrides: [limitStyle],
    yaxes: $.yaxes('bytes'),
    tooltip: { sort: 2 },  // Sort descending.
  },

// Panel showing `go_memstats_heap_inuse_bytes` for the job `jobName`, summed by the
// configured per-instance label, which is also used as the legend. The y axis is in bytes.
goHeapInUsePanel(title, jobName)::
  local instanceLabel = $._config.per_instance_label;
  local heapQuery = 'sum by(%s) (go_memstats_heap_inuse_bytes{%s})' % [instanceLabel, $.jobMatcher(jobName)];
  local legend = '{{%s}}' % instanceLabel;

  $.panel(title) +
  $.queryPanel(heapQuery, legend) +
  {
    yaxes: $.yaxes('bytes'),
    tooltip: { sort: 2 },  // Sort descending.
  },

// Returns a PromQL fragment (ending in a newline, as text blocks do) that callers append
// after a binary operator to a node-level disk query.
// The fragment counts `container_fs_writes_bytes_total` series of the container
// `containerName` by node label, instance label and device, skipping devices matching
// `.*sda.*`, strips the `/dev/` prefix from the `device` label and multiplies by 0.
// NOTE(review): the `* 0` presumably makes the fragment act purely as a filter/label join
// when added to the node query — confirm against the dashboards that use it.
filterNodeDiskContainer(containerName)::
  // The content of a `|||` text block must be indented; an unindented first line is a
  // syntax error in Jsonnet.
  |||
    ignoring(%s) group_right() (label_replace(count by(%s, %s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0)
  ||| % [$._config.per_instance_label, $._config.per_node_label, $._config.per_instance_label, $.namespaceMatcher(), containerName],
}
106 changes: 106 additions & 0 deletions production/loki-mixin/dashboards/loki-reads-resources.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
local utils = import 'mixin-utils/utils.libsonnet';

(import 'dashboard-utils.libsonnet') {
  grafanaDashboards+:
    {
      // 'Loki / Reads Resources': CPU, memory and disk panels for the gateway, query
      // frontend, querier, ingester and ruler.
      'loki-reads-resources.json':
        local instanceLabel = $._config.per_instance_label;
        local nodeLabel = $._config.per_node_label;
        local nsMatcher = $.namespaceMatcher();
        local querierDiskFilter = $.filterNodeDiskContainer('querier');
        local perDeviceLegend = '{{%s}} - {{device}}' % instanceLabel;

        // Row holding the CPU, working-set and Go-heap panels of a single component.
        local resourcesRow(rowTitle, component) =
          $.row(rowTitle)
          .addPanel($.containerCPUUsagePanel('CPU', component))
          .addPanel($.containerMemoryWorkingSetPanel('Memory (workingset)', component))
          .addPanel($.goHeapInUsePanel('Memory (go heap inuse)', component));

        // Untitled row with the disk panels filtered on the querier.
        local querierDiskRow =
          $.row('')
          .addPanel(
            $.panel('Disk Writes') +
            $.queryPanel(
              'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [nodeLabel, instanceLabel, querierDiskFilter],
              perDeviceLegend
            ) +
            $.stack +
            { yaxes: $.yaxes('Bps') }
          )
          .addPanel(
            $.panel('Disk Reads') +
            $.queryPanel(
              'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [nodeLabel, instanceLabel, querierDiskFilter],
              perDeviceLegend
            ) +
            $.stack +
            { yaxes: $.yaxes('Bps') }
          )
          .addPanel(
            $.panel('Disk Space Utilization') +
            $.queryPanel(
              'max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name=~"querier.*"})' % [nsMatcher, nsMatcher, nsMatcher],
              '{{persistentvolumeclaim}}'
            ) +
            { yaxes: $.yaxes('percentunit') }
          );

        // The ruler spans two rows: rule count and CPU first, then the memory panels.
        local rulerRow =
          $.row('Ruler')
          .addPanel(
            $.panel('Rules') +
            $.queryPanel(
              'sum by(%s) (cortex_prometheus_rule_group_rules{%s})' % [instanceLabel, $.jobMatcher('ruler')],
              '{{%s}}' % instanceLabel
            )
          )
          .addPanel($.containerCPUUsagePanel('CPU', 'ruler'));

        local rulerMemoryRow =
          $.row('')
          .addPanel($.containerMemoryWorkingSetPanel('Memory (workingset)', 'ruler'))
          .addPanel($.goHeapInUsePanel('Memory (go heap inuse)', 'ruler'));

        $.dashboard('Loki / Reads Resources')
        .addClusterSelectorTemplates(false)
        .addRow(resourcesRow('Gateway', 'cortex-gw'))
        .addRow(resourcesRow('Query Frontend', 'query-frontend'))
        .addRow(resourcesRow('Querier', 'querier'))
        .addRow(querierDiskRow)
        .addRow(resourcesRow('Ingester', 'ingester'))
        .addRow(rulerRow)
        .addRow(rulerMemoryRow),
    },
}
85 changes: 85 additions & 0 deletions production/loki-mixin/dashboards/loki-writes-resources.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
local utils = import 'mixin-utils/utils.libsonnet';

(import 'dashboard-utils.libsonnet') {
  grafanaDashboards+:
    {
      // 'Loki / Writes Resources': CPU, memory and disk panels for the gateway,
      // distributor and ingester.
      'loki-writes-resources.json':
        local instanceLabel = $._config.per_instance_label;
        local nodeLabel = $._config.per_node_label;
        local nsMatcher = $.namespaceMatcher();
        local ingesterDiskFilter = $.filterNodeDiskContainer('ingester');
        local perDeviceLegend = '{{%s}} - {{device}}' % instanceLabel;

        // Row holding the CPU, working-set and Go-heap panels of a single component.
        local resourcesRow(rowTitle, component) =
          $.row(rowTitle)
          .addPanel($.containerCPUUsagePanel('CPU', component))
          .addPanel($.containerMemoryWorkingSetPanel('Memory (workingset)', component))
          .addPanel($.goHeapInUsePanel('Memory (go heap inuse)', component));

        // The ingester spans three rows: streams and CPU, memory, then disk.
        local ingesterStreamsRow =
          $.row('Ingester')
          .addPanel(
            $.panel('In-memory streams') +
            $.queryPanel(
              'sum by(%s) (loki_ingester_memory_streams{%s})' % [instanceLabel, $.jobMatcher('ingester')],
              '{{%s}}' % instanceLabel
            ) +
            {
              tooltip: { sort: 2 },  // Sort descending.
            }
          )
          .addPanel($.containerCPUUsagePanel('CPU', 'ingester'));

        local ingesterMemoryRow =
          $.row('')
          .addPanel($.containerMemoryWorkingSetPanel('Memory (workingset)', 'ingester'))
          .addPanel($.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'));

        local ingesterDiskRow =
          $.row('')
          .addPanel(
            $.panel('Disk Writes') +
            $.queryPanel(
              'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s' % [nodeLabel, instanceLabel, ingesterDiskFilter],
              perDeviceLegend
            ) +
            $.stack +
            { yaxes: $.yaxes('Bps') }
          )
          .addPanel(
            $.panel('Disk Reads') +
            $.queryPanel(
              'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s' % [nodeLabel, instanceLabel, ingesterDiskFilter],
              perDeviceLegend
            ) +
            $.stack +
            { yaxes: $.yaxes('Bps') }
          )
          .addPanel(
            $.panel('Disk Space Utilization') +
            $.queryPanel(
              'max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name=~"ingester.*"})' % [nsMatcher, nsMatcher, nsMatcher],
              '{{persistentvolumeclaim}}'
            ) +
            { yaxes: $.yaxes('percentunit') }
          );

        $.dashboard('Loki / Writes Resources')
        .addClusterSelectorTemplates(false)
        .addRow(resourcesRow('Gateway', 'cortex-gw'))
        .addRow(resourcesRow('Distributor', 'distributor'))
        .addRow(ingesterStreamsRow)
        .addRow(ingesterMemoryRow)
        .addRow(ingesterDiskRow),
    },
}

0 comments on commit 28d88b3

Please sign in to comment.