Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add loki resource usage dashboard for read and write path #3584

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions production/loki-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
  _config+:: {
    // Tags attached to the generated dashboards.
    tags: ['loki'],

    // Label that tells individual application instances apart
    // (e.g. 'pod' in a Kubernetes install).
    per_instance_label: 'pod',

    // Label that tells individual nodes (i.e. servers) apart.
    per_node_label: 'instance',
  },
}
6 changes: 5 additions & 1 deletion production/loki-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
// Merges the mixin configuration with every dashboard definition into a single object.
// Each import is joined with `+`; a missing `+` between two parenthesised imports
// would be parsed as a function call rather than an object merge.
(import 'config.libsonnet') +
(import 'dashboards/loki-chunks.libsonnet') +
(import 'dashboards/loki-logs.libsonnet') +
(import 'dashboards/loki-operational.libsonnet') +
(import 'dashboards/loki-reads.libsonnet') +
(import 'dashboards/loki-writes.libsonnet') +
(import 'dashboards/loki-writes-resources.libsonnet') +
(import 'dashboards/loki-reads-resources.libsonnet')

98 changes: 98 additions & 0 deletions production/loki-mixin/dashboards/dashboard-utils.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
local utils = import 'mixin-utils/utils.libsonnet';

(import 'grafana-builder/grafana.libsonnet') {
// Wraps the inherited dashboard constructor so that every dashboard also offers:
// - addRowIf, to append a row conditionally,
// - addClusterSelectorTemplates, which sets the default tags, adds a dropdown of
//   links that propagate the selected cluster, and registers the `cluster` and
//   `namespace` templates.
dashboard(title)::
  super.dashboard(title) + {
    // Appends `row` when `condition` is true; otherwise returns the dashboard untouched.
    addRowIf(condition, row)::
      if !condition then self else self.addRow(row),

    // `multi` chooses between addMultiTemplate (true, the default) and addTemplate (false)
    // when registering the `cluster` and `namespace` templates from `loki_build_info`.
    addClusterSelectorTemplates(multi=true)::
      // Dropdown linking to the other dashboards that carry the same tags.
      local dashboardsLink = {
        title: 'Loki Dashboards',
        type: 'dashboards',
        tags: $._config.tags,
        icon: 'external link',
        asDropdown: true,
        includeVars: true,
        keepTime: true,
        targetBlank: false,
      };
      local tagged = self {
        tags: $._config.tags,
        links: [dashboardsLink],
      };

      if !multi then
        tagged.addTemplate('cluster', 'loki_build_info', 'cluster')
        .addTemplate('namespace', 'loki_build_info', 'namespace')
      else
        tagged.addMultiTemplate('cluster', 'loki_build_info', 'cluster')
        .addMultiTemplate('namespace', 'loki_build_info', 'namespace'),
  },

// Builds the label-matcher fragment selecting series of `job` within the
// dashboard's `$cluster` and `$namespace` template variables.
jobMatcher(job)::
  std.format('cluster=~"$cluster", job=~"($namespace)/%s"', job),

// Returns the label-matcher fragment restricting a query to the dashboard's
// `$cluster` and `$namespace` template variables (no job restriction).
namespaceMatcher()::
'cluster=~"$cluster", namespace=~"$namespace"',

// Panel showing the CPU usage rate of `containerName`, one series per instance,
// plus a red, unfilled `limit` series computed as the minimum of
// container_spec_cpu_quota / container_spec_cpu_period.
//
// The grouping label and legend use `$._config.per_instance_label` (instead of a
// hard-coded `pod`) so this panel stays consistent with goHeapInUsePanel and
// filterNodeDiskContainer when the label is overridden. With the default
// configuration ('pod') the generated queries are unchanged.
containerCPUUsagePanel(title, containerName)::
  local instanceLabel = $._config.per_instance_label;

  $.panel(title) +
  $.queryPanel([
    'sum by(%s) (rate(container_cpu_usage_seconds_total{%s,container="%s"}[$__rate_interval]))' % [instanceLabel, $.namespaceMatcher(), containerName],
    'min(container_spec_cpu_quota{%s,container="%s"} / container_spec_cpu_period{%s,container="%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName],
  ], ['{{%s}}' % instanceLabel, 'limit']) +
  {
    seriesOverrides: [
      {
        alias: 'limit',
        color: '#E02F44',
        fill: 0,
      },
    ],
    tooltip: { sort: 2 },  // Sort descending.
  },

// Panel showing the working-set memory of `containerName`, one series per instance,
// plus a red, unfilled `limit` series taken from the minimum non-zero
// container_spec_memory_limit_bytes.
//
// The grouping label and legend use `$._config.per_instance_label` (instead of a
// hard-coded `pod`) so this panel stays consistent with goHeapInUsePanel and
// filterNodeDiskContainer when the label is overridden. With the default
// configuration ('pod') the generated queries are unchanged.
containerMemoryWorkingSetPanel(title, containerName)::
  local instanceLabel = $._config.per_instance_label;

  $.panel(title) +
  $.queryPanel([
    // We use "max" instead of "sum" otherwise during a rolling update of a statefulset we will end up
    // summing the memory of the old pod (whose metric will be stale for 5m) to the new pod.
    'max by(%s) (container_memory_working_set_bytes{%s,container="%s"})' % [instanceLabel, $.namespaceMatcher(), containerName],
    // "> 0" drops series whose limit is reported as 0.
    'min(container_spec_memory_limit_bytes{%s,container="%s"} > 0)' % [$.namespaceMatcher(), containerName],
  ], ['{{%s}}' % instanceLabel, 'limit']) +
  {
    seriesOverrides: [
      {
        alias: 'limit',
        color: '#E02F44',
        fill: 0,
      },
    ],
    yaxes: $.yaxes('bytes'),
    tooltip: { sort: 2 },  // Sort descending.
  },

// Panel plotting go_memstats_heap_inuse_bytes for `jobName`, summed per value of
// the configured per-instance label and displayed on a bytes axis.
goHeapInUsePanel(title, jobName)::
  local instanceLabel = $._config.per_instance_label;
  local heapQuery = 'sum by(%s) (go_memstats_heap_inuse_bytes{%s})' % [instanceLabel, $.jobMatcher(jobName)];
  local legend = '{{%s}}' % instanceLabel;

  $.panel(title) +
  $.queryPanel(heapQuery, legend) +
  {
    tooltip: { sort: 2 },  // Sort descending.
    yaxes: $.yaxes('bytes'),
  },

// Returns a PromQL fragment meant to be appended after a `+` to a per-node,
// per-device disk query. It counts container_fs_writes_bytes_total series of
// `containerName` (skipping devices matching ".*sda.*"), strips the "/dev/"
// prefix from the `device` label, and multiplies by 0 so the addition leaves the
// left-hand values as they are while matching with `ignoring(<instance label>) group_right()`.
filterNodeDiskContainer(containerName)::
  local instanceLabel = $._config.per_instance_label;
  local nodeLabel = $._config.per_node_label;
  |||
    ignoring(%s) group_right() (label_replace(count by(%s, %s, device) (container_fs_writes_bytes_total{%s,container="%s",device!~".*sda.*"}), "device", "$1", "device", "/dev/(.*)") * 0)
  ||| % [instanceLabel, nodeLabel, instanceLabel, $.namespaceMatcher(), containerName],
}
106 changes: 106 additions & 0 deletions production/loki-mixin/dashboards/loki-reads-resources.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
local utils = import 'mixin-utils/utils.libsonnet';

(import 'dashboard-utils.libsonnet') {
  // Row with the three standard resource panels (CPU, working set, Go heap) of one component.
  local resourcesRow(rowTitle, component) =
    $.row(rowTitle)
    .addPanel($.containerCPUUsagePanel('CPU', component))
    .addPanel($.containerMemoryWorkingSetPanel('Memory (workingset)', component))
    .addPanel($.goHeapInUsePanel('Memory (go heap inuse)', component)),

  // Stacked bytes-per-second panel. `queryTemplate` is formatted with the node
  // label, the instance label and the querier disk filter, in that order.
  local querierDiskPanel(panelTitle, queryTemplate) =
    $.panel(panelTitle) +
    $.queryPanel(
      queryTemplate % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('querier')],
      '{{%s}} - {{device}}' % $._config.per_instance_label
    ) +
    $.stack +
    { yaxes: $.yaxes('Bps') },

  grafanaDashboards+:
    {
      // Resource usage (CPU, memory, disk) of the components on the read path.
      'loki-reads-resources.json':
        local ns = $.namespaceMatcher();

        $.dashboard('Loki / Reads Resources')
        .addClusterSelectorTemplates(false)
        .addRow(resourcesRow('Gateway', 'cortex-gw'))
        .addRow(resourcesRow('Query Frontend', 'query-frontend'))
        .addRow(resourcesRow('Querier', 'querier'))
        .addRow(
          // Untitled row: disk activity and volume usage of the queriers.
          $.row('')
          .addPanel(
            querierDiskPanel(
              'Disk Writes',
              'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s'
            )
          )
          .addPanel(
            querierDiskPanel(
              'Disk Reads',
              'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s'
            )
          )
          .addPanel(
            $.panel('Disk Space Utilization') +
            $.queryPanel(
              'max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name=~"querier.*"})' % [ns, ns, ns],
              '{{persistentvolumeclaim}}'
            ) +
            { yaxes: $.yaxes('percentunit') }
          )
        )
        .addRow(resourcesRow('Ingester', 'ingester'))
        .addRow(
          $.row('Ruler')
          .addPanel(
            $.panel('Rules') +
            $.queryPanel(
              'sum by(%s) (cortex_prometheus_rule_group_rules{%s})' % [$._config.per_instance_label, $.jobMatcher('ruler')],
              '{{%s}}' % $._config.per_instance_label
            )
          )
          .addPanel($.containerCPUUsagePanel('CPU', 'ruler'))
        )
        .addRow(
          // Untitled row: the ruler memory panels continue on their own row.
          $.row('')
          .addPanel($.containerMemoryWorkingSetPanel('Memory (workingset)', 'ruler'))
          .addPanel($.goHeapInUsePanel('Memory (go heap inuse)', 'ruler'))
        ),
    },
}
85 changes: 85 additions & 0 deletions production/loki-mixin/dashboards/loki-writes-resources.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
local utils = import 'mixin-utils/utils.libsonnet';

(import 'dashboard-utils.libsonnet') {
  // Row with the three standard resource panels (CPU, working set, Go heap) of one component.
  local resourcesRow(rowTitle, component) =
    $.row(rowTitle)
    .addPanel($.containerCPUUsagePanel('CPU', component))
    .addPanel($.containerMemoryWorkingSetPanel('Memory (workingset)', component))
    .addPanel($.goHeapInUsePanel('Memory (go heap inuse)', component)),

  // Stacked bytes-per-second panel. `queryTemplate` is formatted with the node
  // label, the instance label and the ingester disk filter, in that order.
  local ingesterDiskPanel(panelTitle, queryTemplate) =
    $.panel(panelTitle) +
    $.queryPanel(
      queryTemplate % [$._config.per_node_label, $._config.per_instance_label, $.filterNodeDiskContainer('ingester')],
      '{{%s}} - {{device}}' % $._config.per_instance_label
    ) +
    $.stack +
    { yaxes: $.yaxes('Bps') },

  grafanaDashboards+:
    {
      // Resource usage (CPU, memory, disk) of the components on the write path.
      'loki-writes-resources.json':
        local ns = $.namespaceMatcher();

        $.dashboard('Loki / Writes Resources')
        .addClusterSelectorTemplates(false)
        .addRow(resourcesRow('Gateway', 'cortex-gw'))
        .addRow(resourcesRow('Distributor', 'distributor'))
        .addRow(
          $.row('Ingester')
          .addPanel(
            $.panel('In-memory streams') +
            $.queryPanel(
              'sum by(%s) (loki_ingester_memory_streams{%s})' % [$._config.per_instance_label, $.jobMatcher('ingester')],
              '{{%s}}' % $._config.per_instance_label
            ) +
            {
              tooltip: { sort: 2 },  // Sort descending.
            }
          )
          .addPanel($.containerCPUUsagePanel('CPU', 'ingester'))
        )
        .addRow(
          // Untitled row: the ingester memory panels continue on their own row.
          $.row('')
          .addPanel($.containerMemoryWorkingSetPanel('Memory (workingset)', 'ingester'))
          .addPanel($.goHeapInUsePanel('Memory (go heap inuse)', 'ingester'))
        )
        .addRow(
          // Untitled row: disk activity and volume usage of the ingesters.
          $.row('')
          .addPanel(
            ingesterDiskPanel(
              'Disk Writes',
              'sum by(%s, %s, device) (rate(node_disk_written_bytes_total[$__rate_interval])) + %s'
            )
          )
          .addPanel(
            ingesterDiskPanel(
              'Disk Reads',
              'sum by(%s, %s, device) (rate(node_disk_read_bytes_total[$__rate_interval])) + %s'
            )
          )
          .addPanel(
            $.panel('Disk Space Utilization') +
            $.queryPanel(
              'max by(persistentvolumeclaim) (kubelet_volume_stats_used_bytes{%s} / kubelet_volume_stats_capacity_bytes{%s}) and count by(persistentvolumeclaim) (kube_persistentvolumeclaim_labels{%s,label_name=~"ingester.*"})' % [ns, ns, ns],
              '{{persistentvolumeclaim}}'
            ) +
            { yaxes: $.yaxes('percentunit') }
          )
        ),
    },
}