From 1b1de590c82e8ac892e6efbe641c723832db4e0b Mon Sep 17 00:00:00 2001 From: beorn7 Date: Mon, 18 May 2020 16:29:14 +0200 Subject: [PATCH 1/2] Keep scrape config in line with the new Prometheus scrape config This is triggered by https://github.com/grafana/jsonnet-libs/pull/261 . The above PR removes the `instance` label. As it has turned out (see PR linked above), a sane `instance` label in Prometheus has to be unique, and that includes the case where a single container exposes metrics on two different endpoints. However, that scenario would still only result in one log stream for Loki to scrape. Therefore, Loki and Prometheus need to sync via target labels uniquely identifying a container (rather than a metrics endpoint). Those labels are namespace, pod, and container, all of which are added here. This commit removes the `container_name` label. It is the same as the `container` label and was already added to Loki previously. However, the `container_name` label is deprecated and has disappeared in K8s 1.16, so that it will soon become useless for direct joining. 
--- .../helm/promtail/templates/configmap.yaml | 20 +++++++++---------- .../ksonnet/promtail/scrape_config.libsonnet | 18 ++++++++--------- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/production/helm/promtail/templates/configmap.yaml b/production/helm/promtail/templates/configmap.yaml index 211279538f61..58eb9bd49609 100644 --- a/production/helm/promtail/templates/configmap.yaml +++ b/production/helm/promtail/templates/configmap.yaml @@ -47,11 +47,11 @@ data: - action: replace source_labels: - __meta_kubernetes_pod_name - target_label: instance + target_label: pod - action: replace source_labels: - __meta_kubernetes_pod_container_name - target_label: container_name + target_label: container - replacement: /var/log/pods/*$1/*.log separator: / source_labels: @@ -94,11 +94,11 @@ data: - action: replace source_labels: - __meta_kubernetes_pod_name - target_label: instance + target_label: pod - action: replace source_labels: - __meta_kubernetes_pod_container_name - target_label: container_name + target_label: container - replacement: /var/log/pods/*$1/*.log separator: / source_labels: @@ -147,11 +147,11 @@ data: - action: replace source_labels: - __meta_kubernetes_pod_name - target_label: instance + target_label: pod - action: replace source_labels: - __meta_kubernetes_pod_container_name - target_label: container_name + target_label: container - replacement: /var/log/pods/*$1/*.log separator: / source_labels: @@ -202,11 +202,11 @@ data: - action: replace source_labels: - __meta_kubernetes_pod_name - target_label: instance + target_label: pod - action: replace source_labels: - __meta_kubernetes_pod_container_name - target_label: container_name + target_label: container - replacement: /var/log/pods/*$1/*.log separator: / source_labels: @@ -250,11 +250,11 @@ data: - action: replace source_labels: - __meta_kubernetes_pod_name - target_label: instance + target_label: pod - action: replace source_labels: - __meta_kubernetes_pod_container_name - 
target_label: container_name + target_label: container - replacement: /var/log/pods/*$1/*.log separator: / source_labels: diff --git a/production/ksonnet/promtail/scrape_config.libsonnet b/production/ksonnet/promtail/scrape_config.libsonnet index cec75fc20669..fedc06e2d65d 100644 --- a/production/ksonnet/promtail/scrape_config.libsonnet +++ b/production/ksonnet/promtail/scrape_config.libsonnet @@ -31,7 +31,7 @@ config { regex: '__meta_kubernetes_pod_label_(.+)', }, - // Rename jobs to be <namespace>/<service> + // Rename jobs to be <namespace>/<service>. { source_labels: ['__meta_kubernetes_namespace', '__service__'], action: 'replace', @@ -40,25 +40,24 @@ config { replacement: '$1', }, - // But also include the namespace as a separate label, for routing alerts + // But also include the namespace, pod, container as separate + // labels. They uniquely identify a container. They are also + // identical to the target labels configured in Prometheus + // (but note that Loki does not use an instance label). { source_labels: ['__meta_kubernetes_namespace'], action: 'replace', target_label: 'namespace', }, - - // Rename instances to be the pod name { source_labels: ['__meta_kubernetes_pod_name'], action: 'replace', - target_label: 'instance', + target_label: 'pod', // Not 'pod_name', which disappeared in K8s 1.16. }, - - // Include container_name label { source_labels: ['__meta_kubernetes_pod_container_name'], action: 'replace', - target_label: 'container_name', + target_label: 'container', // Not 'container_name', which disappeared in K8s 1.16. }, // Kubernetes puts logs under subdirectories keyed pod UID and container_name. @@ -76,7 +75,6 @@ config { // Scrape config to scrape any pods with a 'name' label. gen_scrape_config('kubernetes-pods-name', '__meta_kubernetes_pod_uid') { prelabel_config:: [ - // Use name label as __service__. { source_labels: ['__meta_kubernetes_pod_label_name'], @@ -85,7 +83,7 @@ config { ], }, - // Scrape config to scrape any pods with a 'app' label. 
+ // Scrape config to scrape any pods with an 'app' label. gen_scrape_config('kubernetes-pods-app', '__meta_kubernetes_pod_uid') { prelabel_config:: [ // Drop pods with a 'name' label. They will have already been added by From 550e806d73bb11e931a96b1fe3c1484202dc15aa Mon Sep 17 00:00:00 2001 From: beorn7 Date: Fri, 22 May 2020 19:09:15 +0200 Subject: [PATCH 2/2] Adjust Loki dashboards to the new label order - container_name and pod_name have disappeared in K8s 1.16. - We are changing the instance name in Prometheus (and removing it in Loki altogether). The instance name used to be the pod name, so switch usage of the instance label to the pod label. The pod label will generally only appear as a target label with rolling out recent changes to target labels. This will unfortunately break dashboards if looking at older data. --- .../loki-mixin/dashboard-loki-logs.json | 18 +++++++-------- .../dashboard-loki-operational.json | 22 +++++++++---------- production/loki-mixin/dashboards.libsonnet | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/production/loki-mixin/dashboard-loki-logs.json b/production/loki-mixin/dashboard-loki-logs.json index 2c14f97523d5..b939267718cc 100644 --- a/production/loki-mixin/dashboard-loki-logs.json +++ b/production/loki-mixin/dashboard-loki-logs.json @@ -60,7 +60,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(go_goroutines{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$deployment.*\", instance=~\"$pod.*\"})", + "expr": "sum(go_goroutines{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"})", "refId": "A" } ], @@ -146,7 +146,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(go_gc_duration_seconds{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$deployment.*\", instance=~\"$pod.*\"}) by (quantile)", + "expr": "sum(go_gc_duration_seconds{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}) by (quantile)", 
"legendFormat": "{{quantile}}", "refId": "A" } @@ -233,7 +233,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"$deployment.*\", pod_name=~\"$pod\", container_name=~\"$container\"}[5m]))", + "expr": "sum(rate(container_cpu_usage_seconds_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"}[5m]))", "refId": "A" } ], @@ -319,7 +319,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"$deployment.*\", pod_name=~\"$pod\", container_name=~\"$container\"})", + "expr": "sum(container_memory_working_set_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\"})", "refId": "A" } ], @@ -405,7 +405,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"$deployment.*\", pod_name=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_transmit_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", "refId": "A" } ], @@ -491,7 +491,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"$deployment.*\", pod_name=~\"$pod\"}[5m]))", + "expr": "sum(rate(container_network_receive_bytes_total{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\"}[5m]))", "refId": "A" } ], @@ -664,7 +664,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_instance=~\"$deployment.*\", exported_instance=~\"$pod.*\", container_name=~\"$container\"}[5m])) by (level)", + "expr": 
"sum(rate(promtail_custom_bad_words_total{cluster=\"$cluster\", exported_namespace=\"$namespace\", exported_pod=~\"$deployment.*\", exported_pod=~\"$pod\", container=~\"$container\"}[5m])) by (level)", "legendFormat": "{{level}}", "refId": "A" } @@ -768,7 +768,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$deployment.*\", instance=~\"$pod.*\", container_name=~\"$container\", level=~\"$level\"}$filter[5m])) by (level)", + "expr": "sum(rate({cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\", level=~\"$level\"}$filter[5m])) by (level)", "intervalFactor": 3, "legendFormat": "{{level}}", "refId": "A" @@ -833,7 +833,7 @@ }, "targets": [ { - "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", instance=~\"$deployment.*\", instance=~\"$pod.*\", container_name=~\"$container\", level=~\"$level\"} $filter", + "expr": "{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"$deployment.*\", pod=~\"$pod\", container=~\"$container\", level=~\"$level\"} $filter", "refId": "A" } ], diff --git a/production/loki-mixin/dashboard-loki-operational.json b/production/loki-mixin/dashboard-loki-operational.json index dc1e6bf284cb..4164d3ef2e6e 100644 --- a/production/loki-mixin/dashboard-loki-operational.json +++ b/production/loki-mixin/dashboard-loki-operational.json @@ -13,7 +13,7 @@ { "datasource": "$logs", "enable": true, - "expr": "{cluster=\"$cluster\", diff_namespace=\"$namespace\", container_name=\"kube-diff-logger\"}", + "expr": "{cluster=\"$cluster\", diff_namespace=\"$namespace\", container=\"kube-diff-logger\"}", "hide": true, "iconColor": "rgba(255, 96, 96, 1)", "name": "deployments", @@ -2023,10 +2023,10 @@ "steppedLine": false, "targets": [ { - "expr": "container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"distributor.*\"}", + "expr": "container_memory_usage_bytes{cluster=\"$cluster\", 
namespace=\"$namespace\", pod=~\"distributor.*\"}", "instant": false, "intervalFactor": 3, - "legendFormat": "{{pod_name}}", + "legendFormat": "{{pod}}", "refId": "A" } ], @@ -2406,9 +2406,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (instance)", + "expr": "sum(rate(loki_distributor_bytes_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", "intervalFactor": 1, - "legendFormat": "{{instance}}", + "legendFormat": "{{pod}}", "refId": "A" } ], @@ -2494,9 +2494,9 @@ "steppedLine": false, "targets": [ { - "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (instance)", + "expr": "sum(rate(loki_distributor_lines_received_total{cluster=\"$cluster\", namespace=\"$namespace\"}[5m])) by (pod)", "intervalFactor": 1, - "legendFormat": "{{instance}}", + "legendFormat": "{{pod}}", "refId": "A" } ], @@ -2687,10 +2687,10 @@ "steppedLine": false, "targets": [ { - "expr": "container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"ingester.*\"}", + "expr": "container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"ingester.*\"}", "instant": false, "intervalFactor": 3, - "legendFormat": "{{pod_name}}", + "legendFormat": "{{pod}}", "refId": "A" } ], @@ -3633,10 +3633,10 @@ "steppedLine": false, "targets": [ { - "expr": "container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod_name=~\"querier.*\"}", + "expr": "container_memory_usage_bytes{cluster=\"$cluster\", namespace=\"$namespace\", pod=~\"querier.*\"}", "instant": false, "intervalFactor": 3, - "legendFormat": "{{pod_name}}", + "legendFormat": "{{pod}}", "refId": "A" } ], diff --git a/production/loki-mixin/dashboards.libsonnet b/production/loki-mixin/dashboards.libsonnet index 2ca8376f7031..c34b11480c26 100644 --- a/production/loki-mixin/dashboards.libsonnet 
+++ b/production/loki-mixin/dashboards.libsonnet @@ -136,7 +136,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; g.row('Flush Stats') .addPanel( g.panel('Queue Length') + - g.queryPanel('cortex_ingester_flush_queue_length{cluster="$cluster", job="$namespace/ingester"}', '{{instance}}'), + g.queryPanel('cortex_ingester_flush_queue_length{cluster="$cluster", job="$namespace/ingester"}', '{{pod}}'), ) .addPanel( g.panel('Flush Rate') +