From c35d28a0e921888e739019c698cce1554fc51dfd Mon Sep 17 00:00:00 2001 From: "renovate[bot]" <29139614+renovate[bot]@users.noreply.github.com> Date: Tue, 7 Jan 2025 16:18:16 +0000 Subject: [PATCH 1/3] chore(deps): update all dependencies --- Makefile | 2 +- ansible/requirements.txt | 4 ++-- ansible/requirements.yaml | 4 ++-- .../core/common/base/external-secrets/app/helm-release.yaml | 2 +- .../core/common/base/kube-system/reloader/helm-release.yaml | 2 +- kubernetes/core/common/base/traefik/app/helm-release.yaml | 2 +- kubernetes/crds/common/external-secrets/source.yaml | 2 +- kubernetes/crds/common/external-snapshotter/source.yaml | 2 +- kubernetes/crds/common/traefik/source.yaml | 2 +- .../common/base/monitoring/vm-stack/helm-release.yaml | 4 ++-- 10 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index e0926ca0..fff608e0 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,7 @@ OUT_DIR := ./out ENVIRONMENT := production STEP := site # renovate: datasource=github-releases depName=mike-engel/jwt-cli -JWT_VERSION := 6.1.1 +JWT_VERSION := 6.2.0 ifndef VERBOSE MAKEFLAGS += --no-print-directory diff --git a/ansible/requirements.txt b/ansible/requirements.txt index b12a146e..9af05da6 100644 --- a/ansible/requirements.txt +++ b/ansible/requirements.txt @@ -1,5 +1,5 @@ -ansible>=10.4,<10.5 -ansible-lint>=24.9,<24.10 +ansible>=10.7,<10.8 +ansible-lint>=24.12,<24.13 lxml>=5.3,<5.4 libvirt-python>=10.10,<10.11 kubernetes>=31,<31.1 diff --git a/ansible/requirements.yaml b/ansible/requirements.yaml index e92132d4..94fb7971 100644 --- a/ansible/requirements.yaml +++ b/ansible/requirements.yaml @@ -1,7 +1,7 @@ collections: - name: community.general - version: 9.4.0 + version: 9.5.3 - name: ansible.posix - version: 1.6.0 + version: 1.6.2 - name: kubernetes.core version: 5.0.0 diff --git a/kubernetes/core/common/base/external-secrets/app/helm-release.yaml b/kubernetes/core/common/base/external-secrets/app/helm-release.yaml index dedef7b6..0ed317c2 100644 --- a/kubernetes/core/common/base/external-secrets/app/helm-release.yaml +++ b/kubernetes/core/common/base/external-secrets/app/helm-release.yaml @@ -10,7 +10,7 @@ spec: chart: spec: chart: external-secrets - version: "0.10.4" + version: "0.12.1" sourceRef: kind: HelmRepository name: external-secrets diff --git a/kubernetes/core/common/base/kube-system/reloader/helm-release.yaml b/kubernetes/core/common/base/kube-system/reloader/helm-release.yaml index 4eb9a190..6c2790d2 100644 --- a/kubernetes/core/common/base/kube-system/reloader/helm-release.yaml +++ b/kubernetes/core/common/base/kube-system/reloader/helm-release.yaml @@ -10,7 +10,7 @@ spec: chart: spec: chart: reloader - version: "1.1.0" + version: "1.2.0" sourceRef: kind: HelmRepository name: stakater diff --git a/kubernetes/core/common/base/traefik/app/helm-release.yaml b/kubernetes/core/common/base/traefik/app/helm-release.yaml index 75d118c1..b5d7017b 100644 --- a/kubernetes/core/common/base/traefik/app/helm-release.yaml +++ b/kubernetes/core/common/base/traefik/app/helm-release.yaml @@ -10,7 +10,7 @@ spec: chart: spec: chart: traefik - version: "32.0.0" + version: "32.1.1" sourceRef: kind: HelmRepository name: traefik diff --git a/kubernetes/crds/common/external-secrets/source.yaml b/kubernetes/crds/common/external-secrets/source.yaml index 23a4f277..5c765fb2 100644 --- a/kubernetes/crds/common/external-secrets/source.yaml +++ b/kubernetes/crds/common/external-secrets/source.yaml @@ -7,7 +7,7 @@ metadata: spec: url: 
https://github.com/external-secrets/external-secrets ref: - tag: v0.10.4 + tag: v0.12.1 interval: 15m ignore: | # exclude all diff --git a/kubernetes/crds/common/external-snapshotter/source.yaml b/kubernetes/crds/common/external-snapshotter/source.yaml index 89458dbd..35c95f50 100644 --- a/kubernetes/crds/common/external-snapshotter/source.yaml +++ b/kubernetes/crds/common/external-snapshotter/source.yaml @@ -7,7 +7,7 @@ metadata: spec: url: https://github.com/kubernetes-csi/external-snapshotter ref: - tag: v8.1.0 + tag: v8.2.0 interval: 15m ignore: | # exclude all diff --git a/kubernetes/crds/common/traefik/source.yaml b/kubernetes/crds/common/traefik/source.yaml index 3f92ee83..ec5643cf 100644 --- a/kubernetes/crds/common/traefik/source.yaml +++ b/kubernetes/crds/common/traefik/source.yaml @@ -7,7 +7,7 @@ metadata: spec: url: https://github.com/traefik/traefik-helm-chart ref: - tag: v32.0.0 + tag: v32.1.1 interval: 15m ignore: | # exclude all diff --git a/kubernetes/services/common/base/monitoring/vm-stack/helm-release.yaml b/kubernetes/services/common/base/monitoring/vm-stack/helm-release.yaml index e080fb86..7a0bfc9d 100644 --- a/kubernetes/services/common/base/monitoring/vm-stack/helm-release.yaml +++ b/kubernetes/services/common/base/monitoring/vm-stack/helm-release.yaml @@ -10,7 +10,7 @@ spec: chart: spec: chart: victoria-metrics-k8s-stack - version: "0.25.17" + version: "0.33.2" sourceRef: kind: HelmRepository name: victoriametrics @@ -50,7 +50,7 @@ spec: fullnameOverride: victoria-metrics-operator image: repository: victoriametrics/operator - tag: v0.48.1 + tag: v0.51.2 operator: enable_converter_ownership: true From 3ee8cd44998617a9ebeeb71cc8a240247e36c987 Mon Sep 17 00:00:00 2001 From: orblazer Date: Tue, 7 Jan 2025 17:34:56 +0100 Subject: [PATCH 2/3] chore(kubernetes): remove useless values --- .../core/common/base/external-secrets/app/helm-release.yaml | 4 ---- kubernetes/core/common/base/traefik/app/helm-release.yaml | 5 ----- 2 files changed, 9 deletions(-) diff --git a/kubernetes/core/common/base/external-secrets/app/helm-release.yaml b/kubernetes/core/common/base/external-secrets/app/helm-release.yaml index 0ed317c2..016e1f19 100644 --- a/kubernetes/core/common/base/external-secrets/app/helm-release.yaml +++ b/kubernetes/core/common/base/external-secrets/app/helm-release.yaml @@ -21,10 +21,6 @@ spec: # Disable CRD installCRDs: false - crds: - createClusterExternalSecret: false - createClusterSecretStore: false - createPushSecret: false resources: requests: diff --git a/kubernetes/core/common/base/traefik/app/helm-release.yaml b/kubernetes/core/common/base/traefik/app/helm-release.yaml index b5d7017b..27167ab8 100644 --- a/kubernetes/core/common/base/traefik/app/helm-release.yaml +++ b/kubernetes/core/common/base/traefik/app/helm-release.yaml @@ -50,11 +50,6 @@ spec: enabled: true isDefaultClass: true - providers: - kubernetesIngress: - publishedService: - enabled: true - env: - name: TZ value: ${TIMEZONE} From 3b209389e80b5e0271bd477098fb3155825a90c9 Mon Sep 17 00:00:00 2001 From: orblazer Date: Tue, 7 Jan 2025 19:45:46 +0100 Subject: [PATCH 3/3] chore(monitoring): update to last version --- .../vm-stack/helm-release-patch.yaml | 18 ++- .../kube-apiserver-availability.rules.yaml | 28 ++--- .../rules/kube-apiserver-histogram.rules.yaml | 4 +- .../vm-stack/rules/kube-apiserver-slos.yaml | 66 ----------- .../kube-prometheus-node-recording.rules.yaml | 22 ---- .../vm-stack/rules/kubernetes-resources.yaml | 112 ------------------ .../rules/kubernetes-system-apiserver.yaml | 24 
++-- .../rules/kubernetes-system-kubelet.yaml | 10 +- .../vm-stack/rules/kustomization.yaml | 4 - .../monitoring/vm-stack/rules/vm-health.yaml | 94 --------------- .../monitoring/vm-stack/rules/vmagent.yaml | 32 ++--- .../monitoring/vm-stack/helm-release.yaml | 24 ++-- 12 files changed, 70 insertions(+), 368 deletions(-) delete mode 100644 kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-slos.yaml delete mode 100644 kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-prometheus-node-recording.rules.yaml delete mode 100644 kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-resources.yaml delete mode 100644 kubernetes/services/baku/base/monitoring/vm-stack/rules/vm-health.yaml diff --git a/kubernetes/services/baku/base/monitoring/vm-stack/helm-release-patch.yaml b/kubernetes/services/baku/base/monitoring/vm-stack/helm-release-patch.yaml index a051a5e0..68e5bd60 100644 --- a/kubernetes/services/baku/base/monitoring/vm-stack/helm-release-patch.yaml +++ b/kubernetes/services/baku/base/monitoring/vm-stack/helm-release-patch.yaml @@ -31,19 +31,17 @@ spec: cluster: baku alertmanager: + monzoTemplate: + enabled: false spec: configSecret: monitoring.alertmanager # Disable tweaked alert rules defaultRules: rules: - kubeApiserverAvailability: false - kubeApiserverBurnrate: false - kubeApiserverHistogram: false - kubeApiserverSlos: false - kubePrometheusNodeRecording: false - kubeScheduler: false - kubernetesResources: false - kubernetesSystem: false - vmhealth: false - vmagent: false + kubeApiserverAvailability: false # For k3s change + kubeApiserverBurnrate: false # For k3s change + kubeApiserverHistogram: false # For k3s change + kubeScheduler: false # For k3s change + kubernetesSystem: false # For k3s change + vmagent: false # remove dashboard annotation diff --git a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-availability.rules.yaml b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-availability.rules.yaml index c0b172ac..c1456e91 100644 --- a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-availability.rules.yaml +++ b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-availability.rules.yaml @@ -11,22 +11,22 @@ spec: rules: - expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 record: code_verb:apiserver_request_total:increase30d - - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) + - expr: sum by (code,cluster) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) labels: verb: read record: code:apiserver_request_total:increase30d - - expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) + - expr: sum by (code,cluster) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) labels: verb: write record: code:apiserver_request_total:increase30d - - expr: sum by (cluster, verb, scope) (increase(apiserver_request_sli_duration_seconds_count{job="apiserver"}[1h])) - record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h - - expr: sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h[30d]) * 24 * 30) - record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d - - expr: sum by (cluster, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h])) + - expr: sum by 
(verb,scope,le,cluster) (increase(apiserver_request_sli_duration_seconds_bucket[1h])) record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h - - expr: sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30) + - expr: sum by (verb,scope,le,cluster) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30) record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d + - expr: sum by (verb,scope,cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h{le="+Inf"}) + record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h + - expr: sum by (verb,scope,cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le="+Inf"}) + record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d - expr: |- 1 - ( ( @@ -101,19 +101,19 @@ spec: labels: verb: write record: apiserver_request:availability30d - - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET"}[5m])) + - expr: sum by (code,resource,cluster) (rate(apiserver_request_total{job="kubelet",verb=~"LIST|GET"}[5m])) labels: verb: read record: code_resource:apiserver_request_total:rate5m - - expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[5m])) + - expr: sum by (code,resource,cluster) (rate(apiserver_request_total{job="kubelet",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write record: code_resource:apiserver_request_total:rate5m - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="kubelet",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) + - expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kubelet",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="kubelet",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) + - expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kubelet",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="kubelet",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) + - expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kubelet",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - - expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="kubelet",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) + - expr: sum by (code,verb,cluster) (increase(apiserver_request_total{job="kubelet",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h diff --git a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-histogram.rules.yaml b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-histogram.rules.yaml index e954c395..1e0e8ed4 100644 --- a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-histogram.rules.yaml +++ b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-histogram.rules.yaml @@ -8,12 +8,12 @@ spec: groups: - name: 
kube-apiserver-histogram.rules rules: - - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + - expr: histogram_quantile(0.99, sum by (le,resource,cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kubelet",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 labels: quantile: '0.99' verb: read record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile - - expr: histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 + - expr: histogram_quantile(0.99, sum by (le,resource,cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="kubelet",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 labels: quantile: '0.99' verb: write diff --git a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-slos.yaml b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-slos.yaml deleted file mode 100644 index abf6df4d..00000000 --- a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-apiserver-slos.yaml +++ /dev/null @@ -1,66 +0,0 @@ -# Twak from https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/templates/rules/kube-apiserver-slos.yaml -# Changes: added cluster in KubeAPIErrorBudgetBurn -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: monitoring-kube-apiserver-slos -spec: - groups: - - name: kube-apiserver-slos - rules: - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: |- - sum(apiserver_request:burnrate1h) by (cluster) > (14.40 * 0.01000) - and - sum(apiserver_request:burnrate5m) by (cluster) > (14.40 * 0.01000) - for: 2m - labels: - long: 1h - severity: critical - short: 5m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: |- - sum(apiserver_request:burnrate6h) by (cluster) > (6.00 * 0.01000) - and - sum(apiserver_request:burnrate30m) by (cluster) > (6.00 * 0.01000) - for: 15m - labels: - long: 6h - severity: critical - short: 30m - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. - expr: |- - sum(apiserver_request:burnrate1d) by (cluster) > (3.00 * 0.01000) - and - sum(apiserver_request:burnrate2h) by (cluster) > (3.00 * 0.01000) - for: 1h - labels: - long: 1d - severity: warning - short: 2h - - alert: KubeAPIErrorBudgetBurn - annotations: - description: The API server is burning too much error budget. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapierrorbudgetburn - summary: The API server is burning too much error budget. 
- expr: |- - sum(apiserver_request:burnrate3d) by (cluster) > (1.00 * 0.01000) - and - sum(apiserver_request:burnrate6h) by (cluster) > (1.00 * 0.01000) - for: 3h - labels: - long: 3d - severity: warning - short: 6h diff --git a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-prometheus-node-recording.rules.yaml b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-prometheus-node-recording.rules.yaml deleted file mode 100644 index ce9c8bcb..00000000 --- a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kube-prometheus-node-recording.rules.yaml +++ /dev/null @@ -1,22 +0,0 @@ -# Tweak from https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/templates/rules/kube-prometheus-node-recording.rules.yaml -# Changes: added cluster in rules -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: monitoring-kube-prometheus-node-recording.rules -spec: - groups: - - name: kube-prometheus-node-recording.rules - rules: - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance, cluster) - record: instance:node_cpu:rate:sum - - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance, cluster) - record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance, cluster) - record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance, cluster) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cluster, cpu)) BY (instance, cluster) - record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) by (cluster) - record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cluster, cpu)) by (cluster) - record: cluster:node_cpu:ratio diff --git a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-resources.yaml b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-resources.yaml deleted file mode 100644 index d57a7a5c..00000000 --- a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-resources.yaml +++ /dev/null @@ -1,112 +0,0 @@ -# Tweak from https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-resources.yaml -# Changes: added cluster in KubeQuotaAlmostFull, KubeQuotaFullyUsed, KubeQuotaExceeded, CPUThrottlingHigh -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: monitoring-kubernetes-resources -spec: - groups: - - name: kubernetes-resources - rules: - - alert: KubeCPUOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit - summary: Cluster has overcommitted CPU resource requests. 
- expr: |- - sum(namespace_cpu:kube_pod_container_resource_requests:sum{job="kube-state-metrics",}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 - and - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 - for: 10m - labels: - severity: warning - - alert: KubeMemoryOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit - summary: Cluster has overcommitted memory resource requests. - expr: |- - sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 - and - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 - for: 10m - labels: - severity: warning - - alert: KubeCPUQuotaOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Namespaces. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: |- - sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster) - / - sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeMemoryQuotaOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted memory resource requests for Namespaces. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit - summary: Cluster has overcommitted memory resource requests. - expr: |- - sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster) - / - sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeQuotaAlmostFull - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull - summary: Namespace quota is going to be full. - expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(cluster, instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 0.9 < 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaFullyUsed - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused - summary: Namespace quota is fully used. 
- expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(cluster, instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - == 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaExceeded - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded - summary: Namespace quota has exceeded the limits. - expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(cluster, instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 1 - for: 15m - labels: - severity: warning - - alert: CPUThrottlingHigh - annotations: - description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh - summary: Processes experience elevated CPU throttling. - expr: |- - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace, cluster) - / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace, cluster) - > ( 25 / 100 ) - for: 15m - labels: - severity: info diff --git a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-system-apiserver.yaml b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-system-apiserver.yaml index c35b0db9..19d3dfe7 100644 --- a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-system-apiserver.yaml +++ b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-system-apiserver.yaml @@ -1,6 +1,5 @@ # Tweak from https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/templates/rules/kubernetes-system-apiserver.yaml # Note: in k3s job "apiserver" is combined in "kubelet". -# Changes: added cluster in KubeClientCertificateExpiration, KubeAPITerminatedRequests apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: @@ -11,28 +10,35 @@ spec: rules: - alert: KubeClientCertificateExpiration annotations: - description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days. + description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration summary: Client certificate is about to expire. - expr: apiserver_client_certificate_expiration_seconds_count{job="kubelet"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubelet"}[5m]))) < 604800 + expr: |- + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubelet"}[5m]))) < 604800 + and + on (job,instance,cluster) apiserver_client_certificate_expiration_seconds_count{job="kubelet"} > 0 for: 5m labels: severity: warning - alert: KubeClientCertificateExpiration annotations: - description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours. 
+ description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeclientcertificateexpiration summary: Client certificate is about to expire. - expr: apiserver_client_certificate_expiration_seconds_count{job="kubelet"} > 0 and on(cluster, job) histogram_quantile(0.01, sum by (cluster, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubelet"}[5m]))) < 86400 + expr: |- + histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="kubelet"}[5m]))) < 86400 + and + on (job,instance,cluster) apiserver_client_certificate_expiration_seconds_count{job="kubelet"} > 0 for: 5m labels: severity: critical - alert: KubeAggregatedAPIErrors annotations: - description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has reported errors. It has appeared unavailable {{ $value | humanize }} times averaged over the past 10m. + description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors on cluster {{ $labels.cluster }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapierrors summary: Kubernetes aggregated API has reported errors. - expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total{job="kubelet"}[10m])) > 4 + expr: sum by (instance,name,reason,cluster)(increase(aggregator_unavailable_apiservice_total{job="kubelet"}[1m])) > 0 + for: 10m labels: severity: warning - alert: KubeAggregatedAPIDown @@ -40,7 +46,7 @@ spec: description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeaggregatedapidown summary: Kubernetes aggregated API is down. - expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="kubelet"}[10m]))) * 100 < 85 + expr: (1 - max by (name,namespace,cluster)(avg_over_time(aggregator_unavailable_apiservice{job="kubelet"}[10m]))) * 100 < 85 for: 5m labels: severity: warning @@ -58,7 +64,7 @@ spec: description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeapiterminatedrequests summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. 
- expr: sum(rate(apiserver_request_terminations_total{job="kubelet"}[10m])) by (cluster) / (sum(rate(apiserver_request_total{job="kubelet"}[10m])) by (cluster) + sum(rate(apiserver_request_terminations_total{job="kubelet"}[10m])) by (cluster)) > 0.20 + expr: sum by (cluster) (rate(apiserver_request_terminations_total{job="kubelet"}[10m])) / ( sum by (cluster) (rate(apiserver_request_total{job="kubelet"}[10m])) + sum by (cluster) (rate(apiserver_request_terminations_total{job="kubelet"}[10m])) ) > 0.20 for: 5m labels: severity: warning diff --git a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-system-kubelet.yaml b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-system-kubelet.yaml index ff9cc7e5..6dcb733d 100644 --- a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-system-kubelet.yaml +++ b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kubernetes-system-kubelet.yaml @@ -32,11 +32,11 @@ spec: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubelettoomanypods summary: Kubelet is running at capacity. expr: |- - count by(cluster, node) ( - (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) + count by (node,cluster) ( + (kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on (instance,pod,namespace,cluster) group_left(node) topk by (instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"}) ) / - max by(cluster, node) ( + max by (node,cluster) ( kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1 ) > 0.95 for: 15m @@ -47,7 +47,7 @@ spec: description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubenodereadinessflapping summary: Node readiness status is flapping. - expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 + expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (node,cluster) > 2 for: 15m labels: severity: warning @@ -65,7 +65,7 @@ spec: description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubeletpodstartuplatencyhigh summary: Kubelet Pod startup latency is too high. 
- expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 + expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance,le,cluster)) * on (instance,cluster) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"} > 60 for: 15m labels: severity: warning diff --git a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kustomization.yaml b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kustomization.yaml index 0ba88b71..5e2e65ea 100644 --- a/kubernetes/services/baku/base/monitoring/vm-stack/rules/kustomization.yaml +++ b/kubernetes/services/baku/base/monitoring/vm-stack/rules/kustomization.yaml @@ -5,11 +5,7 @@ resources: - kube-apiserver-availability.rules.yaml - kube-apiserver-burnrate.rules.yaml - kube-apiserver-histogram.rules.yaml - - kube-apiserver-slos.yaml - - kube-prometheus-node-recording.rules.yaml - kube-scheduler.rules.yaml - - kubernetes-resources.yaml - kubernetes-system-apiserver.yaml - kubernetes-system-kubelet.yaml - - vm-health.yaml - vmagent.yaml diff --git a/kubernetes/services/baku/base/monitoring/vm-stack/rules/vm-health.yaml b/kubernetes/services/baku/base/monitoring/vm-stack/rules/vm-health.yaml deleted file mode 100644 index 3f33ab4a..00000000 --- a/kubernetes/services/baku/base/monitoring/vm-stack/rules/vm-health.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# Twak from https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/templates/rules/vm-health.yaml -# Changes: added cluster in TooManyLogs -apiVersion: operator.victoriametrics.com/v1beta1 -kind: VMRule -metadata: - name: monitoring-vm-health -spec: - groups: - - name: vm-health - rules: - - alert: TooManyRestarts - annotations: - description: Job {{ $labels.job }} (instance {{ $labels.instance }}) has restarted more than twice in the last 15 minutes. It might be crashlooping. - summary: '{{ $labels.job }} too many restarts (instance {{ $labels.instance }})' - expr: changes(process_start_time_seconds{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"}[15m]) > 2 - labels: - severity: critical - - alert: ServiceDown - annotations: - description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.' - summary: Service {{ $labels.job }} is down on {{ $labels.instance }} - expr: up{job=~".*(victoriametrics|vmselect|vminsert|vmstorage|vmagent|vmalert|vmsingle|vmalertmanager|vmauth).*"} == 0 - for: 2m - labels: - severity: critical - - alert: ProcessNearFDLimits - annotations: - description: Exhausting OS file descriptors limit can cause severe degradation of the process. Consider to increase the limit as fast as possible. - summary: Number of free file descriptors is less than 100 for "{{ $labels.job }}"("{{ $labels.instance }}") for the last 5m - expr: (process_max_fds - process_open_fds) < 100 - for: 5m - labels: - severity: critical - - alert: TooHighMemoryUsage - annotations: - description: Too high memory usage may result into multiple issues such as OOMs or degraded performance. Consider to either increase available memory or decrease the load on the process. 
- summary: It is more than 80% of memory used by "{{ $labels.job }}"("{{ $labels.instance }}") - expr: (min_over_time(process_resident_memory_anon_bytes[10m]) / vm_available_memory_bytes) > 0.8 - for: 5m - labels: - severity: critical - - alert: TooHighCPUUsage - annotations: - description: Too high CPU usage may be a sign of insufficient resources and make process unstable. Consider to either increase available CPU resources or decrease the load on the process. - summary: More than 90% of CPU is used by "{{ $labels.job }}"("{{ $labels.instance }}") during the last 5m - expr: rate(process_cpu_seconds_total[5m]) / process_cpu_cores_available > 0.9 - for: 5m - labels: - severity: critical - - alert: TooManyLogs - annotations: - description: "Logging rate for job \"{{ $labels.job }}\" ({{ $labels.instance }}) is {{ $value }} for last 15m.\n Worth to check logs for specific error messages." - summary: Too many logs printed for job "{{ $labels.job }}" ({{ $labels.instance }}) - expr: sum(increase(vm_log_messages_total{level="error"}[5m])) by (job, instance, cluster) > 0 - for: 15m - labels: - severity: warning - - alert: TooManyTSIDMisses - annotations: - description: "The rate of TSID misses during query lookups is too high for \"{{ $labels.job }}\" ({{ $labels.instance }}).\n Make sure you're running VictoriaMetrics of v1.85.3 or higher.\n Related issue https://github.com/VictoriaMetrics/VictoriaMetrics/issues/3502" - summary: Too many TSID misses for job "{{ $labels.job }}" ({{ $labels.instance }}) - expr: sum(rate(vm_missing_tsids_for_metric_id_total[5m])) by (job, instance) > 0 - for: 10m - labels: - severity: critical - - alert: ConcurrentInsertsHitTheLimit - annotations: - description: "The limit of concurrent inserts on instance {{ $labels.instance }} depends on the number of CPUs.\n Usually, when component constantly hits the limit it is likely the component is overloaded and requires more CPU. In some cases for components like vmagent or vminsert the alert might trigger if there are too many clients making write attempts. If vmagent's or vminsert's CPU usage and network saturation are at normal level, then it might be worth adjusting `-maxConcurrentInserts` cmd-line flag." - summary: '{{ $labels.job }} on instance {{ $labels.instance }} is constantly hitting concurrent inserts limit' - expr: avg_over_time(vm_concurrent_insert_current[1m]) >= vm_concurrent_insert_capacity - for: 15m - labels: - severity: warning - - alert: IndexDBRecordsDrop - annotations: - description: "VictoriaMetrics could skip registering new timeseries during ingestion if they fail the validation process. \nFor example, `reason=too_long_item` means that time series cannot exceed 64KB. Please, reduce the number \nof labels or label values for such series. Or enforce these limits via `-maxLabelsPerTimeseries` and \n`-maxLabelValueLen` command-line flags.\n" - summary: IndexDB skipped registering items during data ingestion with reason={{`{{`}} $labels.reason {{`}}`}}. - expr: increase(vm_indexdb_items_dropped_total[5m]) > 0 - labels: - severity: critical - - alert: TooLongLabelValues - annotations: - description: "The maximum length of a label value is limited via `-maxLabelValueLen` cmd-line flag. 
\nLonger label values are truncated and may result into time series overlapping.\nPlease, check your logs to find which labels were truncated and \neither reduce the size of label values or increase `-maxLabelValueLen`.\n" - summary: VictoriaMetrics truncates too long label values - expr: increase(vm_too_long_label_values_total[5m]) > 0 - labels: - severity: critical - - alert: TooLongLabelNames - annotations: - description: The maximum length of a label name is limited by 256 bytes. Longer label names are truncated and may result into time series overlapping. - summary: VictoriaMetrics truncates too long label names - expr: increase(vm_too_long_label_names_total[5m]) > 0 - labels: - severity: critical diff --git a/kubernetes/services/baku/base/monitoring/vm-stack/rules/vmagent.yaml b/kubernetes/services/baku/base/monitoring/vm-stack/rules/vmagent.yaml index 4dd0b167..8f7cda18 100644 --- a/kubernetes/services/baku/base/monitoring/vm-stack/rules/vmagent.yaml +++ b/kubernetes/services/baku/base/monitoring/vm-stack/rules/vmagent.yaml @@ -1,7 +1,5 @@ # Tweak from https://github.com/VictoriaMetrics/helm-charts/blob/master/charts/victoria-metrics-k8s-stack/templates/rules/vmagent.yaml -# Changes: -# - add cluster to PersistentQueueIsDroppingData, -# - remove dashboard annotation +# Changes: remove dashboard annotation apiVersion: operator.victoriametrics.com/v1beta1 kind: VMRule metadata: @@ -16,31 +14,35 @@ spec: annotations: description: Vmagent dropped {{ $value | humanize1024 }} from persistent queue on instance {{ $labels.instance }} for the last 10m. summary: Instance {{ $labels.instance }} is dropping data from persistent queue - expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) by (job, instance, cluster) > 0 + expr: sum(increase(vm_persistentqueue_bytes_dropped_total[5m])) without (path) > 0 for: 10m labels: severity: critical - alert: RejectedRemoteWriteDataBlocksAreDropped annotations: - summary: Job "{{ $labels.job }}" on instance {{ $labels.instance }} drops the rejected by remote-write server data blocks. Check the logs to find the reason for rejects. - expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) by (job, instance, cluster) > 0 + description: Job "{{ $labels.job }}" on instance {{ $labels.instance }} drops the rejected by remote-write server data blocks. Check the logs to find the reason for rejects. + summary: Vmagent is dropping data blocks that are rejected by remote storage + condition: '{{ true }}' + expr: sum(increase(vmagent_remotewrite_packets_dropped_total[5m])) without (url) > 0 for: 15m labels: severity: warning - alert: TooManyScrapeErrors annotations: - summary: Job "{{ $labels.job }}" on instance {{ $labels.instance }} fails to scrape targets for last 15m - expr: sum(increase(vm_promscrape_scrapes_failed_total[5m])) by (job, instance, cluster) > 0 + description: Job "{{ $labels.job }}" on instance {{ $labels.instance }} fails to scrape targets for last 15m + summary: Vmagent fails to scrape one or more targets + expr: increase(vm_promscrape_scrapes_failed_total[5m]) > 0 for: 15m labels: severity: warning - alert: TooManyWriteErrors annotations: - summary: Job "{{ $labels.job }}" on instance {{ $labels.instance }} responds with errors to write requests for last 15m. + description: Job "{{ $labels.job }}" on instance {{ $labels.instance }} responds with errors to write requests for last 15m. 
+ summary: Vmagent responds with too many errors on data ingestion protocols expr: |- - (sum(increase(vm_ingestserver_request_errors_total[5m])) by (job, instance, cluster) + (sum(increase(vm_ingestserver_request_errors_total[5m])) without (name,net,type) + - sum(increase(vmagent_http_request_errors_total[5m])) by (job, instance, cluster)) > 0 + sum(increase(vmagent_http_request_errors_total[5m])) without (path,protocol)) > 0 for: 15m labels: severity: warning @@ -48,7 +50,7 @@ spec: annotations: description: "Vmagent fails to push data via remote write protocol to destination \"{{ $labels.url }}\"\n Ensure that destination is up and reachable." summary: Job "{{ $labels.job }}" on instance {{ $labels.instance }} fails to push to remote storage - expr: sum(rate(vmagent_remotewrite_retries_count_total[5m])) by(job, instance, cluster, url) > 0 + expr: rate(vmagent_remotewrite_retries_count_total[5m]) > 0 for: 15m labels: severity: warning @@ -56,7 +58,7 @@ spec: annotations: description: "The remote write connection between vmagent \"{{ $labels.job }}\" (instance {{ $labels.instance }}) and destination \"{{ $labels.url }}\" is saturated by more than 90% and vmagent won't be able to keep up.\n This usually means that `-remoteWrite.queues` command-line flag must be increased in order to increase the number of connections per each remote storage." summary: Remote write connection from "{{ $labels.job }}" (instance {{ $labels.instance }}) to {{ $labels.url }} is saturated - expr: "sum(rate(vmagent_remotewrite_send_duration_seconds_total[5m])) by(job, instance, cluster, url) \n> 0.9 * max(vmagent_remotewrite_queues) by(job, instance, cluster, url)" + expr: "(\n rate(vmagent_remotewrite_send_duration_seconds_total[5m])\n / \n vmagent_remotewrite_queues\n) > 0.9" for: 15m labels: severity: warning @@ -103,14 +105,14 @@ spec: - alert: StreamAggrFlushTimeout annotations: description: 'Stream aggregation process can''t keep up with the load and might produce incorrect aggregation results. Check logs for more details. Possible solutions: increase aggregation interval; aggregate smaller number of series; reduce samples'' ingestion rate to stream aggregation.' - summary: Streaming aggregation at "{{`{{`}} $labels.job {{`}}`}}" (instance {{`{{`}} $labels.instance {{`}}`}}) can't be finished within the configured aggregation interval. + summary: Streaming aggregation at "{{ $labels.job }}" (instance {{ $labels.instance }}) can't be finished within the configured aggregation interval. expr: increase(vm_streamaggr_flush_timeouts_total[5m]) > 0 labels: severity: warning - alert: StreamAggrDedupFlushTimeout annotations: description: 'Deduplication process can''t keep up with the load and might produce incorrect results. Check docs https://docs.victoriametrics.com/stream-aggregation/#deduplication and logs for more details. Possible solutions: increase deduplication interval; deduplicate smaller number of series; reduce samples'' ingestion rate.' - summary: Deduplication "{{`{{`}} $labels.job {{`}}`}}" (instance {{`{{`}} $labels.instance {{`}}`}}) can't be finished within configured deduplication interval. + summary: Deduplication "{{ $labels.job }}" (instance {{ $labels.instance }}) can't be finished within configured deduplication interval. 
expr: increase(vm_streamaggr_dedup_flush_timeouts_total[5m]) > 0 labels: severity: warning diff --git a/kubernetes/services/common/base/monitoring/vm-stack/helm-release.yaml b/kubernetes/services/common/base/monitoring/vm-stack/helm-release.yaml index 7a0bfc9d..3c9a0c0f 100644 --- a/kubernetes/services/common/base/monitoring/vm-stack/helm-release.yaml +++ b/kubernetes/services/common/base/monitoring/vm-stack/helm-release.yaml @@ -21,18 +21,6 @@ spec: upgrade: crds: Skip values: - # Remove slack things - alertmanager: - config: - route: - receiver: noop - routes: [] - receivers: - - name: noop - - monzoTemplate: - enabled: false - kubelet: vmScrape: spec: @@ -64,6 +52,13 @@ spec: cpu: 120m memory: 320Mi + # Disable CRDs + crds: + enabled: false + plain: false + cleanup: + enabled: false + # Disable unneeded alert rules defaultRules: groups: @@ -85,13 +80,12 @@ spec: enabled: false # Disable default grafana dashboards - defaultDashboardsEnabled: false + defaultDashboards: + enabled: false ## Disable embedded grafana for deploy own grafana: enabled: false # Disable CRDs - crds: - enabled: false prometheus-operator-crds: enabled: false
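
The pattern PATCH 3/3 leans on: each chart-generated rule group that needs a k3s-specific tweak (on k3s the apiserver job is folded into "kubelet", hence the "For k3s change" comments) is switched off via defaultRules.rules.<group>: false and replaced by a hand-maintained VMRule under rules/. A minimal sketch of that pairing is below; the resource name "monitoring-vmagent" and group name "vmagent" are assumptions for illustration, and only the TooManyScrapeErrors rule is a trimmed copy of what rules/vmagent.yaml ships above.

# HelmRelease values fragment (mirrors helm-release-patch.yaml in this series)
defaultRules:
  rules:
    vmagent: false            # chart stops rendering its own vmagent group
---
# Hand-maintained replacement kept under rules/ (name and group are illustrative here)
apiVersion: operator.victoriametrics.com/v1beta1
kind: VMRule
metadata:
  name: monitoring-vmagent
spec:
  groups:
    - name: vmagent
      rules:
        - alert: TooManyScrapeErrors
          annotations:
            summary: Vmagent fails to scrape one or more targets
          expr: increase(vm_promscrape_scrapes_failed_total[5m]) > 0
          for: 15m
          labels:
            severity: warning

Keeping the tweaked rules as standalone VMRule manifests, rather than inline chart value overrides, presumably lets Renovate bump victoria-metrics-k8s-stack (as in PATCH 1/3) without touching the k3s-adjusted expressions.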