From 46dd6ffeb3ec9ae3e06c32b3df96d3ce1bac8304 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 14:06:07 +0100 Subject: [PATCH 01/16] Add ci steps Signed-off-by: Michel Hollands --- .github/workflows/helm-ci.yml | 101 +++ Makefile | 10 + charts/meta-monitoring/Makefile | 7 + charts/meta-monitoring/src/.yamllint.yaml | 4 + .../meta-monitoring/src/rules/loki-rules.yaml | 100 +-- .../src/rules/mimir-rules.yaml | 812 +++++++++--------- .../src/rules/tempo-rules.yaml | 26 +- 7 files changed, 591 insertions(+), 469 deletions(-) create mode 100644 .github/workflows/helm-ci.yml create mode 100644 Makefile create mode 100644 charts/meta-monitoring/Makefile create mode 100644 charts/meta-monitoring/src/.yamllint.yaml diff --git a/.github/workflows/helm-ci.yml b/.github/workflows/helm-ci.yml new file mode 100644 index 0000000..44e17e8 --- /dev/null +++ b/.github/workflows/helm-ci.yml @@ -0,0 +1,101 @@ +--- +name: helm-ci +on: + pull_request: + paths: + - "production/helm/**" + +env: + CT_CONFIGFILE: production/helm/ct.yaml + +jobs: + call-lint: + name: Lint Helm Chart + runs-on: ubuntu-latest + steps: + - name: Checkout Code + uses: actions/checkout@v3 + + # - name: Check Docs + # run: | + # docker run --rm --volume "$(pwd):/helm-docs" -u "$(id -u)" jnorwood/helm-docs:v1.11.0 + # if ! git diff --exit-code; then + # echo "Documentation not up to date. Please run helm-docs and commit changes!" >&2 + # exit 1 + # fi + + - name: Lint Yaml + run: make helm-lint + + # - name: Lint Code Base + # uses: docker://github/super-linter:v3.12.0 + # env: + # FILTER_REGEX_EXCLUDE: .*(README\.md|Chart\.yaml|NOTES.txt).* + # FILTER_REGEX_INCLUDE: .*charts/meta-monitoring/.* + # VALIDATE_ALL_CODEBASE: false + # VALIDATE_KUBERNETES_KUBEVAL: false + # VALIDATE_YAML: false + # VALIDATE_GO: false + # DEFAULT_BRANCH: main + # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + call-test: + name: Test Helm Chart + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Helm + uses: azure/setup-helm@v3 + with: + version: v3.8.2 + + # Python is required because `ct lint` runs Yamale (https://github.com/23andMe/Yamale) and + # yamllint (https://github.com/adrienverge/yamllint) which require Python + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: 3.7 + + - name: Set up chart-testing + uses: helm/chart-testing-action@v2.4.0 + + - name: Run chart-testing (list-changed) + id: list-changed + run: | + changed=$(ct list-changed --config "${CT_CONFIGFILE}") + if [[ -n "$changed" ]]; then + echo "changed=true" >> $GITHUB_OUTPUT + fi + + - name: Run chart-testing (lint) + run: ct lint --config "${CT_CONFIGFILE}" --check-version-increment=false + + - name: Create kind cluster + uses: helm/kind-action@v1.8.0 + if: steps.list-changed.outputs.changed == 'true' + + - name: Install prometheus operator + id: install-prometheus + if: steps.list-changed.outputs.changed == 'true' + run: | + kubectl create namespace prometheus + + helm install prometheus prometheus-community/kube-prometheus-stack \ + --namespace prometheus \ + --set grafana.enabled=false \ + --set prometheus.prometheusSpec.serviceMonitorSelector.matchLabels.release=prometheus + + kubectl --namespace prometheus get pods -l "release=prometheus" + kubectl --namespace prometheus get services -l "release=prometheus" + + - name: Run chart-testing (install) + run: | + changed=$(ct list-changed --config "${CT_CONFIGFILE}") + if [[ "$changed" == "charts/enterprise-metrics" ]]; then + # Do not run `ct install` for enterprise-metrics + exit 0 + fi + ct install --config "${CT_CONFIGFILE}" \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..00e72da --- /dev/null +++ b/Makefile @@ -0,0 +1,10 @@ +# Adapted from https://www.thapaliya.com/en/writings/well-documented-makefiles/ +.PHONY: help +help: ## Display this help and any documented user-facing targets. Other undocumented targets may be present in the Makefile. +help: + @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \n\nTargets:\n"} /^[a-zA-Z_-]+:.*?##/ { printf " %-45s %s\n", $$1, $$2 }' $(MAKEFILE_LIST) + +.PHONY: helm-lint + +helm-lint: ## run helm linter + $(MAKE) -BC charts/meta-monitoring lint diff --git a/charts/meta-monitoring/Makefile b/charts/meta-monitoring/Makefile new file mode 100644 index 0000000..4b56414 --- /dev/null +++ b/charts/meta-monitoring/Makefile @@ -0,0 +1,7 @@ +.DEFAULT_GOAL := all +.PHONY: lint lint-yaml + +lint: lint-yaml + +lint-yaml: + yamllint -c $(CURDIR)/src/.yamllint.yaml $(CURDIR)/src diff --git a/charts/meta-monitoring/src/.yamllint.yaml b/charts/meta-monitoring/src/.yamllint.yaml new file mode 100644 index 0000000..19e5933 --- /dev/null +++ b/charts/meta-monitoring/src/.yamllint.yaml @@ -0,0 +1,4 @@ +--- +rules: + quoted-strings: + required: true diff --git a/charts/meta-monitoring/src/rules/loki-rules.yaml b/charts/meta-monitoring/src/rules/loki-rules.yaml index 33196c1..d25cd98 100644 --- a/charts/meta-monitoring/src/rules/loki-rules.yaml +++ b/charts/meta-monitoring/src/rules/loki-rules.yaml @@ -1,53 +1,53 @@ groups: -- name: loki_rules +- name: "loki_rules" rules: - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) - by (cluster, job) - record: cluster_job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, - route) - record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:loki_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:loki_request_duration_seconds:50quantile" + - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) + by (cluster, job)" + record: "cluster_job:loki_request_duration_seconds:avg" + - expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)" + record: "cluster_job:loki_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job)" + record: "cluster_job:loki_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job)" + record: "cluster_job:loki_request_duration_seconds_count:sum_rate" + - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route))" + record: "cluster_job_route:loki_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route))" + record: "cluster_job_route:loki_request_duration_seconds:50quantile" + - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)" + record: "cluster_job_route:loki_request_duration_seconds:avg" + - expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route)" + record: "cluster_job_route:loki_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route)" + record: "cluster_job_route:loki_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route)" + record: "cluster_job_route:loki_request_duration_seconds_count:sum_rate" + - expr: "histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route))" + record: "cluster_namespace_job_route:loki_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route))" + record: "cluster_namespace_job_route:loki_request_duration_seconds:50quantile" + - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate \ No newline at end of file + namespace, job, route)" + record: "cluster_namespace_job_route:loki_request_duration_seconds:avg" + - expr: "sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route)" + record: "cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route)" + record: "cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route)" + record: "cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate" diff --git a/charts/meta-monitoring/src/rules/mimir-rules.yaml b/charts/meta-monitoring/src/rules/mimir-rules.yaml index 1344cee..1a565fb 100644 --- a/charts/meta-monitoring/src/rules/mimir-rules.yaml +++ b/charts/meta-monitoring/src/rules/mimir-rules.yaml @@ -1,322 +1,322 @@ groups: -- name: mimir_api_1 +- name: "mimir_api_1" rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) - by (cluster, job) - record: cluster_job:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_request_duration_seconds_count:sum_rate -- name: mimir_api_2 + - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_request_duration_seconds:50quantile" + - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_request_duration_seconds_count[1m])) + by (cluster, job)" + record: "cluster_job:cortex_request_duration_seconds:avg" + - expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job)" + record: "cluster_job:cortex_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job)" + record: "cluster_job:cortex_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job)" + record: "cluster_job:cortex_request_duration_seconds_count:sum_rate" +- name: "mimir_api_2" rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, - route) - record: cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:cortex_request_duration_seconds_count:sum_rate -- name: mimir_api_3 + - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route))" + record: "cluster_job_route:cortex_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route))" + record: "cluster_job_route:cortex_request_duration_seconds:50quantile" + - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route) + / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)" + record: "cluster_job_route:cortex_request_duration_seconds:avg" + - expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, job, + route)" + record: "cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, job, route)" + record: "cluster_job_route:cortex_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, job, route)" + record: "cluster_job_route:cortex_request_duration_seconds_count:sum_rate" +- name: "mimir_api_3" rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_request_duration_seconds:50quantile - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, + - expr: "histogram_quantile(0.99, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route))" + record: "cluster_namespace_job_route:cortex_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route))" + record: "cluster_namespace_job_route:cortex_request_duration_seconds:50quantile" + - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds:avg - - expr: sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate -- name: mimir_querier_api + namespace, job, route)" + record: "cluster_namespace_job_route:cortex_request_duration_seconds:avg" + - expr: "sum(rate(cortex_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, + job, route)" + record: "cluster_namespace_job_route:cortex_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(cortex_request_duration_seconds_sum[1m])) by (cluster, namespace, + job, route)" + record: "cluster_namespace_job_route:cortex_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(cortex_request_duration_seconds_count[1m])) by (cluster, namespace, + job, route)" + record: "cluster_namespace_job_route:cortex_request_duration_seconds_count:sum_rate" +- name: "mimir_querier_api" rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_querier_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_querier_request_duration_seconds:50quantile" + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds:avg - - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_querier_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job)" + record: "cluster_job:cortex_querier_request_duration_seconds:avg" + - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + job)" + record: "cluster_job:cortex_querier_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job)" + record: "cluster_job:cortex_querier_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + job)" + record: "cluster_job:cortex_querier_request_duration_seconds_count:sum_rate" + - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route))" + record: "cluster_job_route:cortex_querier_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, job, route))" + record: "cluster_job_route:cortex_querier_request_duration_seconds:50quantile" + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) by - (cluster, job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds:avg - - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - job, route) - record: cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + (cluster, job, route)" + record: "cluster_job_route:cortex_querier_request_duration_seconds:avg" + - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + job, route)" + record: "cluster_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + job, route)" + record: "cluster_job_route:cortex_querier_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + job, route)" + record: "cluster_job_route:cortex_querier_request_duration_seconds_count:sum_rate" + - expr: "histogram_quantile(0.99, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route))" + record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) + by (le, cluster, namespace, job, route))" + record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:50quantile" + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(cortex_querier_request_duration_seconds_count[1m])) - by (cluster, namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg - - expr: sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate -- name: mimir_cache + by (cluster, namespace, job, route)" + record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds:avg" + - expr: "sum(rate(cortex_querier_request_duration_seconds_bucket[1m])) by (le, cluster, + namespace, job, route)" + record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(cortex_querier_request_duration_seconds_sum[1m])) by (cluster, + namespace, job, route)" + record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(cortex_querier_request_duration_seconds_count[1m])) by (cluster, + namespace, job, route)" + record: "cluster_namespace_job_route:cortex_querier_request_duration_seconds_count:sum_rate" +- name: "mimir_cache" rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) - by (le, cluster, job, method)) - record: cluster_job_method:cortex_memcache_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) - by (le, cluster, job, method)) - record: cluster_job_method:cortex_memcache_request_duration_seconds:50quantile - - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, + - expr: "histogram_quantile(0.99, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method))" + record: "cluster_job_method:cortex_memcache_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method))" + record: "cluster_job_method:cortex_memcache_request_duration_seconds:50quantile" + - expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_memcache_request_duration_seconds_count[1m])) - by (cluster, job, method) - record: cluster_job_method:cortex_memcache_request_duration_seconds:avg - - expr: sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, - job, method) - record: cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, - job, method) - record: cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, - job, method) - record: cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_cache_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_cache_request_duration_seconds:50quantile - - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) - / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_cache_request_duration_seconds:avg - - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_cache_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - by (le, cluster, job, method)) - record: cluster_job_method:cortex_cache_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) - by (le, cluster, job, method)) - record: cluster_job_method:cortex_cache_request_duration_seconds:50quantile - - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, + by (cluster, job, method)" + record: "cluster_job_method:cortex_memcache_request_duration_seconds:avg" + - expr: "sum(rate(cortex_memcache_request_duration_seconds_bucket[1m])) by (le, cluster, + job, method)" + record: "cluster_job_method:cortex_memcache_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(cortex_memcache_request_duration_seconds_sum[1m])) by (cluster, + job, method)" + record: "cluster_job_method:cortex_memcache_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(cortex_memcache_request_duration_seconds_count[1m])) by (cluster, + job, method)" + record: "cluster_job_method:cortex_memcache_request_duration_seconds_count:sum_rate" + - expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_cache_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_cache_request_duration_seconds:50quantile" + - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job) + / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, job)" + record: "cluster_job:cortex_cache_request_duration_seconds:avg" + - expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, + job)" + record: "cluster_job:cortex_cache_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job)" + record: "cluster_job:cortex_cache_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + job)" + record: "cluster_job:cortex_cache_request_duration_seconds_count:sum_rate" + - expr: "histogram_quantile(0.99, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method))" + record: "cluster_job_method:cortex_cache_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) + by (le, cluster, job, method))" + record: "cluster_job_method:cortex_cache_request_duration_seconds:50quantile" + - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, method) / sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, - job, method) - record: cluster_job_method:cortex_cache_request_duration_seconds:avg - - expr: sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, - job, method) - record: cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, - method) - record: cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, - job, method) - record: cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate -- name: mimir_storage + job, method)" + record: "cluster_job_method:cortex_cache_request_duration_seconds:avg" + - expr: "sum(rate(cortex_cache_request_duration_seconds_bucket[1m])) by (le, cluster, + job, method)" + record: "cluster_job_method:cortex_cache_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(cortex_cache_request_duration_seconds_sum[1m])) by (cluster, job, + method)" + record: "cluster_job_method:cortex_cache_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(cortex_cache_request_duration_seconds_count[1m])) by (cluster, + job, method)" + record: "cluster_job_method:cortex_cache_request_duration_seconds_count:sum_rate" +- name: "mimir_storage" rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_kv_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_kv_request_duration_seconds:50quantile - - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds:avg - - expr: sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:cortex_kv_request_duration_seconds_count:sum_rate -- name: mimir_queries + - expr: "histogram_quantile(0.99, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_kv_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_kv_request_duration_seconds:50quantile" + - expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job) + / sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)" + record: "cluster_job:cortex_kv_request_duration_seconds:avg" + - expr: "sum(rate(cortex_kv_request_duration_seconds_bucket[1m])) by (le, cluster, + job)" + record: "cluster_job:cortex_kv_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(cortex_kv_request_duration_seconds_sum[1m])) by (cluster, job)" + record: "cluster_job:cortex_kv_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(cortex_kv_request_duration_seconds_count[1m])) by (cluster, job)" + record: "cluster_job:cortex_kv_request_duration_seconds_count:sum_rate" +- name: "mimir_queries" rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_retries:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_retries:50quantile - - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) - by (cluster, job) - record: cluster_job:cortex_query_frontend_retries:avg - - expr: sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_query_frontend_retries_bucket:sum_rate - - expr: sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) - record: cluster_job:cortex_query_frontend_retries_sum:sum_rate - - expr: sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job) - record: cluster_job:cortex_query_frontend_retries_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, + - expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_retries_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_query_frontend_retries:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_retries_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_query_frontend_retries:50quantile" + - expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_retries_count[1m])) + by (cluster, job)" + record: "cluster_job:cortex_query_frontend_retries:avg" + - expr: "sum(rate(cortex_query_frontend_retries_bucket[1m])) by (le, cluster, job)" + record: "cluster_job:cortex_query_frontend_retries_bucket:sum_rate" + - expr: "sum(rate(cortex_query_frontend_retries_sum[1m])) by (cluster, job)" + record: "cluster_job:cortex_query_frontend_retries_sum:sum_rate" + - expr: "sum(rate(cortex_query_frontend_retries_count[1m])) by (cluster, job)" + record: "cluster_job:cortex_query_frontend_retries_count:sum_rate" + - expr: "histogram_quantile(0.99, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_query_frontend_queue_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_query_frontend_queue_duration_seconds:50quantile" + - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by - (cluster, job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds:avg - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, - cluster, job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, - job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate - - expr: sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, - job) - record: cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate -- name: mimir_ingester_queries + (cluster, job)" + record: "cluster_job:cortex_query_frontend_queue_duration_seconds:avg" + - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_bucket[1m])) by (le, + cluster, job)" + record: "cluster_job:cortex_query_frontend_queue_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_sum[1m])) by (cluster, + job)" + record: "cluster_job:cortex_query_frontend_queue_duration_seconds_sum:sum_rate" + - expr: "sum(rate(cortex_query_frontend_queue_duration_seconds_count[1m])) by (cluster, + job)" + record: "cluster_job:cortex_query_frontend_queue_duration_seconds_count:sum_rate" +- name: "mimir_ingester_queries" rules: - - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_series:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_series:50quantile - - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) - by (cluster, job) - record: cluster_job:cortex_ingester_queried_series:avg - - expr: sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_ingester_queried_series_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_series_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_series_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_samples:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_samples:50quantile - - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) - by (cluster, job) - record: cluster_job:cortex_ingester_queried_samples:avg - - expr: sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job) - record: cluster_job:cortex_ingester_queried_samples_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_samples_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_samples_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_exemplars:99quantile - - expr: histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:cortex_ingester_queried_exemplars:50quantile - - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / - sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars:avg - - expr: sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, - job) - record: cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate - - expr: sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate - - expr: sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job) - record: cluster_job:cortex_ingester_queried_exemplars_count:sum_rate -- name: mimir_received_samples + - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_series_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_ingester_queried_series:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_series_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_ingester_queried_series:50quantile" + - expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_series_count[1m])) + by (cluster, job)" + record: "cluster_job:cortex_ingester_queried_series:avg" + - expr: "sum(rate(cortex_ingester_queried_series_bucket[1m])) by (le, cluster, job)" + record: "cluster_job:cortex_ingester_queried_series_bucket:sum_rate" + - expr: "sum(rate(cortex_ingester_queried_series_sum[1m])) by (cluster, job)" + record: "cluster_job:cortex_ingester_queried_series_sum:sum_rate" + - expr: "sum(rate(cortex_ingester_queried_series_count[1m])) by (cluster, job)" + record: "cluster_job:cortex_ingester_queried_series_count:sum_rate" + - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_samples_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_ingester_queried_samples:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_samples_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_ingester_queried_samples:50quantile" + - expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job) / sum(rate(cortex_ingester_queried_samples_count[1m])) + by (cluster, job)" + record: "cluster_job:cortex_ingester_queried_samples:avg" + - expr: "sum(rate(cortex_ingester_queried_samples_bucket[1m])) by (le, cluster, job)" + record: "cluster_job:cortex_ingester_queried_samples_bucket:sum_rate" + - expr: "sum(rate(cortex_ingester_queried_samples_sum[1m])) by (cluster, job)" + record: "cluster_job:cortex_ingester_queried_samples_sum:sum_rate" + - expr: "sum(rate(cortex_ingester_queried_samples_count[1m])) by (cluster, job)" + record: "cluster_job:cortex_ingester_queried_samples_count:sum_rate" + - expr: "histogram_quantile(0.99, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_ingester_queried_exemplars:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) + by (le, cluster, job))" + record: "cluster_job:cortex_ingester_queried_exemplars:50quantile" + - expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job) / + sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)" + record: "cluster_job:cortex_ingester_queried_exemplars:avg" + - expr: "sum(rate(cortex_ingester_queried_exemplars_bucket[1m])) by (le, cluster, + job)" + record: "cluster_job:cortex_ingester_queried_exemplars_bucket:sum_rate" + - expr: "sum(rate(cortex_ingester_queried_exemplars_sum[1m])) by (cluster, job)" + record: "cluster_job:cortex_ingester_queried_exemplars_sum:sum_rate" + - expr: "sum(rate(cortex_ingester_queried_exemplars_count[1m])) by (cluster, job)" + record: "cluster_job:cortex_ingester_queried_exemplars_count:sum_rate" +- name: "mimir_received_samples" rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m])) - record: cluster_namespace_job:cortex_distributor_received_samples:rate5m -- name: mimir_exemplars_in + - expr: "| + sum by (cluster, namespace, job) (rate(cortex_distributor_received_samples_total[5m]))" + record: "cluster_namespace_job:cortex_distributor_received_samples:rate5m" +- name: "mimir_exemplars_in" rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m])) - record: cluster_namespace_job:cortex_distributor_exemplars_in:rate5m -- name: mimir_received_exemplars + - expr: "| + sum by (cluster, namespace, job) (rate(cortex_distributor_exemplars_in_total[5m]))" + record: "cluster_namespace_job:cortex_distributor_exemplars_in:rate5m" +- name: "mimir_received_exemplars" rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m])) - record: cluster_namespace_job:cortex_distributor_received_exemplars:rate5m -- name: mimir_exemplars_ingested + - expr: "| + sum by (cluster, namespace, job) (rate(cortex_distributor_received_exemplars_total[5m]))" + record: "cluster_namespace_job:cortex_distributor_received_exemplars:rate5m" +- name: "mimir_exemplars_ingested" rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m])) - record: cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m -- name: mimir_exemplars_appended + - expr: "| + sum by (cluster, namespace, job) (rate(cortex_ingester_ingested_exemplars_total[5m]))" + record: "cluster_namespace_job:cortex_ingester_ingested_exemplars:rate5m" +- name: "mimir_exemplars_appended" rules: - - expr: | - sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m])) - record: cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m -- name: mimir_scaling_rules + - expr: "| + sum by (cluster, namespace, job) (rate(cortex_ingester_tsdb_exemplar_exemplars_appended_total[5m]))" + record: "cluster_namespace_job:cortex_ingester_tsdb_exemplar_exemplars_appended:rate5m" +- name: "mimir_scaling_rules" rules: - - expr: | + - expr: "| # Convenience rule to get the number of replicas for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. + # Multi-zone deployments are grouped together removing the \"zone-X\" suffix. sum by (cluster, namespace, deployment) ( label_replace( kube_deployment_spec_replicas, - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" ) ) or sum by (cluster, namespace, deployment) ( - label_replace(kube_statefulset_replicas, "deployment", "$1", "statefulset", "(.*?)(?:-zone-[a-z])?") - ) - record: cluster_namespace_deployment:actual_replicas:count - - expr: | + label_replace(kube_statefulset_replicas, \"deployment\", \"$1\", \"statefulset\", \"(.*?)(?:-zone-[a-z])?\") + )" + record: "cluster_namespace_deployment:actual_replicas:count" + - expr: "| ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( @@ -324,21 +324,21 @@ groups: )[24h:] ) / 240000 - ) + )" labels: - deployment: distributor - reason: sample_rate - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | + deployment: "distributor" + reason: "sample_rate" + record: "cluster_namespace_deployment_reason:required_replicas:count" + - expr: "| ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"}) * 0.59999999999999998 / 240000 - ) + )" labels: - deployment: distributor - reason: sample_rate_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | + deployment: "distributor" + reason: "sample_rate_limits" + record: "cluster_namespace_deployment_reason:required_replicas:count" + - expr: "| ceil( quantile_over_time(0.99, sum by (cluster, namespace) ( @@ -346,12 +346,12 @@ groups: )[24h:] ) * 3 / 80000 - ) + )" labels: - deployment: ingester - reason: sample_rate - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | + deployment: "ingester" + reason: "sample_rate" + record: "cluster_namespace_deployment_reason:required_replicas:count" + - expr: "| ceil( quantile_over_time(0.99, sum by(cluster, namespace) ( @@ -359,59 +359,59 @@ groups: )[24h:] ) / 1500000 - ) + )" labels: - deployment: ingester - reason: active_series - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | + deployment: "ingester" + reason: "active_series" + record: "cluster_namespace_deployment_reason:required_replicas:count" + - expr: "| ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="max_global_series_per_user"}) + sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"max_global_series_per_user\"}) * 3 * 0.59999999999999998 / 1500000 - ) + )" labels: - deployment: ingester - reason: active_series_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | + deployment: "ingester" + reason: "active_series_limits" + record: "cluster_namespace_deployment_reason:required_replicas:count" + - expr: "| ceil( - sum by (cluster, namespace) (cortex_limits_overrides{limit_name="ingestion_rate"}) + sum by (cluster, namespace) (cortex_limits_overrides{limit_name=\"ingestion_rate\"}) * 0.59999999999999998 / 80000 - ) + )" labels: - deployment: ingester - reason: sample_rate_limits - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | + deployment: "ingester" + reason: "sample_rate_limits" + record: "cluster_namespace_deployment_reason:required_replicas:count" + - expr: "| ceil( (sum by (cluster, namespace) ( - cortex_ingester_tsdb_storage_blocks_bytes{job=~".+/ingester.*"} + cortex_ingester_tsdb_storage_blocks_bytes{job=~\".+/ingester.*\"} ) / 4) / avg by (cluster, namespace) ( - memcached_limit_bytes{job=~".+/memcached"} + memcached_limit_bytes{job=~\".+/memcached\"} ) - ) + )" labels: - deployment: memcached - reason: active_series - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | + deployment: "memcached" + reason: "active_series" + record: "cluster_namespace_deployment_reason:required_replicas:count" + - expr: "| sum by (cluster, namespace, deployment) ( label_replace( label_replace( sum by (cluster, namespace, pod)(rate(container_cpu_usage_seconds_total[1m])), - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" ) - ) - record: cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate - - expr: | + )" + record: "cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate" + - expr: "| # Convenience rule to get the CPU request for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. + # Multi-zone deployments are grouped together removing the \"zone-X\" suffix. # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 # that remove resource metrics, ref: # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 @@ -424,11 +424,11 @@ groups: label_replace( label_replace( kube_pod_container_resource_requests_cpu_cores, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" ) ) ) @@ -439,17 +439,17 @@ groups: sum by (cluster, namespace, deployment) ( label_replace( label_replace( - kube_pod_container_resource_requests{resource="cpu"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + kube_pod_container_resource_requests{resource=\"cpu\"}, + \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" ) ) - ) - record: cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum - - expr: | + )" + record: "cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum" + - expr: "| # Jobs should be sized to their CPU usage. # We do this by comparing 99th percentile usage over the last 24hrs to # their current provisioned #replicas and resource requests. @@ -459,28 +459,28 @@ groups: quantile_over_time(0.99, cluster_namespace_deployment:container_cpu_usage_seconds_total:sum_rate[24h]) / cluster_namespace_deployment:kube_pod_container_resource_requests_cpu_cores:sum - ) + )" labels: - reason: cpu_usage - record: cluster_namespace_deployment_reason:required_replicas:count - - expr: | + reason: "cpu_usage" + record: "cluster_namespace_deployment_reason:required_replicas:count" + - expr: "| # Convenience rule to get the Memory utilization for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. + # Multi-zone deployments are grouped together removing the \"zone-X\" suffix. sum by (cluster, namespace, deployment) ( label_replace( label_replace( - container_memory_usage_bytes{image!=""}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + container_memory_usage_bytes{image!=\"\"}, + \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" ) - ) - record: cluster_namespace_deployment:container_memory_usage_bytes:sum - - expr: | + )" + record: "cluster_namespace_deployment:container_memory_usage_bytes:sum" + - expr: "| # Convenience rule to get the Memory request for both a deployment and a statefulset. - # Multi-zone deployments are grouped together removing the "zone-X" suffix. + # Multi-zone deployments are grouped together removing the \"zone-X\" suffix. # This recording rule is made compatible with the breaking changes introduced in kube-state-metrics v2 # that remove resource metrics, ref: # - https://github.com/kubernetes/kube-state-metrics/blob/master/CHANGELOG.md#v200-alpha--2020-09-16 @@ -493,11 +493,11 @@ groups: label_replace( label_replace( kube_pod_container_resource_requests_memory_bytes, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" ) ) ) @@ -508,17 +508,17 @@ groups: sum by (cluster, namespace, deployment) ( label_replace( label_replace( - kube_pod_container_resource_requests{resource="memory"}, - "deployment", "$1", "pod", "(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))" + kube_pod_container_resource_requests{resource=\"memory\"}, + \"deployment\", \"$1\", \"pod\", \"(.*)-(?:([0-9]+)|([a-z0-9]+)-([a-z0-9]+))\" ), - # The question mark in "(.*?)" is used to make it non-greedy, otherwise it + # The question mark in \"(.*?)\" is used to make it non-greedy, otherwise it # always matches everything and the (optional) zone is not removed. - "deployment", "$1", "deployment", "(.*?)(?:-zone-[a-z])?" + \"deployment\", \"$1\", \"deployment\", \"(.*?)(?:-zone-[a-z])?\" ) ) - ) - record: cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum - - expr: | + )" + record: "cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum" + - expr: "| # Jobs should be sized to their Memory usage. # We do this by comparing 99th percentile usage over the last 24hrs to # their current provisioned #replicas and resource requests. @@ -528,44 +528,44 @@ groups: quantile_over_time(0.99, cluster_namespace_deployment:container_memory_usage_bytes:sum[24h]) / cluster_namespace_deployment:kube_pod_container_resource_requests_memory_bytes:sum - ) + )" labels: - reason: memory_usage - record: cluster_namespace_deployment_reason:required_replicas:count -- name: mimir_alertmanager_rules + reason: "memory_usage" + record: "cluster_namespace_deployment_reason:required_replicas:count" +- name: "mimir_alertmanager_rules" rules: - - expr: | - sum by (cluster, job, pod) (cortex_alertmanager_alerts) - record: cluster_job_pod:cortex_alertmanager_alerts:sum - - expr: | - sum by (cluster, job, pod) (cortex_alertmanager_silences) - record: cluster_job_pod:cortex_alertmanager_silences:sum - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m])) - record: cluster_job:cortex_alertmanager_alerts_received_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m])) - record: cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m - - expr: | - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m])) - record: cluster_job_integration:cortex_alertmanager_notifications_total:rate5m - - expr: | - sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m])) - record: cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m])) - record: cluster_job:cortex_alertmanager_state_replication_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m])) - record: cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m])) - record: cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m - - expr: | - sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m])) - record: cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m -- name: mimir_ingester_rules + - expr: "| + sum by (cluster, job, pod) (cortex_alertmanager_alerts)" + record: "cluster_job_pod:cortex_alertmanager_alerts:sum" + - expr: "| + sum by (cluster, job, pod) (cortex_alertmanager_silences)" + record: "cluster_job_pod:cortex_alertmanager_silences:sum" + - expr: "| + sum by (cluster, job) (rate(cortex_alertmanager_alerts_received_total[5m]))" + record: "cluster_job:cortex_alertmanager_alerts_received_total:rate5m" + - expr: "| + sum by (cluster, job) (rate(cortex_alertmanager_alerts_invalid_total[5m]))" + record: "cluster_job:cortex_alertmanager_alerts_invalid_total:rate5m" + - expr: "| + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_total[5m]))" + record: "cluster_job_integration:cortex_alertmanager_notifications_total:rate5m" + - expr: "| + sum by (cluster, job, integration) (rate(cortex_alertmanager_notifications_failed_total[5m]))" + record: "cluster_job_integration:cortex_alertmanager_notifications_failed_total:rate5m" + - expr: "| + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_total[5m]))" + record: "cluster_job:cortex_alertmanager_state_replication_total:rate5m" + - expr: "| + sum by (cluster, job) (rate(cortex_alertmanager_state_replication_failed_total[5m]))" + record: "cluster_job:cortex_alertmanager_state_replication_failed_total:rate5m" + - expr: "| + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_total[5m]))" + record: "cluster_job:cortex_alertmanager_partial_state_merges_total:rate5m" + - expr: "| + sum by (cluster, job) (rate(cortex_alertmanager_partial_state_merges_failed_total[5m]))" + record: "cluster_job:cortex_alertmanager_partial_state_merges_failed_total:rate5m" +- name: "mimir_ingester_rules" rules: - - expr: | - sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m])) - record: cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m \ No newline at end of file + - expr: "| + sum by(cluster, namespace, pod) (rate(cortex_ingester_ingested_samples_total[1m]))" + record: "cluster_namespace_pod:cortex_ingester_ingested_samples_total:rate1m" diff --git a/charts/meta-monitoring/src/rules/tempo-rules.yaml b/charts/meta-monitoring/src/rules/tempo-rules.yaml index 59f8f17..27ac873 100644 --- a/charts/meta-monitoring/src/rules/tempo-rules.yaml +++ b/charts/meta-monitoring/src/rules/tempo-rules.yaml @@ -1,15 +1,15 @@ groups: -- name: tempo_rules +- name: "tempo_rules" rules: - - expr: histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:tempo_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:tempo_request_duration_seconds:50quantile - - expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - record: cluster_namespace_job_route:tempo_request_duration_seconds:avg - - expr: sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) - record: cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) - record: cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate - - expr: sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) - record: cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate \ No newline at end of file + - expr: "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))" + record: "cluster_namespace_job_route:tempo_request_duration_seconds:99quantile" + - expr: "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route))" + record: "cluster_namespace_job_route:tempo_request_duration_seconds:50quantile" + - expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)" + record: "cluster_namespace_job_route:tempo_request_duration_seconds:avg" + - expr: "sum(rate(tempo_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)" + record: "cluster_namespace_job_route:tempo_request_duration_seconds_bucket:sum_rate" + - expr: "sum(rate(tempo_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route)" + record: "cluster_namespace_job_route:tempo_request_duration_seconds_sum:sum_rate" + - expr: "sum(rate(tempo_request_duration_seconds_count[1m])) by (cluster, namespace, job, route)" + record: "cluster_namespace_job_route:tempo_request_duration_seconds_count:sum_rate" From 9e136cdc5e513d8e1c7708bf99df45a1b8e0baf8 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 14:08:11 +0100 Subject: [PATCH 02/16] Fix paths Signed-off-by: Michel Hollands --- .github/workflows/helm-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/helm-ci.yml b/.github/workflows/helm-ci.yml index 44e17e8..9cb9746 100644 --- a/.github/workflows/helm-ci.yml +++ b/.github/workflows/helm-ci.yml @@ -3,10 +3,10 @@ name: helm-ci on: pull_request: paths: - - "production/helm/**" + - "charts/meta-monitoring/**" env: - CT_CONFIGFILE: production/helm/ct.yaml + CT_CONFIGFILE: charts/meta-monitoring/ct.yaml jobs: call-lint: From 9c67417c02a67db37c8b9a9bfdfd768525e83783 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 14:11:09 +0100 Subject: [PATCH 03/16] Fix path again Signed-off-by: Michel Hollands --- .github/workflows/helm-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/helm-ci.yml b/.github/workflows/helm-ci.yml index 9cb9746..010661c 100644 --- a/.github/workflows/helm-ci.yml +++ b/.github/workflows/helm-ci.yml @@ -6,7 +6,7 @@ on: - "charts/meta-monitoring/**" env: - CT_CONFIGFILE: charts/meta-monitoring/ct.yaml + CT_CONFIGFILE: charts/meta-monitoring/src/ct.yaml jobs: call-lint: From ed6c6da4a0634d1d2201daac2ed22c379f4f5d8b Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 14:31:55 +0100 Subject: [PATCH 04/16] Add ct.yaml file Signed-off-by: Michel Hollands --- .github/workflows/helm-ci.yml | 2 +- charts/meta-monitoring/ct.yaml | 11 +++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 charts/meta-monitoring/ct.yaml diff --git a/.github/workflows/helm-ci.yml b/.github/workflows/helm-ci.yml index 010661c..9cb9746 100644 --- a/.github/workflows/helm-ci.yml +++ b/.github/workflows/helm-ci.yml @@ -6,7 +6,7 @@ on: - "charts/meta-monitoring/**" env: - CT_CONFIGFILE: charts/meta-monitoring/src/ct.yaml + CT_CONFIGFILE: charts/meta-monitoring/ct.yaml jobs: call-lint: diff --git a/charts/meta-monitoring/ct.yaml b/charts/meta-monitoring/ct.yaml new file mode 100644 index 0000000..57a1cab --- /dev/null +++ b/charts/meta-monitoring/ct.yaml @@ -0,0 +1,11 @@ +--- +remote: origin +target-branch: main +chart-dirs: + - charts/meta-monitoring +chart-repos: + - grafana=https://grafana.github.io/helm-charts + - minio=https://charts.min.io +helm-extra-args: --timeout 600s +check-version-increment: false +validate-maintainers: false \ No newline at end of file From b7dc7212e9c86d1efe0046294d08de88e3dc0947 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 14:38:18 +0100 Subject: [PATCH 05/16] Update path for CI Signed-off-by: Michel Hollands --- charts/meta-monitoring/ct.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/meta-monitoring/ct.yaml b/charts/meta-monitoring/ct.yaml index 57a1cab..f5c087d 100644 --- a/charts/meta-monitoring/ct.yaml +++ b/charts/meta-monitoring/ct.yaml @@ -2,7 +2,7 @@ remote: origin target-branch: main chart-dirs: - - charts/meta-monitoring + - charts chart-repos: - grafana=https://grafana.github.io/helm-charts - minio=https://charts.min.io From 4534f3eb21bf6b5db35223b2a293ebdf48f0bf0e Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 16:09:22 +0100 Subject: [PATCH 06/16] Update chart dependencies Signed-off-by: Michel Hollands --- charts/meta-monitoring/Chart.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/meta-monitoring/Chart.lock b/charts/meta-monitoring/Chart.lock index 94afcbb..e5ca275 100644 --- a/charts/meta-monitoring/Chart.lock +++ b/charts/meta-monitoring/Chart.lock @@ -14,5 +14,5 @@ dependencies: - name: minio repository: https://charts.min.io version: 5.0.11 -digest: sha256:4b04084e6fe821c4d481017b2430f7c8cd782a5d60830dd3a24eb8f10a9ece09 -generated: "2023-06-29T14:25:07.247853+01:00" +digest: sha256:da0e744b5046eb7972e0bf82d1d0ba4786e9600af63b65f35b16118105248074 +generated: "2023-08-16T16:08:36.406791+01:00" From f9b6ae9b3e0e0802ac61b39c685da677760e4205 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 16:15:00 +0100 Subject: [PATCH 07/16] Fix linting issues Signed-off-by: Michel Hollands --- charts/meta-monitoring/Chart.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/meta-monitoring/Chart.yaml b/charts/meta-monitoring/Chart.yaml index bf3052c..70e2c7f 100644 --- a/charts/meta-monitoring/Chart.yaml +++ b/charts/meta-monitoring/Chart.yaml @@ -25,18 +25,18 @@ appVersion: "0.0.1" dependencies: - name: loki - repository: https://grafana.github.io/helm-charts + repository: https://grafana.github.io/helm-charts version: "5.8.0" condition: local.logs.enabled - name: grafana-agent - repository: https://grafana.github.io/helm-charts + repository: https://grafana.github.io/helm-charts version: "0.15.0" - name: mimir-distributed - repository: https://grafana.github.io/helm-charts + repository: https://grafana.github.io/helm-charts version: "4.4.1" condition: local.metrics.enabled - name: tempo-distributed - repository: https://grafana.github.io/helm-charts + repository: https://grafana.github.io/helm-charts version: "1.4.7" condition: local.traces.enabled - name: minio From e9b05c38568406984c5d1c8fcdfa6b2bcc861cff Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 16:29:38 +0100 Subject: [PATCH 08/16] Fix linting issues again Signed-off-by: Michel Hollands --- charts/meta-monitoring/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/meta-monitoring/Chart.yaml b/charts/meta-monitoring/Chart.yaml index 70e2c7f..22823e0 100644 --- a/charts/meta-monitoring/Chart.yaml +++ b/charts/meta-monitoring/Chart.yaml @@ -42,4 +42,4 @@ dependencies: - name: minio repository: https://charts.min.io version: "5.0.11" - condition: local.minio.enabled \ No newline at end of file + condition: local.minio.enabled From 3a59a6bc82b965b987c919e121cf58eb8085389b Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 16:31:48 +0100 Subject: [PATCH 09/16] Fix linting issues again 2 Signed-off-by: Michel Hollands --- charts/meta-monitoring/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/meta-monitoring/values.yaml b/charts/meta-monitoring/values.yaml index 9d64f4c..735164b 100644 --- a/charts/meta-monitoring/values.yaml +++ b/charts/meta-monitoring/values.yaml @@ -193,4 +193,4 @@ minio: cpu: 100m memory: 128Mi # Changed the mc config path to '/tmp' from '/etc' as '/etc' is only writable by root and OpenShift will not permit this. - configPathmc: "/tmp/minio/mc/" \ No newline at end of file + configPathmc: "/tmp/minio/mc/" From 4cf4d1395506792073e16d116dc6d4418e4ebc1a Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 16:43:37 +0100 Subject: [PATCH 10/16] Remove prometheus operator Signed-off-by: Michel Hollands --- .github/workflows/helm-ci.yml | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/helm-ci.yml b/.github/workflows/helm-ci.yml index 9cb9746..3aa5112 100644 --- a/.github/workflows/helm-ci.yml +++ b/.github/workflows/helm-ci.yml @@ -77,19 +77,19 @@ jobs: uses: helm/kind-action@v1.8.0 if: steps.list-changed.outputs.changed == 'true' - - name: Install prometheus operator - id: install-prometheus - if: steps.list-changed.outputs.changed == 'true' - run: | - kubectl create namespace prometheus + # - name: Install prometheus operator + # id: install-prometheus + # if: steps.list-changed.outputs.changed == 'true' + # run: | + # kubectl create namespace prometheus - helm install prometheus prometheus-community/kube-prometheus-stack \ - --namespace prometheus \ - --set grafana.enabled=false \ - --set prometheus.prometheusSpec.serviceMonitorSelector.matchLabels.release=prometheus + # helm install prometheus prometheus-community/kube-prometheus-stack \ + # --namespace prometheus \ + # --set grafana.enabled=false \ + # --set prometheus.prometheusSpec.serviceMonitorSelector.matchLabels.release=prometheus - kubectl --namespace prometheus get pods -l "release=prometheus" - kubectl --namespace prometheus get services -l "release=prometheus" + # kubectl --namespace prometheus get pods -l "release=prometheus" + # kubectl --namespace prometheus get services -l "release=prometheus" - name: Run chart-testing (install) run: | From a3d27c1c3a284d43597c8c6e6b4115415e1de171 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 16:48:41 +0100 Subject: [PATCH 11/16] Add default values in values.yaml Signed-off-by: Michel Hollands --- charts/meta-monitoring/values.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/charts/meta-monitoring/values.yaml b/charts/meta-monitoring/values.yaml index 735164b..1aa2cff 100644 --- a/charts/meta-monitoring/values.yaml +++ b/charts/meta-monitoring/values.yaml @@ -21,19 +21,19 @@ local: cloud: logs: enabled: true - endpoint: - username: - password: + endpoint: CHANGEME + username: CHANGEME + password: CHANGEME metrics: enabled: true - endpoint: - username: - password: + endpoint: CHANGEME + username: CHANGEME + password: CHANGEME traces: enabled: true - endpoint: - username: - password: + endpoint: CHANGEME + username: CHANGEME + password: CHANGEME # Adding regexes here will add a stage.replace block for logs. For more information see # https://grafana.com/docs/agent/latest/flow/reference/components/loki.process/#stagereplace-block From 419cdede3db4af94c77c03f6fa9a49fc7ee9d12b Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 16:59:42 +0100 Subject: [PATCH 12/16] Remove hardcoded meta namespace Signed-off-by: Michel Hollands --- charts/meta-monitoring/templates/ruler/ruler.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/charts/meta-monitoring/templates/ruler/ruler.yaml b/charts/meta-monitoring/templates/ruler/ruler.yaml index 9fae6e4..2f805e3 100644 --- a/charts/meta-monitoring/templates/ruler/ruler.yaml +++ b/charts/meta-monitoring/templates/ruler/ruler.yaml @@ -3,7 +3,7 @@ apiVersion: apps/v1 kind: Deployment metadata: name: meta-mimir-ruler-for-dashboards - namespace: meta + namespace: {{ $.Release.Namespace }} spec: progressDeadlineSeconds: 600 replicas: 1 @@ -24,7 +24,7 @@ spec: app.kubernetes.io/component: ruler-for-dashboards app.kubernetes.io/instance: meta app.kubernetes.io/name: mimir - namespace: meta + namespace: {{ $.Release.Namespace }} spec: containers: - args: From 1b3b89df4246adcd7fe2afda998d92b33c4ff610 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 17:13:15 +0100 Subject: [PATCH 13/16] Increase timeout Signed-off-by: Michel Hollands --- charts/meta-monitoring/ct.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/meta-monitoring/ct.yaml b/charts/meta-monitoring/ct.yaml index f5c087d..bc9072a 100644 --- a/charts/meta-monitoring/ct.yaml +++ b/charts/meta-monitoring/ct.yaml @@ -6,6 +6,6 @@ chart-dirs: chart-repos: - grafana=https://grafana.github.io/helm-charts - minio=https://charts.min.io -helm-extra-args: --timeout 600s +helm-extra-args: --timeout 1200s check-version-increment: false validate-maintainers: false \ No newline at end of file From 70312c135dada9041e018c9adcc1e2ea27ca485b Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 17:14:44 +0100 Subject: [PATCH 14/16] Do local install Signed-off-by: Michel Hollands --- charts/meta-monitoring/values.yaml | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/charts/meta-monitoring/values.yaml b/charts/meta-monitoring/values.yaml index 1aa2cff..2b86f91 100644 --- a/charts/meta-monitoring/values.yaml +++ b/charts/meta-monitoring/values.yaml @@ -9,31 +9,31 @@ clusterName: "meta-monitoring" # Set to true for a local version of logs, metrics or traces local: logs: - enabled: false + enabled: true metrics: - enabled: false + enabled: true traces: - enabled: false + enabled: true minio: - enabled: false # This should be set to true if any of the previous is enabled + enabled: true # This should be set to true if any of the previous is enabled # Set to true to write logs, metrics or traces to Grafana Cloud cloud: logs: - enabled: true - endpoint: CHANGEME - username: CHANGEME - password: CHANGEME + enabled: false + endpoint: + username: + password: metrics: - enabled: true - endpoint: CHANGEME - username: CHANGEME - password: CHANGEME + enabled: false + endpoint: + username: + password: traces: - enabled: true - endpoint: CHANGEME - username: CHANGEME - password: CHANGEME + enabled: false + endpoint: + username: + password: # Adding regexes here will add a stage.replace block for logs. For more information see # https://grafana.com/docs/agent/latest/flow/reference/components/loki.process/#stagereplace-block From 1eafbd490493c698f629380a3db109773e79b838 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Wed, 16 Aug 2023 17:19:09 +0100 Subject: [PATCH 15/16] Fix linting issues Signed-off-by: Michel Hollands --- charts/meta-monitoring/values.yaml | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/charts/meta-monitoring/values.yaml b/charts/meta-monitoring/values.yaml index 2b86f91..53fcd4f 100644 --- a/charts/meta-monitoring/values.yaml +++ b/charts/meta-monitoring/values.yaml @@ -15,25 +15,25 @@ local: traces: enabled: true minio: - enabled: true # This should be set to true if any of the previous is enabled + enabled: true # This should be set to true if any of the previous is enabled # Set to true to write logs, metrics or traces to Grafana Cloud cloud: logs: enabled: false - endpoint: - username: - password: + endpoint: + username: + password: metrics: enabled: false - endpoint: - username: - password: + endpoint: + username: + password: traces: enabled: false - endpoint: - username: - password: + endpoint: + username: + password: # Adding regexes here will add a stage.replace block for logs. For more information see # https://grafana.com/docs/agent/latest/flow/reference/components/loki.process/#stagereplace-block From c9295f51e2dcab187bcaa4d1492b01ffaea237a7 Mon Sep 17 00:00:00 2001 From: Michel Hollands Date: Thu, 17 Aug 2023 09:40:25 +0100 Subject: [PATCH 16/16] Rename agent dashboard file Signed-off-by: Michel Hollands --- .../meta-monitoring/templates/grafana/agent-dashboards-1.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/meta-monitoring/templates/grafana/agent-dashboards-1.yaml b/charts/meta-monitoring/templates/grafana/agent-dashboards-1.yaml index 6589c27..d9303cf 100644 --- a/charts/meta-monitoring/templates/grafana/agent-dashboards-1.yaml +++ b/charts/meta-monitoring/templates/grafana/agent-dashboards-1.yaml @@ -14,6 +14,6 @@ data: {{ $.Files.Get "src/dashboards/agent-remote-write.json" | fromJson | toJson }} "agent-tracing-pipeline.json": | {{ $.Files.Get "src/dashboards/agent-tracing-pipeline.json" | fromJson | toJson }} - "agent.json": | + "agent-overview.json": | {{ $.Files.Get "src/dashboards/agent.json" | fromJson | toJson }} {{- end }} \ No newline at end of file