diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index c3ba7f53..7e393de8 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -46,6 +46,8 @@ jobs: - 'otel-agent/k8s-helm/**' otel-infrastructure-collector: - 'otel-infrastructure-collector/k8s-helm/**' + otel-integration: + - 'otel-integration/k8s-helm/**' metrics-prometheus-agent: - 'metrics/prometheus-agent/**' metrics-prometheus-operator: @@ -69,6 +71,9 @@ jobs: - name: OpenTelemetry Infrastructure Collector Changelog if: steps.filter.outputs.otel-infrastructure-collector == 'true' run: scripts/changelog_check.sh otel-infrastructure-collector + - name: OpenTelemetry Integration Changelog + if: steps.filter.outputs.otel-integration == 'true' + run: scripts/changelog_check.sh otel-integration - name: Prometheus Agent Changelog if: steps.filter.outputs.metrics-prometheus-agent == 'true' run: scripts/changelog_check.sh metrics/prometheus-agent diff --git a/.github/workflows/otel-integration-helm-test.yml b/.github/workflows/otel-integration-helm-test.yml new file mode 100644 index 00000000..d3b8d2e8 --- /dev/null +++ b/.github/workflows/otel-integration-helm-test.yml @@ -0,0 +1,23 @@ +name: Otel Integration Helm Install Test + +on: + pull_request: + paths: + - 'otel-integration/k8s-helm/**' + +jobs: + collector-test: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + - name: Setup + uses: ./.github/actions/setup + with: + create-kind-cluster: "true" + - name: Setup Secret + run: kubectl create secret generic coralogix-keys --from-literal=PRIVATE_KEY=123 + - name: Run chart-testing (install) + run: ct install --namespace default --charts otel-integration/k8s-helm diff --git a/.github/workflows/otel-integration.yml b/.github/workflows/otel-integration.yml new file mode 100644 index 00000000..e028a2f6 --- /dev/null +++ b/.github/workflows/otel-integration.yml @@ -0,0 +1,41 @@ + +name: 
OpenTelemetry-Integration-Chart + +on: + push: + branches: master + paths: + - 'otel-integration/k8s-helm/**' + +env: + CHART_VERSION: $(yq eval '.version' otel-integration/k8s-helm/Chart.yaml) + CHART_NAME: otel-integration + ARTIFACTORY_URL: https://cgx.jfrog.io/artifactory/ + ARTIFACTORY_USERNAME: integrations-actions + +jobs: + build: + runs-on: ubuntu-latest + steps: + - + name: Checkout + uses: actions/checkout@v2.4.0 + - + name: Setup Helm Repo + run: | + helm repo add coralogix-charts-virtual ${{ env.ARTIFACTORY_URL }}coralogix-charts-virtual --username ${{ env.ARTIFACTORY_USERNAME }} --password ${{ secrets.ARTIFACTORY_NONUSER_ACCESS_TOKEN }} + helm repo add opentelemetry https://open-telemetry.github.io/opentelemetry-helm-charts + helm repo update + helm dependency build ./otel-integration/k8s-helm/ + cd otel-integration/k8s-helm + helm package . + - + name: Setup JFrog CLI + uses: jfrog/setup-jfrog-cli@v2.1.0 + with: + version: 2.12.1 + - + name: use-jfrog-cli + run: | + cd otel-integration/k8s-helm + jfrog rt upload --access-token ${{ secrets.ARTIFACTORY_NONUSER_ACCESS_TOKEN }} "${{ env.CHART_NAME }}-*.tgz" coralogix-charts --url ${{ env.ARTIFACTORY_URL }} diff --git a/otel-integration/CHANGELOG.md b/otel-integration/CHANGELOG.md new file mode 100644 index 00000000..2834b077 --- /dev/null +++ b/otel-integration/CHANGELOG.md @@ -0,0 +1,7 @@ +# Changelog + +## OpenTelemetry-Integration + +### v0.0.1 / 2023-07-21 + +* [FEATURE] Add new chart diff --git a/otel-integration/README.md b/otel-integration/README.md new file mode 100644 index 00000000..956a2864 --- /dev/null +++ b/otel-integration/README.md @@ -0,0 +1,3 @@ +# OpenTelemetry Integration + +The OpenTelemetry integration project offers a full-fledged integration to collect and export telemetry signals from your infrastructure and applications. Currently, this integration is available for Kubernetes on Linux platforms; please see the [`k8s-helm`](./k8s-helm) subdirectory for installation instructions.
diff --git a/otel-integration/dashboard/k8s/README.md b/otel-integration/dashboard/k8s/README.md new file mode 100644 index 00000000..9956748c --- /dev/null +++ b/otel-integration/dashboard/k8s/README.md @@ -0,0 +1,23 @@ +## Installation + +* Copy the [otel-agent-dashboard.json](./assets/otel-agent-dashboard.json) file content +* Go to your hosted grafana +* Press the `Create` tab +* Press the `Import` tab +* Paste the copied content + +### Host Metrics + +![Alt text](./assets/host-metrics.png) + +### Span Metrics + +![Alt text](./assets/spanmetrics.png) + +### Kubernetes Pod + +![Alt text](./assets/kubernetes-pod.png) + +### Otel Agent + +![Alt text](./assets/dashboard-screenshot.jpg) diff --git a/otel-integration/dashboard/k8s/assets/dashboard-screenshot.jpg b/otel-integration/dashboard/k8s/assets/dashboard-screenshot.jpg new file mode 100644 index 00000000..ac86be74 Binary files /dev/null and b/otel-integration/dashboard/k8s/assets/dashboard-screenshot.jpg differ diff --git a/otel-integration/dashboard/k8s/assets/host-metrics-dashboard.json b/otel-integration/dashboard/k8s/assets/host-metrics-dashboard.json new file mode 100644 index 00000000..0d096242 --- /dev/null +++ b/otel-integration/dashboard/k8s/assets/host-metrics-dashboard.json @@ -0,0 +1,563 @@ +{ + "__inputs": [ + { + "name": "DS_METRICS", + "label": "Metrics", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.2.6" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": 
false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1668014551266, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 1, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": false, + "expr": "(\n (1 - rate(system_cpu_time_total{state=\"idle\", host_name=\"$host\"}[$__rate_interval]))\n/ ignoring(cpu) group_left\n count without (cpu)( system_cpu_time_total{state=\"idle\", host_name=\"$host\"})\n)", + "instant": false, + "interval": "", + "legendFormat": "{{cpu}}", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + 
"hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "system_cpu_load_average_1m{host_name=\"$host\"}", + "interval": "", + "legendFormat": "1m load average", + "refId": "A" + }, + { + "exemplar": true, + "expr": "system_cpu_load_average_5m_1{host_name=\"$host\"}", + "hide": false, + "interval": "", + "legendFormat": "5m load average", + "refId": "B" + }, + { + "exemplar": true, + "expr": "system_cpu_load_average_15m{host_name=\"$host\"}", + "hide": false, + "interval": "", + "legendFormat": "15m load average", + "refId": "C" + }, + { + "exemplar": true, + "expr": "count(system_cpu_time_total{host_name=\"$host\",state=\"idle\"})", + "hide": false, + "interval": "", + "legendFormat": "logical cores", + "refId": "D" + } + ], + "title": "Load Average", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": 
"auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "__systemRef": "hideSeriesFrom", + "matcher": { + "id": "byNames", + "options": { + "mode": "exclude", + "names": [ + "free" + ], + "prefix": "All except:", + "readOnly": true + } + }, + "properties": [ + { + "id": "custom.hideFrom", + "value": { + "legend": false, + "tooltip": false, + "viz": true + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "system_memory_usage_By{host_name=\"$host\"}", + "interval": "", + "legendFormat": "{{state}}", + "refId": "A" + } + ], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 16 + }, + "id": 8, + "options": 
{ + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "rate(system_disk_io_By_total{host_name=\"$host\"}[$__rate_interval])", + "interval": "", + "legendFormat": "{{device}} {{direction}}", + "refId": "A" + } + ], + "title": "Disk IO", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 16 + }, + "id": 10, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "system_filesystem_usage_By{host_name=\"$host\"}", + "interval": "", + "legendFormat": "{{device}} {{state}}", + "refId": "A" + } + ], + "title": "Disk Space Usage", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 32, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_METRICS}", + "definition": "label_values(system_cpu_time_total, host_name)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + 
"multi": false, + "name": "host", + "options": [], + "query": { + "query": "label_values(system_cpu_time_total, host_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Host Metrics", + "uid": "a_uQehv4k", + "version": 17 +} \ No newline at end of file diff --git a/otel-integration/dashboard/k8s/assets/host-metrics.png b/otel-integration/dashboard/k8s/assets/host-metrics.png new file mode 100644 index 00000000..d364c805 Binary files /dev/null and b/otel-integration/dashboard/k8s/assets/host-metrics.png differ diff --git a/otel-integration/dashboard/k8s/assets/kubernetes-pod.json b/otel-integration/dashboard/k8s/assets/kubernetes-pod.json new file mode 100644 index 00000000..a29504ab --- /dev/null +++ b/otel-integration/dashboard/k8s/assets/kubernetes-pod.json @@ -0,0 +1,623 @@ +{ + "__inputs": [ + { + "name": "DS_METRICS", + "label": "Metrics", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.2.6" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1668019714556, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": null, + 
"description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(container_cpu_time_s_total{k8s_namespace_name=\"$namespace\", k8s_pod_name=\"$pod\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "CPU Usage", + "type": "timeseries" + }, + { + "datasource": null, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + 
"value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "container_cpu_utilization_1{k8s_namespace_name=\"$namespace\", k8s_pod_name=\"$pod\"}", + "interval": "", + "legendFormat": "CPU utilization", + "refId": "A" + } + ], + "title": "CPU Utilization", + "type": "timeseries" + }, + { + "datasource": null, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "container_memory_usage_By{k8s_namespace_name=\"$namespace\", k8s_pod_name=\"$pod\"}", + "interval": "", + "legendFormat": "Memory Usage", + "refId": "A" + } + ], + "title": "Memory Usage", + "type": "timeseries" + }, + { + "datasource": null, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + 
"mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 7, + "options": { + "legend": { + "calcs": [], + "displayMode": "hidden", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "increase(container_memory_major_page_faults_1{k8s_namespace_name=\"$namespace\", k8s_pod_name=\"$pod\"}[$__rate_interval])", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Memory Major Page faults", + "type": "timeseries" + }, + { + "datasource": null, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null 
+ }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "binBps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(k8s_pod_network_io_By_total{k8s_namespace_name=\"$namespace\", k8s_pod_name=\"$pod\"}[$__rate_interval])) by (direction)", + "hide": false, + "interval": "", + "legendFormat": "{{direction}}", + "refId": "A" + } + ], + "title": "Network IO", + "type": "timeseries" + }, + { + "datasource": null, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 6, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(k8s_pod_network_errors_1_total{k8s_namespace_name=\"$namespace\", k8s_pod_name=\"$pod\"}[$__rate_interval])) by (direction)", + "hide": false, + "interval": "", + "legendFormat": "{{direction}}", + "refId": "A" + } + ], + "title": "Network Errors", + 
"type": "timeseries" + } + ], + "refresh": false, + "schemaVersion": 32, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_METRICS}", + "definition": "label_values(container_cpu_time_s_total, k8s_namespace_name)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(container_cpu_time_s_total, k8s_namespace_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_METRICS}", + "definition": "label_values(container_cpu_time_s_total{k8s_namespace_name=\"$namespace\"}, k8s_pod_name)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "pod", + "options": [], + "query": { + "query": "label_values(container_cpu_time_s_total{k8s_namespace_name=\"$namespace\"}, k8s_pod_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Kubernetes Pod", + "uid": "usfb0vj7z", + "version": 12 +} \ No newline at end of file diff --git a/otel-integration/dashboard/k8s/assets/kubernetes-pod.png b/otel-integration/dashboard/k8s/assets/kubernetes-pod.png new file mode 100644 index 00000000..b7749f24 Binary files /dev/null and b/otel-integration/dashboard/k8s/assets/kubernetes-pod.png differ diff --git a/otel-integration/dashboard/k8s/assets/otel-agent-dashboard.json b/otel-integration/dashboard/k8s/assets/otel-agent-dashboard.json new file mode 100644 index 00000000..220d2ec3 --- /dev/null +++ b/otel-integration/dashboard/k8s/assets/otel-agent-dashboard.json @@ -0,0 +1,1636 @@ +{ + 
"annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": null, + "graphTooltip": 0, + "id": 460, + "iteration": 1670926408011, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 24, + "panels": [], + "title": "Exporters", + "type": "row" + }, + { + "datasource": null, + "description": "This panel shows you if the queue capacity is enough for your workload. ", + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 35, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "text": {} + }, + "pluginVersion": "8.2.6", + "targets": [ + { + "exemplar": true, + "expr": "otelcol_exporter_queue_capacity{__meta_applicationname=~\"$appname\",__meta_subsystem=~\"$subsystem\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "capacity", + "refId": "A" + } + ], + "title": "Retry Queue Capacity (in Batches)", + "type": "gauge" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "dark-red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + 
"gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 38, + "options": { + "displayMode": "basic", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "text": {} + }, + "pluginVersion": "8.2.6", + "targets": [ + { + "exemplar": true, + "expr": "sum by (host_name)(otelcol_exporter_queue_size{__meta_applicationname=~\"$appname\",__meta_subsystem=~\"$subsystem\"}) > 0", + "interval": "", + "legendFormat": "{{ host_name }}", + "refId": "A" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Queue Current Size [non empty queues]", + "type": "bargauge" + }, + { + "datasource": null, + "description": "The collector is not able to export data as expected. It's not necessarily means data loss because there can be retries, s but a high rate of failures can indicate issues with the network or backend receiving the data.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 36, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + 
"expr": "sum by(exporter)(rate(otelcol_exporter_send_failed_spans_total{__meta_applicationname=~\"$appname\",__meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ exporter }} ", + "refId": "A" + } + ], + "title": "Failed Exported Spans", + "type": "timeseries" + }, + { + "datasource": null, + "description": "The collector is not able to export data as expected. It's not necessarily means data loss because there can be retries, s but a high rate of failures can indicate issues with the network or backend receiving the data.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 9 + }, + "id": 39, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by(exporter)(rate(otelcol_exporter_send_failed_metric_points_total{__meta_applicationname=~\"$appname\",__meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ exporter }} ", + "refId": "A" + } + ], + "title": "Failed Exported Metrics", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + 
"defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by (exporter)(rate(otelcol_exporter_sent_spans_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ exporter }}", + "refId": "A" + } + ], + "title": "Sent Spans", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by (exporter)(rate(otelcol_exporter_sent_metric_points_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ exporter }}", + "refId": "A" + } + ], + "title": "Sent Metrics", + "type": "timeseries" + }, + { + "datasource": null, + "description": " This may be caused by a queue full of unsettled elements, so you may need to decrease your sending rate.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 29, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by 
(exporter)(rate(otelcol_exporter_enqueue_failed_spans_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ exporter }}", + "refId": "A" + } + ], + "title": "Number of Spans Failed to be Added to the Sending Queue", + "type": "timeseries" + }, + { + "datasource": null, + "description": " This may be caused by a queue full of unsettled elements, so you may need to decrease your sending rate.", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 31, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by (exporter)(rate(otelcol_exporter_enqueue_failed_metric_points_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ exporter }}", + "refId": "A" + } + ], + "title": "Number of Metrics Failed to be Added to the Sending Queue", + "type": "timeseries" + }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 34 + }, + 
"id": 13, + "panels": [ + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 35 + }, + "id": 10, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by (processor)(rate(otelcol_processor_accepted_spans_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ processor }}", + "refId": "A" + } + ], + "title": "Received Spans", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + 
}, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 35 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by (processor)(rate(otelcol_processor_accepted_metric_points_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ processor }}", + "refId": "A" + } + ], + "title": "Received Metrics", + "type": "timeseries" + }, + { + "datasource": null, + "description": "It indicates that too many errors are returned, it may indicate data loss", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 43 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by 
(processor)(rate(otelcol_processor_refused_spans_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ processor }}", + "refId": "A" + } + ], + "title": "Refused Spans", + "type": "timeseries" + }, + { + "datasource": null, + "description": "It indicates that too many errors are returned, it may indicate data loss", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 43 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by (processor)(rate(otelcol_processor_refused_metric_points_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ processor }}", + "refId": "A" + } + ], + "title": "Refused Metrics", + "type": "timeseries" + }, + { + "datasource": null, + "description": "Dropped Spans for a sustained period of time means a data loss", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + 
"axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 51 + }, + "id": 16, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by (processor)(rate(otelcol_processor_dropped_spans_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ processor }}", + "refId": "A" + } + ], + "title": "Dropped Spans", + "type": "timeseries" + }, + { + "datasource": null, + "description": "Dropped metrics for a sustained period of time means a data loss", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + 
"color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by (processor)(rate(otelcol_processor_dropped_metric_points_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ processor }}", + "refId": "A" + } + ], + "title": "Dropped Metrics", + "type": "timeseries" + } + ], + "title": "Processors", + "type": "row" + }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, + "id": 8, + "panels": [ + { + "datasource": null, + "description": "Received spans per receiver and transport type", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 36 + }, + "id": 2, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + 
"targets": [ + { + "exemplar": true, + "expr": "sum by (receiver, transport)(rate(otelcol_receiver_accepted_spans_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "{{ receiver }} - {{ transport }}", + "refId": "A" + } + ], + "title": "Received Spans", + "type": "timeseries" + }, + { + "datasource": null, + "description": "Received metrics per receiver and transport type", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 36 + }, + "id": 4, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum by (receiver)(rate(otelcol_receiver_accepted_metric_points_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "interval": "", + "legendFormat": "{{ receiver }}", + "refId": "A" + } + ], + "title": "Received Metrics", + "type": "timeseries" + }, + { + "datasource": null, + "description": "It indicates that too many errors are returned, it may indicate data loss", + "fieldConfig": { + "defaults": { + 
"color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 45 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "group by (receiver, transport)(rate(otelcol_receiver_refused_spans_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "{{ receiver }} - {{ transport }}", + "refId": "A" + } + ], + "title": "Refused Spans", + "type": "timeseries" + }, + { + "datasource": null, + "description": "It indicates that too many errors are returned, it may indicate data loss", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + 
}, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 45 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "last", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "group by (receiver)(rate(otelcol_receiver_refused_metric_points_total{__meta_applicationname=~\"$appname\", __meta_subsystem=~\"$subsystem\"}[$__rate_interval]))", + "hide": false, + "interval": "", + "legendFormat": "{{ receiver }}", + "refId": "A" + } + ], + "title": "Refused Metrics", + "type": "timeseries" + } + ], + "title": "Receivers", + "type": "row" + } + ], + "schemaVersion": 32, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "selected": false, + "text": "production", + "value": "production" + }, + "datasource": null, + "definition": "label_values(otelcol_exporter_queue_capacity, __meta_applicationname)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "appname", + "options": [], + "query": { + "query": "label_values(otelcol_exporter_queue_capacity, __meta_applicationname)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "telemetry-agent", + "value": "telemetry-agent" + }, + "datasource": null, + "definition": "label_values(otelcol_exporter_queue_capacity, __meta_subsystem)", + "description": null, + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "subsystem", + "options": [], + "query": { + "query": 
"label_values(otelcol_exporter_queue_capacity, __meta_subsystem)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "OpenTelemetry Collector", + "uid": "wp8NM", + "version": 11 +} \ No newline at end of file diff --git a/otel-integration/dashboard/k8s/assets/span-metrics-dashboard.json b/otel-integration/dashboard/k8s/assets/span-metrics-dashboard.json new file mode 100644 index 00000000..c96388f5 --- /dev/null +++ b/otel-integration/dashboard/k8s/assets/span-metrics-dashboard.json @@ -0,0 +1,360 @@ +{ + "__inputs": [ + { + "name": "DS_METRICS", + "label": "Metrics", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "8.2.6" + }, + { + "type": "panel", + "id": "heatmap", + "name": "Heatmap", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "gnetId": null, + "graphTooltip": 0, + "id": null, + "iteration": 1668074102006, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": null, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + 
"fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(calls_total_total{k8s_namespace_name=\"$namespace\", k8s_deployment_name=\"$deployment\"}[$__rate_interval])) by (operation)", + "interval": "", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "title": "Operation Rate", + "type": "timeseries" + }, + { + "datasource": null, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "reqps" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 0 + }, + 
"id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single" + } + }, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(calls_total_total{status_code=\"STATUS_CODE_ERROR\",k8s_namespace_name=\"$namespace\", k8s_deployment_name=\"$deployment\"}[$__rate_interval])) by (operation)", + "interval": "", + "legendFormat": "{{operation}}", + "refId": "A" + } + ], + "title": "Operation Errors", + "type": "timeseries" + }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "max": null, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": null, + "gridPos": { + "h": 9, + "w": 24, + "x": 0, + "y": 9 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 4, + "legend": { + "show": false + }, + "maxDataPoints": 25, + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(latency_ms_bucket{k8s_namespace_name=\"$namespace\", k8s_deployment_name=\"$deployment\"}[$__rate_interval])) by (le)", + "format": "heatmap", + "instant": false, + "interval": "", + "legendFormat": "{{le}}", + "refId": "A" + } + ], + "title": "Call Latency", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": null, + "format": "ms", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "auto", + "yBucketNumber": null, + "yBucketSize": null + } + ], + "refresh": false, + "schemaVersion": 32, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "${DS_METRICS}", + "definition": "label_values(calls_total_total, k8s_namespace_name)", + "description": "", + "error": 
null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(calls_total_total, k8s_namespace_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "allValue": null, + "current": {}, + "datasource": "${DS_METRICS}", + "definition": "label_values(calls_total_total, k8s_deployment_name)", + "description": "", + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "deployment", + "options": [], + "query": { + "query": "label_values(calls_total_total, k8s_deployment_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Span Metrics", + "uid": "iMhCo1D4k", + "version": 10 +} \ No newline at end of file diff --git a/otel-integration/dashboard/k8s/assets/spanmetrics.png b/otel-integration/dashboard/k8s/assets/spanmetrics.png new file mode 100644 index 00000000..0b328e1c Binary files /dev/null and b/otel-integration/dashboard/k8s/assets/spanmetrics.png differ diff --git a/otel-integration/k8s-helm/.gitignore b/otel-integration/k8s-helm/.gitignore new file mode 100644 index 00000000..ebf1d3dc --- /dev/null +++ b/otel-integration/k8s-helm/.gitignore @@ -0,0 +1 @@ +charts diff --git a/otel-integration/k8s-helm/Chart.yaml b/otel-integration/k8s-helm/Chart.yaml new file mode 100644 index 00000000..7669ef5e --- /dev/null +++ b/otel-integration/k8s-helm/Chart.yaml @@ -0,0 +1,30 @@ +apiVersion: v2 +name: otel-integration +description: OpenTelemetry Integration +version: 0.0.1 +keywords: + - OpenTelemetry Collector + - OpenTelemetry Agent + - OpenTelemetry Integration + - Kubernetes + - Coralogix +dependencies: + - name: opentelemetry-collector + alias:
opentelemetry-agent + version: "0.65.0" + repository: https://cgx.jfrog.io/artifactory/coralogix-charts-virtual + condition: opentelemetry-agent.enabled + - name: opentelemetry-collector + alias: opentelemetry-cluster-collector + version: "0.65.0" + repository: https://cgx.jfrog.io/artifactory/coralogix-charts-virtual + condition: opentelemetry-cluster-collector.enabled + - name: kube-state-metrics + version: "5.8.1" + repository: https://prometheus-community.github.io/helm-charts + condition: global.extensions.kubernetesDashboard.enabled +sources: + - https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-collector +maintainers: + - name: Coralogix + email: platform@coralogix.com diff --git a/otel-integration/k8s-helm/README.md b/otel-integration/k8s-helm/README.md new file mode 100644 index 00000000..617797bd --- /dev/null +++ b/otel-integration/k8s-helm/README.md @@ -0,0 +1,196 @@ +# OpenTelemetry Integration + +The OpenTelemetry Integration consists of two main components that provide our users with a full-fledged integration for their Kubernetes cluster - the [OpenTelemetry Agent](#opentelemetry-agent) and the [OpenTelemetry Cluster Collector](#opentelemetry-cluster-collector). Depending on your needs, you can deploy both components (default behavior) or disable either one under the `opentelemetry-agent` or `opentelemetry-cluster-collector` sections in the `values.yaml` file. + +Content: +1. [Components](#components) +2. [Prerequisites](#prerequisites) +3. [Installation](#installation) +4. [How to use it](#how-to-use-it) +5. [Performance of the Collector](#performance-of-the-collector) +6. [Infrastructure Monitoring](#infrastructure-monitoring) + +# Components + +## OpenTelemetry Agent + +For the agent component, the collector will be deployed as a daemonset, meaning the collector will run as an `agent` on each node. The agent runs in host network mode, allowing you to easily send application telemetry data.
+ +The included agent provides: + +- [Coralogix Exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/coralogixexporter) - Coralogix exporter is preconfigured to enrich data using Kubernetes Attributes, which allows quick correlation of telemetry signals using consistent ApplicationName and SubsystemName fields. +- [Kubernetes Attributes Processor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/k8sattributesprocessor) - Kubernetes Attributes Processor enriches data with Kubernetes metadata, such as Deployment information. +- [Kubernetes Log Collection](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/filelogreceiver) - native Kubernetes Log collection with OpenTelemetry Collector. No need to run multiple agents such as fluentd, fluent-bit or filebeat. +- [Host Metrics](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/hostmetricsreceiver) - native Linux resource monitoring agent. No need to run Node exporter or vendor agents. +- [Kubelet Metrics](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/kubeletstatsreceiver) - Fetches running container metrics from the local Kubelet. +- [OTLP Metrics](https://github.com/open-telemetry/opentelemetry-collector/blob/main/receiver/otlpreceiver/README.md) - Send application metrics via OpenTelemetry protocol. +- Traces - You can send data in various formats, such as [Jaeger](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/jaegerreceiver), [OpenTelemetry Protocol](https://github.com/open-telemetry/opentelemetry-collector/blob/main/receiver/otlpreceiver/README.md) or [Zipkin](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/zipkinreceiver).
+- [Span Metrics](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/spanmetricsprocessor) - Traces are converted to Requests, Duration and Error metrics using spanmetrics processor. +- [Zpages Extension](https://github.com/open-telemetry/opentelemetry-collector/tree/main/extension/zpagesextension) - You can investigate latency and error issues by navigating to the Pod's localhost:55516 web server. Routes are described in the [OpenTelemetry documentation](https://github.com/open-telemetry/opentelemetry-collector/tree/main/extension/zpagesextension#exposed-zpages-routes). + +## OpenTelemetry Cluster Collector + +This cluster collector provides: + +- [Coralogix Exporter](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/coralogixexporter) - Coralogix exporter is preconfigured to enrich data using Kubernetes Attributes, which allows quick correlation of telemetry signals using consistent ApplicationName and SubsystemName fields. +- [Cluster Metrics Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/k8sclusterreceiver) - The Kubernetes Cluster receiver collects cluster-level metrics from the Kubernetes API server. +- [Kubernetes Events Receiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/k8seventsreceiver) - The Kubernetes Events receiver collects events from the Kubernetes API server. See [Kubernetes Events](#kubernetes-events) for more information. +- Kubernetes Extra Metrics - This preset enables collection of extra Kubernetes related metrics, such as node information, pod status or container I/O metrics. These metrics are collected in particular for the [Kubernetes Dashboard](#kubernetes-dashboard).
+ +## Kubernetes Dashboard + +This chart will also collect, out of the box, all the metrics necessary for [Coralogix Kubernetes Monitoring](https://coralogix.com/docs/apm-kubernetes/), which will allow you to monitor your Kubernetes cluster and applications. To do this, it is necessary to deploy the [Kube-state-metrics](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics) component, which makes it possible to obtain some of these extra metrics. + +**Please be aware** that certain metrics collected for the dashboard have high cardinality, which means that the number of unique values for a given metric is high and might result in higher costs connected with metrics ingestion and storage. This applies in particular to the pod related metrics `kube_pod_status_reason`, `kube_pod_status_phase` and `kube_pod_status_qos_class`. + +If you do not need to collect these metrics, you can disable them by setting `global.extensions.kubernetesDashboard.enabled` to `false` in the `values.yaml` file. + +# Prerequisites + +### Secret Key + +Follow the [private key docs](https://coralogix.com/docs/private-key/) tutorial to obtain your secret key. + +The OpenTelemetry Agent requires a `secret` called `coralogix-keys` with the relevant `private key` under a secret key called `PRIVATE_KEY`, inside the `same namespace` that the chart is installed in. 
+ +```bash +kubectl create secret generic coralogix-keys \ + --from-literal=PRIVATE_KEY=<private-key> +``` + +The created secret should look like this: + +```yaml +apiVersion: v1 +data: + PRIVATE_KEY: <base64-encoded-private-key> +kind: Secret +metadata: + name: coralogix-keys + namespace: <release-namespace> +type: Opaque +``` + +# Installation + +First make sure to add our Helm charts repository to the local repos list with the following command: + +```bash +helm repo add coralogix-charts-virtual https://cgx.jfrog.io/artifactory/coralogix-charts-virtual +``` + +In order to get the updated Helm charts from the added repository, please run: + +```bash +helm repo update +``` + +Install the chart: + +```bash +helm upgrade --install otel-integration coralogix-charts-virtual/otel-integration \ + -f values.yaml +``` + +# How to use it + +## Available Endpoints + +Applications can send OTLP Metrics and Jaeger, Zipkin and OTLP traces to the local nodes, as `otel-agent` is using `hostNetwork`. + +| Protocol | Port | +|-----------------------|-------| +| Zipkin | 9411 | +| Jaeger GRPC | 14250 | +| Jaeger Thrift binary | 6832 | +| Jaeger Thrift compact | 6831 | +| Jaeger Thrift http | 14268 | +| OTLP GRPC | 4317 | +| OTLP HTTP | 4318 | + +### Example Application environment configuration + +The following code creates a new environment variable (`NODE`) containing the node's IP address and then uses that IP in the `OTEL_EXPORTER_OTLP_ENDPOINT` environment variable. +This ensures that each instrumented pod will send data to the local OTEL collector on the node it is currently running on. 
+ +```yaml +env: + - name: NODE + valueFrom: + fieldRef: + fieldPath: status.hostIP + - name: OTEL_EXPORTER_OTLP_ENDPOINT + value: "$(NODE):4317" +``` + +# Performance of the Collector + +## Picking the right tracing SDK span processor + +The OpenTelemetry tracing SDK supports two strategies to create application traces: a “SimpleSpanProcessor” and a “BatchSpanProcessor”. +While the SimpleSpanProcessor submits a span every time a span is finished, the BatchSpanProcessor processes spans in batches, and buffers them until a flush event occurs. Flush events can occur when the buffer is full or when a timeout is reached. + +Picking the right tracing SDK span processor can have an impact on the performance of the collector. +We switched our SDK span processor from SimpleSpanProcessor to BatchSpanProcessor and noticed a massive performance improvement in the collector: + +| Span Processor | Agent Memory Usage | Agent CPU Usage | Latency Samples | +|---------------------|--------------------|-----------------|-----------------| +| SimpleSpanProcessor | 3.7 GB | 0.5 | >1m40s | +| BatchSpanProcessor | 600 MB | 0.02 | >1s <10s | + +In addition, it improved the buffer performance of the collector. When we used the SimpleSpanProcessor, the buffer queues were filling up very quickly; +after switching to the BatchSpanProcessor, they stopped filling up all the time, and the collector therefore stopped dropping data. + +#### Example + +```javascript +import { BatchSpanProcessor } from "@opentelemetry/sdk-trace-base"; +tracerProvider.addSpanProcessor(new BatchSpanProcessor(exporter)); +``` + +# Infrastructure Monitoring + +## Log Collection + +Default installation collects Kubernetes logs. + +## Kubernetes Events + +Kubernetes events provide a rich source of information. These objects can be used to monitor your application and cluster state, respond to failures, and perform diagnostics. The events are generated when the cluster’s resources — such as pods, deployments, or nodes — change state. 
+ +Whenever something happens inside your cluster, it produces an event object that provides visibility into your cluster. However, Kubernetes events don’t persist throughout your cluster life cycle, as there’s no mechanism for retention. They’re short-lived and only available for one hour after the event is generated. + +With that in mind, we're configuring an OpenTelemetry receiver to collect Kubernetes events and ship them to the `kube-events` subsystem so that you can leverage all the other features, such as dashboards and alerting, using Kubernetes events as the source of information. + +In the OpenTelemetry config, you will find a new pipeline named `logs/kube-events`, which is used to collect, process, and export the Kubernetes events to Coralogix. + +### Cleaning the data + +By default, there's a transform processor named `transform/kube-events` which removes some unneeded fields, but feel free to override this and add back some fields or even remove fields that are not needed at all for your specific use case. + +### Filtering Events + +In large-scale environments, you may have hundreds or even millions of events per hour, and you may not need all of them. With that in mind, you can leverage another OpenTelemetry processor to filter out events so that they are not sent to Coralogix. Below you can find a config sample. + +```yaml +processors: + filter/kube-events: + logs: + log_record: + - 'IsMatch(body["reason"], "(BackoffLimitExceeded|FailedScheduling|Unhealthy)") == true' +``` + +This configuration filters out any event whose `reason` field has one of the values `BackoffLimitExceeded`, `FailedScheduling` or `Unhealthy`. For more information about the `filter` processor, feel free to check the official documentation [here](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/filterprocessor). 
+
+## Dashboards
+
+Under the `dashboard` directory, there are:
+
+- Host Metrics Dashboard
+- Kubernetes Pod Dashboard
+- Span Metrics Dashboard
+- Otel-Agent Grafana dashboard
+
+# Dependencies
+
+This chart uses the [opentelemetry-collector](https://github.com/coralogix/opentelemetry-helm-charts/tree/main/charts/opentelemetry-collector) Helm chart. This chart also currently depends on the [`kube-state-metrics`](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-state-metrics) chart to collect extra Kubernetes metrics.
diff --git a/otel-integration/k8s-helm/ci/ci-values.yaml b/otel-integration/k8s-helm/ci/ci-values.yaml
new file mode 100644
index 00000000..fb8f61a8
--- /dev/null
+++ b/otel-integration/k8s-helm/ci/ci-values.yaml
@@ -0,0 +1,10 @@
+global:
+  domain: "coralogix.com"
+  clusterName: "ci-test"
+  defaultApplicationName: "otel"
+  defaultSubsystemName: "integration"
+  logLevel: "warn"
+
+  extensions:
+    kubernetesDashboard:
+      enabled: true
diff --git a/otel-integration/k8s-helm/templates/NOTES.txt b/otel-integration/k8s-helm/templates/NOTES.txt
new file mode 100644
index 00000000..c7f4a5b9
--- /dev/null
+++ b/otel-integration/k8s-helm/templates/NOTES.txt
@@ -0,0 +1,7 @@
+{{- if eq .Values.global.domain "" }}
+{{ fail "[ERROR] 'global.domain' must be set. See https://coralogix.com/docs/coralogix-endpoints for instructions." }}
+{{ end }}
+
+{{- if eq .Values.global.clusterName "" }}
+{{ fail "[ERROR] 'global.clusterName' must be set with your Kubernetes cluster name." }}
+{{ end }}
\ No newline at end of file
diff --git a/otel-integration/k8s-helm/templates/clusterrole.yaml b/otel-integration/k8s-helm/templates/clusterrole.yaml
new file mode 100644
index 00000000..55c4d409
--- /dev/null
+++ b/otel-integration/k8s-helm/templates/clusterrole.yaml
@@ -0,0 +1,22 @@
+{{- /*
+ClusterRole that grants "use" on this chart's SecurityContextConstraints.
+Rendered only when `distribution` is "openshift". Names come from
+`global.fullnameOverride`; because values.yaml leaves it unset, the agent's
+`fullnameOverride` is used as a fallback so the rendered names are never empty.
+*/ -}}
+{{- if eq ((.Values.distribution)) "openshift" }}
+{{- $sccName := .Values.global.fullnameOverride | default (index .Values "opentelemetry-agent" "fullnameOverride") }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: system:openshift:scc:{{ $sccName }}
+rules:
+- apiGroups:
+  - security.openshift.io
+  resourceNames:
+  - {{ $sccName }}
+  resources:
+  - securitycontextconstraints
+  verbs:
+  - use
+{{- end }}
diff --git a/otel-integration/k8s-helm/templates/clusterrolebinding.yaml b/otel-integration/k8s-helm/templates/clusterrolebinding.yaml
new file mode 100644
index 00000000..65deeca2
--- /dev/null
+++ b/otel-integration/k8s-helm/templates/clusterrolebinding.yaml
@@ -0,0 +1,20 @@
+{{- /*
+Binds the SecurityContextConstraints ClusterRole to the ServiceAccount of the
+same name in the release namespace. Rendered only when `distribution` is
+"openshift". Uses the same name fallback as clusterrole.yaml so that the
+roleRef always matches the ClusterRole that is actually rendered.
+*/ -}}
+{{- if eq ((.Values.distribution)) "openshift" }}
+{{- $sccName := .Values.global.fullnameOverride | default (index .Values "opentelemetry-agent" "fullnameOverride") }}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: system:openshift:scc:{{ $sccName }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: system:openshift:scc:{{ $sccName }}
+subjects:
+- kind: ServiceAccount
+  name: {{ $sccName }}
+  namespace: {{ .Release.Namespace }}
+{{- end }}
diff --git a/otel-integration/k8s-helm/templates/scc.yaml b/otel-integration/k8s-helm/templates/scc.yaml
new file mode 100644
index 00000000..223cb505
--- /dev/null
+++ b/otel-integration/k8s-helm/templates/scc.yaml
@@ -0,0 +1,57 @@
+{{/*
+Default values for SecurityContextConstraints
+*/}}
+
+{{- define "opentelemetry-coralogix.defaultSecurityContextConstraints" -}}
+priority: 10
+allowHostNetwork: true
+allowHostPorts: true
+allowHostPID: true
+allowHostDirVolumePlugin: true
+allowHostIPC: false
+allowPrivilegedContainer: false
+volumes:
+- configMap
+- downwardAPI
+- emptyDir
+- hostPath
+- secret
+seLinuxContext:
+  type: RunAsAny
+allowedFlexVolumes: []
+allowedCapabilities: []
+defaultAddCapabilities: []
+fsGroup:
+  type: MustRunAs
+readOnlyRootFilesystem: true
+runAsUser:
+  type: RunAsAny
+supplementalGroups:
+  type: RunAsAny
+requiredDropCapabilities:
+- ALL
+{{- end -}}
+
+{{- /*
+OpenShift SecurityContextConstraints, rendered only when `distribution` is
+"openshift". The defaults defined above are merged with
+`securityContextConstraintsOverwrite` (overwrite wins); that value is not set
+in values.yaml, so an unset overwrite is treated as an empty map.
+NOTE(review): the "opentelemetry-agent.name" / "opentelemetry-agent.chart"
+named templates are not defined in the files visible here - confirm they exist.
+*/ -}}
+{{- if eq ((.Values.distribution)) "openshift" }}
+{{- $sccName := .Values.global.fullnameOverride | default (index .Values "opentelemetry-agent" "fullnameOverride") }}
+kind: SecurityContextConstraints
+apiVersion: security.openshift.io/v1
+metadata:
+  name: {{ $sccName }}
+  labels:
+    app: {{ template "opentelemetry-agent.name" . }}
+    chart: {{ template "opentelemetry-agent.chart" . }}
+    release: {{ .Release.Name }}
+    heritage: {{ .Release.Service }}
+users:
+- system:serviceaccount:{{ .Release.Namespace }}:{{ $sccName }}
+{{- $config := include "opentelemetry-coralogix.defaultSecurityContextConstraints" . | fromYaml }}
+{{ .Values.securityContextConstraintsOverwrite | default (dict) | mustMergeOverwrite $config | toYaml }}
+{{- end }}
diff --git a/otel-integration/k8s-helm/values.yaml b/otel-integration/k8s-helm/values.yaml
new file mode 100644
index 00000000..399bc7ea
--- /dev/null
+++ b/otel-integration/k8s-helm/values.yaml
@@ -0,0 +1,468 @@
+global:
+  domain: ""  # required: rendering fails while this is empty (see templates/NOTES.txt)
+  clusterName: ""  # required: rendering fails while this is empty (see templates/NOTES.txt)
+  defaultApplicationName: "otel"  # passed to the coralogix exporter as application_name
+  defaultSubsystemName: "integration"  # passed to the coralogix exporter as subsystem_name
+  logLevel: "warn"  # used as service.telemetry.logs.level by both collectors below
+
+  extensions:
+    kubernetesDashboard:
+      enabled: true
+
+# set distribution to openshift for openshift clusters
+distribution: ""
+opentelemetry-agent:
+  enabled: true
+  mode: daemonset
+  fullnameOverride: coralogix-opentelemetry  # also the fallback name used by the OpenShift templates
+  extraEnvs:
+    - name: CORALOGIX_PRIVATE_KEY  # referenced by exporters.coralogix.private_key
+      valueFrom:
+        secretKeyRef:
+          name: coralogix-keys
+          key: PRIVATE_KEY
+    - name: OTEL_RESOURCE_ATTRIBUTES
+      value: "k8s.node.name=$(K8S_NODE_NAME)"  # NOTE(review): K8S_NODE_NAME is not defined in this list; presumably injected by the collector subchart - confirm
+    - name: KUBE_NODE_NAME  # referenced by processors.k8sattributes.filter.node_from_env_var
+      valueFrom:
+        fieldRef:
+          apiVersion: v1
+          fieldPath: spec.nodeName
+
+  serviceAccount:
+    # Specifies whether a service account should be created
+    create: true
+    # Annotations to add to the service account
+    annotations: {}
+    # The name of the service account to use.
+    # If not set and create is true, a name is generated using the fullname template
+    name: ""
+  clusterRole:
+    name: "coralogix-opentelemetry"
+  clusterRoleBinding:
+    name: "coralogix-opentelemetry"
+  hostNetwork: true
+  dnsPolicy: "ClusterFirstWithHostNet"
+
+  presets:
+    logsCollection:
+      enabled: true
+      storeCheckpoints: true
+      maxRecombineLogSize: 1048576
+      extraFilelogOperators: []
+#        - type: recombine
+#          combine_field: body
+#          source_identifier: attributes["log.file.path"]
+#          is_first_entry: body matches "^(YOUR-LOGS-REGEX)"
+    kubernetesAttributes:
+      enabled: true
+    hostMetrics:
+      enabled: true
+    kubeletMetrics:
+      enabled: true
+
+  config:
+    extensions:
+      zpages:
+        endpoint: localhost:55679
+      pprof:
+        endpoint: localhost:1777
+
+    receivers:
+      otlp:
+        protocols:
+          grpc:
+            endpoint: ${MY_POD_IP}:4317
+          http:
+            endpoint: ${MY_POD_IP}:4318
+      zipkin:
+        endpoint: ${MY_POD_IP}:9411
+      jaeger:
+        protocols:
+          grpc:
+            endpoint: ${MY_POD_IP}:14250
+          thrift_http:
+            endpoint: ${MY_POD_IP}:14268
+          thrift_compact:
+            endpoint: ${MY_POD_IP}:6831
+          thrift_binary:
+            endpoint: ${MY_POD_IP}:6832
+      prometheus:
+        config:
+          scrape_configs:
+            - job_name: opentelemetry-collector
+              scrape_interval: 30s
+              static_configs:
+                - targets:
+                    - ${MY_POD_IP}:8888  # same address as service.telemetry.metrics.address below
+    processors:
+      resourcedetection/env:
+        detectors: ["system", "env"]
+        timeout: 2s
+        override: false
+      resourcedetection/region:
+        detectors: ["gcp", "ec2"]
+        timeout: 2s
+        override: false
+        gcp:
+          resource_attributes:
+            cloud.region:
+              enabled: true
+            cloud.availability_zone:
+              enabled: true
+        ec2:
+          resource_attributes:
+            cloud.region:
+              enabled: true
+            cloud.availability_zone:
+              enabled: true
+      metricstransform:
+        transforms:  # NOTE(review): written as a single mapping rather than a list of transforms - confirm the collector accepts this form
+          include: .*
+          match_type: regexp
+          action: update
+          operations:
+            - action: add_label
+              new_label: k8s.cluster.name
+              new_value: "{{ .Values.global.clusterName }}"
+            - action: add_label
+              new_label: cx.otel_integration.name
+              new_value: "{{ .Chart.Name }}"
+            - action: add_label
+              new_label: cx.otel_integration.version
+              new_value: "{{ .Chart.Version }}"
+      k8sattributes:
+        filter:
+          node_from_env_var: KUBE_NODE_NAME
+        extract:
+          metadata:
+            - "k8s.namespace.name"
+            - "k8s.deployment.name"
+            - "k8s.statefulset.name"
+            - "k8s.daemonset.name"
+            - "k8s.cronjob.name"
+            - "k8s.job.name"
+            - "k8s.pod.name"
+            - "k8s.node.name"
+      spanmetrics:
+        metrics_exporter: coralogix
+        dimensions:
+          - name: "k8s.deployment.name"
+          - name: "k8s.statefulset.name"
+          - name: "k8s.daemonset.name"
+          - name: "k8s.cronjob.name"
+          - name: "k8s.job.name"
+          - name: "k8s.container.name"
+          - name: "k8s.node.name"
+          - name: "k8s.namespace.name"
+      memory_limiter: null # Will get the k8s resource limits
+
+    exporters:
+      coralogix:
+        timeout: "30s"
+        private_key: "${CORALOGIX_PRIVATE_KEY}"
+        domain: "{{ .Values.global.domain }}"
+        application_name: "{{ .Values.global.defaultApplicationName }}"
+        subsystem_name: "{{ .Values.global.defaultSubsystemName }}"
+        application_name_attributes:
+          - "k8s.namespace.name"
+          - "service.namespace"
+        subsystem_name_attributes:
+          - "k8s.deployment.name"
+          - "k8s.statefulset.name"
+          - "k8s.daemonset.name"
+          - "k8s.cronjob.name"
+          - "k8s.job.name"
+          - "k8s.container.name"
+          - "k8s.node.name"
+          - "service.name"
+
+    service:
+      telemetry:
+        logs:
+          level: "{{ .Values.global.logLevel }}"
+          encoding: json
+        metrics:
+          address: ${MY_POD_IP}:8888
+      extensions:
+        - zpages
+        - pprof
+        - health_check
+        - memory_ballast
+      pipelines:
+        metrics:
+          exporters:
+            - coralogix
+          processors:  # NOTE(review): memory_limiter is not first here; upstream guidance recommends placing it first - confirm intended
+            - k8sattributes
+            - resourcedetection/env
+            - resourcedetection/region
+            - metricstransform
+            - memory_limiter
+            - batch
+          receivers:
+            - otlp
+            - prometheus
+            - hostmetrics
+        traces:
+          exporters:
+            - coralogix
+          processors:
+            - memory_limiter
+            - spanmetrics
+            - batch
+          receivers:
+            - otlp
+            - zipkin
+            - jaeger
+        logs:
+          exporters:
+            - coralogix
+          processors:
+            - batch
+          receivers:
+            - otlp
+
+  tolerations:
+    - operator: Exists
+
+  resources:
+    requests:
+      cpu: 100m
+      memory: 128Mi
+    limits:
+      cpu: 1
+      memory: 2G
+
+  ports:
+    jaeger-binary:
+      enabled: true
+      containerPort: 6832
+      servicePort: 6832
+      hostPort: 6832
+      protocol: TCP
+  # In order to enable podMonitor, following part must be enabled in order to expose the required port:
+  #   metrics:
+  #     enabled: true
+
+  # podMonitor:
+  #   enabled: true
+
+  # prometheusRule:
+  #   enabled: true
+  #   defaultRules:
+  #     enabled: true
+
+opentelemetry-cluster-collector:
+  enabled: true
+  mode: deployment
+  fullnameOverride: coralogix-opentelemetry-collector
+  clusterRole:
+    name: "coralogix-opentelemetry-collector"
+    create: true
+  clusterRoleBinding:
+    name: "coralogix-opentelemetry-collector"
+  replicaCount: 1
+  ports:
+    otlp:
+      enabled: true
+    otlp-http:
+      enabled: false
+    jaeger-compact:
+      enabled: false
+    jaeger-thrift:
+      enabled: false
+    jaeger-grpc:
+      enabled: false
+    zipkin:
+      enabled: false
+
+  presets:
+    clusterMetrics:
+      enabled: true
+    kubernetesEvents:
+      enabled: true
+    kubernetesExtraMetrics:
+      enabled: true
+
+  extraEnvs:
+    - name: CORALOGIX_PRIVATE_KEY  # referenced by exporters.coralogix.private_key
+      valueFrom:
+        secretKeyRef:
+          name: coralogix-keys
+          key: PRIVATE_KEY
+    - name: KUBE_NODE_NAME  # referenced by processors.k8sattributes.filter.node_from_env_var
+      valueFrom:
+        fieldRef:
+          apiVersion: v1
+          fieldPath: spec.nodeName
+
+  config:
+    extensions:
+      zpages:
+        endpoint: localhost:55679
+      pprof:
+        endpoint: localhost:1777
+    receivers:
+      prometheus:
+        config:
+          scrape_configs:
+            - job_name: opentelemetry-infrastructure-collector  # NOTE(review): job name does not match this component's name; looks carried over - confirm
+              scrape_interval: 30s
+              static_configs:
+                - targets:
+                    - ${MY_POD_IP}:8888  # same address as service.telemetry.metrics.address below
+    processors:
+      k8sattributes:
+        filter:
+          node_from_env_var: KUBE_NODE_NAME
+        extract:
+          metadata:
+            - "k8s.namespace.name"
+            - "k8s.deployment.name"
+            - "k8s.statefulset.name"
+            - "k8s.daemonset.name"
+            - "k8s.cronjob.name"
+            - "k8s.job.name"
+            - "k8s.pod.name"
+            - "k8s.node.name"
+      resource/kube-events:
+        attributes:
+          - key: service.name
+            value: "kube-events"
+            action: upsert
+          - key: k8s.cluster.name
+            value: "{{ .Values.global.clusterName }}"
+            action: upsert
+      transform/kube-events:
+        log_statements:
+          - context: log
+            statements:
+              - keep_keys(body, ["type", "action", "eventTime", "reason", "regarding", "reportingController", "note", "series", "metadata", "deprecatedFirstTimestamp", "deprecatedLastTimestamp"])
+      metricstransform/kube-extra-metrics:
+        transforms:  # NOTE(review): written as a single mapping rather than a list of transforms - confirm the collector accepts this form
+          include: .*
+          match_type: regexp
+          action: update
+          operations:
+            - action: add_label
+              new_label: k8s.cluster.name
+              new_value: "{{ .Values.global.clusterName }}"
+            - action: add_label
+              new_label: cx.otel_integration.name
+              new_value: "{{ .Chart.Name }}"
+            - action: add_label
+              new_label: cx.otel_integration.version
+              new_value: "{{ .Chart.Version }}"
+      resourcedetection/env:
+        detectors: ["system", "env"]
+        timeout: 2s
+        override: false
+      resourcedetection/region:
+        detectors: ["gcp", "ec2"]
+        timeout: 2s
+        override: false
+        gcp:
+          resource_attributes:
+            cloud.region:
+              enabled: true
+            cloud.availability_zone:
+              enabled: true
+        ec2:
+          resource_attributes:
+            cloud.region:
+              enabled: true
+            cloud.availability_zone:
+              enabled: true
+      memory_limiter: null # Will get the k8s resource limits
+
+    exporters:
+      coralogix:
+        timeout: "30s"
+        private_key: "${CORALOGIX_PRIVATE_KEY}"
+        domain: "{{ .Values.global.domain }}"
+        application_name: "{{ .Values.global.defaultApplicationName }}"
+        subsystem_name: "{{ .Values.global.defaultSubsystemName }}"
+        application_name_attributes:
+          - "k8s.namespace.name"
+          - "service.namespace"
+        subsystem_name_attributes:
+          - "k8s.deployment.name"
+          - "k8s.statefulset.name"
+          - "k8s.daemonset.name"
+          - "k8s.cronjob.name"
+          - "k8s.job.name"
+          - "k8s.container.name"
+          - "k8s.node.name"
+          - "service.name"
+
+    service:
+      telemetry:
+        logs:
+          level: "{{ .Values.global.logLevel }}"
+          encoding: json
+        metrics:
+          address: ${MY_POD_IP}:8888
+      extensions:
+        - zpages
+        - pprof
+        - health_check
+        - memory_ballast
+      pipelines:
+        logs:  # NOTE(review): no receivers are listed for this pipeline; presumably added by the kubernetesEvents preset - confirm
+          exporters:
+            - coralogix
+          processors:
+            - memory_limiter
+            - batch
+            - resource/kube-events
+            - transform/kube-events
+        metrics:
+          exporters:
+            - coralogix
+          processors:  # NOTE(review): memory_limiter is not first here; upstream guidance recommends placing it first - confirm intended
+            - k8sattributes
+            - metricstransform/kube-extra-metrics
+            - resourcedetection/env
+            - resourcedetection/region
+            - memory_limiter
+            - batch
+          receivers:
+            - otlp
+            - prometheus
+
+  tolerations:
+    - operator: Exists
+
+  resources:
+    requests:
+      cpu: 100m
+      memory: 128Mi
+    limits:
+      cpu: 1
+      memory: 2G
+
+  # In order to enable serviceMonitor, following part must be enabled in order to expose the required port:
+  # ports:
+  #   metrics:
+  #     enabled: true
+
+  # serviceMonitor:
+  #   enabled: true
+
+  # prometheusRule:
+  #   enabled: true
+  #   defaultRules:
+  #     enabled: true
+
+kube-state-metrics:
+  enabled: true
+  prometheusScrape: false
+  collectors:
+    - pods
+    - nodes
+  metricAllowlist:  # the README notes that the kube_pod_status_* series below are high-cardinality
+    - kube_node_info
+    - kube_pod_status_reason
+    - kube_pod_status_phase
+    - kube_pod_status_qos_class