Tail-based sampling (#526)
* Add tail sampling in tempo pipelines

* Add load balancing for traces when tail sampling

Implements load balancing of spans by trace ID between agent instances for tail sampling

* Lint things

* Configure receiver listening port

* Add sampling in scraping service compose example

* Don't load balance when it's not needed

Users often run single-instance deployments that do not need to load
balance spans, so the load_balancing block is optional (see the sketch
below).
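
A minimal single-instance sketch (the receiver and backend endpoint are illustrative,
taken from the docker-compose example in this change):

```yaml
tempo:
  configs:
    - name: default
      receivers:
        jaeger:
          protocols:
            thrift_http:
      remote_write:
        - endpoint: otel-collector:55680
          insecure: true
      tail_sampling:
        policies:
          - always_sample:
        # No load_balancing block: a single agent already receives every
        # span of a trace, so no cross-instance routing is needed.
```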

* Config reference for tail sampling

* Update CHANGELOG

* Fix panic

* Move exporter to its own block

* Make tail-sampling work in k3d example

* Fix image name

* Lower collector log level

* Remove unnecessary pvcs
mapno authored Apr 16, 2021
1 parent 6b70747 commit 697685c
Showing 49 changed files with 8,661 additions and 10 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -15,6 +15,8 @@ cross-compilation issue, but will return in v0.13.0.
- [BUGFIX] Validate that incoming scraped metrics do not have an empty label
set or a label set with duplicate labels, mirroring the behavior of
Prometheus. (@rfratto)

- [FEATURE] Tail-based sampling for tracing pipelines (@mapno)

# v0.13.1 (2021-04-09)

53 changes: 53 additions & 0 deletions docs/configuration-reference.md
@@ -2078,6 +2078,59 @@ spanmetrics:
# They can be further namespaced, i.e. `{namespace}_tempo_spanmetrics`
[ namespace: <prometheusexporter.namespace> ]
[ send_timestamps: <prometheusexporter.send_timestamps> ]

# tail_sampling supports tail-based sampling of traces in the agent.
# Policies determine which traces are sampled and sent to the backends and which are dropped.
# To make a correct sampling decision, the agent needs the complete trace,
# so it waits a configurable amount of time for all spans to arrive before evaluating the trace.
# Tail sampling also supports multi-agent deployments: spans can be load balanced
# by trace ID between instances so that all spans of a trace are grouped in the same agent.
tail_sampling:
# policies define the rules by which traces will be sampled. Multiple policies can be added to the same pipeline
# They are the same as the policies in Open-Telemetry's tailsamplingprocessor.
# https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor
policies:
- [<tailsamplingprocessor.policies>]
# decision_wait is how long the agent waits before making a sampling decision for a trace.
# Longer waits reduce the probability of sampling an incomplete trace, at the cost of higher memory usage.
decision_wait: [ <string> | default="5s" ]
# load_balancing configures load balancing of spans across multiple agents.
# It ensures that all spans of a trace are sampled in the same instance.
# It's not necessary when only one agent receives traces (e.g. single-instance deployments).
load_balancing:
# resolver configures the resolution strategy for the involved backends.
# It can be static, with a fixed list of hostnames, or DNS, with a hostname (and port) that resolves to all backend IP addresses.
# It's the same as the resolver config in the loadbalancingexporter.
# https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/loadbalancingexporter
resolver:
static:
hostnames:
[ - <string> ... ]
dns:
hostname: <string>
[ port: <int> ]

# Load balancing is done via an OTLP exporter.
# The remaining configuration is the same as in the remote_write block.
exporter:
# Controls whether compression is enabled.
[ compression: <string> | default = "gzip" | supported = "none", "gzip" ]

# Controls whether or not TLS is required. See https://godoc.org/google.golang.org/grpc#WithInsecure
[ insecure: <boolean> | default = false ]

# Disable validation of the server certificate. Only used when insecure is set
# to false.
[ insecure_skip_verify: <bool> | default = false ]

# Sets the `Authorization` header on every trace push with the
# configured username and password.
# password and password_file are mutually exclusive.
basic_auth:
[ username: <string> ]
[ password: <secret> ]
[ password_file: <string> ]

```
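
A sketch of a multi-agent pipeline that combines these options; the hostnames and
backend endpoint are illustrative, following the docker-compose example added in
this change:

```yaml
tempo:
  configs:
    - name: default
      receivers:
        jaeger:
          protocols:
            thrift_http:
      remote_write:
        - endpoint: otel-collector:55680
          insecure: true
      tail_sampling:
        decision_wait: 5s
        policies:
          - always_sample:
        load_balancing:
          exporter:
            insecure: true
          resolver:
            static:
              hostnames:
                - agent-1:4318
                - agent-2:4318
                - agent-3:4318
```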

### integrations_config
42 changes: 42 additions & 0 deletions example/docker-compose/agent/config/agent-scraping-service.yaml
@@ -21,3 +21,45 @@ prometheus:
endpoints:
- etcd:2379

tempo:
configs:
- name: default
receivers:
jaeger:
protocols:
thrift_http:
attributes:
actions:
- action: upsert
key: env
value: prod
remote_write:
- endpoint: otel-collector:55680
insecure: true
batch:
timeout: 5s
send_batch_size: 100
tail_sampling:
policies:
- always_sample:
# Enter more policies to sample traces (ref: https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor)
# Example of rate limiting policy. Uncomment to start rate limiting!
# - name: policy-2
# type: rate_limiting
# rate_limiting:
# - spans_per_second: 35
load_balancing:
exporter:
insecure: true
resolver:
static:
hostnames:
- agent-1:4318
- agent-2:4318
- agent-3:4318
spanmetrics:
dimensions:
- name: http.url
metrics_exporter:
endpoint: 0.0.0.0:8889

19 changes: 19 additions & 0 deletions example/docker-compose/docker-compose.scraping-service.yaml
@@ -80,3 +80,22 @@ services:
image: quay.io/freshtracks.io/avalanche:latest
avalanche-5:
image: quay.io/freshtracks.io/avalanche:latest

# tracing load generator
synthetic-load-generator:
image: omnition/synthetic-load-generator:1.0.25
volumes:
- ./load-generator:/etc/load-generator
environment:
- TOPOLOGY_FILE=/etc/load-generator/load-generator.json
- JAEGER_COLLECTOR_URL=http://agent-1:14268
depends_on:
- agent-1

# tracing backend
otel-collector:
image: otel/opentelemetry-collector:0.9.0
volumes:
- ./otel-collector:/etc/otel-collector
command:
- --config=/etc/otel-collector/config.yaml
48 changes: 47 additions & 1 deletion example/k3d/environment/main.jsonnet
@@ -1,11 +1,14 @@
local collector = import 'collector/main.libsonnet';
local default = import 'default/main.libsonnet';
local etcd = import 'etcd/main.libsonnet';
local agent_cluster = import 'grafana-agent/scraping-svc/main.libsonnet';
local k = import 'ksonnet-util/kausal.libsonnet';
local load_generator = import 'load-generator/main.libsonnet';

local loki_config = import 'default/loki_config.libsonnet';
local grafana_agent = import 'grafana-agent/v1/main.libsonnet';

local containerPort = k.core.v1.containerPort;
local ingress = k.networking.v1beta1.ingress;
local path = k.networking.v1beta1.httpIngressPath;
local rule = k.networking.v1beta1.ingressRule;
@@ -65,11 +68,54 @@ local images = {
scheme: 'http',
hostname: 'loki.default.svc.cluster.local',
external_labels: { cluster: cluster_label },
})),
})) +
grafana_agent.withTempoConfig({
receivers: {
jaeger: {
protocols: {
thrift_http: null,
},
},
},
batch: {
timeout: '5s',
send_batch_size: 1000,
},
}) +
grafana_agent.withPortsMixin([
containerPort.new('thrift-http', 14268) + containerPort.withProtocol('TCP'),
containerPort.new('otlp-lb', 4318) + containerPort.withProtocol('TCP'),
]) +
grafana_agent.withTempoRemoteWrite([
{
endpoint: 'collector.default.svc.cluster.local:55680',
insecure: true,
},
]) +
grafana_agent.withTempoTailSamplingConfig({
policies: [{
always_sample: null,
}],
load_balancing: {
exporter: {
insecure: true,
},
resolver: {
dns: {
hostname: 'grafana-agent.default.svc.cluster.local',
port: 4318,
},
},
},
}),

// Need to run ETCD for agent_cluster
etcd: etcd.new('default'),

collector: collector.new('default'),

load_generator: load_generator.new('default'),

agent_cluster:
agent_cluster.new('default', 'kube-system') +
agent_cluster.withImagesMixin(images) +
28 changes: 28 additions & 0 deletions example/k3d/lib/collector/collector-config.libsonnet
@@ -0,0 +1,28 @@
{
receivers: {
otlp: {
protocols: {
grpc: null,
},
},
},

exporters: {
logging: {
loglevel: "info",
},
},

service: {
pipelines: {
traces: {
receivers: [
"otlp",
],
exporters: [
"logging",
],
},
},
},
}
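
For reference, the jsonnet object above is rendered to YAML with k.util.manifestYaml
(see main.libsonnet below) and mounted into the collector pod; the result is roughly
equivalent to this collector config (key ordering and quoting may differ in the
actual manifested output):

```yaml
receivers:
  otlp:
    protocols:
      grpc: null
exporters:
  logging:
    loglevel: info
service:
  pipelines:
    traces:
      receivers:
        - otlp
      exporters:
        - logging
```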
47 changes: 47 additions & 0 deletions example/k3d/lib/collector/main.libsonnet
@@ -0,0 +1,47 @@
local k = import 'ksonnet-util/kausal.libsonnet';

local configMap = k.core.v1.configMap;
local container = k.core.v1.container;
local containerPort = k.core.v1.containerPort;
local deployment = k.apps.v1.deployment;
local pvc = k.core.v1.persistentVolumeClaim;
local service = k.core.v1.service;
local volumeMount = k.core.v1.volumeMount;
local volume = k.core.v1.volume;

{
new(namespace=''):: {
local this = self,

_images:: {
collector: 'otel/opentelemetry-collector:0.9.0',
},
_config:: (import './collector-config.libsonnet'),

configMap:
configMap.new('collector') +
configMap.mixin.metadata.withNamespace(namespace) +
configMap.withData({
'config.yaml': k.util.manifestYaml(this._config),
}),

container::
container.new('collector', this._images.collector) +
container.withPorts([
containerPort.newNamed(name='grpc', containerPort=55680),
]) +
container.withArgsMixin(
'--config=/etc/collector/config.yaml',
),

deployment:
deployment.new('collector', 1, [self.container]) +
deployment.mixin.metadata.withNamespace(namespace) +
k.util.configMapVolumeMount(this.configMap, '/etc/collector'),


service:
k.util.serviceFor(self.deployment) +
service.mixin.metadata.withNamespace(namespace),
},
}