From 48b39c36083d2aa4a4d0708db4e1852f6ea64e43 Mon Sep 17 00:00:00 2001 From: Annanay Date: Fri, 3 Dec 2021 13:05:18 +0530 Subject: [PATCH 1/6] Add overrides configmap to querier, update docs for new limit Signed-off-by: Annanay --- docs/tempo/website/configuration/_index.md | 53 +++++++++++++------ .../jsonnet/microservices/querier.libsonnet | 3 ++ 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/docs/tempo/website/configuration/_index.md b/docs/tempo/website/configuration/_index.md index b60f3eb4411..ce775bf521b 100644 --- a/docs/tempo/website/configuration/_index.md +++ b/docs/tempo/website/configuration/_index.md @@ -723,38 +723,40 @@ overrides: # Global ingestion limits configurations + # Specifies whether the ingestion rate limits should be applied by each replica (of the distributor and ingester) + # individually OR should be shared across all replicas. See the "override strategies" section below for an example. + [ingestion_rate_strategy: | default = local] + # Burst size (bytes) used in ingestion. - # (Default: `20,000,000` ~20MB ) # Results in errors like - # RATE_LIMITED: ingestion rate limit (15000000 bytes) exceeded while adding 10 bytes - [ingestion_burst_size_bytes: ] + # RATE_LIMITED: ingestion rate limit (20000000 bytes) exceeded while adding 10 bytes + [ingestion_burst_size_bytes: | default = 20000000 (20MB) ] # Per-user ingestion rate limit (bytes) used in ingestion. - # (Default: `15,000,000` ~15MB) # Results in errors like - # RATE_LIMITED: ingestion rate limit (15000000 bytes) exceeded while - [ingestion_rate_limit_bytes: ] - + # RATE_LIMITED: ingestion rate limit (15000000 bytes) exceeded while adding 10 bytes + [ingestion_rate_limit_bytes: | default = 15000000 (15MB) ] + # Maximum size of a single trace in bytes. `0` to disable. - # (Default: `5,000,000` ~5MB) # Results in errors like # TRACE_TOO_LARGE: max size of trace (5000000) exceeded while adding 387 bytes - [max_bytes_per_trace: ] + [max_bytes_per_trace: | default = 5000000 (5MB) ] # Maximum number of active traces per user, per ingester. `0` to disable. - # (Default: `10,000`) # Results in errors like # LIVE_TRACES_EXCEEDED: max live traces per tenant exceeded: per-user traces limit (local: 10000 global: 0 actual local: 1) exceeded - [max_traces_per_user: ] + # This override limit is used by the ingester. + [max_traces_per_user: | default = 10000] + # Maximum size in bytes of a tag-values query. Tag-values query is used mainly to populate the autocomplete dropdown. + # Limit added to protect from tags with high cardinality or large values (like http urls or sql queries) + # This override limit is used by the ingester and the querier. + [max_bytes_per_tag_values_query: | default = 5000000 (5MB) ] # Tenant-specific overrides - # tenant-specific overrides settings config file + # Tenant-specific overrides settings config file. See the "Tenant-specific overrides" section below for an example. [per_tenant_override_config: /conf/overrides.yaml] - - # Ingestion strategy, default is `local`. - [ingestion_rate_strategy: ] ``` @@ -793,11 +795,28 @@ The trace limits specified by the various parameters are, by default, applied as A setting that applies at a local level is quite helpful in ensuring that each distributor independently can process traces up to the limit without affecting the tracing limits on other distributors. However, as a cluster grows quite large, this can lead to quite a large quantity of traces. An alternative strategy may be to set a `global` trace limit that establishes a total budget of all traces across all distributors in the cluster. The global limit is averaged across all distributors by using the distributor ring. + ```yaml # /conf/tempo.yaml overrides: - # Ingestion strategy, default is `local`. - ingestion_rate_strategy: + [ingestion_rate_strategy: | default = local] +``` + +For ex, the config below specifies that each replica of the distributor will apply a limit of `15MB/s`. + +```yaml +overrides: + - ingestion_rate_strategy: local + - ingestion_rate_limit_bytes: 15000000 +``` + +The config below specifies that together, all distributor replicas will apply a limit of `15MB/s`. +So if there are 5 replicas, each will apply a local limit of `(15MB/s / 5) = 3MB/s`. + +```yaml +overrides: + - ingestion_rate_strategy: global + - ingestion_rate_limit_bytes: 15000000 ``` ## Search diff --git a/operations/jsonnet/microservices/querier.libsonnet b/operations/jsonnet/microservices/querier.libsonnet index f35cfd23fda..e68b5019dd9 100644 --- a/operations/jsonnet/microservices/querier.libsonnet +++ b/operations/jsonnet/microservices/querier.libsonnet @@ -10,6 +10,7 @@ local target_name = 'querier', local tempo_config_volume = 'tempo-conf', + local tempo_overrides_config_volume = 'overrides', tempo_querier_container:: container.new(target_name, $._images.tempo) + @@ -23,6 +24,7 @@ ]) + container.withVolumeMounts([ volumeMount.new(tempo_config_volume, '/conf'), + volumeMount.new(tempo_overrides_config_volume, '/overrides'), ]) + $.util.withResources($._config.querier.resources) + $.util.readinessProbe, @@ -44,5 +46,6 @@ }) + deployment.mixin.spec.template.spec.withVolumes([ volume.fromConfigMap(tempo_config_volume, $.tempo_querier_configmap.metadata.name), + volume.fromConfigMap(tempo_overrides_config_volume, $._config.overrides_configmap_name), ]), } From b5af06cfb6234166536fce1b1f468dad87dacdfa Mon Sep 17 00:00:00 2001 From: Annanay Date: Fri, 3 Dec 2021 15:08:30 +0530 Subject: [PATCH 2/6] make kube-manifests-check Signed-off-by: Annanay --- operations/kube-manifests/Deployment-querier.yaml | 5 +++++ operations/kube-manifests/util/jsonnetfile.lock.json | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/operations/kube-manifests/Deployment-querier.yaml b/operations/kube-manifests/Deployment-querier.yaml index 9525eca1fa0..9b3fd033cb8 100644 --- a/operations/kube-manifests/Deployment-querier.yaml +++ b/operations/kube-manifests/Deployment-querier.yaml @@ -52,7 +52,12 @@ spec: volumeMounts: - mountPath: /conf name: tempo-conf + - mountPath: /overrides + name: overrides volumes: - configMap: name: tempo-querier name: tempo-conf + - configMap: + name: tempo-overrides + name: overrides diff --git a/operations/kube-manifests/util/jsonnetfile.lock.json b/operations/kube-manifests/util/jsonnetfile.lock.json index f072d30a2ce..7d20bb54545 100644 --- a/operations/kube-manifests/util/jsonnetfile.lock.json +++ b/operations/kube-manifests/util/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "ksonnet-util" } }, - "version": "ad31de265551c5577fed96d4f5a8b818027a2d85", + "version": "9927be87af4be9ff6b009e4503868b1b5493011b", "sum": "fFVlCoa/N0qiqTbDhZAEdRm2Vv76Z9Clxp3/haJ+PyA=" }, { @@ -18,7 +18,7 @@ "subdir": "memcached" } }, - "version": "ad31de265551c5577fed96d4f5a8b818027a2d85", + "version": "9927be87af4be9ff6b009e4503868b1b5493011b", "sum": "dTOeEux3t9bYSqP2L/uCuLo/wUDpCKH4w+4OD9fePUk=" }, { From b89853aa3cd24f4a8033d17b790b1b1097f88e75 Mon Sep 17 00:00:00 2001 From: Annanay Agarwal Date: Sat, 4 Dec 2021 12:32:10 +0530 Subject: [PATCH 3/6] Apply suggestions from code review Co-authored-by: Karen Miller <84039272+KMiller-Grafana@users.noreply.github.com> --- docs/tempo/website/configuration/_index.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/tempo/website/configuration/_index.md b/docs/tempo/website/configuration/_index.md index 35e6092049c..f3c92efb5ad 100644 --- a/docs/tempo/website/configuration/_index.md +++ b/docs/tempo/website/configuration/_index.md @@ -717,8 +717,8 @@ overrides: # Global ingestion limits configurations - # Specifies whether the ingestion rate limits should be applied by each replica (of the distributor and ingester) - # individually OR should be shared across all replicas. See the "override strategies" section below for an example. + # Specifies whether the ingestion rate limits should be applied by each instance of the distributor and ingester + # individually, or the limits are to be shared across all instances. See the "override strategies" section for an example. [ingestion_rate_strategy: | default = local] # Burst size (bytes) used in ingestion. @@ -747,13 +747,13 @@ overrides: [max_search_bytes_per_trace: | default = 5000] # Maximum size in bytes of a tag-values query. Tag-values query is used mainly to populate the autocomplete dropdown. - # Limit added to protect from tags with high cardinality or large values (like http urls or sql queries) + # Limit added to protect from tags with high cardinality or large values (like HTTP URLs or SQL queries) # This override limit is used by the ingester and the querier. [max_bytes_per_tag_values_query: | default = 5000000 (5MB) ] # Tenant-specific overrides - # Tenant-specific overrides settings config file. See the "Tenant-specific overrides" section below for an example. + # Tenant-specific overrides settings configuration file. See the "Tenant-specific overrides" section for an example. [per_tenant_override_config: /conf/overrides.yaml] ``` @@ -800,7 +800,7 @@ overrides: [ingestion_rate_strategy: | default = local] ``` -For ex, the config below specifies that each replica of the distributor will apply a limit of `15MB/s`. +For example, this configuration specifies that each instance of the distributor will apply a limit of `15MB/s`. ```yaml overrides: @@ -808,8 +808,8 @@ overrides: - ingestion_rate_limit_bytes: 15000000 ``` -The config below specifies that together, all distributor replicas will apply a limit of `15MB/s`. -So if there are 5 replicas, each will apply a local limit of `(15MB/s / 5) = 3MB/s`. +This configuration specifies that together, all distributor instances will apply a limit of `15MB/s`. +So if there are 5 instances, each instance will apply a local limit of `(15MB/s / 5) = 3MB/s`. ```yaml overrides: From 0cecdd66f5b121c5084540494e6a93e36a824e6a Mon Sep 17 00:00:00 2001 From: Annanay Date: Mon, 6 Dec 2021 15:28:43 +0530 Subject: [PATCH 4/6] Split unit tests and integration test runners in CI Signed-off-by: Annanay --- .github/workflows/ci.yml | 21 ++++++++++++++++++--- Makefile | 7 +++++-- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4b5be33d272..b647dbf2e4e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,8 +24,8 @@ jobs: with: version: v1.41.1 - test: - name: Test + unit-tests: + name: Test packages runs-on: ubuntu-latest steps: - name: Set up Go 1.17 @@ -37,7 +37,22 @@ jobs: uses: actions/checkout@v2 - name: Test - run: make test-all + run: make test-with-cover + + integration-tests: + name: Test integration e2e suite + runs-on: ubuntu-latest + steps: + - name: Set up Go 1.17 + uses: actions/setup-go@v2 + with: + go-version: 1.17 + + - name: Check out code + uses: actions/checkout@v2 + + - name: Test + run: make test-e2e build: name: Build diff --git a/Makefile b/Makefile index 4f57abc0fd9..01b28d765a3 100644 --- a/Makefile +++ b/Makefile @@ -73,10 +73,13 @@ benchmark: test-with-cover: $(GOTEST) $(GOTEST_OPT_WITH_COVERAGE) $(ALL_PKGS) +.PHONY: test-e2e +test-e2e: docker-tempo + $(GOTEST) -v $(GOTEST_OPT) ./integration/e2e + # test-all/bench use a docker image so build it first to make sure we're up to date .PHONY: test-all -test-all: docker-tempo test-with-cover - $(GOTEST) -v $(GOTEST_OPT) ./integration/e2e +test-all: test-with-cover test-e2e .PHONY: test-bench test-bench: docker-tempo From 62b462ab38d36c3c8481583f1624cf0a96944a08 Mon Sep 17 00:00:00 2001 From: Annanay Date: Mon, 6 Dec 2021 16:00:46 +0530 Subject: [PATCH 5/6] Increase ingestion limits for search data in e2e tests Signed-off-by: Annanay --- integration/e2e/config-all-in-one-azurite.yaml | 3 +++ integration/e2e/config-all-in-one-gcs.yaml | 3 +++ integration/e2e/config-all-in-one-s3.yaml | 3 +++ 3 files changed, 9 insertions(+) diff --git a/integration/e2e/config-all-in-one-azurite.yaml b/integration/e2e/config-all-in-one-azurite.yaml index cf35d4b985f..de42d858c2e 100644 --- a/integration/e2e/config-all-in-one-azurite.yaml +++ b/integration/e2e/config-all-in-one-azurite.yaml @@ -37,3 +37,6 @@ storage: pool: max_workers: 10 queue_depth: 100 + +overrides: + max_search_bytes_per_trace: 50_000 \ No newline at end of file diff --git a/integration/e2e/config-all-in-one-gcs.yaml b/integration/e2e/config-all-in-one-gcs.yaml index 2092b8cac00..245e1141fb3 100644 --- a/integration/e2e/config-all-in-one-gcs.yaml +++ b/integration/e2e/config-all-in-one-gcs.yaml @@ -36,3 +36,6 @@ storage: pool: max_workers: 10 queue_depth: 1000 + +overrides: + max_search_bytes_per_trace: 50_000 \ No newline at end of file diff --git a/integration/e2e/config-all-in-one-s3.yaml b/integration/e2e/config-all-in-one-s3.yaml index 7420ae6831f..3d716d063a0 100644 --- a/integration/e2e/config-all-in-one-s3.yaml +++ b/integration/e2e/config-all-in-one-s3.yaml @@ -38,3 +38,6 @@ storage: pool: max_workers: 10 queue_depth: 100 + +overrides: + max_search_bytes_per_trace: 50_000 \ No newline at end of file From eacd93b5124a6a5c5fce52b6de088efa04329521 Mon Sep 17 00:00:00 2001 From: Annanay Date: Mon, 6 Dec 2021 16:45:23 +0530 Subject: [PATCH 6/6] Slightly optimize integration tests Signed-off-by: Annanay --- integration/e2e/e2e_test.go | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/integration/e2e/e2e_test.go b/integration/e2e/e2e_test.go index c673cea65fa..14ae69eda07 100644 --- a/integration/e2e/e2e_test.go +++ b/integration/e2e/e2e_test.go @@ -6,6 +6,7 @@ import ( "os" "reflect" "strings" + "sync" "testing" "time" @@ -221,7 +222,7 @@ func TestMicroservices(t *testing.T) { queryAndAssertTrace(t, apiClient, info) // stop an ingester and confirm we can still write and query - err = tempoIngester2.Stop() + err = tempoIngester2.Kill() require.NoError(t, err) // sleep for heartbeat timeout @@ -237,7 +238,7 @@ func TestMicroservices(t *testing.T) { searchAndAssertTrace(t, apiClient, info) // stop another ingester and confirm things fail - err = tempoIngester1.Stop() + err = tempoIngester1.Kill() require.NoError(t, err) require.Error(t, info.EmitBatches(c)) @@ -251,12 +252,27 @@ func TestScalableSingleBinary(t *testing.T) { minio := cortex_e2e_db.NewMinio(9000, "tempo") require.NotNil(t, minio) require.NoError(t, s.StartAndWaitReady(minio)) - // + + // copy configuration file over to shared dir require.NoError(t, util.CopyFileToSharedDir(s, configHA, "config.yaml")) - tempo1 := util.NewTempoScalableSingleBinary(1) - tempo2 := util.NewTempoScalableSingleBinary(2) - tempo3 := util.NewTempoScalableSingleBinary(3) + // start three scalable single binary tempos in parallel + var wg sync.WaitGroup + var tempo1, tempo2, tempo3 *cortex_e2e.HTTPService + wg.Add(3) + go func() { + tempo1 = util.NewTempoScalableSingleBinary(1) + wg.Done() + }() + go func() { + tempo2 = util.NewTempoScalableSingleBinary(2) + wg.Done() + }() + go func() { + tempo3 = util.NewTempoScalableSingleBinary(3) + wg.Done() + }() + wg.Wait() require.NoError(t, s.StartAndWaitReady(tempo1, tempo2, tempo3)) // wait for 2 active ingesters @@ -314,16 +330,16 @@ func TestScalableSingleBinary(t *testing.T) { queryAndAssertTrace(t, apiClient1, info) - err = tempo1.Stop() + err = tempo1.Kill() require.NoError(t, err) // Push to one of the instances that are still running. require.NoError(t, info.EmitBatches(c2)) - err = tempo2.Stop() + err = tempo2.Kill() require.NoError(t, err) - err = tempo3.Stop() + err = tempo3.Kill() require.NoError(t, err) }