diff --git a/CHANGELOG.md b/CHANGELOG.md index 8967a98b396..1093d5b055b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ * [BUGFIX] Fixes error where Dell ECS cannot list objects. [#561](https://github.com/grafana/tempo/pull/561) * [BUGFIX] Fixes listing blocks in S3 when the list is truncated. [#567](https://github.com/grafana/tempo/pull/567) * [BUGFIX] Fixes where ingester may leave file open [#570](https://github.com/grafana/tempo/pull/570) +* [BUGFIX] Fixes a bug where some blocks were not searched due to query sharding and randomness in blocklist poll. [#583](https://github.com/grafana/tempo/pull/583) ## v0.6.0 diff --git a/cmd/tempo-query/tempo/plugin.go b/cmd/tempo-query/tempo/plugin.go index af22dff23c1..8b7a73fa891 100644 --- a/cmd/tempo-query/tempo/plugin.go +++ b/cmd/tempo-query/tempo/plugin.go @@ -27,7 +27,7 @@ type Backend struct { func New(cfg *Config) *Backend { return &Backend{ - tempoEndpoint: "http://" + cfg.Backend + "/api/traces/", + tempoEndpoint: "http://" + cfg.Backend + "/tempo/api/traces/", } } @@ -75,6 +75,10 @@ func (b *Backend) GetTrace(ctx context.Context, traceID jaeger.TraceID) (*jaeger return nil, fmt.Errorf("error reading response from tempo: %w", err) } + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("%s", body) + } + otTrace := ot_pdata.NewTraces() err = otTrace.FromOtlpProtoBytes(body) if err != nil { diff --git a/cmd/tempo/app/modules.go b/cmd/tempo/app/modules.go index 14a49a9314e..87e44d803d8 100644 --- a/cmd/tempo/app/modules.go +++ b/cmd/tempo/app/modules.go @@ -47,6 +47,10 @@ const ( All string = "all" ) +const ( + queryEndpoint = "/tempo/api/traces/{traceID}" +) + func (t *App) initServer() (services.Service, error) { t.cfg.Server.MetricsNamespace = metricsNamespace t.cfg.Server.ExcludeRequestInLog = true @@ -153,7 +157,7 @@ func (t *App) initQuerier() (services.Service, error) { t.httpAuthMiddleware, ).Wrap(http.HandlerFunc(t.querier.TraceByIDHandler)) - t.server.HTTP.Handle("/querier/api/traces/{traceID}", tracesHandler) + t.server.HTTP.Handle("/querier"+queryEndpoint, tracesHandler) return t.querier, t.querier.CreateAndRegisterWorker(t.server.HTTPServer.Handler) } @@ -185,7 +189,7 @@ func (t *App) initQueryFrontend() (services.Service, error) { // register grpc server for queriers to connect to cortex_frontend_v1pb.RegisterFrontendServer(t.server.GRPC, t.frontend) // http query endpoint - t.server.HTTP.Handle("/api/traces/{traceID}", tracesHandler) + t.server.HTTP.Handle(queryEndpoint, tracesHandler) return services.NewIdleService(nil, func(_ error) error { t.frontend.Close() diff --git a/docs/tempo/website/architecture/architecture.md b/docs/tempo/website/architecture/architecture.md index 27c73ae3b17..938ff7f8e38 100644 --- a/docs/tempo/website/architecture/architecture.md +++ b/docs/tempo/website/architecture/architecture.md @@ -35,7 +35,7 @@ Batches traces into blocks, blooms, indexes and flushes to backend. Blocks in t Responsible for sharding the search space for an incoming query. Traces are exposed via a simple HTTP endpoint: -`GET /api/traces/` +`GET /tempo/api/traces/` Internally, the Query Frontend splits the blockID space into a configurable number of shards and queues these requests. Queriers connect to the Query Frontend via a streaming gRPC connection to process these sharded queries. @@ -45,7 +45,7 @@ Queriers connect to the Query Frontend via a streaming gRPC connection to proces The querier is responsible for finding the requested trace id in either the ingesters or the backend storage. 
It begins by querying the ingesters to see if the id is currently stored there, if not it proceeds to use the bloom and indexes to find the trace in the storage backend.
The querier exposes an HTTP endpoint at:
-`GET /querier/api/traces/`, but its not expected to be used directly.
+`GET /querier/tempo/api/traces/`, but it's not expected to be used directly.
Queries should be sent to the Query Frontend.
diff --git a/docs/tempo/website/configuration/_index.md b/docs/tempo/website/configuration/_index.md
index 660c565bc7f..6ca419277e3 100644
--- a/docs/tempo/website/configuration/_index.md
+++ b/docs/tempo/website/configuration/_index.md
@@ -78,6 +78,9 @@ querier:
 frontend_address: query-frontend-discovery.default.svc.cluster.local:9095 # the address of the query frontend to connect to, and process queries
 ```
+The Querier also queries compacted blocks that fall within (2 * BlocklistPoll), where the blocklist poll duration
+is defined in the storage section below.
+
 ## Compactor
 See [here](https://github.com/grafana/tempo/blob/master/modules/compactor/config.go) for all configuration options.
diff --git a/docs/tempo/website/guides/pushing-spans-with-http.md b/docs/tempo/website/guides/pushing-spans-with-http.md
index d3e12e45c9c..6d40cb0a5ff 100644
--- a/docs/tempo/website/guides/pushing-spans-with-http.md
+++ b/docs/tempo/website/guides/pushing-spans-with-http.md
@@ -61,7 +61,7 @@ Note that the `timestamp` field is in microseconds and was obtained by running `
 The easiest way to get the trace is to execute a simple curl command to Tempo. The returned format is [OTLP](https://github.com/open-telemetry/opentelemetry-proto/blob/master/opentelemetry/proto/trace/v1/trace.proto).
 ```bash
-curl http://localhost:3100/api/traces/0123456789abcdef
+curl http://localhost:3100/tempo/api/traces/0123456789abcdef
 {"batches":[{"resource":{"attributes":[{"key":"service.name","value":{"stringValue":"shell script"}}]},"instrumentationLibrarySpans":[{"spans":[{"traceId":"AAAAAAAAAAABI0VniavN7w==","spanId":"AAAAAAAAEjQ=","name":"span from bash!","startTimeUnixNano":"1608239395286533000","endTimeUnixNano":"1608239395386533000","attributes":[{"key":"http.path","value":{"stringValue":"/api"}},{"key":"http.method","value":{"stringValue":"GET"}}]}]}]}]}
 ```
diff --git a/integration/bench/smoke_test.js b/integration/bench/smoke_test.js
index 6a068834901..b001498b399 100644
--- a/integration/bench/smoke_test.js
+++ b/integration/bench/smoke_test.js
@@ -72,7 +72,7 @@ export function readPath() {
 console.log(`type=read traceId=${traceId}`);
- let res = http.get(`${QUERY_ENDPOINT}/api/traces/${traceId}`, params);
+ let res = http.get(`${QUERY_ENDPOINT}/tempo/api/traces/${traceId}`, params);
 check(res, { 'read status is 200': (r) => r.status === 200, }, { type: 'read' });
diff --git a/integration/e2e/e2e_test.go b/integration/e2e/e2e_test.go
index 6db1d1df776..0e7b4d7886a 100644
--- a/integration/e2e/e2e_test.go
+++ b/integration/e2e/e2e_test.go
@@ -62,7 +62,7 @@ func TestAllInOne(t *testing.T) {
 require.NoError(t, tempo.WaitSumMetrics(cortex_e2e.Equals(1), "tempo_ingester_traces_created_total"))
 // query an in-memory trace
- queryAndAssertTrace(t, "http://"+tempo.Endpoint(3100)+"/api/traces/"+hexID, "my operation", 1)
+ queryAndAssertTrace(t, "http://"+tempo.Endpoint(3100)+"/tempo/api/traces/"+hexID, "my operation", 1)
 // flush trace to backend
 res, err := cortex_e2e.GetRequest("http://" + tempo.Endpoint(3100) + "/flush")
@@ -78,7 +78,7 @@ func TestAllInOne(t *testing.T) {
 require.NoError(t,
tempo.WaitSumMetrics(cortex_e2e.Equals(1), "tempo_query_frontend_queries_total")) // query trace - should fetch from backend - queryAndAssertTrace(t, "http://"+tempo.Endpoint(3100)+"/api/traces/"+hexID, "my operation", 1) + queryAndAssertTrace(t, "http://"+tempo.Endpoint(3100)+"/tempo/api/traces/"+hexID, "my operation", 1) } func TestAzuriteAllInOne(t *testing.T) { @@ -121,7 +121,7 @@ func TestAzuriteAllInOne(t *testing.T) { require.NoError(t, tempo.WaitSumMetrics(cortex_e2e.Equals(1), "tempo_ingester_traces_created_total")) // query an in-memory trace - queryAndAssertTrace(t, "http://"+tempo.Endpoint(3100)+"/api/traces/"+hexID, "my operation", 1) + queryAndAssertTrace(t, "http://"+tempo.Endpoint(3100)+"/tempo/api/traces/"+hexID, "my operation", 1) // flush trace to backend res, err := cortex_e2e.GetRequest("http://" + tempo.Endpoint(3100) + "/flush") @@ -136,7 +136,7 @@ func TestAzuriteAllInOne(t *testing.T) { require.NoError(t, tempo.WaitSumMetrics(cortex_e2e.Equals(1), "tempodb_blocklist_length")) // query trace - should fetch from backend - queryAndAssertTrace(t, "http://"+tempo.Endpoint(3100)+"/api/traces/"+hexID, "my operation", 1) + queryAndAssertTrace(t, "http://"+tempo.Endpoint(3100)+"/tempo/api/traces/"+hexID, "my operation", 1) } func TestMicroservices(t *testing.T) { @@ -192,7 +192,7 @@ func TestMicroservices(t *testing.T) { require.NoError(t, tempoIngester3.WaitSumMetrics(cortex_e2e.Equals(1), "tempo_ingester_traces_created_total")) // query an in-memory trace - queryAndAssertTrace(t, "http://"+tempoQueryFrontend.Endpoint(3100)+"/api/traces/"+hexID, "my operation", 1) + queryAndAssertTrace(t, "http://"+tempoQueryFrontend.Endpoint(3100)+"/tempo/api/traces/"+hexID, "my operation", 1) // flush trace to backend res, err := cortex_e2e.GetRequest("http://" + tempoIngester1.Endpoint(3100) + "/flush") @@ -214,7 +214,7 @@ func TestMicroservices(t *testing.T) { require.NoError(t, tempoQueryFrontend.WaitSumMetrics(cortex_e2e.Equals(1), "tempo_query_frontend_queries_total")) // query trace - should fetch from backend - queryAndAssertTrace(t, "http://"+tempoQueryFrontend.Endpoint(3100)+"/api/traces/"+hexID, "my operation", 1) + queryAndAssertTrace(t, "http://"+tempoQueryFrontend.Endpoint(3100)+"/tempo/api/traces/"+hexID, "my operation", 1) // stop an ingester and confirm we can still write and query err = tempoIngester2.Stop() @@ -228,7 +228,7 @@ func TestMicroservices(t *testing.T) { hexID = fmt.Sprintf("%016x%016x", batch.Spans[0].TraceIdHigh, batch.Spans[0].TraceIdLow) // query an in-memory trace - queryAndAssertTrace(t, "http://"+tempoQueryFrontend.Endpoint(3100)+"/api/traces/"+hexID, "my operation", 1) + queryAndAssertTrace(t, "http://"+tempoQueryFrontend.Endpoint(3100)+"/tempo/api/traces/"+hexID, "my operation", 1) // stop another ingester and confirm things fail err = tempoIngester1.Stop() diff --git a/operations/tempo-mixin/dashboards.libsonnet b/operations/tempo-mixin/dashboards.libsonnet index 58220f9c166..6784cf8762a 100644 --- a/operations/tempo-mixin/dashboards.libsonnet +++ b/operations/tempo-mixin/dashboards.libsonnet @@ -34,22 +34,22 @@ dashboard_utils { g.row('Query Frontend') .addPanel( $.panel('QPS') + - $.qpsPanel('tempo_request_duration_seconds_count{%s, route="api_traces_traceid"}' % $.jobMatcher($._config.jobs.query_frontend)) + $.qpsPanel('tempo_request_duration_seconds_count{%s, route="tempo_api_traces_traceid"}' % $.jobMatcher($._config.jobs.query_frontend)) ) .addPanel( $.panel('Latency') + - $.latencyPanel('tempo_request_duration_seconds', 
'{%s,route="api_traces_traceid"}' % $.jobMatcher($._config.jobs.query_frontend)) + $.latencyPanel('tempo_request_duration_seconds', '{%s,route="tempo_api_traces_traceid"}' % $.jobMatcher($._config.jobs.query_frontend)) ) ) .addRow( g.row('Querier') .addPanel( $.panel('QPS') + - $.qpsPanel('tempo_request_duration_seconds_count{%s, route="querier_api_traces_traceid"}' % $.jobMatcher($._config.jobs.querier)) + $.qpsPanel('tempo_request_duration_seconds_count{%s, route="querier_tempo_api_traces_traceid"}' % $.jobMatcher($._config.jobs.querier)) ) .addPanel( $.panel('Latency') + - $.latencyPanel('tempo_request_duration_seconds', '{%s,route="querier_api_traces_traceid"}' % $.jobMatcher($._config.jobs.querier)) + $.latencyPanel('tempo_request_duration_seconds', '{%s,route="querier_tempo_api_traces_traceid"}' % $.jobMatcher($._config.jobs.querier)) ) ) .addRow( diff --git a/operations/tempo-mixin/out/tempo-reads.json b/operations/tempo-mixin/out/tempo-reads.json index 760afb71745..29ab63641bf 100644 --- a/operations/tempo-mixin/out/tempo-reads.json +++ b/operations/tempo-mixin/out/tempo-reads.json @@ -492,7 +492,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=\"api_traces_traceid\"}[$__interval]), \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"), \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", + "expr": "sum by (status) (label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\", route=\"tempo_api_traces_traceid\"}[$__interval]), \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"), \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -579,7 +579,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=\"api_traces_traceid\"}[$__interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=\"tempo_api_traces_traceid\"}[$__interval])) by (le)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -588,7 +588,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=\"api_traces_traceid\"}[$__interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=\"tempo_api_traces_traceid\"}[$__interval])) by (le)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -597,7 +597,7 @@ "step": 10 }, { - "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=\"api_traces_traceid\"}[$__interval])) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=\"api_traces_traceid\"}[$__interval]))", + "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/query-frontend\",route=\"tempo_api_traces_traceid\"}[$__interval])) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", 
job=~\"($namespace)/query-frontend\",route=\"tempo_api_traces_traceid\"}[$__interval]))", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -702,7 +702,7 @@ "steppedLine": false, "targets": [ { - "expr": "sum by (status) (label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=\"querier_api_traces_traceid\"}[$__interval]), \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"), \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", + "expr": "sum by (status) (label_replace(label_replace(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\", route=\"querier_tempo_api_traces_traceid\"}[$__interval]), \"status\", \"${1}xx\", \"status_code\", \"([0-9])..\"), \"status\", \"${1}\", \"status_code\", \"([a-z]+)\"))", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -789,7 +789,7 @@ "steppedLine": false, "targets": [ { - "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=\"querier_api_traces_traceid\"}[$__interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.99, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=\"querier_tempo_api_traces_traceid\"}[$__interval])) by (le)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -798,7 +798,7 @@ "step": 10 }, { - "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=\"querier_api_traces_traceid\"}[$__interval])) by (le)) * 1e3", + "expr": "histogram_quantile(0.50, sum(rate(tempo_request_duration_seconds_bucket{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=\"querier_tempo_api_traces_traceid\"}[$__interval])) by (le)) * 1e3", "format": "time_series", "interval": "1m", "intervalFactor": 2, @@ -807,7 +807,7 @@ "step": 10 }, { - "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=\"querier_api_traces_traceid\"}[$__interval])) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=\"querier_api_traces_traceid\"}[$__interval]))", + "expr": "sum(rate(tempo_request_duration_seconds_sum{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=\"querier_tempo_api_traces_traceid\"}[$__interval])) * 1e3 / sum(rate(tempo_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/querier\",route=\"querier_tempo_api_traces_traceid\"}[$__interval]))", "format": "time_series", "interval": "1m", "intervalFactor": 2, diff --git a/pkg/util/query.go b/pkg/util/query.go index 86c24805137..430bb46ff55 100644 --- a/pkg/util/query.go +++ b/pkg/util/query.go @@ -12,7 +12,7 @@ import ( const orgIDHeader = "X-Scope-OrgID" func QueryTrace(baseURL, id, orgID string) (*tempopb.Trace, error) { - req, err := http.NewRequest("GET", baseURL+"/api/traces/"+id, nil) + req, err := http.NewRequest("GET", baseURL+"/tempo/api/traces/"+id, nil) if err != nil { return nil, err }