diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bf893a6aa3..58c06caa737 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -77,6 +77,7 @@ * [BUGFIX] Set span's tag `span.kind` to `client` in query-frontend [#975](https://github.com/grafana/tempo/pull/975) (@mapno) * [BUGFIX] Nil check overrides module in the `/status` handler [#994](https://github.com/grafana/tempo/pull/994) (@mapno) * [BUGFIX] Several bug fixes for search contention and panics [#1033](https://github.com/grafana/tempo/pull/1033) (@mdisibio) +* [BUGFIX] Fixes `tempodb_backend_hedged_roundtrips_total` to correctly count hedged roundtrips. [#1079](https://github.com/grafana/tempo/pull/1079) (@joe-elliott) ## v1.1.0 / 2021-08-26 * [CHANGE] Upgrade Cortex from v1.9.0 to v1.9.0-131-ga4bf10354 [#841](https://github.com/grafana/tempo/pull/841) (@aknuds1) diff --git a/modules/frontend/tracebyidsharding.go b/modules/frontend/tracebyidsharding.go index d6af58eee31..5bc6a683b12 100644 --- a/modules/frontend/tracebyidsharding.go +++ b/modules/frontend/tracebyidsharding.go @@ -96,13 +96,8 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) { return } - // if not found bail - if resp.StatusCode == http.StatusNotFound { - return - } - // if the status code is anything but happy, save the error and pass it down the line - if resp.StatusCode != http.StatusOK { + if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusNotFound { // todo: if we cancel the parent context here will it shortcircuit the other queries and fail fast? statusCode = resp.StatusCode bytesMsg, err := io.ReadAll(resp.Body) @@ -113,7 +108,7 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) { return } - // successful query, read the body + // read the body buff, err := io.ReadAll(resp.Body) if err != nil { _ = level.Error(s.logger).Log("msg", "error reading response body status == ok", "url", innerR.RequestURI, "err", err) @@ -140,6 +135,11 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) { } } + // if not found bail + if resp.StatusCode == http.StatusNotFound { + return + } + // happy path statusCode = http.StatusOK overallTrace, _, _, _ = model.CombineTraceProtos(overallTrace, traceResp.Trace) diff --git a/modules/frontend/tracebyidsharding_test.go b/modules/frontend/tracebyidsharding_test.go index 89c20d37252..47516380e56 100644 --- a/modules/frontend/tracebyidsharding_test.go +++ b/modules/frontend/tracebyidsharding_test.go @@ -92,17 +92,18 @@ func TestShardingWareDoRequest(t *testing.T) { } tests := []struct { - name string - status1 int - status2 int - trace1 *tempopb.Trace - trace2 *tempopb.Trace - err1 error - err2 error - failedBlockQueries int - expectedStatus int - expectedTrace *tempopb.Trace - expectedError error + name string + status1 int + status2 int + trace1 *tempopb.Trace + trace2 *tempopb.Trace + err1 error + err2 error + failedBlockQueries1 int + failedBlockQueries2 int + expectedStatus int + expectedTrace *tempopb.Trace + expectedError error }{ { name: "empty returns", @@ -202,7 +203,6 @@ func TestShardingWareDoRequest(t *testing.T) { trace2: trace1, expectedError: errors.New("booo"), }, - { name: "500+err", status1: 500, @@ -211,23 +211,54 @@ func TestShardingWareDoRequest(t *testing.T) { expectedError: errors.New("booo"), }, { - name: "failed block queries <= max", - status1: 200, - trace1: trace1, - status2: 200, - trace2: trace2, - failedBlockQueries: 1, - expectedStatus: 200, - expectedTrace: trace, + name: "failedBlocks under: 200+200", + status1: 200, + trace1: trace1, + status2: 200, + trace2: trace2, + failedBlockQueries1: 1, + failedBlockQueries2: 1, + expectedStatus: 200, + expectedTrace: trace, + }, + { + name: "failedBlocks over: 200+200", + status1: 200, + trace1: trace1, + status2: 200, + trace2: trace2, + failedBlockQueries1: 0, + failedBlockQueries2: 5, + expectedError: errors.New("too many failed block queries 5 (max 2)"), + }, + { + name: "failedBlocks under: 200+404", + status1: 200, + trace1: trace1, + status2: 404, + failedBlockQueries1: 1, + failedBlockQueries2: 0, + expectedStatus: 200, + expectedTrace: trace1, + }, + { + name: "failedBlocks under: 404+200", + status1: 200, + trace1: trace1, + status2: 404, + failedBlockQueries1: 0, + failedBlockQueries2: 1, + expectedStatus: 200, + expectedTrace: trace1, }, { - name: "too many failed block queries", - status1: 200, - trace1: trace1, - status2: 200, - trace2: trace2, - failedBlockQueries: 10, - expectedError: errors.New("too many failed block queries 10 (max 2)"), + name: "failedBlocks over: 404+200", + status1: 200, + trace1: trace1, + status2: 404, + failedBlockQueries1: 0, + failedBlockQueries2: 5, + expectedError: errors.New("too many failed block queries 5 (max 2)"), }, } @@ -239,29 +270,41 @@ func TestShardingWareDoRequest(t *testing.T) { var trace *tempopb.Trace var statusCode int var err error + var failedBlockQueries int if r.RequestURI == "/querier/api/traces/1234?mode=ingesters" { trace = tc.trace1 statusCode = tc.status1 err = tc.err1 + failedBlockQueries = tc.failedBlockQueries1 } else { trace = tc.trace2 err = tc.err2 statusCode = tc.status2 + failedBlockQueries = tc.failedBlockQueries2 } if err != nil { return nil, err } - var resBytes []byte - if trace != nil { - resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{ - Trace: trace, - Metrics: &tempopb.TraceByIDMetrics{ - FailedBlocks: uint32(tc.failedBlockQueries), - }, - }) - require.NoError(t, err) + resBytes := []byte("error occurred") + if statusCode != 500 { + if trace != nil { + resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{ + Trace: trace, + Metrics: &tempopb.TraceByIDMetrics{ + FailedBlocks: uint32(failedBlockQueries), + }, + }) + require.NoError(t, err) + } else { + resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{ + Metrics: &tempopb.TraceByIDMetrics{ + FailedBlocks: uint32(failedBlockQueries), + }, + }) + require.NoError(t, err) + } } return &http.Response{ diff --git a/modules/querier/http.go b/modules/querier/http.go index 094e1b7e5b2..44b99fa8075 100644 --- a/modules/querier/http.go +++ b/modules/querier/http.go @@ -2,7 +2,6 @@ package querier import ( "context" - "encoding/hex" "fmt" "net/http" "time" @@ -67,9 +66,10 @@ func (q *Querier) TraceByIDHandler(w http.ResponseWriter, r *http.Request) { return } + // record not found here, but continue on so we can marshal metrics + // to the body if resp.Trace == nil || len(resp.Trace.Batches) == 0 { - http.Error(w, fmt.Sprintf("Unable to find %s", hex.EncodeToString(byteID)), http.StatusNotFound) - return + w.WriteHeader(http.StatusNotFound) } if r.Header.Get(api.HeaderAccept) == api.HeaderAcceptProtobuf { diff --git a/tempodb/backend/instrumentation/hedged_requests.go b/tempodb/backend/instrumentation/hedged_requests.go index 2c6ff396f4c..85ff7010f46 100644 --- a/tempodb/backend/instrumentation/hedged_requests.go +++ b/tempodb/backend/instrumentation/hedged_requests.go @@ -13,11 +13,11 @@ const ( ) var ( - hedgedRequestsMetrics = promauto.NewCounter( - prometheus.CounterOpts{ + hedgedRequestsMetrics = promauto.NewGauge( + prometheus.GaugeOpts{ Namespace: "tempodb", Name: "backend_hedged_roundtrips_total", - Help: "Total number of hedged backend requests", + Help: "Total number of hedged backend requests. Registered as a gauge for code sanity. This is a counter.", }, ) ) @@ -27,7 +27,12 @@ func PublishHedgedMetrics(s *hedgedhttp.Stats) { ticker := time.NewTicker(hedgedMetricsPublishDuration) go func() { for range ticker.C { - hedgedRequestsMetrics.Add(float64(s.Snapshot().RequestedRoundTrips)) + snap := s.Snapshot() + hedgedRequests := int64(snap.ActualRoundTrips) - int64(snap.RequestedRoundTrips) + if hedgedRequests < 0 { + hedgedRequests = 0 + } + hedgedRequestsMetrics.Set(float64(hedgedRequests)) } }() }