From 27853ff6e803091d495efbf9c733fa38f49f9521 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Tue, 26 Oct 2021 11:54:17 -0400 Subject: [PATCH 1/8] Fixed issue with increasing hedged requests Signed-off-by: Joe Elliott --- tempodb/backend/instrumentation/hedged_requests.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tempodb/backend/instrumentation/hedged_requests.go b/tempodb/backend/instrumentation/hedged_requests.go index 2c6ff396f4c..1f60c9c0340 100644 --- a/tempodb/backend/instrumentation/hedged_requests.go +++ b/tempodb/backend/instrumentation/hedged_requests.go @@ -13,11 +13,11 @@ const ( ) var ( - hedgedRequestsMetrics = promauto.NewCounter( - prometheus.CounterOpts{ + hedgedRequestsMetrics = promauto.NewGauge( + prometheus.GaugeOpts{ Namespace: "tempodb", Name: "backend_hedged_roundtrips_total", - Help: "Total number of hedged backend requests", + Help: "Total number of hedged backend requests. Registered as a gauge for code sanity. This is a counter.", }, ) ) @@ -27,7 +27,7 @@ func PublishHedgedMetrics(s *hedgedhttp.Stats) { ticker := time.NewTicker(hedgedMetricsPublishDuration) go func() { for range ticker.C { - hedgedRequestsMetrics.Add(float64(s.Snapshot().RequestedRoundTrips)) + hedgedRequestsMetrics.Set(float64(s.Snapshot().RequestedRoundTrips)) } }() } From da1e01dc8fd7f29014dce27e7b462239ca57789a Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Tue, 26 Oct 2021 13:45:04 -0400 Subject: [PATCH 2/8] Count failed blocks on 404 Signed-off-by: Joe Elliott --- modules/frontend/tracebyidsharding.go | 36 ++++---- modules/frontend/tracebyidsharding_test.go | 96 ++++++++++++++++------ 2 files changed, 87 insertions(+), 45 deletions(-) diff --git a/modules/frontend/tracebyidsharding.go b/modules/frontend/tracebyidsharding.go index d6af58eee31..de9db50e4d3 100644 --- a/modules/frontend/tracebyidsharding.go +++ b/modules/frontend/tracebyidsharding.go @@ -96,24 +96,7 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) { return } - // if not found bail - if resp.StatusCode == http.StatusNotFound { - return - } - - // if the status code is anything but happy, save the error and pass it down the line - if resp.StatusCode != http.StatusOK { - // todo: if we cancel the parent context here will it shortcircuit the other queries and fail fast? - statusCode = resp.StatusCode - bytesMsg, err := io.ReadAll(resp.Body) - if err != nil { - _ = level.Error(s.logger).Log("msg", "error reading response body status != ok", "url", innerR.RequestURI, "err", err) - } - statusMsg = string(bytesMsg) - return - } - - // successful query, read the body + // read the body buff, err := io.ReadAll(resp.Body) if err != nil { _ = level.Error(s.logger).Log("msg", "error reading response body status == ok", "url", innerR.RequestURI, "err", err) @@ -140,6 +123,23 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) { } } + // if not found bail + if resp.StatusCode == http.StatusNotFound { + return + } + + // if the status code is anything but happy, save the error and pass it down the line + if resp.StatusCode != http.StatusOK { + // todo: if we cancel the parent context here will it shortcircuit the other queries and fail fast? + statusCode = resp.StatusCode + bytesMsg, err := io.ReadAll(resp.Body) + if err != nil { + _ = level.Error(s.logger).Log("msg", "error reading response body status != ok", "url", innerR.RequestURI, "err", err) + } + statusMsg = string(bytesMsg) + return + } + // happy path statusCode = http.StatusOK overallTrace, _, _, _ = model.CombineTraceProtos(overallTrace, traceResp.Trace) diff --git a/modules/frontend/tracebyidsharding_test.go b/modules/frontend/tracebyidsharding_test.go index 89c20d37252..b4d10f34369 100644 --- a/modules/frontend/tracebyidsharding_test.go +++ b/modules/frontend/tracebyidsharding_test.go @@ -92,17 +92,18 @@ func TestShardingWareDoRequest(t *testing.T) { } tests := []struct { - name string - status1 int - status2 int - trace1 *tempopb.Trace - trace2 *tempopb.Trace - err1 error - err2 error - failedBlockQueries int - expectedStatus int - expectedTrace *tempopb.Trace - expectedError error + name string + status1 int + status2 int + trace1 *tempopb.Trace + trace2 *tempopb.Trace + err1 error + err2 error + failedBlockQueries1 int + failedBlockQueries2 int + expectedStatus int + expectedTrace *tempopb.Trace + expectedError error }{ { name: "empty returns", @@ -211,23 +212,54 @@ func TestShardingWareDoRequest(t *testing.T) { expectedError: errors.New("booo"), }, { - name: "failed block queries <= max", - status1: 200, - trace1: trace1, - status2: 200, - trace2: trace2, - failedBlockQueries: 1, - expectedStatus: 200, - expectedTrace: trace, + name: "failedBlocks under: 200+200", + status1: 200, + trace1: trace1, + status2: 200, + trace2: trace2, + failedBlockQueries1: 1, + failedBlockQueries2: 1, + expectedStatus: 200, + expectedTrace: trace, }, { - name: "too many failed block queries", - status1: 200, - trace1: trace1, - status2: 200, - trace2: trace2, - failedBlockQueries: 10, - expectedError: errors.New("too many failed block queries 10 (max 2)"), + name: "failedBlocks over: 200+200", + status1: 200, + trace1: trace1, + status2: 200, + trace2: trace2, + failedBlockQueries1: 0, + failedBlockQueries2: 5, + expectedError: errors.New("too many failed block queries 5 (max 2)"), + }, + { + name: "failedBlocks under: 200+404", + status1: 200, + trace1: trace1, + status2: 404, + failedBlockQueries1: 1, + failedBlockQueries2: 0, + expectedStatus: 200, + expectedTrace: trace1, + }, + { + name: "failedBlocks under: 404+200", + status1: 200, + trace1: trace1, + status2: 404, + failedBlockQueries1: 0, + failedBlockQueries2: 1, + expectedStatus: 200, + expectedTrace: trace1, + }, + { + name: "failedBlocks over: 404+200", + status1: 200, + trace1: trace1, + status2: 404, + failedBlockQueries1: 0, + failedBlockQueries2: 5, + expectedError: errors.New("too many failed block queries 5 (max 2)"), }, } @@ -239,14 +271,17 @@ func TestShardingWareDoRequest(t *testing.T) { var trace *tempopb.Trace var statusCode int var err error + var failedBlockQueries int if r.RequestURI == "/querier/api/traces/1234?mode=ingesters" { trace = tc.trace1 statusCode = tc.status1 err = tc.err1 + failedBlockQueries = tc.failedBlockQueries1 } else { trace = tc.trace2 err = tc.err2 statusCode = tc.status2 + failedBlockQueries = tc.failedBlockQueries2 } if err != nil { @@ -258,7 +293,14 @@ func TestShardingWareDoRequest(t *testing.T) { resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{ Trace: trace, Metrics: &tempopb.TraceByIDMetrics{ - FailedBlocks: uint32(tc.failedBlockQueries), + FailedBlocks: uint32(failedBlockQueries), + }, + }) + require.NoError(t, err) + } else { + resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{ + Metrics: &tempopb.TraceByIDMetrics{ + FailedBlocks: uint32(failedBlockQueries), }, }) require.NoError(t, err) From f7334816480c81404a29e1934c3b02e737fc75c2 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Tue, 26 Oct 2021 13:47:43 -0400 Subject: [PATCH 3/8] changelog Signed-off-by: Joe Elliott --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bf893a6aa3..58c06caa737 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -77,6 +77,7 @@ * [BUGFIX] Set span's tag `span.kind` to `client` in query-frontend [#975](https://github.com/grafana/tempo/pull/975) (@mapno) * [BUGFIX] Nil check overrides module in the `/status` handler [#994](https://github.com/grafana/tempo/pull/994) (@mapno) * [BUGFIX] Several bug fixes for search contention and panics [#1033](https://github.com/grafana/tempo/pull/1033) (@mdisibio) +* [BUGFIX] Fixes `tempodb_backend_hedged_roundtrips_total` to correctly count hedged roundtrips. [#1079](https://github.com/grafana/tempo/pull/1079) (@joe-elliott) ## v1.1.0 / 2021-08-26 * [CHANGE] Upgrade Cortex from v1.9.0 to v1.9.0-131-ga4bf10354 [#841](https://github.com/grafana/tempo/pull/841) (@aknuds1) From 9c19c28ca6b262d8906eafcae24cf305615684fc Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Tue, 26 Oct 2021 13:54:41 -0400 Subject: [PATCH 4/8] fixed error messages Signed-off-by: Joe Elliott --- modules/frontend/tracebyidsharding.go | 24 +++++++-------- modules/frontend/tracebyidsharding_test.go | 35 +++++++++++----------- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/modules/frontend/tracebyidsharding.go b/modules/frontend/tracebyidsharding.go index de9db50e4d3..5bc6a683b12 100644 --- a/modules/frontend/tracebyidsharding.go +++ b/modules/frontend/tracebyidsharding.go @@ -96,6 +96,18 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) { return } + // if the status code is anything but happy, save the error and pass it down the line + if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusNotFound { + // todo: if we cancel the parent context here will it shortcircuit the other queries and fail fast? + statusCode = resp.StatusCode + bytesMsg, err := io.ReadAll(resp.Body) + if err != nil { + _ = level.Error(s.logger).Log("msg", "error reading response body status != ok", "url", innerR.RequestURI, "err", err) + } + statusMsg = string(bytesMsg) + return + } + // read the body buff, err := io.ReadAll(resp.Body) if err != nil { @@ -128,18 +140,6 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) { return } - // if the status code is anything but happy, save the error and pass it down the line - if resp.StatusCode != http.StatusOK { - // todo: if we cancel the parent context here will it shortcircuit the other queries and fail fast? - statusCode = resp.StatusCode - bytesMsg, err := io.ReadAll(resp.Body) - if err != nil { - _ = level.Error(s.logger).Log("msg", "error reading response body status != ok", "url", innerR.RequestURI, "err", err) - } - statusMsg = string(bytesMsg) - return - } - // happy path statusCode = http.StatusOK overallTrace, _, _, _ = model.CombineTraceProtos(overallTrace, traceResp.Trace) diff --git a/modules/frontend/tracebyidsharding_test.go b/modules/frontend/tracebyidsharding_test.go index b4d10f34369..47516380e56 100644 --- a/modules/frontend/tracebyidsharding_test.go +++ b/modules/frontend/tracebyidsharding_test.go @@ -203,7 +203,6 @@ func TestShardingWareDoRequest(t *testing.T) { trace2: trace1, expectedError: errors.New("booo"), }, - { name: "500+err", status1: 500, @@ -288,22 +287,24 @@ func TestShardingWareDoRequest(t *testing.T) { return nil, err } - var resBytes []byte - if trace != nil { - resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{ - Trace: trace, - Metrics: &tempopb.TraceByIDMetrics{ - FailedBlocks: uint32(failedBlockQueries), - }, - }) - require.NoError(t, err) - } else { - resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{ - Metrics: &tempopb.TraceByIDMetrics{ - FailedBlocks: uint32(failedBlockQueries), - }, - }) - require.NoError(t, err) + resBytes := []byte("error occurred") + if statusCode != 500 { + if trace != nil { + resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{ + Trace: trace, + Metrics: &tempopb.TraceByIDMetrics{ + FailedBlocks: uint32(failedBlockQueries), + }, + }) + require.NoError(t, err) + } else { + resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{ + Metrics: &tempopb.TraceByIDMetrics{ + FailedBlocks: uint32(failedBlockQueries), + }, + }) + require.NoError(t, err) + } } return &http.Response{ From 1f36ae341f09bbdc393f3889f01aae737638d2af Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Wed, 27 Oct 2021 09:24:12 -0400 Subject: [PATCH 5/8] return metrics on not found Signed-off-by: Joe Elliott --- modules/querier/http.go | 6 ------ 1 file changed, 6 deletions(-) diff --git a/modules/querier/http.go b/modules/querier/http.go index 094e1b7e5b2..64017b9b1f6 100644 --- a/modules/querier/http.go +++ b/modules/querier/http.go @@ -2,7 +2,6 @@ package querier import ( "context" - "encoding/hex" "fmt" "net/http" "time" @@ -67,11 +66,6 @@ func (q *Querier) TraceByIDHandler(w http.ResponseWriter, r *http.Request) { return } - if resp.Trace == nil || len(resp.Trace.Batches) == 0 { - http.Error(w, fmt.Sprintf("Unable to find %s", hex.EncodeToString(byteID)), http.StatusNotFound) - return - } - if r.Header.Get(api.HeaderAccept) == api.HeaderAcceptProtobuf { span.SetTag("contentType", api.HeaderAcceptProtobuf) b, err := proto.Marshal(resp) From 5983943afe7965043282f4e7f5bd309cbf9ee214 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Wed, 27 Oct 2021 10:45:22 -0400 Subject: [PATCH 6/8] fix metric Signed-off-by: Joe Elliott --- tempodb/backend/instrumentation/hedged_requests.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tempodb/backend/instrumentation/hedged_requests.go b/tempodb/backend/instrumentation/hedged_requests.go index 1f60c9c0340..8ef73b4978c 100644 --- a/tempodb/backend/instrumentation/hedged_requests.go +++ b/tempodb/backend/instrumentation/hedged_requests.go @@ -27,7 +27,8 @@ func PublishHedgedMetrics(s *hedgedhttp.Stats) { ticker := time.NewTicker(hedgedMetricsPublishDuration) go func() { for range ticker.C { - hedgedRequestsMetrics.Set(float64(s.Snapshot().RequestedRoundTrips)) + snap := s.Snapshot() + hedgedRequestsMetrics.Set(float64(snap.ActualRoundTrips - snap.RequestedRoundTrips)) } }() } From 43930a3096729a19ae1075d61b10b40b34410bf5 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Wed, 27 Oct 2021 14:19:36 -0400 Subject: [PATCH 7/8] protect against overflow Signed-off-by: Joe Elliott --- tempodb/backend/instrumentation/hedged_requests.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tempodb/backend/instrumentation/hedged_requests.go b/tempodb/backend/instrumentation/hedged_requests.go index 8ef73b4978c..85ff7010f46 100644 --- a/tempodb/backend/instrumentation/hedged_requests.go +++ b/tempodb/backend/instrumentation/hedged_requests.go @@ -28,7 +28,11 @@ func PublishHedgedMetrics(s *hedgedhttp.Stats) { go func() { for range ticker.C { snap := s.Snapshot() - hedgedRequestsMetrics.Set(float64(snap.ActualRoundTrips - snap.RequestedRoundTrips)) + hedgedRequests := int64(snap.ActualRoundTrips) - int64(snap.RequestedRoundTrips) + if hedgedRequests < 0 { + hedgedRequests = 0 + } + hedgedRequestsMetrics.Set(float64(hedgedRequests)) } }() } From 832758f674c49de30433fe85ff6980fd5a4265d0 Mon Sep 17 00:00:00 2001 From: Joe Elliott Date: Wed, 27 Oct 2021 15:02:18 -0400 Subject: [PATCH 8/8] restore 404 Signed-off-by: Joe Elliott --- modules/querier/http.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/modules/querier/http.go b/modules/querier/http.go index 64017b9b1f6..44b99fa8075 100644 --- a/modules/querier/http.go +++ b/modules/querier/http.go @@ -66,6 +66,12 @@ func (q *Querier) TraceByIDHandler(w http.ResponseWriter, r *http.Request) { return } + // record not found here, but continue on so we can marshal metrics + // to the body + if resp.Trace == nil || len(resp.Trace.Batches) == 0 { + w.WriteHeader(http.StatusNotFound) + } + if r.Header.Get(api.HeaderAccept) == api.HeaderAcceptProtobuf { span.SetTag("contentType", api.HeaderAcceptProtobuf) b, err := proto.Marshal(resp)