Query Path Fixes #1079

Merged: 8 commits, Oct 27, 2021
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -77,6 +77,7 @@
* [BUGFIX] Set span's tag `span.kind` to `client` in query-frontend [#975](https://github.com/grafana/tempo/pull/975) (@mapno)
* [BUGFIX] Nil check overrides module in the `/status` handler [#994](https://github.com/grafana/tempo/pull/994) (@mapno)
* [BUGFIX] Several bug fixes for search contention and panics [#1033](https://github.com/grafana/tempo/pull/1033) (@mdisibio)
+ * [BUGFIX] Fixes `tempodb_backend_hedged_roundtrips_total` to correctly count hedged roundtrips. [#1079](https://github.com/grafana/tempo/pull/1079) (@joe-elliott)

## v1.1.0 / 2021-08-26
* [CHANGE] Upgrade Cortex from v1.9.0 to v1.9.0-131-ga4bf10354 [#841](https://github.com/grafana/tempo/pull/841) (@aknuds1)
14 changes: 7 additions & 7 deletions modules/frontend/tracebyidsharding.go
@@ -96,13 +96,8 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) {
return
}

- // if not found bail
- if resp.StatusCode == http.StatusNotFound {
- return
- }
-
// if the status code is anything but happy, save the error and pass it down the line
- if resp.StatusCode != http.StatusOK {
+ if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusNotFound {
// todo: if we cancel the parent context here will it shortcircuit the other queries and fail fast?
statusCode = resp.StatusCode
bytesMsg, err := io.ReadAll(resp.Body)
@@ -113,7 +108,7 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) {
return
}

- // successful query, read the body
+ // read the body
buff, err := io.ReadAll(resp.Body)
if err != nil {
_ = level.Error(s.logger).Log("msg", "error reading response body status == ok", "url", innerR.RequestURI, "err", err)
@@ -140,6 +135,11 @@ func (s shardQuery) RoundTrip(r *http.Request) (*http.Response, error) {
}
}

+ // if not found bail
+ if resp.StatusCode == http.StatusNotFound {
+ return
+ }

// happy path
statusCode = http.StatusOK
overallTrace, _, _, _ = model.CombineTraceProtos(overallTrace, traceResp.Trace)
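Taken together, the hunks above reorder the per-shard handling so that a 404 no longer short-circuits before the response body (and its FailedBlocks metrics) has been read. A condensed sketch of the resulting order follows; handleShardResponse, its signature, and the elided unmarshalling are illustrative only and do not appear in the PR.

package example

import (
    "fmt"
    "io"
    "net/http"
)

// handleShardResponse reports whether a shard's trace should be merged,
// following the order introduced by this PR.
func handleShardResponse(resp *http.Response) (merge bool, err error) {
    // 1. Anything other than 200 or 404 is an error: the caller records the
    //    status code and stops considering this shard.
    if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusNotFound {
        return false, fmt.Errorf("shard returned status %d", resp.StatusCode)
    }

    // 2. Read the body for 200 and 404 alike, so per-shard metrics such as
    //    FailedBlocks are unmarshalled and counted even when nothing was found.
    if _, err := io.ReadAll(resp.Body); err != nil {
        return false, err
    }

    // 3. Only after the body has been read does a 404 bail out without merging.
    if resp.StatusCode == http.StatusNotFound {
        return false, nil
    }

    // 4. Happy path: the caller merges this shard's trace into the overall result.
    return true, nil
}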
115 changes: 79 additions & 36 deletions modules/frontend/tracebyidsharding_test.go
@@ -92,17 +92,18 @@ func TestShardingWareDoRequest(t *testing.T) {
}

tests := []struct {
- name string
- status1 int
- status2 int
- trace1 *tempopb.Trace
- trace2 *tempopb.Trace
- err1 error
- err2 error
- failedBlockQueries int
- expectedStatus int
- expectedTrace *tempopb.Trace
- expectedError error
+ name string
+ status1 int
+ status2 int
+ trace1 *tempopb.Trace
+ trace2 *tempopb.Trace
+ err1 error
+ err2 error
+ failedBlockQueries1 int
+ failedBlockQueries2 int
+ expectedStatus int
+ expectedTrace *tempopb.Trace
+ expectedError error
}{
{
name: "empty returns",
@@ -202,7 +203,6 @@
trace2: trace1,
expectedError: errors.New("booo"),
},

{
name: "500+err",
status1: 500,
@@ -211,23 +211,54 @@
expectedError: errors.New("booo"),
},
{
name: "failed block queries <= max",
status1: 200,
trace1: trace1,
status2: 200,
trace2: trace2,
failedBlockQueries: 1,
expectedStatus: 200,
expectedTrace: trace,
name: "failedBlocks under: 200+200",
status1: 200,
trace1: trace1,
status2: 200,
trace2: trace2,
failedBlockQueries1: 1,
failedBlockQueries2: 1,
expectedStatus: 200,
expectedTrace: trace,
},
{
name: "failedBlocks over: 200+200",
status1: 200,
trace1: trace1,
status2: 200,
trace2: trace2,
failedBlockQueries1: 0,
failedBlockQueries2: 5,
expectedError: errors.New("too many failed block queries 5 (max 2)"),
},
{
name: "failedBlocks under: 200+404",
status1: 200,
trace1: trace1,
status2: 404,
failedBlockQueries1: 1,
failedBlockQueries2: 0,
expectedStatus: 200,
expectedTrace: trace1,
},
{
name: "failedBlocks under: 404+200",
status1: 200,
trace1: trace1,
status2: 404,
failedBlockQueries1: 0,
failedBlockQueries2: 1,
expectedStatus: 200,
expectedTrace: trace1,
},
{
name: "too many failed block queries",
status1: 200,
trace1: trace1,
status2: 200,
trace2: trace2,
failedBlockQueries: 10,
expectedError: errors.New("too many failed block queries 10 (max 2)"),
name: "failedBlocks over: 404+200",
status1: 200,
trace1: trace1,
status2: 404,
failedBlockQueries1: 0,
failedBlockQueries2: 5,
expectedError: errors.New("too many failed block queries 5 (max 2)"),
},
}

@@ -239,29 +270,41 @@
var trace *tempopb.Trace
var statusCode int
var err error
+ var failedBlockQueries int
if r.RequestURI == "/querier/api/traces/1234?mode=ingesters" {
trace = tc.trace1
statusCode = tc.status1
err = tc.err1
+ failedBlockQueries = tc.failedBlockQueries1
} else {
trace = tc.trace2
err = tc.err2
statusCode = tc.status2
+ failedBlockQueries = tc.failedBlockQueries2
}

if err != nil {
return nil, err
}

- var resBytes []byte
- if trace != nil {
- resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{
- Trace: trace,
- Metrics: &tempopb.TraceByIDMetrics{
- FailedBlocks: uint32(tc.failedBlockQueries),
- },
- })
- require.NoError(t, err)
+ resBytes := []byte("error occurred")
+ if statusCode != 500 {
+ if trace != nil {
+ resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{
+ Trace: trace,
+ Metrics: &tempopb.TraceByIDMetrics{
+ FailedBlocks: uint32(failedBlockQueries),
+ },
+ })
+ require.NoError(t, err)
+ } else {
+ resBytes, err = proto.Marshal(&tempopb.TraceByIDResponse{
+ Metrics: &tempopb.TraceByIDMetrics{
+ FailedBlocks: uint32(failedBlockQueries),
+ },
+ })
+ require.NoError(t, err)
+ }
}

return &http.Response{
6 changes: 3 additions & 3 deletions modules/querier/http.go
@@ -2,7 +2,6 @@ package querier

import (
"context"
"encoding/hex"
"fmt"
"net/http"
"time"
@@ -67,9 +66,10 @@ func (q *Querier) TraceByIDHandler(w http.ResponseWriter, r *http.Request) {
return
}

+ // record not found here, but continue on so we can marshal metrics
+ // to the body
if resp.Trace == nil || len(resp.Trace.Batches) == 0 {
- http.Error(w, fmt.Sprintf("Unable to find %s", hex.EncodeToString(byteID)), http.StatusNotFound)
- return
+ w.WriteHeader(http.StatusNotFound)
}

if r.Header.Get(api.HeaderAccept) == api.HeaderAcceptProtobuf {
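The practical effect of this change: on a trace miss the querier now writes the 404 header but still marshals a TraceByIDResponse into the body, so the query-frontend (see tracebyidsharding.go above) can read FailedBlocks from misses as well as hits. A minimal sketch of a caller relying on this follows; readShardMetrics is a hypothetical helper, and the gogo/protobuf import is assumed from Tempo's usage elsewhere, not taken from this PR.

package example

import (
    "io"
    "net/http"

    "github.com/gogo/protobuf/proto"

    "github.com/grafana/tempo/pkg/tempopb"
)

// readShardMetrics decodes the TraceByIDResponse carried in the body of a
// shard response, which after this PR is present for 200 and 404 alike.
func readShardMetrics(resp *http.Response) (*tempopb.TraceByIDResponse, error) {
    defer resp.Body.Close()

    body, err := io.ReadAll(resp.Body)
    if err != nil {
        return nil, err
    }

    traceResp := &tempopb.TraceByIDResponse{}
    if err := proto.Unmarshal(body, traceResp); err != nil {
        return nil, err
    }

    // traceResp.Metrics.FailedBlocks is usable even when resp.StatusCode is 404.
    return traceResp, nil
}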
13 changes: 9 additions & 4 deletions tempodb/backend/instrumentation/hedged_requests.go
@@ -13,11 +13,11 @@ const (
)

var (
- hedgedRequestsMetrics = promauto.NewCounter(
- prometheus.CounterOpts{
+ hedgedRequestsMetrics = promauto.NewGauge(
+ prometheus.GaugeOpts{
Namespace: "tempodb",
Name: "backend_hedged_roundtrips_total",
- Help: "Total number of hedged backend requests",
+ Help: "Total number of hedged backend requests. Registered as a gauge for code sanity. This is a counter.",
},
)
)
@@ -27,7 +27,12 @@ func PublishHedgedMetrics(s *hedgedhttp.Stats) {
ticker := time.NewTicker(hedgedMetricsPublishDuration)
go func() {
for range ticker.C {
- hedgedRequestsMetrics.Add(float64(s.Snapshot().RequestedRoundTrips))
+ snap := s.Snapshot()
+ hedgedRequests := int64(snap.ActualRoundTrips) - int64(snap.RequestedRoundTrips)
+ if hedgedRequests < 0 {
+ hedgedRequests = 0
+ }
+ hedgedRequestsMetrics.Set(float64(hedgedRequests))
}
}()
}
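For context on where these Stats come from, here is a sketch of how a backend client might be wired up so that the gauge above gets populated. It assumes the cristalhq/hedgedhttp constructor NewRoundTripperAndStats that Tempo's object-store backends use; the 500ms hedge delay and the limit of 3 attempts are illustrative values, not taken from this PR.

package example

import (
    "net/http"
    "time"

    "github.com/cristalhq/hedgedhttp"

    "github.com/grafana/tempo/tempodb/backend/instrumentation"
)

// newHedgedClient builds an http.Client whose requests are hedged and whose
// hedging stats feed tempodb_backend_hedged_roundtrips_total.
func newHedgedClient() (*http.Client, error) {
    // Hedge a request after 500ms without a response, up to 3 attempts in total.
    rt, stats, err := hedgedhttp.NewRoundTripperAndStats(500*time.Millisecond, 3, http.DefaultTransport)
    if err != nil {
        return nil, err
    }

    // Publish ActualRoundTrips - RequestedRoundTrips (the extra, hedged
    // round trips) on a ticker, per PublishHedgedMetrics above.
    instrumentation.PublishHedgedMetrics(stats)

    return &http.Client{Transport: rt}, nil
}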