Skip to content

Commit

Permalink
fix: Put a limit on the size of sampler keys (#1364)
Browse files Browse the repository at this point in the history
## Which problem is this PR solving?

- If someone sends a big trace where one of the sampler keys is a
high-cardinality field, Refinery could generate a huge sampler key
value. This causes problems both for refinery but also the downstream
telemetry. Let's not do that.

## Short description of the changes

- Put a limit of 100 unique values to make up any one sampler key. Even
that is big and probably useless but it should avoid any real use cases.
- Add a test to show it works.

Fixes #1363

---------

Co-authored-by: Mike Goldsmth <goldsmith.mike@gmail.com>
  • Loading branch information
kentquirk and MikeGoldsmith authored Oct 7, 2024
1 parent ea00691 commit e5fd6d8
Show file tree
Hide file tree
Showing 7 changed files with 99 additions and 19 deletions.
5 changes: 4 additions & 1 deletion sample/dynamic.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ func (d *DynamicSampler) Start() error {
}

func (d *DynamicSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool, reason string, key string) {
key = d.key.build(trace)
key, n := d.key.build(trace)
if n == maxKeyLength {
d.Logger.Debug().Logf("trace key hit max length of %d, truncating", maxKeyLength)
}
count := int(trace.DescendantCount())
rate = uint(d.dynsampler.GetSampleRateMulti(key, count))
if rate < 1 { // protect against dynsampler being broken even though it shouldn't be
Expand Down
6 changes: 4 additions & 2 deletions sample/dynamic_ema.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ type EMADynamicSampler struct {
burstDetectionDelay uint
maxKeys int
prefix string
lastMetrics map[string]int64

key *traceKey
keyFields []string
Expand Down Expand Up @@ -74,7 +73,10 @@ func (d *EMADynamicSampler) Start() error {
}

func (d *EMADynamicSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool, reason string, key string) {
key = d.key.build(trace)
key, n := d.key.build(trace)
if n == maxKeyLength {
d.Logger.Debug().Logf("trace key hit max length of %d, truncating", maxKeyLength)
}
count := int(trace.DescendantCount())
rate = uint(d.dynsampler.GetSampleRateMulti(key, count))
if rate < 1 { // protect against dynsampler being broken even though it shouldn't be
Expand Down
5 changes: 4 additions & 1 deletion sample/ema_throughput.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,10 @@ func (d *EMAThroughputSampler) SetClusterSize(size int) {
}

func (d *EMAThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool, reason string, key string) {
key = d.key.build(trace)
key, n := d.key.build(trace)
if n == maxKeyLength {
d.Logger.Debug().Logf("trace key hit max length of %d, truncating", maxKeyLength)
}
count := int(trace.DescendantCount())
rate = uint(d.dynsampler.GetSampleRateMulti(key, count))
if rate < 1 { // protect against dynsampler being broken even though it shouldn't be
Expand Down
5 changes: 4 additions & 1 deletion sample/totalthroughput.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ func (d *TotalThroughputSampler) SetClusterSize(size int) {
}

func (d *TotalThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool, reason string, key string) {
key = d.key.build(trace)
key, n := d.key.build(trace)
if n == maxKeyLength {
d.Logger.Debug().Logf("trace key hit max length of %d, truncating", maxKeyLength)
}
count := int(trace.DescendantCount())
rate = uint(d.dynsampler.GetSampleRateMulti(key, count))
if rate < 1 { // protect against dynsampler being broken even though it shouldn't be
Expand Down
42 changes: 33 additions & 9 deletions sample/trace_key.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,16 @@ import (
"strconv"
"strings"

"github.com/honeycombio/refinery/generics"
"github.com/honeycombio/refinery/types"
)

// once a key gets this many unique values, it's off the charts in terms of uniqueness
// so we just stop looking for more.
// This is a safety valve to prevent us from someone sending a high-cardinality field
// in a giant trace.
const maxKeyLength = 100

type traceKey struct {
fields []string
rootOnlyFields []string
Expand Down Expand Up @@ -37,49 +44,66 @@ func newTraceKey(fields []string, useTraceLength bool) *traceKey {
}

// build, builds the trace key based on the configuration of the traceKeyGenerator
func (d *traceKey) build(trace *types.Trace) string {
// returns the number of values used to build the key
func (d *traceKey) build(trace *types.Trace) (string, int) {
fieldCount := 0
// fieldCollector gets all values from the fields listed in the config, even
// if they happen multiple times.
fieldCollector := make(map[string][]string)

// for each field, for each span, get the value of that field
spans := trace.GetSpans()
uniques := generics.NewSetWithCapacity[string](maxKeyLength)
outer:
for _, field := range d.fields {
for _, span := range spans {
if val, ok := span.Data[field]; ok {
u := fmt.Sprintf("%s/%v", field, val)
// don't bother to add it if we've already seen it
if uniques.Contains(u) {
continue
}
uniques.Add(u)
if len(uniques) >= maxKeyLength {
break outer
}
fieldCollector[field] = append(fieldCollector[field], fmt.Sprintf("%v", val))
}
}
}
// ok, now we have a map of fields to a list of all values for that field.
// ok, now we have a map of fields to a list of all unique values for that field.
// (unless it was huge, in which case we have a bunch of them)

var key string
var key strings.Builder
for _, field := range d.fields {
// sort and collapse list
sort.Strings(fieldCollector[field])
var prevStr string
for _, str := range fieldCollector[field] {
if str != prevStr {
key += str + "•"
key.WriteString(str)
key.WriteRune('•')
fieldCount += 1
}
prevStr = str
}
// get ready for the next element
key += ","
key.WriteRune(',')
}

if trace.RootSpan != nil {
for _, field := range d.rootOnlyFields {

if val, ok := trace.RootSpan.Data[field]; ok {
key += fmt.Sprintf("%v,", val)
key.WriteString(fmt.Sprintf("%v,", val))
fieldCount += 1
}
}
}

if d.useTraceLength {
key += strconv.FormatInt(int64(len(spans)), 10)
key.WriteString(strconv.FormatInt(int64(len(spans)), 10))
fieldCount += 1
}

return key
return key.String(), fieldCount
}
50 changes: 46 additions & 4 deletions sample/trace_key_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package sample

import (
"fmt"
"testing"

"github.com/honeycombio/refinery/types"
Expand Down Expand Up @@ -28,7 +29,9 @@ func TestKeyGeneration(t *testing.T) {

expected := "2•,200•,true•,/{slug}/home•,1"

assert.Equal(t, expected, generator.build(trace))
key, n := generator.build(trace)
assert.Equal(t, expected, key)
assert.Equal(t, 5, n)

fields = []string{"http.status_code", "request.path", "app.team.id", "important_field"}
useTraceLength = true
Expand Down Expand Up @@ -71,7 +74,9 @@ func TestKeyGeneration(t *testing.T) {

expected = "2•,200•,true•,/{slug}/home•,4"

assert.Equal(t, expected, generator.build(trace))
key, n = generator.build(trace)
assert.Equal(t, expected, key)
assert.Equal(t, 5, n)

// now test that multiple values across spans are condensed correctly
fields = []string{"http.status_code"}
Expand Down Expand Up @@ -115,7 +120,9 @@ func TestKeyGeneration(t *testing.T) {

expected = "200•404•,4"

assert.Equal(t, expected, generator.build(trace))
key, n = generator.build(trace)
assert.Equal(t, expected, key)
assert.Equal(t, 3, n)

// test field list with root prefix, only include the field from on the root span
// if it exists
Expand Down Expand Up @@ -153,5 +160,40 @@ func TestKeyGeneration(t *testing.T) {

expected = "200•404•,test,2"

assert.Equal(t, expected, generator.build(trace))
key, n = generator.build(trace)
assert.Equal(t, expected, key)
assert.Equal(t, 4, n)
}

func TestKeyLimits(t *testing.T) {
fields := []string{"fieldA", "fieldB"}
useTraceLength := true

generator := newTraceKey(fields, useTraceLength)

trace := &types.Trace{}

// generate too many spans with different unique values
for i := 0; i < 160; i++ {
trace.AddSpan(&types.Span{
Event: types.Event{
Data: map[string]interface{}{
"fieldA": fmt.Sprintf("value%d", i),
"fieldB": i,
},
},
})
}

trace.RootSpan = &types.Span{
Event: types.Event{
Data: map[string]interface{}{
"service_name": "test",
},
},
}

_, n := generator.build(trace)
// we should have maxKeyLength unique values
assert.Equal(t, maxKeyLength, n)
}
5 changes: 4 additions & 1 deletion sample/windowed_throughput.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,10 @@ func (d *WindowedThroughputSampler) SetClusterSize(size int) {
}

func (d *WindowedThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool, reason string, key string) {
key = d.key.build(trace)
key, n := d.key.build(trace)
if n == maxKeyLength {
d.Logger.Debug().Logf("trace key hit max length of %d, truncating", maxKeyLength)
}
count := int(trace.DescendantCount())
rate = uint(d.dynsampler.GetSampleRateMulti(key, count))
if rate < 1 { // protect against dynsampler being broken even though it shouldn't be
Expand Down

0 comments on commit e5fd6d8

Please sign in to comment.