Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Put a limit on the size of sampler keys #1364

Merged
merged 4 commits into from
Oct 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion sample/dynamic.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,10 @@ func (d *DynamicSampler) Start() error {
}

func (d *DynamicSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool, reason string, key string) {
key = d.key.build(trace)
key, n := d.key.build(trace)
if n == maxKeyLength {
d.Logger.Debug().Logf("trace key hit max length of %d, truncating", maxKeyLength)
}
count := int(trace.DescendantCount())
rate = uint(d.dynsampler.GetSampleRateMulti(key, count))
if rate < 1 { // protect against dynsampler being broken even though it shouldn't be
Expand Down
6 changes: 4 additions & 2 deletions sample/dynamic_ema.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ type EMADynamicSampler struct {
burstDetectionDelay uint
maxKeys int
prefix string
lastMetrics map[string]int64

key *traceKey
keyFields []string
Expand Down Expand Up @@ -74,7 +73,10 @@ func (d *EMADynamicSampler) Start() error {
}

func (d *EMADynamicSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool, reason string, key string) {
key = d.key.build(trace)
key, n := d.key.build(trace)
if n == maxKeyLength {
d.Logger.Debug().Logf("trace key hit max length of %d, truncating", maxKeyLength)
}
count := int(trace.DescendantCount())
rate = uint(d.dynsampler.GetSampleRateMulti(key, count))
if rate < 1 { // protect against dynsampler being broken even though it shouldn't be
Expand Down
5 changes: 4 additions & 1 deletion sample/ema_throughput.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,10 @@ func (d *EMAThroughputSampler) SetClusterSize(size int) {
}

func (d *EMAThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool, reason string, key string) {
key = d.key.build(trace)
key, n := d.key.build(trace)
if n == maxKeyLength {
d.Logger.Debug().Logf("trace key hit max length of %d, truncating", maxKeyLength)
}
count := int(trace.DescendantCount())
rate = uint(d.dynsampler.GetSampleRateMulti(key, count))
if rate < 1 { // protect against dynsampler being broken even though it shouldn't be
Expand Down
5 changes: 4 additions & 1 deletion sample/totalthroughput.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ func (d *TotalThroughputSampler) SetClusterSize(size int) {
}

func (d *TotalThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool, reason string, key string) {
key = d.key.build(trace)
key, n := d.key.build(trace)
if n == maxKeyLength {
d.Logger.Debug().Logf("trace key hit max length of %d, truncating", maxKeyLength)
}
count := int(trace.DescendantCount())
rate = uint(d.dynsampler.GetSampleRateMulti(key, count))
if rate < 1 { // protect against dynsampler being broken even though it shouldn't be
Expand Down
42 changes: 33 additions & 9 deletions sample/trace_key.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,16 @@ import (
"strconv"
"strings"

"github.com/honeycombio/refinery/generics"
"github.com/honeycombio/refinery/types"
)

// Once a key accumulates this many unique values, it is effectively unbounded
// in cardinality, so we stop looking for more.
// This is a safety valve to protect us from someone sending a high-cardinality
// field in a giant trace.
const maxKeyLength = 100

type traceKey struct {
fields []string
rootOnlyFields []string
Expand Down Expand Up @@ -37,49 +44,66 @@ func newTraceKey(fields []string, useTraceLength bool) *traceKey {
}

// build, builds the trace key based on the configuration of the traceKeyGenerator
func (d *traceKey) build(trace *types.Trace) string {
// returns the number of values used to build the key
func (d *traceKey) build(trace *types.Trace) (string, int) {
fieldCount := 0
// fieldCollector gets all values from the fields listed in the config, even
// if they happen multiple times.
fieldCollector := make(map[string][]string)

// for each field, for each span, get the value of that field
spans := trace.GetSpans()
uniques := generics.NewSetWithCapacity[string](maxKeyLength)
outer:
for _, field := range d.fields {
for _, span := range spans {
if val, ok := span.Data[field]; ok {
u := fmt.Sprintf("%s/%v", field, val)
// don't bother to add it if we've already seen it
if uniques.Contains(u) {
continue
}
uniques.Add(u)
if len(uniques) >= maxKeyLength {
MikeGoldsmith marked this conversation as resolved.
Show resolved Hide resolved
break outer
MikeGoldsmith marked this conversation as resolved.
Show resolved Hide resolved
}
fieldCollector[field] = append(fieldCollector[field], fmt.Sprintf("%v", val))
}
}
}
// ok, now we have a map of fields to a list of all values for that field.
// ok, now we have a map of fields to a list of all unique values for that field.
// (unless it was huge, in which case we have a bunch of them)

var key string
var key strings.Builder
for _, field := range d.fields {
// sort and collapse list
sort.Strings(fieldCollector[field])
var prevStr string
for _, str := range fieldCollector[field] {
if str != prevStr {
key += str + "•"
key.WriteString(str)
key.WriteRune('•')
fieldCount += 1
}
prevStr = str
}
// get ready for the next element
key += ","
key.WriteRune(',')
}

if trace.RootSpan != nil {
for _, field := range d.rootOnlyFields {

if val, ok := trace.RootSpan.Data[field]; ok {
key += fmt.Sprintf("%v,", val)
key.WriteString(fmt.Sprintf("%v,", val))
fieldCount += 1
}
}
}

if d.useTraceLength {
key += strconv.FormatInt(int64(len(spans)), 10)
key.WriteString(strconv.FormatInt(int64(len(spans)), 10))
fieldCount += 1
}

return key
return key.String(), fieldCount
}
50 changes: 46 additions & 4 deletions sample/trace_key_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package sample

import (
"fmt"
"testing"

"github.com/honeycombio/refinery/types"
Expand Down Expand Up @@ -28,7 +29,9 @@ func TestKeyGeneration(t *testing.T) {

expected := "2•,200•,true•,/{slug}/home•,1"

assert.Equal(t, expected, generator.build(trace))
key, n := generator.build(trace)
assert.Equal(t, expected, key)
assert.Equal(t, 5, n)

fields = []string{"http.status_code", "request.path", "app.team.id", "important_field"}
useTraceLength = true
Expand Down Expand Up @@ -71,7 +74,9 @@ func TestKeyGeneration(t *testing.T) {

expected = "2•,200•,true•,/{slug}/home•,4"

assert.Equal(t, expected, generator.build(trace))
key, n = generator.build(trace)
assert.Equal(t, expected, key)
assert.Equal(t, 5, n)

// now test that multiple values across spans are condensed correctly
fields = []string{"http.status_code"}
Expand Down Expand Up @@ -115,7 +120,9 @@ func TestKeyGeneration(t *testing.T) {

expected = "200•404•,4"

assert.Equal(t, expected, generator.build(trace))
key, n = generator.build(trace)
assert.Equal(t, expected, key)
assert.Equal(t, 3, n)

// test field list with root prefix, only include the field from on the root span
// if it exists
Expand Down Expand Up @@ -153,5 +160,40 @@ func TestKeyGeneration(t *testing.T) {

expected = "200•404•,test,2"

assert.Equal(t, expected, generator.build(trace))
key, n = generator.build(trace)
assert.Equal(t, expected, key)
assert.Equal(t, 4, n)
}

// TestKeyLimits verifies that key building stops collecting values once
// maxKeyLength unique field/value pairs have been observed.
func TestKeyLimits(t *testing.T) {
	generator := newTraceKey([]string{"fieldA", "fieldB"}, true)

	trace := &types.Trace{}

	// flood the trace with far more distinct values than the cap allows
	for i := 0; i < 160; i++ {
		span := &types.Span{
			Event: types.Event{
				Data: map[string]interface{}{
					"fieldA": fmt.Sprintf("value%d", i),
					"fieldB": i,
				},
			},
		}
		trace.AddSpan(span)
	}

	trace.RootSpan = &types.Span{
		Event: types.Event{
			Data: map[string]interface{}{
				"service_name": "test",
			},
		},
	}

	// the reported value count must be capped at exactly maxKeyLength
	_, n := generator.build(trace)
	assert.Equal(t, maxKeyLength, n)
}
5 changes: 4 additions & 1 deletion sample/windowed_throughput.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,10 @@ func (d *WindowedThroughputSampler) SetClusterSize(size int) {
}

func (d *WindowedThroughputSampler) GetSampleRate(trace *types.Trace) (rate uint, keep bool, reason string, key string) {
key = d.key.build(trace)
key, n := d.key.build(trace)
if n == maxKeyLength {
d.Logger.Debug().Logf("trace key hit max length of %d, truncating", maxKeyLength)
}
count := int(trace.DescendantCount())
rate = uint(d.dynsampler.GetSampleRateMulti(key, count))
if rate < 1 { // protect against dynsampler being broken even though it shouldn't be
Expand Down
Loading