From 52d7f5bc887af5c9cb6b68243d38218902d1d04c Mon Sep 17 00:00:00 2001 From: Tyler Yahn Date: Thu, 18 Apr 2024 13:57:03 -0700 Subject: [PATCH] Truncate and de-duplicate log attr values --- sdk/log/record.go | 83 ++++++++++- sdk/log/record_test.go | 311 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 390 insertions(+), 4 deletions(-) diff --git a/sdk/log/record.go b/sdk/log/record.go index 74ae2888e748..2cdac6e75f23 100644 --- a/sdk/log/record.go +++ b/sdk/log/record.go @@ -5,8 +5,10 @@ package log // import "go.opentelemetry.io/otel/sdk/log" import ( "slices" + "strings" "sync" "time" + "unicode/utf8" "go.opentelemetry.io/otel/log" "go.opentelemetry.io/otel/sdk/instrumentation" @@ -198,8 +200,6 @@ func (r *Record) AddAttributes(attrs ...log.KeyValue) { } } else { // Unique attribute. - // TODO: apply truncation to string and []string values. - // TODO: deduplicate map values. unique = append(unique, a) uIndex[a.Key] = len(unique) - 1 } @@ -246,10 +246,13 @@ func (r *Record) addAttrs(attrs []log.KeyValue) { var i int for i = 0; i < len(attrs) && r.nFront < len(r.front); i++ { a := attrs[i] - r.front[r.nFront] = a + r.front[r.nFront] = r.applyAttrLimits(a) r.nFront++ } + for j, a := range attrs[i:] { + attrs[i+j] = r.applyAttrLimits(a) + } r.back = slices.Grow(r.back, len(attrs[i:])) r.back = append(r.back, attrs[i:]...) } @@ -268,11 +271,14 @@ func (r *Record) SetAttributes(attrs ...log.KeyValue) { var i int for i = 0; i < len(attrs) && r.nFront < len(r.front); i++ { a := attrs[i] - r.front[r.nFront] = a + r.front[r.nFront] = r.applyAttrLimits(a) r.nFront++ } r.back = slices.Clone(attrs[i:]) + for i, a := range r.back { + r.back[i] = r.applyAttrLimits(a) + } } // head returns the first n values of kvs along with the number of elements @@ -367,3 +373,72 @@ func (r *Record) Clone() Record { res.back = slices.Clone(r.back) return res } + +func (r Record) applyAttrLimits(attr log.KeyValue) log.KeyValue { + attr.Value = r.applyValueLimits(attr.Value) + return attr +} + +func (r Record) applyValueLimits(val log.Value) log.Value { + switch val.Kind() { + case log.KindString: + s := val.AsString() + if len(s) > r.attributeValueLengthLimit { + val = log.StringValue(truncate(s, r.attributeValueLengthLimit)) + } + case log.KindSlice: + sl := val.AsSlice() + for i := range sl { + sl[i] = r.applyValueLimits(sl[i]) + } + val = log.SliceValue(sl...) + case log.KindMap: + // Deduplicate then truncate. Do not do at the same time to avoid + // wasted truncation operations. + kvs, dropped := dedup(val.AsMap()) + r.dropped += dropped + for i := range kvs { + kvs[i] = r.applyAttrLimits(kvs[i]) + } + val = log.MapValue(kvs...) + } + return val +} + +// truncate returns a copy of str truncated to have a length of at most n +// characters. If the length of str is less than n, str itself is returned. +// +// The truncate of str ensures that no valid UTF-8 code point is split. The +// copy returned will be less than n if a characters straddles the length +// limit. +// +// No truncation is performed if n is less than zero. +func truncate(str string, n int) string { + if n < 0 { + return str + } + + // cut returns a copy of the s truncated to not exceed a length of n. If + // invalid UTF-8 is encountered, s is returned with false. Otherwise, the + // truncated copy will be returned with true. + cut := func(s string) (string, bool) { + var i int + for i = 0; i < n; { + r, size := utf8.DecodeRuneInString(s[i:]) + if r == utf8.RuneError { + return s, false + } + if i+size > n { + break + } + i += size + } + return s[:i], true + } + + cp, ok := cut(str) + if !ok { + cp, _ = cut(strings.ToValidUTF8(str, "")) + } + return cp +} diff --git a/sdk/log/record_test.go b/sdk/log/record_test.go index 99adfdfa9e9f..bf3d78c6da66 100644 --- a/sdk/log/record_test.go +++ b/sdk/log/record_test.go @@ -4,11 +4,13 @@ package log import ( + "fmt" "strconv" "testing" "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "go.opentelemetry.io/otel/attribute" "go.opentelemetry.io/otel/log" @@ -63,6 +65,7 @@ func TestRecordAttributes(t *testing.T) { log.Bytes("6", []byte("six")), } r := new(Record) + r.attributeValueLengthLimit = -1 r.SetAttributes(attrs...) r.SetAttributes(attrs[:2]...) // Overwrite existing. r.AddAttributes(attrs[2:]...) @@ -309,18 +312,21 @@ func TestRecordAttrDeduplication(t *testing.T) { t.Run("SetAttributes", func(t *testing.T) { r := new(Record) + r.attributeValueLengthLimit = -1 r.SetAttributes(tc.attrs...) validate(t, r) }) t.Run("AddAttributes/Empty", func(t *testing.T) { r := new(Record) + r.attributeValueLengthLimit = -1 r.AddAttributes(tc.attrs...) validate(t, r) }) t.Run("AddAttributes/Duplicates", func(t *testing.T) { r := new(Record) + r.attributeValueLengthLimit = -1 r.AddAttributes(tc.attrs...) r.AddAttributes(tc.attrs...) validate(t, r) @@ -328,3 +334,308 @@ func TestRecordAttrDeduplication(t *testing.T) { }) } } + +func TestApplyAttrLimitsDeduplication(t *testing.T) { + testcases := []struct { + name string + limit int + input, want log.Value + }{ + { + // No de-duplication + name: "Slice", + input: log.SliceValue( + log.BoolValue(true), + log.BoolValue(true), + log.Float64Value(1.3), + log.Float64Value(1.3), + log.Int64Value(43), + log.Int64Value(43), + log.BytesValue([]byte("hello")), + log.BytesValue([]byte("hello")), + log.StringValue("foo"), + log.StringValue("foo"), + log.SliceValue(log.StringValue("baz")), + log.SliceValue(log.StringValue("baz")), + log.MapValue(log.String("a", "qux")), + log.MapValue(log.String("a", "qux")), + ), + want: log.SliceValue( + log.BoolValue(true), + log.BoolValue(true), + log.Float64Value(1.3), + log.Float64Value(1.3), + log.Int64Value(43), + log.Int64Value(43), + log.BytesValue([]byte("hello")), + log.BytesValue([]byte("hello")), + log.StringValue("foo"), + log.StringValue("foo"), + log.SliceValue(log.StringValue("baz")), + log.SliceValue(log.StringValue("baz")), + log.MapValue(log.String("a", "qux")), + log.MapValue(log.String("a", "qux")), + ), + }, + { + name: "Map", + input: log.MapValue( + log.Bool("a", true), + log.Int64("b", 1), + log.Bool("a", false), + log.Float64("c", 2.), + log.String("b", "3"), + log.Slice("d", log.Int64Value(4)), + log.Map("a", log.Int("key", 5)), + log.Bytes("d", []byte("six")), + log.Bool("e", true), + log.Int("f", 1), + log.Int("f", 2), + log.Int("f", 3), + log.Float64("b", 0.0), + log.Float64("b", 0.0), + log.String("g", "G"), + log.String("h", "H"), + log.String("g", "GG"), + log.Bool("a", false), + ), + want: log.MapValue( + // Order is important here. + log.Bool("a", false), + log.Float64("b", 0.0), + log.Float64("c", 2.), + log.Bytes("d", []byte("six")), + log.Bool("e", true), + log.Int("f", 3), + log.String("g", "GG"), + log.String("h", "H"), + ), + }, + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + const key = "key" + kv := log.KeyValue{Key: key, Value: tc.input} + r := Record{attributeValueLengthLimit: -1} + + t.Run("AddAttributes", func(t *testing.T) { + r.AddAttributes(kv) + assertKV(t, r, log.KeyValue{Key: key, Value: tc.want}) + }) + + t.Run("SetAttributes", func(t *testing.T) { + r.SetAttributes(kv) + assertKV(t, r, log.KeyValue{Key: key, Value: tc.want}) + }) + }) + } +} + +func TestApplyAttrLimitsTruncation(t *testing.T) { + testcases := []struct { + name string + limit int + input, want log.Value + }{ + { + name: "Empty", + limit: 0, + input: log.Value{}, + want: log.Value{}, + }, + { + name: "Bool", + limit: 0, + input: log.BoolValue(true), + want: log.BoolValue(true), + }, + { + name: "Float64", + limit: 0, + input: log.Float64Value(1.3), + want: log.Float64Value(1.3), + }, + { + name: "Int64", + limit: 0, + input: log.Int64Value(43), + want: log.Int64Value(43), + }, + { + name: "Bytes", + limit: 0, + input: log.BytesValue([]byte("foo")), + want: log.BytesValue([]byte("foo")), + }, + { + name: "String", + limit: 0, + input: log.StringValue("foo"), + want: log.StringValue(""), + }, + { + name: "Slice", + limit: 0, + input: log.SliceValue( + log.BoolValue(true), + log.Float64Value(1.3), + log.Int64Value(43), + log.BytesValue([]byte("hello")), + log.StringValue("foo"), + log.StringValue("bar"), + log.SliceValue(log.StringValue("baz")), + log.MapValue(log.String("a", "qux")), + ), + want: log.SliceValue( + log.BoolValue(true), + log.Float64Value(1.3), + log.Int64Value(43), + log.BytesValue([]byte("hello")), + log.StringValue(""), + log.StringValue(""), + log.SliceValue(log.StringValue("")), + log.MapValue(log.String("a", "")), + ), + }, + { + name: "Map", + limit: 0, + input: log.MapValue( + log.Bool("0", true), + log.Float64("1", 1.3), + log.Int64("2", 43), + log.Bytes("3", []byte("hello")), + log.String("4", "foo"), + log.String("5", "bar"), + log.Slice("6", log.StringValue("baz")), + log.Map("7", log.String("a", "qux")), + ), + want: log.MapValue( + log.Bool("0", true), + log.Float64("1", 1.3), + log.Int64("2", 43), + log.Bytes("3", []byte("hello")), + log.String("4", ""), + log.String("5", ""), + log.Slice("6", log.StringValue("")), + log.Map("7", log.String("a", "")), + ), + }, + } + + assertKV := func(t *testing.T, r Record, kv log.KeyValue) { + t.Helper() + + var kvs []log.KeyValue + r.WalkAttributes(func(kv log.KeyValue) bool { + kvs = append(kvs, kv) + return true + }) + + require.Len(t, kvs, 1) + assert.Truef(t, kv.Equal(kvs[0]), "%s != %s", kv, kvs[0]) + } + + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + const key = "key" + kv := log.KeyValue{Key: key, Value: tc.input} + r := Record{attributeValueLengthLimit: tc.limit} + + t.Run("AddAttributes", func(t *testing.T) { + r.AddAttributes(kv) + assertKV(t, r, log.KeyValue{Key: key, Value: tc.want}) + }) + + t.Run("SetAttributes", func(t *testing.T) { + r.SetAttributes(kv) + assertKV(t, r, log.KeyValue{Key: key, Value: tc.want}) + }) + }) + } +} + +func assertKV(t *testing.T, r Record, kv log.KeyValue) { + t.Helper() + + var kvs []log.KeyValue + r.WalkAttributes(func(kv log.KeyValue) bool { + kvs = append(kvs, kv) + return true + }) + + require.Len(t, kvs, 1) + assert.Truef(t, kv.Equal(kvs[0]), "%s != %s", kv, kvs[0]) +} + +func TestTruncate(t *testing.T) { + testcases := []struct { + input, want string + limit int + }{ + { + input: "value", + want: "value", + limit: -1, + }, + { + input: "value", + want: "", + limit: 0, + }, + { + input: "value", + want: "v", + limit: 1, + }, + { + input: "value", + want: "va", + limit: 2, + }, + { + input: "value", + want: "val", + limit: 3, + }, + { + input: "value", + want: "valu", + limit: 4, + }, + { + input: "value", + want: "value", + limit: 5, + }, + { + input: "value", + want: "value", + limit: 6, + }, + { + input: "€€€€", // 3 bytes each + want: "€€€", + limit: 10, + }, + { + input: "€"[0:2] + "hello€€", // corrupted first rune, then over limit + want: "hello€", + limit: 10, + }, + { + input: "€"[0:2] + "hello", // corrupted first rune, then not over limit + want: "hello", + limit: 10, + }, + } + + for _, tc := range testcases { + name := fmt.Sprintf("%s/%d", tc.input, tc.limit) + t.Run(name, func(t *testing.T) { + t.Log(tc.input, len(tc.input), tc.limit) + assert.Equal(t, tc.want, truncate(tc.input, tc.limit)) + }) + } +}