From c6533de9e42157ea5e15ac846f180560464953d5 Mon Sep 17 00:00:00 2001 From: Kent Quirk Date: Thu, 15 Sep 2022 20:08:25 -0400 Subject: [PATCH] Allow adding extra fields to error logs (#514) ## Which problem is this PR solving? As noted in #513, errors that occur while Refinery is trying to send data to Honeycomb happen asynchronously. It's hard to relate any errors that show up in the logs with the spans that caused them to occur. However, there is a way provided by libhoney ([metadata](https://github.com/honeycombio/libhoney-go/blob/25068939fe8240ccc45e4c86271b6f057c5f833d/transmission/response.go#L11)) that makes it possible to attach data to the error. Before now, Refinery has attached the dataset, api host, and environment to the metadata. This helps but is not enough. ## Short description of the changes This PR attempts to leverage that technique further by adding a new configuration value called `AdditionalErrorFields`. It allows the user to specify a list of field names. In the event of a transmission error, these fields (if they exist) will be copied from the failing span into the metadata, and will therefore show up as identified fields in the logs. The default value is `trace.span_id`. This also removes the remaining instances of the obsolete standard library `ioutil`. Closes #513 --- app/app_test.go | 34 ++++++------ config/config.go | 2 + config/config_test.go | 73 ++++++++++++++++++++++--- config/config_test_reload_error_test.go | 9 ++- config/file_config.go | 9 +++ config/mock.go | 8 +++ config_complete.toml | 13 +++++ internal/peer/peers_test.go | 3 - logger/honeycomb.go | 2 +- metrics/honeycomb.go | 2 +- sample/sample_test.go | 7 +-- transmit/transmit.go | 14 ++++- 12 files changed, 136 insertions(+), 40 deletions(-) diff --git a/app/app_test.go b/app/app_test.go index 8b17176775..562e43e33e 100644 --- a/app/app_test.go +++ b/app/app_test.go @@ -1,5 +1,4 @@ //go:build all || race -// +build all race package app @@ -9,7 +8,6 @@ import ( "context" "fmt" "io" - "io/ioutil" "net" "net/http" "net/http/httptest" @@ -215,7 +213,7 @@ func post(t testing.TB, req *http.Request) { resp, err := httpClient.Do(req) assert.NoError(t, err) assert.Equal(t, http.StatusOK, resp.StatusCode) - io.Copy(ioutil.Discard, resp.Body) + io.Copy(io.Discard, resp.Body) resp.Body.Close() } @@ -333,7 +331,7 @@ func TestPeerRouting(t *testing.T) { req.Header.Set("Content-Type", "application/json") blob := `[` + string(spans[0]) + `]` - req.Body = ioutil.NopCloser(strings.NewReader(blob)) + req.Body = io.NopCloser(strings.NewReader(blob)) post(t, req) assert.Eventually(t, func() bool { return len(senders[0].Events()) == 1 @@ -364,7 +362,7 @@ func TestPeerRouting(t *testing.T) { "long": "this is a test of the emergency broadcast system", "foo": "bar", }, - Metadata: map[string]string{ + Metadata: map[string]any{ "api_host": "http://api.honeycomb.io", "dataset": "dataset", "environment": "", @@ -383,7 +381,7 @@ func TestPeerRouting(t *testing.T) { req.Header.Set("X-Honeycomb-Team", legacyAPIKey) req.Header.Set("Content-Type", "application/json") - req.Body = ioutil.NopCloser(strings.NewReader(blob)) + req.Body = io.NopCloser(strings.NewReader(blob)) post(t, req) assert.Eventually(t, func() bool { return len(senders[1].Events()) == 1 @@ -488,7 +486,7 @@ func TestEventsEndpoint(t *testing.T) { "trace.trace_id": "1", "foo": "bar", }, - Metadata: map[string]string{ + Metadata: map[string]any{ "api_host": "http://api.honeycomb.io", "dataset": "dataset", "environment": "", @@ -535,7 +533,7 @@ func TestEventsEndpoint(t *testing.T) { "trace.trace_id": "1", "foo": "bar", }, - Metadata: map[string]string{ + Metadata: map[string]any{ "api_host": "http://api.honeycomb.io", "dataset": "dataset", "environment": "", @@ -602,7 +600,7 @@ func TestEventsEndpointWithNonLegacyKey(t *testing.T) { "trace.trace_id": "1", "foo": "bar", }, - Metadata: map[string]string{ + Metadata: map[string]any{ "api_host": "http://api.honeycomb.io", "dataset": "dataset", "environment": "test", @@ -649,7 +647,7 @@ func TestEventsEndpointWithNonLegacyKey(t *testing.T) { "trace.trace_id": "1", "foo": "bar", }, - Metadata: map[string]string{ + Metadata: map[string]any{ "api_host": "http://api.honeycomb.io", "dataset": "dataset", "environment": "test", @@ -716,7 +714,7 @@ func BenchmarkTraces(b *testing.B) { sender := &countingWriterSender{ WriterSender: transmission.WriterSender{ - W: ioutil.Discard, + W: io.Discard, }, } _, graph := newStartedApp(b, sender, 11000, nil, false) @@ -734,7 +732,7 @@ func BenchmarkTraces(b *testing.B) { sender.resetCount() for n := 0; n < b.N; n++ { blob := `[` + string(spans[n%len(spans)]) + `]` - req.Body = ioutil.NopCloser(strings.NewReader(blob)) + req.Body = io.NopCloser(strings.NewReader(blob)) post(b, req) } sender.waitForCount(b, b.N) @@ -752,7 +750,7 @@ func BenchmarkTraces(b *testing.B) { blob = append(blob, ',') } blob[len(blob)-1] = ']' - req.Body = ioutil.NopCloser(bytes.NewReader(blob)) + req.Body = io.NopCloser(bytes.NewReader(blob)) post(b, req) } @@ -776,13 +774,13 @@ func BenchmarkTraces(b *testing.B) { blob = append(blob, ',') } blob[len(blob)-1] = ']' - req.Body = ioutil.NopCloser(bytes.NewReader(blob)) + req.Body = io.NopCloser(bytes.NewReader(blob)) resp, err := httpClient.Do(req) assert.NoError(b, err) if resp != nil { assert.Equal(b, http.StatusOK, resp.StatusCode) - io.Copy(ioutil.Discard, resp.Body) + io.Copy(io.Discard, resp.Body) resp.Body.Close() } } @@ -799,7 +797,7 @@ func BenchmarkTraces(b *testing.B) { func BenchmarkDistributedTraces(b *testing.B) { sender := &countingWriterSender{ WriterSender: transmission.WriterSender{ - W: ioutil.Discard, + W: io.Discard, }, } @@ -837,7 +835,7 @@ func BenchmarkDistributedTraces(b *testing.B) { sender.resetCount() for n := 0; n < b.N; n++ { blob := `[` + string(spans[n%len(spans)]) + `]` - req.Body = ioutil.NopCloser(strings.NewReader(blob)) + req.Body = io.NopCloser(strings.NewReader(blob)) req.URL.Host = addrs[n%len(addrs)] post(b, req) } @@ -856,7 +854,7 @@ func BenchmarkDistributedTraces(b *testing.B) { blob = append(blob, ',') } blob[len(blob)-1] = ']' - req.Body = ioutil.NopCloser(bytes.NewReader(blob)) + req.Body = io.NopCloser(bytes.NewReader(blob)) req.URL.Host = addrs[n%len(addrs)] post(b, req) diff --git a/config/config.go b/config/config.go index 3212c85d3b..4bfe3924b3 100644 --- a/config/config.go +++ b/config/config.go @@ -164,4 +164,6 @@ type Config interface { GetGRPCTimeout() time.Duration GetPeerTimeout() time.Duration + + GetAdditionalErrorFields() []string } diff --git a/config/config_test.go b/config/config_test.go index 4df8c6e683..10d1024d1f 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -1,7 +1,6 @@ package config import ( - "io/ioutil" "os" "sync" "testing" @@ -78,11 +77,12 @@ func TestRedisPasswordEnvVar(t *testing.T) { } } +// creates two temporary toml files from the strings passed in and returns their filenames func createTempConfigs(t *testing.T, configBody string, rulesBody string) (string, string) { - tmpDir, err := ioutil.TempDir("", "") + tmpDir, err := os.MkdirTemp("", "") assert.NoError(t, err) - configFile, err := ioutil.TempFile(tmpDir, "*.toml") + configFile, err := os.CreateTemp(tmpDir, "*.toml") assert.NoError(t, err) if configBody != "" { @@ -91,7 +91,7 @@ func createTempConfigs(t *testing.T, configBody string, rulesBody string) (strin } configFile.Close() - rulesFile, err := ioutil.TempFile(tmpDir, "*.toml") + rulesFile, err := os.CreateTemp(tmpDir, "*.toml") assert.NoError(t, err) if rulesBody != "" { @@ -636,11 +636,11 @@ func TestQueryAuthToken(t *testing.T) { } func TestGRPCServerParameters(t *testing.T) { - tmpDir, err := ioutil.TempDir("", "") + tmpDir, err := os.MkdirTemp("", "") assert.NoError(t, err) defer os.RemoveAll(tmpDir) - configFile, err := ioutil.TempFile(tmpDir, "*.toml") + configFile, err := os.CreateTemp(tmpDir, "*.toml") assert.NoError(t, err) _, err = configFile.Write([]byte(` @@ -668,7 +668,7 @@ func TestGRPCServerParameters(t *testing.T) { assert.NoError(t, err) configFile.Close() - rulesFile, err := ioutil.TempFile(tmpDir, "*.toml") + rulesFile, err := os.CreateTemp(tmpDir, "*.toml") assert.NoError(t, err) c, err := NewConfig(configFile.Name(), rulesFile.Name(), func(err error) {}) @@ -680,3 +680,62 @@ func TestGRPCServerParameters(t *testing.T) { assert.Equal(t, 4*time.Minute, c.GetGRPCTime()) assert.Equal(t, 5*time.Minute, c.GetGRPCTimeout()) } + +func TestHoneycombAdditionalErrorConfig(t *testing.T) { + config, rules := createTempConfigs(t, ` + AdditionalErrorFields = [ + "first", + "second" + ] + + [InMemCollector] + CacheCapacity=1000 + + [HoneycombMetrics] + MetricsHoneycombAPI="http://honeycomb.io" + MetricsAPIKey="1234" + MetricsDataset="testDatasetName" + MetricsReportingInterval=3 + + [HoneycombLogger] + LoggerHoneycombAPI="http://honeycomb.io" + LoggerAPIKey="1234" + LoggerDataset="loggerDataset" + LoggerSamplerEnabled=true + LoggerSamplerThroughput=10 + `, "") + defer os.Remove(rules) + defer os.Remove(config) + + c, err := NewConfig(config, rules, func(err error) {}) + assert.NoError(t, err) + + assert.Equal(t, []string{"first", "second"}, c.GetAdditionalErrorFields()) +} + +func TestHoneycombAdditionalErrorDefaults(t *testing.T) { + config, rules := createTempConfigs(t, ` + [InMemCollector] + CacheCapacity=1000 + + [HoneycombMetrics] + MetricsHoneycombAPI="http://honeycomb.io" + MetricsAPIKey="1234" + MetricsDataset="testDatasetName" + MetricsReportingInterval=3 + + [HoneycombLogger] + LoggerHoneycombAPI="http://honeycomb.io" + LoggerAPIKey="1234" + LoggerDataset="loggerDataset" + LoggerSamplerEnabled=true + LoggerSamplerThroughput=10 + `, "") + defer os.Remove(rules) + defer os.Remove(config) + + c, err := NewConfig(config, rules, func(err error) {}) + assert.NoError(t, err) + + assert.Equal(t, []string{"trace.span_id"}, c.GetAdditionalErrorFields()) +} diff --git a/config/config_test_reload_error_test.go b/config/config_test_reload_error_test.go index d86574cdb8..307166b27a 100644 --- a/config/config_test_reload_error_test.go +++ b/config/config_test_reload_error_test.go @@ -3,7 +3,6 @@ package config import ( - "io/ioutil" "os" "sync" "testing" @@ -13,14 +12,14 @@ import ( ) func TestErrorReloading(t *testing.T) { - tmpDir, err := ioutil.TempDir("", "") + tmpDir, err := os.MkdirTemp("", "") assert.NoError(t, err) defer os.RemoveAll(tmpDir) - rulesFile, err := ioutil.TempFile(tmpDir, "*.toml") + rulesFile, err := os.CreateTemp(tmpDir, "*.toml") assert.NoError(t, err) - configFile, err := ioutil.TempFile(tmpDir, "*.toml") + configFile, err := os.CreateTemp(tmpDir, "*.toml") assert.NoError(t, err) dummy := []byte(` @@ -76,7 +75,7 @@ func TestErrorReloading(t *testing.T) { } }() - err = ioutil.WriteFile(rulesFile.Name(), []byte(`Sampler="InvalidSampler"`), 0644) + err = os.WriteFile(rulesFile.Name(), []byte(`Sampler="InvalidSampler"`), 0644) if err != nil { t.Error(err) diff --git a/config/file_config.go b/config/file_config.go index 5e30aae6fb..5e84578848 100644 --- a/config/file_config.go +++ b/config/file_config.go @@ -54,6 +54,7 @@ type configContents struct { DatasetPrefix string QueryAuthToken string GRPCServerParameters GRPCServerParameters + AdditionalErrorFields []string } type InMemoryCollectorCacheCapacity struct { @@ -153,6 +154,7 @@ func NewConfig(config, rules string, errorCallback func(error)) (Config, error) c.SetDefault("GRPCServerParameters.MaxConnectionAgeGrace", time.Duration(0)) c.SetDefault("GRPCServerParameters.Time", 10*time.Second) c.SetDefault("GRPCServerParameters.Timeout", 2*time.Second) + c.SetDefault("AdditionalErrorFields", []string{"trace.span_id"}) c.SetConfigFile(config) err := c.ReadInConfig() @@ -870,3 +872,10 @@ func (f *fileConfig) GetPeerTimeout() time.Duration { return f.conf.PeerManagement.Timeout } + +func (f *fileConfig) GetAdditionalErrorFields() []string { + f.mux.RLock() + defer f.mux.RUnlock() + + return f.conf.AdditionalErrorFields +} diff --git a/config/mock.go b/config/mock.go index 83a4395a2d..3cc5911b2c 100644 --- a/config/mock.go +++ b/config/mock.go @@ -82,6 +82,7 @@ type MockConfig struct { GRPCTime time.Duration GRPCTimeout time.Duration PeerTimeout time.Duration + AdditionalErrorFields []string Mux sync.RWMutex } @@ -443,3 +444,10 @@ func (f *MockConfig) GetPeerTimeout() time.Duration { return f.PeerTimeout } + +func (f *MockConfig) GetAdditionalErrorFields() []string { + f.Mux.RLock() + defer f.Mux.RUnlock() + + return f.AdditionalErrorFields +} diff --git a/config_complete.toml b/config_complete.toml index 3fdfdd6be5..265e95a719 100644 --- a/config_complete.toml +++ b/config_complete.toml @@ -116,12 +116,25 @@ EnvironmentCacheTTL = "1h" # are not typically needed in normal operation. # Can be specified in the environment as REFINERY_QUERY_AUTH_TOKEN. # If left unspecified, the /query endpoints are inaccessible. +# Not eligible for live reload. # QueryAuthToken = "some-random-value" # AddRuleReasonToTrace causes traces that are sent to Honeycomb to include the field `meta.refinery.reason`. # This field contains text indicating which rule was evaluated that caused the trace to be included. +# Eligible for live reload. # AddRuleReasonToTrace = true +# AdditionalErrorFields should be a list of span fields that should be included when logging +# errors that happen during ingestion of events (for example, the span too large error). +# This is primarily useful in trying to track down misbehaving senders in a large installation. +# The fields `dataset`, `apihost`, and `environment` are always included. +# If a field is not present in the span, it will not be present in the error log. +# Default is ["trace.span_id"]. +# Eligible for live reload. +AdditionalErrorFields = [ + "trace.span_id" +] + ############################ ## Implementation Choices ## ############################ diff --git a/internal/peer/peers_test.go b/internal/peer/peers_test.go index 5e6917e4ab..5ec7f8137a 100644 --- a/internal/peer/peers_test.go +++ b/internal/peer/peers_test.go @@ -1,6 +1,3 @@ -//go:build all || race -// +build all race - package peer import ( diff --git a/logger/honeycomb.go b/logger/honeycomb.go index f5f6c76fd9..85ef57c049 100644 --- a/logger/honeycomb.go +++ b/logger/honeycomb.go @@ -240,7 +240,7 @@ func (h *HoneycombEntry) Logf(f string, args ...interface{}) { ev := h.builder.NewEvent() msg := fmt.Sprintf(f, args...) ev.AddField("msg", msg) - ev.Metadata = map[string]string{ + ev.Metadata = map[string]any{ "api_host": ev.APIHost, "dataset": ev.Dataset, } diff --git a/metrics/honeycomb.go b/metrics/honeycomb.go index 32f0a462d9..43a2c6d944 100644 --- a/metrics/honeycomb.go +++ b/metrics/honeycomb.go @@ -222,7 +222,7 @@ func (h *HoneycombMetrics) reportToHoneycommb(ctx context.Context) { return case <-tick.C: ev := h.libhClient.NewEvent() - ev.Metadata = map[string]string{ + ev.Metadata = map[string]any{ "api_host": ev.APIHost, "dataset": ev.Dataset, } diff --git a/sample/sample_test.go b/sample/sample_test.go index e06f70aeb7..b1b804b968 100644 --- a/sample/sample_test.go +++ b/sample/sample_test.go @@ -1,7 +1,6 @@ package sample import ( - "io/ioutil" "os" "testing" @@ -30,11 +29,11 @@ func TestDependencyInjection(t *testing.T) { } func TestDatasetPrefix(t *testing.T) { - tmpDir, err := ioutil.TempDir("", "") + tmpDir, err := os.MkdirTemp("", "") assert.NoError(t, err) defer os.RemoveAll(tmpDir) - configFile, err := ioutil.TempFile(tmpDir, "*.toml") + configFile, err := os.CreateTemp(tmpDir, "*.toml") assert.NoError(t, err) _, err = configFile.Write([]byte(` @@ -57,7 +56,7 @@ func TestDatasetPrefix(t *testing.T) { assert.NoError(t, err) configFile.Close() - rulesFile, err := ioutil.TempFile(tmpDir, "*.toml") + rulesFile, err := os.CreateTemp(tmpDir, "*.toml") assert.NoError(t, err) _, err = rulesFile.Write([]byte(` diff --git a/transmit/transmit.go b/transmit/transmit.go index 30309c7a47..64db70f359 100644 --- a/transmit/transmit.go +++ b/transmit/transmit.go @@ -98,12 +98,19 @@ func (d *DefaultTransmission) EnqueueEvent(ev *types.Event) { libhEv.SampleRate = ev.SampleRate libhEv.Timestamp = ev.Timestamp // metadata is used to make error logs more helpful when processing libhoney responses - libhEv.Metadata = map[string]string{ + metadata := map[string]any{ "api_host": ev.APIHost, "dataset": ev.Dataset, "environment": ev.Environment, } + for _, k := range d.Config.GetAdditionalErrorFields() { + if v, ok := ev.Data[k]; ok { + metadata[k] = v + } + } + libhEv.Metadata = metadata + for k, v := range ev.Data { libhEv.AddField(k, v) } @@ -160,6 +167,11 @@ func (d *DefaultTransmission) processResponses( "dataset": dataset, "environment": environment, }) + for _, k := range d.Config.GetAdditionalErrorFields() { + if v, ok := r.Metadata.(map[string]any)[k]; ok { + log = log.WithField(k, v) + } + } if r.Err != nil { log = log.WithField("error", r.Err.Error()) }