From 13370eabfec5a7d8be8001c64b77e7e75922c5e3 Mon Sep 17 00:00:00 2001 From: Craig MacKenzie Date: Mon, 26 Sep 2022 15:10:11 -0400 Subject: [PATCH 1/9] Improve logging around agent checkins. - Log transient checkin errors at Info. - Upgrade to an Error log after 2 repeated failures. - Log the wait time for the next retry. Log the checking request duration on failure. --- .../gateway/fleet/fleet_gateway.go | 41 +++++++++------ internal/pkg/core/backoff/backoff.go | 5 ++ internal/pkg/core/backoff/backoff_test.go | 50 +++++++++++++++---- internal/pkg/core/backoff/equal_jitter.go | 17 +++++-- internal/pkg/core/backoff/exponential.go | 17 ++++--- internal/pkg/fleetapi/checkin_cmd.go | 23 +++++---- internal/pkg/fleetapi/checkin_cmd_test.go | 20 +++++--- 7 files changed, 118 insertions(+), 55 deletions(-) diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go index 897b81ea0d3..299b87dc282 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go @@ -173,7 +173,7 @@ func (f *fleetGateway) worker() { // Execute the checkin call and for any errors returned by the fleet-server API // the function will retry to communicate with fleet-server with an exponential delay and some // jitter to help better distribute the load from a fleet of agents. - resp, err := f.doExecute() + resp, err := f.executeCheckinWithRetries() if err != nil { continue } @@ -274,21 +274,34 @@ func (f *fleetGateway) gatherQueuedActions(ts time.Time) (queued, expired []flee return queued, expired } -func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) { +func (f *fleetGateway) executeCheckinWithRetries() (*fleetapi.CheckinResponse, error) { f.backoff.Reset() // Guard if the context is stopped by a out of bound call, // this mean we are rebooting to change the log level or the system is shutting us down. for f.bgContext.Err() == nil { - f.log.Debugf("Checkin started") - resp, err := f.execute(f.bgContext) + f.log.Debugf("Checking started") + resp, took, err := f.executeCheckin(f.bgContext) if err != nil { + // Only update the local status on failure: https://github.com/elastic/elastic-agent/issues/1148 + f.localReporter.Update(state.Degraded, fmt.Sprintf("checkin failed: %v", err), nil) f.checkinFailCounter++ - f.log.Errorf("Could not communicate with fleet-server checkin API will retry, error: %s", err) + + // Report the first two failures at warn level as they may be recoverable with retries. + if f.checkinFailCounter <= 2 { + f.log.Warnw("Possible transient error during checkin with fleet-server, retrying", + "error.message", err, "request_duration", took, "failed_checkins", f.checkinFailCounter, + "retry_after", f.backoff.NextWait()) + } else { + f.log.Errorw("Cannot checkin in with fleet-server, retrying", + "error.message", err, "request_duration", took, "failed_checkins", f.checkinFailCounter, + "retry_after", f.backoff.NextWait()) + } + if !f.backoff.Wait() { // Something bad has happened and we log it and we should update our current state. 
err := errors.New( - "execute retry loop was stopped", + "checkin retry loop was stopped", errors.TypeNetwork, errors.M(errors.MetaKeyURI, f.client.URI()), ) @@ -297,10 +310,6 @@ func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) { f.localReporter.Update(state.Failed, err.Error(), nil) return nil, err } - if f.checkinFailCounter > 1 { - f.localReporter.Update(state.Degraded, fmt.Sprintf("checkin failed: %v", err), nil) - f.log.Errorf("checkin number %d failed: %s", f.checkinFailCounter, err.Error()) - } continue } @@ -319,7 +328,7 @@ func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) { return nil, f.bgContext.Err() } -func (f *fleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse, error) { +func (f *fleetGateway) executeCheckin(ctx context.Context) (*fleetapi.CheckinResponse, time.Duration, error) { ecsMeta, err := info.Metadata() if err != nil { f.log.Error(errors.New("failed to load metadata", err)) @@ -340,7 +349,7 @@ func (f *fleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse, Message: f.statusController.Status().Message, } - resp, err := cmd.Execute(ctx, req) + resp, took, err := cmd.Execute(ctx, req) if isUnauth(err) { f.unauthCounter++ @@ -348,15 +357,15 @@ func (f *fleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse, f.log.Warnf("received an invalid api key error '%d' times. Starting to unenroll the elastic agent.", f.unauthCounter) return &fleetapi.CheckinResponse{ Actions: []fleetapi.Action{&fleetapi.ActionUnenroll{ActionID: "", ActionType: "UNENROLL", IsDetected: true}}, - }, nil + }, took, nil } - return nil, err + return nil, took, err } f.unauthCounter = 0 if err != nil { - return nil, err + return nil, took, err } // Save the latest ackToken @@ -368,7 +377,7 @@ func (f *fleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse, } } - return resp, nil + return resp, took, nil } // shouldUnenroll checks if the max number of trying an invalid key is reached diff --git a/internal/pkg/core/backoff/backoff.go b/internal/pkg/core/backoff/backoff.go index 06723e7db9a..c97eaae199d 100644 --- a/internal/pkg/core/backoff/backoff.go +++ b/internal/pkg/core/backoff/backoff.go @@ -4,11 +4,16 @@ package backoff +import "time" + // Backoff defines the interface for backoff strategies. type Backoff interface { // Wait blocks for a duration of time governed by the backoff strategy. Wait() bool + // NextWait returns the duration of the next call to Wait(). + NextWait() time.Duration + // Reset resets the backoff duration to an initial value governed by the backoff strategy. 
Reset() } diff --git a/internal/pkg/core/backoff/backoff_test.go b/internal/pkg/core/backoff/backoff_test.go index 88498ff5a58..12332eb15f2 100644 --- a/internal/pkg/core/backoff/backoff_test.go +++ b/internal/pkg/core/backoff/backoff_test.go @@ -14,14 +14,9 @@ import ( type factory func(<-chan struct{}) Backoff -func TestBackoff(t *testing.T) { - t.Run("test close channel", testCloseChannel) - t.Run("test unblock after some time", testUnblockAfterInit) -} - -func testCloseChannel(t *testing.T) { - init := 2 * time.Second - max := 5 * time.Minute +func TestCloseChannel(t *testing.T) { + init := 2 * time.Millisecond + max := 5 * time.Second tests := map[string]factory{ "ExpBackoff": func(done <-chan struct{}) Backoff { @@ -42,9 +37,9 @@ func testCloseChannel(t *testing.T) { } } -func testUnblockAfterInit(t *testing.T) { - init := 1 * time.Second - max := 5 * time.Minute +func TestUnblockAfterInit(t *testing.T) { + init := 1 * time.Millisecond + max := 5 * time.Second tests := map[string]factory{ "ExpBackoff": func(done <-chan struct{}) Backoff { @@ -68,3 +63,36 @@ func testUnblockAfterInit(t *testing.T) { }) } } + +func TestNextWait(t *testing.T) { + init := time.Millisecond + max := 5 * time.Second + + tests := map[string]factory{ + "ExpBackoff": func(done <-chan struct{}) Backoff { + return NewExpBackoff(done, init, max) + }, + "EqualJitterBackoff": func(done <-chan struct{}) Backoff { + return NewEqualJitterBackoff(done, init, max) + }, + } + + for name, f := range tests { + t.Run(name, func(t *testing.T) { + c := make(chan struct{}) + b := f(c) + + startWait := b.NextWait() + assert.Equal(t, startWait, b.NextWait(), "next wait not stable") + + startedAt := time.Now() + b.Wait() + waitDuration := time.Now().Sub(startedAt) + nextWait := b.NextWait() + + t.Logf("actualWait: %s startWait: %s nextWait: %s", waitDuration, startWait, nextWait) + assert.Less(t, startWait, nextWait, "wait value did not increase") + assert.GreaterOrEqual(t, waitDuration, startWait, "next wait duration <= actual wait duration") + }) + } +} diff --git a/internal/pkg/core/backoff/equal_jitter.go b/internal/pkg/core/backoff/equal_jitter.go index d87077397cd..a43a23bda16 100644 --- a/internal/pkg/core/backoff/equal_jitter.go +++ b/internal/pkg/core/backoff/equal_jitter.go @@ -16,8 +16,9 @@ type EqualJitterBackoff struct { duration time.Duration done <-chan struct{} - init time.Duration - max time.Duration + init time.Duration + max time.Duration + nextRand time.Duration last time.Time } @@ -29,6 +30,7 @@ func NewEqualJitterBackoff(done <-chan struct{}, init, max time.Duration) Backof done: done, init: init, max: max, + nextRand: time.Duration(rand.Int63n(int64(init))), } } @@ -38,13 +40,18 @@ func (b *EqualJitterBackoff) Reset() { b.duration = b.init * 2 } +func (b *EqualJitterBackoff) NextWait() time.Duration { + // Make sure we have always some minimal back off and jitter. + temp := b.duration / 2 + return temp + b.nextRand +} + // Wait block until either the timer is completed or channel is done. func (b *EqualJitterBackoff) Wait() bool { - // Make sure we have always some minimal back off and jitter. - temp := int64(b.duration / 2) - backoff := time.Duration(temp + rand.Int63n(temp)) + backoff := b.NextWait() // increase duration for next wait. 
+ b.nextRand = time.Duration(rand.Int63n(int64(b.duration))) b.duration *= 2 if b.duration > b.max { b.duration = b.max diff --git a/internal/pkg/core/backoff/exponential.go b/internal/pkg/core/backoff/exponential.go index 81224b95eb5..51b5b4e0cb5 100644 --- a/internal/pkg/core/backoff/exponential.go +++ b/internal/pkg/core/backoff/exponential.go @@ -36,18 +36,23 @@ func (b *ExpBackoff) Reset() { b.duration = b.init } +func (b *ExpBackoff) NextWait() time.Duration { + nextWait := b.duration + nextWait *= 2 + if nextWait > b.max { + nextWait = b.max + } + return nextWait +} + // Wait block until either the timer is completed or channel is done. func (b *ExpBackoff) Wait() bool { - backoff := b.duration - b.duration *= 2 - if b.duration > b.max { - b.duration = b.max - } + b.duration = b.NextWait() select { case <-b.done: return false - case <-time.After(backoff): + case <-time.After(b.duration): b.last = time.Now() return true } diff --git a/internal/pkg/fleetapi/checkin_cmd.go b/internal/pkg/fleetapi/checkin_cmd.go index d6a63a45e29..f79c6bab8bc 100644 --- a/internal/pkg/fleetapi/checkin_cmd.go +++ b/internal/pkg/fleetapi/checkin_cmd.go @@ -78,23 +78,26 @@ func NewCheckinCmd(info agentInfo, client client.Sender) *CheckinCmd { } } -// Execute enroll the Agent in the Fleet Server. -func (e *CheckinCmd) Execute(ctx context.Context, r *CheckinRequest) (*CheckinResponse, error) { +// Execute enroll the Agent in the Fleet Server. Returns the decoded check in response, a duration indicating +// how long the request took, and an error. +func (e *CheckinCmd) Execute(ctx context.Context, r *CheckinRequest) (*CheckinResponse, time.Duration, error) { if err := r.Validate(); err != nil { - return nil, err + return nil, 0, err } b, err := json.Marshal(r) if err != nil { - return nil, errors.New(err, + return nil, 0, errors.New(err, "fail to encode the checkin request", errors.TypeUnexpected) } cp := fmt.Sprintf(checkingPath, e.info.AgentID()) + sendStart := time.Now() resp, err := e.client.Send(ctx, "POST", cp, nil, nil, bytes.NewBuffer(b)) + sendDuration := time.Now().Sub(sendStart) if err != nil { - return nil, errors.New(err, + return nil, sendDuration, errors.New(err, "fail to checkin to fleet-server", errors.TypeNetwork, errors.M(errors.MetaKeyURI, cp)) @@ -102,26 +105,26 @@ func (e *CheckinCmd) Execute(ctx context.Context, r *CheckinRequest) (*CheckinRe defer resp.Body.Close() if resp.StatusCode != http.StatusOK { - return nil, client.ExtractError(resp.Body) + return nil, sendDuration, client.ExtractError(resp.Body) } rs, err := ioutil.ReadAll(resp.Body) if err != nil { - return nil, errors.New(err, "failed to read checkin response") + return nil, sendDuration, errors.New(err, "failed to read checkin response") } checkinResponse := &CheckinResponse{} decoder := json.NewDecoder(bytes.NewReader(rs)) if err := decoder.Decode(checkinResponse); err != nil { - return nil, errors.New(err, + return nil, sendDuration, errors.New(err, "fail to decode checkin response", errors.TypeNetwork, errors.M(errors.MetaKeyURI, cp)) } if err := checkinResponse.Validate(); err != nil { - return nil, err + return nil, sendDuration, err } - return checkinResponse, nil + return checkinResponse, sendDuration, nil } diff --git a/internal/pkg/fleetapi/checkin_cmd_test.go b/internal/pkg/fleetapi/checkin_cmd_test.go index 2d9aef2741a..56726bb5559 100644 --- a/internal/pkg/fleetapi/checkin_cmd_test.go +++ b/internal/pkg/fleetapi/checkin_cmd_test.go @@ -11,6 +11,7 @@ import ( "io/ioutil" "net/http" "testing" + "time" 
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -25,6 +26,7 @@ func (*agentinfo) AgentID() string { return "id" } func TestCheckin(t *testing.T) { const withAPIKey = "secret" + const requestDelay = time.Millisecond ctx := context.Background() agentInfo := &agentinfo{} @@ -39,6 +41,8 @@ func TestCheckin(t *testing.T) { mux.HandleFunc(path, authHandler(func(w http.ResponseWriter, r *http.Request) { w.WriteHeader(http.StatusInternalServerError) fmt.Fprint(w, raw) + // Introduce a small delay to test the request time measurment. + time.Sleep(requestDelay) }, withAPIKey)) return mux }, withAPIKey, @@ -47,8 +51,10 @@ func TestCheckin(t *testing.T) { request := CheckinRequest{} - _, err := cmd.Execute(ctx, &request) + _, took, err := cmd.Execute(ctx, &request) require.Error(t, err) + // Ensure the request took at least as long as the artificial delay. + require.GreaterOrEqual(t, took, requestDelay) }, )) @@ -96,7 +102,7 @@ func TestCheckin(t *testing.T) { request := CheckinRequest{} - r, err := cmd.Execute(ctx, &request) + r, _, err := cmd.Execute(ctx, &request) require.NoError(t, err) require.Equal(t, 1, len(r.Actions)) @@ -157,7 +163,7 @@ func TestCheckin(t *testing.T) { request := CheckinRequest{} - r, err := cmd.Execute(ctx, &request) + r, _, err := cmd.Execute(ctx, &request) require.NoError(t, err) require.Equal(t, 2, len(r.Actions)) @@ -173,7 +179,7 @@ func TestCheckin(t *testing.T) { }, )) - t.Run("When we receive no action", withServerWithAuthClient( + t.Run("When we receive no action with delay", withServerWithAuthClient( func(t *testing.T) *http.ServeMux { raw := `{ "actions": [] }` mux := http.NewServeMux() @@ -189,7 +195,7 @@ func TestCheckin(t *testing.T) { request := CheckinRequest{} - r, err := cmd.Execute(ctx, &request) + r, _, err := cmd.Execute(ctx, &request) require.NoError(t, err) require.Equal(t, 0, len(r.Actions)) @@ -223,7 +229,7 @@ func TestCheckin(t *testing.T) { request := CheckinRequest{Metadata: testMetadata()} - r, err := cmd.Execute(ctx, &request) + r, _, err := cmd.Execute(ctx, &request) require.NoError(t, err) require.Equal(t, 0, len(r.Actions)) @@ -257,7 +263,7 @@ func TestCheckin(t *testing.T) { request := CheckinRequest{} - r, err := cmd.Execute(ctx, &request) + r, _, err := cmd.Execute(ctx, &request) require.NoError(t, err) require.Equal(t, 0, len(r.Actions)) From 4083192c1245973fef1cac42e4ac202001bab008 Mon Sep 17 00:00:00 2001 From: Craig MacKenzie Date: Tue, 11 Oct 2022 15:28:24 -0400 Subject: [PATCH 2/9] Update changelog. --- CHANGELOG.next.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc index f8033809cb4..e937813e86d 100644 --- a/CHANGELOG.next.asciidoc +++ b/CHANGELOG.next.asciidoc @@ -210,3 +210,4 @@ - Add support for hints' based autodiscovery in kubernetes provider. {pull}[698] - Improve logging during upgrades. {pull}[1287] - Added status message to CheckinRequest {pull}[1369] +- Improve logging of Fleet checkins errors. {pull}[1477] From e95396498b2342148f258c4e13abe25c16655a20 Mon Sep 17 00:00:00 2001 From: Craig MacKenzie Date: Tue, 11 Oct 2022 15:54:49 -0400 Subject: [PATCH 3/9] Add changelog fragment. 
--- .../fragments/1665517984-improve-checkin-error-logging.yaml | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 changelog/fragments/1665517984-improve-checkin-error-logging.yaml diff --git a/changelog/fragments/1665517984-improve-checkin-error-logging.yaml b/changelog/fragments/1665517984-improve-checkin-error-logging.yaml new file mode 100644 index 00000000000..631a733e829 --- /dev/null +++ b/changelog/fragments/1665517984-improve-checkin-error-logging.yaml @@ -0,0 +1,4 @@ +kind: enhancement +summary: improve-checkin-error-logging +pr: 1477 +issue: 1154 From 7f27af690136cc227bbeaa550e92bfb1ec22f495 Mon Sep 17 00:00:00 2001 From: Craig MacKenzie Date: Wed, 12 Oct 2022 13:06:56 -0400 Subject: [PATCH 4/9] Specify units for request and retry durations. --- .../application/gateway/fleet/fleet_gateway.go | 8 ++++---- .../gateway/fleet/fleet_gateway_test.go | 16 +++++++++++++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go index 299b87dc282..38bae320963 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go @@ -290,12 +290,12 @@ func (f *fleetGateway) executeCheckinWithRetries() (*fleetapi.CheckinResponse, e // Report the first two failures at warn level as they may be recoverable with retries. if f.checkinFailCounter <= 2 { f.log.Warnw("Possible transient error during checkin with fleet-server, retrying", - "error.message", err, "request_duration", took, "failed_checkins", f.checkinFailCounter, - "retry_after", f.backoff.NextWait()) + "error.message", err, "request_duration_ns", took, "failed_checkins", f.checkinFailCounter, + "retry_after_ns", f.backoff.NextWait()) } else { f.log.Errorw("Cannot checkin in with fleet-server, retrying", - "error.message", err, "request_duration", took, "failed_checkins", f.checkinFailCounter, - "retry_after", f.backoff.NextWait()) + "error.message", err, "request_duration_ns", took, "failed_checkins", f.checkinFailCounter, + "retry_after_ns", f.backoff.NextWait()) } if !f.backoff.Wait() { diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go index 0cc00e739a8..5da8a966ce9 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go @@ -21,6 +21,7 @@ import ( "github.com/stretchr/testify/mock" "github.com/stretchr/testify/require" + "github.com/elastic/elastic-agent-libs/logp" "github.com/elastic/elastic-agent/internal/pkg/agent/application/gateway" "github.com/elastic/elastic-agent/internal/pkg/agent/application/paths" "github.com/elastic/elastic-agent/internal/pkg/agent/storage" @@ -693,7 +694,7 @@ func TestRetriesOnFailures(t *testing.T) { scheduler := scheduler.NewStepper() client := newTestingClient() dispatcher := newTestingDispatcher() - log, _ := logger.New("fleet_gateway", false) + log := newInfoLogger(t, "fleet_gateway") ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -814,3 +815,16 @@ type testAgentInfo struct{} func (testAgentInfo) AgentID() string { return "agent-secret" } type request struct{} + +func newInfoLogger(t *testing.T, name string) *logger.Logger { + t.Helper() + + loggerCfg := logger.DefaultLoggingConfig() + loggerCfg.Level = logp.InfoLevel + loggerCfg.ToFiles = false + loggerCfg.ToStderr = 
true + + log, err := logger.NewFromConfig("", loggerCfg, false) + require.NoError(t, err) + return log +} From 957edb4b7daf650570f695e68c128ce524917a6c Mon Sep 17 00:00:00 2001 From: Craig MacKenzie Date: Wed, 12 Oct 2022 13:26:36 -0400 Subject: [PATCH 5/9] Fix typo checking/checkin --- internal/pkg/agent/application/gateway/fleet/fleet_gateway.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go index 38bae320963..44c8a5c9a3e 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go @@ -280,7 +280,7 @@ func (f *fleetGateway) executeCheckinWithRetries() (*fleetapi.CheckinResponse, e // Guard if the context is stopped by a out of bound call, // this mean we are rebooting to change the log level or the system is shutting us down. for f.bgContext.Err() == nil { - f.log.Debugf("Checking started") + f.log.Debugf("Checkin started") resp, took, err := f.executeCheckin(f.bgContext) if err != nil { // Only update the local status on failure: https://github.com/elastic/elastic-agent/issues/1148 From eb509946549688b1fb7e740ae64c7aff266cc42e Mon Sep 17 00:00:00 2001 From: Craig MacKenzie Date: Wed, 12 Oct 2022 13:28:14 -0400 Subject: [PATCH 6/9] Disable false positive lint warning. --- internal/pkg/core/backoff/equal_jitter.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/pkg/core/backoff/equal_jitter.go b/internal/pkg/core/backoff/equal_jitter.go index a43a23bda16..671201f5892 100644 --- a/internal/pkg/core/backoff/equal_jitter.go +++ b/internal/pkg/core/backoff/equal_jitter.go @@ -30,7 +30,7 @@ func NewEqualJitterBackoff(done <-chan struct{}, init, max time.Duration) Backof done: done, init: init, max: max, - nextRand: time.Duration(rand.Int63n(int64(init))), + nextRand: time.Duration(rand.Int63n(int64(init))), //nolint:gosec } } From 5a6be152de2035030154bb76b3247dfb2b67895f Mon Sep 17 00:00:00 2001 From: Craig MacKenzie Date: Wed, 12 Oct 2022 13:28:22 -0400 Subject: [PATCH 7/9] Only update local state after repeated failures. --- internal/pkg/agent/application/gateway/fleet/fleet_gateway.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go index 44c8a5c9a3e..9ebebcf2c0f 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go @@ -283,8 +283,6 @@ func (f *fleetGateway) executeCheckinWithRetries() (*fleetapi.CheckinResponse, e f.log.Debugf("Checkin started") resp, took, err := f.executeCheckin(f.bgContext) if err != nil { - // Only update the local status on failure: https://github.com/elastic/elastic-agent/issues/1148 - f.localReporter.Update(state.Degraded, fmt.Sprintf("checkin failed: %v", err), nil) f.checkinFailCounter++ // Report the first two failures at warn level as they may be recoverable with retries. 
@@ -293,6 +291,8 @@ func (f *fleetGateway) executeCheckinWithRetries() (*fleetapi.CheckinResponse, e "error.message", err, "request_duration_ns", took, "failed_checkins", f.checkinFailCounter, "retry_after_ns", f.backoff.NextWait()) } else { + // Only update the local status after repeated failures: https://github.com/elastic/elastic-agent/issues/1148 + f.localReporter.Update(state.Degraded, fmt.Sprintf("checkin failed: %v", err), nil) f.log.Errorw("Cannot checkin in with fleet-server, retrying", "error.message", err, "request_duration_ns", took, "failed_checkins", f.checkinFailCounter, "retry_after_ns", f.backoff.NextWait()) From 8172358d60f647cb606756dec5a95adf4022f45b Mon Sep 17 00:00:00 2001 From: Craig MacKenzie Date: Wed, 12 Oct 2022 15:40:44 -0400 Subject: [PATCH 8/9] Fix changelog wording. --- .../fragments/1665517984-improve-checkin-error-logging.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog/fragments/1665517984-improve-checkin-error-logging.yaml b/changelog/fragments/1665517984-improve-checkin-error-logging.yaml index 631a733e829..b499e3dc151 100644 --- a/changelog/fragments/1665517984-improve-checkin-error-logging.yaml +++ b/changelog/fragments/1665517984-improve-checkin-error-logging.yaml @@ -1,4 +1,4 @@ kind: enhancement -summary: improve-checkin-error-logging +summary: Improve logging of Fleet check-in errors. pr: 1477 issue: 1154 From 29e3837e717db3139dc5485626867470a6cb0df6 Mon Sep 17 00:00:00 2001 From: Craig MacKenzie Date: Wed, 12 Oct 2022 16:12:16 -0400 Subject: [PATCH 9/9] Fix broken test, improve changelog. --- .../fragments/1665517984-improve-checkin-error-logging.yaml | 1 + .../pkg/agent/application/gateway/fleet/fleet_gateway_test.go | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/changelog/fragments/1665517984-improve-checkin-error-logging.yaml b/changelog/fragments/1665517984-improve-checkin-error-logging.yaml index b499e3dc151..7bf2777d9d5 100644 --- a/changelog/fragments/1665517984-improve-checkin-error-logging.yaml +++ b/changelog/fragments/1665517984-improve-checkin-error-logging.yaml @@ -1,4 +1,5 @@ kind: enhancement summary: Improve logging of Fleet check-in errors. +description: Improve logging of Fleet check-in errors and only report the local state as degraded after two consecutive failed check-ins. pr: 1477 issue: 1154 diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go index 5da8a966ce9..1860782a1e7 100644 --- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go +++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway_test.go @@ -707,8 +707,8 @@ func TestRetriesOnFailures(t *testing.T) { queue.On("Actions").Return([]fleetapi.Action{}) localReporter := &testutils.MockReporter{} - localReporter.On("Update", state.Degraded, mock.Anything, mock.Anything).Times(2) - localReporter.On("Update", mock.Anything, mock.Anything, mock.Anything).Maybe() + // The local state should only be reported as degraded after two consecutive failures. + localReporter.On("Update", state.Degraded, mock.Anything, mock.Anything).Once() localReporter.On("Unregister").Maybe() fleetReporter := &testutils.MockReporter{}
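
Illustrative sketch only, not part of the patch series above: a minimal retry loop showing how the NextWait() method added to the Backoff interface in this series can be used to log the upcoming retry delay before blocking, similar to the retry_after_ns field the gateway now logs in executeCheckinWithRetries. It assumes the snippet is built inside the elastic-agent module, since internal/pkg/core/backoff cannot be imported from other modules; the failure here is simulated.

// checkin_retry_sketch.go (hypothetical file, for illustration)
package main

import (
	"errors"
	"fmt"
	"time"

	"github.com/elastic/elastic-agent/internal/pkg/core/backoff"
)

func main() {
	done := make(chan struct{})
	defer close(done)

	// Exponential backoff between retries, as used by the fleet gateway.
	b := backoff.NewExpBackoff(done, 100*time.Millisecond, 5*time.Second)
	b.Reset()

	for attempt := 1; attempt <= 3; attempt++ {
		err := errors.New("simulated checkin failure")
		// NextWait() reports how long the following Wait() call will block,
		// so the retry delay can be logged before waiting.
		fmt.Printf("checkin failed (attempt %d): %v, retrying in %s\n",
			attempt, err, b.NextWait())
		if !b.Wait() {
			return // done channel closed; stop retrying
		}
	}
}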