diff --git a/CHANGELOG.next.asciidoc b/CHANGELOG.next.asciidoc
index faef2861ba9..7d500cf8f53 100644
--- a/CHANGELOG.next.asciidoc
+++ b/CHANGELOG.next.asciidoc
@@ -119,6 +119,7 @@
 - Remove fleet event reporter and events from checkin body. {issue}993[993]
 - Fix unintended reset of source URI when downloading components {pull}1252[1252]
 - Create separate status reporter for local only events so that degraded fleet-checkins no longer affect health on successful fleet-checkins. {issue}1157[1157] {pull}1285[1285]
+- Add success log message after previous checkin failures {pull}1327[1327]
 
 ==== New features
 
diff --git a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go
index 6df9f171fbe..b88a0cafee0 100644
--- a/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go
+++ b/internal/pkg/agent/application/gateway/fleet/fleet_gateway.go
@@ -210,7 +210,7 @@ func (f *fleetGateway) worker() {
 				f.statusReporter.Update(state.Failed, errMsg, nil)
 			} else {
 				f.statusReporter.Update(state.Healthy, "", nil)
-				f.localReporter.Update(state.Healthy, "", nil) // we don't need to specifically set the local reporter to failed above, but it needs to be reset to healthy if a checking succeeds
+				f.localReporter.Update(state.Healthy, "", nil) // we don't need to specifically set the local reporter to failed above, but it needs to be reset to healthy if a checkin succeeds
 			}
 
 		case <-f.bgContext.Done():
@@ -280,11 +280,11 @@ func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) {
 	// Guard if the context is stopped by a out of bound call,
 	// this mean we are rebooting to change the log level or the system is shutting us down.
 	for f.bgContext.Err() == nil {
-		f.log.Debugf("Checking started")
+		f.log.Debugf("Checkin started")
 		resp, err := f.execute(f.bgContext)
 		if err != nil {
 			f.checkinFailCounter++
-			f.log.Errorf("Could not communicate with fleet-server Checking API will retry, error: %s", err)
+			f.log.Errorf("Could not communicate with fleet-server checkin API will retry, error: %s", err)
 			if !f.backoff.Wait() {
 				// Something bad has happened and we log it and we should update our current state.
 				err := errors.New(
@@ -299,10 +299,16 @@ func (f *fleetGateway) doExecute() (*fleetapi.CheckinResponse, error) {
 			}
 			if f.checkinFailCounter > 1 {
 				f.localReporter.Update(state.Degraded, fmt.Sprintf("checkin failed: %v", err), nil)
-				f.log.Errorf("checking number %d failed: %s", f.checkinFailCounter, err.Error())
+				f.log.Errorf("checkin number %d failed: %s", f.checkinFailCounter, err.Error())
 			}
 			continue
 		}
+
+		if f.checkinFailCounter > 0 {
+			// Log at same level as error logs above so subsequent successes are visible when log level is set to 'error'.
+			f.log.Errorf("Checkin request to fleet-server succeeded after %d failures", f.checkinFailCounter)
+		}
+
 		f.checkinFailCounter = 0
 		// Request was successful, return the collected actions.
 		return resp, nil
@@ -338,7 +344,7 @@ func (f *fleetGateway) execute(ctx context.Context) (*fleetapi.CheckinResponse,
 	f.unauthCounter++
 
 	if f.shouldUnenroll() {
-		f.log.Warnf("retrieved an invalid api key error '%d' times. Starting to unenroll the elastic agent.", f.unauthCounter)
+		f.log.Warnf("received an invalid api key error '%d' times. Starting to unenroll the elastic agent.", f.unauthCounter)
 		return &fleetapi.CheckinResponse{
 			Actions: []fleetapi.Action{&fleetapi.ActionUnenroll{ActionID: "", ActionType: "UNENROLL", IsDetected: true}},
 		}, nil
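
For context, a minimal, self-contained Go sketch of the retry pattern the gateway change implements: count consecutive checkin failures and, once a checkin finally succeeds, log the recovery at the same level as the failure messages so it remains visible when the log level is restricted. This is an illustration under stated assumptions, not the agent's code: newFlakyCheckin and the standard library logger are stand-ins for fleetGateway.execute and the agent's logger, and the backoff wait between retries is omitted.

package main

import (
	"errors"
	"log"
)

// newFlakyCheckin returns a hypothetical checkin function that fails the
// first n calls and succeeds afterwards (a stand-in for the Fleet Server call).
func newFlakyCheckin(n int) func() error {
	calls := 0
	return func() error {
		calls++
		if calls <= n {
			return errors.New("fleet-server unreachable")
		}
		return nil
	}
}

func main() {
	checkin := newFlakyCheckin(2) // simulate two failed checkins before a success
	checkinFailCounter := 0

	for {
		log.Println("checkin started")
		if err := checkin(); err != nil {
			checkinFailCounter++
			// Each failure is logged; in the agent, more than one consecutive
			// failure also marks the local status reporter as Degraded.
			// (The real gateway also waits on a backoff here; omitted.)
			log.Printf("checkin number %d failed: %v", checkinFailCounter, err)
			continue
		}
		if checkinFailCounter > 0 {
			// The behaviour added by the diff: report the recovery at the same
			// level as the failure logs so operators see the failures ended.
			log.Printf("checkin succeeded after %d failures", checkinFailCounter)
		}
		checkinFailCounter = 0
		return // request was successful
	}
}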