From 109732404e0d6b087632fb6436c0b756f771f3b5 Mon Sep 17 00:00:00 2001 From: Blake Rouse Date: Tue, 27 Apr 2021 11:50:21 -0400 Subject: [PATCH 1/4] Delay the restart of application when a status report of failure is given. --- .../elastic-agent/pkg/agent/cmd/enroll_cmd.go | 7 -- .../pkg/core/plugin/process/app.go | 4 +- .../pkg/core/plugin/process/status.go | 82 ++++++++++++++++--- 3 files changed, 74 insertions(+), 19 deletions(-) diff --git a/x-pack/elastic-agent/pkg/agent/cmd/enroll_cmd.go b/x-pack/elastic-agent/pkg/agent/cmd/enroll_cmd.go index 9b51d4c3692..ba88783d39b 100644 --- a/x-pack/elastic-agent/pkg/agent/cmd/enroll_cmd.go +++ b/x-pack/elastic-agent/pkg/agent/cmd/enroll_cmd.go @@ -613,13 +613,6 @@ func waitForFleetServer(ctx context.Context, agentSubproc <-chan *os.ProcessStat } resChan <- waitResult{enrollmentToken: token} break - } else if app.Status == proto.Status_FAILED { - // app completely failed; exit now - if app.Message != "" { - log.Infof("Fleet Server - %s", app.Message) - } - resChan <- waitResult{err: errors.New(app.Message)} - break } if app.Message != "" { appMsg := fmt.Sprintf("Fleet Server - %s", app.Message) diff --git a/x-pack/elastic-agent/pkg/core/plugin/process/app.go b/x-pack/elastic-agent/pkg/core/plugin/process/app.go index 9f52f74ce38..c0c6341cbd3 100644 --- a/x-pack/elastic-agent/pkg/core/plugin/process/app.go +++ b/x-pack/elastic-agent/pkg/core/plugin/process/app.go @@ -58,7 +58,9 @@ type Application struct { logger *logger.Logger - appLock sync.Mutex + appLock sync.Mutex + restartCanceller context.CancelFunc + restartConfig map[string]interface{} } // ArgsDecorator decorates arguments before calling an application diff --git a/x-pack/elastic-agent/pkg/core/plugin/process/status.go b/x-pack/elastic-agent/pkg/core/plugin/process/status.go index 21ded667101..5f531d125e6 100644 --- a/x-pack/elastic-agent/pkg/core/plugin/process/status.go +++ b/x-pack/elastic-agent/pkg/core/plugin/process/status.go @@ -5,7 +5,9 @@ package process import ( + "context" "fmt" + "time" "gopkg.in/yaml.v2" @@ -15,6 +17,11 @@ import ( "github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/core/state" ) +const ( + // FailedRestartTimeout is the amount of time an Application can sit in Failed status before it is restarted. + FailedRestartTimeout = 10 * time.Second +) + // OnStatusChange is the handler called by the GRPC server code. // // It updates the status of the application and handles restarting the application if needed. @@ -35,21 +42,74 @@ func (a *Application) OnStatusChange(s *server.ApplicationState, status proto.St return } - // kill the process - if a.state.ProcessInfo != nil { - _ = a.state.ProcessInfo.Process.Kill() - a.state.ProcessInfo = nil - } - ctx := a.startContext - tag := a.tag - // it was marshalled to pass into the state, so unmarshall will always succeed var cfg map[string]interface{} _ = yaml.Unmarshal([]byte(s.Config()), &cfg) - err := a.start(ctx, tag, cfg) - if err != nil { - a.setState(state.Crashed, fmt.Sprintf("failed to restart: %s", err), nil) + // start the failed timer + a.startFailedTimer(cfg) + } else { + a.stopFailedTimer() + } +} + +// startFailedTimer starts a timer that will restart the application if it doesn't exit failed after a period of time. +// +// This does not grab the appLock, that must be managed by the caller. +func (a *Application) startFailedTimer(cfg map[string]interface{}) { + if a.restartCanceller != nil { + // already have running failed timer; just update config + a.restartConfig = cfg + return + } + + ctx, cancel := context.WithCancel(a.startContext) + a.restartCanceller = cancel + a.restartConfig = cfg + t := time.NewTimer(FailedRestartTimeout) + go func() { + defer func() { + a.appLock.Lock() + a.restartCanceller = nil + a.restartConfig = nil + a.appLock.Unlock() + }() + + select { + case <-ctx.Done(): + return + case <-t.C: + a.restart(a.restartConfig) } + }() +} + +// stopFailedTimer stops the timer that would restart the application from reporting failure. +// +// This does not grab the appLock, that must be managed by the caller. +func (a *Application) stopFailedTimer() { + if a.restartCanceller == nil { + return + } + a.restartCanceller() + a.restartCanceller = nil +} + +// restart restarts the application +func (a *Application) restart(cfg map[string]interface{}) { + a.appLock.Lock() + defer a.appLock.Unlock() + + // kill the process + if a.state.ProcessInfo != nil { + _ = a.state.ProcessInfo.Process.Kill() + a.state.ProcessInfo = nil + } + ctx := a.startContext + tag := a.tag + + err := a.start(ctx, tag, cfg) + if err != nil { + a.setState(state.Crashed, fmt.Sprintf("failed to restart: %s", err), nil) } } From 2f89ae9ae64d93141e90f6ba031ff45e5767c52f Mon Sep 17 00:00:00 2001 From: Blake Rouse Date: Tue, 27 Apr 2021 11:55:00 -0400 Subject: [PATCH 2/4] Add changelog. --- x-pack/elastic-agent/CHANGELOG.next.asciidoc | 1 + 1 file changed, 1 insertion(+) diff --git a/x-pack/elastic-agent/CHANGELOG.next.asciidoc b/x-pack/elastic-agent/CHANGELOG.next.asciidoc index c3dc8b38ce2..c0019e656d7 100644 --- a/x-pack/elastic-agent/CHANGELOG.next.asciidoc +++ b/x-pack/elastic-agent/CHANGELOG.next.asciidoc @@ -56,6 +56,7 @@ - Fix issue with status and inspect inside of container {pull}25204[25204] - Remove FLEET_SERVER_POLICY_NAME env variable as it was not used {pull}25149[25149] - Reduce log level for listener cleanup to debug {pull}25274 +- Delay the restart of application when a status report of failure is given {pull}25339[25339] ==== New features From f754066e5aa1898a47741495ff4a43ce863969e8 Mon Sep 17 00:00:00 2001 From: Blake Rouse Date: Wed, 28 Apr 2021 07:37:11 -0400 Subject: [PATCH 3/4] Fix test and make it configurable. --- x-pack/elastic-agent/pkg/agent/operation/common_test.go | 4 +++- x-pack/elastic-agent/pkg/core/plugin/process/status.go | 7 +------ x-pack/elastic-agent/pkg/core/process/config.go | 2 ++ 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/x-pack/elastic-agent/pkg/agent/operation/common_test.go b/x-pack/elastic-agent/pkg/agent/operation/common_test.go index 505eadc8d08..f83ebc7f741 100644 --- a/x-pack/elastic-agent/pkg/agent/operation/common_test.go +++ b/x-pack/elastic-agent/pkg/agent/operation/common_test.go @@ -41,7 +41,9 @@ func getTestOperator(t *testing.T, downloadPath string, installPath string, p *a Delay: 3 * time.Second, MaxDelay: 10 * time.Second, }, - ProcessConfig: &process.Config{}, + ProcessConfig: &process.Config{ + FailureTimeout: 1, // restart instantly + }, DownloadConfig: &artifact.Config{ TargetDirectory: downloadPath, InstallPath: installPath, diff --git a/x-pack/elastic-agent/pkg/core/plugin/process/status.go b/x-pack/elastic-agent/pkg/core/plugin/process/status.go index 5f531d125e6..6838c0a18d4 100644 --- a/x-pack/elastic-agent/pkg/core/plugin/process/status.go +++ b/x-pack/elastic-agent/pkg/core/plugin/process/status.go @@ -17,11 +17,6 @@ import ( "github.com/elastic/beats/v7/x-pack/elastic-agent/pkg/core/state" ) -const ( - // FailedRestartTimeout is the amount of time an Application can sit in Failed status before it is restarted. - FailedRestartTimeout = 10 * time.Second -) - // OnStatusChange is the handler called by the GRPC server code. // // It updates the status of the application and handles restarting the application if needed. @@ -66,7 +61,7 @@ func (a *Application) startFailedTimer(cfg map[string]interface{}) { ctx, cancel := context.WithCancel(a.startContext) a.restartCanceller = cancel a.restartConfig = cfg - t := time.NewTimer(FailedRestartTimeout) + t := time.NewTimer(a.processConfig.FailureTimeout) go func() { defer func() { a.appLock.Lock() diff --git a/x-pack/elastic-agent/pkg/core/process/config.go b/x-pack/elastic-agent/pkg/core/process/config.go index 72e8e466720..532ca379457 100644 --- a/x-pack/elastic-agent/pkg/core/process/config.go +++ b/x-pack/elastic-agent/pkg/core/process/config.go @@ -10,6 +10,7 @@ import "time" type Config struct { SpawnTimeout time.Duration `yaml:"spawn_timeout" config:"spawn_timeout"` StopTimeout time.Duration `yaml:"stop_timeout" config:"stop_timeout"` + FailureTimeout time.Duration `yaml:"failure_timeout" config:"failure_timeout"` // TODO: cgroups and namespaces } @@ -19,5 +20,6 @@ func DefaultConfig() *Config { return &Config{ SpawnTimeout: 30 * time.Second, StopTimeout: 30 * time.Second, + FailureTimeout: 10 * time.Second, } } From 377e45e6bddc733def080ac2121322cb8c746b91 Mon Sep 17 00:00:00 2001 From: Blake Rouse Date: Wed, 28 Apr 2021 07:39:06 -0400 Subject: [PATCH 4/4] Run mage check --- x-pack/elastic-agent/CHANGELOG.next.asciidoc | 2 +- x-pack/elastic-agent/pkg/agent/operation/common_test.go | 2 +- x-pack/elastic-agent/pkg/core/process/config.go | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/x-pack/elastic-agent/CHANGELOG.next.asciidoc b/x-pack/elastic-agent/CHANGELOG.next.asciidoc index dcb118337eb..2ff301db695 100644 --- a/x-pack/elastic-agent/CHANGELOG.next.asciidoc +++ b/x-pack/elastic-agent/CHANGELOG.next.asciidoc @@ -56,9 +56,9 @@ - Fix issue with status and inspect inside of container {pull}25204[25204] - Remove FLEET_SERVER_POLICY_NAME env variable as it was not used {pull}25149[25149] - Reduce log level for listener cleanup to debug {pull}25274 -- Delay the restart of application when a status report of failure is given {pull}25339[25339] - Passing in policy id to container command works {pull}25352[25352] - Reduce log level for listener cleanup to debug {pull}25274[25274] +- Delay the restart of application when a status report of failure is given {pull}25339[25339] ==== New features diff --git a/x-pack/elastic-agent/pkg/agent/operation/common_test.go b/x-pack/elastic-agent/pkg/agent/operation/common_test.go index f83ebc7f741..7ebcb085555 100644 --- a/x-pack/elastic-agent/pkg/agent/operation/common_test.go +++ b/x-pack/elastic-agent/pkg/agent/operation/common_test.go @@ -42,7 +42,7 @@ func getTestOperator(t *testing.T, downloadPath string, installPath string, p *a MaxDelay: 10 * time.Second, }, ProcessConfig: &process.Config{ - FailureTimeout: 1, // restart instantly + FailureTimeout: 1, // restart instantly }, DownloadConfig: &artifact.Config{ TargetDirectory: downloadPath, diff --git a/x-pack/elastic-agent/pkg/core/process/config.go b/x-pack/elastic-agent/pkg/core/process/config.go index 532ca379457..4d12fc60f04 100644 --- a/x-pack/elastic-agent/pkg/core/process/config.go +++ b/x-pack/elastic-agent/pkg/core/process/config.go @@ -8,8 +8,8 @@ import "time" // Config for fine tuning new process type Config struct { - SpawnTimeout time.Duration `yaml:"spawn_timeout" config:"spawn_timeout"` - StopTimeout time.Duration `yaml:"stop_timeout" config:"stop_timeout"` + SpawnTimeout time.Duration `yaml:"spawn_timeout" config:"spawn_timeout"` + StopTimeout time.Duration `yaml:"stop_timeout" config:"stop_timeout"` FailureTimeout time.Duration `yaml:"failure_timeout" config:"failure_timeout"` // TODO: cgroups and namespaces @@ -18,8 +18,8 @@ type Config struct { // DefaultConfig creates a config with pre-set default values. func DefaultConfig() *Config { return &Config{ - SpawnTimeout: 30 * time.Second, - StopTimeout: 30 * time.Second, + SpawnTimeout: 30 * time.Second, + StopTimeout: 30 * time.Second, FailureTimeout: 10 * time.Second, } }