-
Notifications
You must be signed in to change notification settings - Fork 4.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Delay the restart of application when a status report of failure is given #25339
Changes from all commits
1097324
2f89ae9
f754066
c6977d2
377e45e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -5,7 +5,9 @@ | |
package process | ||
|
||
import ( | ||
"context" | ||
"fmt" | ||
"time" | ||
|
||
"gopkg.in/yaml.v2" | ||
|
||
|
@@ -35,21 +37,74 @@ func (a *Application) OnStatusChange(s *server.ApplicationState, status proto.St | |
return | ||
} | ||
|
||
// kill the process | ||
if a.state.ProcessInfo != nil { | ||
_ = a.state.ProcessInfo.Process.Kill() | ||
a.state.ProcessInfo = nil | ||
} | ||
ctx := a.startContext | ||
tag := a.tag | ||
|
||
// it was marshalled to pass into the state, so unmarshall will always succeed | ||
var cfg map[string]interface{} | ||
_ = yaml.Unmarshal([]byte(s.Config()), &cfg) | ||
|
||
err := a.start(ctx, tag, cfg) | ||
if err != nil { | ||
a.setState(state.Crashed, fmt.Sprintf("failed to restart: %s", err), nil) | ||
// start the failed timer | ||
a.startFailedTimer(cfg) | ||
} else { | ||
a.stopFailedTimer() | ||
} | ||
} | ||
|
||
// startFailedTimer arms a one-shot timer that calls restart with the stored config once | |
// a.processConfig.FailureTimeout elapses, unless the timer's context is cancelled first | |
// (by stopFailedTimer, or by cancellation of a.startContext, from which it is derived). | |
// | |
// If a timer is already pending, only the stored config is replaced; the pending timer | |
// is neither reset nor restarted. | |
// | |
// This does not grab the appLock, that must be managed by the caller. | |
func (a *Application) startFailedTimer(cfg map[string]interface{}) { | |
if a.restartCanceller != nil { | |
// already have running failed timer; just update config | |
a.restartConfig = cfg | |
return | |
} | |
|
||
// a non-nil restartCanceller doubles as the "timer is pending" flag checked above | |
ctx, cancel := context.WithCancel(a.startContext) | |
a.restartCanceller = cancel | |
a.restartConfig = cfg | |
t := time.NewTimer(a.processConfig.FailureTimeout) | |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this start quick with exponential backoff to limit? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Feel like that would make it even more complicated, and harder to understand the time interval in log messages. A constant time allows for log messages to be clear that its every 10 seconds (or whatever setting value set) it is restarting. |
||
go func() { | |
// on exit, clear the timer bookkeeping under appLock so a later call can arm a new timer | |
// NOTE(review): this also runs after a cancel from stopFailedTimer; if a new timer was armed | |
// in between, its canceller and config would be cleared here — confirm this cannot happen | |
defer func() { | |
a.appLock.Lock() | |
a.restartCanceller = nil | |
a.restartConfig = nil | |
a.appLock.Unlock() | |
}() | |
|
||
select { | |
case <-ctx.Done(): | |
// cancelled before the timeout elapsed; exit without restarting | |
// NOTE(review): t is not stopped on this path | |
return | |
case <-t.C: | |
// NOTE(review): a.restartConfig is read here without holding appLock, while the | |
// writers are documented as running under it — confirm this read is safe | |
a.restart(a.restartConfig) | |
} | |
}() | |
} | |
|
||
// stopFailedTimer cancels the pending failed timer, if there is one, so that a goroutine | |
// still waiting on it returns without calling restart. | |
// | |
// This does not grab the appLock, that must be managed by the caller. | |
func (a *Application) stopFailedTimer() { | |
// no canceller means no timer is pending; nothing to do | |
if a.restartCanceller == nil { | |
return | |
} | |
// cancelling the context wakes the timer goroutine through its ctx.Done() case | |
a.restartCanceller() | |
// clear the canceller so the next startFailedTimer call arms a fresh timer | |
a.restartCanceller = nil | |
} | |
|
||
// restart kills the currently tracked process, if any, and starts the application again | |
// with the given config. | |
// | |
// It takes the appLock itself and holds it for the whole call. If the start fails, the | |
// application state is set to Crashed with the start error in the message. | |
// | |
// NOTE(review): unlike OnStatusChange, a failed start here does not re-arm the failed | |
// timer — confirm that is intended. | |
func (a *Application) restart(cfg map[string]interface{}) { | |
a.appLock.Lock() | |
defer a.appLock.Unlock() | |
|
||
// kill the process | |
// the error returned by Kill is discarded | |
if a.state.ProcessInfo != nil { | |
_ = a.state.ProcessInfo.Process.Kill() | |
a.state.ProcessInfo = nil | |
} | |
ctx := a.startContext | |
tag := a.tag | |
|
||
err := a.start(ctx, tag, cfg) | |
if err != nil { | |
a.setState(state.Crashed, fmt.Sprintf("failed to restart: %s", err), nil) | |
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we add here a bit more info which process (name?) failed to restart? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That is there, that is managed inside of the |
||
} | |
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unrelated to this PR but I don't think we should swallow the errors here.