Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

logging: Attempt to recover logmon failures #5577

Merged
merged 2 commits into from
Apr 22, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 13 additions & 6 deletions client/allocrunner/taskrunner/logmon_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,22 +95,29 @@ func reattachConfigFromHookData(data map[string]string) (*plugin.ReattachConfig,
func (h *logmonHook) Prestart(ctx context.Context,
req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error {

// Create a logmon client by reattaching or launching a new instance
if h.logmonPluginClient == nil || h.logmonPluginClient.Exited() {
// Attempt to reattach to logmon
if h.logmonPluginClient == nil {
reattachConfig, err := reattachConfigFromHookData(req.PreviousState)
if err != nil {
h.logger.Error("failed to load reattach config", "error", err)
return err
}
if reattachConfig != nil {
if err := h.launchLogMon(reattachConfig); err != nil {
h.logger.Warn("failed to reattach to logmon process", "error", err)
}
}

}

// Launch or reattach logmon instance for the task.
if err := h.launchLogMon(reattachConfig); err != nil {
// Retry errors launching logmon as logmon may have crashed and
// We did not reattach to a plugin and one is still not running.
if h.logmonPluginClient == nil || h.logmonPluginClient.Exited() {
if err := h.launchLogMon(nil); err != nil {
// Retry errors launching logmon as logmon may have crashed on start and
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ugh I wrote this comment and I can't make out what it means. Could you rewrite this into something intelligible. We want to convey that subsequent attempts to launch logmon will likely error at this point so a recoverable error is returned so the task can reattempt to start.

// subsequent attempts will start a new one.
h.logger.Error("failed to launch logmon process", "error", err)
return structs.NewRecoverableError(err, true)
}

}

err := h.logmon.Start(&logmon.LogConfig{
Expand Down
9 changes: 4 additions & 5 deletions client/allocrunner/taskrunner/logmon_hook_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,13 @@ import (
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/require"
)

// TestTaskRunner_LogmonHook_StartCrashStop simulates logmon crashing while the
// Nomad client is restarting and asserts failing to reattach to logmon causes
// a recoverable error (task restart).
// nomad to spawn a new logmon.
func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
t.Parallel()

Expand All @@ -46,6 +45,7 @@ func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
require.NoError(t, hook.Prestart(context.Background(), &req, &resp))
defer hook.Stop(context.Background(), nil, nil)

origState := resp.State
origHookData := resp.State[logmonReattachKey]
require.NotEmpty(t, origHookData)

Expand Down Expand Up @@ -80,9 +80,8 @@ func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
}
resp = interfaces.TaskPrestartResponse{}
err = hook.Prestart(context.Background(), &req, &resp)
require.Error(t, err)
require.True(t, structs.IsRecoverable(err))
require.Empty(t, resp.State)
require.NoError(t, err)
require.NotEqual(t, origState, resp.State)

// Running stop should shutdown logmon
require.NoError(t, hook.Stop(context.Background(), nil, nil))
Expand Down