Skip to content

Commit

Permalink
Merge pull request #5577 from hashicorp/dani/b-logmon-unrecoverable
Browse files Browse the repository at this point in the history
logging: Attempt to recover logmon failures
  • Loading branch information
notnoop committed Apr 22, 2019
2 parents 8a0df40 + 0f91277 commit 151e0ae
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 11 deletions.
19 changes: 13 additions & 6 deletions client/allocrunner/taskrunner/logmon_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,22 +95,29 @@ func reattachConfigFromHookData(data map[string]string) (*plugin.ReattachConfig,
func (h *logmonHook) Prestart(ctx context.Context,
req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error {

// Create a logmon client by reattaching or launching a new instance
if h.logmonPluginClient == nil || h.logmonPluginClient.Exited() {
// Attempt to reattach to logmon
if h.logmonPluginClient == nil {
reattachConfig, err := reattachConfigFromHookData(req.PreviousState)
if err != nil {
h.logger.Error("failed to load reattach config", "error", err)
return err
}
if reattachConfig != nil {
if err := h.launchLogMon(reattachConfig); err != nil {
h.logger.Warn("failed to reattach to logmon process", "error", err)
}
}

}

// Launch or reattach logmon instance for the task.
if err := h.launchLogMon(reattachConfig); err != nil {
// Retry errors launching logmon as logmon may have crashed and
// We did not reattach to a plugin and one is still not running.
if h.logmonPluginClient == nil || h.logmonPluginClient.Exited() {
if err := h.launchLogMon(nil); err != nil {
// Retry errors launching logmon as logmon may have crashed on start and
// subsequent attempts will start a new one.
h.logger.Error("failed to launch logmon process", "error", err)
return structs.NewRecoverableError(err, true)
}

}

err := h.logmon.Start(&logmon.LogConfig{
Expand Down
9 changes: 4 additions & 5 deletions client/allocrunner/taskrunner/logmon_hook_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,13 @@ import (
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/helper/testlog"
"github.com/hashicorp/nomad/nomad/mock"
"github.com/hashicorp/nomad/nomad/structs"
"github.com/hashicorp/nomad/testutil"
"github.com/stretchr/testify/require"
)

// TestTaskRunner_LogmonHook_StartCrashStop simulates logmon crashing while the
// Nomad client is restarting and asserts failing to reattach to logmon causes
// a recoverable error (task restart).
// nomad to spawn a new logmon.
func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
t.Parallel()

Expand All @@ -46,6 +45,7 @@ func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
require.NoError(t, hook.Prestart(context.Background(), &req, &resp))
defer hook.Stop(context.Background(), nil, nil)

origState := resp.State
origHookData := resp.State[logmonReattachKey]
require.NotEmpty(t, origHookData)

Expand Down Expand Up @@ -80,9 +80,8 @@ func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
}
resp = interfaces.TaskPrestartResponse{}
err = hook.Prestart(context.Background(), &req, &resp)
require.Error(t, err)
require.True(t, structs.IsRecoverable(err))
require.Empty(t, resp.State)
require.NoError(t, err)
require.NotEqual(t, origState, resp.State)

// Running stop should shutdown logmon
require.NoError(t, hook.Stop(context.Background(), nil, nil))
Expand Down

0 comments on commit 151e0ae

Please sign in to comment.