From 269e2c00fb66e5645ba9582108d25624d03ffa8b Mon Sep 17 00:00:00 2001
From: Danielle Lancashire <dani@hashicorp.com>
Date: Thu, 18 Apr 2019 12:50:49 +0200
Subject: [PATCH 1/2] loggging: Attempt to recover logmon failures

Currently, when logmon fails to reattach, we will retry reattachment to
the same pid until the task restart specification is exhausted.

Because we cannot clear hook state during error conditions, it is not
possible for us to signal to a future restart that it _shouldn't_
attempt to reattach to the plugin.

Here we revert to explicitly detecting reattachment seperately from a
launch of a new logmon, so we can recover from scenarios where a logmon
plugin has failed.

This is a net improvement over the current hard failure situation, as it
means in the most common case (the pid has gone away), we can recover.

Other reattachment failure modes where the plugin may still be running
could potentially cause a duplicate process, or a subsequent failure to launch
a new plugin.

If there was a duplicate process, it could potentially cause duplicate
logging. This is better than a production workload outage.

If there was a subsequent failure to launch a new plugin, it would fail
in the same (retry until restarts are exhausted) as the current failure
mode.
---
 client/allocrunner/taskrunner/logmon_hook.go  | 19 +++++++++++++------
 .../taskrunner/logmon_hook_unix_test.go       |  9 ++++-----
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/client/allocrunner/taskrunner/logmon_hook.go b/client/allocrunner/taskrunner/logmon_hook.go
index 7c68aa0f7b69..05b2f613a721 100644
--- a/client/allocrunner/taskrunner/logmon_hook.go
+++ b/client/allocrunner/taskrunner/logmon_hook.go
@@ -95,22 +95,29 @@ func reattachConfigFromHookData(data map[string]string) (*plugin.ReattachConfig,
 func (h *logmonHook) Prestart(ctx context.Context,
 	req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error {
 
-	// Create a logmon client by reattaching or launching a new instance
-	if h.logmonPluginClient == nil || h.logmonPluginClient.Exited() {
+	// Attempt to reattach to logmon
+	if h.logmonPluginClient == nil {
 		reattachConfig, err := reattachConfigFromHookData(req.PreviousState)
 		if err != nil {
 			h.logger.Error("failed to load reattach config", "error", err)
 			return err
 		}
+		if reattachConfig != nil {
+			if err := h.launchLogMon(reattachConfig); err != nil {
+				h.logger.Error("failed to reattach to logmon process", "error", err)
+			}
+		}
+
+	}
 
-		// Launch or reattach logmon instance for the task.
-		if err := h.launchLogMon(reattachConfig); err != nil {
-			// Retry errors launching logmon as logmon may have crashed and
+	// We did not reattach to a plugin and one is still not running.
+	if h.logmonPluginClient == nil || h.logmonPluginClient.Exited() {
+		if err := h.launchLogMon(nil); err != nil {
+			// Retry errors launching logmon as logmon may have crashed on start and
 			// subsequent attempts will start a new one.
 			h.logger.Error("failed to launch logmon process", "error", err)
 			return structs.NewRecoverableError(err, true)
 		}
-
 	}
 
 	err := h.logmon.Start(&logmon.LogConfig{
diff --git a/client/allocrunner/taskrunner/logmon_hook_unix_test.go b/client/allocrunner/taskrunner/logmon_hook_unix_test.go
index 0dac426376fd..582692a27bde 100644
--- a/client/allocrunner/taskrunner/logmon_hook_unix_test.go
+++ b/client/allocrunner/taskrunner/logmon_hook_unix_test.go
@@ -14,14 +14,13 @@ import (
 	"github.com/hashicorp/nomad/client/allocrunner/interfaces"
 	"github.com/hashicorp/nomad/helper/testlog"
 	"github.com/hashicorp/nomad/nomad/mock"
-	"github.com/hashicorp/nomad/nomad/structs"
 	"github.com/hashicorp/nomad/testutil"
 	"github.com/stretchr/testify/require"
 )
 
 // TestTaskRunner_LogmonHook_StartCrashStop simulates logmon crashing while the
 // Nomad client is restarting and asserts failing to reattach to logmon causes
-// a recoverable error (task restart).
+// nomad to spawn a new logmon.
 func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
 	t.Parallel()
 
@@ -46,6 +45,7 @@ func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
 	require.NoError(t, hook.Prestart(context.Background(), &req, &resp))
 	defer hook.Stop(context.Background(), nil, nil)
 
+	origState := resp.State
 	origHookData := resp.State[logmonReattachKey]
 	require.NotEmpty(t, origHookData)
 
@@ -80,9 +80,8 @@ func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
 	}
 	resp = interfaces.TaskPrestartResponse{}
 	err = hook.Prestart(context.Background(), &req, &resp)
-	require.Error(t, err)
-	require.True(t, structs.IsRecoverable(err))
-	require.Empty(t, resp.State)
+	require.NoError(t, err)
+	require.NotEqual(t, origState, resp.State)
 
 	// Running stop should shutdown logmon
 	require.NoError(t, hook.Stop(context.Background(), nil, nil))

From 0f91277d852cd1f5ef2f37d240b6beac394f23c5 Mon Sep 17 00:00:00 2001
From: Michael Schurter <michael.schurter@gmail.com>
Date: Mon, 22 Apr 2019 14:40:17 -0400
Subject: [PATCH 2/2] tweak logging level for failed log line

Co-Authored-By: notnoop <mahmood@notnoop.com>
---
 client/allocrunner/taskrunner/logmon_hook.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/client/allocrunner/taskrunner/logmon_hook.go b/client/allocrunner/taskrunner/logmon_hook.go
index 05b2f613a721..1f58d7e33644 100644
--- a/client/allocrunner/taskrunner/logmon_hook.go
+++ b/client/allocrunner/taskrunner/logmon_hook.go
@@ -104,7 +104,7 @@ func (h *logmonHook) Prestart(ctx context.Context,
 		}
 		if reattachConfig != nil {
 			if err := h.launchLogMon(reattachConfig); err != nil {
-				h.logger.Error("failed to reattach to logmon process", "error", err)
+				h.logger.Warn("failed to reattach to logmon process", "error", err)
 			}
 		}