Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

logmon: recover from shutting down call locally #5616

Merged
merged 7 commits into from
Apr 26, 2019
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 35 additions & 8 deletions client/allocrunner/taskrunner/logmon_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ import (
"fmt"
"path/filepath"
"runtime"
"time"

hclog "github.com/hashicorp/go-hclog"
plugin "github.com/hashicorp/go-plugin"
"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/client/logmon"
"github.com/hashicorp/nomad/helper/uuid"
"github.com/hashicorp/nomad/nomad/structs"
bstructs "github.com/hashicorp/nomad/plugins/base/structs"
pstructs "github.com/hashicorp/nomad/plugins/shared/structs"
)

Expand Down Expand Up @@ -95,7 +97,37 @@ func reattachConfigFromHookData(data map[string]string) (*plugin.ReattachConfig,
func (h *logmonHook) Prestart(ctx context.Context,
req *interfaces.TaskPrestartRequest, resp *interfaces.TaskPrestartResponse) error {

// Attempt to reattach to logmon
tries := 0
for {
err := h.prestartOneLoop(ctx, req)
if err == bstructs.ErrPluginShutdown {
h.logger.Warn("logmon shutdown while making request", "error", err)

if tries > 3 {
return err
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe log here that its out of retries before returning error?

}

// retry after killing process and ensure we start a new logmon process
tries++
h.logmonPluginClient.Kill()
time.Sleep(1 * time.Second)
continue
} else if err != nil {
return err
}

rCfg := pstructs.ReattachConfigFromGoPlugin(h.logmonPluginClient.ReattachConfig())
jsonCfg, err := json.Marshal(rCfg)
if err != nil {
return err
}
resp.State = map[string]string{logmonReattachKey: string(jsonCfg)}
return nil
}
}

func (h *logmonHook) prestartOneLoop(ctx context.Context, req *interfaces.TaskPrestartRequest) error {
// attach to a running logmon if state indicates one
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I opted not to change the logic here, as the logic is somewhat brittle and I don't want to make it worse. But the idea is that if the gRPC call fails with "shutting down", h.logmonPluginClient would be non-nil and would be marked as Exited, so we'll create a new logmon instance on retries.

if h.logmonPluginClient == nil {
reattachConfig, err := reattachConfigFromHookData(req.PreviousState)
if err != nil {
Expand All @@ -105,12 +137,13 @@ func (h *logmonHook) Prestart(ctx context.Context,
if reattachConfig != nil {
if err := h.launchLogMon(reattachConfig); err != nil {
h.logger.Warn("failed to reattach to logmon process", "error", err)
// if we failed to launch logmon, try again below
}
}

}

// We did not reattach to a plugin and one is still not running.
// create a new client in initial starts, failed reattachment, or if we detect exits
if h.logmonPluginClient == nil || h.logmonPluginClient.Exited() {
if err := h.launchLogMon(nil); err != nil {
// Retry errors launching logmon as logmon may have crashed on start and
Expand All @@ -134,12 +167,6 @@ func (h *logmonHook) Prestart(ctx context.Context,
return err
}

rCfg := pstructs.ReattachConfigFromGoPlugin(h.logmonPluginClient.ReattachConfig())
jsonCfg, err := json.Marshal(rCfg)
if err != nil {
return err
}
resp.State = map[string]string{logmonReattachKey: string(jsonCfg)}
return nil
}

Expand Down
74 changes: 74 additions & 0 deletions client/allocrunner/taskrunner/logmon_hook_unix_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"os"
"syscall"
"testing"
"time"

"github.com/hashicorp/nomad/client/allocrunner/interfaces"
"github.com/hashicorp/nomad/helper/testlog"
Expand Down Expand Up @@ -86,3 +87,76 @@ func TestTaskRunner_LogmonHook_StartCrashStop(t *testing.T) {
// Running stop should shutdown logmon
require.NoError(t, hook.Stop(context.Background(), nil, nil))
}

// TestTaskRunner_LogmonHook_ShutdownMidStart simulates logmon crashing while the
// Nomad client is calling Start() and asserts that we recover and spawn a new logmon.
func TestTaskRunner_LogmonHook_ShutdownMidStart(t *testing.T) {
t.Parallel()

alloc := mock.BatchAlloc()
task := alloc.Job.TaskGroups[0].Tasks[0]

dir, err := ioutil.TempDir("", "nomadtest")
require.NoError(t, err)
defer func() {
require.NoError(t, os.RemoveAll(dir))
}()

hookConf := newLogMonHookConfig(task.Name, dir)
hook := newLogMonHook(hookConf, testlog.HCLogger(t))

req := interfaces.TaskPrestartRequest{
Task: task,
}
resp := interfaces.TaskPrestartResponse{}

// First start
require.NoError(t, hook.Prestart(context.Background(), &req, &resp))
defer hook.Stop(context.Background(), nil, nil)

origState := resp.State
origHookData := resp.State[logmonReattachKey]
require.NotEmpty(t, origHookData)

// Pluck PID out of reattach synthesize a crash
reattach := struct {
Pid int
}{}
require.NoError(t, json.Unmarshal([]byte(origHookData), &reattach))
pid := reattach.Pid
require.NotZero(t, pid)

proc, _ := os.FindProcess(pid)

// Assert logmon is running
require.NoError(t, proc.Signal(syscall.Signal(0)))

// SIGSTOP would freeze process without it being considered
// exited; so this causes process to be non-exited at beginning of call
// then we kill process while Start call is running
require.NoError(t, proc.Signal(syscall.SIGSTOP))
// sleep for the signal to take effect
time.Sleep(1 * time.Second)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How can you guarantee that the sleep is sufficient? Wondering if this will be flaky in CI

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know of a good way to test if a process is stopped - unlike killing. If it's flaky in CI, we can react - but considering that we will overhaul logmon plugin method to be inline with other plugin clients and this code would probably change, I'd avoid overengineering and react to CI problems as they come.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I take that back :). ended up using gopsutil.Process utility to verify process sleep status in 658a734!


go func() {
time.Sleep(2 * time.Second)

proc.Signal(syscall.SIGCONT)
proc.Signal(os.Kill)
}()

req.PreviousState = map[string]string{
logmonReattachKey: origHookData,
}

initLogmon, initClient := hook.logmon, hook.logmonPluginClient

resp = interfaces.TaskPrestartResponse{}
err = hook.Prestart(context.Background(), &req, &resp)
require.NoError(t, err)
require.NotEqual(t, origState, resp.State)

// assert that we got a new client and logmon
require.True(t, initLogmon != hook.logmon)
require.True(t, initClient != hook.logmonPluginClient)
}
8 changes: 6 additions & 2 deletions client/logmon/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,14 @@ import (
"context"

"github.com/hashicorp/nomad/client/logmon/proto"
"github.com/hashicorp/nomad/helper/pluginutils/grpcutils"
)

// logmonClient is the gRPC client-side implementation of the logmon plugin
// used by the Nomad client to drive a logmon process.
type logmonClient struct {
	client proto.LogMonClient

	// doneCtx is closed when the plugin exits
	doneCtx context.Context
}

func (c *logmonClient) Start(cfg *LogConfig) error {
Expand All @@ -21,11 +25,11 @@ func (c *logmonClient) Start(cfg *LogConfig) error {
StderrFifo: cfg.StderrFifo,
}
_, err := c.client.Start(context.Background(), req)
return err
return grpcutils.HandleGrpcErr(err, c.doneCtx)
}

// Stop asks the remote logmon process to stop log collection. Any gRPC error
// is passed through HandleGrpcErr with the plugin's done context so that
// failures caused by the plugin process exiting are surfaced as a
// plugin-shutdown error rather than a raw gRPC error.
func (c *logmonClient) Stop() error {
	req := &proto.StopRequest{}
	_, err := c.client.Stop(context.Background(), req)
	return grpcutils.HandleGrpcErr(err, c.doneCtx)
}
5 changes: 4 additions & 1 deletion client/logmon/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,8 @@ func (p *Plugin) GRPCServer(broker *plugin.GRPCBroker, s *grpc.Server) error {
}

func (p *Plugin) GRPCClient(ctx context.Context, broker *plugin.GRPCBroker, c *grpc.ClientConn) (interface{}, error) {
return &logmonClient{client: proto.NewLogMonClient(c)}, nil
return &logmonClient{
doneCtx: ctx,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not one 100% percent following the plugin ctx and how it's set during the initialization phase - but this follows the pattern in other driver plugins , e.g. https://github.com/hashicorp/nomad/blob/v0.9.0/plugins/drivers/plugin.go#L39-L49 .

Interestingly the logmon plugin client doesn't embed BasePluginClient, so might want to follow up on that with 0.9.2.

client: proto.NewLogMonClient(c),
}, nil
}