Skip to content

Commit

Permalink
csi: exit plugin supervisor after 30s without initial connection
Browse files Browse the repository at this point in the history
The plugin supervisor registers the plugin in the `Poststart` hook, so
the task itself should be running. If the plugin can't communicate
with us after 30s, exit and mark the task as unhealthy so that it can
be restarted.
  • Loading branch information
tgross committed Feb 15, 2022
1 parent c2b7dda commit 4f94dde
Showing 1 changed file with 17 additions and 7 deletions.
24 changes: 17 additions & 7 deletions client/allocrunner/taskrunner/plugin_supervisor_hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,20 +209,30 @@ func (h *csiPluginSupervisorHook) ensureSupervisorLoop(ctx context.Context) {

t := time.NewTimer(0)

// We're in Poststart at this point, so if we can't connect within
// this deadline, assume it's broken so we can restart the task
startCtx, startCancelFn := context.WithTimeout(ctx, 30*time.Second)
defer startCancelFn()

var err error
var pluginHealthy bool

// Step 1: Wait for the plugin to initially become available.
WAITFORREADY:
for {
select {
case <-ctx.Done():
case <-startCtx.Done():
h.logger.Error("CSI plugin did not become healthy before timeout", "error", err)
event := structs.NewTaskEvent(structs.TaskPluginUnhealthy)
event.SetMessage(fmt.Sprintf("failed to register plugin: %s, reason: %v", h.task.CSIPluginConfig.ID, err))
h.eventEmitter.EmitEvent(event)
return
case <-t.C:
pluginHealthy, err := h.supervisorLoopOnce(ctx, client)
pluginHealthy, err = h.supervisorLoopOnce(startCtx, client)
if err != nil || !pluginHealthy {
h.logger.Debug("CSI Plugin not ready", "error", err)

// Plugin is not yet returning healthy, because we want to optimise for
// quickly bringing a plugin online, we use a short timeout here.
// TODO(dani): Test with more plugins and adjust.
h.logger.Debug("CSI plugin not ready", "error", err)
// Use only a short delay here to optimize for quickly
// bringing up a plugin
t.Reset(5 * time.Second)
continue
}
Expand Down

0 comments on commit 4f94dde

Please sign in to comment.