From 69a1ba9ffcd321a8cd1c66829b8533560d57f0b2 Mon Sep 17 00:00:00 2001
From: hc-github-team-nomad-core
 <82989552+hc-github-team-nomad-core@users.noreply.github.com>
Date: Tue, 14 Jun 2022 14:30:18 -0400
Subject: [PATCH] CSI: make plugin health_timeout configurable in csi_plugin
 stanza (#13340) (#13366)

Signed-off-by: Grant Griffiths <ggriffiths@purestorage.com>

Co-authored-by: Grant Griffiths <ggriffiths@purestorage.com>
---
 .changelog/13340.txt                          |  3 +++
 api/tasks.go                                  |  8 ++++++++
 .../taskrunner/plugin_supervisor_hook.go      |  8 ++++++--
 command/agent/job_endpoint.go                 |  1 +
 jobspec/parse_task.go                         | 12 ++++++++++--
 jobspec/parse_test.go                         |  7 ++++---
 jobspec/test-fixtures/csi-plugin.hcl          |  7 ++++---
 nomad/structs/csi.go                          |  4 ++++
 .../docs/job-specification/csi_plugin.mdx     | 19 +++++++++++++------
 9 files changed, 53 insertions(+), 16 deletions(-)
 create mode 100644 .changelog/13340.txt

diff --git a/.changelog/13340.txt b/.changelog/13340.txt
new file mode 100644
index 000000000000..1f8e40f21b93
--- /dev/null
+++ b/.changelog/13340.txt
@@ -0,0 +1,3 @@
+```release-note:improvements
+csi: Made the CSI Plugin supervisor health check configurable with a new CSI Stanza health_timeout field
+```
diff --git a/api/tasks.go b/api/tasks.go
index efc1b87af9de..33a4acc16b15 100644
--- a/api/tasks.go
+++ b/api/tasks.go
@@ -1018,10 +1018,18 @@ type TaskCSIPluginConfig struct {
 	//
 	// Default is /csi.
 	MountDir string `mapstructure:"mount_dir" hcl:"mount_dir,optional"`
+
+	// HealthTimeout is the time after which the CSI plugin tasks will be killed
+	// if the CSI Plugin is not healthy.
+	HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"`
 }
 
 func (t *TaskCSIPluginConfig) Canonicalize() {
 	if t.MountDir == "" {
 		t.MountDir = "/csi"
 	}
+
+	if t.HealthTimeout == 0 {
+		t.HealthTimeout = 30 * time.Second
+	}
 }
diff --git a/client/allocrunner/taskrunner/plugin_supervisor_hook.go b/client/allocrunner/taskrunner/plugin_supervisor_hook.go
index 0903e1487e9b..c7cfa2454b40 100644
--- a/client/allocrunner/taskrunner/plugin_supervisor_hook.go
+++ b/client/allocrunner/taskrunner/plugin_supervisor_hook.go
@@ -88,6 +88,10 @@ func newCSIPluginSupervisorHook(config *csiPluginSupervisorHookConfig) *csiPlugi
 	pluginRoot := filepath.Join(config.clientStateDirPath, "csi",
 		string(task.CSIPluginConfig.Type), task.CSIPluginConfig.ID)
 
+	if task.CSIPluginConfig.HealthTimeout == 0 {
+		task.CSIPluginConfig.HealthTimeout = 30 * time.Second
+	}
+
 	shutdownCtx, cancelFn := context.WithCancel(context.Background())
 
 	hook := &csiPluginSupervisorHook{
@@ -231,7 +235,7 @@ func (h *csiPluginSupervisorHook) ensureSupervisorLoop(ctx context.Context) {
 
 	// We're in Poststart at this point, so if we can't connect within
 	// this deadline, assume it's broken so we can restart the task
-	startCtx, startCancelFn := context.WithTimeout(ctx, 30*time.Second)
+	startCtx, startCancelFn := context.WithTimeout(ctx, h.task.CSIPluginConfig.HealthTimeout)
 	defer startCancelFn()
 
 	var err error
@@ -414,7 +418,7 @@ func (h *csiPluginSupervisorHook) kill(ctx context.Context, reason error) {
 	if err := h.lifecycle.Kill(ctx,
 		structs.NewTaskEvent(structs.TaskKilling).
 			SetFailsTask().
-			SetDisplayMessage("CSI plugin did not become healthy before timeout"),
+			SetDisplayMessage(fmt.Sprintf("CSI plugin did not become healthy before configured %v health timeout", h.task.CSIPluginConfig.HealthTimeout.String())),
 	); err != nil {
 		h.logger.Error("failed to kill task", "kill_reason", reason, "error", err)
 	}
diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go
index 8a99a77b2971..4de07d9c1cc6 100644
--- a/command/agent/job_endpoint.go
+++ b/command/agent/job_endpoint.go
@@ -1166,6 +1166,7 @@ func ApiCSIPluginConfigToStructsCSIPluginConfig(apiConfig *api.TaskCSIPluginConf
 	sc.ID = apiConfig.ID
 	sc.Type = structs.CSIPluginType(apiConfig.Type)
 	sc.MountDir = apiConfig.MountDir
+	sc.HealthTimeout = apiConfig.HealthTimeout
 	return sc
 }
 
diff --git a/jobspec/parse_task.go b/jobspec/parse_task.go
index ff81b6ba66a8..4bc77c310f27 100644
--- a/jobspec/parse_task.go
+++ b/jobspec/parse_task.go
@@ -158,12 +158,20 @@ func parseTask(item *ast.ObjectItem, keys []string) (*api.Task, error) {
 		i := o.Elem().Items[0]
 
 		var m map[string]interface{}
+		var cfg api.TaskCSIPluginConfig
 		if err := hcl.DecodeObject(&m, i.Val); err != nil {
 			return nil, err
 		}
 
-		var cfg api.TaskCSIPluginConfig
-		if err := mapstructure.WeakDecode(m, &cfg); err != nil {
+		dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{
+			DecodeHook:       mapstructure.StringToTimeDurationHookFunc(),
+			WeaklyTypedInput: true,
+			Result:           &cfg,
+		})
+		if err != nil {
+			return nil, err
+		}
+		if err := dec.Decode(m); err != nil {
 			return nil, err
 		}
 
diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go
index 389d01245d11..1af37804371d 100644
--- a/jobspec/parse_test.go
+++ b/jobspec/parse_test.go
@@ -625,9 +625,10 @@ func TestParse(t *testing.T) {
 								Name:   "binstore",
 								Driver: "docker",
 								CSIPluginConfig: &api.TaskCSIPluginConfig{
-									ID:       "org.hashicorp.csi",
-									Type:     api.CSIPluginTypeMonolith,
-									MountDir: "/csi/test",
+									ID:            "org.hashicorp.csi",
+									Type:          api.CSIPluginTypeMonolith,
+									MountDir:      "/csi/test",
+									HealthTimeout: 1 * time.Minute,
 								},
 							},
 						},
diff --git a/jobspec/test-fixtures/csi-plugin.hcl b/jobspec/test-fixtures/csi-plugin.hcl
index b879da184347..3e4106719d5e 100644
--- a/jobspec/test-fixtures/csi-plugin.hcl
+++ b/jobspec/test-fixtures/csi-plugin.hcl
@@ -4,9 +4,10 @@ job "binstore-storagelocker" {
       driver = "docker"
 
       csi_plugin {
-        id        = "org.hashicorp.csi"
-        type      = "monolith"
-        mount_dir = "/csi/test"
+        id             = "org.hashicorp.csi"
+        type           = "monolith"
+        mount_dir      = "/csi/test"
+        health_timeout = "1m"
       }
     }
   }
diff --git a/nomad/structs/csi.go b/nomad/structs/csi.go
index 9f90335ac7d9..d8eae4d3ea9e 100644
--- a/nomad/structs/csi.go
+++ b/nomad/structs/csi.go
@@ -66,6 +66,10 @@ type TaskCSIPluginConfig struct {
 	// to be created by the plugin, and will provide references into
 	// "MountDir/CSIIntermediaryDirname/{VolumeName}/{AllocID} for mounts.
 	MountDir string
+
+	// HealthTimeout is the time after which the CSI plugin tasks will be killed
+	// if the CSI Plugin is not healthy.
+	HealthTimeout time.Duration `mapstructure:"health_timeout" hcl:"health_timeout,optional"`
 }
 
 func (t *TaskCSIPluginConfig) Copy() *TaskCSIPluginConfig {
diff --git a/website/content/docs/job-specification/csi_plugin.mdx b/website/content/docs/job-specification/csi_plugin.mdx
index b72b8998c853..ddee5bfe5f14 100644
--- a/website/content/docs/job-specification/csi_plugin.mdx
+++ b/website/content/docs/job-specification/csi_plugin.mdx
@@ -17,9 +17,10 @@ to claim [volumes][csi_volumes].
 
 ```hcl
 csi_plugin {
-  id        = "csi-hostpath"
-  type      = "monolith"
-  mount_dir = "/csi"
+  id             = "csi-hostpath"
+  type           = "monolith"
+  mount_dir      = "/csi"
+  health_timeout = "30s"
 }
 ```
 
@@ -43,6 +44,11 @@ csi_plugin {
   container where the plugin will expect a Unix domain socket for
   bidirectional communication with Nomad.
 
+- `health_timeout` `(duration: <optional>)` - The duration that 
+  the plugin supervisor will wait before restarting an unhealthy
+  CSI plugin. Must be a duration value such as `30s` or `2m`.
+  Defaults to `30s` if not set. 
+
 ~> **Note:** Plugins running as `node` or `monolith` require root
 privileges (or `CAP_SYS_ADMIN` on Linux) to mount volumes on the
 host. With the Docker task driver, you can use the `privileged = true`
@@ -112,10 +118,11 @@ job "plugin-efs" {
       }
 
       csi_plugin {
-        id        = "aws-efs0"
-        type      = "node"
-        mount_dir = "/csi"  # this path /csi matches the --endpoint
+        id             = "aws-efs0"
+        type           = "node"
+        mount_dir      = "/csi"  # this path /csi matches the --endpoint
                             # argument for the container
+        health_timeout = "30s"
       }
     }
   }