From 2ae87115bff15f1b9a0f6a1e47492b4799a31fda Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 20 Jul 2023 12:05:40 -0400 Subject: [PATCH] improve error handling and fix bug for ineligible nodes --- .changelog/17996.txt | 8 +++++++ nomad/client_csi_endpoint.go | 42 +++++++++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 8 deletions(-) diff --git a/.changelog/17996.txt b/.changelog/17996.txt index f95501a253dc..1c85790d5b06 100644 --- a/.changelog/17996.txt +++ b/.changelog/17996.txt @@ -1,3 +1,11 @@ ```release-note:bug csi: Fixed a bug in sending concurrent requests to CSI controller plugins by serializing them per plugin ``` + +```release-note:bug +csi: Fixed a bug where CSI controller requests could be sent to unhealthy plugins +``` + +```release-note:bug +csi: Fixed a bug where CSI controller requests could not be sent to controllers on nodes ineligible for scheduling +``` diff --git a/nomad/client_csi_endpoint.go b/nomad/client_csi_endpoint.go index 773efa211717..6a20f0d7ea10 100644 --- a/nomad/client_csi_endpoint.go +++ b/nomad/client_csi_endpoint.go @@ -4,6 +4,7 @@ package nomad import ( + "errors" "fmt" "sort" "strings" @@ -275,21 +276,46 @@ func (a *ClientCSI) clientIDsForController(pluginID string) ([]string, error) { clientIDs := []string{} + if len(plugin.Controllers) == 0 { + return nil, fmt.Errorf("failed to find instances of controller plugin %q", pluginID) + } + + var merr error for clientID, controller := range plugin.Controllers { - if !controller.IsController() || !controller.Healthy { - // we don't have separate types for CSIInfo depending on - // whether it's a controller or node. this error shouldn't - // make it to production but is to aid developers during - // development + if !controller.IsController() { + // we don't have separate types for CSIInfo depending on whether + // it's a controller or node. this error should never make it to + // production + merr = errors.Join(merr, fmt.Errorf( + "plugin instance %q is not a controller but was registered as one - this is always a bug", controller.AllocID)) + continue + } + + if !controller.Healthy { + merr = errors.Join(merr, fmt.Errorf( + "plugin instance %q is not healthy", controller.AllocID)) continue } + node, err := getNodeForRpc(snap, clientID) - if err == nil && node != nil && node.Ready() { - clientIDs = append(clientIDs, clientID) + if err != nil || node == nil { + merr = errors.Join(merr, fmt.Errorf( + "cannot find node %q for plugin instance %q", clientID, controller.AllocID)) + continue + } + + if node.Status != structs.NodeStatusReady { + merr = errors.Join(merr, fmt.Errorf( + "node %q for plugin instance %q is not ready", clientID, controller.AllocID)) + continue } + + clientIDs = append(clientIDs, clientID) } + if len(clientIDs) == 0 { - return nil, fmt.Errorf("failed to find clients running controller plugin %q", pluginID) + return nil, fmt.Errorf("failed to find clients running controller plugin %q: %v", + pluginID, merr) } // Many plugins don't handle concurrent requests as described in the spec,