Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CSI: fix potential state store corruptions #16256

Merged
merged 2 commits into from
Feb 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/16256.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
csi: Fixed potential state store corruption when garbage collecting CSI volume claims or checking whether it's safe to force-deregister a volume
```
2 changes: 1 addition & 1 deletion nomad/core_sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,7 @@ func (c *CoreScheduler) csiVolumeClaimGC(eval *structs.Evaluation) error {
// we only call the claim release RPC if the volume has claims
// that no longer have valid allocations. otherwise we'd send
// out a lot of do-nothing RPCs.
vol, err := c.snap.CSIVolumeDenormalize(ws, vol)
vol, err := c.snap.CSIVolumeDenormalize(ws, vol.Copy())
if err != nil {
return err
}
Expand Down
1 change: 1 addition & 0 deletions nomad/core_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2218,6 +2218,7 @@ func TestCoreScheduler_CSIPluginGC(t *testing.T) {
require.NoError(t, err)

// Empty the plugin
plug = plug.Copy()
plug.Controllers = map[string]*structs.CSIInfo{}
plug.Nodes = map[string]*structs.CSIInfo{}

Expand Down
4 changes: 2 additions & 2 deletions nomad/state/state_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -1458,14 +1458,13 @@ func deleteNodeCSIPlugins(txn *txn, node *structs.Node, index uint64) error {

// updateOrGCPlugin updates a plugin but will delete it if the plugin is empty
func updateOrGCPlugin(index uint64, txn Txn, plug *structs.CSIPlugin) error {
plug.ModifyIndex = index

if plug.IsEmpty() {
err := txn.Delete("csi_plugins", plug)
if err != nil {
return fmt.Errorf("csi_plugins delete error: %v", err)
}
} else {
plug.ModifyIndex = index
err := txn.Insert("csi_plugins", plug)
if err != nil {
return fmt.Errorf("csi_plugins update error %s: %v", plug.ID, err)
Expand Down Expand Up @@ -2601,6 +2600,7 @@ func (s *StateStore) CSIVolumeDeregister(index uint64, namespace string, ids []s
// volSafeToForce checks if the any of the remaining allocations
// are in a non-terminal state.
func (s *StateStore) volSafeToForce(txn Txn, v *structs.CSIVolume) bool {
v = v.Copy()
vol, err := s.csiVolumeDenormalizeTxn(txn, nil, v)
if err != nil {
return false
Expand Down
14 changes: 9 additions & 5 deletions nomad/state/state_store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3490,8 +3490,10 @@ func TestStateStore_CSIVolume(t *testing.T) {
vs = slurp(iter)
require.False(t, vs[0].HasFreeWriteClaims())

claim0.Mode = u
err = state.CSIVolumeClaim(2, ns, vol0, claim0)
claim2 := new(structs.CSIVolumeClaim)
*claim2 = *claim0
claim2.Mode = u
err = state.CSIVolumeClaim(2, ns, vol0, claim2)
require.NoError(t, err)
ws = memdb.NewWatchSet()
iter, err = state.CSIVolumesByPluginID(ws, ns, "", "minnie")
Expand Down Expand Up @@ -3520,8 +3522,10 @@ func TestStateStore_CSIVolume(t *testing.T) {

// release claims to unblock deregister
index++
claim0.State = structs.CSIVolumeClaimStateReadyToFree
err = state.CSIVolumeClaim(index, ns, vol0, claim0)
claim3 := new(structs.CSIVolumeClaim)
*claim3 = *claim2
claim3.State = structs.CSIVolumeClaimStateReadyToFree
err = state.CSIVolumeClaim(index, ns, vol0, claim3)
require.NoError(t, err)
index++
claim1.Mode = u
Expand Down Expand Up @@ -3577,7 +3581,7 @@ func TestStateStore_CSIPlugin_Lifecycle(t *testing.T) {
require.Equal(t, counts.nodesHealthy, plug.NodesHealthy, "nodes healthy")
require.Equal(t, counts.controllersExpected, plug.ControllersExpected, "controllers expected")
require.Equal(t, counts.nodesExpected, plug.NodesExpected, "nodes expected")
return plug
return plug.Copy()
}

type allocUpdateKind int
Expand Down