From ecdbce038f24a7b45fd2db80003dc9d164711131 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Thu, 21 Mar 2024 10:48:50 +0100 Subject: [PATCH 1/5] cleanup: do not pass an empty snapshot to genSnapFromSnapID() Just like GenVolFromVolID() the genSnapFromSnapID() function can return a snapshot. There is no need to allocated an empty snapshot and pass that to the genSnapFromSnapID() function. Signed-off-by: Niels de Vos --- internal/rbd/controllerserver.go | 11 ++++---- internal/rbd/rbd_util.go | 45 +++++++++++++++++--------------- 2 files changed, 29 insertions(+), 27 deletions(-) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index 75197c8bb6e..5f5e70ecdc5 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -638,7 +638,6 @@ func (cs *ControllerServer) createVolumeFromSnapshot( rbdVol *rbdVolume, snapshotID string, ) error { - rbdSnap := &rbdSnapshot{} if acquired := cs.SnapshotLocks.TryAcquire(snapshotID); !acquired { log.ErrorLog(ctx, util.SnapshotOperationAlreadyExistsFmt, snapshotID) @@ -646,7 +645,7 @@ func (cs *ControllerServer) createVolumeFromSnapshot( } defer cs.SnapshotLocks.Release(snapshotID) - err := genSnapFromSnapID(ctx, rbdSnap, snapshotID, cr, secrets) + rbdSnap, err := genSnapFromSnapID(ctx, snapshotID, cr, secrets) if err != nil { if errors.Is(err, util.ErrPoolNotFound) { log.ErrorLog(ctx, "failed to get backend snapshot for %s: %v", snapshotID, err) @@ -789,8 +788,8 @@ func checkContentSource( if snapshotID == "" { return nil, nil, status.Errorf(codes.NotFound, "volume Snapshot ID cannot be empty") } - rbdSnap := &rbdSnapshot{} - if err := genSnapFromSnapID(ctx, rbdSnap, snapshotID, cr, req.GetSecrets()); err != nil { + rbdSnap, err := genSnapFromSnapID(ctx, snapshotID, cr, req.GetSecrets()) + if err != nil { log.ErrorLog(ctx, "failed to get backend snapshot for %s: %v", snapshotID, err) if !errors.Is(err, ErrSnapNotFound) { return nil, nil, status.Error(codes.Internal, err.Error()) @@ -1429,8 +1428,8 @@ func (cs *ControllerServer) DeleteSnapshot( } defer cs.OperationLocks.ReleaseDeleteLock(snapshotID) - rbdSnap := &rbdSnapshot{} - if err = genSnapFromSnapID(ctx, rbdSnap, snapshotID, cr, req.GetSecrets()); err != nil { + rbdSnap, err := genSnapFromSnapID(ctx, snapshotID, cr, req.GetSecrets()) + if err != nil { // if error is ErrPoolNotFound, the pool is already deleted we don't // need to worry about deleting snapshot or omap data, return success if errors.Is(err, util.ErrPoolNotFound) { diff --git a/internal/rbd/rbd_util.go b/internal/rbd/rbd_util.go index b318d0f62ad..13e226bf4de 100644 --- a/internal/rbd/rbd_util.go +++ b/internal/rbd/rbd_util.go @@ -949,54 +949,57 @@ func (ri *rbdImage) checkImageChainHasFeature(ctx context.Context, feature uint6 // genSnapFromSnapID generates a rbdSnapshot structure from the provided identifier, updating // the structure with elements from on-disk snapshot metadata as well. +// +// NOTE: The returned rbdSnapshot can be non-nil in case of an error. That +// seems to be required for the DeleteSnapshot procedure, so that OMAP +// attributes can be cleaned-up. func genSnapFromSnapID( ctx context.Context, - rbdSnap *rbdSnapshot, snapshotID string, cr *util.Credentials, secrets map[string]string, -) error { +) (*rbdSnapshot, error) { var vi util.CSIIdentifier - rbdSnap.VolID = snapshotID - - err := vi.DecomposeCSIID(rbdSnap.VolID) + err := vi.DecomposeCSIID(snapshotID) if err != nil { - log.ErrorLog(ctx, "error decoding snapshot ID (%s) (%s)", err, rbdSnap.VolID) + log.ErrorLog(ctx, "error decoding snapshot ID (%s) (%s)", err, snapshotID) - return err + return nil, err } + rbdSnap := &rbdSnapshot{} + rbdSnap.VolID = snapshotID rbdSnap.ClusterID = vi.ClusterID rbdSnap.Monitors, _, err = util.GetMonsAndClusterID(ctx, rbdSnap.ClusterID, false) if err != nil { log.ErrorLog(ctx, "failed getting mons (%s)", err) - return err + return nil, err } rbdSnap.Pool, err = util.GetPoolName(rbdSnap.Monitors, cr, vi.LocationID) if err != nil { - return err + return nil, err } rbdSnap.JournalPool = rbdSnap.Pool rbdSnap.RadosNamespace, err = util.GetRadosNamespace(util.CsiConfigFile, rbdSnap.ClusterID) if err != nil { - return err + return nil, err } j, err := snapJournal.Connect(rbdSnap.Monitors, rbdSnap.RadosNamespace, cr) if err != nil { - return err + return nil, err } defer j.Destroy() imageAttributes, err := j.GetImageAttributes( ctx, rbdSnap.Pool, vi.ObjectUUID, true) if err != nil { - return err + return rbdSnap, err } rbdSnap.ImageID = imageAttributes.ImageID rbdSnap.RequestName = imageAttributes.RequestName @@ -1009,42 +1012,42 @@ func genSnapFromSnapID( rbdSnap.JournalPool, err = util.GetPoolName(rbdSnap.Monitors, cr, imageAttributes.JournalPoolID) if err != nil { // TODO: If pool is not found we may leak the image (as DeleteSnapshot will return success) - return err + return rbdSnap, err } } err = rbdSnap.Connect(cr) + if err != nil { + return rbdSnap, fmt.Errorf("failed to connect to %q: %w", + rbdSnap, err) + } defer func() { if err != nil { rbdSnap.Destroy() } }() - if err != nil { - return fmt.Errorf("failed to connect to %q: %w", - rbdSnap, err) - } if imageAttributes.KmsID != "" && imageAttributes.EncryptionType == util.EncryptionTypeBlock { err = rbdSnap.configureBlockEncryption(imageAttributes.KmsID, secrets) if err != nil { - return fmt.Errorf("failed to configure block encryption for "+ + return rbdSnap, fmt.Errorf("failed to configure block encryption for "+ "%q: %w", rbdSnap, err) } } if imageAttributes.KmsID != "" && imageAttributes.EncryptionType == util.EncryptionTypeFile { err = rbdSnap.configureFileEncryption(ctx, imageAttributes.KmsID, secrets) if err != nil { - return fmt.Errorf("failed to configure file encryption for "+ + return rbdSnap, fmt.Errorf("failed to configure file encryption for "+ "%q: %w", rbdSnap, err) } } err = updateSnapshotDetails(rbdSnap) if err != nil { - return fmt.Errorf("failed to update snapshot details for %q: %w", rbdSnap, err) + return rbdSnap, fmt.Errorf("failed to update snapshot details for %q: %w", rbdSnap, err) } - return err + return rbdSnap, err } // updateSnapshotDetails will copy the details from the rbdVolume to the From df0df50d21db3b32575e21fbcf1465160318312e Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Thu, 21 Mar 2024 10:59:50 +0100 Subject: [PATCH 2/5] rbd: free snapshot resources after allocation Not all snapshot objects are free'd correctly after they were allocated. It is possible that some connections to the Ceph cluster were never closed. This does not need to be a noticeable problem, as connections are re-used where possible, but it isn't clean either. Signed-off-by: Niels de Vos --- internal/rbd/controllerserver.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index 5f5e70ecdc5..cb27828518e 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -348,6 +348,12 @@ func (cs *ControllerServer) CreateVolume( if err != nil { return nil, err } + if parentVol != nil { + defer parentVol.Destroy() + } + if rbdSnap != nil { + defer rbdSnap.Destroy() + } err = updateTopologyConstraints(rbdVol, rbdSnap) if err != nil { @@ -655,6 +661,7 @@ func (cs *ControllerServer) createVolumeFromSnapshot( return status.Error(codes.Internal, err.Error()) } + defer rbdSnap.Destroy() // update parent name(rbd image name in snapshot) rbdSnap.RbdImageName = rbdSnap.RbdSnapName @@ -1458,6 +1465,7 @@ func (cs *ControllerServer) DeleteSnapshot( return nil, status.Error(codes.Internal, err.Error()) } + defer rbdSnap.Destroy() // safeguard against parallel create or delete requests against the same // name From f4aeef0d3177de52152f22522a582ec9633c991c Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Fri, 22 Mar 2024 14:43:25 +0100 Subject: [PATCH 3/5] rbd: let parseVolCreateRequest() return a connected rbdVolume By returning a connected rbdVolume in parseVolCreateRequest(), the CreateVolume() function can be simplified a little. There is no need to call the additional Connect() and detect failures with it. Signed-off-by: Niels de Vos --- internal/rbd/controllerserver.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index cb27828518e..89f0191ec71 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -150,6 +150,7 @@ func validateStriping(parameters map[string]string) error { func (cs *ControllerServer) parseVolCreateRequest( ctx context.Context, req *csi.CreateVolumeRequest, + cr *util.Credentials, ) (*rbdVolume, error) { // TODO (sbezverk) Last check for not exceeding total storage capacity @@ -226,6 +227,13 @@ func (cs *ControllerServer) parseVolCreateRequest( return nil, status.Error(codes.InvalidArgument, err.Error()) } + err = rbdVol.Connect(cr) + if err != nil { + log.ErrorLog(ctx, "failed to connect to volume %v: %v", rbdVol.RbdImageName, err) + + return nil, status.Error(codes.Internal, err.Error()) + } + // NOTE: rbdVol does not contain VolID and RbdImageName populated, everything // else is populated post create request parsing return rbdVol, nil @@ -324,7 +332,7 @@ func (cs *ControllerServer) CreateVolume( return nil, status.Error(codes.InvalidArgument, err.Error()) } defer cr.DeleteCredentials() - rbdVol, err := cs.parseVolCreateRequest(ctx, req) + rbdVol, err := cs.parseVolCreateRequest(ctx, req, cr) if err != nil { return nil, err } @@ -337,13 +345,6 @@ func (cs *ControllerServer) CreateVolume( } defer cs.VolumeLocks.Release(req.GetName()) - err = rbdVol.Connect(cr) - if err != nil { - log.ErrorLog(ctx, "failed to connect to volume %v: %v", rbdVol.RbdImageName, err) - - return nil, status.Error(codes.Internal, err.Error()) - } - parentVol, rbdSnap, err := checkContentSource(ctx, req, cr) if err != nil { return nil, err From 81f11a8d9a1c528b37e981088c5f22ef67ea7eb8 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Tue, 26 Mar 2024 09:51:19 +0100 Subject: [PATCH 4/5] cleanup: reformat generateVolFromSnap() to rbdSnapshot.toVolume() Signed-off-by: Niels de Vos --- internal/rbd/controllerserver.go | 10 ++++----- internal/rbd/rbd_journal.go | 2 +- internal/rbd/rbd_util.go | 2 +- internal/rbd/snapshot.go | 35 ++++++++++++++++---------------- 4 files changed, 25 insertions(+), 24 deletions(-) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index 89f0191ec71..01d4b6c583b 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -666,7 +666,7 @@ func (cs *ControllerServer) createVolumeFromSnapshot( // update parent name(rbd image name in snapshot) rbdSnap.RbdImageName = rbdSnap.RbdSnapName - parentVol := generateVolFromSnap(rbdSnap) + parentVol := rbdSnap.toVolume() // as we are operating on single cluster reuse the connection parentVol.conn = rbdVol.conn.Copy() @@ -1237,7 +1237,7 @@ func cloneFromSnapshot( cr *util.Credentials, parameters map[string]string, ) (*csi.CreateSnapshotResponse, error) { - vol := generateVolFromSnap(rbdSnap) + vol := rbdSnap.toVolume() err := vol.Connect(cr) if err != nil { uErr := undoSnapshotCloning(ctx, rbdVol, rbdSnap, vol, cr) @@ -1322,7 +1322,7 @@ func (cs *ControllerServer) doSnapshotClone( cr *util.Credentials, ) (*rbdVolume, error) { // generate cloned volume details from snapshot - cloneRbd := generateVolFromSnap(rbdSnap) + cloneRbd := rbdSnap.toVolume() defer cloneRbd.Destroy() // add image feature for cloneRbd f := []string{librbd.FeatureNameLayering, librbd.FeatureNameDeepFlatten} @@ -1480,7 +1480,7 @@ func (cs *ControllerServer) DeleteSnapshot( // Deleting snapshot and cloned volume log.DebugLog(ctx, "deleting cloned rbd volume %s", rbdSnap.RbdSnapName) - rbdVol := generateVolFromSnap(rbdSnap) + rbdVol := rbdSnap.toVolume() err = rbdVol.Connect(cr) if err != nil { @@ -1511,7 +1511,7 @@ func (cs *ControllerServer) DeleteSnapshot( // cleanUpImageAndSnapReservation cleans up the image from the trash and // snapshot reservation in rados OMAP. func cleanUpImageAndSnapReservation(ctx context.Context, rbdSnap *rbdSnapshot, cr *util.Credentials) error { - rbdVol := generateVolFromSnap(rbdSnap) + rbdVol := rbdSnap.toVolume() err := rbdVol.Connect(cr) if err != nil { return status.Error(codes.Internal, err.Error()) diff --git a/internal/rbd/rbd_journal.go b/internal/rbd/rbd_journal.go index f7c2f86c7ce..04ad9138e17 100644 --- a/internal/rbd/rbd_journal.go +++ b/internal/rbd/rbd_journal.go @@ -162,7 +162,7 @@ func checkSnapCloneExists( snapData.ImagePool, rbdSnap.Pool) } - vol := generateVolFromSnap(rbdSnap) + vol := rbdSnap.toVolume() defer vol.Destroy() err = vol.Connect(cr) if err != nil { diff --git a/internal/rbd/rbd_util.go b/internal/rbd/rbd_util.go index 13e226bf4de..31b330b4d37 100644 --- a/internal/rbd/rbd_util.go +++ b/internal/rbd/rbd_util.go @@ -1053,7 +1053,7 @@ func genSnapFromSnapID( // updateSnapshotDetails will copy the details from the rbdVolume to the // rbdSnapshot. example copying size from rbdVolume to rbdSnapshot. func updateSnapshotDetails(rbdSnap *rbdSnapshot) error { - vol := generateVolFromSnap(rbdSnap) + vol := rbdSnap.toVolume() err := vol.Connect(rbdSnap.conn.Creds) if err != nil { return err diff --git a/internal/rbd/snapshot.go b/internal/rbd/snapshot.go index bb420475c3d..d6dcd48c20d 100644 --- a/internal/rbd/snapshot.go +++ b/internal/rbd/snapshot.go @@ -98,23 +98,24 @@ func cleanUpSnapshot( return nil } -func generateVolFromSnap(rbdSnap *rbdSnapshot) *rbdVolume { - vol := new(rbdVolume) - vol.ClusterID = rbdSnap.ClusterID - vol.VolID = rbdSnap.VolID - vol.Monitors = rbdSnap.Monitors - vol.Pool = rbdSnap.Pool - vol.JournalPool = rbdSnap.JournalPool - vol.RadosNamespace = rbdSnap.RadosNamespace - vol.RbdImageName = rbdSnap.RbdSnapName - vol.ImageID = rbdSnap.ImageID - // copyEncryptionConfig cannot be used here because the volume and the - // snapshot will have the same volumeID which cases the panic in - // copyEncryptionConfig function. - vol.blockEncryption = rbdSnap.blockEncryption - vol.fileEncryption = rbdSnap.fileEncryption - - return vol +func (rbdSnap *rbdSnapshot) toVolume() *rbdVolume { + return &rbdVolume{ + rbdImage: rbdImage{ + ClusterID: rbdSnap.ClusterID, + VolID: rbdSnap.VolID, + Monitors: rbdSnap.Monitors, + Pool: rbdSnap.Pool, + JournalPool: rbdSnap.JournalPool, + RadosNamespace: rbdSnap.RadosNamespace, + RbdImageName: rbdSnap.RbdSnapName, + ImageID: rbdSnap.ImageID, + // copyEncryptionConfig cannot be used here because the volume and the + // snapshot will have the same volumeID which cases the panic in + // copyEncryptionConfig function. + blockEncryption: rbdSnap.blockEncryption, + fileEncryption: rbdSnap.fileEncryption, + }, + } } func undoSnapshotCloning( From 93e24a85335be2f99494ec8327eda0c3116d0b79 Mon Sep 17 00:00:00 2001 From: Niels de Vos Date: Tue, 26 Mar 2024 15:38:20 +0100 Subject: [PATCH 5/5] rbd: add extra logging while cleaning up snapshots Signed-off-by: Niels de Vos --- internal/rbd/controllerserver.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/internal/rbd/controllerserver.go b/internal/rbd/controllerserver.go index 01d4b6c583b..4f07ce5f0d3 100644 --- a/internal/rbd/controllerserver.go +++ b/internal/rbd/controllerserver.go @@ -1450,12 +1450,16 @@ func (cs *ControllerServer) DeleteSnapshot( // or partially complete (snap and snapOMap are garbage collected already), hence return // success as deletion is complete if errors.Is(err, util.ErrKeyNotFound) { + log.UsefulLog(ctx, "snapshot %s was been deleted already: %v", snapshotID, err) + return &csi.DeleteSnapshotResponse{}, nil } // if the error is ErrImageNotFound, We need to cleanup the image from // trash and remove the metadata in OMAP. if errors.Is(err, ErrImageNotFound) { + log.UsefulLog(ctx, "cleaning up leftovers of snapshot %s: %v", snapshotID, err) + err = cleanUpImageAndSnapReservation(ctx, rbdSnap, cr) if err != nil { return nil, status.Error(codes.Internal, err.Error())