diff --git a/pkg/rbd/controllerserver.go b/pkg/rbd/controllerserver.go index 80fbca63f6ed..22b06a95783a 100644 --- a/pkg/rbd/controllerserver.go +++ b/pkg/rbd/controllerserver.go @@ -146,7 +146,7 @@ func (cs *ControllerServer) CreateVolume(ctx context.Context, req *csi.CreateVol } defer func() { if err != nil { - errDefer := unreserveVol(rbdVol, req.GetSecrets()) + errDefer := undoVolReservation(rbdVol, req.GetSecrets()) if errDefer != nil { klog.Warningf("failed undoing reservation of volume: %s (%s)", req.GetName(), errDefer) } @@ -257,7 +257,7 @@ func (cs *ControllerServer) DeleteVolume(ctx context.Context, req *csi.DeleteVol } }() - if err := unreserveVol(rbdVol, req.GetSecrets()); err != nil { + if err := undoVolReservation(rbdVol, req.GetSecrets()); err != nil { return nil, status.Error(codes.Internal, err.Error()) } return &csi.DeleteVolumeResponse{}, nil @@ -347,7 +347,7 @@ func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateS // check for the requested source volume id and already allocated source volume id found, err := checkSnapExists(rbdSnap, req.GetSecrets()) if err != nil { - if _, ok := err.(ErrSnapNameConflict); ok { + if _, ok := err.(util.ErrSnapNameConflict); ok { return nil, status.Error(codes.AlreadyExists, err.Error()) } @@ -371,7 +371,7 @@ func (cs *ControllerServer) CreateSnapshot(ctx context.Context, req *csi.CreateS } defer func() { if err != nil { - errDefer := unreserveSnap(rbdSnap, req.GetSecrets()) + errDefer := undoSnapReservation(rbdSnap, req.GetSecrets()) if errDefer != nil { klog.Warningf("failed undoing reservation of snapshot: %s %v", req.GetName(), errDefer) } @@ -483,7 +483,7 @@ func (cs *ControllerServer) DeleteSnapshot(ctx context.Context, req *csi.DeleteS if _, ok := err.(ErrSnapNotFound); !ok { return nil, status.Error(codes.Internal, err.Error()) } - if err := unreserveSnap(rbdSnap, req.GetSecrets()); err != nil { + if err := undoSnapReservation(rbdSnap, req.GetSecrets()); err != nil { return nil, status.Error(codes.Internal, err.Error()) } return &csi.DeleteSnapshotResponse{}, nil diff --git a/pkg/rbd/errors.go b/pkg/rbd/errors.go index 27bce1f5c849..4e88c27918cc 100644 --- a/pkg/rbd/errors.go +++ b/pkg/rbd/errors.go @@ -37,17 +37,6 @@ func (e ErrSnapNotFound) Error() string { return e.err.Error() } -// ErrSnapNameConflict is generated when a requested CSI snap name already exists on RBD but with -// different properties, and hence is in conflict with the passed in CSI volume name -type ErrSnapNameConflict struct { - requestName string - err error -} - -func (e ErrSnapNameConflict) Error() string { - return e.err.Error() -} - // ErrVolNameConflict is generated when a requested CSI volume name already exists on RBD but with // different properties, and hence is in conflict with the passed in CSI volume name type ErrVolNameConflict struct { diff --git a/pkg/rbd/nodeserver.go b/pkg/rbd/nodeserver.go index bfead3bff717..ded1c99eeb2e 100644 --- a/pkg/rbd/nodeserver.go +++ b/pkg/rbd/nodeserver.go @@ -123,7 +123,7 @@ func (ns *NodeServer) getVolumeName(req *csi.NodePublishVolumeRequest) (string, return "", status.Error(codes.InvalidArgument, err.Error()) } - return rbdImgNamePrefix + vi.ObjectUUID, nil + return volJournal.NamingPrefix() + vi.ObjectUUID, nil } func (ns *NodeServer) mountVolume(req *csi.NodePublishVolumeRequest, devicePath string) error { diff --git a/pkg/rbd/rbd.go b/pkg/rbd/rbd.go index c5e1a5190b0a..b9ff3d1df939 100644 --- a/pkg/rbd/rbd.go +++ b/pkg/rbd/rbd.go @@ -27,70 +27,6 @@ import ( 
"k8s.io/utils/nsenter" ) -/* -RADOS omaps usage: - -This note details how we preserve idempotent nature of create requests and retain the relationship -between orchestrator (CO) generated Names and plugin generated names for images and snapshots - -The implementation uses Ceph RADOS omaps to preserve the relationship between request name and -generated image (or snapshot) name. There are 4 types of omaps in use, -- A "csi.volumes.[csi-id]" (or "csi.volumes"+.+CSIInstanceID), we call this the csiVolsDirectory - - stores keys named using the CO generated names for volume requests - - keys are named "csi.volume."+[CO generated VolName] - - Key value contains the RBD image uuid that is created or will be created, for the CO provided - name - -- A "csi.snaps.[csi-id]" (or "csi.snaps"+.+CSIInstanceID), we refer to this as the csiSnapsDirectory - - stores keys named using the CO generated names for snapshot requests - - keys are named "csi.snap."+[CO generated SnapName] - - Key value contains the RBD snapshot uuid that is created or will be created, for the CO - provided name - -- A per image omap named "rbd.csi.volume."+[RBD image uuid], we refer to this as the rbdImageOMap - - stores a single key named "csi.volname", that has the value of the CO generated VolName that - this image refers to - -- A per snapshot omap named "rbd.csi.snap."+[RBD snapshot uuid], we refer to this as the snapOMap - - stores a key named "csi.snapname", that has the value of the CO generated SnapName that this - snapshot refers to - - also stores another key named "csi.source", that has the value of the image name that is the - source of the snapshot - -Creation of omaps: -When a volume create request is received (or a snapshot create, the snapshot is not detailed in this - comment further as the process is similar), -- The csiVolsDirectory is consulted to find if there is already a key with the CO VolName, and if present, -it is used to read its references to reach the RBD image that backs this VolName, to check if the -RBD image can satisfy the requirements for the request - - If during the process of checking the same, it is found that some linking information is stale - or missing, the corresponding keys upto the key in the csiVolsDirectory is cleaned up, to start afresh -- If the key with the CO VolName is not found, or was cleaned up, the request is treated as a -new create request, and an rbdImageOMap is created first with a generated uuid, this ensures that we -do not use a uuid that is already in use -- Next, a key with the VolName is created in the csiVolsDirectory, and its value is updated to store the -generated uuid -- This is followed by updating the rbdImageOMap with the VolName in the rbdImageCSIVolNameKey -- Finally, the image is created (or promoted from a snapshot, if content source was provided) using -the uuid and a corresponding image name prefix (rbdImgNamePrefix or rbdSnapNamePrefix) - -The entire operation is locked based on VolName hash, to ensure there is only ever a single entity -modifying the related omaps for a given VolName. - -This ensures idempotent nature of creates, as the same CO generated VolName would attempt to use -the same RBD image name to serve the request, as the relations are saved in the respective omaps. - -Deletion of omaps: -Delete requests would not contain the VolName, hence deletion uses the volume ID, which is encoded -with the image name in it, to find the image and the rbdImageOMap. The rbdImageOMap is read to get -the VolName that this image points to. 
This VolName can be further used to read and delete the key -from the csiVolsDirectory. - -As we trace back and find the VolName, we also take a hash based lock on the VolName before -proceeding with deleting the image and the related omap entries, to ensure there is only ever a -single entity modifying the related omaps for a given VolName. -*/ - const ( // volIDVersion is the version number of volume ID encoding scheme volIDVersion uint16 = 1 @@ -99,34 +35,8 @@ const ( // csiConfigFile is the location of the CSI config file csiConfigFile = "/etc/ceph-csi-config/config.json" - - // CSI volume-name keyname prefix, for key in csiVolsDirectory, suffix is the CSI passed volume name - csiVolNameKeyPrefix = "csi.volume." - // Per RBD image object map name prefix, suffix is the RBD image uuid - rbdImageOMapPrefix = "csi.volume." - // CSI volume-name key in per RBD image object map, containing CSI volume-name for which the - // image was created - rbdImageCSIVolNameKey = "csi.volname" - // RBD image name prefix, suffix is a uuid generated per image - rbdImgNamePrefix = "csi-vol-" - - //CSI snap-name keyname prefix, for key in csiSnapsDirectory, suffix is the CSI passed snapshot name - csiSnapNameKeyPrefix = "csi.snap." - // Per RBD snapshot object map name prefix, suffix is the RBD image uuid - rbdSnapOMapPrefix = "csi.snap." - // CSI snap-name key in per RBD snapshot object map, containing CSI snapshot-name for which the - // snapshot was created - rbdSnapCSISnapNameKey = "csi.snapname" - // source image name key in per RBD snapshot object map, containing RBD source image name for - // which the snapshot was created - rbdSnapSourceImageKey = "csi.source" - // RBD snapshot name prefix, suffix is a uuid generated per snapshot - rbdSnapNamePrefix = "csi-snap-" ) -// PluginFolder defines the location of ceph plugin -var PluginFolder = "/var/lib/kubelet/plugins/" - // Driver contains the default identity,node and controller struct type Driver struct { cd *csicommon.CSIDriver @@ -138,14 +48,18 @@ type Driver struct { var ( version = "1.0.0" + + // PluginFolder defines the location of ceph plugin + PluginFolder = "/var/lib/kubelet/plugins/" + // CSIInstanceID is the instance ID that is unique to an instance of CSI, used when sharing // ceph clusters across CSI instances, to differentiate omap names per CSI instance CSIInstanceID = "default" - // csiVolsDirectory is the name of the CSI volumes object map that contains CSI volume-name - // based keys - csiVolsDirectory = "csi.volumes" - // csiSnapsDirectory is the name of the CSI snapshots object map that contains CSI snapshot-name based keys - csiSnapsDirectory = "csi.snaps" + + // volJournal and snapJournal are used to maintain RADOS based journals for CO generated + // VolumeName to backing RBD images + volJournal *util.CSIJournal + snapJournal *util.CSIJournal ) // NewDriver returns new rbd driver @@ -199,8 +113,14 @@ func (r *Driver) Run(driverName, nodeID, endpoint, instanceID string, containeri if instanceID != "" { CSIInstanceID = instanceID } - csiVolsDirectory = csiVolsDirectory + "." + CSIInstanceID - csiSnapsDirectory = csiSnapsDirectory + "." 
+ CSIInstanceID + + // Get an instance of the volume and snapshot journal keys + volJournal = util.NewCSIVolumeJournal() + snapJournal = util.NewCSISnapshotJournal() + + // Update keys with CSI instance suffix + volJournal.SetCSIDirectorySuffix(CSIInstanceID) + snapJournal.SetCSIDirectorySuffix(CSIInstanceID) // Initialize default library driver r.cd = csicommon.NewCSIDriver(driverName, version, nodeID) diff --git a/pkg/rbd/rbd_journal.go b/pkg/rbd/rbd_journal.go new file mode 100644 index 000000000000..07d8961fbcb0 --- /dev/null +++ b/pkg/rbd/rbd_journal.go @@ -0,0 +1,307 @@ +/* +Copyright 2019 The Ceph-CSI Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package rbd + +import ( + "fmt" + + "github.com/ceph/ceph-csi/pkg/util" + + "github.com/pkg/errors" + "k8s.io/klog" +) + +func validateNonEmptyField(field, fieldName, structName string) error { + if field == "" { + return fmt.Errorf("value '%s' in '%s' structure cannot be empty", fieldName, structName) + } + + return nil +} + +func validateRbdSnap(rbdSnap *rbdSnapshot) error { + var err error + + if err = validateNonEmptyField(rbdSnap.RequestName, "RequestName", "rbdSnapshot"); err != nil { + return err + } + + if err = validateNonEmptyField(rbdSnap.Monitors, "Monitors", "rbdSnapshot"); err != nil { + return err + } + + if err = validateNonEmptyField(rbdSnap.AdminID, "AdminID", "rbdSnapshot"); err != nil { + return err + } + + if err = validateNonEmptyField(rbdSnap.Pool, "Pool", "rbdSnapshot"); err != nil { + return err + } + + if err = validateNonEmptyField(rbdSnap.RbdImageName, "RbdImageName", "rbdSnapshot"); err != nil { + return err + } + + if err = validateNonEmptyField(rbdSnap.ClusterID, "ClusterID", "rbdSnapshot"); err != nil { + return err + } + + return err +} + +func validateRbdVol(rbdVol *rbdVolume) error { + var err error + + if err = validateNonEmptyField(rbdVol.RequestName, "RequestName", "rbdVolume"); err != nil { + return err + } + + if err = validateNonEmptyField(rbdVol.Monitors, "Monitors", "rbdVolume"); err != nil { + return err + } + + if err = validateNonEmptyField(rbdVol.AdminID, "AdminID", "rbdVolume"); err != nil { + return err + } + + if err = validateNonEmptyField(rbdVol.Pool, "Pool", "rbdVolume"); err != nil { + return err + } + + if err = validateNonEmptyField(rbdVol.ClusterID, "ClusterID", "rbdVolume"); err != nil { + return err + } + + if rbdVol.VolSize == 0 { + return errors.New("value 'VolSize' in 'rbdVolume' structure cannot be 0") + } + + return err +} + +/* +checkSnapExists, and its counterpart checkVolExists, function as checks to determine if passed +in rbdSnapshot or rbdVolume exists on the backend. + +**NOTE:** These functions manipulate the rados omaps that hold information regarding +volume names as requested by the CSI drivers. Hence, these need to be invoked only when the +respective CSI driver generated snapshot or volume name based locks are held, as otherwise racy +access to these omaps may end up leaving them in an inconsistent state. 
+
+These functions need enough information about the cluster and pool (i.e., Monitors, Pool, and IDs
+filled in) to operate. They further require that the RequestName element of the structure have a
+valid value to operate on and determine if the said RequestName already exists on the backend.
+
+These functions populate the snapshot or the image name, its attributes, and the CSI snapshot/volume
+ID for the same when successful.
+
+These functions also clean up omap reservations that are stale, i.e. when omap entries exist but
+the backing images or snapshots are missing, or when one of the omaps exists and the next is
+missing. This is because the order of omap creation and deletion is the inverse of each other and
+is protected by the request name lock; hence any stale omaps are leftovers from incomplete
+transactions and are safe to garbage collect.
+*/
+func checkSnapExists(rbdSnap *rbdSnapshot, credentials map[string]string) (bool, error) {
+	err := validateRbdSnap(rbdSnap)
+	if err != nil {
+		return false, err
+	}
+
+	key, err := getKey(rbdSnap.AdminID, credentials)
+	if err != nil {
+		return false, err
+	}
+
+	snapUUID, err := snapJournal.CheckReservation(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
+		rbdSnap.RequestName, rbdSnap.RbdImageName)
+	if err != nil {
+		return false, err
+	}
+	if snapUUID == "" {
+		return false, nil
+	}
+	rbdSnap.RbdSnapName = snapJournal.NamingPrefix() + snapUUID
+
+	// Fetch on-disk image attributes
+	err = updateSnapWithImageInfo(rbdSnap, credentials)
+	if err != nil {
+		if _, ok := err.(ErrSnapNotFound); ok {
+			err = snapJournal.UndoReservation(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
+				rbdSnap.RbdSnapName, rbdSnap.RequestName)
+			return false, err
+		}
+		return false, err
+	}
+
+	// found a snapshot already available, process and return its information
+	rbdSnap.SnapID, err = util.GenerateVolID(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
+		rbdSnap.ClusterID, snapUUID, volIDVersion)
+	if err != nil {
+		return false, err
+	}
+
+	klog.V(4).Infof("Found existing snap (%s) with snap name (%s) for request (%s)",
+		rbdSnap.SnapID, rbdSnap.RbdSnapName, rbdSnap.RequestName)
+
+	return true, nil
+}
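For context, here is a minimal sketch of how a caller such as CreateVolume is expected to drive the helpers in this file: check for an existing reservation first (idempotency), reserve a name, create the image, and undo the reservation if the create fails. The types and function bodies below are simplified stubs (the real helpers take `*rbdVolume` plus a credentials map); only the call ordering is taken from this patch.

```go
package main

import "fmt"

// Simplified stand-ins for the helpers in this patch (checkVolExists, reserveVol,
// undoVolReservation) and for the actual RBD image create.
type rbdVolume struct{ RequestName, RbdImageName string }

func checkVolExists(v *rbdVolume) (bool, error) { return false, nil } // journal lookup
func reserveVol(v *rbdVolume) error             { v.RbdImageName = "csi-vol-1234"; return nil }
func undoVolReservation(v *rbdVolume) error     { return nil }
func createImage(v *rbdVolume) error            { return nil }

// createVolumeFlow mirrors the order CreateVolume follows in this change: an existing
// reservation satisfies the request as-is; otherwise reserve, create, and roll the
// reservation back on failure.
func createVolumeFlow(v *rbdVolume) (err error) {
	found, err := checkVolExists(v)
	if err != nil || found {
		return err // existing image already serves the request, or a hard error
	}

	if err = reserveVol(v); err != nil {
		return err
	}
	defer func() {
		if err != nil {
			if errDefer := undoVolReservation(v); errDefer != nil {
				fmt.Printf("failed undoing reservation of volume: %s (%v)\n", v.RequestName, errDefer)
			}
		}
	}()

	return createImage(v)
}

func main() {
	v := &rbdVolume{RequestName: "pvc-0001"}
	fmt.Println(createVolumeFlow(v), v.RbdImageName)
}
```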
+
+/*
+Check the comment on checkSnapExists to understand how this function behaves.
+
+**NOTE:** These functions manipulate the rados omaps that hold information regarding
+volume names as requested by the CSI drivers. Hence, these need to be invoked only when the
+respective CSI snapshot or volume name based locks are held, as otherwise racy access to these
+omaps may end up leaving the omaps in an inconsistent state.
+*/
+func checkVolExists(rbdVol *rbdVolume, credentials map[string]string) (bool, error) {
+	err := validateRbdVol(rbdVol)
+	if err != nil {
+		return false, err
+	}
+
+	key, err := getKey(rbdVol.AdminID, credentials)
+	if err != nil {
+		return false, err
+	}
+
+	imageUUID, err := volJournal.CheckReservation(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
+		rbdVol.RequestName, "")
+	if err != nil {
+		return false, err
+	}
+	if imageUUID == "" {
+		return false, nil
+	}
+	rbdVol.RbdImageName = volJournal.NamingPrefix() + imageUUID
+
+	// NOTE: Returned volsize should be the on-disk volsize, not the request vol size, so
+	// save it for size checks before fetching image data
+	requestSize := rbdVol.VolSize
+	// Fetch on-disk image attributes and compare against request
+	err = updateVolWithImageInfo(rbdVol, credentials)
+	if err != nil {
+		if _, ok := err.(ErrImageNotFound); ok {
+			err = volJournal.UndoReservation(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
+				rbdVol.RbdImageName, rbdVol.RequestName)
+			return false, err
+		}
+		return false, err
+	}
+
+	// size checks
+	if rbdVol.VolSize < requestSize {
+		err = fmt.Errorf("image with the same name (%s) but with different size already exists",
+			rbdVol.RbdImageName)
+		return false, ErrVolNameConflict{rbdVol.RbdImageName, err}
+	}
+	// TODO: We should also ensure image features and format are the same
+
+	// found a volume already available, process and return it!
+	rbdVol.VolID, err = util.GenerateVolID(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
+		rbdVol.ClusterID, imageUUID, volIDVersion)
+	if err != nil {
+		return false, err
+	}
+
+	klog.V(4).Infof("Found existing volume (%s) with image name (%s) for request (%s)",
+		rbdVol.VolID, rbdVol.RbdImageName, rbdVol.RequestName)
+
+	return true, nil
+}
+
+// reserveSnap is a helper routine to request an rbdSnapshot name reservation and generate the
+// volume ID for the generated name
+func reserveSnap(rbdSnap *rbdSnapshot, credentials map[string]string) error {
+	key, err := getKey(rbdSnap.AdminID, credentials)
+	if err != nil {
+		return err
+	}
+
+	snapUUID, err := snapJournal.ReserveName(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
+		rbdSnap.RequestName, rbdSnap.RbdImageName)
+	if err != nil {
+		return err
+	}
+
+	rbdSnap.SnapID, err = util.GenerateVolID(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool,
+		rbdSnap.ClusterID, snapUUID, volIDVersion)
+	if err != nil {
+		return err
+	}
+
+	rbdSnap.RbdSnapName = snapJournal.NamingPrefix() + snapUUID
+
+	klog.V(4).Infof("Generated Volume ID (%s) and snapshot name (%s) for request name (%s)",
+		rbdSnap.SnapID, rbdSnap.RbdSnapName, rbdSnap.RequestName)
+
+	return nil
+}
+
+// reserveVol is a helper routine to request an rbdVolume name reservation and generate the
+// volume ID for the generated name
+func reserveVol(rbdVol *rbdVolume, credentials map[string]string) error {
+	key, err := getKey(rbdVol.AdminID, credentials)
+	if err != nil {
+		return err
+	}
+
+	imageUUID, err := volJournal.ReserveName(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
+		rbdVol.RequestName, "")
+	if err != nil {
+		return err
+	}
+
+	rbdVol.VolID, err = util.GenerateVolID(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool,
+		rbdVol.ClusterID, imageUUID, volIDVersion)
+	if err != nil {
+		return err
+	}
+
+	rbdVol.RbdImageName = volJournal.NamingPrefix() + imageUUID
+
+	klog.V(4).Infof("Generated Volume ID (%s) and image name (%s) for request name (%s)",
+		rbdVol.VolID, rbdVol.RbdImageName, rbdVol.RequestName)
+
+	return nil
+}
+
+// undoSnapReservation is a
helper routine to undo a name reservation for rbdSnapshot +func undoSnapReservation(rbdSnap *rbdSnapshot, credentials map[string]string) error { + key, err := getKey(rbdSnap.AdminID, credentials) + if err != nil { + return err + } + + err = snapJournal.UndoReservation(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool, + rbdSnap.RbdSnapName, rbdSnap.RequestName) + + return err +} + +// undoVolReservation is a helper routine to undo a name reservation for rbdVolume +func undoVolReservation(rbdVol *rbdVolume, credentials map[string]string) error { + key, err := getKey(rbdVol.AdminID, credentials) + if err != nil { + return err + } + + err = volJournal.UndoReservation(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool, + rbdVol.RbdImageName, rbdVol.RequestName) + + return err +} diff --git a/pkg/rbd/rbd_util.go b/pkg/rbd/rbd_util.go index 629d6215d45a..780250e3a3fa 100644 --- a/pkg/rbd/rbd_util.go +++ b/pkg/rbd/rbd_util.go @@ -211,7 +211,7 @@ func deleteImage(pOpts *rbdVolume, adminID string, credentials map[string]string return err } - err = unreserveVol(pOpts, credentials) + err = undoVolReservation(pOpts, credentials) if err != nil { klog.Errorf("failed to remove reservation for volume (%s) with backing image (%s) (%s)", pOpts.RequestName, pOpts.RbdImageName, err) @@ -292,7 +292,7 @@ func genSnapFromSnapID(rbdSnap *rbdSnapshot, snapshotID string, credentials map[ rbdSnap.ClusterID = vi.ClusterID options["clusterID"] = rbdSnap.ClusterID - rbdSnap.RbdSnapName = rbdSnapNamePrefix + vi.ObjectUUID + rbdSnap.RbdSnapName = snapJournal.NamingPrefix() + vi.ObjectUUID rbdSnap.Monitors, _, err = getMonsAndClusterID(options) if err != nil { @@ -311,16 +311,8 @@ func genSnapFromSnapID(rbdSnap *rbdSnapshot, snapshotID string, credentials map[ return err } - // TODO: fetch all omap vals in one call, than make multiple listomapvals - snapUUID := strings.TrimPrefix(rbdSnap.RbdSnapName, rbdSnapNamePrefix) - rbdSnap.RequestName, err = util.GetOMapValue(rbdSnap.Monitors, rbdSnap.AdminID, - key, rbdSnap.Pool, rbdSnapOMapPrefix+snapUUID, rbdSnapCSISnapNameKey) - if err != nil { - return err - } - - rbdSnap.RbdImageName, err = util.GetOMapValue(rbdSnap.Monitors, rbdSnap.AdminID, - key, rbdSnap.Pool, rbdSnapOMapPrefix+snapUUID, rbdSnapSourceImageKey) + rbdSnap.RequestName, rbdSnap.RbdImageName, err = snapJournal.GetObjectUUIDData(rbdSnap.Monitors, + rbdSnap.AdminID, key, rbdSnap.Pool, vi.ObjectUUID, true) if err != nil { return err } @@ -352,7 +344,7 @@ func genVolFromVolID(rbdVol *rbdVolume, volumeID string, credentials map[string] rbdVol.ClusterID = vi.ClusterID options["clusterID"] = rbdVol.ClusterID - rbdVol.RbdImageName = rbdImgNamePrefix + vi.ObjectUUID + rbdVol.RbdImageName = volJournal.NamingPrefix() + vi.ObjectUUID rbdVol.Monitors, _, err = getMonsAndClusterID(options) if err != nil { @@ -372,9 +364,8 @@ func genVolFromVolID(rbdVol *rbdVolume, volumeID string, credentials map[string] return err } - imageUUID := strings.TrimPrefix(rbdVol.RbdImageName, rbdImgNamePrefix) - rbdVol.RequestName, err = util.GetOMapValue(rbdVol.Monitors, rbdVol.AdminID, - key, rbdVol.Pool, rbdImageOMapPrefix+imageUUID, rbdImageCSIVolNameKey) + rbdVol.RequestName, _, err = volJournal.GetObjectUUIDData(rbdVol.Monitors, + rbdVol.AdminID, key, rbdVol.Pool, vi.ObjectUUID, false) if err != nil { return err } @@ -608,7 +599,7 @@ func deleteSnapshot(pOpts *rbdSnapshot, adminID string, credentials map[string]s return errors.Wrapf(err, "failed to delete snapshot, command output: %s", string(output)) } - if err := unreserveSnap(pOpts, 
credentials); err != nil { + if err := undoSnapReservation(pOpts, credentials); err != nil { klog.Errorf("failed to remove reservation for snapname (%s) with backing snap (%s) on image (%s) (%s)", pOpts.RequestName, pOpts.RbdSnapName, pOpts.RbdImageName, err) } diff --git a/pkg/rbd/voljournal.go b/pkg/rbd/voljournal.go deleted file mode 100644 index 323ddfc971fa..000000000000 --- a/pkg/rbd/voljournal.go +++ /dev/null @@ -1,554 +0,0 @@ -/* -Copyright 2019 The Ceph-CSI Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package rbd - -import ( - "fmt" - "strings" - - "github.com/ceph/ceph-csi/pkg/util" - - "github.com/pborman/uuid" - "github.com/pkg/errors" - "k8s.io/klog" -) - -func validateNonEmptyField(field, fieldName, structName string) error { - if field == "" { - return fmt.Errorf("value '%s' in '%s' structure cannot be empty", fieldName, structName) - } - - return nil -} - -func validateRbdSnap(rbdSnap *rbdSnapshot) error { - if err := validateNonEmptyField(rbdSnap.RequestName, "RequestName", "rbdSnapshot"); err != nil { - return err - } - - if err := validateNonEmptyField(rbdSnap.Monitors, "Monitors", "rbdSnapshot"); err != nil { - return err - } - - if err := validateNonEmptyField(rbdSnap.AdminID, "AdminID", "rbdSnapshot"); err != nil { - return err - } - - if err := validateNonEmptyField(rbdSnap.Pool, "Pool", "rbdSnapshot"); err != nil { - return err - } - - if err := validateNonEmptyField(rbdSnap.RbdImageName, "RbdImageName", "rbdSnapshot"); err != nil { - return err - } - - if err := validateNonEmptyField(rbdSnap.ClusterID, "ClusterID", "rbdSnapshot"); err != nil { - return err - } - - return nil -} - -func validateRbdVol(rbdVol *rbdVolume) error { - if err := validateNonEmptyField(rbdVol.RequestName, "RequestName", "rbdVolume"); err != nil { - return err - } - - if err := validateNonEmptyField(rbdVol.Monitors, "Monitors", "rbdVolume"); err != nil { - return err - } - - if err := validateNonEmptyField(rbdVol.AdminID, "AdminID", "rbdVolume"); err != nil { - return err - } - - if err := validateNonEmptyField(rbdVol.Pool, "Pool", "rbdVolume"); err != nil { - return err - } - - if err := validateNonEmptyField(rbdVol.ClusterID, "ClusterID", "rbdVolume"); err != nil { - return err - } - - if rbdVol.VolSize == 0 { - return errors.New("value 'VolSize' in 'rbdVolume' structure cannot be 0") - } - - return nil -} - -/* -checkSnapExists, and its counterpart checkVolExists, function as checks to determine if passed -in rbdSnapshot or rbdVolume exists on the backend. - -**NOTE:** These functions manipulate the rados omaps that hold information regarding -volume names as requested by the CSI drivers. Hence, these need to be invoked only when the -respective CSI driver generated snapshot or volume name based locks are held, as otherwise racy -access to these omaps may end up leaving them in an inconsistent state. - -These functions need enough information about cluster and pool (ie, Monitors, Pool, IDs filled in) -to operate. 
They further require that the RequestName element of the structure have a valid value -to operate on and determine if the said RequestName already exists on the backend. - -These functions populate the snapshot or the image name, its attributes and the CSI snapshot/volume -ID for the same when succesful. - -These functions also cleanup omap reservations that are stale. I.e when omap entries exist and -backing images or snapshots are missing, or one of the omaps exist and the next is missing. This is -because, the order of omap creation and deletion are inverse of each other, and protected by the -request name lock, and hence any stale omaps are leftovers from incomplete transactions and are -hence safe to garbage collect. -*/ -func checkSnapExists(rbdSnap *rbdSnapshot, credentials map[string]string) (found bool, err error) { - if err = validateRbdSnap(rbdSnap); err != nil { - return false, err - } - - key, err := getKey(rbdSnap.AdminID, credentials) - if err != nil { - return false, err - } - - // check if request name is already part of the snaps omap - snapUUID, err := util.GetOMapValue(rbdSnap.Monitors, rbdSnap.AdminID, - key, rbdSnap.Pool, csiSnapsDirectory, csiSnapNameKeyPrefix+rbdSnap.RequestName) - if err != nil { - // error should specifically be not found, for image to be absent, any other error - // is not conclusive, and we should not proceed - if _, ok := err.(util.ErrKeyNotFound); ok { - return false, nil - } - - return false, err - } - - rbdSnap.RbdSnapName = rbdSnapNamePrefix + snapUUID - - // TODO: use listomapvals to dump all keys instead of reading them one-by-one - // check if the snapshot image omap is present - savedSnapName, err := util.GetOMapValue(rbdSnap.Monitors, rbdSnap.AdminID, - key, rbdSnap.Pool, rbdSnapOMapPrefix+snapUUID, rbdSnapCSISnapNameKey) - if err != nil { - if _, ok := err.(util.ErrKeyNotFound); ok { - err = unreserveSnap(rbdSnap, credentials) - } - return false, err - } - - // check if snapshot image omap points back to the request name - if savedSnapName != rbdSnap.RequestName { - // NOTE: This should never be possible, hence no cleanup, but log error - // and return, as cleanup may need to occur manually! 
- return false, fmt.Errorf("internal state inconsistent, omap snap"+ - " names disagree, request name (%s) snap name (%s) image omap"+ - " snap name (%s)", rbdSnap.RequestName, rbdSnap.RbdSnapName, savedSnapName) - } - - // check if the snapshot source image omap is present - savedVolName, err := util.GetOMapValue(rbdSnap.Monitors, rbdSnap.AdminID, - key, rbdSnap.Pool, rbdSnapOMapPrefix+snapUUID, rbdSnapSourceImageKey) - if err != nil { - if _, ok := err.(util.ErrKeyNotFound); ok { - err = unreserveSnap(rbdSnap, credentials) - } - return false, err - } - - // check if snapshot source image omap points back to the source volume passed in - if savedVolName != rbdSnap.RbdImageName { - // NOTE: This can happen if there is a snapname conflict, and we alerady have a snapshot - // with the same name pointing to a different RBD image as the source - err = fmt.Errorf("snapname points to different image, request name (%s)"+ - " image name (%s) image omap"+" volume name (%s)", - rbdSnap.RequestName, rbdSnap.RbdImageName, savedVolName) - return false, ErrSnapNameConflict{rbdSnap.RequestName, err} - } - - // Fetch on-disk image attributes - err = updateSnapWithImageInfo(rbdSnap, credentials) - if err != nil { - if _, ok := err.(ErrSnapNotFound); ok { - err = unreserveSnap(rbdSnap, credentials) - return false, err - } - - return false, err - } - - // found a snapshot already available, process and return its information - poolID, err := util.GetPoolID(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool) - if err != nil { - return false, err - } - - vi := util.CSIIdentifier{ - PoolID: poolID, - EncodingVersion: volIDVersion, - ClusterID: rbdSnap.ClusterID, - ObjectUUID: snapUUID, - } - rbdSnap.SnapID, err = vi.ComposeCSIID() - if err != nil { - return false, err - } - - klog.V(4).Infof("Found existing snap (%s) with snap name (%s) for request (%s)", - rbdSnap.SnapID, rbdSnap.RbdSnapName, rbdSnap.RequestName) - - return true, nil -} - -/* -Check comment on checkSnapExists, to understand how this function behaves - -**NOTE:** These functions manipulate the rados omaps that hold information regarding -volume names as requested by the CSI drivers. Hence, these need to be invoked only when the -respective CSI snapshot or volume name based locks are held, as otherwise racy access to these -omaps may end up leaving the omaps in an inconsistent state. 
-*/ -func checkVolExists(rbdVol *rbdVolume, credentials map[string]string) (found bool, err error) { - var vi util.CSIIdentifier - - if err = validateRbdVol(rbdVol); err != nil { - return false, err - } - - key, err := getKey(rbdVol.AdminID, credentials) - if err != nil { - return false, err - } - - // check if request name is already part of the volumes omap - imageUUID, err := util.GetOMapValue(rbdVol.Monitors, rbdVol.AdminID, - key, rbdVol.Pool, csiVolsDirectory, csiVolNameKeyPrefix+rbdVol.RequestName) - if err != nil { - // error should specifically be not found, for image to be absent, any other error - // is not conclusive, and we should not proceed - if _, ok := err.(util.ErrKeyNotFound); ok { - return false, nil - } - - return false, err - } - - rbdVol.RbdImageName = rbdImgNamePrefix + imageUUID - - // check if the image omap is present - savedVolName, err := util.GetOMapValue(rbdVol.Monitors, rbdVol.AdminID, - key, rbdVol.Pool, rbdImageOMapPrefix+imageUUID, rbdImageCSIVolNameKey) - if err != nil { - if _, ok := err.(util.ErrKeyNotFound); ok { - err = unreserveVol(rbdVol, credentials) - } - return false, err - } - - // check if image omap points back to the request name - if savedVolName != rbdVol.RequestName { - // NOTE: This should never be possible, hence no cleanup, but log error - // and return, as cleanup may need to occur manually! - return false, fmt.Errorf("internal state inconsistent, omap volume"+ - " names disagree, request name (%s) image name (%s) image omap"+ - " volume name (%s)", rbdVol.RequestName, rbdVol.RbdImageName, savedVolName) - } - - // NOTE: Return volsize should be on-disk volsize, not request vol size, so - // save it for size checks before fetching image data - requestSize := rbdVol.VolSize - // Fetch on-disk image attributes and compare against request - err = updateVolWithImageInfo(rbdVol, credentials) - if err != nil { - if _, ok := err.(ErrImageNotFound); ok { - err = unreserveVol(rbdVol, credentials) - return false, err - } - - return false, err - } - - // size checks - if rbdVol.VolSize < requestSize { - err = fmt.Errorf("image with the same name (%s) but with different size already exists", - rbdVol.RbdImageName) - return false, ErrVolNameConflict{rbdVol.RbdImageName, err} - } - // TODO: We should also ensure image features and format is the same - - // found a volume already available, process and return it! - poolID, err := util.GetPoolID(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool) - if err != nil { - return false, err - } - - vi = util.CSIIdentifier{ - PoolID: poolID, - EncodingVersion: volIDVersion, - ClusterID: rbdVol.ClusterID, - ObjectUUID: imageUUID, - } - rbdVol.VolID, err = vi.ComposeCSIID() - if err != nil { - return false, err - } - - klog.V(4).Infof("Found existng volume (%s) with image name (%s) for request (%s)", - rbdVol.VolID, rbdVol.RbdImageName, rbdVol.RequestName) - - return true, nil -} - -/* -unreserveSnap and unreserveVol remove omaps associated with the snapshot and the image name, -and also remove the corresponding request name key in the snaps or volumes omaps respectively. - -This is performed within the request name lock, to ensure that requests with the same name do not -manipulate the omap entries concurrently. 
-*/ -func unreserveSnap(rbdSnap *rbdSnapshot, credentials map[string]string) error { - key, err := getKey(rbdSnap.AdminID, credentials) - if err != nil { - return err - } - - // delete snap image omap (first, inverse of create order) - snapUUID := strings.TrimPrefix(rbdSnap.RbdSnapName, rbdSnapNamePrefix) - err = util.RemoveObject(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool, rbdSnapOMapPrefix+snapUUID) - if err != nil { - if _, ok := err.(util.ErrObjectNotFound); !ok { - klog.Errorf("failed removing oMap %s (%s)", rbdSnapOMapPrefix+snapUUID, err) - return err - } - } - - // delete the request name omap key (last, inverse of create order) - err = util.RemoveOMapKey(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool, - csiSnapsDirectory, csiSnapNameKeyPrefix+rbdSnap.RequestName) - if err != nil { - klog.Errorf("failed removing oMap key %s (%s)", csiSnapNameKeyPrefix+rbdSnap.RequestName, err) - return err - } - - return nil -} - -func unreserveVol(rbdVol *rbdVolume, credentials map[string]string) error { - key, err := getKey(rbdVol.AdminID, credentials) - if err != nil { - return err - } - - // delete image omap (first, inverse of create order) - imageUUID := strings.TrimPrefix(rbdVol.RbdImageName, rbdImgNamePrefix) - err = util.RemoveObject(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool, rbdImageOMapPrefix+imageUUID) - if err != nil { - if _, ok := err.(util.ErrObjectNotFound); !ok { - klog.Errorf("failed removing oMap %s (%s)", rbdImageOMapPrefix+imageUUID, err) - return err - } - } - - // delete the request name omap key (last, inverse of create order) - err = util.RemoveOMapKey(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool, - csiVolsDirectory, csiVolNameKeyPrefix+rbdVol.RequestName) - if err != nil { - klog.Errorf("failed removing oMap key %s (%s)", csiVolNameKeyPrefix+rbdVol.RequestName, err) - return err - } - - return nil -} - -// reserveOMapName creates an omap with passed in oMapNamePrefix and a generated . -// It ensures generated omap name does not already exist and if conflicts are detected, a set -// number of retires with newer uuids are attempted before returning an error -func reserveOMapName(monitors, adminID, key, poolName, oMapNamePrefix string) (string, error) { - var iterUUID string - - maxAttempts := 5 - attempt := 1 - for attempt <= maxAttempts { - // generate a uuid for the image name - iterUUID = uuid.NewUUID().String() - - err := util.CreateObject(monitors, adminID, key, poolName, oMapNamePrefix+iterUUID) - if err != nil { - if _, ok := err.(util.ErrObjectExists); ok { - attempt++ - // try again with a different uuid, for maxAttempts tries - klog.V(4).Infof("uuid (%s) conflict detected, retrying (attempt %d of %d)", - iterUUID, attempt, maxAttempts) - continue - } - - return "", err - } - - break - } - - if attempt > maxAttempts { - return "", errors.New("uuid conflicts exceeds retry threshold") - } - - return iterUUID, nil -} - -/* -reserveSnap and reserveVol add respective entries to the volumes and snapshots omaps, post -generating a target snapshot or image name for use. Further, these functions create the snapshot or -image name omaps, to store back pointers to the CSI generated request names. - -This is performed within the request name lock, to ensure that requests with the same name do not -manipulate the omap entries concurrently. 
-*/ -func reserveSnap(rbdSnap *rbdSnapshot, credentials map[string]string) error { - var vi util.CSIIdentifier - - key, err := getKey(rbdSnap.AdminID, credentials) - if err != nil { - return err - } - - poolID, err := util.GetPoolID(rbdSnap.Monitors, rbdSnap.AdminID, key, - rbdSnap.Pool) - if err != nil { - return err - } - - // Create the snapUUID based omap first, to reserve the same and avoid conflicts - // NOTE: If any service loss occurs post creation of the snap omap, and before - // setting the omap key (rbdSnapCSISnapNameKey) to point back to the snaps omap, the - // snap omap key will leak - snapUUID, err := reserveOMapName(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool, - rbdSnapOMapPrefix) - if err != nil { - return err - } - - // Create request snapUUID key in csi snaps omap and store the uuid based - // snap name into it - err = util.SetOMapKeyValue(rbdSnap.Monitors, rbdSnap.AdminID, key, - rbdSnap.Pool, csiSnapsDirectory, csiSnapNameKeyPrefix+rbdSnap.RequestName, snapUUID) - if err != nil { - return err - } - defer func() { - if err != nil { - klog.Warningf("reservation failed for volume: %s", rbdSnap.RequestName) - errDefer := unreserveSnap(rbdSnap, credentials) - if errDefer != nil { - klog.Warningf("failed undoing reservation of snapshot: %s (%v)", - rbdSnap.RequestName, errDefer) - } - } - }() - - // Create snap name based omap and store CSI request name key and source information - err = util.SetOMapKeyValue(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool, - rbdSnapOMapPrefix+snapUUID, rbdSnapCSISnapNameKey, rbdSnap.RequestName) - if err != nil { - return err - } - err = util.SetOMapKeyValue(rbdSnap.Monitors, rbdSnap.AdminID, key, rbdSnap.Pool, - rbdSnapOMapPrefix+snapUUID, rbdSnapSourceImageKey, rbdSnap.RbdImageName) - if err != nil { - return err - } - - // generate the volume ID to return to the CO system - vi = util.CSIIdentifier{ - PoolID: poolID, - EncodingVersion: volIDVersion, - ClusterID: rbdSnap.ClusterID, - ObjectUUID: snapUUID, - } - rbdSnap.SnapID, err = vi.ComposeCSIID() - if err != nil { - return err - } - rbdSnap.RbdSnapName = rbdSnapNamePrefix + snapUUID - klog.V(4).Infof("Generated Volume ID (%s) and image name (%s) for request name (%s)", - rbdSnap.SnapID, rbdSnap.RbdImageName, rbdSnap.RequestName) - - return nil -} - -func reserveVol(rbdVol *rbdVolume, credentials map[string]string) error { - var vi util.CSIIdentifier - - key, err := getKey(rbdVol.AdminID, credentials) - if err != nil { - return err - } - - poolID, err := util.GetPoolID(rbdVol.Monitors, rbdVol.AdminID, key, - rbdVol.Pool) - if err != nil { - return err - } - - // Create the imageUUID based omap first, to reserve the same and avoid conflicts - // NOTE: If any service loss occurs post creation of the image omap, and before - // setting the omap key (rbdImageCSIVolNameKey) to point back to the volumes omap, - // the image omap key will leak - imageUUID, err := reserveOMapName(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool, rbdImageOMapPrefix) - if err != nil { - return err - } - - // Create request volName key in csi volumes omap and store the uuid based - // image name into it - err = util.SetOMapKeyValue(rbdVol.Monitors, rbdVol.AdminID, key, - rbdVol.Pool, csiVolsDirectory, csiVolNameKeyPrefix+rbdVol.RequestName, imageUUID) - if err != nil { - return err - } - defer func() { - if err != nil { - klog.Warningf("reservation failed for volume: %s", rbdVol.RequestName) - errDefer := unreserveVol(rbdVol, credentials) - if errDefer != nil { - klog.Warningf("failed undoing 
reservation of volume: %s (%v)", - rbdVol.RequestName, errDefer) - } - } - }() - - // Create image name based omap and store CSI request volume name key and data - err = util.SetOMapKeyValue(rbdVol.Monitors, rbdVol.AdminID, key, rbdVol.Pool, - rbdImageOMapPrefix+imageUUID, rbdImageCSIVolNameKey, rbdVol.RequestName) - if err != nil { - return err - } - - // generate the volume ID to return to the CO system - vi = util.CSIIdentifier{ - PoolID: poolID, - EncodingVersion: volIDVersion, - ClusterID: rbdVol.ClusterID, - ObjectUUID: imageUUID, - } - rbdVol.VolID, err = vi.ComposeCSIID() - if err != nil { - return err - } - rbdVol.RbdImageName = rbdImgNamePrefix + imageUUID - klog.V(4).Infof("Generated Volume ID (%s) and image name (%s) for request name (%s)", - rbdVol.VolID, rbdVol.RbdImageName, rbdVol.RequestName) - - return nil -} diff --git a/pkg/util/cephcmds.go b/pkg/util/cephcmds.go index d7434bec676b..8cf675726173 100644 --- a/pkg/util/cephcmds.go +++ b/pkg/util/cephcmds.go @@ -164,7 +164,8 @@ func GetOMapValue(monitors, adminID, key, poolName, oMapName, oMapKey string) (s "-p", poolName, "getomapval", oMapName, oMapKey, tmpFile.Name()) if err != nil { - // no logs, as attempting to check for key/value is done even on regular call sequences + // no logs, as attempting to check for non-existent key/value is done even on + // regular call sequences stdoutanderr := strings.Join([]string{string(stdout), string(stderr)}, " ") if strings.Contains(stdoutanderr, "No such key: "+poolName+"/"+oMapName+"/"+oMapKey) { return "", ErrKeyNotFound{poolName + "/" + oMapName + "/" + oMapKey, err} @@ -175,6 +176,10 @@ func GetOMapValue(monitors, adminID, key, poolName, oMapName, oMapKey string) (s return "", ErrKeyNotFound{poolName + "/" + oMapName + "/" + oMapKey, err} } + // log other errors for troubleshooting assistance + klog.Errorf("failed getting omap value for key (%s) from omap (%s) in pool (%s): (%v)", + oMapKey, oMapName, poolName, err) + return "", fmt.Errorf("error (%v) occured, command output streams is (%s)", err.Error(), stdoutanderr) } diff --git a/pkg/util/errors.go b/pkg/util/errors.go index 0406e5b164b8..afea1b601797 100644 --- a/pkg/util/errors.go +++ b/pkg/util/errors.go @@ -45,3 +45,14 @@ type ErrObjectNotFound struct { func (e ErrObjectNotFound) Error() string { return e.err.Error() } + +// ErrSnapNameConflict is generated when a requested CSI snap name already exists on RBD but with +// different properties, and hence is in conflict with the passed in CSI volume name +type ErrSnapNameConflict struct { + requestName string + err error +} + +func (e ErrSnapNameConflict) Error() string { + return e.err.Error() +} diff --git a/pkg/util/util.go b/pkg/util/util.go index ea8ec1fd61e7..eeaaae638ea1 100644 --- a/pkg/util/util.go +++ b/pkg/util/util.go @@ -91,3 +91,24 @@ func ValidateDriverName(driverName string) error { } return err } + +// GenerateVolID generates a volume ID based on passed in parameters and version, to be returned +// to the CO system +func GenerateVolID(monitors, id, key, pool, clusterID, objUUID string, volIDVersion uint16) (string, error) { + poolID, err := GetPoolID(monitors, id, key, pool) + if err != nil { + return "", err + } + + // generate the volume ID to return to the CO system + vi := CSIIdentifier{ + PoolID: poolID, + EncodingVersion: volIDVersion, + ClusterID: clusterID, + ObjectUUID: objUUID, + } + + volID, err := vi.ComposeCSIID() + + return volID, err +} diff --git a/pkg/util/voljournal.go b/pkg/util/voljournal.go new file mode 100644 index 
000000000000..b312b419dc40
--- /dev/null
+++ b/pkg/util/voljournal.go
@@ -0,0 +1,394 @@
+/*
+Copyright 2019 The Ceph-CSI Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package util
+
+import (
+	"fmt"
+	"strings"
+
+	"github.com/pborman/uuid"
+	"github.com/pkg/errors"
+	"k8s.io/klog"
+)
+
+/*
+RADOS omaps usage:
+
+This note details how we preserve the idempotent nature of create requests and retain the
+relationship between orchestrator (CO) generated names and plugin generated names for volumes and
+snapshots.
+
+NOTE: volume denotes an rbd image or a CephFS subvolume
+
+The implementation uses Ceph RADOS omaps to preserve the relationship between request name and
+generated volume (or snapshot) name. There are 4 types of omaps in use,
+- A "csi.volumes.[csi-id]" (or "csi.volumes"+.+CSIInstanceID), (referred to using csiDirectory variable)
+  - stores keys named using the CO generated names for volume requests (prefixed with csiNameKeyPrefix)
+  - keys are named "csi.volume."+[CO generated VolName]
+  - Key value contains the volume uuid that is created, for the CO provided name
+
+- A "csi.snaps.[csi-id]" (or "csi.snaps"+.+CSIInstanceID), (referred to using csiDirectory variable)
+  - stores keys named using the CO generated names for snapshot requests (prefixed with csiNameKeyPrefix)
+  - keys are named "csi.snap."+[CO generated SnapName]
+  - Key value contains the snapshot uuid that is created, for the CO provided name
+
+- A per volume omap named "csi.volume."+[volume uuid], (referred to as CephUUIDDirectory)
+  - stores a single key named "csi.volname", that has the value of the CO generated VolName that
+    this volume refers to (referred to using csiNameKey value)
+
+- A per snapshot omap named "csi.snap."+[snapshot uuid], (referred to as CephUUIDDirectory)
+  - stores a key named "csi.snapname", that has the value of the CO generated SnapName that this
+    snapshot refers to (referred to using csiNameKey value)
+  - also stores another key named "csi.source", that has the value of the volume name that is the
+    source of the snapshot (referred to using cephSnapSourceKey value)
+
+Creation of omaps:
+When a volume create request is received (or a snapshot create; snapshots are not detailed further
+in this comment, as the process is similar),
+- The csiDirectory is consulted to find if there is already a key with the CO VolName, and if
+present, it is used to read its references to reach the UUID that backs this VolName, to check if
+the UUID based volume can satisfy the requirements for the request
+  - If during the process of checking the same, it is found that some linking information is stale
+    or missing, the corresponding keys up to the key in the csiDirectory are cleaned up, to start
+    afresh
+
+- If the key with the CO VolName is not found, or was cleaned up, the request is treated as a
+new create request, and a CephUUIDDirectory is created first with a generated uuid; this ensures
+that we do not use a uuid that is already in use
+
+- Next, a key with the VolName is created in the csiDirectory, and its value is updated to store
+the generated uuid
+
+- This is followed by updating the CephUUIDDirectory with the VolName in the csiNameKey
+
+- Finally, the volume is created (or promoted from a snapshot, if a content source was provided),
+using the uuid and a corresponding name prefix (namingPrefix) as the volume name
+
+The entire operation is locked based on the VolName hash, to ensure there is only ever a single
+entity modifying the related omaps for a given VolName.
+
+This ensures the idempotent nature of creates, as the same CO generated VolName would attempt to
+use the same volume uuid to serve the request, as the relations are saved in the respective omaps.
+
+Deletion of omaps:
+Delete requests do not contain the VolName, hence deletion uses the volume ID, which is encoded
+with the volume uuid in it, to find the volume and the CephUUIDDirectory. The CephUUIDDirectory is
+read to get the VolName that this volume points to. This VolName can be further used to read and
+delete the key from the csiDirectory.
+
+As we trace back and find the VolName, we also take a hash based lock on the VolName before
+proceeding with deleting the volume and the related omap entries, to ensure there is only ever a
+single entity modifying the related omaps for a given VolName.
+*/
+
+type CSIJournal struct {
+	// csiDirectory is the name of the CSI volumes object map that contains CSI volume-name (or
+	// snapshot name) based keys
+	csiDirectory string
+
+	// CSI volume-name keyname prefix, for keys in csiDirectory, suffix is the CSI passed volume name
+	csiNameKeyPrefix string
+
+	// Per Ceph volume (RBD/FS-subvolume) object map name prefix, suffix is the generated volume uuid
+	cephUUIDDirectoryPrefix string
+
+	// CSI volume-name key in per Ceph volume object map, containing the CSI volume-name for which
+	// the Ceph volume was created
+	csiNameKey string
+
+	// source volume name key in per Ceph snapshot object map, containing the Ceph source volume
+	// uuid for which the snapshot was created
+	cephSnapSourceKey string
+
+	// volume name prefix for naming on Ceph rbd or FS, suffix is a uuid generated per volume
+	namingPrefix string
+}
+
+// NewCSIVolumeJournal returns an instance of volume keys
+func NewCSIVolumeJournal() *CSIJournal {
+	return &CSIJournal{
+		csiDirectory:            "csi.volumes",
+		csiNameKeyPrefix:        "csi.volume.",
+		cephUUIDDirectoryPrefix: "csi.volume.",
+		csiNameKey:              "csi.volname",
+		namingPrefix:            "csi-vol-",
+		cephSnapSourceKey:       "",
+	}
+}
+
+// NewCSISnapshotJournal returns an instance of snapshot keys
+func NewCSISnapshotJournal() *CSIJournal {
+	return &CSIJournal{
+		csiDirectory:            "csi.snaps",
+		csiNameKeyPrefix:        "csi.snap.",
+		cephUUIDDirectoryPrefix: "csi.snap.",
+		csiNameKey:              "csi.snapname",
+		namingPrefix:            "csi-snap-",
+		cephSnapSourceKey:       "csi.source",
+	}
+}
+
+// NamingPrefix returns the value of the naming prefix from the journal keys
+func (cj *CSIJournal) NamingPrefix() string {
+	return cj.namingPrefix
+}
+
+// SetCSIDirectorySuffix sets the given suffix for the csiDirectory omap
+func (cj *CSIJournal) SetCSIDirectorySuffix(suffix string) {
+	cj.csiDirectory = cj.csiDirectory + "." + suffix
+}
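A small usage sketch based on the constructors above, mirroring what Driver.Run does earlier in this patch: one journal instance for volumes and one for snapshots, each suffixed with the CSI instance ID so that multiple CSI instances can share a Ceph cluster without omap name collisions. The uuid values shown are placeholders.

```go
package main

import (
	"fmt"

	"github.com/ceph/ceph-csi/pkg/util"
)

func main() {
	// One journal for volumes, one for snapshots, as in Driver.Run.
	volJournal := util.NewCSIVolumeJournal()
	snapJournal := util.NewCSISnapshotJournal()

	volJournal.SetCSIDirectorySuffix("default")  // csiDirectory becomes "csi.volumes.default"
	snapJournal.SetCSIDirectorySuffix("default") // csiDirectory becomes "csi.snaps.default"

	// Generated names are the naming prefix plus a per-object uuid, e.g. "csi-vol-<uuid>".
	fmt.Println(volJournal.NamingPrefix() + "a3f66932-placeholder")
	fmt.Println(snapJournal.NamingPrefix() + "b47c21d8-placeholder")
}
```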
+ suffix +} + +/* +CheckReservation checks if given request name contains a valid reservation +- If there is a valid reservation, then the corresponding UUID for the volume/snapshot is returned +- If there is a reservation that is stale (or not fully cleaned up), it is garbage collected using +the UndoReservation call, as appropriate +- If a snapshot is being checked, then its source is matched to the parentName that is provided + +NOTE: As the function manipulates omaps, it should be called with a lock against the request name +held, to prevent parallel operations from modifying the state of the omaps for this request name. + +Return values: + - string: Contains the UUID that was reserved for the passed in reqName, empty if + there was no reservation found + - error: non-nil in case of any errors +*/ +func (cj *CSIJournal) CheckReservation(monitors, id, key, pool, reqName, parentName string) (string, error) { + var snapSource bool + + if parentName != "" { + if cj.cephSnapSourceKey == "" { + err := errors.New("invalid snapshot request on volume journal") + return "", err + } + snapSource = true + } + + // check if request name is already part of the directory omap + objUUID, err := GetOMapValue(monitors, id, key, pool, cj.csiDirectory, + cj.csiNameKeyPrefix+reqName) + if err != nil { + // error should specifically be not found, for volume to be absent, any other error + // is not conclusive, and we should not proceed + if _, ok := err.(ErrKeyNotFound); ok { + return "", nil + } + return "", err + } + + savedReqName, savedReqParentName, err := cj.GetObjectUUIDData(monitors, id, key, pool, + objUUID, snapSource) + if err != nil { + // error should specifically be not found, for image to be absent, any other error + // is not conclusive, and we should not proceed + if _, ok := err.(ErrKeyNotFound); ok { + err = cj.UndoReservation(monitors, id, key, pool, cj.namingPrefix+objUUID, reqName) + } + return "", err + } + + // check if UUID key points back to the request name + if savedReqName != reqName { + // NOTE: This should never be possible, hence no cleanup, but log error + // and return, as cleanup may need to occur manually! + return "", fmt.Errorf("internal state inconsistent, omap names disagree,"+ + " request name (%s) volume UUID (%s) volume omap name (%s)", + reqName, objUUID, savedReqName) + } + + if snapSource { + // check if source UUID key points back to the parent volume passed in + if savedReqParentName != parentName { + // NOTE: This can happen if there is a snapname conflict, and we already have a snapshot + // with the same name pointing to a different UUID as the source + err = fmt.Errorf("snapname points to different volume, request name (%s)"+ + " source name (%s) saved source name (%s)", + reqName, parentName, savedReqParentName) + return "", ErrSnapNameConflict{reqName, err} + } + } + + return objUUID, nil +} + +/* +UndoReservation undoes a reservation, in the reverse order of ReserveName +- The UUID directory is cleaned up before the VolName key in the csiDirectory is cleaned up + +NOTE: Ensure that the Ceph volume (image or FS subvolume) backing the reservation is cleaned up +prior to cleaning up the reservation + +NOTE: As the function manipulates omaps, it should be called with a lock against the request name +held, to prevent parallel operations from modifying the state of the omaps for this request name. 
+*/
+func (cj *CSIJournal) UndoReservation(monitors, id, key, pool, volName, reqName string) error {
+	// delete volume UUID omap (first, inverse of create order)
+	// TODO: Check cases where volName can be empty, and we need to just cleanup the reqName
+	imageUUID := strings.TrimPrefix(volName, cj.namingPrefix)
+	err := RemoveObject(monitors, id, key, pool, cj.cephUUIDDirectoryPrefix+imageUUID)
+	if err != nil {
+		if _, ok := err.(ErrObjectNotFound); !ok {
+			klog.Errorf("failed removing oMap %s (%s)", cj.cephUUIDDirectoryPrefix+imageUUID, err)
+			return err
+		}
+	}
+
+	// delete the request name key (last, inverse of create order)
+	err = RemoveOMapKey(monitors, id, key, pool, cj.csiDirectory,
+		cj.csiNameKeyPrefix+reqName)
+	if err != nil {
+		klog.Errorf("failed removing oMap key %s (%s)", cj.csiNameKeyPrefix+reqName, err)
+		return err
+	}
+
+	return err
+}
+
+// reserveOMapName creates an omap with the passed in oMapNamePrefix and a generated <uuid>.
+// It ensures the generated omap name does not already exist, and if conflicts are detected, a set
+// number of retries with newer uuids are attempted before returning an error
+func reserveOMapName(monitors, id, key, pool, oMapNamePrefix string) (string, error) {
+	var iterUUID string
+
+	maxAttempts := 5
+	attempt := 1
+	for attempt <= maxAttempts {
+		// generate a uuid for the image name
+		iterUUID = uuid.NewUUID().String()
+
+		err := CreateObject(monitors, id, key, pool, oMapNamePrefix+iterUUID)
+		if err != nil {
+			if _, ok := err.(ErrObjectExists); ok {
+				attempt++
+				// try again with a different uuid, for maxAttempts tries
+				klog.V(4).Infof("uuid (%s) conflict detected, retrying (attempt %d of %d)",
+					iterUUID, attempt, maxAttempts)
+				continue
+			}
+
+			return "", err
+		}
+
+		break
+	}
+
+	if attempt > maxAttempts {
+		return "", errors.New("uuid conflicts exceed retry threshold")
+	}
+
+	return iterUUID, nil
+}
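The retry-on-conflict pattern in reserveOMapName is worth isolating: the uuid-named object doubles as a uniqueness lock, so a name clash simply triggers another attempt with a fresh uuid. A self-contained sketch of the same loop follows; `fakeCreateObject` and the in-memory map are stand-ins for util.CreateObject and RADOS, introduced here only for illustration.

```go
package main

import (
	"errors"
	"fmt"

	"github.com/pborman/uuid"
)

var errObjectExists = errors.New("object exists")

// fakeCreateObject stands in for util.CreateObject; it reports a conflict for any
// name already present, the way RADOS reports an existing object.
func fakeCreateObject(existing map[string]bool, name string) error {
	if existing[name] {
		return errObjectExists
	}
	existing[name] = true
	return nil
}

// reserveName shows the same loop as reserveOMapName above: generate a uuid, try to
// create the uniquely named object, and retry with a fresh uuid on conflict, up to
// maxAttempts times.
func reserveName(existing map[string]bool, prefix string) (string, error) {
	const maxAttempts = 5
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		iterUUID := uuid.NewUUID().String()
		err := fakeCreateObject(existing, prefix+iterUUID)
		if err == nil {
			return iterUUID, nil
		}
		if !errors.Is(err, errObjectExists) {
			return "", err
		}
	}
	return "", errors.New("uuid conflicts exceed retry threshold")
}

func main() {
	existing := map[string]bool{}
	u, err := reserveName(existing, "csi.volume.")
	fmt.Println(u, err)
}
```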
+
+/*
+ReserveName adds respective entries to the csiDirectory omaps, after generating a target
+UUIDDirectory for use. Further, it updates the UUIDDirectory omap to store back
+pointers to the CSI generated request names.
+
+NOTE: As the function manipulates omaps, it should be called with a lock against the request name
+held, to prevent parallel operations from modifying the state of the omaps for this request name.
+
+Return values:
+	- string: Contains the UUID that was reserved for the passed in reqName
+	- error: non-nil in case of any errors
+*/
+func (cj *CSIJournal) ReserveName(monitors, id, key, pool, reqName, parentName string) (string, error) {
+	var snapSource bool
+
+	if parentName != "" {
+		if cj.cephSnapSourceKey == "" {
+			err := errors.New("invalid snapshot request on volume journal")
+			return "", err
+		}
+		snapSource = true
+	}
+
+	// Create the UUID based omap first, to reserve the same and avoid conflicts
+	// NOTE: If any service loss occurs post creation of the UUID directory, and before
+	// setting the request name key (csiNameKey) to point back to the UUID directory, the
+	// UUID directory key will be leaked
+	volUUID, err := reserveOMapName(monitors, id, key, pool, cj.cephUUIDDirectoryPrefix)
+	if err != nil {
+		return "", err
+	}
+
+	// Create the request name (csiNameKey) key in csiDirectory and store the UUID based
+	// volume name in it
+	err = SetOMapKeyValue(monitors, id, key, pool, cj.csiDirectory, cj.csiNameKeyPrefix+reqName,
+		volUUID)
+	if err != nil {
+		return "", err
+	}
+	defer func() {
+		if err != nil {
+			klog.Warningf("reservation failed for volume: %s", reqName)
+			errDefer := cj.UndoReservation(monitors, id, key, pool, cj.namingPrefix+volUUID,
+				reqName)
+			if errDefer != nil {
+				klog.Warningf("failed undoing reservation of volume: %s (%v)", reqName, errDefer)
+			}
+		}
+	}()
+
+	// Update the UUID directory to store the CSI request name
+	err = SetOMapKeyValue(monitors, id, key, pool, cj.cephUUIDDirectoryPrefix+volUUID,
+		cj.csiNameKey, reqName)
+	if err != nil {
+		return "", err
+	}
+
+	if snapSource {
+		// Update the UUID directory to store the source volume UUID in case of snapshots
+		err = SetOMapKeyValue(monitors, id, key, pool, cj.cephUUIDDirectoryPrefix+volUUID,
+			cj.cephSnapSourceKey, parentName)
+		if err != nil {
+			return "", err
+		}
+	}
+
+	return volUUID, nil
+}
+
+/*
+GetObjectUUIDData fetches all keys from a UUID directory
+Return values:
+	- string: Contains the request name for the passed in UUID
+	- string: Contains the parent image name for the passed in UUID, if it is a snapshot
+	- error: non-nil in case of any errors
+*/
+func (cj *CSIJournal) GetObjectUUIDData(monitors, id, key, pool, objectUUID string, snapSource bool) (string, string, error) {
+	var sourceName string
+
+	if snapSource && cj.cephSnapSourceKey == "" {
+		err := errors.New("invalid snapshot request on volume journal")
+		return "", "", err
+	}
+
+	// TODO: fetch all omap vals in one call, rather than making multiple listomapvals calls
+	requestName, err := GetOMapValue(monitors, id, key, pool,
+		cj.cephUUIDDirectoryPrefix+objectUUID, cj.csiNameKey)
+	if err != nil {
+		return "", "", err
+	}
+
+	if snapSource {
+		sourceName, err = GetOMapValue(monitors, id, key, pool,
+			cj.cephUUIDDirectoryPrefix+objectUUID, cj.cephSnapSourceKey)
+		if err != nil {
+			return "", "", err
+		}
+	}
+
+	return requestName, sourceName, nil
+}
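To close, a sketch of the full reservation lifecycle as a caller of the new util.CSIJournal API would drive it: CheckReservation for idempotency, ReserveName for a new request, and UndoReservation on a failed create. The signatures are the ones added in this patch; the monitor address, user ID, secret key, and pool values are placeholders, and the program compiles but needs a reachable Ceph cluster to actually run the omap operations.

```go
package main

import (
	"fmt"

	"github.com/ceph/ceph-csi/pkg/util"
)

func main() {
	// Placeholder cluster parameters; real callers take these from the CSI
	// request secrets and the cluster configuration.
	monitors, id, key, pool := "192.168.0.1:6789", "admin", "<secret-key>", "rbd"

	vj := util.NewCSIVolumeJournal()
	vj.SetCSIDirectorySuffix("default")

	reqName := "pvc-0001"

	// Idempotency check: a previous, completed reservation returns the same uuid.
	if uuid, err := vj.CheckReservation(monitors, id, key, pool, reqName, ""); err == nil && uuid != "" {
		fmt.Println("existing image:", vj.NamingPrefix()+uuid)
		return
	}

	// Reserve a new name for this request.
	uuid, err := vj.ReserveName(monitors, id, key, pool, reqName, "")
	if err != nil {
		fmt.Println("reserve failed:", err)
		return
	}
	fmt.Println("reserved image:", vj.NamingPrefix()+uuid)

	// ... create the RBD image here; if that fails, roll the reservation back:
	_ = vj.UndoReservation(monitors, id, key, pool, vj.NamingPrefix()+uuid, reqName)
}
```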