diff --git a/examples/README.md b/examples/README.md
index d309cdcaf1d7..af26f8846464 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -114,3 +114,86 @@ To restore the snapshot to a new PVC, deploy
 kubectl create -f pvc-restore.yaml
 kubectl create -f pod-restore.yaml
 ```
+
+## How to test RBD MULTI_NODE_MULTI_WRITER BLOCK feature
+
+Requires feature-gates: `BlockVolume=true` `CSIBlockVolume=true`
+
+*NOTE* The MULTI_NODE_MULTI_WRITER capability is only available for
+Volumes that are of access_type `block`.
+
+*WARNING* This feature is strictly for workloads that know how to deal
+with concurrent access to the Volume (e.g. active/passive applications).
+Using RWX modes on non-clustered file systems with applications trying
+to simultaneously access the Volume will likely result in data
+corruption!
+
+The following example issues a request for a `Block` `ReadWriteMany`
+Claim and then uses the resulting Claim in a POD:
+
+```
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: block-pvc
+spec:
+  accessModes:
+    - ReadWriteMany
+  volumeMode: Block
+  resources:
+    requests:
+      storage: 1Gi
+```
+
+Create a POD that uses this PVC:
+
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  name: my-pod
+spec:
+  containers:
+    - name: my-container
+      image: debian
+      command: ["/bin/bash", "-c"]
+      args: ["tail -f /dev/null"]
+      volumeDevices:
+        - devicePath: /dev/rbdblock
+          name: my-volume
+      imagePullPolicy: IfNotPresent
+  volumes:
+    - name: my-volume
+      persistentVolumeClaim:
+        claimName: block-pvc
+```
+
+Wait for the POD to enter Running state, then check that the block
+device is available in the container at `/dev/rbdblock`.
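+
+One quick way to verify this (assuming the POD name `my-pod` from the
+example above):
+
+```
+kubectl exec -it my-pod -- ls -al /dev/rbdblock
+```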
+
+Now we can create a second POD that uses this PVC at the same time
+(ensure the POD is scheduled on a different node; multi-writer on a
+single node works without this feature). Again, wait for the POD to
+enter Running state and verify that the block device is available:
+
+```
+apiVersion: v1
+kind: Pod
+metadata:
+  name: another-pod
+spec:
+  containers:
+    - name: my-container
+      image: debian
+      command: ["/bin/bash", "-c"]
+      args: ["tail -f /dev/null"]
+      volumeDevices:
+        - devicePath: /dev/rbdblock
+          name: my-volume
+      imagePullPolicy: IfNotPresent
+  volumes:
+    - name: my-volume
+      persistentVolumeClaim:
+        claimName: block-pvc
+```
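+
+As a rough smoke test of the multi-writer behavior, you can write a
+block from one POD and read it back from the other (POD names as in
+the examples above; the direct I/O flags keep the page cache from
+hiding the write):
+
+```
+kubectl exec my-pod -- sh -c \
+  'echo hello-from-pod-one | dd of=/dev/rbdblock bs=512 count=1 conv=sync oflag=direct'
+kubectl exec another-pod -- sh -c \
+  'dd if=/dev/rbdblock bs=512 count=1 iflag=direct 2>/dev/null | head -c 18'
+```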
diff --git a/pkg/rbd/controllerserver.go b/pkg/rbd/controllerserver.go
index ba7cb9092058..23a1bda0faa3 100644
--- a/pkg/rbd/controllerserver.go
+++ b/pkg/rbd/controllerserver.go
@@ -93,7 +93,25 @@ func (cs *ControllerServer) validateVolumeReq(req *csi.CreateVolumeRequest) erro
 func parseVolCreateRequest(req *csi.CreateVolumeRequest) (*rbdVolume, error) {
     // TODO (sbezverk) Last check for not exceeding total storage capacity
-    rbdVol, err := getRBDVolumeOptions(req.GetParameters())
+    isMultiNode := false
+    isBlock := false
+    for _, cap := range req.VolumeCapabilities {
+        // Only checking SINGLE_NODE_WRITER here because, regardless of the
+        // other multi-node modes (e.g. MULTI_NODE_READER_ONLY), we need the
+        // same logic to ignore the in-use response
+        if cap.GetAccessMode().GetMode() != csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER {
+            isMultiNode = true
+        }
+        if cap.GetBlock() != nil {
+            isBlock = true
+        }
+    }
+
+    // We want to fail early if the user is trying to create an RWX volume on
+    // a non-block type device
+    if isMultiNode && !isBlock {
+        return nil, fmt.Errorf("multi node access modes are only supported on block type devices with rbd")
+    }
+
+    // if it's NOT SINGLE_NODE_WRITER and it IS BLOCK, set the parameter to
+    // ignore the in-use checks
+    rbdVol, err := getRBDVolumeOptions(req.GetParameters(), isMultiNode && isBlock)
     if err != nil {
         return nil, status.Error(codes.InvalidArgument, err.Error())
     }
diff --git a/pkg/rbd/nodeserver.go b/pkg/rbd/nodeserver.go
index 21d7ae8299a3..44ee67c51c18 100644
--- a/pkg/rbd/nodeserver.go
+++ b/pkg/rbd/nodeserver.go
@@ -48,6 +48,7 @@ type NodeServer struct {
 func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublishVolumeRequest) (*csi.NodePublishVolumeResponse, error) {
     targetPath := req.GetTargetPath()
     targetPathMutex.LockKey(targetPath)
+    disableInUseChecks := false
 
     defer func() {
         if err := targetPathMutex.UnlockKey(targetPath); err != nil {
@@ -70,7 +71,18 @@ func (ns *NodeServer) NodePublishVolume(ctx context.Context, req *csi.NodePublis
     if !notMnt {
         return &csi.NodePublishVolumeResponse{}, nil
     }
-    volOptions, err := getRBDVolumeOptions(req.GetVolumeContext())
+
+    // MULTI_NODE_MULTI_WRITER is supported by default for Block access type volumes
+    if req.VolumeCapability.AccessMode.Mode == csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER {
+        if isBlock {
+            disableInUseChecks = true
+        } else {
+            klog.Warningf("MULTI_NODE_MULTI_WRITER currently only supported with volumes of access type `block`, invalid AccessMode for volume: %v", req.GetVolumeId())
+            return nil, fmt.Errorf("rbd: MULTI_NODE_MULTI_WRITER access mode only allowed with BLOCK access type")
+        }
+    }
+
+    volOptions, err := getRBDVolumeOptions(req.GetVolumeContext(), disableInUseChecks)
     if err != nil {
         return nil, err
     }
diff --git a/pkg/rbd/rbd.go b/pkg/rbd/rbd.go
index 73911aec4283..3fab59e015a6 100644
--- a/pkg/rbd/rbd.go
+++ b/pkg/rbd/rbd.go
@@ -102,7 +102,14 @@ func (r *Driver) Run(driverName, nodeID, endpoint string, containerized bool, ca
         csi.ControllerServiceCapability_RPC_LIST_SNAPSHOTS,
         csi.ControllerServiceCapability_RPC_CLONE_VOLUME,
     })
-    r.cd.AddVolumeCapabilityAccessModes([]csi.VolumeCapability_AccessMode_Mode{csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER})
+
+    // We only support the multi-writer option when using block, but it's a
+    // supported capability for the plugin in general. We also want to add the
+    // remaining modes like MULTI_NODE_READER_ONLY and MULTI_NODE_SINGLE_WRITER,
+    // but need to do some verification of the RO modes first; those will come
+    // as follow-up features
+    r.cd.AddVolumeCapabilityAccessModes(
+        []csi.VolumeCapability_AccessMode_Mode{csi.VolumeCapability_AccessMode_SINGLE_NODE_WRITER,
+            csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER})
 
     // Create GRPC servers
     r.ids = NewIdentityServer(r.cd)
diff --git a/pkg/rbd/rbd_attach.go b/pkg/rbd/rbd_attach.go
index 354554d120c0..613cfa765e26 100644
--- a/pkg/rbd/rbd_attach.go
+++ b/pkg/rbd/rbd_attach.go
@@ -258,6 +258,7 @@ func attachRBDImage(volOptions *rbdVolume, userID string, credentials map[string
         Factor: rbdImageWatcherFactor,
         Steps:  rbdImageWatcherSteps,
     }
+
     err = waitForrbdImage(backoff, volOptions, userID, credentials)
 
     if err != nil {
@@ -313,6 +314,10 @@ func waitForrbdImage(backoff wait.Backoff, volOptions *rbdVolume, userID string,
         if err != nil {
             return false, fmt.Errorf("fail to check rbd image status with: (%v), rbd output: (%s)", err, rbdOutput)
         }
+        if volOptions.DisableInUseChecks && used {
+            klog.V(2).Info("valid multi-node attach requested, ignoring watcher in-use result")
+            return used, nil
+        }
         return !used, nil
     })
     // return error if rbd image has not become available for the specified timeout
diff --git a/pkg/rbd/rbd_util.go b/pkg/rbd/rbd_util.go
index 5f821549616c..aa5b19f79095 100644
--- a/pkg/rbd/rbd_util.go
+++ b/pkg/rbd/rbd_util.go
@@ -51,6 +51,7 @@ type rbdVolume struct {
     AdminID            string `json:"adminId"`
     UserID             string `json:"userId"`
     Mounter            string `json:"mounter"`
+    DisableInUseChecks bool   `json:"disableInUseChecks"`
 }
 
 type rbdSnapshot struct {
@@ -226,7 +227,7 @@ func execCommand(command string, args []string) ([]byte, error) {
     return cmd.CombinedOutput()
 }
 
-func getRBDVolumeOptions(volOptions map[string]string) (*rbdVolume, error) {
+func getRBDVolumeOptions(volOptions map[string]string, disableInUseChecks bool) (*rbdVolume, error) {
     var ok bool
     rbdVol := &rbdVolume{}
     rbdVol.Pool, ok = volOptions["pool"]
@@ -259,6 +260,10 @@ func getRBDVolumeOptions(volOptions map[string]string) (*rbdVolume, error) {
         }
     }
 
+    klog.V(3).Infof("setting disableInUseChecks on rbd volume to: %v", disableInUseChecks)
+    rbdVol.DisableInUseChecks = disableInUseChecks
+
     getCredsFromVol(rbdVol, volOptions)
     return rbdVol, nil
 }
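
A minimal sketch of a unit test exercising the new fail-early validation in
`parseVolCreateRequest` (illustrative only, not part of the patch; it assumes
the CSI spec Go bindings this package already imports, and the test name is
hypothetical):

```go
package rbd

import (
	"testing"

	"github.com/container-storage-interface/spec/lib/go/csi"
)

// A multi-node access mode combined with a mount (filesystem) access type
// must be rejected before any rbd volume options are parsed.
func TestParseVolCreateRequestRejectsMultiNodeMount(t *testing.T) {
	req := &csi.CreateVolumeRequest{
		Name: "test-vol",
		VolumeCapabilities: []*csi.VolumeCapability{
			{
				AccessType: &csi.VolumeCapability_Mount{
					Mount: &csi.VolumeCapability_MountVolume{},
				},
				AccessMode: &csi.VolumeCapability_AccessMode{
					Mode: csi.VolumeCapability_AccessMode_MULTI_NODE_MULTI_WRITER,
				},
			},
		},
	}
	if _, err := parseVolCreateRequest(req); err == nil {
		t.Fatal("expected multi-node access on a mount volume to be rejected")
	}
}
```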