Skip to content

Commit

Permalink
vSphere snapshot delete error checking (#1046)
Browse files Browse the repository at this point in the history
* error checking

* after carls review

* minor grammar

* remove unused error message
  • Loading branch information
bathina2 committed Jul 8, 2021
1 parent 404c7fb commit dfed9b1
Showing 1 changed file with 32 additions and 10 deletions.
42 changes: 32 additions & 10 deletions pkg/blockstorage/vmware/vmware.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"context"
"fmt"
"net/url"
"strings"
"time"

"github.com/pkg/errors"
Expand All @@ -15,6 +14,7 @@ import (
"github.com/vmware/govmomi/vim25/soap"
"github.com/vmware/govmomi/vim25/types"
"github.com/vmware/govmomi/vslm"
vslmtypes "github.com/vmware/govmomi/vslm/types"
"k8s.io/apimachinery/pkg/util/wait"

"github.com/kanisterio/kanister/pkg/blockstorage"
Expand All @@ -40,7 +40,6 @@ const (
noDescription = ""
defaultWaitTime = 10 * time.Minute
defaultRetryLimit = 30 * time.Minute
invalidStateError = "The operation is not allowed in the current state"
)

// FcdProvider provides blockstorage.Provider
Expand Down Expand Up @@ -191,10 +190,18 @@ func (p *FcdProvider) SnapshotCreate(ctx context.Context, volume blockstorage.Vo
log.Debug().Print("Started CreateSnapshot task", field.M{"VolumeID": volume.ID})
res, lerr = task.Wait(ctx, defaultWaitTime)
if lerr != nil {
if strings.Contains(lerr.Error(), invalidStateError) {
log.Debug().Print("Retrying CreateSnapshot task", field.M{"VolumeID": volume.ID})
// retry operation
return false, nil
if soap.IsVimFault(lerr) {
switch soap.ToVimFault(lerr).(type) {
case *types.InvalidState:
log.Error().WithError(lerr).Print("There is some operation, other than this CreateSnapshot invocation, on the VM attached still being protected by its VM state. Will retry")
return false, nil
case *vslmtypes.VslmSyncFault:
log.Error().WithError(lerr).Print("CreateSnapshot failed with VslmSyncFault error possibly due to race between concurrent DeleteSnapshot invocation. Will retry")
return false, nil
case *types.NotFound:
log.Error().WithError(lerr).Print("CreateSnapshot failed with NotFound error. Will retry")
return false, nil
}
}
return false, errors.Wrap(lerr, "Failed to wait on task")
}
Expand Down Expand Up @@ -242,10 +249,25 @@ func (p *FcdProvider) SnapshotDelete(ctx context.Context, snapshot *blockstorage
log.Debug().Print("Started SnapshotDelete task", field.M{"VolumeID": volID, "SnapshotID": snapshotID})
_, lerr = task.Wait(ctx, defaultWaitTime)
if lerr != nil {
if strings.Contains(lerr.Error(), invalidStateError) {
log.Debug().Print("Retrying SnapshotDelete task", field.M{"VolumeID": volID, "SnapshotID": snapshotID})
// retry operation
return false, nil
// The following error handling was pulled from https://github.com/vmware-tanzu/astrolabe/blob/91eeed4dcf77edd1387a25e984174f159d66fedb/pkg/ivd/ivd_protected_entity.go#L433
if soap.IsVimFault(lerr) {
switch soap.ToVimFault(lerr).(type) {
case *types.InvalidArgument:
log.Error().WithError(lerr).Print("Disk doesn't have given snapshot due to the snapshot stamp being removed in the previous DeleteSnapshot operation which failed with an InvalidState fault. It will be resolved by the next snapshot operation on the same VM. Will NOT retry")
return true, nil
case *types.NotFound:
log.Error().WithError(lerr).Print("There is a temporary catalog mismatch due to a race condition with one another concurrent DeleteSnapshot operation. It will be resolved by the next consolidateDisks operation on the same VM. Will NOT retry")
return true, nil
case *types.InvalidState:
log.Error().WithError(lerr).Print("There is some operation, other than this DeleteSnapshot invocation, on the same VM still being protected by its VM state. Will retry")
return false, nil
case *types.TaskInProgress:
log.Error().WithError(lerr).Print("There is some other InProgress operation on the same VM. Will retry")
return false, nil
case *types.FileLocked:
log.Error().WithError(lerr).Print("An error occurred while consolidating disks: Failed to lock the file. Will retry")
return false, nil
}
}
return false, errors.Wrap(lerr, "Failed to wait on task")
}
Expand Down

0 comments on commit dfed9b1

Please sign in to comment.