Skip to content

Commit

Permalink
Consider some VslmSyncFault errors fatal during FCD SnapshotCreate. (#…
Browse files Browse the repository at this point in the history
…1511)

* Consider some VslmSyncFault fatal during SnapshotCreate.
More logging during snapshot create.

* Fixed lint issue.

* Get VMware snapshot info before attempting to create a new snapshot.

* Revert "Get VMware snapshot info before attempting to create a new snapshot."

This reverts commit 0c2b84c.

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
  • Loading branch information
carlbraganza and mergify[bot] authored Jun 28, 2022
1 parent 87a5f5a commit 26959e7
Show file tree
Hide file tree
Showing 2 changed files with 236 additions and 12 deletions.
119 changes: 109 additions & 10 deletions pkg/blockstorage/vmware/vmware.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@ import (
"fmt"
"net/url"
"os"
"regexp"
"strconv"
"strings"
"time"

uuid "github.com/gofrs/uuid"
"github.com/pkg/errors"
"github.com/vmware/govmomi/cns"
govmomitask "github.com/vmware/govmomi/task"
"github.com/vmware/govmomi/vapi/rest"
vapitags "github.com/vmware/govmomi/vapi/tags"
"github.com/vmware/govmomi/vim25"
Expand Down Expand Up @@ -244,28 +246,36 @@ func (p *FcdProvider) SnapshotCopyWithArgs(ctx context.Context, from blockstorag
return nil, errors.New("Copy Snapshot with Args not implemented")
}

var reVslmSyncFaultFatal = regexp.MustCompile("Change tracking invalid or disk in use")

// SnapshotCreate is part of blockstorage.Provider
func (p *FcdProvider) SnapshotCreate(ctx context.Context, volume blockstorage.Volume, tags map[string]string) (*blockstorage.Snapshot, error) {
var res types.AnyType
err := wait.PollImmediate(time.Second, defaultRetryLimit, func() (bool, error) {
log.Debug().Print("CreateSnapshot", field.M{"VolumeID": volume.ID})
task, lerr := p.Gom.CreateSnapshot(ctx, vimID(volume.ID), noDescription)
if lerr != nil {
return false, errors.Wrap(lerr, "Failed to create snapshot")
return false, errors.Wrap(lerr, "CreateSnapshot task creation failure")
}
log.Debug().Print("Started CreateSnapshot task", field.M{"VolumeID": volume.ID})
res, lerr = task.Wait(ctx, vmWareTimeout)
if lerr != nil {
if soap.IsVimFault(lerr) {
switch soap.ToVimFault(lerr).(type) {
case *types.InvalidState:
log.Error().WithError(lerr).Print("There is some operation, other than this CreateSnapshot invocation, on the VM attached still being protected by its VM state. Will retry")
return false, nil
case *vslmtypes.VslmSyncFault:
log.Error().WithError(lerr).Print("CreateSnapshot failed with VslmSyncFault error possibly due to race between concurrent DeleteSnapshot invocation. Will retry")
log.Error().WithError(lerr).Print("There is some operation, other than this CreateSnapshot invocation, on the VM attached still being protected by its VM state. Will retry", field.M{"VolumeID": volume.ID})
return false, nil
case *vslmtypes.VslmSyncFault: // potentially can leak snapshots
if vFault, ok := soap.ToVimFault(lerr).(*vslmtypes.VslmSyncFault); ok {
log.Error().Print(fmt.Sprintf("VslmSyncFault: %#v", vFault))
}
if !(govmomiError{lerr}).Matches(reVslmSyncFaultFatal) {
log.Error().Print(fmt.Sprintf("CreateSnapshot failed with VslmSyncFault. Will retry: %s", (govmomiError{lerr}).Format()), field.M{"VolumeID": volume.ID})
return false, nil
}
return false, errors.Wrap(lerr, "CreateSnapshot failed with VslmSyncFault. A snapshot may have been created by this failed operation")
case *types.NotFound:
log.Error().WithError(lerr).Print("CreateSnapshot failed with NotFound error. Will retry")
log.Error().WithError(lerr).Print("CreateSnapshot failed with NotFound error. Will retry", field.M{"VolumeID": volume.ID})
return false, nil
}
}
Expand All @@ -275,15 +285,16 @@ func (p *FcdProvider) SnapshotCreate(ctx context.Context, volume blockstorage.Vo
return true, nil
})
if err != nil {
return nil, errors.Wrap(err, "Failed to create snapshot")
log.Error().WithError(err).Print(fmt.Sprintf("Failed to create snapshot for FCD %s: %s", volume.ID, govmomiError{err}.Format()))
return nil, errors.Wrap(err, fmt.Sprintf("Failed to create snapshot for FCD %s", volume.ID))
}
id, ok := res.(types.ID)
if !ok {
return nil, errors.New("Unexpected type")
return nil, errors.New(fmt.Sprintf("Unexpected type returned for FCD %s", volume.ID))
}
snap, err := p.SnapshotGet(ctx, SnapshotFullID(volume.ID, id.Id))
if err != nil {
return nil, err
return nil, errors.Wrap(err, fmt.Sprintf("Failed to get snapshot %s:%s", volume.ID, snap.ID))
}
log.Debug().Print("SnapshotCreate complete", field.M{"VolumeID": volume.ID, "SnapshotID": snap.ID})
// We don't get size information from `SnapshotGet` - so set this to the volume size for now
Expand All @@ -293,7 +304,7 @@ func (p *FcdProvider) SnapshotCreate(ctx context.Context, volume blockstorage.Vo
snap.Volume = &volume

if err = p.SetTags(ctx, snap, tags); err != nil {
return nil, errors.Wrap(err, "Failed to set tags")
return nil, errors.Wrap(err, fmt.Sprintf("Failed to set tags for snapshot %s:%s", volume.ID, snap.ID))
}

return snap, nil
Expand Down Expand Up @@ -651,3 +662,91 @@ type tagManager interface {
GetTagsForCategory(ctx context.Context, id string) ([]vapitags.Tag, error)
DeleteTag(ctx context.Context, tag *vapitags.Tag) error
}

// Helper to parse an error code returned by the govmomi repo.
type govmomiError struct {
err error
}

func (ge govmomiError) Format() string {
msgs := ge.ExtractMessages()
switch len(msgs) {
case 0:
return ""
case 1:
return msgs[0]
}
return fmt.Sprintf("[%s]", strings.Join(msgs, "; "))
}

func (ge govmomiError) ExtractMessages() []string {
err := ge.err

if err == nil {
return nil
}

msgs := []string{}
if reason := err.Error(); reason != "" {
msgs = append(msgs, reason)
}

// unwrap to a type handled
foundHandledErrorType := false
for err != nil && !foundHandledErrorType {
switch err.(type) {
case govmomitask.Error:
foundHandledErrorType = true
default:
if soap.IsSoapFault(err) {
foundHandledErrorType = true
} else if soap.IsVimFault(err) {
foundHandledErrorType = true
} else {
err = errors.Unwrap(err)
}
}
}

if err != nil {
var faultMsgs []types.LocalizableMessage
switch e := err.(type) {
case govmomitask.Error:
if e.Description != nil {
msgs = append(msgs, e.Description.Message)
}
faultMsgs = e.LocalizedMethodFault.Fault.GetMethodFault().FaultMessage
default:
if soap.IsSoapFault(err) {
detail := soap.ToSoapFault(err).Detail.Fault
if f, ok := detail.(types.BaseMethodFault); ok {
faultMsgs = f.GetMethodFault().FaultMessage
}
} else if soap.IsVimFault(err) {
f := soap.ToVimFault(err)
faultMsgs = f.GetMethodFault().FaultMessage
}
}

for _, m := range faultMsgs {
if m.Message != "" && !strings.HasPrefix(m.Message, "[context]") {
msgs = append(msgs, fmt.Sprintf("%s (%s)", m.Message, m.Key))
}
for _, a := range m.Arg {
msgs = append(msgs, fmt.Sprintf("%s", a.Value))
}
}
}

return msgs
}

func (ge govmomiError) Matches(pat *regexp.Regexp) bool {
for _, m := range ge.ExtractMessages() {
if pat.MatchString(m) {
return true
}
}

return false
}
129 changes: 127 additions & 2 deletions pkg/blockstorage/vmware/vmware_test.go
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@
package vmware

import (
"bytes"
"context"
"errors"
"fmt"
"os"
"sort"
"testing"
"time"

"github.com/kanisterio/kanister/pkg/blockstorage"
"github.com/pkg/errors"
govmomitask "github.com/vmware/govmomi/task"
vapitags "github.com/vmware/govmomi/vapi/tags"
"github.com/vmware/govmomi/vim25/soap"
"github.com/vmware/govmomi/vim25/types"
"github.com/vmware/govmomi/vim25/xml"
vslmtypes "github.com/vmware/govmomi/vslm/types"
. "gopkg.in/check.v1"

"github.com/kanisterio/kanister/pkg/blockstorage"
)

func Test(t *testing.T) { TestingT(t) }
Expand Down Expand Up @@ -392,6 +399,124 @@ func (s *VMWareSuite) TestGetSnapshotTags(c *C) {
}
}

// An XML trace from `govc disk.snapshot.ls` with the VslmSyncFault
var (
vslmSyncFaultReason = "Change tracking invalid or disk in use: api = DiskLib_BlockTrackGetEpoch, path->CValue() = /vmfs/volumes/vsan:52731cd109496ced-173f8e8aec7c6828/dc6d0c61-ec84-381f-2fa3-000c29e75b7f/4e1e7c4619a34919ae1f28fbb53fcd70-000008.vmdk"

vslmSyncFaultReasonEsc = "Change tracking invalid or disk in use: api = DiskLib_BlockTrackGetEpoch, path-&gt;CValue() = /vmfs/volumes/vsan:52731cd109496ced-173f8e8aec7c6828/dc6d0c61-ec84-381f-2fa3-000c29e75b7f/4e1e7c4619a34919ae1f28fbb53fcd70-000008.vmdk"

vslmSyncFaultString = "A general system error occurred: " + vslmSyncFaultReason
vslmSyncFaultStringEsc = "A general system error occurred: " + vslmSyncFaultReasonEsc

vslmSyncFaultXML = `<Fault xmlns="http://schemas.xmlsoap.org/soap/envelope/">
<faultcode>ServerFaultCode</faultcode>
<faultstring>` + vslmSyncFaultStringEsc + `</faultstring>
<detail>
<Fault xmlns:XMLSchema-instance="http://www.w3.org/2001/XMLSchema-instance" XMLSchema-instance:type="SystemError">
<reason>` + vslmSyncFaultReasonEsc + `</reason>
</Fault>
</detail>
</Fault>`

vslmSyncFaultXMLEnv = `<?xml version="1.0" encoding="UTF-8"?>
<soapenv:Envelope xmlns:soapenc="http://schemas.xmlsoap.org/soap/encoding/"
xmlns:soapenv="http://schemas.xmlsoap.org/soap/envelope/"
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<soapenv:Body>` + vslmSyncFaultXML + `</soapenv:Body>
</soapenv:Envelope>`
)

func (s *VMWareSuite) TestFormatGovmomiError(c *C) {
// basic soap fault
fault := &soap.Fault{
Code: "soap-fault",
String: "fault string",
}
soapFaultErr := soap.WrapSoapFault(fault)
c.Assert(govmomiError{soapFaultErr}.Format(), Equals, "soap-fault: fault string")
c.Assert(govmomiError{errors.Wrap(soapFaultErr, "outer wrapper")}.Format(), Equals, "outer wrapper: soap-fault: fault string")

// Experiment with a real fault XML to figure out how to decode an error.
// (adapted from govmomi/vim25/methods/fault_test.go)
type TestBody struct {
Fault *soap.Fault `xml:"http://schemas.xmlsoap.org/soap/envelope/ Fault,omitempty"`
}
body := TestBody{}
env := soap.Envelope{Body: &body}
dec := xml.NewDecoder(bytes.NewReader([]byte(vslmSyncFaultXMLEnv)))
dec.TypeFunc = types.TypeFunc()
err := dec.Decode(&env)
c.Assert(err, IsNil)
c.Assert(body.Fault, NotNil)

err = soap.WrapSoapFault(body.Fault)
c.Assert(soap.IsSoapFault(err), Equals, true)
c.Assert(err.Error(), Equals, "ServerFaultCode: "+vslmSyncFaultString) // details present

vimFault := &types.VimFault{
MethodFault: types.MethodFault{
FaultCause: &types.LocalizedMethodFault{
LocalizedMessage: err.Error(),
},
},
}
err = soap.WrapVimFault(vimFault)
c.Assert(soap.IsVimFault(err), Equals, true)
c.Assert(err.Error(), Equals, "VimFault") // lost the details

// A vslmFault fault with details such as that returned by gom.SnapshotCreate when
// a volume CTK file is moved. (Note: govc succeeds in this case but list will fail)
vslmFaultValue := "(vmodl.fault.SystemError) {\n faultCause = null,\n faultMessage = null,\n reason = " + vslmSyncFaultReason + "}"
vslmFault := &vslmtypes.VslmSyncFault{
VslmFault: vslmtypes.VslmFault{
MethodFault: types.MethodFault{
FaultMessage: []types.LocalizableMessage{
{
Key: "com.vmware.pbm.pbmFault.locale",
Arg: []types.KeyAnyValue{
{
Key: "summary",
Value: vslmFaultValue,
},
},
},
},
},
},
Id: &types.ID{},
}
c.Assert(vslmFault.GetMethodFault(), NotNil)
c.Assert(vslmFault.GetMethodFault().FaultMessage, DeepEquals, vslmFault.FaultMessage)

err = soap.WrapVimFault(vslmFault)
c.Assert(err.Error(), Equals, "VslmSyncFault")
c.Assert(govmomiError{err}.Format(), Equals, "["+err.Error()+"; "+vslmFaultValue+"]")
c.Assert(govmomiError{errors.Wrap(err, "outer wrapper")}.Format(), Equals, "[outer wrapper: "+err.Error()+"; "+vslmFaultValue+"]")

c.Assert(govmomiError{err}.Matches(reVslmSyncFaultFatal), Equals, true)

// task errors
te := govmomitask.Error{
LocalizedMethodFault: &types.LocalizedMethodFault{
Fault: vslmFault,
},
Description: &types.LocalizableMessage{
Message: "description message",
},
}
c.Assert(err.Error(), Equals, "VslmSyncFault")
c.Assert(govmomiError{te}.Format(), Equals, "[description message; "+vslmFaultValue+"]")
c.Assert(govmomiError{errors.Wrap(te, "outer wrapper")}.Format(), Equals, "[outer wrapper: ; description message; "+vslmFaultValue+"]")

// normal error
testError := errors.New("test-error")
c.Assert(govmomiError{testError}.Format(), Equals, testError.Error())

// nil
c.Assert(govmomiError{nil}.Format(), Equals, "")
}

type fakeTagManager struct {
retGetTagsForCategory []vapitags.Tag
errGetTagsForCategory error
Expand Down

0 comments on commit 26959e7

Please sign in to comment.