Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

flaps: retry on certain failure modes, such as 504s #3601

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
2 changes: 1 addition & 1 deletion internal/command/machine/select.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ func buildContextFromAppNameOrMachineID(ctx context.Context, machineIDs ...strin
var (
appName = appconfig.NameFromContext(ctx)

flapsClient *flaps.Client
flapsClient flapsutil.FlapsClient
err error
)

Expand Down
3 changes: 1 addition & 2 deletions internal/command/volumes/volumes.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import (
"github.com/spf13/cobra"

fly "github.com/superfly/fly-go"
"github.com/superfly/fly-go/flaps"
"github.com/superfly/flyctl/iostreams"

"github.com/superfly/flyctl/internal/command"
Expand Down Expand Up @@ -131,7 +130,7 @@ func renderTable(ctx context.Context, volumes []fly.Volume, app *fly.AppBasic, o
return nil
}

func selectVolume(ctx context.Context, flapsClient *flaps.Client, app *fly.AppBasic) (*fly.Volume, error) {
func selectVolume(ctx context.Context, flapsClient flapsutil.FlapsClient, app *fly.AppBasic) (*fly.Volume, error) {
if !iostreams.FromContext(ctx).IsInteractive() {
return nil, fmt.Errorf("volume ID must be specified when not running interactively")
}
Expand Down
8 changes: 6 additions & 2 deletions internal/flapsutil/flapsutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ import (
"github.com/superfly/flyctl/internal/metrics"
)

func NewClientWithOptions(ctx context.Context, opts flaps.NewClientOpts) (*flaps.Client, error) {
func NewClientWithOptions(ctx context.Context, opts flaps.NewClientOpts) (FlapsClient, error) {
// Connect over wireguard depending on FLAPS URL.
if strings.TrimSpace(strings.ToLower(os.Getenv("FLY_FLAPS_BASE_URL"))) == "peer" {
orgSlug, err := resolveOrgSlugForApp(ctx, opts.AppCompact, opts.AppName)
Expand Down Expand Up @@ -57,7 +57,11 @@ func NewClientWithOptions(ctx context.Context, opts flaps.NewClientOpts) (*flaps
opts.Logger = v
}

return flaps.NewWithOptions(ctx, opts)
client, err := flaps.NewWithOptions(ctx, opts)
if err != nil {
return nil, fmt.Errorf("failed to create flaps client: %w", err)
}
return wrapWithRetry(client), nil
}

func resolveOrgSlugForApp(ctx context.Context, app *fly.AppCompact, appName string) (string, error) {
Expand Down
62 changes: 62 additions & 0 deletions internal/flapsutil/retry.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
package flapsutil

import (
"errors"
"net/http"
"time"

"github.com/cenkalti/backoff/v4"
"github.com/superfly/fly-go/flaps"
)

func isRetryable(err error) bool {

var flapsErr *flaps.FlapsError
if errors.As(err, &flapsErr) {
switch flapsErr.ResponseStatusCode {
case http.StatusServiceUnavailable:
return true
case http.StatusGatewayTimeout:
return true
}
}

return false
}

// Retry retries a machine operation a few times before giving up
// This is useful for operations like that can fail only to succeed on another try, like machine creation
func Retry(f func() error) error {

var machineRetryBackoff = backoff.ExponentialBackOff{
InitialInterval: 500 * time.Millisecond,
RandomizationFactor: 0,
Multiplier: 2,
MaxInterval: 5 * time.Second,
MaxElapsedTime: 0,
Clock: backoff.SystemClock,
}

return backoff.Retry(func() error {
err := f()
if err == nil {
return nil
}
if isRetryable(err) {
return err
}
return backoff.Permanent(err)
}, &machineRetryBackoff)
}

// RetryRet retries a machine operation a few times before giving up
// This is useful for operations like that can fail only to succeed on another try, like machine creation
func RetryRet[T any](f func() (T, error)) (T, error) {
var res T
err := Retry(func() error {
var err error
res, err = f()
return err
})
return res, err
}
185 changes: 185 additions & 0 deletions internal/flapsutil/retrying_client.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
package flapsutil

import (
"context"
"net/http"
"time"

"github.com/superfly/fly-go"
)

type retryClient struct {
inner FlapsClient
}

func wrapWithRetry(inner FlapsClient) FlapsClient {
return &retryClient{inner}
}

func (r *retryClient) AcquireLease(ctx context.Context, machineID string, ttl *int) (*fly.MachineLease, error) {
return r.inner.AcquireLease(ctx, machineID, ttl)
}
func (r *retryClient) Cordon(ctx context.Context, machineID string, nonce string) (err error) {
return r.inner.Cordon(ctx, machineID, nonce)
}
func (r *retryClient) CreateApp(ctx context.Context, name string, org string) (err error) {
return r.inner.CreateApp(ctx, name, org)
}
func (r *retryClient) CreateVolume(ctx context.Context, req fly.CreateVolumeRequest) (*fly.Volume, error) {
// Not retryable - the CreateVolume endpoint can time out, but the volume may still be created
return r.inner.CreateVolume(ctx, req)
}
func (r *retryClient) CreateVolumeSnapshot(ctx context.Context, volumeId string) error {
// I don't feel comfortable retrying this, since CreateVolume is not retryable
return r.inner.CreateVolumeSnapshot(ctx, volumeId)
}
func (r *retryClient) DeleteMetadata(ctx context.Context, machineID, key string) error {
return Retry(func() error {
return r.inner.DeleteMetadata(ctx, machineID, key)
})
}
func (r *retryClient) DeleteVolume(ctx context.Context, volumeId string) (*fly.Volume, error) {
// I don't feel comfortable retrying this, since CreateVolume is not retryable
return r.inner.DeleteVolume(ctx, volumeId)
}
func (r *retryClient) Destroy(ctx context.Context, input fly.RemoveMachineInput, nonce string) (err error) {
// This should be safe to call more than once
return Retry(func() error {
return r.inner.Destroy(ctx, input, nonce)
})
}
func (r *retryClient) Exec(ctx context.Context, machineID string, in *fly.MachineExecRequest) (*fly.MachineExecResponse, error) {
// Not comfortable retrying this - the command may have side effects
return r.inner.Exec(ctx, machineID, in)
}
func (r *retryClient) ExtendVolume(ctx context.Context, volumeId string, size_gb int) (*fly.Volume, bool, error) {
// Not retrying volume endpoints
return r.inner.ExtendVolume(ctx, volumeId, size_gb)
}
func (r *retryClient) FindLease(ctx context.Context, machineID string) (*fly.MachineLease, error) {
return RetryRet(func() (*fly.MachineLease, error) {
return r.inner.FindLease(ctx, machineID)
})
}
func (r *retryClient) Get(ctx context.Context, machineID string) (*fly.Machine, error) {
return RetryRet(func() (*fly.Machine, error) {
return r.inner.Get(ctx, machineID)
})
}
func (r *retryClient) GetAllVolumes(ctx context.Context) ([]fly.Volume, error) {
return RetryRet(func() ([]fly.Volume, error) {
return r.inner.GetAllVolumes(ctx)
})
}
func (r *retryClient) GetMany(ctx context.Context, machineIDs []string) ([]*fly.Machine, error) {
return RetryRet(func() ([]*fly.Machine, error) {
return r.inner.GetMany(ctx, machineIDs)
})
}
func (r *retryClient) GetMetadata(ctx context.Context, machineID string) (map[string]string, error) {
return RetryRet(func() (map[string]string, error) {
return r.inner.GetMetadata(ctx, machineID)
})
}
func (r *retryClient) GetProcesses(ctx context.Context, machineID string) (fly.MachinePsResponse, error) {
return RetryRet(func() (fly.MachinePsResponse, error) {
return r.inner.GetProcesses(ctx, machineID)
})
}
func (r *retryClient) GetVolume(ctx context.Context, volumeId string) (*fly.Volume, error) {
return RetryRet(func() (*fly.Volume, error) {
return r.inner.GetVolume(ctx, volumeId)
})
}
func (r *retryClient) GetVolumeSnapshots(ctx context.Context, volumeId string) ([]fly.VolumeSnapshot, error) {
return RetryRet(func() ([]fly.VolumeSnapshot, error) {
return r.inner.GetVolumeSnapshots(ctx, volumeId)
})
}
func (r *retryClient) GetVolumes(ctx context.Context) ([]fly.Volume, error) {
return RetryRet(func() ([]fly.Volume, error) {
return r.inner.GetVolumes(ctx)
})
}
func (r *retryClient) Kill(ctx context.Context, machineID string) (err error) {
return Retry(func() error {
return r.inner.Kill(ctx, machineID)
})
}
func (r *retryClient) Launch(ctx context.Context, builder fly.LaunchMachineInput) (out *fly.Machine, err error) {
// TODO: Figure out how to _at least_ retry *this* operation safely
return r.inner.Launch(ctx, builder)
}
func (r *retryClient) List(ctx context.Context, state string) ([]*fly.Machine, error) {
return RetryRet(func() ([]*fly.Machine, error) {
return r.inner.List(ctx, state)
})
}
func (r *retryClient) ListActive(ctx context.Context) ([]*fly.Machine, error) {
return RetryRet(func() ([]*fly.Machine, error) {
return r.inner.ListActive(ctx)
})
}
func (r *retryClient) ListFlyAppsMachines(ctx context.Context) ([]*fly.Machine, *fly.Machine, error) {
// RetryRet only works for functions that return one value and an error
var (
retMachines []*fly.Machine
retRelCmdMachine *fly.Machine
)
err := Retry(func() error {
var err error
retMachines, retRelCmdMachine, err = r.inner.ListFlyAppsMachines(ctx)
return err
})
return retMachines, retRelCmdMachine, err
}
func (r *retryClient) NewRequest(ctx context.Context, method, path string, in interface{}, headers map[string][]string) (*http.Request, error) {
return r.inner.NewRequest(ctx, method, path, in, headers)
}
func (r *retryClient) RefreshLease(ctx context.Context, machineID string, ttl *int, nonce string) (*fly.MachineLease, error) {
return r.inner.RefreshLease(ctx, machineID, ttl, nonce)
}
func (r *retryClient) ReleaseLease(ctx context.Context, machineID, nonce string) error {
return r.inner.ReleaseLease(ctx, machineID, nonce)
}
func (r *retryClient) Restart(ctx context.Context, in fly.RestartMachineInput, nonce string) (err error) {
return r.inner.Restart(ctx, in, nonce)
}
func (r *retryClient) SetMetadata(ctx context.Context, machineID, key, value string) error {
// This should be safe to call multiple times
return Retry(func() error {
return r.inner.SetMetadata(ctx, machineID, key, value)
})
}
func (r *retryClient) Start(ctx context.Context, machineID string, nonce string) (out *fly.MachineStartResponse, err error) {
// Not *strictly* idempotent
return r.inner.Start(ctx, machineID, nonce)
}
func (r *retryClient) Stop(ctx context.Context, in fly.StopMachineInput, nonce string) (err error) {
// Not *strictly* idempotent
return r.inner.Stop(ctx, in, nonce)
}
func (r *retryClient) Uncordon(ctx context.Context, machineID string, nonce string) (err error) {
// Not *strictly* idempotent
return Retry(func() error {
return r.inner.Uncordon(ctx, machineID, nonce)
})
}
func (r *retryClient) Update(ctx context.Context, builder fly.LaunchMachineInput, nonce string) (out *fly.Machine, err error) {
// This should be safe to retry
return RetryRet(func() (*fly.Machine, error) {
return r.inner.Update(ctx, builder, nonce)
})
}
func (r *retryClient) UpdateVolume(ctx context.Context, volumeId string, req fly.UpdateVolumeRequest) (*fly.Volume, error) {
// This should be safe to retry
return RetryRet(func() (*fly.Volume, error) {
return r.inner.UpdateVolume(ctx, volumeId, req)
})
}
func (r *retryClient) Wait(ctx context.Context, machine *fly.Machine, state string, timeout time.Duration) (err error) {
return r.inner.Wait(ctx, machine, state, timeout)
}
func (r *retryClient) WaitForApp(ctx context.Context, name string) error {
return r.inner.WaitForApp(ctx, name)
}
Loading