Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better recovery #3709

Merged
merged 60 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
60 commits
Select commit Hold shift + click to select a range
64db673
Flatten the structure of machine updates
billyb2 Jul 5, 2024
6db4e43
Add recovery to machine updates
billyb2 Jul 5, 2024
5aa0cef
Allow destroying machines
billyb2 Jul 8, 2024
e333394
Update current state on every rollback attempt
billyb2 Jul 10, 2024
ef9acbe
remove debug code
billyb2 Jul 10, 2024
29c9a2b
Don't recursively try to rollback
billyb2 Jul 10, 2024
81c5bad
Disable deleting machines for now
billyb2 Jul 10, 2024
cb96bd1
fix up the code to create and destroy machines
billyb2 Jul 10, 2024
1934ec3
bump wait timeouts to 60 seconds
billyb2 Jul 10, 2024
af3beea
Check that configs are not equal before attempting an update
billyb2 Jul 10, 2024
e649eff
Allow CTRL+C'ing during recovery + return more info to the end user
billyb2 Jul 10, 2024
2df9f45
Small changes to focus on pushing forward deploys
billyb2 Jul 10, 2024
33d1571
Fix waiting for multiple states
billyb2 Jul 11, 2024
d0855b0
Run machine based tests
billyb2 Jul 11, 2024
870bf35
Return on some unrecoverable errors + cache health check results
billyb2 Jul 11, 2024
393d9ff
quick lint
billyb2 Jul 11, 2024
aedf269
try to replace machine if we can't deploy to a host
billyb2 Jul 11, 2024
b9605f0
more linting
billyb2 Jul 11, 2024
eda2604
Allow skipping health checks + immediate deploys
billyb2 Jul 11, 2024
2826af9
Smoke checks
billyb2 Jul 11, 2024
2972b27
Respect timeouts + concurrency settings
billyb2 Jul 11, 2024
76f7f3d
smoke checks
billyb2 Jul 11, 2024
138e2d9
add support for canary machines
billyb2 Jul 11, 2024
6b526af
Remove updateUsingImmediateStrategy
billyb2 Jul 11, 2024
3148eda
Add a flag for retrying deploys
billyb2 Jul 11, 2024
ce1b30f
return waitForSmokeChecksTopass
billyb2 Jul 12, 2024
a484615
Respect the detach flag better
billyb2 Jul 12, 2024
14167c9
move lease collection out of restartMachines
billyb2 Jul 12, 2024
ba45439
Correctly update machines
billyb2 Jul 14, 2024
66dd69b
respect skipLaunch
billyb2 Jul 14, 2024
29c1dcd
keep the 'auto replace machine' code
billyb2 Jul 14, 2024
9dfd3e2
replace builder on could not find app
billyb2 Jul 14, 2024
038a367
return after we destroy a machine
billyb2 Jul 15, 2024
a526558
some code cleanup
billyb2 Jul 15, 2024
1e819d5
correctly catch unrecoverable errors
billyb2 Jul 15, 2024
e8e91d1
make sure to use sl in updateMachineByReplace
billyb2 Jul 15, 2024
008780a
remove unnecessary error check
billyb2 Jul 15, 2024
175aa53
fix update by replace
billyb2 Jul 15, 2024
f4e03bc
update some comments to be more clear
billyb2 Jul 15, 2024
9f79bea
use flaps client in md when acquiring lease
billyb2 Jul 15, 2024
9009e4d
Add tests for some of plan.go
billyb2 Jul 15, 2024
1482fd6
Add tracing to plan.go
billyb2 Jul 15, 2024
caade93
lint
billyb2 Jul 15, 2024
4abfdca
remove some contradictory tracing
billyb2 Jul 15, 2024
02c11dd
a few more clarifying comments
billyb2 Jul 16, 2024
f207dac
ensure that we use machinedeployment flapsClient
billyb2 Jul 16, 2024
299fcac
correctly ensure we use flapsClient from md
billyb2 Jul 16, 2024
941b1f4
warn about incorrect listen address
billyb2 Jul 16, 2024
0f4bcb9
Fix TestUpdateMachines
billyb2 Jul 16, 2024
ee7829f
make sure to update by process group
billyb2 Jul 16, 2024
b27654d
make sure to print to io.ErrOut
billyb2 Jul 16, 2024
c0266ef
Make sure to use max-unavailable
billyb2 Jul 16, 2024
da18ca2
Acquire leases separately from updating machines
billyb2 Jul 16, 2024
bf82b9d
Make sure to concurrently update process groups
billyb2 Jul 16, 2024
95a3c94
Respect lease timeouts
billyb2 Jul 16, 2024
42c1964
remove code to start machine
billyb2 Jul 16, 2024
4998e71
grab machine leases from the previous app state
billyb2 Jul 17, 2024
b251e56
avoid duplicating error printing
billyb2 Jul 17, 2024
a717b49
set default for deploy-retries to 0
billyb2 Jul 17, 2024
f5b3330
get deploy-retries from launchdarkly by default
billyb2 Jul 22, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion internal/appconfig/remote.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ func getAppV2ConfigFromMachines(ctx context.Context, appName string) (*Config, e
if err != nil {
return nil, fmt.Errorf("error listing active machines for %s app: %w", appName, err)
}
machineSet := machine.NewMachineSet(flapsClient, io, activeMachines)
machineSet := machine.NewMachineSet(flapsClient, io, activeMachines, true)
appConfig, warnings, err := FromAppAndMachineSet(ctx, appName, machineSet)
if err != nil {
return nil, fmt.Errorf("failed to grab app config from existing machines, error: %w", err)
Expand Down
40 changes: 36 additions & 4 deletions internal/command/deploy/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,11 @@ var CommonFlags = flag.Set{
Shorthand: "s",
Description: "Signal to stop the machine with for bluegreen strategy (default: SIGINT)",
},
flag.String{
Name: "deploy-retries",
Description: "Number of times to retry a deployment if it fails",
Default: "auto",
},
}

type Command struct {
Expand Down Expand Up @@ -316,10 +321,14 @@ func DeployWithConfig(ctx context.Context, appConfig *appconfig.Config, userID i

// Fetch an image ref or build from source to get the final image reference to deploy
img, err := determineImage(ctx, appConfig, usingWireguard, recreateBuilder)
if err != nil && usingWireguard && httpFailover {
span.SetAttributes(attribute.String("builder.failover_error", err.Error()))
span.AddEvent("using http failover")
img, err = determineImage(ctx, appConfig, false, recreateBuilder)
if err != nil {
noBuilder := strings.Contains(err.Error(), "Could not find App")
recreateBuilder = recreateBuilder || noBuilder
if noBuilder || (usingWireguard && httpFailover) {
span.SetAttributes(attribute.String("builder.failover_error", err.Error()))
span.AddEvent("using http failover")
img, err = determineImage(ctx, appConfig, false, recreateBuilder)
}
}

if err != nil {
Expand Down Expand Up @@ -477,6 +486,28 @@ func deployToMachines(

status.FlyctlVersion = buildinfo.Info().Version.String()

retriesFlag := flag.GetString(ctx, "deploy-retries")
deployRetries := 0

switch retriesFlag {
case "auto":
ldClient := launchdarkly.ClientFromContext(ctx)
retries := ldClient.GetFeatureFlagValue("deploy-retries", 0.0).(float64)
deployRetries = int(retries)
billyb2 marked this conversation as resolved.
Show resolved Hide resolved

default:
var invalidRetriesErr error = fmt.Errorf("--deploy-retries must be set to a positive integer, 0, or 'auto'")
retries, err := strconv.Atoi(retriesFlag)
if err != nil {
return invalidRetriesErr
}
if retries < 0 {
return invalidRetriesErr
}

deployRetries = retries
}

md, err := NewMachineDeployment(ctx, MachineDeploymentArgs{
AppCompact: app,
DeploymentImage: img.Tag,
Expand Down Expand Up @@ -504,6 +535,7 @@ func deployToMachines(
MaxConcurrent: maxConcurrent,
VolumeInitialSize: flag.GetInt(ctx, "volume-initial-size"),
ProcessGroups: processGroups,
DeployRetries: deployRetries,
})
if err != nil {
sentry.CaptureExceptionWithAppInfo(ctx, err, "deploy", app)
Expand Down
69 changes: 43 additions & 26 deletions internal/command/deploy/machinebasedtest.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package deploy
import (
"context"
"fmt"
"strconv"
"time"

"github.com/cenkalti/backoff"
Expand All @@ -21,7 +20,26 @@ type createdTestMachine struct {
err error
}

func (md *machineDeployment) runTestMachines(ctx context.Context, machineToTest *fly.Machine) (err error) {
// machineTestErr is the error value produced when a test command machine
// exits with a non-zero status. It records which machine failed, the exit
// code it reported, and the log text to show alongside the failure.
type machineTestErr struct {
	testMachineLogs string
	exitCode        int
	machineID       string
}

// Error implements the error interface with a single-line summary naming the
// machine and its exit code.
func (e machineTestErr) Error() string {
	const summary = "Error test command machine %s exited with non-zero status of %d"
	return fmt.Sprintf(summary, e.machineID, e.exitCode)
}

// Description returns a multi-line report: a failure line, a hint on how to
// fetch the machine's logs, and then testMachineLogs appended verbatim.
func (e machineTestErr) Description() string {
	failure := fmt.Sprintf("Error: test command failed running on machine %s with exit code %d.\n", e.machineID, e.exitCode)
	logHint := fmt.Sprintf("Check its logs: here's the last 100 lines below, or run 'fly logs -i %s':\n\n", e.machineID)
	return failure + logHint + e.testMachineLogs
}

func (md *machineDeployment) runTestMachines(ctx context.Context, machineToTest *fly.Machine, sl statuslogger.StatusLine) (err error) {
ctx, span := tracing.GetTracer().Start(ctx, "run_test_machine")
var (
flaps = md.flapsClient
Expand All @@ -34,9 +52,13 @@ func (md *machineDeployment) runTestMachines(ctx context.Context, machineToTest
span.End()
}()

if sl == nil {
return fmt.Errorf("bug: status logger is nil")
}

processGroup := machineToTest.ProcessGroup()
machineChecks := lo.FlatMap(md.appConfig.AllServices(), func(svc appconfig.Service, _ int) []*appconfig.ServiceMachineCheck {
matchesProcessGroup := lo.Contains(svc.Processes, processGroup)
matchesProcessGroup := lo.Contains(svc.Processes, processGroup) || len(svc.Processes) == 0
if matchesProcessGroup {
return svc.MachineChecks
} else {
Expand All @@ -55,11 +77,11 @@ func (md *machineDeployment) runTestMachines(ctx context.Context, machineToTest
var err error
defer func() {
if err != nil {
statuslogger.Failed(ctx, err)
sl.Failed(err)
}
}()

mach, err = md.createTestMachine(ctx, machineCheck, machineToTest)
mach, err = md.createTestMachine(ctx, machineCheck, machineToTest, sl)
return createdTestMachine{mach, err}
})

Expand All @@ -74,13 +96,13 @@ func (md *machineDeployment) runTestMachines(ctx context.Context, machineToTest
machineSet := machine.NewMachineSet(flaps, io, lo.FilterMap(machines, func(m createdTestMachine, _ int) (*fly.Machine, bool) {
if m.err != nil {
tracing.RecordError(span, m.err, "failed to create test machine")
statuslogger.LogStatus(ctx, statuslogger.StatusFailure, fmt.Sprintf("failed to create test machine: %s", m.err))
sl.LogStatus(statuslogger.StatusFailure, fmt.Sprintf("failed to create test machine: %s", m.err))
}
return m.mach, m.err == nil
}))
}), false)

// FIXME: consolidate this wait stuff with deploy waits? Especially once we improve the output
err = md.waitForTestMachinesToFinish(ctx, machineSet)
err = md.waitForTestMachinesToFinish(ctx, machineSet, sl)
if err != nil {
tracing.RecordError(span, err, "failed to wait for test cmd machine")
return err
Expand All @@ -89,7 +111,7 @@ func (md *machineDeployment) runTestMachines(ctx context.Context, machineToTest
for _, testMachine := range machineSet.GetMachines() {
md.waitForLogs(ctx, testMachine.Machine(), 10*time.Second)

statuslogger.Logf(ctx, "Checking test command machine %s", md.colorize.Bold(testMachine.Machine().ID))
sl.Logf("Checking test command machine %s", md.colorize.Bold(testMachine.Machine().ID))
lastExitEvent, err := testMachine.WaitForEventType(ctx, "exit", md.releaseCmdTimeout, true)
if err != nil {
return fmt.Errorf("error finding the test command machine %s exit event: %w", testMachine.Machine().ID, err)
Expand All @@ -100,27 +122,22 @@ func (md *machineDeployment) runTestMachines(ctx context.Context, machineToTest
}

if exitCode != 0 {
statuslogger.LogStatus(ctx, statuslogger.StatusFailure, "test command failed")
sl.LogStatus(statuslogger.StatusFailure, "test command failed")
// Preemptive cleanup of the logger so that the logs have a clean place to write to

fmt.Fprintf(md.io.ErrOut, "Error: test command failed running on machine %s with exit code %s.\n",
md.colorize.Bold(testMachine.Machine().ID), md.colorize.Red(strconv.Itoa(exitCode)))
fmt.Fprintf(md.io.ErrOut, "Check its logs: here's the last 100 lines below, or run 'fly logs -i %s':\n",
testMachine.Machine().ID)
testLogs, _, err := md.apiClient.GetAppLogs(ctx, md.app.Name, "", md.appConfig.PrimaryRegion, testMachine.Machine().ID)
if fly.IsNotAuthenticatedError(err) {
fmt.Fprintf(md.io.ErrOut, "Warn: not authorized to retrieve app logs (this can happen when using deploy tokens), so we can't show you what failed. Use `fly logs -i %s` or open the monitoring dashboard to see them: https://fly.io/apps/%s/monitoring?region=&instance=%s\n", testMachine.Machine().ID, md.appConfig.AppName, testMachine.Machine().ID)
} else {
if err != nil {
return fmt.Errorf("Error getting test command logs: %w", err)
}
if err == nil {
var logs string
for _, l := range testLogs {
fmt.Fprintf(md.io.ErrOut, " %s\n", l.Message)
logs += l.Message + "\n"
}

return machineTestErr{machineID: testMachine.Machine().ID, exitCode: exitCode, testMachineLogs: logs}
}

return fmt.Errorf("Error test command machine %s exited with non-zero status of %d", testMachine.Machine().ID, exitCode)
}
statuslogger.LogfStatus(ctx,
sl.LogfStatus(
statuslogger.StatusSuccess,
"Test machine %s completed successfully",
md.colorize.Bold(testMachine.Machine().ID),
Expand Down Expand Up @@ -151,7 +168,7 @@ func (md *machineDeployment) waitForLogs(ctx context.Context, mach *fly.Machine,
}, backoff.WithContext(b, ctx))
}

func (md *machineDeployment) createTestMachine(ctx context.Context, svc *appconfig.ServiceMachineCheck, machineToTest *fly.Machine) (*fly.Machine, error) {
func (md *machineDeployment) createTestMachine(ctx context.Context, svc *appconfig.ServiceMachineCheck, machineToTest *fly.Machine, sl statuslogger.StatusLine) (*fly.Machine, error) {
ctx, span := tracing.GetTracer().Start(ctx, "create_test_machine")
defer span.End()

Expand All @@ -165,7 +182,7 @@ func (md *machineDeployment) createTestMachine(ctx context.Context, svc *appconf
return nil, fmt.Errorf("error creating a test machine: %w", err)
}

statuslogger.Logf(ctx, "Created test machine %s", md.colorize.Bold(testMachine.ID))
sl.Logf("Created test machine %s", md.colorize.Bold(testMachine.ID))
return testMachine, nil
}

Expand Down Expand Up @@ -195,7 +212,7 @@ func (md *machineDeployment) launchInputForTestMachine(svc *appconfig.ServiceMac
}, nil
}

func (md *machineDeployment) waitForTestMachinesToFinish(ctx context.Context, testMachines machine.MachineSet) error {
func (md *machineDeployment) waitForTestMachinesToFinish(ctx context.Context, testMachines machine.MachineSet, sl statuslogger.StatusLine) error {
io := iostreams.FromContext(ctx)
colorize := io.ColorScheme()

Expand Down Expand Up @@ -224,7 +241,7 @@ func (md *machineDeployment) waitForTestMachinesToFinish(ctx context.Context, te
return m, err == nil
})
for _, mach := range machs {
statuslogger.Logf(ctx, "Test Machine %s: %s", colorize.Bold(mach.ID), mach.State)
sl.Logf("Test Machine %s: %s", colorize.Bold(mach.ID), mach.State)
}

return nil
Expand Down
7 changes: 5 additions & 2 deletions internal/command/deploy/machines.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ type MachineDeploymentArgs struct {
VolumeInitialSize int
RestartPolicy *fly.MachineRestartPolicy
RestartMaxRetries int
DeployRetries int
}

type machineDeployment struct {
Expand Down Expand Up @@ -111,6 +112,7 @@ type machineDeployment struct {
processGroups map[string]bool
maxConcurrent int
volumeInitialSize int
deployRetries int
}

func NewMachineDeployment(ctx context.Context, args MachineDeploymentArgs) (_ MachineDeployment, err error) {
Expand Down Expand Up @@ -235,6 +237,7 @@ func NewMachineDeployment(ctx context.Context, args MachineDeploymentArgs) (_ Ma
maxConcurrent: maxConcurrent,
volumeInitialSize: args.VolumeInitialSize,
processGroups: args.ProcessGroups,
deployRetries: args.DeployRetries,
}
if err := md.setStrategy(); err != nil {
tracing.RecordError(span, err, "failed to set strategy")
Expand Down Expand Up @@ -375,12 +378,12 @@ func (md *machineDeployment) setMachinesForDeployment(ctx context.Context) error
}
}

md.machineSet = machine.NewMachineSet(md.flapsClient, md.io, machines)
md.machineSet = machine.NewMachineSet(md.flapsClient, md.io, machines, true)
var releaseCmdSet []*fly.Machine
if releaseCmdMachine != nil {
releaseCmdSet = []*fly.Machine{releaseCmdMachine}
}
md.releaseCommandMachine = machine.NewMachineSet(md.flapsClient, md.io, releaseCmdSet)
md.releaseCommandMachine = machine.NewMachineSet(md.flapsClient, md.io, releaseCmdSet, true)
return nil
}

Expand Down
Loading
Loading