Skip to content

Commit

Permalink
fix!: do not uninstall helm chart after failed install or upgrade (de…
Browse files Browse the repository at this point in the history
…fenseunicorns#2456)

## Description
Do not uninstall helm chart after failed install or upgrade

## Related Issue

Fixes defenseunicorns#2455

## Checklist before merging

- [x] Test, docs, adr added or updated as needed
- [x] [Contributor Guide
Steps](https://github.com/defenseunicorns/zarf/blob/main/.github/CONTRIBUTING.md#developer-workflow)
followed

---------

Co-authored-by: Austin Abro <37223396+AustinAbro321@users.noreply.github.com>
  • Loading branch information
2 people authored and phillebaba committed May 7, 2024
1 parent 7e88fdf commit b1af44a
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 22 deletions.
38 changes: 38 additions & 0 deletions site/src/content/docs/ref/deploy.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,41 @@ $ zarf connect [service name]
You can also specify a package locally, or via oci such as `zarf package deploy oci://defenseunicorns/dos-games:1.0.0-$(uname -m) --key=https://zarf.dev/cosign.pub`

:::

## Installing, Upgrading, and Rolling Back with Helm

Zarf deploys resources in Kubernetes using [Helm's Go SDK](https://helm.sh/docs/topics/advanced/#go-sdk), and converts manifests into Helm charts for installation.

If no existing Helm releases match a given chart in the cluster, Zarf executes a `helm install`.

Should matching releases exist, a `helm upgrade` is performed.

### Handling CustomResourceDefinitions (CRDs)

- CRDs are _included_ during `helm install` to support Kubernetes Operator deployments
- CRDs are _excluded_ during `helm upgrade` due to [Helm's lack of support for upgrading CRDs](https://helm.sh/docs/chart_best_practices/custom_resource_definitions/#some-caveats-and-explanations)

### Waiting for Resource Readiness

By default, Zarf waits for all resources to deploy successfully during install, upgrade, and rollback operations.

You can override this behavior during install and upgrade by setting the `noWait: true` key under the `charts` and `manifests` fields.

### Timeout Settings

The default timeout for Helm operations in Zarf is 15 minutes.

Use the `--timeout` flag with `zarf init` and `zarf package deploy` to modify the timeout duration.

### Retry Policy

Zarf retries install and upgrade operations up to three times by default if an error occurs.

Use the `--retries` flag with `zarf init` and `zarf package deploy` to change the number of retry attempts.

### Rollback Process

If attempts to upgrade a chart fail, Zarf tries to roll the chart back to its last successful release. During this rollback process:

- Any resources created during the failed upgrade attempt are deleted (`helm rollback --cleanup-on-fail`)
- Resource updates are forced through delete and recreate if needed (`helm rollback --force`)
25 changes: 10 additions & 15 deletions src/internal/packager/helm/chart.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
}

if err != nil {
return fmt.Errorf("unable to complete the helm chart install/upgrade: %w", err)
return err
}

message.Debug(output.Info.Description)
Expand All @@ -92,7 +92,6 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {

err = helpers.Retry(tryHelm, h.retries, 5*time.Second, message.Warnf)
if err != nil {
// Try to rollback any deployed releases
releases, _ := histClient.Run(h.chart.ReleaseName)
previouslyDeployedVersion := 0

Expand All @@ -103,25 +102,21 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
}
}

// On total failure try to rollback (if there was a previously deployed version) or uninstall.
if previouslyDeployedVersion > 0 {
spinner.Updatef("Performing chart rollback")

err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion)
if err != nil {
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %w", h.retries, err)
}
removeMsg := "if you need to remove the failed chart, use `zarf package remove`"

return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts", h.retries)
// No prior releases means this was an initial install.
if previouslyDeployedVersion == 0 {
return nil, "", fmt.Errorf("unable to install chart after %d attempts: %s", h.retries, removeMsg)
}

spinner.Updatef("Performing chart uninstall")
_, err = h.uninstallChart(h.chart.ReleaseName)
// Attempt to rollback on a failed upgrade.
spinner.Updatef("Performing chart rollback")
err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion)
if err != nil {
return nil, "", fmt.Errorf("unable to install chart after %d attempts and unable to uninstall: %w", h.retries, err)
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %s", h.retries, removeMsg)
}

return nil, "", fmt.Errorf("unable to install chart after %d attempts", h.retries)
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts: %s", h.retries, removeMsg)
}

// return any collected connect strings for zarf connect.
Expand Down
2 changes: 1 addition & 1 deletion src/pkg/packager/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ func (p *Packager) deployComponent(component types.ZarfComponent, noImgChecksum

if hasCharts || hasManifests {
if charts, err = p.installChartAndManifests(componentPath, component); err != nil {
return charts, fmt.Errorf("unable to install helm chart(s): %w", err)
return charts, err
}
}

Expand Down
13 changes: 8 additions & 5 deletions src/test/e2e/25_helm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,16 +130,19 @@ func testHelmUninstallRollback(t *testing.T) {
// This package contains SBOMable things but was created with --skip-sbom
require.Contains(t, string(stdErr), "This package does NOT contain an SBOM.")

// Ensure that this does not leave behind a dos-games chart
// Ensure this leaves behind a dos-games chart.
// We do not want to uninstall charts that had failed installs/upgrades
// to prevent unintentional deletion and/or data loss in production environments.
// https://github.com/defenseunicorns/zarf/issues/2455
helmOut, err := exec.Command("helm", "list", "-n", "dos-games").Output()
require.NoError(t, err)
require.NotContains(t, string(helmOut), "zarf-f53a99d4a4dd9a3575bedf59cd42d48d751ae866")
require.Contains(t, string(helmOut), "zarf-f53a99d4a4dd9a3575bedf59cd42d48d751ae866")

// Deploy the good package.
stdOut, stdErr, err = e2e.Zarf("package", "deploy", goodPath, "--confirm")
require.NoError(t, err, stdOut, stdErr)

// Ensure that this does create a dos-games chart
// Ensure this upgrades/fixes the dos-games chart.
helmOut, err = exec.Command("helm", "list", "-n", "dos-games").Output()
require.NoError(t, err)
require.Contains(t, string(helmOut), "zarf-f53a99d4a4dd9a3575bedf59cd42d48d751ae866")
Expand All @@ -151,7 +154,7 @@ func testHelmUninstallRollback(t *testing.T) {
// Ensure that we rollback properly
helmOut, err = exec.Command("helm", "history", "-n", "dos-games", "zarf-f53a99d4a4dd9a3575bedf59cd42d48d751ae866", "--max", "1").Output()
require.NoError(t, err)
require.Contains(t, string(helmOut), "Rollback to 1")
require.Contains(t, string(helmOut), "Rollback to 4")

// Deploy the evil package (again to ensure we check full history)
stdOut, stdErr, err = e2e.Zarf("package", "deploy", evilPath, "--timeout", "10s", "--confirm")
Expand All @@ -160,7 +163,7 @@ func testHelmUninstallRollback(t *testing.T) {
// Ensure that we rollback properly
helmOut, err = exec.Command("helm", "history", "-n", "dos-games", "zarf-f53a99d4a4dd9a3575bedf59cd42d48d751ae866", "--max", "1").Output()
require.NoError(t, err)
require.Contains(t, string(helmOut), "Rollback to 5")
require.Contains(t, string(helmOut), "Rollback to 8")

// Remove the package.
stdOut, stdErr, err = e2e.Zarf("package", "remove", "dos-games", "--confirm")
Expand Down
5 changes: 4 additions & 1 deletion src/test/e2e/36_custom_retries_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,8 @@ func TestRetries(t *testing.T) {
stdOut, stdErr, err = e2e.Zarf("package", "deploy", path.Join(tmpDir, pkgName), "--retries", "2", "--timeout", "3s", "--tmpdir", tmpDir, "--confirm")
require.Error(t, err, stdOut, stdErr)
require.Contains(t, stdErr, "Retrying in 5s")
require.Contains(t, stdErr, "unable to install chart after 2 attempts")
require.Contains(t, e2e.StripMessageFormatting(stdErr), "unable to install chart after 2 attempts")

_, _, err = e2e.Zarf("package", "remove", "dos-games", "--confirm")
require.NoError(t, err)
}

0 comments on commit b1af44a

Please sign in to comment.