Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix!: do not uninstall helm chart after failed install or upgrade #2456

Merged
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions site/src/content/docs/ref/deploy.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,43 @@ $ zarf connect [service name]
You can also specify a package locally, or via oci such as `zarf package deploy oci://defenseunicorns/dos-games:1.0.0-$(uname -m) --key=https://zarf.dev/cosign.pub`

:::

## Installing, Upgrading, and Rolling Back with Helm

Zarf deploys resources in Kubernetes using [Helm's Go SDK](https://helm.sh/docs/topics/advanced/#go-sdk), and converts manifests into Helm charts for installation.

If no existing Helm releases match a given chart in the cluster, Zarf executes a `helm install`.

Should matching releases exist, a `helm upgrade` is performed.

### Handling CustomResourceDefinitions (CRDs)

- CRDs are _included_ during `helm install` to support Kubernetes Operator deployments
- CRDs are _excluded_ during `helm upgrade` due to [Helm's lack of support for upgrading CRDs](https://helm.sh/docs/chart_best_practices/custom_resource_definitions/#some-caveats-and-explanations)

### Waiting for Resource Readiness

By default, Zarf waits for all resources to deploy successfully during install, upgrade, and rollback operations.

You can override this behavior during install and upgrade by setting the `noWait: true` key under the `charts` and `manifests` fields.

### Timeout Settings

The default timeout for Helm operations in Zarf is 15 minutes.

Use the `--timeout` flag with `zarf init` and `zarf package deploy` to modify the timeout duration.

### Retry Policy

Zarf retries install and upgrade operations up to three times by default if an error occurs.

Use the `--retries` flag with `zarf init` and `zarf package deploy` to change the number of retry attempts.

### Rollback Process

If attempts to upgrade a chart fail, Zarf tries to roll the chart back to its last successful release.

Key features of Zarf's `helm rollback` include:

- Deleting newly created resources if a rollback fails (`helm rollback --cleanup-on-fail`)
- Forcing updates by deleting and recreating resources as necessary (`helm rollback --force`)
lucasrod16 marked this conversation as resolved.
Show resolved Hide resolved
25 changes: 10 additions & 15 deletions src/internal/packager/helm/chart.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
}

if err != nil {
return fmt.Errorf("unable to complete the helm chart install/upgrade: %w", err)
return err
}

message.Debug(output.Info.Description)
Expand All @@ -92,7 +92,6 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {

err = helpers.Retry(tryHelm, h.retries, 5*time.Second, message.Warnf)
if err != nil {
// Try to rollback any deployed releases
releases, _ := histClient.Run(h.chart.ReleaseName)
previouslyDeployedVersion := 0

Expand All @@ -103,25 +102,21 @@ func (h *Helm) InstallOrUpgradeChart() (types.ConnectStrings, string, error) {
}
}

// On total failure try to rollback (if there was a previously deployed version) or uninstall.
if previouslyDeployedVersion > 0 {
spinner.Updatef("Performing chart rollback")

err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion)
if err != nil {
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %w", h.retries, err)
}
removeMsg := "if you need to remove the failed chart, use `zarf package remove`"

return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts", h.retries)
// No prior releases means this was an initial install.
if previouslyDeployedVersion == 0 {
return nil, "", fmt.Errorf("unable to install chart after %d attempts: %s", h.retries, removeMsg)
}

spinner.Updatef("Performing chart uninstall")
lucasrod16 marked this conversation as resolved.
Show resolved Hide resolved
_, err = h.uninstallChart(h.chart.ReleaseName)
// Attempt to rollback on a failed upgrade.
spinner.Updatef("Performing chart rollback")
err = h.rollbackChart(h.chart.ReleaseName, previouslyDeployedVersion)
if err != nil {
return nil, "", fmt.Errorf("unable to install chart after %d attempts and unable to uninstall: %w", h.retries, err)
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts and unable to rollback: %s", h.retries, removeMsg)
}

return nil, "", fmt.Errorf("unable to install chart after %d attempts", h.retries)
return nil, "", fmt.Errorf("unable to upgrade chart after %d attempts: %s", h.retries, removeMsg)
}

// return any collected connect strings for zarf connect.
Expand Down
2 changes: 1 addition & 1 deletion src/pkg/packager/deploy.go
Original file line number Diff line number Diff line change
Expand Up @@ -340,7 +340,7 @@ func (p *Packager) deployComponent(component types.ZarfComponent, noImgChecksum

if hasCharts || hasManifests {
if charts, err = p.installChartAndManifests(componentPath, component); err != nil {
return charts, fmt.Errorf("unable to install helm chart(s): %w", err)
return charts, err
}
}

Expand Down
13 changes: 8 additions & 5 deletions src/test/e2e/25_helm_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,16 +130,19 @@ func testHelmUninstallRollback(t *testing.T) {
// This package contains SBOMable things but was created with --skip-sbom
require.Contains(t, string(stdErr), "This package does NOT contain an SBOM.")

// Ensure that this does not leave behind a dos-games chart
// Ensure this leaves behind a dos-games chart.
// We do not want to uninstall charts that had failed installs/upgrades
// to prevent unintentional deletion and/or data loss in production environments.
// https://github.com/defenseunicorns/zarf/issues/2455
helmOut, err := exec.Command("helm", "list", "-n", "dos-games").Output()
require.NoError(t, err)
require.NotContains(t, string(helmOut), "zarf-f53a99d4a4dd9a3575bedf59cd42d48d751ae866")
require.Contains(t, string(helmOut), "zarf-f53a99d4a4dd9a3575bedf59cd42d48d751ae866")

// Deploy the good package.
stdOut, stdErr, err = e2e.Zarf("package", "deploy", goodPath, "--confirm")
require.NoError(t, err, stdOut, stdErr)

// Ensure that this does create a dos-games chart
// Ensure this upgrades/fixes the dos-games chart.
helmOut, err = exec.Command("helm", "list", "-n", "dos-games").Output()
require.NoError(t, err)
require.Contains(t, string(helmOut), "zarf-f53a99d4a4dd9a3575bedf59cd42d48d751ae866")
Expand All @@ -151,7 +154,7 @@ func testHelmUninstallRollback(t *testing.T) {
// Ensure that we rollback properly
helmOut, err = exec.Command("helm", "history", "-n", "dos-games", "zarf-f53a99d4a4dd9a3575bedf59cd42d48d751ae866", "--max", "1").Output()
require.NoError(t, err)
require.Contains(t, string(helmOut), "Rollback to 1")
require.Contains(t, string(helmOut), "Rollback to 4")

// Deploy the evil package (again to ensure we check full history)
stdOut, stdErr, err = e2e.Zarf("package", "deploy", evilPath, "--timeout", "10s", "--confirm")
Expand All @@ -160,7 +163,7 @@ func testHelmUninstallRollback(t *testing.T) {
// Ensure that we rollback properly
helmOut, err = exec.Command("helm", "history", "-n", "dos-games", "zarf-f53a99d4a4dd9a3575bedf59cd42d48d751ae866", "--max", "1").Output()
require.NoError(t, err)
require.Contains(t, string(helmOut), "Rollback to 5")
require.Contains(t, string(helmOut), "Rollback to 8")

// Remove the package.
stdOut, stdErr, err = e2e.Zarf("package", "remove", "dos-games", "--confirm")
Expand Down
5 changes: 4 additions & 1 deletion src/test/e2e/36_custom_retries_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,8 @@ func TestRetries(t *testing.T) {
stdOut, stdErr, err = e2e.Zarf("package", "deploy", path.Join(tmpDir, pkgName), "--retries", "2", "--timeout", "3s", "--tmpdir", tmpDir, "--confirm")
require.Error(t, err, stdOut, stdErr)
require.Contains(t, stdErr, "Retrying in 5s")
require.Contains(t, stdErr, "unable to install chart after 2 attempts")
require.Contains(t, e2e.StripMessageFormatting(stdErr), "unable to install chart after 2 attempts")

_, _, err = e2e.Zarf("package", "remove", "dos-games", "--confirm")
require.NoError(t, err)
}
Loading