Breaking change: Stop deleting failed GKE clusters, taint instead #15887

Merged
3 changes: 3 additions & 0 deletions .changelog/8978.txt
@@ -0,0 +1,3 @@
```release-note:breaking-change
container: removed the behaviour where `google_container_cluster` would delete the cluster if it was created in an error state. Instead, the cluster is marked as tainted, allowing manual inspection and intervention. To proceed with deletion, run another `terraform apply`.
```
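
For context on why the new code simply leaves the cluster ID set, here is a minimal sketch (not taken from this PR; the resource name, ID string, and `waitForClusterCreation` helper are placeholders) of the Plugin SDKv2 behaviour the change relies on: when a Create function returns an error while the resource ID is non-empty, Terraform records the instance as tainted instead of dropping it from state, and the next apply plans it for replacement, which is what ultimately deletes the failed cluster.

```go
package example

import (
	"fmt"

	"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
)

// waitForClusterCreation stands in for an operation poll such as
// ContainerOperationWait; it is a placeholder for illustration only.
func waitForClusterCreation() error {
	return fmt.Errorf("cluster entered an ERROR state")
}

// resourceExampleCreate shows the pattern: the ID is set as soon as the API
// accepts the create request, and a later error is returned without clearing
// it. Terraform then marks the instance as tainted, and the next
// `terraform apply` plans it for replacement.
func resourceExampleCreate(d *schema.ResourceData, meta interface{}) error {
	d.SetId("projects/my-project/locations/us-central1/clusters/my-cluster")

	if err := waitForClusterCreation(); err != nil {
		// Deliberately not calling d.SetId(""): keeping the ID in state is
		// what marks the resource as tainted rather than removing it.
		return fmt.Errorf("Error waiting for creating GKE cluster: %s", err)
	}
	return nil
}
```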
69 changes: 7 additions & 62 deletions google/services/container/resource_container_cluster.go
@@ -2078,29 +2078,26 @@ func resourceContainerClusterCreate(d *schema.ResourceData, meta interface{}) er
if err := d.Set("operation", op.Name); err != nil {
return fmt.Errorf("Error setting operation: %s", err)
}

return nil
default:
// leaving default case to ensure this is non blocking
}

// Try a GET on the cluster so we can see the state in debug logs. This will help classify error states.
clusterGetCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Get(containerClusterFullName(project, location, clusterName))
if config.UserProjectOverride {
clusterGetCall.Header().Add("X-Goog-User-Project", project)
}

_, getErr := clusterGetCall.Do()
if getErr != nil {
log.Printf("[WARN] Cluster %s was created in an error state and not found", clusterName)
d.SetId("")
}

if deleteErr := cleanFailedContainerCluster(d, meta); deleteErr != nil {
log.Printf("[WARN] Unable to clean up cluster from failed creation: %s", deleteErr)
// Leave ID set as the cluster likely still exists and should not be removed from state yet.
} else {
log.Printf("[WARN] Verified failed creation of cluster %s was cleaned up", d.Id())
d.SetId("")
}
// The resource didn't actually create
// Don't clear cluster id, this will taint the resource
log.Printf("[WARN] GKE cluster %s was created in an error state, and has been marked as tainted", clusterName)
return waitErr
}

@@ -2245,14 +2242,8 @@ func resourceContainerClusterRead(d *schema.ResourceData, meta interface{}) erro
d.SetId("")
}

if deleteErr := cleanFailedContainerCluster(d, meta); deleteErr != nil {
log.Printf("[WARN] Unable to clean up cluster from failed creation: %s", deleteErr)
// Leave ID set as the cluster likely still exists and should not be removed from state yet.
} else {
log.Printf("[WARN] Verified failed creation of cluster %s was cleaned up", d.Id())
d.SetId("")
}
// The resource didn't actually create
// Don't clear cluster id, this will taint the resource
log.Printf("[WARN] GKE cluster %s was created in an error state, and has been marked as tainted", clusterName)
return waitErr
}
}
@@ -3658,52 +3649,6 @@ func resourceContainerClusterDelete(d *schema.ResourceData, meta interface{}) er
return nil
}

// cleanFailedContainerCluster deletes clusters that failed but were
// created in an error state. Similar to resourceContainerClusterDelete
// but implemented in separate function as it doesn't try to lock already
// locked cluster state, does different error handling, and doesn't do retries.
func cleanFailedContainerCluster(d *schema.ResourceData, meta interface{}) error {
config := meta.(*transport_tpg.Config)

project, err := tpgresource.GetProject(d, config)
if err != nil {
return err
}

location, err := tpgresource.GetLocation(d, config)
if err != nil {
return err
}

userAgent, err := tpgresource.GenerateUserAgentString(d, config.UserAgent)
if err != nil {
return err
}

clusterName := d.Get("name").(string)
fullName := containerClusterFullName(project, location, clusterName)

log.Printf("[DEBUG] Cleaning up failed GKE cluster %s", d.Get("name").(string))
clusterDeleteCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Delete(fullName)
if config.UserProjectOverride {
clusterDeleteCall.Header().Add("X-Goog-User-Project", project)
}
op, err := clusterDeleteCall.Do()
if err != nil {
return transport_tpg.HandleNotFoundError(err, d, fmt.Sprintf("Container Cluster %q", d.Get("name").(string)))
}

// Wait until it's deleted
waitErr := ContainerOperationWait(config, op, project, location, "deleting GKE cluster", userAgent, d.Timeout(schema.TimeoutDelete))
if waitErr != nil {
return waitErr
}

log.Printf("[INFO] GKE cluster %s has been deleted", d.Id())
d.SetId("")
return nil
}

var containerClusterRestingStates = RestingStates{
"RUNNING": ReadyState,
"DEGRADED": ErrorState,
5 changes: 4 additions & 1 deletion google/services/container/resource_container_cluster_test.go
@@ -2693,6 +2693,9 @@ func TestAccContainerCluster_autoprovisioningDefaultsManagement(t *testing.T) {
})
}

// This resource originally cleaned up the dangling cluster directly, but now
// taints it, having Terraform clean it up during the next apply. This test
// name is now inexact, but is being preserved to maintain the test history.
func TestAccContainerCluster_errorCleanDanglingCluster(t *testing.T) {
t.Parallel()

@@ -2721,7 +2724,7 @@ func TestAccContainerCluster_errorCleanDanglingCluster(t *testing.T) {
Config: overlapConfig,
ExpectError: regexp.MustCompile("Error waiting for creating GKE cluster"),
},
// If dangling cluster wasn't deleted, this plan will return an error
// If the tainted cluster isn't deleted, this step will return an error
{
Config: overlapConfig,
PlanOnly: true,