Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix an issue with resuming a failed container cluster creation #13580

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changelog/7121.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
container: fixed an issue with resuming failed cluster creation
```
162 changes: 160 additions & 2 deletions google/bootstrap_utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@ import (
"testing"
"time"

"google.golang.org/api/cloudkms/v1"
"google.golang.org/api/cloudbilling/v1"
cloudkms "google.golang.org/api/cloudkms/v1"
cloudresourcemanager "google.golang.org/api/cloudresourcemanager/v1"
"google.golang.org/api/iam/v1"
iam "google.golang.org/api/iam/v1"
sqladmin "google.golang.org/api/sqladmin/v1beta4"
)

Expand Down Expand Up @@ -358,6 +359,163 @@ func BootstrapServicePerimeterProjects(t *testing.T, desiredProjects int) []*clo
return projects
}

// removeContainerServiceAgentRoleFromContainerEngineRobot edits the IAM policy
// of project so that the project's container-engine-robot service account:
//
//   - is no longer a member of any "roles/container.serviceAgent" binding, and
//   - is a member of every "roles/editor" binding (a new "roles/editor"
//     binding is appended when the policy has none).
//
// The policy is written back only when one of those edits changed it. Any API
// failure aborts the test through t.Fatalf. The function returns without
// doing anything when BootstrapConfig yields a nil config.
//
// NOTE(review): presumably this is used to put the project into a state where
// GKE cluster creation fails — confirm against the calling test.
func removeContainerServiceAgentRoleFromContainerEngineRobot(t *testing.T, project *cloudresourcemanager.Project) {
	config := BootstrapConfig(t)
	if config == nil {
		return
	}

	client := config.NewResourceManagerClient(config.userAgent)
	containerEngineRobot := fmt.Sprintf("serviceAccount:service-%d@container-engine-robot.iam.gserviceaccount.com", project.ProjectNumber)

	policy, err := client.Projects.GetIamPolicy(project.ProjectId, &cloudresourcemanager.GetIamPolicyRequest{}).Do()
	if err != nil {
		t.Fatalf("error getting project iam policy: %v", err)
	}

	editorBindingFound := false
	changed := false
	for _, binding := range policy.Bindings {
		switch binding.Role {
		case "roles/container.serviceAgent":
			// Filter in place, reusing the backing array. Unlike a swap-remove
			// performed while ranging, this drops every occurrence of the
			// robot and keeps the remaining members in their original order.
			kept := binding.Members[:0]
			for _, member := range binding.Members {
				if member != containerEngineRobot {
					kept = append(kept, member)
				}
			}
			if len(kept) != len(binding.Members) {
				binding.Members = kept
				changed = true
			}
		case "roles/editor":
			editorBindingFound = true
			alreadyMember := false
			for _, member := range binding.Members {
				if member == containerEngineRobot {
					alreadyMember = true
					break
				}
			}
			if !alreadyMember {
				binding.Members = append(binding.Members, containerEngineRobot)
				changed = true
			}
		}
	}

	if !editorBindingFound {
		policy.Bindings = append(policy.Bindings, &cloudresourcemanager.Binding{
			Members: []string{containerEngineRobot},
			Role:    "roles/editor",
		})
		changed = true
	}

	if !changed {
		return
	}

	// The updated policy returned by the API is not needed here.
	setPolicyRequest := &cloudresourcemanager.SetIamPolicyRequest{Policy: policy}
	if _, err := client.Projects.SetIamPolicy(project.ProjectId, setPolicyRequest).Do(); err != nil {
		t.Fatalf("error setting project iam policy: %v", err)
	}
}

// BootstrapProject returns the project identified by projectID, creating and
// configuring it as needed for tests.
//
// Steps, in order:
//  1. Look the project up. If the lookup fails with a 403, create the project
//     under the organization returned by getTestOrgFromEnv, wait up to four
//     minutes for the create operation, and look it up again. Any other
//     lookup error is fatal.
//  2. If the project's lifecycle state is "DELETE_REQUESTED", undelete it.
//  3. If billingAccount is non-empty, read the project's billing info and
//     update it when it does not already name billingAccount.
//  4. If services is non-empty, enable every listed service that is not
//     currently enabled.
//
// All failures abort the test through t.Fatalf. The function returns nil
// when BootstrapConfig yields a nil config.
func BootstrapProject(t *testing.T, projectID, billingAccount string, services []string) *cloudresourcemanager.Project {
	config := BootstrapConfig(t)
	if config == nil {
		return nil
	}

	rm := config.NewResourceManagerClient(config.userAgent)

	project, err := rm.Projects.Get(projectID).Do()
	if err != nil {
		// Only a 403 is treated as "project does not exist yet".
		if !isGoogleApiErrorWithCode(err, 403) {
			t.Fatalf("Error getting bootstrapped project: %s", err)
		}

		toCreate := &cloudresourcemanager.Project{
			ProjectId: projectID,
			Name:      "Bootstrapped Test Project",
			Parent: &cloudresourcemanager.ResourceId{
				Type: "organization",
				Id:   getTestOrgFromEnv(t),
			},
		}
		createOp, createErr := rm.Projects.Create(toCreate).Do()
		if createErr != nil {
			t.Fatalf("Error creating bootstrapped test project: %s", createErr)
		}

		createOpMap, convertErr := ConvertToMap(createOp)
		if convertErr != nil {
			t.Fatalf("Error converting create project operation to map: %s", convertErr)
		}

		if waitErr := resourceManagerOperationWaitTime(config, createOpMap, "creating project", config.userAgent, 4*time.Minute); waitErr != nil {
			t.Fatalf("Error waiting for create project operation: %s", waitErr)
		}

		// Re-read so the returned value reflects the freshly created project.
		if project, err = rm.Projects.Get(projectID).Do(); err != nil {
			t.Fatalf("Error getting bootstrapped project: %s", err)
		}
	}

	if project.LifecycleState == "DELETE_REQUESTED" {
		if _, undeleteErr := rm.Projects.Undelete(projectID, &cloudresourcemanager.UndeleteProjectRequest{}).Do(); undeleteErr != nil {
			t.Fatalf("Error undeleting bootstrapped project: %s", undeleteErr)
		}
	}

	if billingAccount != "" {
		billingClient := config.NewBillingClient(config.userAgent)

		var billingInfo *cloudbilling.ProjectBillingInfo
		fetchBillingInfo := func() error {
			fetched, fetchErr := billingClient.Projects.GetBillingInfo(prefixedProject(projectID)).Do()
			billingInfo = fetched
			return fetchErr
		}
		if err = retryTimeDuration(fetchBillingInfo, 30*time.Second); err != nil {
			t.Fatalf("Error getting billing info for project %q: %v", projectID, err)
		}

		currentAccount := strings.TrimPrefix(billingInfo.BillingAccountName, "billingAccounts/")
		if currentAccount != billingAccount {
			billingInfo.BillingAccountName = "billingAccounts/" + billingAccount
			pushBillingInfo := func() error {
				_, updateErr := config.NewBillingClient(config.userAgent).Projects.UpdateBillingInfo(prefixedProject(projectID), billingInfo).Do()
				return updateErr
			}
			if updateErr := retryTimeDuration(pushBillingInfo, 2*time.Minute); updateErr != nil {
				t.Fatalf("Error setting billing account for project %q to %q: %s", projectID, billingAccount, updateErr)
			}
		}
	}

	if len(services) == 0 {
		return project
	}

	alreadyEnabled, listErr := listCurrentlyEnabledServices(projectID, "", config.userAgent, config, 1*time.Minute)
	if listErr != nil {
		t.Fatalf("Error listing services for project %q: %s", projectID, listErr)
	}

	// Collect only the requested services that are not enabled yet.
	pending := make([]string, 0, len(services))
	for _, svc := range services {
		if _, ok := alreadyEnabled[svc]; ok {
			continue
		}
		pending = append(pending, svc)
	}

	if len(pending) > 0 {
		enableErr := enableServiceUsageProjectServices(pending, projectID, "", config.userAgent, config, 10*time.Minute)
		if enableErr != nil {
			t.Fatalf("Error enabling services for project %q: %s", projectID, enableErr)
		}
	}

	return project
}

func BootstrapConfig(t *testing.T) *Config {
if v := os.Getenv("TF_ACC"); v == "" {
t.Skip("Acceptance tests and bootstrapping skipped unless env 'TF_ACC' set")
Expand Down
22 changes: 21 additions & 1 deletion google/resource_container_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -2003,6 +2003,8 @@ func resourceContainerClusterRead(d *schema.ResourceData, meta interface{}) erro
return err
}

clusterName := d.Get("name").(string)

operation := d.Get("operation").(string)
if operation != "" {
log.Printf("[DEBUG] in progress operation detected at %v, attempting to resume", operation)
Expand All @@ -2014,11 +2016,29 @@ func resourceContainerClusterRead(d *schema.ResourceData, meta interface{}) erro
}
waitErr := containerOperationWait(config, op, project, location, "resuming GKE cluster", userAgent, d.Timeout(schema.TimeoutRead))
if waitErr != nil {
// Try a GET on the cluster so we can see the state in debug logs. This will help classify error states.
clusterGetCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Get(containerClusterFullName(project, location, clusterName))
if config.UserProjectOverride {
clusterGetCall.Header().Add("X-Goog-User-Project", project)
}
_, getErr := clusterGetCall.Do()
if getErr != nil {
log.Printf("[WARN] Cluster %s was created in an error state and not found", clusterName)
d.SetId("")
}

if deleteErr := cleanFailedContainerCluster(d, meta); deleteErr != nil {
log.Printf("[WARN] Unable to clean up cluster from failed creation: %s", deleteErr)
// Leave ID set as the cluster likely still exists and should not be removed from state yet.
} else {
log.Printf("[WARN] Verified failed creation of cluster %s was cleaned up", d.Id())
d.SetId("")
}
// The resource didn't actually create
return waitErr
}
}

clusterName := d.Get("name").(string)
name := containerClusterFullName(project, location, clusterName)
clusterGetCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Get(name)
if config.UserProjectOverride {
Expand Down
58 changes: 58 additions & 0 deletions google/resource_container_cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3795,6 +3795,32 @@ func TestAccContainerCluster_withEnablePrivateEndpointToggle(t *testing.T) {
})
}

// TestAccContainerCluster_failedCreation covers the case where a cluster
// fails to create: a later apply is expected to delete the resource.
//
// The first step applies a config with a 40s create timeout and expects the
// wait-for-DONE timeout error. The second step applies the same cluster
// without that timeout and expects a "Failed to create cluster" error.
func TestAccContainerCluster_failedCreation(t *testing.T) {
	t.Parallel()

	name := fmt.Sprintf("tf-test-cluster-%s", randString(t, 10))

	// Prepare a dedicated project and adjust the IAM bindings of its
	// container engine robot before any cluster is requested.
	project := BootstrapProject(t, "tf-fail-cluster-test", getTestBillingAccountFromEnv(t), []string{"container.googleapis.com"})
	removeContainerServiceAgentRoleFromContainerEngineRobot(t, project)

	steps := []resource.TestStep{
		{
			Config:      testAccContainerCluster_failedCreation(name, project.ProjectId),
			ExpectError: regexp.MustCompile("timeout while waiting for state to become 'DONE'"),
		},
		{
			Config:      testAccContainerCluster_failedCreation_update(name, project.ProjectId),
			ExpectError: regexp.MustCompile("Failed to create cluster"),
			Check:       testAccCheckContainerClusterDestroyProducer(t),
		},
	}

	vcrTest(t, resource.TestCase{
		PreCheck:  func() { testAccPreCheck(t) },
		Providers: testAccProviders,
		Steps:     steps,
	})
}

func testAccContainerCluster_withEnablePrivateEndpoint(clusterName string, flag string) string {

return fmt.Sprintf(`
Expand Down Expand Up @@ -6133,3 +6159,35 @@ resource "google_container_cluster" "primary" {
}
`, name, name, name)
}

// testAccContainerCluster_failedCreation renders a google_container_cluster
// configuration named cluster in project, located in us-central1-a with one
// initial node, a workload pool of "<project>.svc.id.goog", and a create
// timeout of 40s.
func testAccContainerCluster_failedCreation(cluster, project string) string {
	// Verbs, in order: cluster name, project, project (workload pool prefix).
	const configTemplate = `
resource "google_container_cluster" "primary" {
name = "%s"
project = "%s"
location = "us-central1-a"
initial_node_count = 1

workload_identity_config {
workload_pool = "%s.svc.id.goog"
}

timeouts {
create = "40s"
}
}`
	return fmt.Sprintf(configTemplate, cluster, project, project)
}

// testAccContainerCluster_failedCreation_update renders a
// google_container_cluster configuration named cluster in project, located
// in us-central1-a with one initial node and a workload pool of
// "<project>.svc.id.goog". It sets no timeouts block.
func testAccContainerCluster_failedCreation_update(cluster, project string) string {
	// Verbs, in order: cluster name, project, project (workload pool prefix).
	const configTemplate = `
resource "google_container_cluster" "primary" {
name = "%s"
project = "%s"
location = "us-central1-a"
initial_node_count = 1

workload_identity_config {
workload_pool = "%s.svc.id.goog"
}
}`
	return fmt.Sprintf(configTemplate, cluster, project, project)
}