Skip to content

Commit

Permalink
Fix an issue with resuming a failed container cluster creation (#7121) (#13580)
Browse files Browse the repository at this point in the history

* Fix an issue with resuming a failed container cluster creation and add a test

* Do not persist operation when resuming during read.

Signed-off-by: Modular Magician <magic-modules@google.com>

Signed-off-by: Modular Magician <magic-modules@google.com>
  • Loading branch information
modular-magician committed Jan 26, 2023
1 parent 022e24a commit d433400
Show file tree
Hide file tree
Showing 4 changed files with 242 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .changelog/7121.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
container: fixed an issue with resuming failed cluster creation
```
162 changes: 160 additions & 2 deletions google/bootstrap_utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@ import (
"testing"
"time"

"google.golang.org/api/cloudkms/v1"
"google.golang.org/api/cloudbilling/v1"
cloudkms "google.golang.org/api/cloudkms/v1"
cloudresourcemanager "google.golang.org/api/cloudresourcemanager/v1"
"google.golang.org/api/iam/v1"
iam "google.golang.org/api/iam/v1"
sqladmin "google.golang.org/api/sqladmin/v1beta4"
)

Expand Down Expand Up @@ -358,6 +359,163 @@ func BootstrapServicePerimeterProjects(t *testing.T, desiredProjects int) []*clo
return projects
}

// removeContainerServiceAgentRoleFromContainerEngineRobot edits the IAM
// policy of the given project so that the Container Engine robot service
// account loses roles/container.serviceAgent while keeping (or gaining)
// roles/editor. Per the failed-creation test below, this is used to make
// subsequent GKE cluster creation in the project fail.
// No-op when bootstrapping is skipped (BootstrapConfig returns nil).
func removeContainerServiceAgentRoleFromContainerEngineRobot(t *testing.T, project *cloudresourcemanager.Project) {
	config := BootstrapConfig(t)
	if config == nil {
		return
	}

	client := config.NewResourceManagerClient(config.userAgent)
	// The robot's member string is derived from the project *number*, not the
	// project ID.
	containerEngineRobot := fmt.Sprintf("serviceAccount:service-%d@container-engine-robot.iam.gserviceaccount.com", project.ProjectNumber)
	getPolicyRequest := &cloudresourcemanager.GetIamPolicyRequest{}
	policy, err := client.Projects.GetIamPolicy(project.ProjectId, getPolicyRequest).Do()
	if err != nil {
		t.Fatalf("error getting project iam policy: %v", err)
	}
	roleFound := false // whether a roles/editor binding exists in the policy
	changed := false   // whether the policy was modified and must be written back
	for _, binding := range policy.Bindings {
		if binding.Role == "roles/container.serviceAgent" {
			// Remove the robot from the serviceAgent binding via
			// swap-with-last followed by one truncation. NOTE(review): this
			// assumes the member appears at most once in the binding, which
			// holds for IAM policies where members are unique per binding.
			memberFound := false
			for i, member := range binding.Members {
				if member == containerEngineRobot {
					binding.Members[i] = binding.Members[len(binding.Members)-1]
					memberFound = true
				}
			}
			if memberFound {
				binding.Members = binding.Members[:len(binding.Members)-1]
				changed = true
			}
		} else if binding.Role == "roles/editor" {
			// Make sure the robot is present in the editor binding so it
			// retains broad (but non-serviceAgent) access.
			memberFound := false
			for _, member := range binding.Members {
				if member == containerEngineRobot {
					memberFound = true
					break
				}
			}
			if !memberFound {
				binding.Members = append(binding.Members, containerEngineRobot)
				changed = true
			}
			roleFound = true
		}
	}
	if !roleFound {
		// No roles/editor binding existed at all; add one containing only
		// the robot.
		policy.Bindings = append(policy.Bindings, &cloudresourcemanager.Binding{
			Members: []string{containerEngineRobot},
			Role:    "roles/editor",
		})
		changed = true
	}
	if changed {
		// Write the modified policy back. NOTE(review): this read-modify-write
		// has no etag conflict retry; a concurrent policy change could race.
		setPolicyRequest := &cloudresourcemanager.SetIamPolicyRequest{Policy: policy}
		policy, err = client.Projects.SetIamPolicy(project.ProjectId, setPolicyRequest).Do()
		if err != nil {
			t.Fatalf("error setting project iam policy: %v", err)
		}
	}
}

// BootstrapProject returns the project named projectID for use in tests,
// creating it under the test organization if it does not already exist.
// It additionally:
//   - undeletes the project if it is pending deletion,
//   - attaches billingAccount (when non-empty) if not already attached,
//   - enables any of the listed services that are not currently enabled.
//
// Returns nil when bootstrapping is skipped (BootstrapConfig returns nil);
// any API failure aborts the test via t.Fatalf.
func BootstrapProject(t *testing.T, projectID, billingAccount string, services []string) *cloudresourcemanager.Project {
	config := BootstrapConfig(t)
	if config == nil {
		return nil
	}

	crmClient := config.NewResourceManagerClient(config.userAgent)

	project, err := crmClient.Projects.Get(projectID).Do()
	if err != nil {
		// A 403 here is treated as "project does not exist (to us)";
		// anything else is a hard failure.
		if !isGoogleApiErrorWithCode(err, 403) {
			t.Fatalf("Error getting bootstrapped project: %s", err)
		}
		org := getTestOrgFromEnv(t)

		op, err := crmClient.Projects.Create(&cloudresourcemanager.Project{
			ProjectId: projectID,
			Name:      "Bootstrapped Test Project",
			Parent: &cloudresourcemanager.ResourceId{
				Type: "organization",
				Id:   org,
			},
		}).Do()
		if err != nil {
			t.Fatalf("Error creating bootstrapped test project: %s", err)
		}

		// The generic operation waiter consumes the operation as a map.
		opAsMap, err := ConvertToMap(op)
		if err != nil {
			t.Fatalf("Error converting create project operation to map: %s", err)
		}

		err = resourceManagerOperationWaitTime(config, opAsMap, "creating project", config.userAgent, 4*time.Minute)
		if err != nil {
			t.Fatalf("Error waiting for create project operation: %s", err)
		}

		// Re-fetch so the returned *Project reflects the created resource.
		project, err = crmClient.Projects.Get(projectID).Do()
		if err != nil {
			t.Fatalf("Error getting bootstrapped project: %s", err)
		}

	}

	// Revive the project if a previous run (or cleanup) soft-deleted it.
	if project.LifecycleState == "DELETE_REQUESTED" {
		_, err := crmClient.Projects.Undelete(projectID, &cloudresourcemanager.UndeleteProjectRequest{}).Do()
		if err != nil {
			t.Fatalf("Error undeleting bootstrapped project: %s", err)
		}
	}

	if billingAccount != "" {
		billingClient := config.NewBillingClient(config.userAgent)
		var pbi *cloudbilling.ProjectBillingInfo
		// Reads are retried since billing info may lag project creation.
		err = retryTimeDuration(func() error {
			var reqErr error
			pbi, reqErr = billingClient.Projects.GetBillingInfo(prefixedProject(projectID)).Do()
			return reqErr
		}, 30*time.Second)
		if err != nil {
			t.Fatalf("Error getting billing info for project %q: %v", projectID, err)
		}
		// Only update billing when the attached account differs from the
		// requested one.
		if strings.TrimPrefix(pbi.BillingAccountName, "billingAccounts/") != billingAccount {
			pbi.BillingAccountName = "billingAccounts/" + billingAccount
			err := retryTimeDuration(func() error {
				_, err := config.NewBillingClient(config.userAgent).Projects.UpdateBillingInfo(prefixedProject(projectID), pbi).Do()
				return err
			}, 2*time.Minute)
			if err != nil {
				t.Fatalf("Error setting billing account for project %q to %q: %s", projectID, billingAccount, err)
			}
		}
	}

	if len(services) > 0 {

		enabledServices, err := listCurrentlyEnabledServices(projectID, "", config.userAgent, config, 1*time.Minute)
		if err != nil {
			t.Fatalf("Error listing services for project %q: %s", projectID, err)
		}

		// Enable only the services that are not already on, to avoid
		// unnecessary (and slow) service-usage calls.
		servicesToEnable := make([]string, 0, len(services))
		for _, service := range services {
			if _, ok := enabledServices[service]; !ok {
				servicesToEnable = append(servicesToEnable, service)
			}
		}

		if len(servicesToEnable) > 0 {
			if err := enableServiceUsageProjectServices(servicesToEnable, projectID, "", config.userAgent, config, 10*time.Minute); err != nil {
				t.Fatalf("Error enabling services for project %q: %s", projectID, err)
			}
		}
	}

	return project
}

func BootstrapConfig(t *testing.T) *Config {
if v := os.Getenv("TF_ACC"); v == "" {
t.Skip("Acceptance tests and bootstrapping skipped unless env 'TF_ACC' set")
Expand Down
22 changes: 21 additions & 1 deletion google/resource_container_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -2003,6 +2003,8 @@ func resourceContainerClusterRead(d *schema.ResourceData, meta interface{}) erro
return err
}

clusterName := d.Get("name").(string)

operation := d.Get("operation").(string)
if operation != "" {
log.Printf("[DEBUG] in progress operation detected at %v, attempting to resume", operation)
Expand All @@ -2014,11 +2016,29 @@ func resourceContainerClusterRead(d *schema.ResourceData, meta interface{}) erro
}
waitErr := containerOperationWait(config, op, project, location, "resuming GKE cluster", userAgent, d.Timeout(schema.TimeoutRead))
if waitErr != nil {
// Try a GET on the cluster so we can see the state in debug logs. This will help classify error states.
clusterGetCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Get(containerClusterFullName(project, location, clusterName))
if config.UserProjectOverride {
clusterGetCall.Header().Add("X-Goog-User-Project", project)
}
_, getErr := clusterGetCall.Do()
if getErr != nil {
log.Printf("[WARN] Cluster %s was created in an error state and not found", clusterName)
d.SetId("")
}

if deleteErr := cleanFailedContainerCluster(d, meta); deleteErr != nil {
log.Printf("[WARN] Unable to clean up cluster from failed creation: %s", deleteErr)
// Leave ID set as the cluster likely still exists and should not be removed from state yet.
} else {
log.Printf("[WARN] Verified failed creation of cluster %s was cleaned up", d.Id())
d.SetId("")
}
// The resource didn't actually create
return waitErr
}
}

clusterName := d.Get("name").(string)
name := containerClusterFullName(project, location, clusterName)
clusterGetCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Get(name)
if config.UserProjectOverride {
Expand Down
58 changes: 58 additions & 0 deletions google/resource_container_cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3795,6 +3795,32 @@ func TestAccContainerCluster_withEnablePrivateEndpointToggle(t *testing.T) {
})
}

// TestAccContainerCluster_failedCreation verifies that when a cluster fails
// to create, a subsequent apply deletes the broken resource and removes it
// from state.
func TestAccContainerCluster_failedCreation(t *testing.T) {
	t.Parallel()

	name := fmt.Sprintf("tf-test-cluster-%s", randString(t, 10))

	// Bootstrap a dedicated project and strip the container service agent
	// role from the Container Engine robot so cluster creation cannot
	// succeed there.
	project := BootstrapProject(t, "tf-fail-cluster-test", getTestBillingAccountFromEnv(t), []string{"container.googleapis.com"})
	removeContainerServiceAgentRoleFromContainerEngineRobot(t, project)

	steps := []resource.TestStep{
		{
			// First apply times out waiting on the doomed create operation.
			Config:      testAccContainerCluster_failedCreation(name, project.ProjectId),
			ExpectError: regexp.MustCompile("timeout while waiting for state to become 'DONE'"),
		},
		{
			// Second apply surfaces the creation failure and must leave no
			// cluster behind.
			Config:      testAccContainerCluster_failedCreation_update(name, project.ProjectId),
			ExpectError: regexp.MustCompile("Failed to create cluster"),
			Check:       testAccCheckContainerClusterDestroyProducer(t),
		},
	}

	vcrTest(t, resource.TestCase{
		PreCheck:  func() { testAccPreCheck(t) },
		Providers: testAccProviders,
		Steps:     steps,
	})
}

func testAccContainerCluster_withEnablePrivateEndpoint(clusterName string, flag string) string {

return fmt.Sprintf(`
Expand Down Expand Up @@ -6133,3 +6159,35 @@ resource "google_container_cluster" "primary" {
}
`, name, name, name)
}

// testAccContainerCluster_failedCreation returns a cluster configuration
// with a deliberately short 40s create timeout, used to interrupt cluster
// creation mid-flight in the failed-creation test.
func testAccContainerCluster_failedCreation(cluster, project string) string {
	const tmpl = `
resource "google_container_cluster" "primary" {
name = "%s"
project = "%s"
location = "us-central1-a"
initial_node_count = 1
workload_identity_config {
workload_pool = "%s.svc.id.goog"
}
timeouts {
create = "40s"
}
}`
	return fmt.Sprintf(tmpl, cluster, project, project)
}

// testAccContainerCluster_failedCreation_update returns the same cluster
// configuration as testAccContainerCluster_failedCreation but without the
// shortened create timeout, used as the follow-up apply step.
func testAccContainerCluster_failedCreation_update(cluster, project string) string {
	const tmpl = `
resource "google_container_cluster" "primary" {
name = "%s"
project = "%s"
location = "us-central1-a"
initial_node_count = 1
workload_identity_config {
workload_pool = "%s.svc.id.goog"
}
}`
	return fmt.Sprintf(tmpl, cluster, project, project)
}

0 comments on commit d433400

Please sign in to comment.