Skip to content

Commit

Permalink
Add retry to allVpcsByRegion request (#163)
Browse files Browse the repository at this point in the history
  • Loading branch information
johan3141592 authored Apr 15, 2024
1 parent 51cab53 commit 0605e26
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 16 deletions.
9 changes: 4 additions & 5 deletions cmd/testenv/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func main() {
os.Exit(1)
}

// Load configuration and create client
// Load configuration and create a client
polAccount, err := polaris.DefaultServiceAccount(true)
if err != nil {
log.Fatal(err)
Expand All @@ -46,7 +46,6 @@ func main() {
}

ctx := context.Background()

if *precheck {
err = check(ctx, client)
} else {
Expand Down Expand Up @@ -76,7 +75,7 @@ func check(ctx context.Context, client *polaris.Client) error {
return nil
})

// AWS with cross account role
// AWS with a cross-account role
g.Go(func() error {
testAcc, err := testsetup.AWSAccount()
if err != nil {
Expand Down Expand Up @@ -157,7 +156,7 @@ func clean(ctx context.Context, client *polaris.Client) error {
return awsClient.RemoveAccount(ctx, aws.Profile(testAcc.Profile), features, false)
})

// AWS with cross account role
// AWS with a cross-account role
g.Go(func() error {
testAcc, err := testsetup.AWSAccount()
if err != nil {
Expand Down Expand Up @@ -219,7 +218,7 @@ func clean(ctx context.Context, client *polaris.Client) error {
// Remove all features for the subscription.
for _, feature := range azureAcc.Features {
if err := azureClient.RemoveSubscription(ctx, azure.CloudAccountID(azureAcc.ID), feature.Feature, false); err != nil {
return fmt.Errorf("failed to remove Azure cloud account fetaure: %v", pretty.Sprint(feature))
return fmt.Errorf("failed to remove Azure cloud account fetaure %v: %s", feature.Name, err)
}
}

Expand Down
2 changes: 1 addition & 1 deletion pkg/polaris/aws/aws.go
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,7 @@ func (a API) AddAccountArtifacts(ctx context.Context, id IdentityFunc, features
}

// TrustPolicies returns the trust policies required by RSC for the specified
// features. If the external ID is the empty, RSC will generate an external ID.
// features. If the external ID is empty, RSC will generate an external ID.
func (a API) TrustPolicies(ctx context.Context, id IdentityFunc, features []core.Feature, externalID string) (map[string]string, error) {
a.log.Print(log.Trace)

Expand Down
38 changes: 35 additions & 3 deletions pkg/polaris/aws/exocompute.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ import (
"context"
"errors"
"fmt"
"strings"
"time"

"github.com/google/uuid"

Expand Down Expand Up @@ -114,7 +116,7 @@ func Managed(region, vpcID string, subnetIDs []string) ExoConfigFunc {
}

// Validate VPC.
vpcs, err := aws.Wrap(gql).AllVpcsByRegion(ctx, id, reg)
vpcs, err := allVpcsByRegionWithRetry(ctx, gql, id, reg)
if err != nil {
return aws.ExocomputeConfigCreate{}, fmt.Errorf("failed to get vpcs: %v", err)
}
Expand Down Expand Up @@ -155,7 +157,7 @@ func Unmanaged(region, vpcID string, subnetIDs []string, clusterSecurityGroupID,
}

// Validate VPC.
vpcs, err := aws.Wrap(gql).AllVpcsByRegion(ctx, id, reg)
vpcs, err := allVpcsByRegionWithRetry(ctx, gql, id, reg)
if err != nil {
return aws.ExocomputeConfigCreate{}, fmt.Errorf("failed to get vpcs: %v", err)
}
Expand Down Expand Up @@ -198,6 +200,36 @@ func Unmanaged(region, vpcID string, subnetIDs []string, clusterSecurityGroupID,
}
}

const allVpcsByRegionAttempts = 10

// allVpcsByRegionWithRetry returns all VPCs for the specified account and
// region. If the request fails due to the connection being closed while
// performing TLS negotiation, it will be retried.
func allVpcsByRegionWithRetry(ctx context.Context, gql *graphql.Client, id uuid.UUID, region aws.Region) ([]aws.VPC, error) {
attempt := 0
for {
vpcs, err := aws.Wrap(gql).AllVpcsByRegion(ctx, id, region)
if attempt++; err != nil && attempt > allVpcsByRegionAttempts {
return nil, fmt.Errorf("failed to get vpcs after %d attempts", allVpcsByRegionAttempts)
}

var gqlErr graphql.GQLError
if errors.As(err, &gqlErr) {
if strings.HasPrefix(gqlErr.Error(), "UNAVAILABLE: Connection closed while performing TLS negotiation") {
gql.Log().Printf(log.Debug, "Endpoint temporarily unavailable (attempt: %d)", attempt)
select {
case <-time.After(10 * time.Second):
continue
case <-ctx.Done():
return nil, ctx.Err()
}
}
}

return vpcs, err
}
}

func BYOKCluster(region string) ExoConfigFunc {
return func(ctx context.Context, gql *graphql.Client, id uuid.UUID) (aws.ExocomputeConfigCreate, error) {
reg, err := aws.ParseRegion(region)
Expand All @@ -209,7 +241,7 @@ func BYOKCluster(region string) ExoConfigFunc {
}
}

// toExocomputeConfig converts an polaris/graphql/aws exocompute config to a
// toExocomputeConfig converts a polaris/graphql/aws exocompute config to a
// polaris/aws exocompute config.
func toExocomputeConfig(config aws.ExocomputeConfig) (ExocomputeConfig, error) {
id, err := uuid.Parse(config.ID)
Expand Down
8 changes: 4 additions & 4 deletions pkg/polaris/graphql/aws/cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -404,25 +404,25 @@ type VPC struct {

// AllVpcsByRegion returns all VPCs including their subnets for the specified
// RSC cloud account id.
func (a API) AllVpcsByRegion(ctx context.Context, id uuid.UUID, regions Region) ([]VPC, error) {
func (a API) AllVpcsByRegion(ctx context.Context, id uuid.UUID, region Region) ([]VPC, error) {
a.log.Print(log.Trace)

buf, err := a.GQL.Request(ctx, allVpcsByRegionFromAwsQuery, struct {
ID uuid.UUID `json:"awsAccountRubrikId"`
Region Region `json:"region"`
}{ID: id, Region: regions})
}{ID: id, Region: region})
if err != nil {
return nil, fmt.Errorf("failed to request allVpcsByRegionFromAws: %w", err)
}
a.log.Printf(log.Debug, "allVpcsByRegionFromAws(%q, %q): %s", id, regions, string(buf))
a.log.Printf(log.Debug, "allVpcsByRegionFromAws(%q, %q): %s", id, region, string(buf))

var payload struct {
Data struct {
VPCs []VPC `json:"allVpcsByRegionFromAws"`
} `json:"data"`
}
if err := json.Unmarshal(buf, &payload); err != nil {
return nil, fmt.Errorf("failed to unmarshal allVpcsByRegionFromAws: %v", err)
return nil, fmt.Errorf("failed to unmarshal allVpcsByRegionFromAws: %s", err)
}

return payload.Data.VPCs, nil
Expand Down
4 changes: 2 additions & 2 deletions pkg/polaris/graphql/azure/cloud.go
Original file line number Diff line number Diff line change
Expand Up @@ -176,8 +176,8 @@ func (a API) AddCloudAccountWithoutOAuth(ctx context.Context, cloud Cloud, id uu
if err != nil {
return "", fmt.Errorf("failed to request addAzureCloudAccountWithoutOauth: %w", err)
}
a.log.Printf(log.Debug, "addAzureCloudAccountWithoutOauth(%q, %q, %q, %q, %q, %q, %d): %s", cloud, id, feature, name,
tenantDomain, regions, feature.PolicyVersion, string(buf))
a.log.Printf(log.Debug, "addAzureCloudAccountWithoutOauth(%q, %q, %q, %q, %q, %q, %d): %s", cloud, id,
feature.FeatureType, name, tenantDomain, regions, feature.PolicyVersion, string(buf))

var payload struct {
Data struct {
Expand Down
2 changes: 1 addition & 1 deletion pkg/polaris/graphql/core/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ func ParseFeature(feature string) (Feature, error) {
const (
// Number of attempts before failing to wait for the Korg job when the error
// returned is a 403, objects not authorized.
waitAttempts = 15
waitAttempts = 20
)

// Status represents a Polaris cloud account status.
Expand Down

0 comments on commit 0605e26

Please sign in to comment.