Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry on delete backend failure #330

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cloud/scope/load_balancer_reconciler.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ func (s *ClusterScope) ReconcileApiServerLB(ctx context.Context) error {
return err
}
if lb != nil {
if lb.LifecycleState != loadbalancer.LoadBalancerLifecycleStateActive {
return errors.New(fmt.Sprintf("load balancer is in %s state. Waiting for ACTIVE state.", lb.LifecycleState))
}
lbIP, err := s.getLoadbalancerIp(*lb)
if err != nil {
return err
Expand Down
94 changes: 64 additions & 30 deletions cloud/scope/load_balancer_reconciler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
Expand All @@ -123,11 +124,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
},
}, nil)
},
Expand All @@ -143,11 +145,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
Expand Down Expand Up @@ -189,11 +192,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
Expand Down Expand Up @@ -537,11 +541,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
Expand Down Expand Up @@ -570,6 +575,34 @@ func TestLBReconciliation(t *testing.T) {
}, nil)
},
},
{
name: "lb not active",
errorExpected: true,
errorSubStringMatch: true,
matchError: errors.New(fmt.Sprintf("load balancer is in %s state. Waiting for ACTIVE state.", loadbalancer.LoadBalancerLifecycleStateCreating)),
testSpecificSetup: func(clusterScope *ClusterScope, lbClient *mock_lb.MockLoadBalancerClient) {
ociClusterAccessor.OCICluster.Spec.NetworkSpec.APIServerLB.LoadBalancerId = common.String("lb-id")
lbClient.EXPECT().GetLoadBalancer(gomock.Any(), gomock.Eq(loadbalancer.GetLoadBalancerRequest{
LoadBalancerId: common.String("lb-id"),
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateCreating,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
IsPublic: common.Bool(true),
},
},
},
}, nil)
},
},
{
name: "lb update request failed",
errorExpected: true,
Expand All @@ -582,11 +615,12 @@ func TestLBReconciliation(t *testing.T) {
})).
Return(loadbalancer.GetLoadBalancerResponse{
LoadBalancer: loadbalancer.LoadBalancer{
Id: common.String("lb-id"),
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
Id: common.String("lb-id"),
LifecycleState: loadbalancer.LoadBalancerLifecycleStateActive,
FreeformTags: tags,
DefinedTags: make(map[string]map[string]interface{}),
IsPrivate: common.Bool(false),
DisplayName: common.String(fmt.Sprintf("%s-%s", "cluster", "apiserver-test")),
IpAddresses: []loadbalancer.IpAddress{
{
IpAddress: common.String("2.2.2.2"),
Expand Down
196 changes: 89 additions & 107 deletions cloud/scope/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -521,33 +521,29 @@ func (m *MachineScope) ReconcileCreateInstanceOnLB(ctx context.Context) error {
backendName := instanceIp + ":" + strconv.Itoa(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port))
if !m.containsLBBackend(backendSet, backendName) {
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
workRequest := m.OCIMachine.Status.CreateBackendWorkRequestId
logger.Info("Checking work request status for create backend")
if workRequest != "" {
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, &workRequest)
if err != nil {
return err
}
} else {
resp, err := m.LoadBalancerClient.CreateBackend(ctx, loadbalancer.CreateBackendRequest{
LoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
CreateBackendDetails: loadbalancer.CreateBackendDetails{
IpAddress: common.String(instanceIp),
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
},
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
})
if err != nil {
return err
}
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Add instance to LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
// we always try to create the backend if it exists during a reconcile loop and wait for the work request
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
// work request, the create backend call may throw an error which is ok, as reconcile will go into
// an exponential backoff
resp, err := m.LoadBalancerClient.CreateBackend(ctx, loadbalancer.CreateBackendRequest{
LoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
CreateBackendDetails: loadbalancer.CreateBackendDetails{
IpAddress: common.String(instanceIp),
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
},
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
shyamradhakrishnan marked this conversation as resolved.
Show resolved Hide resolved
})
if err != nil {
return err
}
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Add instance to LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
}

Expand All @@ -562,37 +558,32 @@ func (m *MachineScope) ReconcileCreateInstanceOnLB(ctx context.Context) error {
if !m.containsNLBBackend(backendSet, m.Name()) {
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
logger.Info("Checking work request status for create backend")
workRequest := m.OCIMachine.Status.CreateBackendWorkRequestId
if workRequest != "" {
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, &workRequest)
if err != nil {
return err
}
} else {
resp, err := m.NetworkLoadBalancerClient.CreateBackend(ctx, networkloadbalancer.CreateBackendRequest{
NetworkLoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
CreateBackendDetails: networkloadbalancer.CreateBackendDetails{
IpAddress: common.String(instanceIp),
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
Name: common.String(m.Name()),
},
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
})
if err != nil {
return err
}
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Add instance to NLB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for NLB work request to be complete")
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
logger.Info("NLB Backend addition work request is complete")
// we always try to create the backend if it exists during a reconcile loop and wait for the work request
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
// work request, the create backend call may throw an error which is ok, as reconcile will go into
// an exponential backoff
resp, err := m.NetworkLoadBalancerClient.CreateBackend(ctx, networkloadbalancer.CreateBackendRequest{
NetworkLoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
CreateBackendDetails: networkloadbalancer.CreateBackendDetails{
IpAddress: common.String(instanceIp),
Port: common.Int(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port)),
Name: common.String(m.Name()),
},
OpcRetryToken: ociutil.GetOPCRetryToken("%s-%s", "create-backend", string(m.OCIMachine.UID)),
})
if err != nil {
return err
}
m.OCIMachine.Status.CreateBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Add instance to NLB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for NLB work request to be complete")
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
logger.Info("NLB Backend addition work request is complete")
}

}
return nil
}
Expand Down Expand Up @@ -629,36 +620,31 @@ func (m *MachineScope) ReconcileDeleteInstanceOnLB(ctx context.Context) error {
backendName := instanceIp + ":" + strconv.Itoa(int(m.OCIClusterAccessor.GetControlPlaneEndpoint().Port))
if m.containsLBBackend(backendSet, backendName) {
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
workRequest := m.OCIMachine.Status.DeleteBackendWorkRequestId
if workRequest != "" {
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, &workRequest)
if err != nil {
return err
}
} else {
// in OCI CLI, the colon in the backend name is replaced by %3A
// replace the colon in the backend name by %3A to avoid the error in PCA
escapedBackendName := url.QueryEscape(backendName)

resp, err := m.LoadBalancerClient.DeleteBackend(ctx, loadbalancer.DeleteBackendRequest{
LoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
BackendName: common.String(escapedBackendName),
})
if err != nil {
logger.Error(err, "Delete instance from LB backend-set failed",
"backendSetName", *backendSet.Name,
"backendName", escapedBackendName,
)
return err
}
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
// in OCI CLI, the colon in the backend name is replaced by %3A
// replace the colon in the backend name by %3A to avoid the error in PCA
escapedBackendName := url.QueryEscape(backendName)
// we always try to delete the backend if it exists during a reconcile loop and wait for the work request
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
// work request, the delete backend call may throw an error which is ok, as reconcile will go into
// an exponential backoff
resp, err := m.LoadBalancerClient.DeleteBackend(ctx, loadbalancer.DeleteBackendRequest{
LoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
BackendName: common.String(escapedBackendName),
})
if err != nil {
logger.Error(err, "Delete instance from LB backend-set failed",
"backendSetName", *backendSet.Name,
"backendName", escapedBackendName,
)
return err
}
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitLBWorkRequest(ctx, m.LoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
logger.Info("LB Backend addition work request is complete")
}
Expand All @@ -672,28 +658,24 @@ func (m *MachineScope) ReconcileDeleteInstanceOnLB(ctx context.Context) error {
backendSet := lb.BackendSets[APIServerLBBackendSetName]
if m.containsNLBBackend(backendSet, m.Name()) {
logger := m.Logger.WithValues("backend-set", *backendSet.Name)
workRequest := m.OCIMachine.Status.DeleteBackendWorkRequestId
if workRequest != "" {
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, &workRequest)
if err != nil {
return err
}
} else {
resp, err := m.NetworkLoadBalancerClient.DeleteBackend(ctx, networkloadbalancer.DeleteBackendRequest{
NetworkLoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
BackendName: common.String(m.Name()),
})
if err != nil {
return err
}
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
// we always try to delete the backend if it exists during a reconcile loop and wait for the work request
// to complete. If there is a work request in progress, in the rare case, CAPOCI pod restarts during the
// work request, the delete backend call may throw an error which is ok, as reconcile will go into
// an exponential backoff
resp, err := m.NetworkLoadBalancerClient.DeleteBackend(ctx, networkloadbalancer.DeleteBackendRequest{
NetworkLoadBalancerId: loadbalancerId,
BackendSetName: backendSet.Name,
BackendName: common.String(m.Name()),
})
if err != nil {
return err
}
m.OCIMachine.Status.DeleteBackendWorkRequestId = *resp.OpcWorkRequestId
logger.Info("Delete instance from LB backend-set", "WorkRequestId", resp.OpcWorkRequestId)
logger.Info("Waiting for LB work request to be complete")
_, err = ociutil.AwaitNLBWorkRequest(ctx, m.NetworkLoadBalancerClient, resp.OpcWorkRequestId)
if err != nil {
return err
}
}
}
Expand Down
Loading