Skip to content

Commit

Permalink
[release-v1.33] Auto pick #3381: Fix deadlock where terminating resou…
Browse files Browse the repository at this point in the history
…rces were never (#3386)
  • Loading branch information
caseydavenport authored Jun 12, 2024
1 parent 335a8ec commit 7b44d30
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 4 deletions.
44 changes: 40 additions & 4 deletions pkg/controller/installation/core_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,10 @@ import (
"strconv"
"strings"

relasticsearch "github.com/tigera/operator/pkg/render/common/elasticsearch"

"github.com/elastic/cloud-on-k8s/v2/pkg/utils/stringsutil"
"github.com/go-logr/logr"
configv1 "github.com/openshift/api/config/v1"
relasticsearch "github.com/tigera/operator/pkg/render/common/elasticsearch"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
rbacv1 "k8s.io/api/rbac/v1"
Expand Down Expand Up @@ -194,6 +193,7 @@ func newReconciler(mgr manager.Manager, opts options.AddOptions) (*ReconcileInst
manageCRDs: opts.ManageCRDs,
usePSP: opts.UsePSP,
tierWatchReady: &utils.ReadyFlag{},
newComponentHandler: utils.NewComponentHandler,
}
r.status.Run(opts.ShutdownContext)
r.typhaAutoscaler.start(opts.ShutdownContext)
Expand Down Expand Up @@ -369,6 +369,9 @@ type ReconcileInstallation struct {
manageCRDs bool
usePSP bool
tierWatchReady *utils.ReadyFlag

// newComponentHandler returns a new component handler. Useful stub for unit testing.
newComponentHandler func(log logr.Logger, client client.Client, scheme *runtime.Scheme, cr metav1.Object) utils.ComponentHandler
}

// updateInstallationWithDefaults returns the default installation instance with defaults populated.
Expand Down Expand Up @@ -1326,6 +1329,39 @@ func (r *ReconcileInstallation) Reconcile(ctx context.Context, request reconcile
} else {
reqLogger.Info("calico-kube-controller is still present, waiting for termination")
}
} else {
// In some rare scenarios, we can hit a deadlock where resources have been marked with a deletion timestamp but the operator
// does not recognize that it must remove their finalizers. This can happen if, for example, someone manually
// deletes a ServiceAccount instead of deleting the Installation object. In this case, we need
// to allow the deletion to complete so the operator can re-create the resources. Otherwise the objects will be stuck terminating forever.
toCheck := render.CNIPluginFinalizedObjects()
needsCleanup := []client.Object{}
for _, obj := range toCheck {
if err := r.client.Get(ctx, types.NamespacedName{Name: obj.GetName(), Namespace: obj.GetNamespace()}, obj); err != nil {
if !apierrors.IsNotFound(err) {
r.status.SetDegraded(operator.ResourceReadError, "Error querying object", err, reqLogger)
return reconcile.Result{}, err
}
// Not found - nothing to do.
continue
}
if obj.GetDeletionTimestamp() != nil {
// The object is marked for deletion, but the installation is not terminating. We need to remove the finalizers from this object
// so that it can be deleted and recreated.
reqLogger.Info("Object is marked for deletion but installation is not terminating",
"kind", obj.GetObjectKind(),
"name", obj.GetName(),
"namespace", obj.GetNamespace(),
)
obj.SetFinalizers(stringsutil.RemoveStringInSlice(render.NodeFinalizer, obj.GetFinalizers()))
needsCleanup = append(needsCleanup, obj)
}
}
if len(needsCleanup) > 0 {
// Add a component to remove the finalizers from the objects that need it.
reqLogger.Info("Removing finalizers from objects that are wronly marked for deletion")
components = append(components, render.NewPassthrough(needsCleanup...))
}
}

// Fetch any existing default BGPConfiguration object.
Expand Down Expand Up @@ -1409,7 +1445,7 @@ func (r *ReconcileInstallation) Reconcile(ctx context.Context, request reconcile
}

// Create a component handler to create or update the rendered components.
handler := utils.NewComponentHandler(log, r.client, r.scheme, instance)
handler := r.newComponentHandler(log, r.client, r.scheme, instance)
for _, component := range components {
if err := handler.CreateOrUpdateOrDelete(ctx, component, nil); err != nil {
r.status.SetDegraded(operator.ResourceUpdateError, "Error creating / updating resource", err, reqLogger)
Expand Down Expand Up @@ -1741,7 +1777,7 @@ func (r *ReconcileInstallation) updateCRDs(ctx context.Context, variant operator
crdComponent := render.NewPassthrough(crds.ToRuntimeObjects(crds.GetCRDs(variant)...)...)
// Specify nil for the CR so no ownership is put on the CRDs. We do this so removing the
// Installation CR will not remove the CRDs.
handler := utils.NewComponentHandler(log, r.client, r.scheme, nil)
handler := r.newComponentHandler(log, r.client, r.scheme, nil)
if err := handler.CreateOrUpdateOrDelete(ctx, crdComponent, nil); err != nil {
r.status.SetDegraded(operator.ResourceUpdateError, "Error creating / updating CRD resource", err, log)
return err
Expand Down
3 changes: 3 additions & 0 deletions pkg/controller/installation/core_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ var _ = Describe("Testing core-controller installation", func() {
enterpriseCRDsExist: true,
migrationChecked: true,
tierWatchReady: ready,
newComponentHandler: utils.NewComponentHandler,
}

r.typhaAutoscaler.start(ctx)
Expand Down Expand Up @@ -778,6 +779,7 @@ var _ = Describe("Testing core-controller installation", func() {
migrationChecked: true,
clusterDomain: dns.DefaultClusterDomain,
tierWatchReady: ready,
newComponentHandler: utils.NewComponentHandler,
}
r.typhaAutoscaler.start(ctx)

Expand Down Expand Up @@ -977,6 +979,7 @@ var _ = Describe("Testing core-controller installation", func() {
enterpriseCRDsExist: true,
migrationChecked: true,
tierWatchReady: ready,
newComponentHandler: utils.NewComponentHandler,
}

r.typhaAutoscaler.start(ctx)
Expand Down
13 changes: 13 additions & 0 deletions pkg/render/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,19 @@ func (c *nodeComponent) Ready() bool {
return true
}

// CalicoSystemFinalizedObjects returns a list of objects that use the NodeFinalizer that should be
// removed only after the CNI plugin is removed.
func CNIPluginFinalizedObjects() []client.Object {
return []client.Object{
&corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: CalicoNodeObjectName, Namespace: common.CalicoNamespace}},
&corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: CalicoCNIPluginObjectName, Namespace: common.CalicoNamespace}},
&rbacv1.ClusterRoleBinding{ObjectMeta: metav1.ObjectMeta{Name: CalicoNodeObjectName}},
&rbacv1.ClusterRoleBinding{ObjectMeta: metav1.ObjectMeta{Name: CalicoCNIPluginObjectName}},
&rbacv1.ClusterRole{ObjectMeta: metav1.ObjectMeta{Name: CalicoNodeObjectName}},
&rbacv1.ClusterRole{ObjectMeta: metav1.ObjectMeta{Name: CalicoCNIPluginObjectName}},
}
}

// nodeServiceAccount creates the node's service account.
func (c *nodeComponent) nodeServiceAccount() *corev1.ServiceAccount {
finalizer := []string{}
Expand Down
28 changes: 28 additions & 0 deletions test/mainline_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,11 @@ import (
operator "github.com/tigera/operator/api/v1"
"github.com/tigera/operator/controllers"
"github.com/tigera/operator/pkg/apis"
"github.com/tigera/operator/pkg/common"
"github.com/tigera/operator/pkg/controller/options"
"github.com/tigera/operator/pkg/controller/utils"
"github.com/tigera/operator/pkg/crds"
"github.com/tigera/operator/pkg/render"
)

const (
Expand Down Expand Up @@ -135,6 +137,32 @@ var _ = Describe("Mainline component function tests", func() {
mgr = nil
})

It("should recreate resources with DeletionTimestamp set", func() {
// Reconcile as usual, allowing resources to be created.
operatorDone = installResourceCRD(c, mgr, shutdownContext, nil)
verifyCalicoHasDeployed(c)

// Delete a resource with a finalizer. This should set the DeletionTimestamp, but leave the
// resource in place. However, the operator should notice this an recreate the resource, thus
// clearing the DeletionTimestamp.
By("Deleting a resource with a finalizer")
sa := &corev1.ServiceAccount{ObjectMeta: metav1.ObjectMeta{Name: render.CalicoNodeObjectName, Namespace: common.CalicoNamespace}}
err := c.Delete(context.Background(), sa)
Expect(err).NotTo(HaveOccurred())

By("Verifying the resource is recreated")
Eventually(func() error {
err := GetResource(c, sa)
if err != nil {
return err
}
if sa.DeletionTimestamp != nil {
return fmt.Errorf("ServiceAccount DeletionTimestamp is still set")
}
return nil
}, 10*time.Second).Should(BeNil())
})

Describe("Installing CRD", func() {
It("Should install resources for a CRD", func() {
operatorDone = installResourceCRD(c, mgr, shutdownContext, nil)
Expand Down

0 comments on commit 7b44d30

Please sign in to comment.