Skip to content

Commit

Permalink
CI-1558 Automated cherry pick of #3404: Fix deadlock situation where …
Browse files Browse the repository at this point in the history
…two controller rely on the (#3405)

* Fix deadlock situation where two controller rely on the other to finish successfully.

* Don't block controllers when the certs they are trusting are missing key usages when there is a mutual dependency between two controllers.

Secrets controller is now adding internal-manager-tls to its bundle, rather than manager-tls. The internal secret is the one that is used for component-to-component authentication, while manager-tls is used for browser-to-manager TLS.
  • Loading branch information
rene-dekker committed Jul 4, 2024
1 parent 7b44d30 commit d32591f
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 22 deletions.
13 changes: 10 additions & 3 deletions pkg/controller/logstorage/secrets/secret_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,7 @@ func (r *SecretSubController) collectUpstreamCerts(log logr.Logger, helper utils
monitor.PrometheusClientTLSSecretName: common.OperatorNamespace(),

// Get certificate for es-proxy, which Linseed and es-gateway need to trust.
render.ManagerTLSSecretName: helper.TruthNamespace(),
render.ManagerInternalTLSSecretName: helper.TruthNamespace(),

// Get certificate for fluentd, which Linseed needs to trust in a standalone or management cluster.
render.FluentdPrometheusTLSSecretName: common.OperatorNamespace(),
Expand Down Expand Up @@ -518,8 +518,15 @@ func (r *SecretSubController) collectUpstreamCerts(log logr.Logger, helper utils
certNamespace := certs[certName]
cert, err := cm.GetCertificate(r.client, certName, certNamespace)
if err != nil {
r.status.SetDegraded(operatorv1.ResourceReadError, "Failed to get certificate", err, log)
return nil, err
if certificatemanager.IsCertExtKeyUsageError(err) {
// This secret is missing required key usages. Another controller will need to replace this secret with a
// new valid secret, before this controller will read and use it. The other controller may depend on this
// controller completing successfully. Therefore, we skip and continue.
log.Info(fmt.Sprintf("skipping %s/%s secret it will be added when it is updated: %s", common.OperatorNamespace(), certName, err))
} else {
r.status.SetDegraded(operatorv1.ResourceReadError, "Failed to get certificate", err, log)
return nil, err
}
} else if cert == nil {
msg := fmt.Sprintf("%s/%s secret not available yet, will add it if/when it becomes available", certNamespace, certName)
log.Info(msg)
Expand Down
52 changes: 44 additions & 8 deletions pkg/controller/logstorage/secrets/secret_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package secrets

import (
"bytes"
"context"
"fmt"

Expand All @@ -32,6 +33,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
Expand All @@ -46,6 +48,7 @@ import (
"github.com/tigera/operator/pkg/controller/utils"
"github.com/tigera/operator/pkg/dns"
"github.com/tigera/operator/pkg/render"
rmeta "github.com/tigera/operator/pkg/render/common/meta"
"github.com/tigera/operator/pkg/render/common/secret"
rtest "github.com/tigera/operator/pkg/render/common/test"
"github.com/tigera/operator/pkg/render/logstorage"
Expand Down Expand Up @@ -123,12 +126,13 @@ func NewMultiTenantSecretControllerWithShims(

var _ = Describe("LogStorage Secrets controller", func() {
var (
cli client.Client
readyFlag *utils.ReadyFlag
scheme *runtime.Scheme
ctx context.Context
install *operatorv1.Installation
mockStatus *status.MockStatus
cli client.Client
readyFlag *utils.ReadyFlag
scheme *runtime.Scheme
ctx context.Context
install *operatorv1.Installation
mockStatus *status.MockStatus
certificateManager certificatemanager.CertificateManager
)

BeforeEach(func() {
Expand Down Expand Up @@ -177,9 +181,10 @@ var _ = Describe("LogStorage Secrets controller", func() {
mockStatus.On("ClearDegraded")

// Create a CA secret for the test, and create its KeyPair.
cm, err := certificatemanager.Create(cli, &install.Spec, dns.DefaultClusterDomain, common.OperatorNamespace(), certificatemanager.AllowCACreation())
var err error
certificateManager, err = certificatemanager.Create(cli, &install.Spec, dns.DefaultClusterDomain, common.OperatorNamespace(), certificatemanager.AllowCACreation())
Expect(err).ShouldNot(HaveOccurred())
Expect(cli.Create(ctx, cm.KeyPair().Secret(common.OperatorNamespace()))).ShouldNot(HaveOccurred())
Expect(cli.Create(ctx, certificateManager.KeyPair().Secret(common.OperatorNamespace()))).ShouldNot(HaveOccurred())
})

It("should wait for the cluster CA to be provisioned", func() {
Expand Down Expand Up @@ -256,6 +261,37 @@ var _ = Describe("LogStorage Secrets controller", func() {
rtest.ExpectBundleContents(bundleKibana, types.NamespacedName{Name: certificatemanagement.CASecretName, Namespace: common.OperatorNamespace()})
})

It("should not trip up when a cert with missing key usages is configured for other components", func() {

// Create a LogStorage instance with a default configuration.
ls := &operatorv1.LogStorage{}
ls.Name = "tigera-secure"
ls.Status.State = operatorv1.TigeraStatusReady
CreateLogStorage(cli, ls)

By("Creating a fluentd certificate secret without all necessary usages")
cryptoCA, err := tls.MakeCA(rmeta.TigeraOperatorCAIssuerPrefix)
Expect(err).NotTo(HaveOccurred())
tlsCfg, err := cryptoCA.MakeServerCertForDuration(sets.NewString("test"), tls.DefaultCertificateDuration, tls.SetServerAuth)
Expect(err).NotTo(HaveOccurred())
keyContent, crtContent := &bytes.Buffer{}, &bytes.Buffer{}
Expect(tlsCfg.WriteCertConfig(crtContent, keyContent)).NotTo(HaveOccurred())
privateKeyPEM, certificatePEM := keyContent.Bytes(), crtContent.Bytes()
fluentdCert, err := certificateManager.GetOrCreateKeyPair(cli, render.FluentdPrometheusTLSSecretName, common.OperatorNamespace(), []string{""})
Expect(err).NotTo(HaveOccurred())
fluentdSecret := fluentdCert.Secret(common.OperatorNamespace())
fluentdSecret.Data[corev1.TLSCertKey] = certificatePEM
fluentdSecret.Data[corev1.TLSPrivateKeyKey] = privateKeyPEM
Expect(err).NotTo(HaveOccurred())
r, err := NewSecretControllerWithShims(cli, scheme, mockStatus, operatorv1.ProviderNone, dns.DefaultClusterDomain)
Expect(err).ShouldNot(HaveOccurred())
Expect(r.client.Create(ctx, fluentdSecret)).NotTo(HaveOccurred())

By("reconciling the controller after a bad secret was created, we expect no problems, because bad secrets should be skipped")
_, err = r.Reconcile(ctx, reconcile.Request{})
Expect(err).NotTo(HaveOccurred())
})

It("test that LogStorage reconciles if the user-supplied certs have any DNS names", func() {
// This test currently just validates that user-provided certs will reconcile and not return an error and won't be
// overwritten by the operator. This test will change once we add validation for user-provided certs.
Expand Down
11 changes: 9 additions & 2 deletions pkg/controller/monitor/monitor_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,8 +317,15 @@ func (r *ReconcileMonitor) Reconcile(ctx context.Context, request reconcile.Requ
if err == nil {
trustedBundle.AddCertificates(certificate)
} else {
r.status.SetDegraded(operatorv1.ResourceReadError, "Error fetching TLS certificate", err, reqLogger)
return reconcile.Result{}, err
if certificatemanager.IsCertExtKeyUsageError(err) {
// This secret is missing required key usages. Another controller will need to replace this secret with a
// new valid secret, before this controller will read and use it. The other controller may depend on this
// controller completing successfully. Therefore, we skip and continue.
log.Info(fmt.Sprintf("skipping %s/%s secret it will be added when it is updated: %s", common.OperatorNamespace(), certificateName, err))
} else {
r.status.SetDegraded(operatorv1.ResourceReadError, "Error fetching TLS certificate", err, reqLogger)
return reconcile.Result{}, err
}
}
}
certificateManager.AddToStatusManager(r.status, common.TigeraPrometheusNamespace)
Expand Down
48 changes: 39 additions & 9 deletions pkg/controller/monitor/monitor_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package monitor

import (
"bytes"
"context"

. "github.com/onsi/ginkgo"
Expand All @@ -29,6 +30,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
"sigs.k8s.io/controller-runtime/pkg/reconcile"
Expand All @@ -42,17 +44,22 @@ import (
"github.com/tigera/operator/pkg/controller/status"
"github.com/tigera/operator/pkg/controller/utils"
"github.com/tigera/operator/pkg/render"
rmeta "github.com/tigera/operator/pkg/render/common/meta"
"github.com/tigera/operator/pkg/render/monitor"
"github.com/tigera/operator/pkg/tls"
)

var _ = Describe("Monitor controller tests", func() {
var cli client.Client
var ctx context.Context
var mockStatus *status.MockStatus
var r ReconcileMonitor
var scheme *runtime.Scheme
var installation *operatorv1.Installation
var monitorCR *operatorv1.Monitor
var (
cli client.Client
ctx context.Context
mockStatus *status.MockStatus
r ReconcileMonitor
scheme *runtime.Scheme
installation *operatorv1.Installation
certificateManager certificatemanager.CertificateManager
monitorCR *operatorv1.Monitor
)

BeforeEach(func() {
// The schema contains all objects that should be known to the fake client when the test runs.
Expand Down Expand Up @@ -117,9 +124,10 @@ var _ = Describe("Monitor controller tests", func() {

// Create a certificate manager and provision the CA to unblock the controller. Generally this would be done by
// the cluster CA controller and is a prerequisite for the monitor controller to function.
cm, err := certificatemanager.Create(cli, &installation.Spec, "cluster.local", common.OperatorNamespace(), certificatemanager.AllowCACreation())
var err error
certificateManager, err = certificatemanager.Create(cli, &installation.Spec, "cluster.local", common.OperatorNamespace(), certificatemanager.AllowCACreation())
Expect(err).NotTo(HaveOccurred())
Expect(cli.Create(ctx, cm.KeyPair().Secret(common.OperatorNamespace()))).NotTo(HaveOccurred())
Expect(cli.Create(ctx, certificateManager.KeyPair().Secret(common.OperatorNamespace()))).NotTo(HaveOccurred())

// Mark that watches were successful.
r.prometheusReady.MarkAsReady()
Expand Down Expand Up @@ -157,6 +165,28 @@ var _ = Describe("Monitor controller tests", func() {
Expect(cli.Get(ctx, client.ObjectKey{Name: monitor.FluentdMetrics, Namespace: common.TigeraPrometheusNamespace}, sm)).NotTo(HaveOccurred())
})

It("should create Prometheus related resources even when a cert with missing key usages is configured for other components", func() {
By("Creating a fluentd certificate secret without all necessary usages")
cryptoCA, err := tls.MakeCA(rmeta.TigeraOperatorCAIssuerPrefix)
Expect(err).NotTo(HaveOccurred())
tlsCfg, err := cryptoCA.MakeServerCertForDuration(sets.NewString("test"), tls.DefaultCertificateDuration, tls.SetServerAuth)
Expect(err).NotTo(HaveOccurred())
keyContent, crtContent := &bytes.Buffer{}, &bytes.Buffer{}
Expect(tlsCfg.WriteCertConfig(crtContent, keyContent)).NotTo(HaveOccurred())
privateKeyPEM, certificatePEM := keyContent.Bytes(), crtContent.Bytes()
fluentdCert, err := certificateManager.GetOrCreateKeyPair(cli, render.FluentdPrometheusTLSSecretName, common.OperatorNamespace(), []string{""})
Expect(err).NotTo(HaveOccurred())
fluentdSecret := fluentdCert.Secret(common.OperatorNamespace())
fluentdSecret.Data[corev1.TLSCertKey] = certificatePEM
fluentdSecret.Data[corev1.TLSPrivateKeyKey] = privateKeyPEM
Expect(err).NotTo(HaveOccurred())
Expect(r.client.Create(ctx, fluentdSecret)).NotTo(HaveOccurred())

By("reconciling the controller after a bad secret was created, we expect no problems, because bad secrets should be skipped")
_, err = r.Reconcile(ctx, reconcile.Request{})
Expect(err).NotTo(HaveOccurred())
})

It("should render allow-tigera policy when tier and policy watch are ready", func() {
_, err := r.Reconcile(ctx, reconcile.Request{})
Expect(err).ShouldNot(HaveOccurred())
Expand Down

0 comments on commit d32591f

Please sign in to comment.