Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🌱 Add support for CA/certificate rotation #1062

Merged
merged 4 commits into from
Jul 22, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,15 @@ jobs:

- name: Run the upgrade e2e test
run: make test-upgrade-e2e

cert-e2e:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- uses: actions/setup-go@v5
with:
go-version-file: go.mod

- name: Run the cert e2e test
run: make test-cert-e2e
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,15 @@ test-upgrade-e2e: export TEST_CLUSTER_CATALOG_NAME := test-catalog
test-upgrade-e2e: export TEST_CLUSTER_EXTENSION_NAME := test-package
test-upgrade-e2e: kind-cluster run-latest-release image-registry build-push-e2e-catalog registry-load-bundles pre-upgrade-setup docker-build kind-load kind-deploy post-upgrade-checks kind-clean #HELP Run upgrade e2e tests on a local kind cluster

.PHONY: run-cert-e2e
run-cert-e2e:
./hack/test/cert-e2e.sh

.PHONY: test-cert-e2e
test-cert-e2e: KIND_CLUSTER_NAME := operator-controller-cert-e2e
test-cert-e2e: KUSTOMIZE_BUILD_DIR := config/overlays/cert-manager
tmshort marked this conversation as resolved.
Show resolved Hide resolved
test-cert-e2e: run run-cert-e2e kind-clean #HELP Run certificate e2e tests on a local kind cluster

.PHONY: e2e-coverage
e2e-coverage:
COVERAGE_OUTPUT=./coverage/e2e.out ./hack/test/e2e-coverage.sh
Expand Down
45 changes: 34 additions & 11 deletions cmd/manager/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@ package main

import (
"context"
"crypto/x509"
"flag"
"fmt"
"net/http"
"os"
"path/filepath"

Expand Down Expand Up @@ -176,16 +178,23 @@ func main() {
os.Exit(1)
}

certPool, err := httputil.NewCertPool(caCertDir)
certPool, err := httputil.NewCertPool(caCertDir, ctrl.Log.WithName("cert-pool"))
if err != nil {
setupLog.Error(err, "unable to create CA certificate pool")
os.Exit(1)
}

ceRecon := &controllers.ClusterExtensionReconciler{}

unpacker := &source.ImageRegistry{
BaseCachePath: filepath.Join(cachePath, "unpack"),
// TODO: This needs to be derived per extension via ext.Spec.InstallNamespace
AuthNamespace: systemNamespace,
CaCertPool: certPool,
GetCaCertPool: func() (*x509.CertPool, error) {
ceRecon.CaMutex.RLock()
defer ceRecon.CaMutex.RUnlock()
return ceRecon.CaCertPool.Clone(), nil
},
}

clusterExtensionFinalizers := crfinalizer.NewFinalizers()
Expand All @@ -210,18 +219,17 @@ func main() {
}

cl := mgr.GetClient()
httpClient, err := httputil.BuildHTTPClient(certPool)
if err != nil {
setupLog.Error(err, "unable to create catalogd http client")
os.Exit(1)
}

catalogsCachePath := filepath.Join(cachePath, "catalogs")
if err := os.MkdirAll(catalogsCachePath, 0700); err != nil {
setupLog.Error(err, "unable to create catalogs cache directory")
os.Exit(1)
}
catalogClient := catalogclient.New(cache.NewFilesystemCache(catalogsCachePath, httpClient))
catalogClient := catalogclient.New(cache.NewFilesystemCache(catalogsCachePath, func() (*http.Client, error) {
ceRecon.CaMutex.RLock()
defer ceRecon.CaMutex.RUnlock()
return httputil.BuildHTTPClient(ceRecon.CaCertPool.Clone())
}))

resolver := &resolve.CatalogResolver{
WalkCatalogsFunc: resolve.CatalogWalker(
Expand All @@ -235,8 +243,7 @@ func main() {
catalogClient.GetPackage,
),
}

if err = (&controllers.ClusterExtensionReconciler{
ceRecon = &controllers.ClusterExtensionReconciler{
Client: cl,
Resolver: resolver,
ActionClientGetter: acg,
Expand All @@ -245,7 +252,8 @@ func main() {
Finalizers: clusterExtensionFinalizers,
CaCertPool: certPool,
Preflights: preflights,
}).SetupWithManager(mgr); err != nil {
}
if err = ceRecon.SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "ClusterExtension")
os.Exit(1)
}
Expand All @@ -259,6 +267,21 @@ func main() {
setupLog.Error(err, "unable to set up ready check")
os.Exit(1)
}
setupLog.Info("adding certificate watch", "directory", caCertDir)
if err := httputil.AddCertWatch(caCertDir, func() error {
ctrl.Log.WithName("cert-pool").Info("reloading certificate pool")
certPool, err := httputil.NewCertPool(caCertDir, ctrl.Log.WithName("cert-pool"))
if err != nil {
return err
}
ceRecon.CaMutex.Lock()
defer ceRecon.CaMutex.Unlock()
ceRecon.CaCertPool = certPool
tmshort marked this conversation as resolved.
Show resolved Hide resolved
return nil
}); err != nil {
setupLog.Error(err, "unable to add CA certificate watch")
os.Exit(1)
}

setupLog.Info("starting manager")
ctx := ctrl.SetupSignalHandler()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
value: {"name":"olmv1-certificate", "secret":{"secretName":"olmv1-cert", "optional": false, "items": [{"key": "ca.crt", "path": "olm-ca.crt"}]}}
- op: add
path: /spec/template/spec/containers/0/volumeMounts/-
value: {"name":"olmv1-certificate", "readOnly": true, "mountPath":"/var/certs/olm-ca.crt", "subPath":"olm-ca.crt"}
value: {"name":"olmv1-certificate", "readOnly": true, "mountPath":"/var/certs/"}
- op: add
path: /spec/template/spec/containers/0/args/-
value: "--ca-certs-dir=/var/certs"
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/Masterminds/semver/v3 v3.2.1
github.com/blang/semver/v4 v4.0.0
github.com/containerd/containerd v1.7.20
github.com/fsnotify/fsnotify v1.7.0
github.com/go-logr/logr v1.4.2
github.com/google/go-cmp v0.6.0
github.com/google/go-containerregistry v0.20.1
Expand Down Expand Up @@ -112,7 +113,6 @@ require (
github.com/exponent-io/jsonpath v0.0.0-20210407135951-1de76d718b3f // indirect
github.com/fatih/color v1.15.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/go-errors/errors v1.4.2 // indirect
github.com/go-git/gcfg v1.5.1-0.20230307220236-3a3c6141e376 // indirect
github.com/go-git/go-billy/v5 v5.5.0 // indirect
Expand Down
31 changes: 31 additions & 0 deletions hack/test/cert-e2e.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/bin/bash

set -e

# patch the certificates to renew every minute
# (if set to 2 minutes, then the big timeout wait is about 5 minutes)
kubectl patch certificate.cert-manager.io -n cert-manager olmv1-ca --type='merge' -p '{"spec":{"duration":"1h", "renewBefore":"59m"}}'
kubectl patch certificate.cert-manager.io -n olmv1-system olmv1-cert --type='merge' -p '{"spec":{"duration":"1h", "renewBefore":"59m"}}'

# delete the old secrets, so new secrets are generated
kubectl delete secret -n cert-manager olmv1-ca
kubectl delete secret -n olmv1-system olmv1-cert

# delete the existing operator-controller, to force it to get a new secret
# (deleting the secret itself isn't enough)
kubectl delete pod -n olmv1-system -l control-plane=controller-manager
kubectl wait --for=condition=Available --namespace=olmv1-system "deployment/operator-controller-controller-manager" --timeout="60s"
tmshort marked this conversation as resolved.
Show resolved Hide resolved

# and then search through the previous logs
function check_logs() {
kubectl logs -c manager -n olmv1-system -l control-plane=controller-manager | grep "reloading certificate pool" >& /dev/null
}

WAIT_TIME=0
MAX_TIME=300
until (( NEXT_WAIT_TIME == MAX_TIME )) || check_logs; do
echo -n "."
sleep "$(( WAIT_TIME += 10 ))"
done
(( WAIT_TIME < MAX_TIME ))
echo "#"
tmshort marked this conversation as resolved.
Show resolved Hide resolved
12 changes: 8 additions & 4 deletions internal/catalogmetadata/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,11 @@ var _ client.Fetcher = &filesystemCache{}
// - IF cached it will verify the cache is up to date. If it is up to date it will return
// the cached contents, if not it will fetch the new contents from the catalogd HTTP
// server and update the cached contents.
func NewFilesystemCache(cachePath string, client *http.Client) client.Fetcher {
func NewFilesystemCache(cachePath string, clientFunc func() (*http.Client, error)) client.Fetcher {
return &filesystemCache{
cachePath: cachePath,
mutex: sync.RWMutex{},
client: client,
getClient: clientFunc,
cacheDataByCatalogName: map[string]cacheData{},
}
}
Expand All @@ -50,7 +50,7 @@ type cacheData struct {
type filesystemCache struct {
mutex sync.RWMutex
cachePath string
client *http.Client
getClient func() (*http.Client, error)
cacheDataByCatalogName map[string]cacheData
}

Expand Down Expand Up @@ -95,7 +95,11 @@ func (fsc *filesystemCache) FetchCatalogContents(ctx context.Context, catalog *c
return nil, fmt.Errorf("error forming request: %v", err)
}

resp, err := fsc.client.Do(req)
client, err := fsc.getClient()
if err != nil {
return nil, fmt.Errorf("error getting HTTP client: %w", err)
}
m1kola marked this conversation as resolved.
Show resolved Hide resolved
resp, err := client.Do(req)
if err != nil {
return nil, fmt.Errorf("error performing request: %v", err)
}
Expand Down
4 changes: 3 additions & 1 deletion internal/catalogmetadata/cache/cache_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,9 @@ func TestFilesystemCache(t *testing.T) {
maps.Copy(tt.tripper.content, tt.contents)
httpClient := http.DefaultClient
httpClient.Transport = tt.tripper
c := cache.NewFilesystemCache(cacheDir, httpClient)
c := cache.NewFilesystemCache(cacheDir, func() (*http.Client, error) {
return httpClient, nil
})

actualFS, err := c.FetchCatalogContents(ctx, tt.catalog)
if !tt.wantErr {
Expand Down
1 change: 1 addition & 0 deletions internal/controllers/clusterextension_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ type ClusterExtensionReconciler struct {
cache cache.Cache
InstalledBundleGetter InstalledBundleGetter
Finalizers crfinalizer.Finalizers
CaMutex sync.RWMutex
CaCertPool *x509.CertPool
Preflights []Preflight
}
Expand Down
88 changes: 81 additions & 7 deletions internal/httputil/certutil.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@ package httputil

import (
"bytes"
"context"
"crypto/x509"
"encoding/base64"
"encoding/pem"
"fmt"
"os"
"path/filepath"
"time"

"github.com/fsnotify/fsnotify"
"github.com/go-logr/logr"
)

var pemStart = []byte("\n-----BEGIN ")
Expand Down Expand Up @@ -157,7 +162,7 @@ func pemDecode(data []byte) (*pem.Block, []byte) {
}

// This version of (*x509.CertPool).AppendCertsFromPEM() will error out if parsing fails
func appendCertsFromPEM(s *x509.CertPool, pemCerts []byte) error {
func appendCertsFromPEM(s *x509.CertPool, pemCerts []byte, firstExpiration *time.Time) error {
n := 1
for len(pemCerts) > 0 {
var block *pem.Block
Expand All @@ -179,15 +184,25 @@ func appendCertsFromPEM(s *x509.CertPool, pemCerts []byte) error {
if err != nil {
return fmt.Errorf("unable to parse cert %d: %w", n, err)
}
// no return values - panics or always succeeds
s.AddCert(cert)
if firstExpiration.IsZero() || firstExpiration.After(cert.NotAfter) {
*firstExpiration = cert.NotAfter
}
now := time.Now()
if now.Before(cert.NotBefore) {
return fmt.Errorf("not yet valid cert %d: %q", n, cert.NotBefore.Format(time.RFC3339))
} else if now.After(cert.NotAfter) {
return fmt.Errorf("expired cert %d: %q", n, cert.NotAfter.Format(time.RFC3339))
} else {
// no return values - panics or always succeeds
s.AddCert(cert)
}
n++
}

return nil
}

func NewCertPool(caDir string) (*x509.CertPool, error) {
func NewCertPool(caDir string, log logr.Logger) (*x509.CertPool, error) {
caCertPool, err := x509.SystemCertPool()
if err != nil {
return nil, err
Expand All @@ -200,20 +215,79 @@ func NewCertPool(caDir string) (*x509.CertPool, error) {
if err != nil {
return nil, err
}
count := 0
firstExpiration := time.Time{}

for _, e := range dirEntries {
if e.IsDir() {
file := filepath.Join(caDir, e.Name())
// These might be symlinks pointing to directories, so use Stat() to resolve
fi, err := os.Stat(file)
if err != nil {
return nil, err
}
if fi.IsDir() {
log.Info("skip directory", "name", e.Name())
continue
}
file := filepath.Join(caDir, e.Name())
log.Info("load certificate", "name", e.Name())
data, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("error reading cert file %q: %w", file, err)
}
err = appendCertsFromPEM(caCertPool, data)
err = appendCertsFromPEM(caCertPool, data, &firstExpiration)
if err != nil {
return nil, fmt.Errorf("error adding cert file %q: %w", file, err)
}
count++
}

// Found no certs!
if count == 0 {
return nil, fmt.Errorf("no certificates found in %q", caDir)
}

log.Info("first expiration", "time", firstExpiration.Format(time.RFC3339))
return caCertPool, nil
}

func AddCertWatch(caDir string, update func() error) error {
watcher, err := fsnotify.NewWatcher()
if err != nil {
return err
}
if err = watcher.Add(caDir); err != nil {
return err
}
go func() {
for {
select {
case <-watcher.Events:
// drain as many events as possible before doing anything
func() {
for {
time.Sleep(time.Millisecond * 100)
select {
case <-watcher.Events:
default:
return
}
}
}()
if err := update(); err != nil {
log, err2 := logr.FromContext(context.Background())
if err2 == nil {
log.WithName("cert-pool").Error(err, "error updating cert-pool")
}
os.Exit(1)
}
case err = <-watcher.Errors:
log, err2 := logr.FromContext(context.Background())
if err2 == nil {
log.WithName("cert-pool").Error(err, "error watching certificate dir")
}
os.Exit(1)
}
}
}()
return nil
}
Loading