Skip to content

Commit

Permalink
feat(healthchecks): introduce healthchecks and metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
FalcoSuessgott committed Nov 16, 2024
1 parent d1fe99a commit d2d41aa
Show file tree
Hide file tree
Showing 36 changed files with 853 additions and 169 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/e2e.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
E2E:
strategy:
matrix:
vault: [1.15, 1.16, 1.17]
vault: [1.16, 1.17, 1.18]
versions:
- k8s_version: v1.28.0
kind_cfg: kind-config_v1.yaml
Expand Down Expand Up @@ -54,7 +54,7 @@ jobs:
- name: setup go
uses: actions/setup-go@v5
with:
go-version: '1.22.1'
go-version: '1.22.6'
cache: false

- name: setup qemu
Expand Down
3 changes: 1 addition & 2 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,7 @@ jobs:
run: go get ./...

- name: Run coverage
run: |
gotestsum -- -v -race -coverprofile="coverage.out" -covermode=atomic ./...
run: make test
env:
# https://github.com/testcontainers/testcontainers-go/issues/1782
TESTCONTAINERS_RYUK_DISABLED: true
Expand Down
12 changes: 0 additions & 12 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,3 @@ repos:
args: [--branch, main]
- id: pretty-format-json
args: [--autofix, --no-sort-keys]

# - repo: https://github.com/tekwizely/pre-commit-golang
# rev: v1.0.0-rc.1
# hooks:
# - id: go-test-repo
# - id: go-staticcheck-repo
# - id: go-fmt
# - id: go-fumpt
# - id: go-imports
# - id: go-lint
# - id: golangci-lint-mod
# args: [-c.golang-ci.yml]
35 changes: 32 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,8 @@ docs: ## render docs locally
mkdocs serve

# Mark `test` as phony so make always runs its recipe, even when a file or
# directory named `test` exists. The special target needs the leading dot.
.PHONY: test
test: ## display test coverage
go test --cover -parallel=1 -v -coverprofile=coverage.out ./...
go tool cover -func=coverage.out | sort -rnk3
test: ## test
gotestsum -- -v --shuffle=on -race -coverprofile="coverage.out" -covermode=atomic ./...

# Mark `lint` as phony so make always runs its recipe, even when a file or
# directory named `lint` exists. The special target needs the leading dot.
.PHONY: lint
lint: ## lint go files
Expand All @@ -31,10 +30,40 @@ setup-vault: ## setup a local vault dev server with transit engine + key
setup-registry: ## setup a local docker registry for pulling in kind
./scripts/local-registry.sh

# gen-load runs cmd/v2_client/main.go in an endless shell loop, passing a fresh
# random base64 string (12 random bytes from openssl) as the only argument on
# every iteration. The loop has no exit condition; it runs until interrupted.
.PHONY: gen-load
gen-load: ## generate load on KMS plugin
	while true; do \
		go run cmd/v2_client/main.go $$(openssl rand -base64 12);\
	done;

# gen-secrets creates generic Kubernetes secrets via kubectl in an endless shell
# loop. The secret name, the literal key and the literal value are each a
# separate random hex string (8 random bytes from openssl) piped through tr to
# lowercase it. The loop has no exit condition; it runs until interrupted.
.PHONY: gen-secrets
gen-secrets: ## generate secrets on KMS plugin
	while true; do \
		kubectl create secret generic $$(openssl rand -hex 8 | tr '[:upper:]' '[:lower:]')\
			--from-literal=$$(openssl rand -hex 8 | tr '[:upper:]' '[:lower:]')=$$(openssl rand -hex 8 | tr '[:upper:]' '[:lower:]');\
	done;

# setup-kind recreates the kind cluster named `kms` from
# scripts/kind-config_v2.yaml. The delete step is allowed to fail (`|| true`)
# so a failed delete does not abort the target before the cluster is created.
.PHONY: setup-kind
setup-kind: ## setup kind cluster with encryption provider configured
	kind delete cluster --name=kms || true
	kind create cluster --name=kms --config scripts/kind-config_v2.yaml

# setup-o11y installs the observability stack into the current kubectl context:
#   1. applies scripts/svc.yml
#   2. adds and updates the prometheus-community and grafana helm repositories
#   3. installs the prometheus and grafana charts with the values files in scripts/
#   4. prints the base64-decoded `admin-password` field of the `grafana` secret
#      in the `default` namespace (the trailing `echo` adds a newline)
.PHONY: setup-o11y
setup-o11y: ## install grafana and prometheus via helm
	kubectl apply -f scripts/svc.yml

	helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
	helm repo add grafana https://grafana.github.io/helm-charts
	helm repo update

	helm install prometheus prometheus-community/prometheus --values scripts/prometheus_values.yml
	helm install grafana grafana/grafana --values scripts/grafana_values.yml

	kubectl get secret --namespace default grafana -o jsonpath="{.data.admin-password}" | base64 --decode ; echo

.PHONY: setup-local
setup-local: setup-vault setup-registry setup-kind ## complete local setup

.PHONY: destroy
destroy: ## destroy kind cluster
kind delete cluster --name=kms
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Since the key used for encrypting secrets is not stored in Kubernetes, an attack

`vault-kubernetes-kms` is supposed to run as a static pod on every control plane node or on that node where the `kube-apiserver` will run.

The plugin creates a Unix-Socket and receive encryption requests through that socket from the `kube-apiserver`. The plugin will then use the specified Vault transit encryption key to encrypt the data and send it back to the `kube-apiserver`, who will then store the encrypted response in `etcd`.
`vault-kubernetes-kms` starts a UNIX domain socket and listens for encryption requests from the `kube-apiserver`. The plugin then uses the specified Vault transit encryption key to encrypt the data and sends it back to the `kube-apiserver`, which then stores the encrypted response in `etcd`.

To do so, you will have to enable Data at Rest encryption by configuring the `kube-apiserver` to use an `EncryptionConfiguration` (see [https://falcosuessgott.github.io/vault-kubernetes-kms/configuration/](https://falcosuessgott.github.io/vault-kubernetes-kms/configuration/) for more details).

Expand All @@ -31,6 +31,7 @@ To do so, you will have to enable Data at Rest encryption, by configuring the `k
* support [Vault Token](https://developer.hashicorp.com/vault/docs/auth/token), [AppRole](https://developer.hashicorp.com/vault/docs/auth/approle) authentication (Since a static pod cannot reference any other Kubernetes API-Objects, Vault's Kubernetes Authentication is not possible.)
* support Kubernetes [KMS Plugin v1 (deprecated since `v1.28.0`) & v2 (stable in `v1.29.0`)](https://kubernetes.io/docs/tasks/administer-cluster/kms-provider/#before-you-begin)
* automatic Token Renewal for avoiding Token expiry
* Exposes useful Prometheus Metrics

## Without a KMS Provider
```bash
Expand Down
10 changes: 9 additions & 1 deletion assets/vault-kubernetes-kms.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,21 @@ spec:
# mount /opt/kms host directory
- name: kms
mountPath: /opt/kms
livenessProbe:
httpGet:
path: /health
port: 8080
readinessProbe:
httpGet:
path: /live
port: 8080
resources:
requests:
cpu: 100m
memory: 128Mi
limits:
cpu: 2
memory: 1Gi
memory: 256Mi
volumes:
# mount /opt/kms host directory
- name: kms
Expand Down
121 changes: 104 additions & 17 deletions cmd/plugin.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,26 @@
package cmd

import (
"context"
"errors"
"flag"
"fmt"
"log"
"net/http"
"os"
"os/signal"
"slices"
"strings"
"syscall"
"time"

"github.com/FalcoSuessgott/vault-kubernetes-kms/pkg/logging"
"github.com/FalcoSuessgott/vault-kubernetes-kms/pkg/metrics"
"github.com/FalcoSuessgott/vault-kubernetes-kms/pkg/plugin"
"github.com/FalcoSuessgott/vault-kubernetes-kms/pkg/probes"
"github.com/FalcoSuessgott/vault-kubernetes-kms/pkg/socket"
"github.com/FalcoSuessgott/vault-kubernetes-kms/pkg/utils"
"github.com/FalcoSuessgott/vault-kubernetes-kms/pkg/vault"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
"google.golang.org/grpc"
Expand All @@ -42,10 +47,20 @@ type Options struct {
AppRoleRoleSecretID string `env:"APPROLE_SECRET_ID"`
AppRoleMount string `env:"APPROLE_MOUNT" envDefault:"approle"`

// token refresh
TokenRefreshInterval string `env:"TOKEN_REFRESH_INTERVAL" envDefault:"60s"`
TokenRenewalSeconds int `env:"TOKEN_RENEWAL_SECONDS" envDefault:"3600"`

// transit
TransitKey string `env:"TRANSIT_KEY" envDefault:"kms"`
TransitMount string `env:"TRANSIT_MOUNT" envDefault:"transit"`

// healthz check
HealthPort string `env:"HEALTH_PORT" envDefault:"8080"`

// Disable KMSv1 Plugin
DisableV1 bool `env:"DISABLE_V1" envDefault:"true"`

Version bool
}

Expand Down Expand Up @@ -78,9 +93,16 @@ func NewPlugin(version string) error {
flag.StringVar(&opts.AppRoleRoleID, "approle-role-id", opts.AppRoleRoleID, "Vault Approle role ID (when approle auth)")
flag.StringVar(&opts.AppRoleRoleSecretID, "approle-secret-id", opts.AppRoleRoleSecretID, "Vault Approle Secret ID (when approle auth)")

flag.StringVar(&opts.TokenRefreshInterval, "token-refresh-interval", opts.TokenRefreshInterval, "Interval to check for a token renewal")
flag.IntVar(&opts.TokenRenewalSeconds, "token-renewal", opts.TokenRenewalSeconds, "The number of seconds to renew the token")

flag.StringVar(&opts.TransitMount, "transit-mount", opts.TransitMount, "Vault Transit mount name")
flag.StringVar(&opts.TransitKey, "transit-key", opts.TransitKey, "Vault Transit key name")

flag.StringVar(&opts.HealthPort, "health-port", opts.HealthPort, "Health Check Port")

flag.BoolVar(&opts.DisableV1, "disable-v1", opts.DisableV1, "disable the v1 kms plugin")

flag.BoolVar(&opts.Version, "version", opts.Version, "prints out the plugins version")

if err := flag.Parse(os.Args[1:]); err != nil {
Expand All @@ -90,7 +112,7 @@ func NewPlugin(version string) error {
if opts.Version {
fmt.Fprintf(os.Stdout, "vault-kubernetes-kms v%s\n", version)

os.Exit(0)
return nil
}

if err := opts.validateFlags(); err != nil {
Expand All @@ -111,38 +133,45 @@ func NewPlugin(version string) error {
zap.ReplaceGlobals(l)

var (
authMethod vault.Option
logfields []zapcore.Field
authMethod vault.Option
logFields []zapcore.Field
healthChecks = []probes.Prober{}
ctx = shutDownSignal(context.Background())
)

logfields = append(logfields,
logFields = append(logFields,
zap.String("auth-method", opts.AuthMethod),
zap.String("socket", opts.Socket),
zap.Bool("debug", opts.Debug),
zap.String("vault-address", opts.VaultAddress),
zap.String("vault-namespace", opts.VaultNamespace),
zap.String("transit-engine", opts.TransitMount),
zap.String("transit-key", opts.TransitKey),
zap.String("health-port", opts.HealthPort),
zap.String("token-refresh-interval", opts.TokenRefreshInterval),
zap.Int("token-renewal-seconds", opts.TokenRenewalSeconds),
zap.Bool("disable-v1", opts.DisableV1),
)

switch strings.ToLower(opts.AuthMethod) {
case "token":
authMethod = vault.WithTokenAuth(opts.Token)
case "approle":
authMethod = vault.WitAppRoleAuth(opts.AppRoleMount, opts.AppRoleRoleID, opts.AppRoleRoleSecretID)
logfields = append(logfields,
authMethod = vault.WithAppRoleAuth(opts.AppRoleMount, opts.AppRoleRoleID, opts.AppRoleRoleSecretID)
logFields = append(logFields,
zap.String("approle-mount", opts.AppRoleMount),
zap.String("approle-role-id", opts.AppRoleRoleID))
default:
return fmt.Errorf("invalid auth method: %s", opts.AuthMethod)
}

zap.L().Info("starting kms plugin", logfields...)
zap.L().Info("starting kms plugin", logFields...)

vc, err := vault.NewClient(
vault.WithVaultAddress(opts.VaultAddress),
vault.WithVaultNamespace(opts.VaultNamespace),
vault.WithTransit(opts.TransitMount, opts.TransitKey),
vault.WithTokenRenewalSeconds(opts.TokenRenewalSeconds),
authMethod,
)
if err != nil {
Expand All @@ -151,6 +180,17 @@ func NewPlugin(version string) error {

zap.L().Info("Successfully authenticated to vault")

go func() {
zap.L().Info("Starting token refresher",
zap.String("interval", opts.TokenRefreshInterval),
zap.Int("renewal-seconds", opts.TokenRenewalSeconds),
)

t, _ := time.ParseDuration(opts.TokenRefreshInterval)

vc.LeaseRefresher(ctx, t)
}()

s, err := socket.NewSocket(opts.Socket)
if err != nil {
zap.L().Fatal("Cannot create socket", zap.Error(err))
Expand All @@ -160,19 +200,27 @@ func NewPlugin(version string) error {

listener, err := s.Listen(opts.ForceSocketOverwrite)
if err != nil {
log.Fatal(fmt.Errorf("failed to listen on socket: %w. Use -force-socket-overwrite (VAULT_KUBERNETES_KMS_FORCE_SOCKET_OVERWRITE)", err))
zap.L().Fatal("failed to listen on socket: Use -force-socket-overwrite (VAULT_KUBERNETES_KMS_FORCE_SOCKET_OVERWRITE)",
zap.String("socket", opts.Socket),
zap.Any("error", err))
}

zap.L().Info("Listening for connection")

grpc := grpc.NewServer()
pluginV1 := plugin.NewPluginV1(vc)
pluginV1.Register(grpc)

zap.L().Info("Successfully registered kms plugin v1")
if !opts.DisableV1 {
pluginV1 := plugin.NewPluginV1(vc)
pluginV1.Register(grpc)

healthChecks = append(healthChecks, pluginV1)

zap.L().Info("Successfully registered kms plugin v1")
}

pluginV2 := plugin.NewPluginV2(vc)
pluginV2.Register(grpc)
healthChecks = append(healthChecks, pluginV2)

zap.L().Info("Successfully registered kms plugin v2")

Expand All @@ -182,13 +230,30 @@ func NewPlugin(version string) error {
}
}()

signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
go func() {
mux := &http.ServeMux{}

mux.HandleFunc("/metrics", promhttp.HandlerFor(metrics.RegisterPrometheusMetrics(), promhttp.HandlerOpts{}).ServeHTTP)
mux.HandleFunc("/health", probes.HealthZ(healthChecks))
mux.HandleFunc("/live", probes.HealthZ(healthChecks))

signal := <-signals
//nolint: mnd
server := &http.Server{
Addr: ":" + opts.HealthPort,
Handler: mux,
ReadHeaderTimeout: 3 * time.Second,
}

if err := server.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
zap.L().Fatal("Failed to start health check handlers", zap.Error(err))
}

zap.L().Info("Exposing metrics under /metrics", zap.String("port", opts.HealthPort))
zap.L().Info("Exposing health check under /health", zap.String("port", opts.HealthPort))
zap.L().Info("Exposing live check under /live", zap.String("port", opts.HealthPort))
}()

zap.L().Info("Received signal", zap.Stringer("signal", signal))
zap.L().Info("Shutting down server")
<-ctx.Done()

grpc.GracefulStop()

Expand All @@ -215,5 +280,27 @@ func (o *Options) validateFlags() error {
return errors.New("approle role id and secret id required when using approle auth")
}

if _, err := time.ParseDuration(o.TokenRefreshInterval); err != nil {
return fmt.Errorf("invalid token refresh interval: %w", err)
}

return nil
}

// shutDownSignal returns a context derived from ctx that is cancelled when the
// process receives SIGTERM or SIGINT.
//
// A background goroutine waits for the first such signal, logs it and then
// cancels the returned context. If ctx is done before any signal arrives, the
// goroutine unregisters the signal channel and exits instead of blocking
// forever.
func shutDownSignal(ctx context.Context) context.Context {
	// Buffered with capacity 1 so a signal delivered before the goroutine is
	// receiving is not dropped by the signal package.
	signalChan := make(chan os.Signal, 1)

	// os.Interrupt is the same value as syscall.SIGINT, so it is not listed twice.
	signal.Notify(signalChan, syscall.SIGTERM, syscall.SIGINT)

	sigCtx, cancel := context.WithCancel(ctx)

	go func() {
		// Always release the derived context's resources when the goroutine ends.
		defer cancel()

		select {
		case sig := <-signalChan:
			// Log before cancelling: once the context is cancelled the caller
			// starts shutting down and the process may exit before a later log
			// call gets to run.
			zap.L().Info("Received signal", zap.Stringer("signal", sig))
			zap.L().Info("Shutting down server")
		case <-ctx.Done():
			// The parent ended first; nobody will read signalChan anymore, so
			// stop relaying signals to it.
			signal.Stop(signalChan)
		}
	}()

	return sigCtx
}
Loading

0 comments on commit d2d41aa

Please sign in to comment.