Skip to content

Commit

Permalink
v1: re-implement Node failure recovery
Browse files Browse the repository at this point in the history
Due to the usage of local NVMe disks, redpanda deployments are particularly
sensitive to Node failure. Whenever a Node crashes, the resultant redpanda Pod
will be stuck in a Pending state due to the NodeAffinity of its PV.

This commit implements a "PVCUnbinder" reconciler that watches for such cases
and attempts automatic remediation by "unbinding" PVs. See the implementation
for details on the strategy.

This implementation is similar to, yet much more paranoid than, the
`RedpandaNodePVCReconciler`. Ideally the two implementations should merge
together before long.

Fixes #166
  • Loading branch information
chrisseto committed Jul 12, 2024
1 parent d9e2a57 commit 7a337a9
Show file tree
Hide file tree
Showing 7 changed files with 818 additions and 24 deletions.
31 changes: 21 additions & 10 deletions src/go/k8s/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ import (
redpandav1alpha2 "github.com/redpanda-data/redpanda-operator/src/go/k8s/api/redpanda/v1alpha2"
vectorizedv1alpha1 "github.com/redpanda-data/redpanda-operator/src/go/k8s/api/vectorized/v1alpha1"
clusterredpandacomcontrollers "github.com/redpanda-data/redpanda-operator/src/go/k8s/internal/controller/cluster.redpanda.com"
"github.com/redpanda-data/redpanda-operator/src/go/k8s/internal/controller/pvcunbinder"
redpandacontrollers "github.com/redpanda-data/redpanda-operator/src/go/k8s/internal/controller/redpanda"
adminutils "github.com/redpanda-data/redpanda-operator/src/go/k8s/pkg/admin"
consolepkg "github.com/redpanda-data/redpanda-operator/src/go/k8s/pkg/console"
Expand Down Expand Up @@ -149,14 +150,9 @@ func main() {
additionalControllers []string
operatorMode bool
enableHelmControllers bool

// allowPVCDeletion controls the PVC deletion feature in the Cluster custom resource.
// PVCs will be deleted when its Pod has been deleted and the Node that Pod is assigned to
// does not exist, or has the NoExecute taint. This is intended to support the rancher.io/local-path
// storage driver.
allowPVCDeletion bool
debug bool
ghostbuster bool
debug bool
ghostbuster bool
unbindPVCsAfter time.Duration
)

flag.StringVar(&eventsAddr, "events-addr", "", "The address of the events receiver.")
Expand All @@ -174,7 +170,7 @@ func main() {
flag.DurationVar(&decommissionWaitInterval, "decommission-wait-interval", 8*time.Second, "Set the time to wait for a node decommission to happen in the cluster")
flag.DurationVar(&metricsTimeout, "metrics-timeout", 8*time.Second, "Set the timeout for a checking metrics Admin API endpoint. If set to 0, then the 2 seconds default will be used")
flag.BoolVar(&vectorizedv1alpha1.AllowDownscalingInWebhook, "allow-downscaling", true, "Allow to reduce the number of replicas in existing clusters")
flag.BoolVar(&allowPVCDeletion, "allow-pvc-deletion", false, "Allow the operator to delete PVCs for Pods assigned to failed or missing Nodes (alpha feature)")
flag.Bool("allow-pvc-deletion", false, "Deprecated: Ignored if specified")
flag.BoolVar(&vectorizedv1alpha1.AllowConsoleAnyNamespace, "allow-console-any-ns", false, "Allow to create Console in any namespace. Allowing this copies Redpanda SchemaRegistry TLS Secret to namespace (alpha feature)")
flag.StringVar(&restrictToRedpandaVersion, "restrict-redpanda-version", "", "Restrict management of clusters to those with this version")
flag.StringVar(&vectorizedv1alpha1.SuperUsersPrefix, "superusers-prefix", "", "Prefix to add in username of superusers managed by operator. This will only affect new clusters, enabling this will not add prefix to existing clusters (alpha feature)")
Expand All @@ -185,6 +181,7 @@ func main() {
flag.StringSliceVar(&additionalControllers, "additional-controllers", []string{""}, fmt.Sprintf("which controllers to run, available: all, %s", strings.Join(availableControllers, ", ")))
flag.BoolVar(&operatorMode, "operator-mode", true, "enables to run as an operator, setting this to false will disable cluster (deprecated), redpanda resources reconciliation.")
flag.BoolVar(&enableHelmControllers, "enable-helm-controllers", true, "if a namespace is defined and operator mode is true, this enables the use of helm controllers to manage fluxcd helm resources.")
flag.DurationVar(&unbindPVCsAfter, "unbind-pvcs-after", 0, "if not zero, runs the PVCUnbinder controller which attempts to 'unbind' the PVCs' of Pods that are Pending for longer than the given duration")

logOptions.BindFlags(flag.CommandLine)
clientOptions.BindFlags(flag.CommandLine)
Expand Down Expand Up @@ -270,7 +267,7 @@ func main() {
MetricsTimeout: metricsTimeout,
RestrictToRedpandaVersion: restrictToRedpandaVersion,
GhostDecommissioning: ghostbuster,
}).WithClusterDomain(clusterDomain).WithConfiguratorSettings(configurator).WithAllowPVCDeletion(allowPVCDeletion).SetupWithManager(mgr); err != nil {
}).WithClusterDomain(clusterDomain).WithConfiguratorSettings(configurator).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "Unable to create controller", "controller", "Cluster")
os.Exit(1)
}
Expand Down Expand Up @@ -320,6 +317,20 @@ func main() {
os.Exit(1)
}

if unbindPVCsAfter <= 0 {
setupLog.Info("PVCUnbinder controller not active", "flag", unbindPVCsAfter)
} else {
setupLog.Info("starting PVCUnbinder controller", "flag", unbindPVCsAfter)

if err := (&pvcunbinder.Reconciler{
Client: mgr.GetClient(),
Timeout: unbindPVCsAfter,
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "PVCUnbinder")
os.Exit(1)
}
}

// Setup webhooks
if webhookEnabled {
setupLog.Info("Setup webhook")
Expand Down
17 changes: 15 additions & 2 deletions src/go/k8s/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ require (
github.com/json-iterator/go v1.1.12
github.com/moby/moby v24.0.7+incompatible
github.com/moby/sys/mountinfo v0.6.2
github.com/onsi/ginkgo/v2 v2.14.0
github.com/onsi/gomega v1.30.0
github.com/onsi/ginkgo/v2 v2.15.0
github.com/onsi/gomega v1.31.1
github.com/prometheus/client_golang v1.18.0
github.com/prometheus/common v0.45.0
github.com/redpanda-data/console/backend v0.0.0-20230222172326-354751cc7524
Expand Down Expand Up @@ -198,6 +198,12 @@ require (
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.4 // indirect
github.com/golang/snappy v0.0.4 // indirect
github.com/gonvenience/bunt v1.3.5 // indirect
github.com/gonvenience/neat v1.3.13 // indirect
github.com/gonvenience/term v1.0.2 // indirect
github.com/gonvenience/text v1.0.7 // indirect
github.com/gonvenience/wrap v1.2.0 // indirect
github.com/gonvenience/ytbx v1.4.4 // indirect
github.com/google/btree v1.1.2 // indirect
github.com/google/certificate-transparency-go v1.1.7 // indirect
github.com/google/gnostic-models v0.6.9-0.20230804172637-c7be7c783f49 // indirect
Expand All @@ -223,6 +229,7 @@ require (
github.com/hashicorp/go-retryablehttp v0.7.5 // indirect
github.com/hashicorp/go-uuid v1.0.3 // indirect
github.com/hashicorp/hcl v1.0.1-vault-5 // indirect
github.com/homeport/dyff v1.7.1 // indirect
github.com/huandu/xstrings v1.4.0 // indirect
github.com/imdario/mergo v0.3.16 // indirect
github.com/in-toto/in-toto-golang v0.9.0 // indirect
Expand All @@ -249,10 +256,12 @@ require (
github.com/letsencrypt/boulder v0.0.0-20231026200631-000cd05d5491 // indirect
github.com/lib/pq v1.10.9 // indirect
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
github.com/lucasb-eyer/go-colorful v1.2.0 // indirect
github.com/lucasjones/reggen v0.0.0-20200904144131-37ba4fa293bb // indirect
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0 // indirect
github.com/magiconair/properties v1.8.7 // indirect
github.com/mailru/easyjson v0.7.7 // indirect
github.com/mattn/go-ciede2000 v0.0.0-20170301095244-782e8c62fec3 // indirect
github.com/mattn/go-colorable v0.1.13 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.15 // indirect
Expand All @@ -263,7 +272,9 @@ require (
github.com/minio/sha256-simd v1.0.1 // indirect
github.com/mitchellh/copystructure v1.2.0 // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/mitchellh/go-ps v1.0.0 // indirect
github.com/mitchellh/go-wordwrap v1.0.1 // indirect
github.com/mitchellh/hashstructure v1.1.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/mitchellh/reflectwalk v1.0.2 // indirect
github.com/moby/locker v1.0.1 // indirect
Expand Down Expand Up @@ -328,6 +339,7 @@ require (
github.com/spf13/viper v1.18.1 // indirect
github.com/subosito/gotenv v1.6.0 // indirect
github.com/syndtr/goleveldb v1.0.1-0.20220721030215-126854af5e6d // indirect
github.com/texttheater/golang-levenshtein v1.0.1 // indirect
github.com/thales-e-security/pool v0.0.2 // indirect
github.com/theupdateframework/go-tuf v0.7.0 // indirect
github.com/tidwall/gjson v1.17.0 // indirect
Expand All @@ -341,6 +353,7 @@ require (
github.com/transparency-dev/merkle v0.0.2 // indirect
github.com/twmb/tlscfg v1.2.1 // indirect
github.com/vbatts/tar-split v0.11.5 // indirect
github.com/virtuald/go-ordered-json v0.0.0-20170621173500-b18e6e673d74 // indirect
github.com/wI2L/jsondiff v0.5.0 // indirect
github.com/wk8/go-ordered-map/v2 v2.1.8 // indirect
github.com/xanzy/go-gitlab v0.95.2 // indirect
Expand Down
8 changes: 4 additions & 4 deletions src/go/k8s/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -934,14 +934,14 @@ github.com/onsi/ginkgo v1.16.4/go.mod h1:dX+/inL/fNMqNlz0e9LfyB9TswhZpCVdJM/Z6Vv
github.com/onsi/ginkgo v1.16.5 h1:8xi0RTUf59SOSfEtZMvwTvXYMzG4gV23XVHOZiXNtnE=
github.com/onsi/ginkgo v1.16.5/go.mod h1:+E8gABHa3K6zRBolWtd+ROzc/U5bkGt0FwiG042wbpU=
github.com/onsi/ginkgo/v2 v2.1.3/go.mod h1:vw5CSIxN1JObi/U8gcbwft7ZxR2dgaR70JSE3/PpL4c=
github.com/onsi/ginkgo/v2 v2.14.0 h1:vSmGj2Z5YPb9JwCWT6z6ihcUvDhuXLc3sJiqd3jMKAY=
github.com/onsi/ginkgo/v2 v2.14.0/go.mod h1:JkUdW7JkN0V6rFvsHcJ478egV3XH9NxpD27Hal/PhZw=
github.com/onsi/ginkgo/v2 v2.15.0 h1:79HwNRBAZHOEwrczrgSOPy+eFTTlIGELKy5as+ClttY=
github.com/onsi/ginkgo/v2 v2.15.0/go.mod h1:HlxMHtYF57y6Dpf+mc5529KKmSq9h2FpCF+/ZkwUxKM=
github.com/onsi/gomega v1.7.1/go.mod h1:XdKZgCCFLUoM/7CFJVPcG8C1xQ1AJ0vpAezJrB7JYyY=
github.com/onsi/gomega v1.10.1/go.mod h1:iN09h71vgCQne3DLsj+A5owkum+a2tYe+TOCB1ybHNo=
github.com/onsi/gomega v1.17.0/go.mod h1:HnhC7FXeEQY45zxNK3PPoIUhzk/80Xly9PcubAlGdZY=
github.com/onsi/gomega v1.19.0/go.mod h1:LY+I3pBVzYsTBU1AnDwOSxaYi9WoWiqgwooUqq9yPro=
github.com/onsi/gomega v1.30.0 h1:hvMK7xYz4D3HapigLTeGdId/NcfQx1VHMJc60ew99+8=
github.com/onsi/gomega v1.30.0/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ=
github.com/onsi/gomega v1.31.1 h1:KYppCUK+bUgAZwHOu7EXVBKyQA6ILvOESHkn/tgoqvo=
github.com/onsi/gomega v1.31.1/go.mod h1:y40C95dwAD1Nz36SsEnxvfFe8FFfNxzI5eJ0EYGyAy0=
github.com/opencontainers/go-digest v1.0.1-0.20230815154656-802ce17c4f59 h1:CxG/9ofVc6JEX3s9ORGnWClpUpeR7DurWZTdyWk6f9Y=
github.com/opencontainers/go-digest v1.0.1-0.20230815154656-802ce17c4f59/go.mod h1:RqnyioA3pIEZMkSbOIcrw32YSgETfn/VrLuEikEdPNU=
github.com/opencontainers/go-digest/blake3 v0.0.0-20231212064514-429d0316a3dd h1:6eP3AE0nXQEGF7Q4lj27mNp1dLHF/+Ab2he8fYPgxwA=
Expand Down
116 changes: 116 additions & 0 deletions src/go/k8s/internal/controller/pvcunbinder/k3d.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package pvcunbinder

import (
"fmt"
"os/exec"
"strings"
"sync"
"time"

"github.com/redpanda-data/helm-charts/pkg/kube"
"k8s.io/client-go/tools/clientcmd"
)

// K3DCluster is a handle to a k3d cluster that is managed by shelling out to
// the `k3d` CLI.
type K3DCluster struct {
// Name is the cluster name passed to the `k3d` CLI.
Name string

// mu is held by DeleteNode, CreateNode, and Cleanup for the duration of
// their `k3d` invocation.
mu sync.Mutex
// restConfig is the value returned by RESTConfig; it is populated by
// NewK3DCluster.
restConfig *kube.RESTConfig
// agentCounter is incremented by CreateNode and used as the numeric suffix
// of the new node's name. NewK3DCluster initializes it to 3.
agentCounter int32
}

// RESTConfig returns the REST configuration stored on the cluster handle. The
// field is read without acquiring c.mu.
func (c *K3DCluster) RESTConfig() *kube.RESTConfig {
return c.restConfig
}

// DeleteNode runs `k3d node delete <name>` while holding the cluster mutex.
//
// On failure the returned error wraps the exec error and is followed by the
// command's combined stdout/stderr.
func (c *K3DCluster) DeleteNode(name string) error {
	c.mu.Lock()
	defer c.mu.Unlock()

	cmd := exec.Command("k3d", "node", "delete", name)

	output, err := cmd.CombinedOutput()
	if err == nil {
		return nil
	}
	return fmt.Errorf("%w: %s", err, output)
}

// CreateNode bumps the agent counter and runs `k3d node create` for a new
// agent-role node in this cluster, passing `--wait`. The mutex is held for the
// whole invocation.
//
// The counter is incremented before the command runs, so it advances even
// when the command fails. On failure the returned error wraps the exec error
// and is followed by the command's combined stdout/stderr.
func (c *K3DCluster) CreateNode() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	c.agentCounter++
	nodeName := fmt.Sprintf("k3d-%s-agent-%d", c.Name, c.agentCounter)

	args := []string{
		"node",
		"create",
		nodeName,
		fmt.Sprintf("--cluster=%s", c.Name),
		"--wait",
		"--role=agent",
	}

	output, err := exec.Command("k3d", args...).CombinedOutput()
	if err == nil {
		return nil
	}
	return fmt.Errorf("%w: %s", err, output)
}

// Cleanup runs `k3d cluster delete <Name>` while holding the cluster mutex.
//
// On failure the returned error wraps the exec error (so errors.Is/As still
// see it) and is followed by the command's combined stdout/stderr, matching
// the error format of DeleteNode and CreateNode. Without the output a failed
// delete surfaces only as an opaque "exit status N".
func (c *K3DCluster) Cleanup() error {
	c.mu.Lock()
	defer c.mu.Unlock()

	if out, err := exec.Command(
		"k3d",
		"cluster",
		"delete",
		c.Name,
	).CombinedOutput(); err != nil {
		return fmt.Errorf("%w: %s", err, out)
	}
	return nil
}

// NewK3DCluster creates a k3d cluster named after the lower-cased name with
// three agent nodes, fetches its kubeconfig via `k3d kubeconfig get`, and
// returns a handle holding the derived REST config.
//
// If any step after the cluster has been created fails, a best-effort
// `k3d cluster delete` is issued before returning, because the caller receives
// a nil handle and therefore has no way to call Cleanup itself.
func NewK3DCluster(name string) (*K3DCluster, error) {
	// Number of agents requested at creation time. It also seeds agentCounter
	// so the two values cannot drift apart.
	const initialAgents = 3

	name = strings.ToLower(name)

	out, err := exec.Command(
		"k3d",
		"cluster",
		"create",
		name,
		fmt.Sprintf("--agents=%d", initialAgents),
		fmt.Sprintf("--timeout=%s", 30*time.Second),
		// See also https://github.com/k3d-io/k3d/blob/main/docs/faq/faq.md#passing-additional-argumentsflags-to-k3s-and-on-to-eg-the-kube-apiserver
		// As the formatting is QUITE finicky.
		// Shorten the node-monitor-grace-period to speed up tests that rely on dead node detection.
		`--k3s-arg`, `--kube-controller-manager-arg=node-monitor-grace-period=10s@server:*`,
		// Dramatically decrease (5m -> 10s) the default tolerations to ensure
		// Pod eviction happens in a timely fashion.
		`--k3s-arg`, `--kube-apiserver-arg=default-not-ready-toleration-seconds=10@server:*`,
		`--k3s-arg`, `--kube-apiserver-arg=default-unreachable-toleration-seconds=10@server:*`,
	).CombinedOutput()
	if err != nil {
		return nil, fmt.Errorf("%w: %s", err, out)
	}

	// fail tears down the cluster created above and returns err, annotated
	// with the teardown failure if there was one.
	fail := func(err error) (*K3DCluster, error) {
		if delOut, delErr := exec.Command("k3d", "cluster", "delete", name).CombinedOutput(); delErr != nil {
			return nil, fmt.Errorf("%w (additionally, deleting cluster %q failed: %v: %s)", err, name, delErr, delOut)
		}
		return nil, err
	}

	// Use Output rather than CombinedOutput: only stdout carries the
	// kubeconfig, and anything written to stderr must not end up in the YAML
	// handed to clientcmd.Load.
	kubeconfigYAML, err := exec.Command("k3d", "kubeconfig", "get", name).Output()
	if err != nil {
		var stderr []byte
		// Output populates ExitError.Stderr when the command's Stderr is unset.
		if exitErr, ok := err.(*exec.ExitError); ok {
			stderr = exitErr.Stderr
		}
		return fail(fmt.Errorf("fetching kubeconfig for cluster %q: %w: %s", name, err, stderr))
	}

	kubeconfig, err := clientcmd.Load(kubeconfigYAML)
	if err != nil {
		return fail(fmt.Errorf("parsing kubeconfig for cluster %q: %w", name, err))
	}

	cfg, err := kube.ConfigToRest(*kubeconfig)
	if err != nil {
		return fail(fmt.Errorf("building REST config for cluster %q: %w", name, err))
	}

	return &K3DCluster{
		Name:         name,
		restConfig:   cfg,
		agentCounter: initialAgents,
	}, nil
}
Loading

0 comments on commit 7a337a9

Please sign in to comment.