From 47656a8e6fba2bd913f019156ef6dc8bcda0258b Mon Sep 17 00:00:00 2001 From: Gerard Nguyen Date: Tue, 13 Aug 2024 08:42:26 +1000 Subject: [PATCH] feat: etcd collector (#1589) * new schema for etcd collector * add placeholder * wip * get supported distribution * add exec implementation * wait for etcd pod to be ready * misc * update k0s etcd certs path * fix unit tests * address code reviews * update from code review * add etcdctl version --- Makefile | 4 + config/crds/troubleshoot.sh_collectors.yaml | 11 + config/crds/troubleshoot.sh_preflights.yaml | 11 + .../crds/troubleshoot.sh_supportbundles.yaml | 11 + .../troubleshoot/v1beta2/collector_shared.go | 6 + .../v1beta2/zz_generated.deepcopy.go | 21 + pkg/collect/collector.go | 4 + pkg/collect/etcd.go | 395 ++++++++++++++++++ pkg/collect/etcd_test.go | 70 ++++ schemas/collector-troubleshoot-v1beta2.json | 17 + schemas/preflight-troubleshoot-v1beta2.json | 17 + .../supportbundle-troubleshoot-v1beta2.json | 17 + 12 files changed, 584 insertions(+) create mode 100644 pkg/collect/etcd.go create mode 100644 pkg/collect/etcd_test.go diff --git a/Makefile b/Makefile index 70ce471d7..120df33d7 100644 --- a/Makefile +++ b/Makefile @@ -114,6 +114,10 @@ bin/analyze: bin/collect: go build ${BUILDFLAGS} ${LDFLAGS} -o bin/collect github.com/replicatedhq/troubleshoot/cmd/collect +build-linux: tidy + @echo "Build cli binaries for Linux" + GOOS=linux GOARCH=amd64 $(MAKE) -j bin/support-bundle bin/preflight bin/analyze bin/collect + .PHONY: fmt fmt: go fmt ${BUILDPATHS} diff --git a/config/crds/troubleshoot.sh_collectors.yaml b/config/crds/troubleshoot.sh_collectors.yaml index 8a7619172..5eab06b1a 100644 --- a/config/crds/troubleshoot.sh_collectors.yaml +++ b/config/crds/troubleshoot.sh_collectors.yaml @@ -301,6 +301,17 @@ spec: timeout: type: string type: object + etcd: + properties: + collectorName: + type: string + exclude: + type: BoolString + image: + type: string + required: + - image + type: object exec: properties: args: diff --git a/config/crds/troubleshoot.sh_preflights.yaml b/config/crds/troubleshoot.sh_preflights.yaml index 8d483325b..bdbb88a0d 100644 --- a/config/crds/troubleshoot.sh_preflights.yaml +++ b/config/crds/troubleshoot.sh_preflights.yaml @@ -2030,6 +2030,17 @@ spec: timeout: type: string type: object + etcd: + properties: + collectorName: + type: string + exclude: + type: BoolString + image: + type: string + required: + - image + type: object exec: properties: args: diff --git a/config/crds/troubleshoot.sh_supportbundles.yaml b/config/crds/troubleshoot.sh_supportbundles.yaml index f9f650947..89b9470bd 100644 --- a/config/crds/troubleshoot.sh_supportbundles.yaml +++ b/config/crds/troubleshoot.sh_supportbundles.yaml @@ -2061,6 +2061,17 @@ spec: timeout: type: string type: object + etcd: + properties: + collectorName: + type: string + exclude: + type: BoolString + image: + type: string + required: + - image + type: object exec: properties: args: diff --git a/pkg/apis/troubleshoot/v1beta2/collector_shared.go b/pkg/apis/troubleshoot/v1beta2/collector_shared.go index b05c79280..0841a0314 100644 --- a/pkg/apis/troubleshoot/v1beta2/collector_shared.go +++ b/pkg/apis/troubleshoot/v1beta2/collector_shared.go @@ -299,6 +299,11 @@ type DNS struct { Timeout string `json:"timeout,omitempty" yaml:"timeout,omitempty"` } +type Etcd struct { + CollectorMeta `json:",inline" yaml:",inline"` + Image string `json:"image" yaml:"image"` +} + type Collect struct { ClusterInfo *ClusterInfo `json:"clusterInfo,omitempty" yaml:"clusterInfo,omitempty"` ClusterResources *ClusterResources `json:"clusterResources,omitempty" yaml:"clusterResources,omitempty"` @@ -329,6 +334,7 @@ type Collect struct { Sonobuoy *Sonobuoy `json:"sonobuoy,omitempty" yaml:"sonobuoy,omitempty"` NodeMetrics *NodeMetrics `json:"nodeMetrics,omitempty" yaml:"nodeMetrics,omitempty"` DNS *DNS `json:"dns,omitempty" yaml:"dns,omitempty"` + Etcd *Etcd `json:"etcd,omitempty" yaml:"etcd,omitempty"` } func (c *Collect) AccessReviewSpecs(overrideNS string) []authorizationv1.SelfSubjectAccessReviewSpec { diff --git a/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go b/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go index cce96046b..d3777d784 100644 --- a/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go +++ b/pkg/apis/troubleshoot/v1beta2/zz_generated.deepcopy.go @@ -936,6 +936,11 @@ func (in *Collect) DeepCopyInto(out *Collect) { *out = new(DNS) (*in).DeepCopyInto(*out) } + if in.Etcd != nil { + in, out := &in.Etcd, &out.Etcd + *out = new(Etcd) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Collect. @@ -1452,6 +1457,22 @@ func (in *Distribution) DeepCopy() *Distribution { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Etcd) DeepCopyInto(out *Etcd) { + *out = *in + in.CollectorMeta.DeepCopyInto(&out.CollectorMeta) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Etcd. +func (in *Etcd) DeepCopy() *Etcd { + if in == nil { + return nil + } + out := new(Etcd) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *EventAnalyze) DeepCopyInto(out *EventAnalyze) { *out = *in diff --git a/pkg/collect/collector.go b/pkg/collect/collector.go index 5d3e76ce4..310f84ff0 100644 --- a/pkg/collect/collector.go +++ b/pkg/collect/collector.go @@ -126,6 +126,8 @@ func GetCollector(collector *troubleshootv1beta2.Collect, bundlePath string, nam return &CollectNodeMetrics{collector.NodeMetrics, bundlePath, clientConfig, client, ctx, RBACErrors}, true case collector.DNS != nil: return &CollectDNS{collector.DNS, bundlePath, namespace, clientConfig, client, ctx, RBACErrors}, true + case collector.Etcd != nil: + return &CollectEtcd{collector.Etcd, bundlePath, clientConfig, client, ctx, RBACErrors}, true default: return nil, false } @@ -219,6 +221,8 @@ func getCollectorName(c interface{}) string { collector = "node-metrics" case *CollectDNS: collector = "dns" + case *CollectEtcd: + collector = "etcd" default: collector = "" } diff --git a/pkg/collect/etcd.go b/pkg/collect/etcd.go new file mode 100644 index 000000000..5a5f0bf20 --- /dev/null +++ b/pkg/collect/etcd.go @@ -0,0 +1,395 @@ +package collect + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "strings" + "time" + + "github.com/pkg/errors" + troubleshootv1beta2 "github.com/replicatedhq/troubleshoot/pkg/apis/troubleshoot/v1beta2" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/remotecommand" + "k8s.io/klog/v2" +) + +const etcdOutputDir = "etcd" + +type CollectEtcd struct { + Collector *troubleshootv1beta2.Etcd + BundlePath string + ClientConfig *rest.Config + Client kubernetes.Interface + Context context.Context + RBACErrors +} + +// etcdDebug is a helper struct to exec into an etcd pod +type etcdDebug struct { + context context.Context + clientConfig *rest.Config + client kubernetes.Interface + pod *corev1.Pod // etcd pod to exec into + ephemeral bool // if true, the pod will be deleted after the collector is done + commands []string // list of commands to run in the etcd pod + args []string // list of args to pass to each command + hostPath string // path to the host's etcd certs + image string // image to use for the etcd client pod +} + +func (c *CollectEtcd) Title() string { + return getCollectorName(c) +} + +func (c *CollectEtcd) IsExcluded() (bool, error) { + return isExcluded(c.Collector.Exclude) +} + +func (c *CollectEtcd) Collect(progressChan chan<- interface{}) (CollectorResult, error) { + debugInstance := etcdDebug{ + context: c.Context, + clientConfig: c.ClientConfig, + client: c.Client, + commands: []string{ + "etcdctl endpoint health", + "etcdctl endpoint status", + "etcdctl member list", + "etcdctl alarm list", + "etcdctl version", + }, + image: c.Collector.Image, + } + + distribution, err := debugInstance.getSupportedDistro() + if err != nil { + klog.V(2).Infof("etcd collector is not supported on this distribution: %v", err) + return nil, err + } + + // etcd on these distros are not running as pod but as a process managed by k0scontroller + // we have to spin up an etcd pod to exec into and run the commands + // after the collector is done, the pod will be deleted + if distribution == "k0s" || distribution == "embedded-cluster" { + debugInstance.ephemeral = true + } + defer debugInstance.cleanup() + + etcdArgs, hostPath, err := getEtcdArgsByDistribution(distribution) + if err != nil { + return nil, err + } + debugInstance.args = etcdArgs + debugInstance.hostPath = hostPath + + err = debugInstance.getOrCreateEtcdPod() + if err != nil { + return nil, err + } + + // wait until the pod is running + err = debugInstance.waitForPodReady() + if err != nil { + return nil, err + } + + // finally exec etcdctl troubleshoot commands + output := NewResult() + + for _, command := range debugInstance.commands { + fileName := generateFilenameFromCommand(command) + stdout, stderr, err := debugInstance.executeCommand(command) + if err != nil { + klog.Infof("failed to exec command %s: %v", command, err) + continue + } + if len(stdout) > 0 { + output.SaveResult(c.BundlePath, getFullPath(fileName), bytes.NewBuffer(stdout)) + } + if len(stderr) > 0 { + fileName := fmt.Sprintf("%s-stderr", fileName) + output.SaveResult(c.BundlePath, getFullPath(fileName), bytes.NewBuffer(stderrToJson(stderr))) + } + } + + return output, nil +} + +func getEtcdArgsByDistribution(distribution string) ([]string, string, error) { + type certs struct { + hostPath string + ca string + cert string + key string + } + + lookup := map[string]certs{ + "k0s": { + hostPath: "/var/lib/k0s/pki/etcd", + ca: "ca.crt", + cert: "peer.crt", + key: "peer.key", + }, + "embedded-cluster": { + hostPath: "/var/lib/k0s/pki/etcd", + ca: "ca.crt", + cert: "peer.crt", + key: "peer.key", + }, + "kurl": { + hostPath: "/etc/kubernetes/pki/etcd", + ca: "ca.crt", + cert: "healthcheck-client.crt", + key: "healthcheck-client.key", + }, + } + + c, ok := lookup[distribution] + if !ok { + return nil, "", errors.Errorf("distribution %s not supported", distribution) + } + + return []string{ + "--cacert", c.hostPath + "/" + c.ca, + "--cert", c.hostPath + "/" + c.cert, + "--key", c.hostPath + "/" + c.key, + "--write-out", "json", + "--endpoints", "https://127.0.0.1:2379", // always use localhost + }, c.hostPath, nil +} + +// getSupportedDistro returns the distro that etcd collector can run on +// either due to the distro has static etcd pod (kurl by kubeadm) or +// the distro has etcd running as a process (k0s, embedded-cluster) +func (c *etcdDebug) getSupportedDistro() (string, error) { + // extract distro logic from analyzer.ParseNodesForProviders + // pkg/analyze/distribution.go + // we can't import analyzer because of circular dependency + // TODO: may refactor this to a common package + + nodes, err := c.client.CoreV1().Nodes().List(c.context, metav1.ListOptions{}) + if err != nil { + return "", errors.Wrap(err, "failed to list nodes") + } + + for _, node := range nodes.Items { + for k, v := range node.ObjectMeta.Labels { + if k == "kurl.sh/cluster" && v == "true" { + return "kurl", nil + } + if k == "node.k0sproject.io/role" { + return "k0s", nil + } + if k == "kots.io/embedded-cluster-role" { + return "embedded-cluster", nil + } + } + } + + return "", errors.New("current k8s distribution does not support etcd collector") +} + +func (c *etcdDebug) getOrCreateEtcdPod() error { + // if ephemeral, create a etcd client pod to exec into + // the pod will use hostNetwork: true to access the etcd server + if c.ephemeral { + err := c.createEtcdPod() + if err != nil { + return errors.Wrap(err, "failed to create etcd pod") + } + return nil + } + // if not ephemeral, find the static etcd pod to exec into + // get the first etcd pod in the cluster with label "component=etcd" in all namespaces + label := "component=etcd" + pods, err := c.client.CoreV1().Pods("").List(c.context, metav1.ListOptions{ + LabelSelector: label, + }) + if err != nil { + return errors.Wrap(err, fmt.Sprintf("failed to list etcd pods with label %s", label)) + } + if len(pods.Items) == 0 { + return errors.New("no static etcd pod found") + } + + klog.V(2).Infof("found etcd pod %s in namespace %s", pods.Items[0].Name, pods.Items[0].Namespace) + c.pod = &pods.Items[0] + return nil +} + +// createEtcdPod creates a etcd client pod to exec into +func (c *etcdDebug) createEtcdPod() error { + if c.image == "" { + c.image = "quay.io/coreos/etcd:latest" + } + namespace := "default" + labels := map[string]string{ + "troubleshoot-role": "etcd-collector", + } + spec := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "etcd-collector-", + Namespace: namespace, + Labels: labels, + }, + Spec: corev1.PodSpec{ + HostNetwork: true, + Containers: []corev1.Container{ + { + Name: "etcd-client", + Image: c.image, + Command: []string{"sleep"}, + Args: []string{"5m"}, + Env: []corev1.EnvVar{ + { + Name: "ETCDCTL_API", + Value: "3", + }, { + Name: "ETCDCTL_INSECURE_SKIP_TLS_VERIFY", + Value: "true", + }, + }, + VolumeMounts: []corev1.VolumeMount{ + { + Name: "etcd-certs", + MountPath: c.hostPath, + ReadOnly: true, + }, + }, + }, + }, + Volumes: []corev1.Volume{ + { + Name: "etcd-certs", + VolumeSource: corev1.VolumeSource{ + HostPath: &corev1.HostPathVolumeSource{ + Path: c.hostPath, + }, + }, + }, + }, + }, + } + + klog.V(2).Infof("creating etcd troubleshoot pod in namespace %s", namespace) + pod, err := c.client.CoreV1().Pods(namespace).Create(c.context, spec, metav1.CreateOptions{}) + if err != nil { + return errors.Wrap(err, "failed to create etcd troubleshoot pod") + } + c.pod = pod + return nil +} + +// cleanup deletes the etcd troubleshoot pod if it's ephemeral +func (c *etcdDebug) cleanup() { + if !c.ephemeral || c.pod == nil { + return + } + + // delete the pod + klog.V(2).Infof("deleting etcd troubleshoot pod %s in namespace %s", c.pod.Name, c.pod.Namespace) + err := c.client.CoreV1().Pods(c.pod.Namespace).Delete(context.Background(), c.pod.Name, metav1.DeleteOptions{ + GracePeriodSeconds: new(int64), // delete immediately + }) + if err != nil { + klog.Errorf("failed to delete pod %s: %v", c.pod.Name, err) + } +} + +// executeCommand exec into the pod and run the command +// it returns the stdout, stderr and error if any of the command +func (c *etcdDebug) executeCommand(command string) ([]byte, []byte, error) { + + // split command into a slice of strings + // e.g. "etcdctl endpoint health" -> ["etcdctl", "endpoint", "health"] + cdmArgs := strings.Fields(command) + cdmArgs = append(cdmArgs, c.args...) + klog.V(2).Infof("executing command: %q in pod %q (namespace %q)", strings.Join(cdmArgs, " "), c.pod.Name, c.pod.Namespace) + + req := c.client.CoreV1().RESTClient().Post(). + Resource("pods"). + Name(c.pod.Name). + Namespace(c.pod.Namespace). + SubResource("exec") + + req.VersionedParams(&corev1.PodExecOptions{ + Command: cdmArgs, + Stdin: false, + Stdout: true, + Stderr: true, + TTY: false, + }, scheme.ParameterCodec) + + exec, err := remotecommand.NewSPDYExecutor(c.clientConfig, "POST", req.URL()) + if err != nil { + return nil, nil, err + } + + var stdout, stderr bytes.Buffer + err = exec.StreamWithContext(c.context, remotecommand.StreamOptions{ + Stdout: &stdout, + Stderr: &stderr, + }) + + return stdout.Bytes(), stderr.Bytes(), err +} + +// waitForPodReady waits until the etcd troubleshoot pod is running +func (c *etcdDebug) waitForPodReady() error { + timeout := 60 * time.Second + ticker := time.NewTicker(1 * time.Second) + + ctx, cancel := context.WithTimeout(c.context, timeout) + defer cancel() + + for { + select { + case <-ctx.Done(): + return errors.New("timeout waiting for etcd troubleshooting pod to be running") + case <-ticker.C: + pod, err := c.client.CoreV1().Pods(c.pod.Namespace).Get(c.context, c.pod.Name, metav1.GetOptions{}) + if err != nil { + return errors.Wrap(err, "failed to get etcd troubleshoot pod") + } + if pod.Status.Phase == corev1.PodRunning { + // ok, pod is running + return nil + } + klog.V(2).Infof("waiting for etcd troubleshoot pod %q to be running, current status: %q", c.pod.Name, pod.Status.Phase) + } + } +} + +// generateFilenameFromCommand generates a filename from the command +// e.g. "etcdctl endpoint health" -> "endpoint-health" +func generateFilenameFromCommand(command string) string { + parts := strings.Fields(command) + if len(parts) == 0 { + return "" + } + return strings.Join(parts[1:], "-") +} + +// getFullPath returns the full path to the file +// e.g. "endpoint-health" -> "etcd/endpoint-health.json" +func getFullPath(fileName string) string { + return fmt.Sprintf("%s/%s.json", etcdOutputDir, fileName) +} + +// stderrToJson converts stderr output to json bytes +func stderrToJson(stderr []byte) []byte { + jsonObj := map[string]string{ + "stderr": string(stderr), + } + jsonBytes, err := json.Marshal(jsonObj) + if err != nil { + klog.Errorf("failed to marshal stderr to json: %v", err) + return []byte{} + } + return jsonBytes +} diff --git a/pkg/collect/etcd_test.go b/pkg/collect/etcd_test.go new file mode 100644 index 000000000..001b5392b --- /dev/null +++ b/pkg/collect/etcd_test.go @@ -0,0 +1,70 @@ +package collect + +import ( + "testing" + + "github.com/pkg/errors" + "github.com/stretchr/testify/assert" +) + +func TestGetEtcdArgsByDistribution(t *testing.T) { + tests := []struct { + distribution string + expectedArgs []string + expectedPath string + expectedErr error + }{ + { + distribution: "k0s", + expectedArgs: []string{ + "--cacert", "/var/lib/k0s/pki/etcd/ca.crt", + "--cert", "/var/lib/k0s/pki/etcd/peer.crt", + "--key", "/var/lib/k0s/pki/etcd/peer.key", + "--write-out", "json", + "--endpoints", "https://127.0.0.1:2379", + }, + expectedPath: "/var/lib/k0s/pki/etcd", + expectedErr: nil, + }, + { + distribution: "embedded-cluster", + expectedArgs: []string{ + "--cacert", "/var/lib/k0s/pki/etcd/ca.crt", + "--cert", "/var/lib/k0s/pki/etcd/peer.crt", + "--key", "/var/lib/k0s/pki/etcd/peer.key", + "--write-out", "json", + "--endpoints", "https://127.0.0.1:2379", + }, + expectedPath: "/var/lib/k0s/pki/etcd", + expectedErr: nil, + }, + { + distribution: "kurl", + expectedArgs: []string{ + "--cacert", "/etc/kubernetes/pki/etcd/ca.crt", + "--cert", "/etc/kubernetes/pki/etcd/healthcheck-client.crt", + "--key", "/etc/kubernetes/pki/etcd/healthcheck-client.key", + "--write-out", "json", + "--endpoints", "https://127.0.0.1:2379", + }, + expectedPath: "/etc/kubernetes/pki/etcd", + expectedErr: nil, + }, + { + distribution: "unknown", + expectedArgs: nil, + expectedPath: "", + expectedErr: errors.Errorf("distribution unknown not supported"), + }, + } + + for _, test := range tests { + args, path, err := getEtcdArgsByDistribution(test.distribution) + assert.Equal(t, test.expectedArgs, args) + assert.Equal(t, test.expectedPath, path) + if test.expectedErr != nil { + assert.NotNil(t, err) + assert.EqualError(t, test.expectedErr, err.Error()) + } + } +} diff --git a/schemas/collector-troubleshoot-v1beta2.json b/schemas/collector-troubleshoot-v1beta2.json index 95ab40f31..d4b664a4c 100644 --- a/schemas/collector-troubleshoot-v1beta2.json +++ b/schemas/collector-troubleshoot-v1beta2.json @@ -404,6 +404,23 @@ } } }, + "etcd": { + "type": "object", + "required": [ + "image" + ], + "properties": { + "collectorName": { + "type": "string" + }, + "exclude": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + }, + "image": { + "type": "string" + } + } + }, "exec": { "type": "object", "required": [ diff --git a/schemas/preflight-troubleshoot-v1beta2.json b/schemas/preflight-troubleshoot-v1beta2.json index ce3c9b40a..10c807086 100644 --- a/schemas/preflight-troubleshoot-v1beta2.json +++ b/schemas/preflight-troubleshoot-v1beta2.json @@ -3068,6 +3068,23 @@ } } }, + "etcd": { + "type": "object", + "required": [ + "image" + ], + "properties": { + "collectorName": { + "type": "string" + }, + "exclude": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + }, + "image": { + "type": "string" + } + } + }, "exec": { "type": "object", "required": [ diff --git a/schemas/supportbundle-troubleshoot-v1beta2.json b/schemas/supportbundle-troubleshoot-v1beta2.json index 35273b7d3..d8671faeb 100644 --- a/schemas/supportbundle-troubleshoot-v1beta2.json +++ b/schemas/supportbundle-troubleshoot-v1beta2.json @@ -3114,6 +3114,23 @@ } } }, + "etcd": { + "type": "object", + "required": [ + "image" + ], + "properties": { + "collectorName": { + "type": "string" + }, + "exclude": { + "oneOf": [{"type": "string"},{"type": "boolean"}] + }, + "image": { + "type": "string" + } + } + }, "exec": { "type": "object", "required": [