From 87927b3692fb4d61c382444d77cfc7a8cc91e566 Mon Sep 17 00:00:00 2001 From: r-vasquez Date: Mon, 10 Jun 2024 15:43:13 -0700 Subject: [PATCH 1/5] rpk: add hint to debug slab info collection Most of the time this step fails due to a permission error. --- src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go b/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go index b94570f5dfb4..bc359f206ad3 100644 --- a/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go +++ b/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go @@ -557,6 +557,9 @@ func saveSlabInfo(ps *stepParams) step { return func() error { bs, err := afero.ReadFile(ps.fs, "/proc/slabinfo") if err != nil { + if errors.Is(err, fs.ErrPermission) { + return fmt.Errorf("%v: you may need to run the command as root to read this file", err) + } return err } return writeFileToZip(ps, "proc/slabinfo", bs) From 3ec0d6cf73bb7e39bc04d02d1f363b41a918459e Mon Sep 17 00:00:00 2001 From: r-vasquez Date: Mon, 10 Jun 2024 16:45:30 -0700 Subject: [PATCH 2/5] rpk: controller log collection err improvement If a user provides a configuration file without redpanda.data_directory, rpk won't know where to find the controller log dirs. 
We now provide a better error message instead of: * lstat redpanda/controller/0_0: no such file or directory Either way, a configuration file (redpanda.yaml) without a data_directory is an invalid config file. --- src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go b/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go index bc359f206ad3..4500418a2acc 100644 --- a/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go +++ b/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go @@ -943,13 +943,16 @@ func sliceControllerDir(cFiles []fileSize, logLimitBytes int64) (slice []fileSiz func saveControllerLogDir(ps *stepParams, y *config.RedpandaYaml, logLimitBytes int) step { return func() error { + if y.Redpanda.Directory == "" { + return fmt.Errorf("failed to save controller logs: 'redpanda.data_directory' is empty on the provided configuration file") + } controllerDir := filepath.Join(y.Redpanda.Directory, "redpanda", "controller", "0_0") // We don't need the .base_index files to parse out the messages. 
exclude := regexp.MustCompile(`^*.base_index$`) cFiles, size, err := walkSizeDir(controllerDir, exclude) if err != nil { - return err + return fmt.Errorf("unable to save controller logs: %v", err) } if int(size) < logLimitBytes { @@ -969,11 +972,11 @@ func saveControllerLogDir(ps *stepParams, y *config.RedpandaYaml, logLimitBytes for _, cLog := range slice { file, err := os.ReadFile(cLog.path) if err != nil { - return err + return fmt.Errorf("unable to save controller logs: %v", err) } err = writeFileToZip(ps, filepath.Join("controller", filepath.Base(cLog.path)), file) if err != nil { - return err + return fmt.Errorf("unable to save controller logs: %v", err) } } return nil From 63a367e392230e29222e4e8ae571653249dee4f7 Mon Sep 17 00:00:00 2001 From: r-vasquez Date: Mon, 10 Jun 2024 17:08:41 -0700 Subject: [PATCH 3/5] rpk: hint error message location for bundle failures When a command fails to run, rpk will return: - couldn't save 'foo.txt': exit status 1 And will save stderr in foo.txt for full debugging. This is not clear, so users may be lost about what happened and won't know how to get past this error. We are adding a hint about where the rest of the error is (which might be multiple lines of text) --- src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go b/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go index 4500418a2acc..76964f973470 100644 --- a/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go +++ b/src/go/rpk/pkg/cli/debug/bundle/bundle_linux.go @@ -310,7 +310,7 @@ func writeCommandOutputToZipLimit( err = cmd.Wait() if err != nil { if !strings.Contains(err.Error(), "broken pipe") { - return fmt.Errorf("couldn't save '%s': %w", filename, err) + return fmt.Errorf("couldn't save '%s': %w; %[1]v contains the full error message", filename, err) } zap.L().Sugar().Debugf( "Got '%v' while running '%s'. 
This is probably due to the"+ From 865af177690504a39277106d07116212df06a6f7 Mon Sep 17 00:00:00 2001 From: r-vasquez Date: Mon, 10 Jun 2024 17:40:00 -0700 Subject: [PATCH 4/5] rpk: fallback to profile's admin address in k8s bundle Clusters deployed with helm/operator will now have the rpk section of the redpanda.yaml filled with the Admin API addresses of the cluster. We fallback to these addresses in case rpk can't discover the API addresses using the k8s API. --- src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go b/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go index c4f9b8afc7de..93989c10050e 100644 --- a/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go +++ b/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go @@ -86,7 +86,14 @@ func executeK8SBundle(ctx context.Context, bp bundleParams) error { zap.L().Sugar().Debugf("unable to get admin API addresses from the k8s API: %v", err) } if len(adminAddresses) == 0 { - adminAddresses = []string{fmt.Sprintf("127.0.0.1:%v", config.DefaultAdminPort)} + if len(bp.p.AdminAPI.Addresses) > 0 { + zap.L().Sugar().Debugf("using admin API addresses from profile: %v", bp.p.AdminAPI.Addresses) + adminAddresses = bp.p.AdminAPI.Addresses + } else { + defaultAddress := fmt.Sprintf("127.0.0.1:%v", config.DefaultAdminPort) + zap.L().Sugar().Debugf("profile empty, using %v for the Admin API address", defaultAddress) + adminAddresses = []string{defaultAddress} + } } steps = append(steps, []step{ saveClusterAdminAPICalls(ctx, ps, bp.fs, bp.p, adminAddresses, bp.partitions), From e779bf3bf01aabeca73b543afe12373e4bdf5fea Mon Sep 17 00:00:00 2001 From: r-vasquez Date: Mon, 10 Jun 2024 17:43:17 -0700 Subject: [PATCH 5/5] rpk: permission check for k8s bundle Now we want to check if the authenticated user account has authorization to collect the k8s resources needed for the debug bundle process. 
If not, we avoid running all the steps and instead provide a single, meaningful error message with a hint on how to solve this (link to our docs). --- .../pkg/cli/debug/bundle/bundle_k8s_linux.go | 65 +++++++++++++++++-- 1 file changed, 58 insertions(+), 7 deletions(-) diff --git a/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go b/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go index 93989c10050e..7a8aca8134e2 100644 --- a/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go +++ b/src/go/rpk/pkg/cli/debug/bundle/bundle_k8s_linux.go @@ -24,6 +24,8 @@ import ( "strings" "time" + authorizationv1 "k8s.io/api/authorization/v1" + "github.com/hashicorp/go-multierror" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/adminapi" "github.com/redpanda-data/redpanda/src/go/rpk/pkg/config" @@ -70,8 +72,6 @@ func executeK8SBundle(ctx context.Context, bp bundleParams) error { saveDataDirStructure(ps, bp.y), saveDiskUsage(ctx, ps, bp.y), saveInterrupts(ps), - saveK8SLogs(ctx, ps, bp.namespace, bp.logsSince, bp.logsLimitBytes, bp.labelSelector), - saveK8SResources(ctx, ps, bp.namespace, bp.labelSelector), saveKafkaMetadata(ctx, ps, bp.cl), saveKernelSymbols(ps), saveMdstat(ps), @@ -81,9 +81,25 @@ func executeK8SBundle(ctx context.Context, bp bundleParams) error { saveSlabInfo(ps), } - adminAddresses, err := adminAddressesFromK8S(ctx, bp.namespace) - if err != nil { - zap.L().Sugar().Debugf("unable to get admin API addresses from the k8s API: %v", err) + // We use the K8S to discover the cluster's admin API addresses and collect + // logs and k8s resources. First we check if we have enough permissions + // before kicking the steps. + var adminAddresses []string + if err := checkK8sPermissions(ctx, bp.namespace); err != nil { + errs = multierror.Append( + errs, + fmt.Errorf("skipping log collection and Kubernetes resource collection (such as Pods and Services) in the namespace %q. To enable this, grant additional permissions to your Service Account. 
For more information, visit https://docs.redpanda.com/current/manage/kubernetes/troubleshooting/k-diagnostics-bundle/", bp.namespace), + ) + } else { + steps = append(steps, []step{ + saveK8SResources(ctx, ps, bp.namespace, bp.labelSelector), + saveK8SLogs(ctx, ps, bp.namespace, bp.logsSince, bp.logsLimitBytes, bp.labelSelector), + }...) + + adminAddresses, err = adminAddressesFromK8S(ctx, bp.namespace) + if err != nil { + zap.L().Sugar().Debugf("unable to get admin API addresses from the k8s API: %v", err) + } } if len(adminAddresses) == 0 { if len(bp.p.AdminAPI.Addresses) > 0 { @@ -145,6 +161,41 @@ func k8sPodList(ctx context.Context, namespace string, labelSelector map[string] return clientset, pods, nil } +// checkK8sPermissions will check for the minimal service account permissions +// needed to perform the k8s-API-related steps in the debug bundle collection +// process. +func checkK8sPermissions(ctx context.Context, namespace string) error { + cl, err := k8sClientset() + if err != nil { + return fmt.Errorf("unable to create kubernetes client: %v", err) + } + + // These are the minimal permissions needed for the k8s bundle to function. + perMap := map[string]string{ + "services": "list", + "pods": "list", + } + for resource, verb := range perMap { + sar := &authorizationv1.SelfSubjectAccessReview{ + Spec: authorizationv1.SelfSubjectAccessReviewSpec{ + ResourceAttributes: &authorizationv1.ResourceAttributes{ + Namespace: namespace, + Verb: verb, + Resource: resource, + }, + }, + } + response, err := cl.AuthorizationV1().SelfSubjectAccessReviews().Create(ctx, sar, metav1.CreateOptions{}) + if err != nil { + return fmt.Errorf("unable to check service account permissions: %v", err) + } + if !response.Status.Allowed { + return fmt.Errorf("permission denied to %s %s", verb, resource) + } + } + return nil +} + // adminAddressesFromK8S returns the admin API host:port list by querying the // K8S Api. 
func adminAddressesFromK8S(ctx context.Context, namespace string) ([]string, error) { @@ -379,7 +430,7 @@ func saveK8SResources(ctx context.Context, ps *stepParams, namespace string, lab return func() error { clientset, pods, err := k8sPodList(ctx, namespace, labelSelector) if err != nil { - return err + return fmt.Errorf("unable to save k8s resources: unable to list k8s pods: %v", err) } // This is a safeguard, so we don't end up saving empty request for // namespace who don't have any pods. @@ -421,7 +472,7 @@ func saveK8SLogs(ctx context.Context, ps *stepParams, namespace, since string, l return func() error { clientset, pods, err := k8sPodList(ctx, namespace, labelSelector) if err != nil { - return err + return fmt.Errorf("unable to save logs: unable to list k8s pods: %v", err) } podsInterface := clientset.CoreV1().Pods(namespace)