Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: [sc-110727] troubleshoot: collector/analyzer for wildcard dns #1606

Merged
merged 6 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions config/crds/troubleshoot.sh_collectors.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,10 @@ spec:
type: string
exclude:
type: BoolString
image:
type: string
nonResolvable:
type: string
timeout:
type: string
type: object
Expand Down
4 changes: 4 additions & 0 deletions config/crds/troubleshoot.sh_preflights.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2027,6 +2027,10 @@ spec:
type: string
exclude:
type: BoolString
image:
type: string
nonResolvable:
type: string
timeout:
type: string
type: object
Expand Down
4 changes: 4 additions & 0 deletions config/crds/troubleshoot.sh_supportbundles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2058,6 +2058,10 @@ spec:
type: string
exclude:
type: BoolString
image:
type: string
nonResolvable:
type: string
timeout:
type: string
type: object
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/troubleshoot/v1beta2/collector_shared.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,8 @@ type Sonobuoy struct {
// DNS is the collector spec consumed by the DNS troubleshooting collector.
// All fields are optional and omitted from JSON/YAML when empty.
type DNS struct {
CollectorMeta `json:",inline" yaml:",inline"`
// Timeout is carried as a string; how it is parsed and applied is not
// visible here — NOTE(review): presumably a duration such as "30s", confirm.
Timeout string `json:"timeout,omitempty" yaml:"timeout,omitempty"`
// Image is the container image for the DNS troubleshooting pod. When empty,
// CollectDNS falls back to its built-in default image.
Image string `json:"image,omitempty" yaml:"image,omitempty"`
// NonResolvable is the domain name queried as the "non-resolvable" lookup.
// When empty, CollectDNS falls back to its built-in default domain.
NonResolvable string `json:"nonResolvable,omitempty" yaml:"nonResolvable,omitempty"`
}

type Etcd struct {
Expand Down
117 changes: 103 additions & 14 deletions pkg/collect/dns.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
package collect

import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"path/filepath"
"strings"
"time"

Expand All @@ -21,7 +21,8 @@ import (
)

const (
// dnsUtilsImage is the image CollectDNS runs the DNS troubleshooting pod with
// when the collector spec does not set Image.
dnsUtilsImage = "registry.k8s.io/e2e-test-images/jessie-dnsutils:1.3"
dnsUtilsImage = "registry.k8s.io/e2e-test-images/agnhost:2.39"
// nonResolvableDomain is the domain queried for the non-resolvable lookup
// when the collector spec does not set NonResolvable.
nonResolvableDomain = "non-existent-domain"
)

type CollectDNS struct {
Expand All @@ -34,6 +35,25 @@ type CollectDNS struct {
RBACErrors
}

// DNSTroubleshootResult represents the structure of the DNS troubleshooting JSON data.
// CollectDNS fills it in while collecting and saves it, JSON-encoded, as
// dns/debug.json in the bundle.
type DNSTroubleshootResult struct {
// KubernetesClusterIP is the ClusterIP of the kubernetes service as read
// from the API server; left empty when that lookup fails.
KubernetesClusterIP string `json:"kubernetesClusterIP"`
// PodResolvConf holds the lines the troubleshooting pod printed after its
// "=== /etc/resolv.conf ===" marker, each terminated by a newline.
PodResolvConf string `json:"podResolvConf"`
// Query holds the results of the dig lookups parsed from the pod log.
Query struct {
// Kubernetes is the lookup of the "kubernetes" name.
Kubernetes struct {
Name string `json:"name"`
Address string `json:"address"`
} `json:"kubernetes"`
// NonResolvableDomain is the lookup of the configured non-resolvable
// domain. Name is the domain that was queried; Address is whatever dig
// printed for it.
NonResolvableDomain struct {
Name string `json:"name"`
Address string `json:"address"`
} `json:"nonResolvableDomain"`
} `json:"query"`
// KubeDNSPods is the running kube-dns pod list, obtained by splitting the
// ", "-separated string returned by getRunningKubeDNSPodNames.
KubeDNSPods []string `json:"kubeDNSPods"`
// KubeDNSService is the value returned by getKubeDNSServiceClusterIP.
KubeDNSService string `json:"kubeDNSService"`
// KubeDNSEndpoints is the ", "-joined endpoint list returned by getKubeDNSEndpoints.
KubeDNSEndpoints string `json:"kubeDNSEndpoints"`
}

// Title returns the name that getCollectorName produces for this collector.
func (c *CollectDNS) Title() string {
	title := getCollectorName(c)
	return title
}
Expand All @@ -48,32 +68,57 @@ func (c *CollectDNS) Collect(progressChan chan<- interface{}) (CollectorResult,
defer cancel()

sb := strings.Builder{}
dnsDebug := DNSTroubleshootResult{}

// get kubernetes Cluster IP
clusterIP, err := getKubernetesClusterIP(c.Client, ctx)
if err == nil {
sb.WriteString(fmt.Sprintf("=== Kubernetes Cluster IP from API Server: %s\n", clusterIP))
dnsDebug.KubernetesClusterIP = clusterIP
} else {
sb.WriteString(fmt.Sprintf("=== Failed to detect Kubernetes Cluster IP: %v\n", err))
}

// run a pod and perform DNS lookup
podLog, err := troubleshootDNSFromPod(c.Client, ctx)
testDomain := c.Collector.NonResolvable
if testDomain == "" {
testDomain = nonResolvableDomain
}
dnsDebug.Query.NonResolvableDomain.Name = testDomain

image := c.Collector.Image
if image == "" {
image = dnsUtilsImage
}

podLog, err := troubleshootDNSFromPod(c.Client, ctx, testDomain, image)
if err == nil {
sb.WriteString(fmt.Sprintf("=== Test DNS resolution in pod %s: \n", dnsUtilsImage))
sb.WriteString(fmt.Sprintf("=== Test DNS resolution in pod %s: \n", image))
sb.WriteString(podLog)
} else {
sb.WriteString(fmt.Sprintf("=== Failed to run commands from pod: %v\n", err))
}

// extract DNS queries from pod log
err = extractDNSQueriesFromPodLog(podLog, &dnsDebug)
if err != nil {
sb.WriteString(fmt.Sprintf("=== Failed to extract DNS queries from pod log: %v\n", err))
}

// is DNS pods running?
sb.WriteString(fmt.Sprintf("=== Running kube-dns pods: %s\n", getRunningKubeDNSPodNames(c.Client, ctx)))
kubeDNSPods := getRunningKubeDNSPodNames(c.Client, ctx)
sb.WriteString(fmt.Sprintf("=== Running kube-dns pods: %s\n", kubeDNSPods))
dnsDebug.KubeDNSPods = strings.Split(kubeDNSPods, ", ")

// is DNS service up?
sb.WriteString(fmt.Sprintf("=== Running kube-dns service: %s\n", getKubeDNSServiceClusterIP(c.Client, ctx)))
kubeDNSService := getKubeDNSServiceClusterIP(c.Client, ctx)
sb.WriteString(fmt.Sprintf("=== Running kube-dns service: %s\n", kubeDNSService))
dnsDebug.KubeDNSService = kubeDNSService

// are DNS endpoints exposed?
sb.WriteString(fmt.Sprintf("=== kube-dns endpoints: %s\n", getKubeDNSEndpoints(c.Client, ctx)))
kubeDNSEndpoints := getKubeDNSEndpoints(c.Client, ctx)
sb.WriteString(fmt.Sprintf("=== kube-dns endpoints: %s\n", kubeDNSEndpoints))
dnsDebug.KubeDNSEndpoints = kubeDNSEndpoints

// get DNS server config
coreDNSConfig, err := getCoreDNSConfig(c.Client, ctx)
Expand All @@ -89,7 +134,16 @@ func (c *CollectDNS) Collect(progressChan chan<- interface{}) (CollectorResult,

data := sb.String()
output := NewResult()
output.SaveResult(c.BundlePath, filepath.Join("dns", c.Collector.CollectorName), bytes.NewBuffer([]byte(data)))

// save raw debug output
output.SaveResult(c.BundlePath, "dns/debug.txt", bytes.NewBuffer([]byte(data)))

// save structured debug output as JSON file
jsonData, err := json.Marshal(dnsDebug)
if err != nil {
return output, errors.Wrap(err, "failed to marshal DNS troubleshooting data")
}
output.SaveResult(c.BundlePath, "dns/debug.json", bytes.NewBuffer(jsonData))

return output, nil
}
Expand All @@ -104,14 +158,17 @@ func getKubernetesClusterIP(client kubernetes.Interface, ctx context.Context) (s
return service.Spec.ClusterIP, nil
}

func troubleshootDNSFromPod(client kubernetes.Interface, ctx context.Context) (string, error) {
func troubleshootDNSFromPod(client kubernetes.Interface, ctx context.Context, nonResolvableDomain string, image string) (string, error) {
namespace := "default"
command := []string{"/bin/sh", "-c", `
set -x
command := []string{"/bin/sh", "-c", fmt.Sprintf(`
Copy link
Member

@banjoh banjoh Sep 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why you switched from nslookup to dig? People will need to find an image that has dig installed, which is an additional hurdle to jump. Images such as busybox and alpine package nslookup. Embedded Cluster uses busybox which, if not overridden, will be present in the airgap bundle

It might be slightly more code, but you can attempt dig and if the binary is not present, try nslookup. The collector will be more robust this way.

Copy link
Member Author

@nvanthao nvanthao Sep 4, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The nslookup command in busybox is not working correctly

E.g.

k run -it --rm debug --image busybox -- /bin/sh
If you don't see a command prompt, try pressing enter.
/ #
/ # nslookup kubernetes.default
Server:         10.43.0.10
Address:        10.43.0.10:53

** server can't find kubernetes.default: NXDOMAIN

I've changed to dig simply because the base image has dig installed, we only want to resolve an address, and the dig +short output is easier to parse.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please add this as a comment in the code for future context on the technical choice?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

many thanks @banjoh! I'll update the docs accordingly as well.

echo "=== /etc/resolv.conf ==="
cat /etc/resolv.conf
nslookup -debug kubernetes
echo "=== dig kubernetes ==="
dig +search +short kubernetes
echo "=== dig non-existent-domain ==="
dig +short %s
exit 0
`}
`, nonResolvableDomain)}

// TODO: image pull secret?
podLabels := map[string]string{
Expand All @@ -127,7 +184,7 @@ func troubleshootDNSFromPod(client kubernetes.Interface, ctx context.Context) (s
Containers: []corev1.Container{
{
Name: "troubleshoot-dns",
Image: dnsUtilsImage,
Image: image,
Command: command,
},
},
Expand Down Expand Up @@ -271,3 +328,35 @@ func getKubeDNSEndpoints(client kubernetes.Interface, ctx context.Context) strin

return strings.Join(endpointStrings, ", ")
}

func extractDNSQueriesFromPodLog(podLog string, dnsDebug *DNSTroubleshootResult) error {
nvanthao marked this conversation as resolved.
Show resolved Hide resolved
scanner := bufio.NewScanner(strings.NewReader(podLog))

var currentSection string

for scanner.Scan() {
line := scanner.Text()

switch {
case strings.Contains(line, "=== /etc/resolv.conf ==="):
currentSection = "podResolvConf"
case strings.Contains(line, "=== dig kubernetes ==="):
currentSection = "kubernetes"
case strings.Contains(line, "=== dig non-existent-domain ==="):
currentSection = "nonResolvableDomain"
default:
switch currentSection {
case "podResolvConf":
dnsDebug.PodResolvConf += line + "\n"
case "kubernetes":
dnsDebug.Query.Kubernetes.Name = "kubernetes"
dnsDebug.Query.Kubernetes.Address = line
case "nonResolvableDomain":
dnsDebug.Query.NonResolvableDomain.Address = line
}
}
}

return nil

}
41 changes: 41 additions & 0 deletions pkg/collect/dns_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"testing"

"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/kubernetes/fake"
Expand Down Expand Up @@ -39,3 +40,43 @@ func TestGetKubernetesClusterIP(t *testing.T) {
t.Errorf("expected %s, got %s", k8sSvcIp, clusterIP)
}
}

// TestExtractDNSQueriesFromPodLog feeds a sample pod log through
// extractDNSQueriesFromPodLog and checks the parsed resolv.conf and both
// query results.
func TestExtractDNSQueriesFromPodLog(t *testing.T) {
	// queryResult is an alias (not a new named type) so that values of it
	// have exactly the same type as the anonymous query structs inside
	// DNSTroubleshootResult and compare equal under assert.Equal.
	type queryResult = struct {
		Name    string `json:"name"`
		Address string `json:"address"`
	}

	const sampleLog = `
=== /etc/resolv.conf ===
search default.svc.cluster.local svc.cluster.local cluster.local
nameserver 10.43.0.10
options ndots:5
=== dig kubernetes ===
10.43.0.1
=== dig non-existent-domain ===`

	const wantResolvConf = `search default.svc.cluster.local svc.cluster.local cluster.local
nameserver 10.43.0.10
options ndots:5
`

	wantKubernetes := queryResult{Name: "kubernetes", Address: "10.43.0.1"}
	// The sample log has no output after the non-existent-domain marker, so
	// both fields are expected to stay at their zero value.
	wantNonResolvable := queryResult{Name: "", Address: ""}

	got := &DNSTroubleshootResult{}
	parseErr := extractDNSQueriesFromPodLog(sampleLog, got)
	assert.NoError(t, parseErr)

	assert.Equal(t, wantResolvConf, got.PodResolvConf)
	assert.Equal(t, wantKubernetes, got.Query.Kubernetes)
	assert.Equal(t, wantNonResolvable, got.Query.NonResolvableDomain)
}
6 changes: 6 additions & 0 deletions schemas/collector-troubleshoot-v1beta2.json
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,12 @@
"exclude": {
"oneOf": [{"type": "string"},{"type": "boolean"}]
},
"image": {
"type": "string"
},
"nonResolvable": {
"type": "string"
},
"timeout": {
"type": "string"
}
Expand Down
6 changes: 6 additions & 0 deletions schemas/preflight-troubleshoot-v1beta2.json
Original file line number Diff line number Diff line change
Expand Up @@ -3063,6 +3063,12 @@
"exclude": {
"oneOf": [{"type": "string"},{"type": "boolean"}]
},
"image": {
"type": "string"
},
"nonResolvable": {
"type": "string"
},
"timeout": {
"type": "string"
}
Expand Down
6 changes: 6 additions & 0 deletions schemas/supportbundle-troubleshoot-v1beta2.json
Original file line number Diff line number Diff line change
Expand Up @@ -3109,6 +3109,12 @@
"exclude": {
"oneOf": [{"type": "string"},{"type": "boolean"}]
},
"image": {
"type": "string"
},
"nonResolvable": {
"type": "string"
},
"timeout": {
"type": "string"
}
Expand Down
Loading