diff --git a/pkg/pinger/config.go b/pkg/pinger/config.go index a39dab1d85a..b2f6e1395ae 100644 --- a/pkg/pinger/config.go +++ b/pkg/pinger/config.go @@ -21,6 +21,7 @@ type Configuration struct { DaemonSetName string Interval int Mode string + ExitCode int InternalDNS string ExternalDNS string NodeName string @@ -54,6 +55,7 @@ func ParseFlags() (*Configuration, error) { argDaemonSetName = pflag.String("ds-name", "kube-ovn-pinger", "kube-ovn-pinger daemonset name") argInterval = pflag.Int("interval", 5, "interval seconds between consecutive pings") argMode = pflag.String("mode", "server", "server or job Mode") + argExitCode = pflag.Int("exit-code", 0, "exit code when failure happens") argInternalDns = pflag.String("internal-dns", "kubernetes.default", "check dns from pod") argExternalDns = pflag.String("external-dns", "alauda.cn", "check external dns resolve from pod") argExternalAddress = pflag.String("external-address", "114.114.114.114", "check ping connection to an external address, default empty that will disable external check") @@ -101,6 +103,7 @@ func ParseFlags() (*Configuration, error) { DaemonSetName: *argDaemonSetName, Interval: *argInterval, Mode: *argMode, + ExitCode: *argExitCode, InternalDNS: *argInternalDns, ExternalDNS: *argExternalDns, PodIP: os.Getenv("POD_IP"), @@ -136,10 +139,10 @@ func ParseFlags() (*Configuration, error) { } for _, arg := range ds.Spec.Template.Spec.Containers[0].Command { arg = strings.Trim(arg, "\"") - if strings.HasPrefix(arg, "--external-address=") { + if config.ExternalAddress == "114.114.114.114" && strings.HasPrefix(arg, "--external-address=") { config.ExternalAddress = strings.TrimPrefix(arg, "--external-address=") } - if strings.HasPrefix(arg, "--external-dns=") { + if config.ExternalDNS == "alauda.cn" && strings.HasPrefix(arg, "--external-dns=") { config.ExternalDNS = strings.TrimPrefix(arg, "--external-dns=") } } diff --git a/pkg/pinger/ovn.go b/pkg/pinger/ovn.go index 82649ceceb1..a286469bc69 100644 --- 
a/pkg/pinger/ovn.go +++ b/pkg/pinger/ovn.go @@ -9,39 +9,40 @@ import ( "strings" ) -func checkOvs(config *Configuration) { +func checkOvs(config *Configuration) error { output, err := exec.Command("/usr/share/openvswitch/scripts/ovs-ctl", "status").CombinedOutput() if err != nil { klog.Errorf("check ovs status failed %v, %s", err, string(output)) SetOvsDownMetrics(config.NodeName) - return + return err } klog.Infof("ovs-vswitchd and ovsdb are up") SetOvsUpMetrics(config.NodeName) - return + return nil } -func checkOvnController(config *Configuration) { +func checkOvnController(config *Configuration) error { output, err := exec.Command("/usr/share/ovn/scripts/ovn-ctl", "status_controller").CombinedOutput() if err != nil { klog.Errorf("check ovn_controller status failed %v, %q", err, output) SetOvnControllerDownMetrics(config.NodeName) - return + return err } klog.Infof("ovn_controller is up") SetOvnControllerUpMetrics(config.NodeName) + return nil } -func checkPortBindings(config *Configuration) { +func checkPortBindings(config *Configuration) error { klog.Infof("start to check port binding") ovsBindings, err := checkOvsBindings() if err != nil { - return + return err } sbBindings, err := checkSBBindings(config) if err != nil { - return + return err } klog.Infof("port in sb is %v", sbBindings) misMatch := []string{} @@ -53,11 +54,12 @@ func checkPortBindings(config *Configuration) { if len(misMatch) > 0 { klog.Errorf("%d port %v not exist in sb-bindings", len(misMatch), misMatch) inconsistentPortBindingGauge.WithLabelValues(config.NodeName).Set(float64(len(misMatch))) + return fmt.Errorf("%d port %v not exist in sb-bindings", len(misMatch), misMatch) } else { klog.Infof("ovs and ovn-sb binding check passed") inconsistentPortBindingGauge.WithLabelValues(config.NodeName).Set(0) } - return + return nil } func checkOvsBindings() ([]string, error) { diff --git a/pkg/pinger/ping.go b/pkg/pinger/ping.go index 783a7f1e7ad..266cf16eda9 100644 --- a/pkg/pinger/ping.go +++ 
b/pkg/pinger/ping.go @@ -2,8 +2,10 @@ package pinger import ( "context" + "fmt" "math" "net" + "os" "time" goping "github.com/oilbeater/go-ping" @@ -14,44 +16,77 @@ import ( ) func StartPinger(config *Configuration, e *Exporter) { + errHappens := false for { if config.NetworkMode == "kube-ovn" { - checkOvs(config) - checkOvnController(config) - checkPortBindings(config) + // Run every check even after an earlier one fails, so each one still refreshes its metrics. + if checkOvs(config) != nil { + errHappens = true + } + if checkOvnController(config) != nil { + errHappens = true + } + if checkPortBindings(config) != nil { + errHappens = true + } e.ovsMetricsUpdate() } - ping(config) + if ping(config) != nil { + errHappens = true + } if config.Mode != "server" { break } time.Sleep(time.Duration(config.Interval) * time.Second) } + if errHappens && config.ExitCode != 0 { + os.Exit(config.ExitCode) + } } -func ping(config *Configuration) { - checkApiServer(config) - pingNodes(config) - pingPods(config) - internalNslookup(config) +func ping(config *Configuration) error { + errHappens := false + // Do not short-circuit with ||: each probe must run so its own metrics are updated. + if checkApiServer(config) != nil { + errHappens = true + } + if pingNodes(config) != nil { + errHappens = true + } + if pingPods(config) != nil { + errHappens = true + } + if internalNslookup(config) != nil { + errHappens = true + } if config.ExternalDNS != "" { - externalNslookup(config) + if externalNslookup(config) != nil { + errHappens = true + } } if config.ExternalAddress != "" { - pingExternal(config) + if pingExternal(config) != nil { + errHappens = true + } } + if errHappens { + return fmt.Errorf("ping failed") + } + return nil } -func pingNodes(config *Configuration) { +func pingNodes(config *Configuration) error { klog.Infof("start to check node connectivity") nodes, err := config.KubeClient.CoreV1().Nodes().List(metav1.ListOptions{}) if err != nil { klog.Errorf("failed to list nodes, %v", err) - return + return err } + + var pingErr error for _, no := range nodes.Items { for _, addr := range no.Status.Addresses { if addr.Type == v1.NodeInternalIP { @@ -59,6 +94,7 @@ func pingNodes(config *Configuration) { pinger, err := goping.NewPinger(nodeIP) if err != nil { klog.Errorf("failed to init 
pinger, %v", err) + pingErr = err return } pinger.SetPrivileged(true) @@ -70,6 +106,9 @@ func pingNodes(config *Configuration) { stats := pinger.Statistics() klog.Infof("ping node: %s %s, count: %d, loss count %d, average rtt %.2fms", nodeName, nodeIP, pinger.Count, int(math.Abs(float64(stats.PacketsSent-stats.PacketsRecv))), float64(stats.AvgRtt)/float64(time.Millisecond)) + if int(math.Abs(float64(stats.PacketsSent-stats.PacketsRecv))) != 0 { + pingErr = fmt.Errorf("ping failed") + } SetNodePingMetrics( config.NodeName, config.HostIP, @@ -81,27 +120,30 @@ func pingNodes(config *Configuration) { } } } + return pingErr } -func pingPods(config *Configuration) { +func pingPods(config *Configuration) error { klog.Infof("start to check pod connectivity") ds, err := config.KubeClient.AppsV1().DaemonSets(config.DaemonSetNamespace).Get(config.DaemonSetName, metav1.GetOptions{}) if err != nil { klog.Errorf("failed to get peer ds: %v", err) - return + return err } pods, err := config.KubeClient.CoreV1().Pods(config.DaemonSetNamespace).List(metav1.ListOptions{LabelSelector: labels.Set(ds.Spec.Selector.MatchLabels).String()}) if err != nil { klog.Errorf("failed to list peer pods: %v", err) - return + return err } + var pingErr error for _, pod := range pods.Items { if pod.Status.PodIP != "" { func(podIp, podName, nodeIP, nodeName string) { pinger, err := goping.NewPinger(podIp) if err != nil { klog.Errorf("failed to init pinger, %v", err) + pingErr = err return } pinger.SetPrivileged(true) @@ -113,6 +155,9 @@ func pingPods(config *Configuration) { stats := pinger.Statistics() klog.Infof("ping pod: %s %s, count: %d, loss count %d, average rtt %.2fms", podName, podIp, pinger.Count, int(math.Abs(float64(stats.PacketsSent-stats.PacketsRecv))), float64(stats.AvgRtt)/float64(time.Millisecond)) + if int(math.Abs(float64(stats.PacketsSent-stats.PacketsRecv))) != 0 { + pingErr = fmt.Errorf("ping failed") + } SetPodPingMetrics( config.NodeName, config.HostIP, @@ -125,17 +170,18 @@ func 
pingPods(config *Configuration) { }(pod.Status.PodIP, pod.Name, pod.Status.HostIP, pod.Spec.NodeName) } } + return pingErr } -func pingExternal(config *Configuration) { +func pingExternal(config *Configuration) error { if config.ExternalAddress == "" { - return + return nil } klog.Infof("start to check ping external to %s", config.ExternalAddress) pinger, err := goping.NewPinger(config.ExternalAddress) if err != nil { klog.Errorf("failed to init pinger, %v", err) - return + return err } pinger.SetPrivileged(true) pinger.Timeout = 5 * time.Second @@ -153,9 +199,13 @@ func pingExternal(config *Configuration) { config.ExternalAddress, float64(stats.AvgRtt)/float64(time.Millisecond), int(math.Abs(float64(stats.PacketsSent-stats.PacketsRecv)))) + if int(math.Abs(float64(stats.PacketsSent-stats.PacketsRecv))) != 0 { + return fmt.Errorf("ping failed") + } + return nil } -func internalNslookup(config *Configuration) { +func internalNslookup(config *Configuration) error { klog.Infof("start to check dns connectivity") t1 := time.Now() ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second) @@ -166,13 +216,14 @@ func internalNslookup(config *Configuration) { if err != nil { klog.Errorf("failed to resolve dns %s, %v", config.InternalDNS, err) SetInternalDnsUnhealthyMetrics(config.NodeName) - return + return err } SetInternalDnsHealthyMetrics(config.NodeName, float64(elpased)/float64(time.Millisecond)) klog.Infof("resolve dns %s to %v in %.2fms", config.InternalDNS, addrs, float64(elpased)/float64(time.Millisecond)) + return nil } -func externalNslookup(config *Configuration) { +func externalNslookup(config *Configuration) error { klog.Infof("start to check dns connectivity") t1 := time.Now() ctx, cancel := context.WithTimeout(context.TODO(), 10*time.Second) @@ -183,13 +234,14 @@ func externalNslookup(config *Configuration) { if err != nil { klog.Errorf("failed to resolve dns %s, %v", config.ExternalDNS, err) SetExternalDnsUnhealthyMetrics(config.NodeName) - return + 
return err } SetExternalDnsHealthyMetrics(config.NodeName, float64(elpased)/float64(time.Millisecond)) klog.Infof("resolve dns %s to %v in %.2fms", config.ExternalDNS, addrs, float64(elpased)/float64(time.Millisecond)) + return nil } -func checkApiServer(config *Configuration) { +func checkApiServer(config *Configuration) error { klog.Infof("start to check apiserver connectivity") t1 := time.Now() _, err := config.KubeClient.Discovery().ServerVersion() @@ -197,9 +249,9 @@ func checkApiServer(config *Configuration) { if err != nil { klog.Errorf("failed to connect to apiserver: %v", err) SetApiserverUnhealthyMetrics(config.NodeName) - return + return err } klog.Infof("connect to apiserver success in %.2fms", float64(elpased)/float64(time.Millisecond)) SetApiserverHealthyMetrics(config.NodeName, float64(elpased)/float64(time.Millisecond)) - return + return nil }