Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix some diagnostic error handling (NetworkCheck and DiagnosticPod) #16848

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 31 additions & 9 deletions pkg/diagnostics/client/run_diagnostics_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@ package client
import (
"bufio"
"fmt"
"os"
"os/signal"
"regexp"
"strconv"
"syscall"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
Expand Down Expand Up @@ -51,11 +54,11 @@ func (d *DiagnosticPod) CanRun() (bool, error) {
// Check is part of the Diagnostic interface; it runs the actual diagnostic logic
func (d *DiagnosticPod) Check() types.DiagnosticResult {
r := types.NewDiagnosticResult("DiagnosticPod")
d.runDiagnosticPod(nil, r)
d.runDiagnosticPod(r)
return r
}

func (d *DiagnosticPod) runDiagnosticPod(service *kapi.Service, r types.DiagnosticResult) {
func (d *DiagnosticPod) runDiagnosticPod(r types.DiagnosticResult) {
loglevel := d.Level
if loglevel > 2 {
loglevel = 2 // need to show summary at least
Expand All @@ -78,14 +81,33 @@ func (d *DiagnosticPod) runDiagnosticPod(service *kapi.Service, r types.Diagnost
r.Error("DCli2001", err, fmt.Sprintf("Creating diagnostic pod with image %s failed. Error: (%[2]T) %[2]v", imageName, err))
return
}
defer func() { // delete what we created, or notify that we couldn't
zero := int64(0)
delOpts := metav1.DeleteOptions{TypeMeta: pod.TypeMeta, GracePeriodSeconds: &zero}
if err := d.KubeClient.Core().Pods(d.Namespace).Delete(pod.ObjectMeta.Name, &delOpts); err != nil {
r.Error("DCl2002", err, fmt.Sprintf("Deleting diagnostic pod '%s' failed. Error: %s", pod.ObjectMeta.Name, fmt.Sprintf("(%T) %[1]s", err)))
}

// Jump straight to cleanup if an interrupt/terminate signal arrives while the diagnostic is running
done := make(chan bool, 1)
sig := make(chan os.Signal, 1)
signal.Notify(sig, os.Interrupt, syscall.SIGTERM)
go func() {
<-sig
r.Warn("DCli2014", nil, "Interrupt received; aborting diagnostic.")
done <- true
}()
pod, err = d.KubeClient.Core().Pods(d.Namespace).Get(pod.ObjectMeta.Name, metav1.GetOptions{}) // status is filled in post-create
go func() {
d.processDiagnosticPodResults(pod, imageName, r)
done <- true
}()

<-done
signal.Stop(sig)
// delete what we created, or notify that we couldn't
zero := int64(0)
delOpts := metav1.DeleteOptions{TypeMeta: pod.TypeMeta, GracePeriodSeconds: &zero}
if err := d.KubeClient.Core().Pods(d.Namespace).Delete(pod.ObjectMeta.Name, &delOpts); err != nil {
r.Error("DCl2002", err, fmt.Sprintf("Deleting diagnostic pod '%s' failed. Error: %s", pod.ObjectMeta.Name, fmt.Sprintf("(%T) %[1]s", err)))
}
}

func (d *DiagnosticPod) processDiagnosticPodResults(protoPod *kapi.Pod, imageName string, r types.DiagnosticResult) {
pod, err := d.KubeClient.Core().Pods(d.Namespace).Get(protoPod.ObjectMeta.Name, metav1.GetOptions{}) // status is filled in post-create
if err != nil {
r.Error("DCli2003", err, fmt.Sprintf("Retrieving the diagnostic pod definition failed. Error: (%T) %[1]v", err))
return
Expand Down
31 changes: 18 additions & 13 deletions pkg/diagnostics/network/run_pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ func (d *NetworkDiagnostic) Check() types.DiagnosticResult {
return d.res
}
if !ok {
d.res.Warn("DNet2002", nil, "Skipping network diagnostics check. Reason: Not using openshift network plugin.")
d.res.Info("DNet2002", "Skipping network diagnostics check. Reason: Not using openshift network plugin.")
return d.res
}

Expand All @@ -99,22 +99,27 @@ func (d *NetworkDiagnostic) Check() types.DiagnosticResult {
return d.res
}

d.runNetworkDiagnostic()
// Abort and clean up if there is an interrupt/terminate signal while running network diagnostics
done := make(chan bool, 1)
sig := make(chan os.Signal, 1)
signal.Notify(sig, os.Interrupt, syscall.SIGTERM)
go func() {
<-sig
d.res.Warn("DNet2014", nil, "Interrupt received; aborting network diagnostic.")
done <- true
}()
go func() {
d.runNetworkDiagnostic()
done <- true
}()
<-done
signal.Stop(sig)
d.Cleanup()

return d.res
}

func (d *NetworkDiagnostic) runNetworkDiagnostic() {
// Do clean up if there is an interrupt/terminate signal while running network diagnostics
c := make(chan os.Signal, 2)
signal.Notify(c, os.Interrupt, syscall.SIGTERM)
go func() {
<-c
d.Cleanup()
}()

defer func() {
d.Cleanup()
}()
// Setup test environment
if err := d.TestSetup(); err != nil {
d.res.Error("DNet2005", err, fmt.Sprintf("Setting up test environment for network diagnostics failed: %v", err))
Expand Down