-
Notifications
You must be signed in to change notification settings - Fork 4.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #16742 from smarterclayton/healthcheck_ovs
Automatic merge from submit-queue (batch tested with PRs 16737, 16638, 16742, 16765, 16711). Health check the OVS process and restart if it dies Reorganize the existing setup code to perform a periodic background check on the state of the OVS database. If the SDN setup is lost, force the node/network processes to restart. Use the JSONRPC endpoint to perform a few simple checks of status, and detect failure quickly. This reuses our existing health check code, which does not appear to be a performance issue when checked periodically. Node waiting for OVS to start: ``` I1008 06:41:25.661293 11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory I1008 06:41:26.690356 11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory I1008 06:41:27.653112 11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory I1008 06:41:28.671950 11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory I1008 06:41:29.653713 11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory W1008 06:41:30.285617 11598 cni.go:189] Unable to update cni config: No networks found in /etc/cni/net.d E1008 06:41:30.286780 11598 kubelet.go:2093] Container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:docker: network plugin is not ready: cni config uninitialized I1008 06:41:30.661441 11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory I1008 06:41:31.653232 11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory I1008 06:41:32.674697 11598 sdn_controller.go:180] [SDN setup] full SDN setup 
required ``` Let node start, then stop OVS, node detects immediately ``` I1008 06:41:40.208239 11598 kubelet_node_status.go:433] Recording NodeReady event message for node localhost.localdomain I1008 06:41:43.076299 11598 nodecontroller.go:770] NodeController detected that some Nodes are Ready. Exiting master disruption mode. E1008 06:41:50.941351 11598 healthcheck.go:55] SDN healthcheck disconnected from OVS server: <nil> I1008 06:41:50.941541 11598 healthcheck.go:60] SDN healthcheck unable to reconnect to OVS server: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory I1008 06:41:51.045661 11598 healthcheck.go:60] SDN healthcheck unable to reconnect to OVS server: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory F1008 06:41:51.148105 11598 healthcheck.go:76] SDN healthcheck detected unhealthy OVS server, restarting: OVS health check failed ``` Fixes #16630 @openshift/sig-networking
- Loading branch information
Showing
6 changed files
with
231 additions
and
62 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
package node | ||
|
||
import ( | ||
"fmt" | ||
"time" | ||
|
||
"github.com/golang/glog" | ||
|
||
utilruntime "k8s.io/apimachinery/pkg/util/runtime" | ||
utilwait "k8s.io/apimachinery/pkg/util/wait" | ||
|
||
"github.com/openshift/origin/pkg/util/ovs/ovsclient" | ||
) | ||
|
||
// Timing and default-address settings used when talking to the OVS database. | ||
const ( | ||
// ovsDialTimeout is the timeout passed to ovsclient.DialTimeout. It is also | ||
// used as the pause between runs of the disconnect-watching loop in | ||
// runOVSHealthCheck. | ||
ovsDialTimeout = 5 * time.Second | ||
// ovsHealthcheckInterval is the period of the periodic health check loop in | ||
// runOVSHealthCheck. | ||
ovsHealthcheckInterval = 30 * time.Second | ||
// ovsRecoveryTimeout bounds how long runOVSHealthCheck keeps polling for OVS | ||
// to come back after a disconnect before the process is terminated. | ||
ovsRecoveryTimeout = 10 * time.Second | ||
// ovsDialDefaultNetwork and ovsDialDefaultAddress are not referenced in the | ||
// visible part of this file; presumably they are the network/addr values | ||
// callers pass to waitForOVS and runOVSHealthCheck - TODO confirm. | ||
ovsDialDefaultNetwork = "unix" | ||
ovsDialDefaultAddress = "/var/run/openvswitch/db.sock" | ||
) | ||
|
||
// waitForOVS polls until the OVS server responds to a connection and an 'echo' | ||
// command. | ||
func waitForOVS(network, addr string) error { | ||
return utilwait.PollImmediate(time.Second, time.Minute, func() (bool, error) { | ||
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout) | ||
if err != nil { | ||
glog.V(2).Infof("waiting for OVS to start: %v", err) | ||
return false, nil | ||
} | ||
defer c.Close() | ||
if err := c.Ping(); err != nil { | ||
glog.V(2).Infof("waiting for OVS to start, ping failed: %v", err) | ||
return false, nil | ||
} | ||
return true, nil | ||
}) | ||
} | ||
|
||
// runOVSHealthCheck runs two background loops - one that waits for disconnection | ||
// from the OVS server and then checks healthFn, and one that periodically checks | ||
// healthFn. If healthFn returns false in either of these two cases while the OVS | ||
// server is responsive the node process will terminate. | ||
func runOVSHealthCheck(network, addr string, healthFn func() bool) { | ||
// this loop holds an open socket connection to OVS until it times out, then | ||
// checks for health | ||
go utilwait.Until(func() { | ||
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout) | ||
if err != nil { | ||
utilruntime.HandleError(fmt.Errorf("SDN healthcheck unable to connect to OVS server: %v", err)) | ||
return | ||
} | ||
defer c.Close() | ||
|
||
err = c.WaitForDisconnect() | ||
utilruntime.HandleError(fmt.Errorf("SDN healthcheck disconnected from OVS server: %v", err)) | ||
|
||
err = utilwait.PollImmediate(100*time.Millisecond, ovsRecoveryTimeout, func() (bool, error) { | ||
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout) | ||
if err != nil { | ||
glog.V(2).Infof("SDN healthcheck unable to reconnect to OVS server: %v", err) | ||
return false, nil | ||
} | ||
defer c.Close() | ||
if err := c.Ping(); err != nil { | ||
glog.V(2).Infof("SDN healthcheck unable to ping OVS server: %v", err) | ||
return false, nil | ||
} | ||
if !healthFn() { | ||
return false, fmt.Errorf("OVS health check failed") | ||
} | ||
return true, nil | ||
}) | ||
if err != nil { | ||
// If OVS restarts and our health check fails, we exit | ||
// TODO: make openshift-sdn able to reconcile without a restart | ||
glog.Fatalf("SDN healthcheck detected unhealthy OVS server, restarting: %v", err) | ||
} | ||
}, ovsDialTimeout, utilwait.NeverStop) | ||
|
||
// this loop periodically verifies we can still connect to the OVS server and | ||
// is an upper bound on the time we wait before detecting a failed OVS configuartion | ||
go utilwait.Until(func() { | ||
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout) | ||
if err != nil { | ||
glog.V(2).Infof("SDN healthcheck unable to reconnect to OVS server: %v", err) | ||
return | ||
} | ||
defer c.Close() | ||
if err := c.Ping(); err != nil { | ||
glog.V(2).Infof("SDN healthcheck unable to ping OVS server: %v", err) | ||
return | ||
} | ||
if !healthFn() { | ||
glog.Fatalf("SDN healthcheck detected unhealthy OVS server, restarting: %v", err) | ||
} | ||
glog.V(4).Infof("SDN healthcheck succeeded") | ||
}, ovsHealthcheckInterval, utilwait.NeverStop) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.