Merge pull request #16742 from smarterclayton/healthcheck_ovs
Automatic merge from submit-queue (batch tested with PRs 16737, 16638, 16742, 16765, 16711).

Health check the OVS process and restart if it dies

Reorganize the existing setup code to perform a periodic background check on the state of the OVS database. If the SDN setup is lost, force the node/network processes to restart. Use the JSONRPC endpoint to perform a few simple status checks and detect failure quickly. This reuses our existing health check code, which does not appear to cause a performance issue when run periodically.
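
For reference, the status probe amounts to a JSONRPC `echo` against the ovsdb-server unix socket. The snippet below is a minimal standalone sketch of that probe, assuming the default socket path; the request framing follows RFC 7047, and `pingOVSDB` is an illustrative helper, not the actual `ovsclient` API:

```
// Minimal sketch of a JSONRPC "echo" probe against ovsdb-server (RFC 7047).
// The real implementation lives in pkg/util/ovs/ovsclient; this is for
// illustration only.
package main

import (
	"bufio"
	"encoding/json"
	"fmt"
	"net"
	"time"
)

func pingOVSDB(network, addr string) error {
	conn, err := net.DialTimeout(network, addr, 5*time.Second)
	if err != nil {
		return fmt.Errorf("dial failed: %v", err)
	}
	defer conn.Close()

	// OVSDB speaks JSON-RPC 1.0; "echo" must return its params verbatim.
	req := map[string]interface{}{"method": "echo", "params": []string{"ping"}, "id": "echo"}
	if err := json.NewEncoder(conn).Encode(req); err != nil {
		return err
	}
	conn.SetReadDeadline(time.Now().Add(5 * time.Second))
	var resp struct {
		Result []string    `json:"result"`
		Error  interface{} `json:"error"`
	}
	if err := json.NewDecoder(bufio.NewReader(conn)).Decode(&resp); err != nil {
		return err
	}
	if resp.Error != nil {
		return fmt.Errorf("echo returned error: %v", resp.Error)
	}
	return nil
}

func main() {
	if err := pingOVSDB("unix", "/var/run/openvswitch/db.sock"); err != nil {
		fmt.Println("OVS unhealthy:", err)
		return
	}
	fmt.Println("OVS healthy")
}
```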

Node waiting for OVS to start:

```
I1008 06:41:25.661293   11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory
I1008 06:41:26.690356   11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory
I1008 06:41:27.653112   11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory
I1008 06:41:28.671950   11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory
I1008 06:41:29.653713   11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory
W1008 06:41:30.285617   11598 cni.go:189] Unable to update cni config: No networks found in /etc/cni/net.d
E1008 06:41:30.286780   11598 kubelet.go:2093] Container runtime network not ready: NetworkReady=false reason:NetworkPluginNotReady message:docker: network plugin is not ready: cni config uninitialized
I1008 06:41:30.661441   11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory
I1008 06:41:31.653232   11598 healthcheck.go:27] waiting for OVS to start: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory
I1008 06:41:32.674697   11598 sdn_controller.go:180] [SDN setup] full SDN setup required
```

Let the node start, then stop OVS; the node detects the failure immediately:

```
I1008 06:41:40.208239   11598 kubelet_node_status.go:433] Recording NodeReady event message for node localhost.localdomain
I1008 06:41:43.076299   11598 nodecontroller.go:770] NodeController detected that some Nodes are Ready. Exiting master disruption mode.
E1008 06:41:50.941351   11598 healthcheck.go:55] SDN healthcheck disconnected from OVS server: <nil>
I1008 06:41:50.941541   11598 healthcheck.go:60] SDN healthcheck unable to reconnect to OVS server: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory
I1008 06:41:51.045661   11598 healthcheck.go:60] SDN healthcheck unable to reconnect to OVS server: dial unix /var/run/openvswitch/db.sock: connect: no such file or directory
F1008 06:41:51.148105   11598 healthcheck.go:76] SDN healthcheck detected unhealthy OVS server, restarting: OVS health check failed
```

Fixes #16630

@openshift/sig-networking
openshift-merge-robot committed Oct 10, 2017
2 parents f8bbb7a + 572d44b commit 16e9703
Showing 6 changed files with 231 additions and 62 deletions.
3 changes: 2 additions & 1 deletion pkg/cmd/server/kubernetes/network/network.go
@@ -35,8 +35,9 @@ func (c *NetworkConfig) RunSDN() {
if c.SDNNode == nil {
return
}

if err := c.SDNNode.Start(); err != nil {
glog.Fatalf("error: SDN node startup failed: %v", err)
glog.Fatalf("SDN node startup failed: %v", err)
}
}

30 changes: 0 additions & 30 deletions pkg/cmd/server/kubernetes/network/network_config.go
@@ -3,14 +3,11 @@ package network
import (
"fmt"
"net"
"strings"

"github.com/golang/glog"

miekgdns "github.com/miekg/dns"

kerrs "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kclientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/pkg/apis/componentconfig"
kclientsetexternal "k8s.io/kubernetes/pkg/client/clientset_generated/clientset"
@@ -19,7 +16,6 @@ import (
configapi "github.com/openshift/origin/pkg/cmd/server/api"
"github.com/openshift/origin/pkg/dns"
"github.com/openshift/origin/pkg/network"
networkapi "github.com/openshift/origin/pkg/network/apis/network"
networkclient "github.com/openshift/origin/pkg/network/generated/internalclientset"
)

@@ -66,10 +62,6 @@ func New(options configapi.NodeConfig, clusterDomain string, proxyConfig *compon
return nil, err
}

if err = validateNetworkPluginName(networkClient, options.NetworkConfig.NetworkPluginName); err != nil {
return nil, err
}

internalKubeInformers := kinternalinformers.NewSharedInformerFactory(internalKubeClient, proxyConfig.ConfigSyncPeriod.Duration)

var sdnNode network.NodeInterface
@@ -146,25 +138,3 @@ func New(options configapi.NodeConfig, clusterDomain string, proxyConfig *compon

return config, nil
}

func validateNetworkPluginName(networkClient networkclient.Interface, pluginName string) error {
if network.IsOpenShiftNetworkPlugin(pluginName) {
// Detect any plugin mismatches between node and master
clusterNetwork, err := networkClient.Network().ClusterNetworks().Get(networkapi.ClusterNetworkDefault, metav1.GetOptions{})
if kerrs.IsNotFound(err) {
return fmt.Errorf("master has not created a default cluster network, network plugin %q can not start", pluginName)
} else if err != nil {
return fmt.Errorf("cannot fetch %q cluster network: %v", networkapi.ClusterNetworkDefault, err)
}

if clusterNetwork.PluginName != strings.ToLower(pluginName) {
if len(clusterNetwork.PluginName) != 0 {
return fmt.Errorf("detected network plugin mismatch between OpenShift node(%q) and master(%q)", pluginName, clusterNetwork.PluginName)
} else {
// Do not return error in this case
glog.Warningf(`either there is network plugin mismatch between OpenShift node(%q) and master or OpenShift master is running an older version where we did not persist plugin name`, pluginName)
}
}
}
return nil
}
100 changes: 100 additions & 0 deletions pkg/network/node/healthcheck.go
@@ -0,0 +1,100 @@
package node

import (
"fmt"
"time"

"github.com/golang/glog"

utilruntime "k8s.io/apimachinery/pkg/util/runtime"
utilwait "k8s.io/apimachinery/pkg/util/wait"

"github.com/openshift/origin/pkg/util/ovs/ovsclient"
)

const (
ovsDialTimeout = 5 * time.Second
ovsHealthcheckInterval = 30 * time.Second
ovsRecoveryTimeout = 10 * time.Second
ovsDialDefaultNetwork = "unix"
ovsDialDefaultAddress = "/var/run/openvswitch/db.sock"
)

// waitForOVS polls until the OVS server responds to a connection and an 'echo'
// command.
func waitForOVS(network, addr string) error {
return utilwait.PollImmediate(time.Second, time.Minute, func() (bool, error) {
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout)
if err != nil {
glog.V(2).Infof("waiting for OVS to start: %v", err)
return false, nil
}
defer c.Close()
if err := c.Ping(); err != nil {
glog.V(2).Infof("waiting for OVS to start, ping failed: %v", err)
return false, nil
}
return true, nil
})
}

// runOVSHealthCheck runs two background loops - one that waits for disconnection
// from the OVS server and then checks healthFn, and one that periodically checks
// healthFn. If healthFn returns false in either of these two cases while the OVS
// server is responsive the node process will terminate.
func runOVSHealthCheck(network, addr string, healthFn func() bool) {
// this loop holds an open socket connection to OVS until it times out, then
// checks for health
go utilwait.Until(func() {
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout)
if err != nil {
utilruntime.HandleError(fmt.Errorf("SDN healthcheck unable to connect to OVS server: %v", err))
return
}
defer c.Close()

err = c.WaitForDisconnect()
utilruntime.HandleError(fmt.Errorf("SDN healthcheck disconnected from OVS server: %v", err))

err = utilwait.PollImmediate(100*time.Millisecond, ovsRecoveryTimeout, func() (bool, error) {
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout)
if err != nil {
glog.V(2).Infof("SDN healthcheck unable to reconnect to OVS server: %v", err)
return false, nil
}
defer c.Close()
if err := c.Ping(); err != nil {
glog.V(2).Infof("SDN healthcheck unable to ping OVS server: %v", err)
return false, nil
}
if !healthFn() {
return false, fmt.Errorf("OVS health check failed")
}
return true, nil
})
if err != nil {
// If OVS restarts and our health check fails, we exit
// TODO: make openshift-sdn able to reconcile without a restart
glog.Fatalf("SDN healthcheck detected unhealthy OVS server, restarting: %v", err)
}
}, ovsDialTimeout, utilwait.NeverStop)

// this loop periodically verifies we can still connect to the OVS server and
// is an upper bound on the time we wait before detecting a failed OVS configuration
go utilwait.Until(func() {
c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout)
if err != nil {
glog.V(2).Infof("SDN healthcheck unable to reconnect to OVS server: %v", err)
return
}
defer c.Close()
if err := c.Ping(); err != nil {
glog.V(2).Infof("SDN healthcheck unable to ping OVS server: %v", err)
return
}
if !healthFn() {
glog.Fatalf("SDN healthcheck detected unhealthy OVS server, restarting: %v", err)
}
glog.V(4).Infof("SDN healthcheck succeeded")
}, ovsHealthcheckInterval, utilwait.NeverStop)
}
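
Taken together, a node would block startup on `waitForOVS` and then arm `runOVSHealthCheck` with a callback that verifies the SDN state is still programmed. The wiring below is a hypothetical sketch, assuming the process runs under a supervisor (e.g. systemd) that restarts it after `glog.Fatalf`; `alreadySetUp` is a stand-in for the plugin's real check and is not taken from this diff:

```
// Hypothetical wiring inside package node, for illustration only.
// alreadySetUp should report whether the OVS bridge/flows this node
// depends on are still in place.
func startNodeWithHealthCheck(alreadySetUp func() bool) error {
	// Block startup until ovsdb-server answers on its default unix socket.
	if err := waitForOVS(ovsDialDefaultNetwork, ovsDialDefaultAddress); err != nil {
		return err
	}
	// From here on, two background loops watch OVS: one holds a connection
	// open and re-probes on disconnect, one polls every 30 seconds. Either
	// terminates the process if OVS answers but alreadySetUp() returns false.
	runOVSHealthCheck(ovsDialDefaultNetwork, ovsDialDefaultAddress, alreadySetUp)
	return nil
}
```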
58 changes: 39 additions & 19 deletions pkg/network/node/node.go
@@ -13,9 +13,10 @@ import (
"sync"
"time"

log "github.com/golang/glog"
"github.com/golang/glog"
"github.com/vishvananda/netlink"

"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
@@ -143,27 +144,27 @@ func New(c *OsdnNodeConfig) (network.NodeInterface, error) {
// we're ready yet
os.Remove(filepath.Join(cniDirPath, openshiftCNIFile))

log.Infof("Initializing SDN node of type %q with configured hostname %q (IP %q), iptables sync period %q", c.PluginName, c.Hostname, c.SelfIP, c.IPTablesSyncPeriod.String())
glog.Infof("Initializing SDN node of type %q with configured hostname %q (IP %q), iptables sync period %q", c.PluginName, c.Hostname, c.SelfIP, c.IPTablesSyncPeriod.String())
if c.Hostname == "" {
output, err := kexec.New().Command("uname", "-n").CombinedOutput()
if err != nil {
return nil, err
}
c.Hostname = strings.TrimSpace(string(output))
log.Infof("Resolved hostname to %q", c.Hostname)
glog.Infof("Resolved hostname to %q", c.Hostname)
}
if c.SelfIP == "" {
var err error
c.SelfIP, err = netutils.GetNodeIP(c.Hostname)
if err != nil {
log.V(5).Infof("Failed to determine node address from hostname %s; using default interface (%v)", c.Hostname, err)
glog.V(5).Infof("Failed to determine node address from hostname %s; using default interface (%v)", c.Hostname, err)
var defaultIP net.IP
defaultIP, err = kubeutilnet.ChooseHostInterface()
if err != nil {
return nil, err
}
c.SelfIP = defaultIP.String()
log.Infof("Resolved IP address to %q", c.SelfIP)
glog.Infof("Resolved IP address to %q", c.SelfIP)
}
}

@@ -226,10 +227,10 @@ func (node *OsdnNode) dockerPreCNICleanup() error {
// OpenShift-in-a-container case, so we work around that by sending
// the messages by hand.
if _, err := osexec.Command("dbus-send", "--system", "--print-reply", "--reply-timeout=2000", "--type=method_call", "--dest=org.freedesktop.systemd1", "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager.Reload").CombinedOutput(); err != nil {
log.Error(err)
glog.Error(err)
}
if _, err := osexec.Command("dbus-send", "--system", "--print-reply", "--reply-timeout=2000", "--type=method_call", "--dest=org.freedesktop.systemd1", "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager.RestartUnit", "string:'docker.service' string:'replace'").CombinedOutput(); err != nil {
log.Error(err)
glog.Error(err)
}

// Delete pre-CNI interfaces
@@ -245,7 +246,7 @@
return err
}

log.Infof("Cleaned up left-over openshift-sdn docker bridge and interfaces")
glog.Infof("Cleaned up left-over openshift-sdn docker bridge and interfaces")

return nil
}
@@ -288,16 +289,20 @@ func (node *OsdnNode) killUpdateFailedPods(pods []kapi.Pod) error {
return err
}

log.V(5).Infof("Killing pod '%s/%s' sandbox due to failed restart", pod.Namespace, pod.Name)
glog.V(5).Infof("Killing pod '%s/%s' sandbox due to failed restart", pod.Namespace, pod.Name)
if err := node.runtimeService.StopPodSandbox(sandboxID); err != nil {
log.Warningf("Failed to kill pod '%s/%s' sandbox: %v", pod.Namespace, pod.Name, err)
glog.Warningf("Failed to kill pod '%s/%s' sandbox: %v", pod.Namespace, pod.Name, err)
}
}
return nil
}

func (node *OsdnNode) Start() error {
log.V(2).Infof("Starting openshift-sdn network plugin")
glog.V(2).Infof("Starting openshift-sdn network plugin")

if err := validateNetworkPluginName(node.networkClient, node.policy.Name()); err != nil {
return fmt.Errorf("failed to validate network configuration: %v", err)
}

var err error
node.networkInfo, err = common.GetNetworkInfo(node.networkClient)
@@ -311,7 +316,7 @@ }
}
if err := node.networkInfo.CheckHostNetworks(hostIPNets); err != nil {
// checkHostNetworks() errors *should* be fatal, but we didn't used to check this, and we can't break (mostly-)working nodes on upgrade.
log.Errorf("Local networks conflict with SDN; this will eventually cause problems: %v", err)
glog.Errorf("Local networks conflict with SDN; this will eventually cause problems: %v", err)
}

node.localSubnetCIDR, err = node.getLocalSubnet()
@@ -353,7 +358,7 @@ func (node *OsdnNode) Start() error {
node.watchServices()
}

log.V(2).Infof("Starting openshift-sdn pod manager")
glog.V(2).Infof("Starting openshift-sdn pod manager")
if err := node.podManager.Start(cniserver.CNIServerSocketPath, node.localSubnetCIDR, node.networkInfo.ClusterNetworks); err != nil {
return err
}
@@ -371,7 +376,7 @@ func (node *OsdnNode) Start() error {
continue
}
if err := node.UpdatePod(p); err != nil {
log.Warningf("will restart pod '%s/%s' due to update failure on restart: %s", p.Namespace, p.Name, err)
glog.Warningf("will restart pod '%s/%s' due to update failure on restart: %s", p.Namespace, p.Name, err)
podsToKill = append(podsToKill, p)
} else if vnid, err := node.policy.GetVNID(p.Namespace); err == nil {
node.policy.EnsureVNIDRules(vnid)
@@ -382,7 +387,7 @@ func (node *OsdnNode) Start() error {
// we'll be able to set them up correctly
if len(podsToKill) > 0 {
if err := node.killUpdateFailedPods(podsToKill); err != nil {
log.Warningf("failed to restart pods that failed to update at startup: %v", err)
glog.Warningf("failed to restart pods that failed to update at startup: %v", err)
}
}
}
@@ -396,7 +401,7 @@ func (node *OsdnNode) Start() error {
gatherPeriodicMetrics(node.oc.ovs)
}, time.Minute*2)

log.V(2).Infof("openshift-sdn network plugin ready")
glog.V(2).Infof("openshift-sdn network plugin ready")

// Write our CNI config file out to disk to signal to kubelet that
// our network plugin is ready
@@ -479,7 +484,7 @@ func (node *OsdnNode) handleAddOrUpdateService(obj, oldObj interface{}, eventTyp
return
}

log.V(5).Infof("Watch %s event for Service %q", eventType, serv.Name)
glog.V(5).Infof("Watch %s event for Service %q", eventType, serv.Name)
oldServ, exists := oldObj.(*kapi.Service)
if exists {
if !isServiceChanged(oldServ, serv) {
@@ -490,7 +495,7 @@ func (node *OsdnNode) handleAddOrUpdateService(obj, oldObj interface{}, eventTyp

netid, err := node.policy.GetVNID(serv.Namespace)
if err != nil {
log.Errorf("Skipped adding service rules for serviceEvent: %v, Error: %v", eventType, err)
glog.Errorf("Skipped adding service rules for serviceEvent: %v, Error: %v", eventType, err)
return
}

@@ -500,6 +505,21 @@ func (node *OsdnNode) handleAddOrUpdateService(obj, oldObj interface{}, eventTyp

func (node *OsdnNode) handleDeleteService(obj interface{}) {
serv := obj.(*kapi.Service)
log.V(5).Infof("Watch %s event for Service %q", watch.Deleted, serv.Name)
glog.V(5).Infof("Watch %s event for Service %q", watch.Deleted, serv.Name)
node.DeleteServiceRules(serv)
}

func validateNetworkPluginName(networkClient networkclient.Interface, pluginName string) error {
// Detect any plugin mismatches between node and master
clusterNetwork, err := networkClient.Network().ClusterNetworks().Get(networkapi.ClusterNetworkDefault, metav1.GetOptions{})
switch {
case errors.IsNotFound(err):
return fmt.Errorf("master has not created a default cluster network, network plugin %q can not start", pluginName)
case err != nil:
return fmt.Errorf("cannot fetch %q cluster network: %v", networkapi.ClusterNetworkDefault, err)
}
if clusterNetwork.PluginName != strings.ToLower(pluginName) {
return fmt.Errorf("detected network plugin mismatch between OpenShift node(%q) and master(%q)", pluginName, clusterNetwork.PluginName)
}
return nil
}
