Health check the OVS process and restart if it dies #16742

Merged: openshift-merge-robot merged 3 commits into openshift:master from smarterclayton:healthcheck_ovs on Oct 10, 2017 (+231 −62)
The first file is new (100 lines added) and implements the health check:

```go
package node

import (
	"fmt"
	"time"

	"github.com/golang/glog"

	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	utilwait "k8s.io/apimachinery/pkg/util/wait"

	"github.com/openshift/origin/pkg/util/ovs/ovsclient"
)

const (
	ovsDialTimeout         = 5 * time.Second
	ovsHealthcheckInterval = 30 * time.Second
	ovsRecoveryTimeout     = 10 * time.Second
	ovsDialDefaultNetwork  = "unix"
	ovsDialDefaultAddress  = "/var/run/openvswitch/db.sock"
)

// waitForOVS polls until the OVS server responds to a connection and an 'echo'
// command.
func waitForOVS(network, addr string) error {
	return utilwait.PollImmediate(time.Second, time.Minute, func() (bool, error) {
		c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout)
		if err != nil {
			glog.V(2).Infof("waiting for OVS to start: %v", err)
			return false, nil
		}
		defer c.Close()
		if err := c.Ping(); err != nil {
			glog.V(2).Infof("waiting for OVS to start, ping failed: %v", err)
			return false, nil
		}
		return true, nil
	})
}

// runOVSHealthCheck runs two background loops - one that waits for disconnection
// from the OVS server and then checks healthFn, and one that periodically checks
// healthFn. If healthFn returns false in either of these two cases while the OVS
// server is responsive the node process will terminate.
func runOVSHealthCheck(network, addr string, healthFn func() bool) {
	// this loop holds an open socket connection to OVS until it times out, then
	// checks for health
	go utilwait.Until(func() {
		c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout)
		if err != nil {
			utilruntime.HandleError(fmt.Errorf("SDN healthcheck unable to connect to OVS server: %v", err))
			return
		}
		defer c.Close()

		err = c.WaitForDisconnect()
		utilruntime.HandleError(fmt.Errorf("SDN healthcheck disconnected from OVS server: %v", err))

		err = utilwait.PollImmediate(100*time.Millisecond, ovsRecoveryTimeout, func() (bool, error) {
			c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout)
			if err != nil {
				glog.V(2).Infof("SDN healthcheck unable to reconnect to OVS server: %v", err)
				return false, nil
			}
			defer c.Close()
			if err := c.Ping(); err != nil {
				glog.V(2).Infof("SDN healthcheck unable to ping OVS server: %v", err)
				return false, nil
			}
			if !healthFn() {
				return false, fmt.Errorf("OVS health check failed")
			}
			return true, nil
		})
		if err != nil {
			// If OVS restarts and our health check fails, we exit
			// TODO: make openshift-sdn able to reconcile without a restart
			glog.Fatalf("SDN healthcheck detected unhealthy OVS server, restarting: %v", err)
		}
	}, ovsDialTimeout, utilwait.NeverStop)

	// this loop periodically verifies we can still connect to the OVS server and
	// is an upper bound on the time we wait before detecting a failed OVS configuration
	go utilwait.Until(func() {
		c, err := ovsclient.DialTimeout(network, addr, ovsDialTimeout)
		if err != nil {
			glog.V(2).Infof("SDN healthcheck unable to reconnect to OVS server: %v", err)
			return
		}
		defer c.Close()
		if err := c.Ping(); err != nil {
			glog.V(2).Infof("SDN healthcheck unable to ping OVS server: %v", err)
			return
		}
		if !healthFn() {
			glog.Fatalf("SDN healthcheck detected unhealthy OVS server, restarting: %v", err)
		}
		glog.V(4).Infof("SDN healthcheck succeeded")
	}, ovsHealthcheckInterval, utilwait.NeverStop)
}
```
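Taken together, the two functions form a gate-then-watch pattern: block startup on waitForOVS, then hand failure detection to runOVSHealthCheck. Below is a minimal sketch of how a caller might wire them up; the function name startWithOVSHealthChecks and the healthy callback are hypothetical, not part of this PR's diff — only the two functions and the ovsDialDefault* constants come from the file above.

```go
// Hypothetical wiring, for illustration only; it assumes it lives in the same
// `node` package as the file above so the unexported names are visible.
func startWithOVSHealthChecks(healthy func() bool) error {
	// Gate: block until the OVS server accepts a connection and answers an
	// 'echo', giving up after the one-minute poll inside waitForOVS.
	if err := waitForOVS(ovsDialDefaultNetwork, ovsDialDefaultAddress); err != nil {
		return fmt.Errorf("OVS is not ready: %v", err)
	}

	// Watch: from here on, the two background loops own failure detection.
	// If OVS disconnects (or the 30s periodic probe fires) and `healthy` then
	// reports the node's state is invalid, the loops call glog.Fatalf and the
	// process is restarted by its supervisor.
	runOVSHealthCheck(ovsDialDefaultNetwork, ovsDialDefaultAddress, healthy)
	return nil
}
```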
The second file in the diff switches from the aliased `log` import of glog to the plain `glog` import throughout, and adds a startup check that the node's network plugin matches the master's:
```diff
@@ -13,9 +13,10 @@ import (
 	"sync"
 	"time"
 
-	log "github.com/golang/glog"
+	"github.com/golang/glog"
 	"github.com/vishvananda/netlink"
 
 	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/fields"
 	"k8s.io/apimachinery/pkg/labels"
```

A reviewer commented on this import hunk: "This makes it so that this commit doesn't compile... would be better to put the"

```diff
@@ -143,27 +144,27 @@ func New(c *OsdnNodeConfig) (network.NodeInterface, error) {
 	// we're ready yet
 	os.Remove(filepath.Join(cniDirPath, openshiftCNIFile))
 
-	log.Infof("Initializing SDN node of type %q with configured hostname %q (IP %q), iptables sync period %q", c.PluginName, c.Hostname, c.SelfIP, c.IPTablesSyncPeriod.String())
+	glog.Infof("Initializing SDN node of type %q with configured hostname %q (IP %q), iptables sync period %q", c.PluginName, c.Hostname, c.SelfIP, c.IPTablesSyncPeriod.String())
 	if c.Hostname == "" {
 		output, err := kexec.New().Command("uname", "-n").CombinedOutput()
 		if err != nil {
 			return nil, err
 		}
 		c.Hostname = strings.TrimSpace(string(output))
-		log.Infof("Resolved hostname to %q", c.Hostname)
+		glog.Infof("Resolved hostname to %q", c.Hostname)
 	}
 	if c.SelfIP == "" {
 		var err error
 		c.SelfIP, err = netutils.GetNodeIP(c.Hostname)
 		if err != nil {
-			log.V(5).Infof("Failed to determine node address from hostname %s; using default interface (%v)", c.Hostname, err)
+			glog.V(5).Infof("Failed to determine node address from hostname %s; using default interface (%v)", c.Hostname, err)
 			var defaultIP net.IP
 			defaultIP, err = kubeutilnet.ChooseHostInterface()
 			if err != nil {
 				return nil, err
 			}
 			c.SelfIP = defaultIP.String()
-			log.Infof("Resolved IP address to %q", c.SelfIP)
+			glog.Infof("Resolved IP address to %q", c.SelfIP)
 		}
 	}
 
@@ -226,10 +227,10 @@ func (node *OsdnNode) dockerPreCNICleanup() error {
 	// OpenShift-in-a-container case, so we work around that by sending
 	// the messages by hand.
 	if _, err := osexec.Command("dbus-send", "--system", "--print-reply", "--reply-timeout=2000", "--type=method_call", "--dest=org.freedesktop.systemd1", "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager.Reload").CombinedOutput(); err != nil {
-		log.Error(err)
+		glog.Error(err)
 	}
 	if _, err := osexec.Command("dbus-send", "--system", "--print-reply", "--reply-timeout=2000", "--type=method_call", "--dest=org.freedesktop.systemd1", "/org/freedesktop/systemd1", "org.freedesktop.systemd1.Manager.RestartUnit", "string:'docker.service' string:'replace'").CombinedOutput(); err != nil {
-		log.Error(err)
+		glog.Error(err)
 	}
 
 	// Delete pre-CNI interfaces
@@ -245,7 +246,7 @@ func (node *OsdnNode) dockerPreCNICleanup() error {
 		return err
 	}
 
-	log.Infof("Cleaned up left-over openshift-sdn docker bridge and interfaces")
+	glog.Infof("Cleaned up left-over openshift-sdn docker bridge and interfaces")
 
 	return nil
 }
@@ -288,16 +289,20 @@ func (node *OsdnNode) killUpdateFailedPods(pods []kapi.Pod) error {
 			return err
 		}
 
-		log.V(5).Infof("Killing pod '%s/%s' sandbox due to failed restart", pod.Namespace, pod.Name)
+		glog.V(5).Infof("Killing pod '%s/%s' sandbox due to failed restart", pod.Namespace, pod.Name)
 		if err := node.runtimeService.StopPodSandbox(sandboxID); err != nil {
-			log.Warningf("Failed to kill pod '%s/%s' sandbox: %v", pod.Namespace, pod.Name, err)
+			glog.Warningf("Failed to kill pod '%s/%s' sandbox: %v", pod.Namespace, pod.Name, err)
 		}
 	}
 	return nil
 }
 
 func (node *OsdnNode) Start() error {
-	log.V(2).Infof("Starting openshift-sdn network plugin")
+	glog.V(2).Infof("Starting openshift-sdn network plugin")
+
+	if err := validateNetworkPluginName(node.networkClient, node.policy.Name()); err != nil {
+		return fmt.Errorf("failed to validate network configuration: %v", err)
+	}
 
 	var err error
 	node.networkInfo, err = common.GetNetworkInfo(node.networkClient)
@@ -311,7 +316,7 @@ func (node *OsdnNode) Start() error {
 	}
 	if err := node.networkInfo.CheckHostNetworks(hostIPNets); err != nil {
 		// checkHostNetworks() errors *should* be fatal, but we didn't used to check this, and we can't break (mostly-)working nodes on upgrade.
-		log.Errorf("Local networks conflict with SDN; this will eventually cause problems: %v", err)
+		glog.Errorf("Local networks conflict with SDN; this will eventually cause problems: %v", err)
 	}
 
 	node.localSubnetCIDR, err = node.getLocalSubnet()
@@ -353,7 +358,7 @@ func (node *OsdnNode) Start() error {
 		node.watchServices()
 	}
 
-	log.V(2).Infof("Starting openshift-sdn pod manager")
+	glog.V(2).Infof("Starting openshift-sdn pod manager")
 	if err := node.podManager.Start(cniserver.CNIServerSocketPath, node.localSubnetCIDR, node.networkInfo.ClusterNetworks); err != nil {
 		return err
 	}
@@ -371,7 +376,7 @@ func (node *OsdnNode) Start() error {
 			continue
 		}
 		if err := node.UpdatePod(p); err != nil {
-			log.Warningf("will restart pod '%s/%s' due to update failure on restart: %s", p.Namespace, p.Name, err)
+			glog.Warningf("will restart pod '%s/%s' due to update failure on restart: %s", p.Namespace, p.Name, err)
 			podsToKill = append(podsToKill, p)
 		} else if vnid, err := node.policy.GetVNID(p.Namespace); err == nil {
 			node.policy.EnsureVNIDRules(vnid)
@@ -382,7 +387,7 @@ func (node *OsdnNode) Start() error {
 		// we'll be able to set them up correctly
 		if len(podsToKill) > 0 {
 			if err := node.killUpdateFailedPods(podsToKill); err != nil {
-				log.Warningf("failed to restart pods that failed to update at startup: %v", err)
+				glog.Warningf("failed to restart pods that failed to update at startup: %v", err)
 			}
 		}
 	}
@@ -396,7 +401,7 @@ func (node *OsdnNode) Start() error {
 		gatherPeriodicMetrics(node.oc.ovs)
 	}, time.Minute*2)
 
-	log.V(2).Infof("openshift-sdn network plugin ready")
+	glog.V(2).Infof("openshift-sdn network plugin ready")
 
 	// Write our CNI config file out to disk to signal to kubelet that
 	// our network plugin is ready
@@ -479,7 +484,7 @@ func (node *OsdnNode) handleAddOrUpdateService(obj, oldObj interface{}, eventTyp
 		return
 	}
 
-	log.V(5).Infof("Watch %s event for Service %q", eventType, serv.Name)
+	glog.V(5).Infof("Watch %s event for Service %q", eventType, serv.Name)
 	oldServ, exists := oldObj.(*kapi.Service)
 	if exists {
 		if !isServiceChanged(oldServ, serv) {
@@ -490,7 +495,7 @@ func (node *OsdnNode) handleAddOrUpdateService(obj, oldObj interface{}, eventTyp
 
 	netid, err := node.policy.GetVNID(serv.Namespace)
 	if err != nil {
-		log.Errorf("Skipped adding service rules for serviceEvent: %v, Error: %v", eventType, err)
+		glog.Errorf("Skipped adding service rules for serviceEvent: %v, Error: %v", eventType, err)
 		return
 	}
 
@@ -500,6 +505,21 @@ func (node *OsdnNode) handleAddOrUpdateService(obj, oldObj interface{}, eventTyp
 
 func (node *OsdnNode) handleDeleteService(obj interface{}) {
 	serv := obj.(*kapi.Service)
-	log.V(5).Infof("Watch %s event for Service %q", watch.Deleted, serv.Name)
+	glog.V(5).Infof("Watch %s event for Service %q", watch.Deleted, serv.Name)
 	node.DeleteServiceRules(serv)
 }
+
+func validateNetworkPluginName(networkClient networkclient.Interface, pluginName string) error {
+	// Detect any plugin mismatches between node and master
+	clusterNetwork, err := networkClient.Network().ClusterNetworks().Get(networkapi.ClusterNetworkDefault, metav1.GetOptions{})
+	switch {
+	case errors.IsNotFound(err):
+		return fmt.Errorf("master has not created a default cluster network, network plugin %q can not start", pluginName)
+	case err != nil:
+		return fmt.Errorf("cannot fetch %q cluster network: %v", networkapi.ClusterNetworkDefault, err)
+	}
+	if clusterNetwork.PluginName != strings.ToLower(pluginName) {
+		return fmt.Errorf("detected network plugin mismatch between OpenShift node(%q) and master(%q)", pluginName, clusterNetwork.PluginName)
+	}
+	return nil
+}
```
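The practical effect of validateNetworkPluginName is fail-fast startup: a node whose configured plugin disagrees with the master's recorded ClusterNetwork now refuses to start instead of running with mismatched semantics. Here is a stripped-down, standalone sketch of the same decision logic; the lookup is replaced by plain parameters, and the plugin-name strings are illustrative examples, not values taken from this diff.

```go
package main

import (
	"fmt"
	"strings"
)

// validatePlugin mirrors the switch statement in validateNetworkPluginName,
// with the ClusterNetworks().Get call abstracted into its possible outcomes.
func validatePlugin(masterPlugin string, notFound bool, lookupErr error, nodePlugin string) error {
	switch {
	case notFound:
		return fmt.Errorf("master has not created a default cluster network, network plugin %q can not start", nodePlugin)
	case lookupErr != nil:
		return fmt.Errorf("cannot fetch default cluster network: %v", lookupErr)
	}
	// Note the one-way ToLower: the node's configured name is lowercased
	// before comparison, while the master's recorded name is trusted as-is.
	if masterPlugin != strings.ToLower(nodePlugin) {
		return fmt.Errorf("detected network plugin mismatch between OpenShift node(%q) and master(%q)", nodePlugin, masterPlugin)
	}
	return nil
}

func main() {
	// Matching plugins: nil error, the node starts.
	fmt.Println(validatePlugin("example/ovs-subnet", false, nil, "example/ovs-subnet"))
	// Mismatch: the node fails fast with a descriptive error.
	fmt.Println(validatePlugin("example/ovs-multitenant", false, nil, "example/ovs-subnet"))
}
```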
Review discussion on the long-lived healthcheck connection:

Reviewer: "I don't know anything about the OVS raw protocol, but if it eventually times out idle connections then this might result in spurious errors in the logs."

Author: "It's possible to configure OVS to do timeouts, but at least ootb on our deployed systems it does not. I also have the 5s disconnect. I think if in practice we see this error showing up we would increase the timeout on connections significantly and still be ok. It's effectively a deadman switch (and it works really well at it)."
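The "deadman switch" the author describes is simply a connection held open for no purpose other than detecting when the peer goes away: nothing is ever read in normal operation, so a returning Read means the server died or closed the socket. Below is a generic, self-contained sketch of that pattern using only the standard net package; the socket path matches the constant above, but everything else is illustrative and not the PR's ovsclient API.

```go
package main

import (
	"log"
	"net"
	"time"
)

// watchPeer holds a connection to addr open and blocks until the peer closes
// it or the transport fails. Assuming the peer never sends unsolicited data,
// a return from Read can only mean "the other side is gone".
func watchPeer(network, addr string) error {
	c, err := net.DialTimeout(network, addr, 5*time.Second)
	if err != nil {
		return err
	}
	defer c.Close()
	buf := make([]byte, 1)
	_, err = c.Read(buf) // parks here for the life of the peer
	return err
}

func main() {
	for {
		err := watchPeer("unix", "/var/run/openvswitch/db.sock")
		log.Printf("peer connection ended: %v; recheck health before deciding to exit", err)
		time.Sleep(time.Second) // avoid a tight reconnect loop
	}
}
```

This framing also explains why the idle-timeout concern matters: if the server ever starts reaping idle clients, the switch trips without the peer actually dying — exactly the "spurious errors" scenario — and the author's mitigation is to raise the connection timeout rather than abandon the pattern.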