Skip to content

Commit

Permalink
Try several times to contact kube-api before failing
Browse files Browse the repository at this point in the history
Signed-off-by: Manuel Buil <mbuil@suse.com>
  • Loading branch information
manuelbuil committed May 24, 2024
1 parent bf59847 commit 4331f3d
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 33 deletions.
19 changes: 9 additions & 10 deletions pkg/backend/hostgw/hostgw_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,11 +161,11 @@ func (be *HostgwBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup
// Wait for the network to populate Management IP
log.Infof("Waiting to get ManagementIP from HNSNetwork %s", networkName)
var newNetworkID = newNetwork.Id
waitErr = wait.Poll(500*time.Millisecond, 30*time.Second, func() (done bool, err error) {
waitErr := wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) {
newNetwork, lastErr = hcsshim.HNSNetworkRequest("GET", newNetworkID, "")
return newNetwork != nil && len(newNetwork.ManagementIP) != 0, nil
})
if waitErr == wait.ErrWaitTimeout {
if waitErr != nil {
// Do not swallow the root cause
if lastErr != nil {
waitErr = lastErr
Expand All @@ -179,12 +179,11 @@ func (be *HostgwBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup
if err != nil {
return nil, errors.Wrapf(err, "Failed to parse management ip (%s)", newNetwork.ManagementIP)
}

waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) {
waitErr = wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) {
_, lastErr = ip.GetInterfaceByIP(managementIP.ToIP())
return lastErr == nil, nil
})
if waitErr == wait.ErrWaitTimeout {
if waitErr != nil {
return nil, errors.Wrapf(lastErr, "timeout, failed to get net interface for HNSNetwork %s (%s)", networkName, newNetwork.ManagementIP)
}

Expand Down Expand Up @@ -227,19 +226,19 @@ func (be *HostgwBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup

// Wait for the bridgeEndpoint to attach to the host
log.Infof("Waiting to attach bridge endpoint %s to host", bridgeEndpointName)
waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) {
waitErr = wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) {
lastErr = expectedBridgeEndpoint.HostAttach(1)
if lastErr == nil {
return true, nil
return false, nil
}
// See https://github.com/flannel-io/flannel/issues/1391 and
// hcsshim lacks some validations to detect the error, so we judge it by error message.
if strings.Contains(lastErr.Error(), "This endpoint is already attached to the switch.") {
return true, nil
return false, nil
}
return false, nil
return true, nil
})
if waitErr == wait.ErrWaitTimeout {
if waitErr != nil {
return nil, errors.Wrapf(lastErr, "failed to hot attach bridge HNSEndpoint %s to host compartment", bridgeEndpointName)
}
log.Infof("Attached bridge endpoint %s to host successfully", bridgeEndpointName)
Expand Down
23 changes: 12 additions & 11 deletions pkg/backend/vxlan/device_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package vxlan

import (
"context"
"encoding/json"
"fmt"
"time"
Expand Down Expand Up @@ -46,7 +47,7 @@ type NetAdapterNameSettings struct {
NetworkAdapterName string `json:"NetworkAdapterName"`
}

func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
func newVXLANDevice(ctx context.Context, devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
subnet := createSubnet(devAttrs.addressPrefix.String(), (devAttrs.addressPrefix.IP + 1).String(), "0.0.0.0/0")
network := &hcn.HostComputeNetwork{
Type: "Overlay",
Expand Down Expand Up @@ -90,7 +91,7 @@ func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
addNetAdapterName(network, devAttrs.interfaceName)
}

hnsNetwork, err := ensureNetwork(network, devAttrs.addressPrefix.String())
hnsNetwork, err := ensureNetwork(ctx, network, devAttrs.addressPrefix.String())
if err != nil {
return nil, err
}
Expand All @@ -100,7 +101,7 @@ func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
}, nil
}

func ensureNetwork(expectedNetwork *hcn.HostComputeNetwork, expectedAddressPrefix string) (*hcn.HostComputeNetwork, error) {
func ensureNetwork(ctx context.Context, expectedNetwork *hcn.HostComputeNetwork, expectedAddressPrefix string) (*hcn.HostComputeNetwork, error) {
createNetwork := true
networkName := expectedNetwork.Name

Expand Down Expand Up @@ -134,19 +135,19 @@ func ensureNetwork(expectedNetwork *hcn.HostComputeNetwork, expectedAddressPrefi
// Wait for the network to populate Management IP
log.Infof("Waiting to get ManagementIP from HostComputeNetwork %s", networkName)
var newNetworkID = newNetwork.Id
waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) {
waitErr = wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) {
newNetwork, lastErr = hcn.GetNetworkByID(newNetworkID)
return newNetwork != nil && len(getManagementIP(newNetwork)) != 0, nil
})
if waitErr == wait.ErrWaitTimeout {
if waitErr != nil {
// Do not swallow the root cause
if lastErr != nil {
waitErr = lastErr
}
return nil, errors.Wrapf(lastErr, "timeout, failed to get management IP from HostComputeNetwork %s", networkName)
}

err = checkHostNetworkReady(newNetwork)
err = checkHostNetworkReady(ctx, newNetwork)
if err != nil {
return nil, errors.Wrapf(err, "Interface bound to %s took too long to get ready. Please check your network host configuration", networkName)
}
Expand Down Expand Up @@ -227,7 +228,7 @@ func addNetAdapterName(network *hcn.HostComputeNetwork, netAdapterName string) e
}

// checkHostNetworkReady waits for the host network to be ready: the main interface must be up and have an IP address
func checkHostNetworkReady(network *hcn.HostComputeNetwork) error {
func checkHostNetworkReady(ctx context.Context, network *hcn.HostComputeNetwork) error {
managementIP := getManagementIP(network)
// Wait for the interface with the management IP
log.Infof("Waiting to get net interface for HostComputeNetwork %s (%s)", network.Name, managementIP)
Expand All @@ -236,16 +237,16 @@ func checkHostNetworkReady(network *hcn.HostComputeNetwork) error {
return errors.Wrapf(err, "Failed to parse management ip (%s)", managementIP)
}

waitErr := wait.Poll(3*time.Second, 25*time.Second, func() (done bool, err error) {
waitErr := wait.PollUntilContextTimeout(ctx, 3*time.Second, 25*time.Second, true, func(context.Context) (done bool, err error) {
iface, lastErr := ip.GetInterfaceByIP(managementIPv4.ToIP())
if lastErr == nil {
log.V(2).Infof("Host interface: %s bound by %s ready", iface.Name, network.Name)
return true, nil
return false, nil
}
log.V(2).Infof("Host interface bound by %s not ready", network.Name)
return false, nil
return true, nil
})
if waitErr == wait.ErrWaitTimeout {
if waitErr != nil {
return errors.Wrapf(waitErr, "timeout, failed to get net interface for HostComputeNetwork %s (%s)", network.Name, managementIP)
}
return nil
Expand Down
4 changes: 2 additions & 2 deletions pkg/backend/vxlan/vxlan_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ func (be *VXLANBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup,
interfaceName: be.extIface.Iface.Name,
}

dev, err := newVXLANDevice(&devAttrs)
dev, err := newVXLANDevice(ctx, &devAttrs)
if err != nil {
return nil, fmt.Errorf("failed to create VXLAN network: %w", err)
}
Expand Down Expand Up @@ -191,7 +191,7 @@ func (be *VXLANBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup,
}

// Before contacting the lease server (e.g. kube-api), we verify that the physical interface is ready
err = checkHostNetworkReady(hcnNetwork)
err = checkHostNetworkReady(ctx, hcnNetwork)
if err != nil {
return nil, fmt.Errorf("interface bound to %s took too long to get ready. Please check your network host configuration", hcnNetwork.Name)
}
Expand Down
34 changes: 24 additions & 10 deletions pkg/subnet/kube/kube.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,13 +283,20 @@ func (ksm *kubeSubnetManager) GetNetworkConfig(ctx context.Context) (*subnet.Con
func (ksm *kubeSubnetManager) AcquireLease(ctx context.Context, attrs *lease.LeaseAttrs) (*lease.Lease, error) {
var cachedNode *v1.Node
var err error
if ksm.disableNodeInformer {
cachedNode, err = ksm.client.CoreV1().Nodes().Get(ctx, ksm.nodeName, metav1.GetOptions{ResourceVersion: "0"})
} else {
cachedNode, err = ksm.nodeStore.Get(ksm.nodeName)
}
if err != nil {
return nil, err
waitErr := wait.PollUntilContextTimeout(ctx, 3*time.Second, 30*time.Second, true, func(context.Context) (done bool, err error) {
if ksm.disableNodeInformer {
cachedNode, err = ksm.client.CoreV1().Nodes().Get(ctx, ksm.nodeName, metav1.GetOptions{ResourceVersion: "0"})
} else {
cachedNode, err = ksm.nodeStore.Get(ksm.nodeName)
}
if err != nil {
log.V(2).Infof("Failed to get node %q: %v", ksm.nodeName, err)
return false, nil
}
return true, nil
})
if waitErr != nil {
return nil, fmt.Errorf("timeout contacting kube-api, failed to patch node %q. Error: %v", ksm.nodeName, waitErr)
}

n := cachedNode.DeepCopy()
Expand Down Expand Up @@ -397,9 +404,16 @@ func (ksm *kubeSubnetManager) AcquireLease(ctx context.Context, attrs *lease.Lea
return nil, fmt.Errorf("failed to create patch for node %q: %v", ksm.nodeName, err)
}

_, err = ksm.client.CoreV1().Nodes().Patch(ctx, ksm.nodeName, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status")
if err != nil {
return nil, err
waitErr := wait.PollUntilContextTimeout(ctx, 3*time.Second, 30*time.Second, true, func(context.Context) (done bool, err error) {
_, err = ksm.client.CoreV1().Nodes().Patch(ctx, ksm.nodeName, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status")
if err != nil {
log.V(2).Infof("Failed to patch node %q: %v", ksm.nodeName, err)
return false, nil
}
return true, nil
})
if waitErr != nil {
return nil, fmt.Errorf("timeout contacting kube-api, failed to patch node %q. Error: %v", ksm.nodeName, waitErr)
}
}

Expand Down

0 comments on commit 4331f3d

Please sign in to comment.