Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Try several times to contact kube-api before failing #1977

Merged
merged 1 commit into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 9 additions & 10 deletions pkg/backend/hostgw/hostgw_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -161,11 +161,11 @@ func (be *HostgwBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup
// Wait for the network to populate Management IP
log.Infof("Waiting to get ManagementIP from HNSNetwork %s", networkName)
var newNetworkID = newNetwork.Id
waitErr = wait.Poll(500*time.Millisecond, 30*time.Second, func() (done bool, err error) {
waitErr := wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) {
newNetwork, lastErr = hcsshim.HNSNetworkRequest("GET", newNetworkID, "")
return newNetwork != nil && len(newNetwork.ManagementIP) != 0, nil
})
if waitErr == wait.ErrWaitTimeout {
if waitErr != nil {
// Do not swallow the root cause
if lastErr != nil {
waitErr = lastErr
Expand All @@ -179,12 +179,11 @@ func (be *HostgwBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup
if err != nil {
return nil, errors.Wrapf(err, "Failed to parse management ip (%s)", newNetwork.ManagementIP)
}

waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) {
waitErr = wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) {
_, lastErr = ip.GetInterfaceByIP(managementIP.ToIP())
return lastErr == nil, nil
})
if waitErr == wait.ErrWaitTimeout {
if waitErr != nil {
return nil, errors.Wrapf(lastErr, "timeout, failed to get net interface for HNSNetwork %s (%s)", networkName, newNetwork.ManagementIP)
}

Expand Down Expand Up @@ -227,19 +226,19 @@ func (be *HostgwBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup

// Wait for the bridgeEndpoint to attach to the host
log.Infof("Waiting to attach bridge endpoint %s to host", bridgeEndpointName)
waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) {
waitErr = wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) {
lastErr = expectedBridgeEndpoint.HostAttach(1)
if lastErr == nil {
return true, nil
return false, nil
}
// See https://github.com/flannel-io/flannel/issues/1391 and
// hcsshim lacks some validations to detect the error, so we judge it by error message.
if strings.Contains(lastErr.Error(), "This endpoint is already attached to the switch.") {
return true, nil
return false, nil
}
return false, nil
return true, nil
})
if waitErr == wait.ErrWaitTimeout {
if waitErr != nil {
return nil, errors.Wrapf(lastErr, "failed to hot attach bridge HNSEndpoint %s to host compartment", bridgeEndpointName)
}
log.Infof("Attached bridge endpoint %s to host successfully", bridgeEndpointName)
Expand Down
23 changes: 12 additions & 11 deletions pkg/backend/vxlan/device_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package vxlan

import (
"context"
"encoding/json"
"fmt"
"time"
Expand Down Expand Up @@ -46,7 +47,7 @@ type NetAdapterNameSettings struct {
NetworkAdapterName string `json:"NetworkAdapterName"`
}

func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
func newVXLANDevice(ctx context.Context, devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
subnet := createSubnet(devAttrs.addressPrefix.String(), (devAttrs.addressPrefix.IP + 1).String(), "0.0.0.0/0")
network := &hcn.HostComputeNetwork{
Type: "Overlay",
Expand Down Expand Up @@ -90,7 +91,7 @@ func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
addNetAdapterName(network, devAttrs.interfaceName)
}

hnsNetwork, err := ensureNetwork(network, devAttrs.addressPrefix.String())
hnsNetwork, err := ensureNetwork(ctx, network, devAttrs.addressPrefix.String())
if err != nil {
return nil, err
}
Expand All @@ -100,7 +101,7 @@ func newVXLANDevice(devAttrs *vxlanDeviceAttrs) (*vxlanDevice, error) {
}, nil
}

func ensureNetwork(expectedNetwork *hcn.HostComputeNetwork, expectedAddressPrefix string) (*hcn.HostComputeNetwork, error) {
func ensureNetwork(ctx context.Context, expectedNetwork *hcn.HostComputeNetwork, expectedAddressPrefix string) (*hcn.HostComputeNetwork, error) {
createNetwork := true
networkName := expectedNetwork.Name

Expand Down Expand Up @@ -134,19 +135,19 @@ func ensureNetwork(expectedNetwork *hcn.HostComputeNetwork, expectedAddressPrefi
// Wait for the network to populate Management IP
log.Infof("Waiting to get ManagementIP from HostComputeNetwork %s", networkName)
var newNetworkID = newNetwork.Id
waitErr = wait.Poll(500*time.Millisecond, 5*time.Second, func() (done bool, err error) {
waitErr = wait.PollUntilContextTimeout(ctx, 500*time.Millisecond, 5*time.Second, true, func(context.Context) (done bool, err error) {
newNetwork, lastErr = hcn.GetNetworkByID(newNetworkID)
return newNetwork != nil && len(getManagementIP(newNetwork)) != 0, nil
})
if waitErr == wait.ErrWaitTimeout {
if waitErr != nil {
// Do not swallow the root cause
if lastErr != nil {
waitErr = lastErr
}
return nil, errors.Wrapf(lastErr, "timeout, failed to get management IP from HostComputeNetwork %s", networkName)
}

err = checkHostNetworkReady(newNetwork)
err = checkHostNetworkReady(ctx, newNetwork)
if err != nil {
return nil, errors.Wrapf(err, "Interface bound to %s took too long to get ready. Please check your network host configuration", networkName)
}
Expand Down Expand Up @@ -227,7 +228,7 @@ func addNetAdapterName(network *hcn.HostComputeNetwork, netAdapterName string) e
}

// checkHostNetworkReady waits for the host network to be ready: the main interface must be up and have an IP address
func checkHostNetworkReady(network *hcn.HostComputeNetwork) error {
func checkHostNetworkReady(ctx context.Context, network *hcn.HostComputeNetwork) error {
managementIP := getManagementIP(network)
// Wait for the interface with the management IP
log.Infof("Waiting to get net interface for HostComputeNetwork %s (%s)", network.Name, managementIP)
Expand All @@ -236,16 +237,16 @@ func checkHostNetworkReady(network *hcn.HostComputeNetwork) error {
return errors.Wrapf(err, "Failed to parse management ip (%s)", managementIP)
}

waitErr := wait.Poll(3*time.Second, 25*time.Second, func() (done bool, err error) {
waitErr := wait.PollUntilContextTimeout(ctx, 3*time.Second, 25*time.Second, true, func(context.Context) (done bool, err error) {
iface, lastErr := ip.GetInterfaceByIP(managementIPv4.ToIP())
if lastErr == nil {
log.V(2).Infof("Host interface: %s bound by %s ready", iface.Name, network.Name)
return true, nil
return false, nil
}
log.V(2).Infof("Host interface bound by %s not ready", network.Name)
return false, nil
return true, nil
})
if waitErr == wait.ErrWaitTimeout {
if waitErr != nil {
return errors.Wrapf(waitErr, "timeout, failed to get net interface for HostComputeNetwork %s (%s)", network.Name, managementIP)
}
return nil
Expand Down
4 changes: 2 additions & 2 deletions pkg/backend/vxlan/vxlan_windows.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ func (be *VXLANBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup,
interfaceName: be.extIface.Iface.Name,
}

dev, err := newVXLANDevice(&devAttrs)
dev, err := newVXLANDevice(ctx, &devAttrs)
if err != nil {
return nil, fmt.Errorf("failed to create VXLAN network: %w", err)
}
Expand Down Expand Up @@ -191,7 +191,7 @@ func (be *VXLANBackend) RegisterNetwork(ctx context.Context, wg *sync.WaitGroup,
}

// Before contacting the lease server (e.g. kube-api), we verify that the physical interface is ready
err = checkHostNetworkReady(hcnNetwork)
err = checkHostNetworkReady(ctx, hcnNetwork)
if err != nil {
return nil, fmt.Errorf("interface bound to %s took too long to get ready. Please check your network host configuration", hcnNetwork.Name)
}
Expand Down
34 changes: 24 additions & 10 deletions pkg/subnet/kube/kube.go
Original file line number Diff line number Diff line change
Expand Up @@ -283,13 +283,20 @@ func (ksm *kubeSubnetManager) GetNetworkConfig(ctx context.Context) (*subnet.Con
func (ksm *kubeSubnetManager) AcquireLease(ctx context.Context, attrs *lease.LeaseAttrs) (*lease.Lease, error) {
var cachedNode *v1.Node
var err error
if ksm.disableNodeInformer {
cachedNode, err = ksm.client.CoreV1().Nodes().Get(ctx, ksm.nodeName, metav1.GetOptions{ResourceVersion: "0"})
} else {
cachedNode, err = ksm.nodeStore.Get(ksm.nodeName)
}
if err != nil {
return nil, err
waitErr := wait.PollUntilContextTimeout(ctx, 3*time.Second, 30*time.Second, true, func(context.Context) (done bool, err error) {
if ksm.disableNodeInformer {
cachedNode, err = ksm.client.CoreV1().Nodes().Get(ctx, ksm.nodeName, metav1.GetOptions{ResourceVersion: "0"})
} else {
cachedNode, err = ksm.nodeStore.Get(ksm.nodeName)
}
if err != nil {
log.V(2).Infof("Failed to get node %q: %v", ksm.nodeName, err)
return false, nil
}
return true, nil
})
if waitErr != nil {
return nil, fmt.Errorf("timeout contacting kube-api, failed to patch node %q. Error: %v", ksm.nodeName, waitErr)
}

n := cachedNode.DeepCopy()
Expand Down Expand Up @@ -397,9 +404,16 @@ func (ksm *kubeSubnetManager) AcquireLease(ctx context.Context, attrs *lease.Lea
return nil, fmt.Errorf("failed to create patch for node %q: %v", ksm.nodeName, err)
}

_, err = ksm.client.CoreV1().Nodes().Patch(ctx, ksm.nodeName, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status")
if err != nil {
return nil, err
waitErr := wait.PollUntilContextTimeout(ctx, 3*time.Second, 30*time.Second, true, func(context.Context) (done bool, err error) {
_, err = ksm.client.CoreV1().Nodes().Patch(ctx, ksm.nodeName, types.StrategicMergePatchType, patchBytes, metav1.PatchOptions{}, "status")
if err != nil {
log.V(2).Infof("Failed to patch node %q: %v", ksm.nodeName, err)
return false, nil
}
return true, nil
})
if waitErr != nil {
return nil, fmt.Errorf("timeout contacting kube-api, failed to patch node %q. Error: %v", ksm.nodeName, waitErr)
}
}

Expand Down
Loading