Skip to content

Commit

Permalink
Fix incorrect MTU configurations (#5880)
Browse files Browse the repository at this point in the history
The commit fixes 3 incorrect MTU configurations:

1. When using the WireGuard encryption mode, the Pod eth0's MTU was not
correct. The Geneve overhead was deducted from the MTU because the default
tunnel type is Geneve, whereas the WireGuard overhead should have been
deducted, since traffic is encrypted rather than encapsulated in this mode.

2. When using the GRE tunnel type, the Pod eth0's MTU was not correct.
The actual overhead is a 14-byte outer MAC header, a 20-byte outer IP
header, and an 8-byte GRE header (4-byte standard header + 4-byte key
field), summing up to 42 bytes.

3. When enabling WireGuard for Multicluster, the MTU of all Pod
interfaces and of the WireGuard interface was reduced by 130 bytes (50 for
Geneve + 80 for WireGuard). However, cross-cluster traffic sent from
Pods was not forwarded by the WireGuard interface. This is because traffic
originating from Pods is encapsulated on the gateway Node, and it is the
encapsulated packet that is then encrypted. If the WireGuard interface
is set with the same MTU as the Pod interface, the encapsulated packet
will exceed the WireGuard interface's MTU.

Signed-off-by: Jiajing Hu <hjiajing@vmware.com>
Signed-off-by: Quan Tian <qtian@vmware.com>
Co-authored-by: Quan Tian <qtian@vmware.com>
  • Loading branch information
hjiajing and tnqn committed Mar 26, 2024
1 parent df64816 commit b343486
Show file tree
Hide file tree
Showing 13 changed files with 182 additions and 67 deletions.
4 changes: 3 additions & 1 deletion cmd/antrea-agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ func run(o *Options) error {
enableNodePortLocal := features.DefaultFeatureGate.Enabled(features.NodePortLocal) && o.config.NodePortLocal.Enable
l7NetworkPolicyEnabled := features.DefaultFeatureGate.Enabled(features.L7NetworkPolicy)
enableMulticlusterGW := features.DefaultFeatureGate.Enabled(features.Multicluster) && o.config.Multicluster.EnableGateway
_, multiclusterEncryptionMode := config.GetTrafficEncryptionModeFromStr(o.config.Multicluster.TrafficEncryptionMode)
enableMulticlusterNP := features.DefaultFeatureGate.Enabled(features.Multicluster) && o.config.Multicluster.EnableStretchedNetworkPolicy
enableFLowExporter := features.DefaultFeatureGate.Enabled(features.FlowExporter) && o.config.FlowExporter.Enable

Expand Down Expand Up @@ -199,7 +200,8 @@ func run(o *Options) error {
IPsecConfig: config.IPsecConfig{
AuthenticationMode: ipsecAuthenticationMode,
},
EnableMulticlusterGW: enableMulticlusterGW,
EnableMulticlusterGW: enableMulticlusterGW,
MulticlusterEncryptionMode: multiclusterEncryptionMode,
}

wireguardConfig := &config.WireGuardConfig{
Expand Down
6 changes: 1 addition & 5 deletions pkg/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -1092,7 +1092,7 @@ func (i *Initializer) waitForIPsecMonitorDaemon() error {

// initializeWireGuard checks if preconditions are met for using WireGuard and initializes WireGuard client or cleans up.
func (i *Initializer) initializeWireGuard() error {
i.wireGuardConfig.MTU = i.nodeConfig.NodeTransportInterfaceMTU - config.WireGuardOverhead
i.wireGuardConfig.MTU = i.nodeConfig.NodeTransportInterfaceMTU - i.networkConfig.WireGuardMTUDeduction
wgClient, err := wireguard.New(i.nodeConfig, i.wireGuardConfig)
if err != nil {
return err
Expand Down Expand Up @@ -1195,10 +1195,6 @@ func (i *Initializer) getInterfaceMTU(transportInterface *net.Interface) (int, e

isIPv6 := i.nodeConfig.NodeIPv6Addr != nil
mtu -= i.networkConfig.CalculateMTUDeduction(isIPv6)

if i.networkConfig.TrafficEncryptionMode == config.TrafficEncryptionModeIPSec {
mtu -= config.IPSecESPOverhead
}
return mtu, nil
}

Expand Down
76 changes: 55 additions & 21 deletions pkg/agent/config/node_config.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,12 +34,15 @@ const (
)

const (
vxlanOverhead = 50
geneveOverhead = 50
greOverhead = 38
vxlanOverhead = 50
geneveOverhead = 50
// GRE overhead: 14-byte outer MAC, 20-byte outer IPv4, 8-byte GRE header (4-byte standard header + 4-byte key field)
greOverhead = 42

ipv6ExtraOverhead = 20

WireGuardOverhead = 80
// WireGuard overhead: 20-byte outer IPv4, 8-byte UDP header, 4-byte type, 4-byte key index, 8-byte nonce, 16-byte authentication tag
WireGuardOverhead = 60
// IPsec ESP can add a maximum of 38 bytes to the packet including the ESP
// header and trailer.
IPSecESPOverhead = 38
Expand Down Expand Up @@ -201,14 +204,19 @@ type NetworkConfig struct {
TransportIfaceCIDRs []string
IPv4Enabled bool
IPv6Enabled bool
// MTUDeduction only counts IPv4 tunnel overhead, no IPsec and WireGuard overhead.
// MTUDeduction is the MTU deduction for encapsulation and encryption in cluster.
MTUDeduction int
// WireGuardMTUDeduction is the MTU deduction for WireGuard encryption.
// It is calculated based on whether IPv6 is used.
WireGuardMTUDeduction int
// Set by the defaultMTU config option or auto discovered.
// Auto discovery will use MTU value of the Node's transport interface.
// For Encap and Hybrid mode, InterfaceMTU will be adjusted to account for
// encap header.
InterfaceMTU int
EnableMulticlusterGW bool
InterfaceMTU int

EnableMulticlusterGW bool
MulticlusterEncryptionMode TrafficEncryptionModeType
}

// IsIPv4Enabled returns true if the cluster network supports IPv4. Legal cases are:
Expand Down Expand Up @@ -264,24 +272,50 @@ func (nc *NetworkConfig) NeedsDirectRoutingToPeer(peerIP net.IP, localIP *net.IP
return (nc.TrafficEncapMode == TrafficEncapModeNoEncap || nc.TrafficEncapMode == TrafficEncapModeHybrid) && localIP.Contains(peerIP)
}

// getEncapMTUDeduction returns the number of bytes to deduct from the MTU for
// the configured tunnel type (VXLAN, Geneve or GRE). When isIPv6 is true,
// ipv6ExtraOverhead is added on top of the tunnel overhead. For any other
// tunnel type it returns 0, regardless of isIPv6.
func (nc *NetworkConfig) getEncapMTUDeduction(isIPv6 bool) int {
	var overhead int
	switch nc.TunnelType {
	case ovsconfig.VXLANTunnel:
		overhead = vxlanOverhead
	case ovsconfig.GeneveTunnel:
		overhead = geneveOverhead
	case ovsconfig.GRETunnel:
		overhead = greOverhead
	default:
		// Unknown or unset tunnel type: nothing to deduct.
		return 0
	}
	if !isIPv6 {
		return overhead
	}
	return overhead + ipv6ExtraOverhead
}

func (nc *NetworkConfig) CalculateMTUDeduction(isIPv6 bool) int {
var mtuDeduction int
// When Multi-cluster Gateway is enabled, we need to reduce MTU for potential cross-cluster traffic.
if nc.TrafficEncapMode.SupportsEncap() || nc.EnableMulticlusterGW {
if nc.TunnelType == ovsconfig.VXLANTunnel {
mtuDeduction = vxlanOverhead
} else if nc.TunnelType == ovsconfig.GeneveTunnel {
mtuDeduction = geneveOverhead
} else if nc.TunnelType == ovsconfig.GRETunnel {
mtuDeduction = greOverhead
}
nc.WireGuardMTUDeduction = WireGuardOverhead
if isIPv6 {
nc.WireGuardMTUDeduction += ipv6ExtraOverhead
}

if nc.TrafficEncapMode.SupportsEncap() && isIPv6 {
mtuDeduction += ipv6ExtraOverhead
if nc.EnableMulticlusterGW {
nc.MTUDeduction = nc.getEncapMTUDeduction(isIPv6)
// When multi-cluster WireGuard is enabled, cross-cluster traffic will be encapsulated and encrypted, we need to
// reduce MTU for both encapsulation and encryption.
if nc.MulticlusterEncryptionMode == TrafficEncryptionModeWireGuard {
nc.MTUDeduction += nc.WireGuardMTUDeduction
}
return nc.MTUDeduction
}
if nc.TrafficEncapMode.SupportsEncap() {
nc.MTUDeduction = nc.getEncapMTUDeduction(isIPv6)
}
if nc.TrafficEncryptionMode == TrafficEncryptionModeWireGuard {
// When WireGuard is enabled, cross-node traffic will only be encrypted, just reduce MTU for encryption.
nc.MTUDeduction = nc.WireGuardMTUDeduction
} else if nc.TrafficEncryptionMode == TrafficEncryptionModeIPSec {
// When IPsec is enabled, cross-node traffic will be encapsulated and encrypted, we need to reduce MTU for both
// encapsulation and encryption.
nc.MTUDeduction += IPSecESPOverhead
}
nc.MTUDeduction = mtuDeduction
return mtuDeduction
return nc.MTUDeduction
}

// ServiceConfig includes K8s Service CIDR and available IP addresses for NodePort.
Expand Down
48 changes: 47 additions & 1 deletion pkg/agent/config/node_config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -298,14 +298,60 @@ func TestCalculateMTUDeduction(t *testing.T) {
{
name: "GRE encap without IPv6",
nc: &NetworkConfig{TunnelType: ovsconfig.GRETunnel},
expectedMTUDeduction: 38,
expectedMTUDeduction: 42,
},
{
name: "Default encap with IPv6",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel},
isIPv6: true,
expectedMTUDeduction: 70,
},
{
name: "WireGuard enabled",
nc: &NetworkConfig{TrafficEncryptionMode: TrafficEncryptionModeWireGuard},
expectedMTUDeduction: 60,
},
{
name: "IPv6 with WireGuard enabled",
nc: &NetworkConfig{TrafficEncryptionMode: TrafficEncryptionModeWireGuard},
isIPv6: true,
expectedMTUDeduction: 80,
},
{
name: "Multicluster enabled with Geneve encap",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, EnableMulticlusterGW: true},
expectedMTUDeduction: 50,
},
{
name: "Geneve encap with Multicluster WireGuard enabled",
nc: &NetworkConfig{
TunnelType: ovsconfig.GeneveTunnel,
EnableMulticlusterGW: true,
MulticlusterEncryptionMode: TrafficEncryptionModeWireGuard,
},
expectedMTUDeduction: 110,
},
{
name: "Geneve encap with IPSec enabled",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
expectedMTUDeduction: 88,
},
{
name: "Geneve encap with IPSec enabled and IPv6",
nc: &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
isIPv6: true,
expectedMTUDeduction: 108,
},
{
name: "VXLan encap with IPSec enabled",
nc: &NetworkConfig{TunnelType: ovsconfig.VXLANTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
expectedMTUDeduction: 88,
},
{
name: "GRE encap with IPSec enabled",
nc: &NetworkConfig{TunnelType: ovsconfig.GRETunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
expectedMTUDeduction: 80,
},
}

for _, tt := range tests {
Expand Down
4 changes: 3 additions & 1 deletion pkg/agent/multicluster/mc_route_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,9 @@ func NewMCDefaultRouteController(
controller.wireGuardConfig = &config.WireGuardConfig{
Port: multiclusterConfig.WireGuard.Port,
Name: multiclusterWireGuardInterface,
MTU: controller.nodeConfig.NodeTransportInterfaceMTU - controller.networkConfig.MTUDeduction - config.WireGuardOverhead,
// Regardless of the tunnel type, the WireGuard device must only reduce MTU for encryption because the
// packets it transmits have been encapsulated.
MTU: nodeConfig.NodeTransportInterfaceMTU - networkConfig.WireGuardMTUDeduction,
}
}
controller.gwInformer.Informer().AddEventHandlerWithResyncPeriod(
Expand Down
10 changes: 5 additions & 5 deletions test/e2e/antreaipam_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -267,16 +267,16 @@ func testAntreaIPAMPodConnectivitySameNode(t *testing.T, data *TestData) {
})
workerNode := workerNodeName(1)

t.Logf("Creating %d agnhost Pods on '%s'", numPods+1, workerNode)
t.Logf("Creating %d toolbox Pods on '%s'", numPods+1, workerNode)
for i := range podInfos {
podInfos[i].os = clusterInfo.nodesOS[workerNode]
if err := data.createAgnhostPodOnNodeWithAnnotations(podInfos[i].name, podInfos[i].namespace, workerNode, nil); err != nil {
t.Fatalf("Error when creating agnhost test Pod '%s': %v", podInfos[i], err)
if err := data.createToolboxPodOnNode(podInfos[i].name, podInfos[i].namespace, workerNode, false); err != nil {
t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i], err)
}
defer deletePodWrapper(t, data, podInfos[i].namespace, podInfos[i].name)
}

data.runPingMesh(t, podInfos, agnhostContainerName)
data.runPingMesh(t, podInfos, toolboxContainerName, true)
}

func testAntreaIPAMPodConnectivityDifferentNodes(t *testing.T, data *TestData) {
Expand All @@ -290,7 +290,7 @@ func testAntreaIPAMPodConnectivityDifferentNodes(t *testing.T, data *TestData) {
}
podInfos = append(podInfos, createdPodInfos...)
}
data.runPingMesh(t, podInfos, agnhostContainerName)
data.runPingMesh(t, podInfos, toolboxContainerName, true)
}

func testAntreaIPAMStatefulSet(t *testing.T, data *TestData, dedicatedIPPoolKey *string) {
Expand Down
36 changes: 19 additions & 17 deletions test/e2e/connectivity_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,9 @@ func waitForPodIPs(t *testing.T, data *TestData, podInfos []podInfo) map[string]

// runPingMesh runs a ping mesh between all the provided Pods after first retrieving their IP
// addresses.
func (data *TestData) runPingMesh(t *testing.T, podInfos []podInfo, ctrname string) {
// When dontFragment is true, it sets the packet size to the maximum value the MTU allows and sets the DF flag to
// validate that the MTU is correct.
func (data *TestData) runPingMesh(t *testing.T, podInfos []podInfo, ctrname string, dontFragment bool) {
podIPs := waitForPodIPs(t, data, podInfos)

t.Logf("Ping mesh test between all Pods")
Expand All @@ -110,7 +112,7 @@ func (data *TestData) runPingMesh(t *testing.T, podInfos []podInfo, ctrname stri
if pi2.namespace != "" {
pod2Namespace = pi2.namespace
}
if err := data.runPingCommandFromTestPod(pi1, podNamespace, podIPs[pi2.name], ctrname, pingCount, 0); err != nil {
if err := data.runPingCommandFromTestPod(pi1, podNamespace, podIPs[pi2.name], ctrname, pingCount, 0, dontFragment); err != nil {
t.Errorf("Ping '%s' -> '%s': ERROR (%v)", k8s.NamespacedName(podNamespace, pi1.name), k8s.NamespacedName(pod2Namespace, pi2.name), err)
} else {
t.Logf("Ping '%s' -> '%s': OK", k8s.NamespacedName(podNamespace, pi1.name), k8s.NamespacedName(pod2Namespace, pi2.name))
Expand All @@ -131,16 +133,16 @@ func (data *TestData) testPodConnectivitySameNode(t *testing.T) {
workerNode = workerNodeName(clusterInfo.windowsNodes[0])
}

t.Logf("Creating %d agnhost Pods on '%s'", numPods, workerNode)
t.Logf("Creating %d toolbox Pods on '%s'", numPods, workerNode)
for i := range podInfos {
podInfos[i].os = clusterInfo.nodesOS[workerNode]
if err := data.createAgnhostPodOnNode(podInfos[i].name, data.testNamespace, workerNode, false); err != nil {
t.Fatalf("Error when creating agnhost test Pod '%s': %v", podInfos[i], err)
if err := data.createToolboxPodOnNode(podInfos[i].name, data.testNamespace, workerNode, false); err != nil {
t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i], err)
}
defer deletePodWrapper(t, data, data.testNamespace, podInfos[i].name)
}

data.runPingMesh(t, podInfos, agnhostContainerName)
data.runPingMesh(t, podInfos, toolboxContainerName, true)
}

// testPodConnectivityOnSameNode checks that Pods running on the same Node can reach each other, by
Expand Down Expand Up @@ -185,13 +187,13 @@ func testHostPortPodConnectivity(t *testing.T, data *TestData) {
data.testHostPortPodConnectivity(t, data.testNamespace, data.testNamespace)
}

// createPodsOnDifferentNodes creates agnhost Pods through a DaemonSet. This function returns information of the created
// createPodsOnDifferentNodes creates toolbox Pods through a DaemonSet. This function returns information of the created
// Pods as well as a function which will delete the Pods when called. Since Pods can be on Nodes of different OSes, podInfo
// slice instead of PodName slice is used to inform caller of correct commands and options. Linux and Windows Pods are
// alternating in this podInfo slice so that the test can cover different connectivity cases between different OSes.
func createPodsOnDifferentNodes(t *testing.T, data *TestData, namespace, tag string) (podInfos []podInfo, cleanup func() error) {
dsName := "connectivity-test" + tag
_, deleteDaemonSet, err := data.createDaemonSet(dsName, namespace, agnhostContainerName, agnhostImage, []string{"sleep", "3600"}, nil)
_, deleteDaemonSet, err := data.createDaemonSet(dsName, namespace, toolboxContainerName, toolboxImage, []string{"sleep", "3600"}, nil)
if err != nil {
t.Fatalf("Error when creating DaemonSet '%s': %v", dsName, err)
}
Expand Down Expand Up @@ -264,7 +266,7 @@ func (data *TestData) testPodConnectivityDifferentNodes(t *testing.T) {
if len(podInfos) > maxPods {
podInfos = podInfos[:maxPods]
}
data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)
}

// testPodConnectivityDifferentNodes checks that Pods running on different Nodes can reach each
Expand Down Expand Up @@ -315,11 +317,11 @@ func testPodConnectivityAfterAntreaRestart(t *testing.T, data *TestData, namespa
podInfos, deletePods := createPodsOnDifferentNodes(t, data, namespace, "antrearestart")
defer deletePods()

data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)

data.redeployAntrea(t, deployAntreaDefault)

data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)
}

// testOVSRestartSameNode verifies that datapath flows are not removed when the Antrea Agent Pod is
Expand Down Expand Up @@ -396,16 +398,16 @@ func testOVSFlowReplay(t *testing.T, data *TestData, namespace string) {
}
workerNode := workerNodeName(1)

t.Logf("Creating %d busybox test Pods on '%s'", numPods, workerNode)
t.Logf("Creating %d toolbox test Pods on '%s'", numPods, workerNode)
for i := range podInfos {
podInfos[i].os = clusterInfo.nodesOS[workerNode]
if err := data.createBusyboxPodOnNode(podInfos[i].name, namespace, workerNode, false); err != nil {
t.Fatalf("Error when creating busybox test Pod '%s': %v", podInfos[i].name, err)
if err := data.createToolboxPodOnNode(podInfos[i].name, namespace, workerNode, false); err != nil {
t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i].name, err)
}
defer deletePodWrapper(t, data, namespace, podInfos[i].name)
}

data.runPingMesh(t, podInfos, busyboxContainerName)
data.runPingMesh(t, podInfos, toolboxContainerName, true)

var antreaPodName string
var err error
Expand Down Expand Up @@ -487,7 +489,7 @@ func testOVSFlowReplay(t *testing.T, data *TestData, namespace string) {
// This should give Antrea ~10s to restore flows, since we generate 10 "pings" with a 1s
// interval.
t.Logf("Running second ping mesh to check that flows have been restored")
data.runPingMesh(t, podInfos, busyboxContainerName)
data.runPingMesh(t, podInfos, toolboxContainerName, true)

flows2, groups2 := dumpFlows(), dumpGroups()
numFlows2, numGroups2 := len(flows2), len(groups2)
Expand Down Expand Up @@ -515,7 +517,7 @@ func testPingLargeMTU(t *testing.T, data *TestData) {

pingSize := 2000
t.Logf("Running ping with size %d between Pods %s and %s", pingSize, podInfos[0].name, podInfos[1].name)
if err := data.runPingCommandFromTestPod(podInfos[0], data.testNamespace, podIPs[podInfos[1].name], agnhostContainerName, pingCount, pingSize); err != nil {
if err := data.runPingCommandFromTestPod(podInfos[0], data.testNamespace, podIPs[podInfos[1].name], toolboxContainerName, pingCount, pingSize, false); err != nil {
t.Error(err)
}
}
Loading

0 comments on commit b343486

Please sign in to comment.