From fd74a1938c039a25594a4a93ec06440c0d16835b Mon Sep 17 00:00:00 2001
From: Jiajing Hu <hjiajing@vmware.com>
Date: Fri, 26 Jan 2024 11:27:35 +0800
Subject: [PATCH] Fix incorrect MTU configurations (#5880)

The commit fixes 3 incorrect MTU configurations:

1. When using the WireGuard encryption mode, the Pod eth0's MTU was not
correct. The MTU deducted Geneve overhead because the default tunnel
type is Geneve while it should deduct the WireGuard overhead as traffic
will be encrypted instead of encapsulated.

2. When using the GRE tunnel type, the Pod eth0's MTU was not correct.
The actual overhead is 14 outer MAC, 20 outer IP, and 8 GRE header
(4 standard header + 4 key field), summing up to 42 bytes.

3. When enabling Wireguard for Multicluster, the MTU of all Pod
interfaces and wireguard interface were reduced 130 bytes (50 for
geneve + 80 for wireguard), however, cross-cluster traffic sent from
Pods were not forwarded by wireguard interface. This is because traffic
originated from Pods will be encapsulated on gateway Node, and it's the
encapsulated packet which will be encrypted. If the wireguard interface
is set with the same MTU as the Pod interface, the encapsulated packet
will exceed wireguard interface's MTU.

Signed-off-by: Jiajing Hu <hjiajing@vmware.com>
Signed-off-by: Quan Tian <qtian@vmware.com>
Co-authored-by: Quan Tian <qtian@vmware.com>
---
 cmd/antrea-agent/agent.go                     |  4 +-
 pkg/agent/agent.go                            |  6 +-
 pkg/agent/config/node_config.go               | 76 ++++++++++++++-----
 pkg/agent/config/node_config_test.go          | 48 +++++++++++-
 pkg/agent/multicluster/mc_route_controller.go |  4 +-
 test/e2e/antreaipam_test.go                   | 10 +--
 test/e2e/connectivity_test.go                 | 36 ++++-----
 test/e2e/framework.go                         | 46 +++++++++--
 test/e2e/ipsec_test.go                        |  4 +-
 test/e2e/traceflow_test.go                    |  4 +-
 test/e2e/upgrade_test.go                      |  6 +-
 test/e2e/vmagent_test.go                      |  2 +-
 test/e2e/wireguard_test.go                    |  2 +-
 13 files changed, 181 insertions(+), 67 deletions(-)

diff --git a/cmd/antrea-agent/agent.go b/cmd/antrea-agent/agent.go
index 7efcfd3b2f1..a295f822790 100644
--- a/cmd/antrea-agent/agent.go
+++ b/cmd/antrea-agent/agent.go
@@ -141,6 +141,7 @@ func run(o *Options) error {
 	enableBridgingMode := enableAntreaIPAM && o.config.EnableBridgingMode
 	l7NetworkPolicyEnabled := features.DefaultFeatureGate.Enabled(features.L7NetworkPolicy)
 	enableMulticlusterGW := features.DefaultFeatureGate.Enabled(features.Multicluster) && o.config.Multicluster.EnableGateway
+	_, multiclusterEncryptionMode := config.GetTrafficEncryptionModeFromStr(o.config.Multicluster.TrafficEncryptionMode)
 	enableMulticlusterNP := features.DefaultFeatureGate.Enabled(features.Multicluster) && o.config.Multicluster.EnableStretchedNetworkPolicy
 	enableFlowExporter := features.DefaultFeatureGate.Enabled(features.FlowExporter) && o.config.FlowExporter.Enable
 	var nodeIPTracker *nodeip.Tracker
@@ -205,7 +206,8 @@ func run(o *Options) error {
 		IPsecConfig: config.IPsecConfig{
 			AuthenticationMode: ipsecAuthenticationMode,
 		},
-		EnableMulticlusterGW: enableMulticlusterGW,
+		EnableMulticlusterGW:       enableMulticlusterGW,
+		MulticlusterEncryptionMode: multiclusterEncryptionMode,
 	}
 
 	wireguardConfig := &config.WireGuardConfig{
diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go
index 079719ea0d1..2f1378ab76a 100644
--- a/pkg/agent/agent.go
+++ b/pkg/agent/agent.go
@@ -1097,7 +1097,7 @@ func (i *Initializer) waitForIPsecMonitorDaemon() error {
 
 // initializeWireguard checks if preconditions are met for using WireGuard and initializes WireGuard client or cleans up.
 func (i *Initializer) initializeWireGuard() error {
-	i.wireGuardConfig.MTU = i.nodeConfig.NodeTransportInterfaceMTU - config.WireGuardOverhead
+	i.wireGuardConfig.MTU = i.nodeConfig.NodeTransportInterfaceMTU - i.networkConfig.WireGuardMTUDeduction
 	wgClient, err := wireguard.New(i.nodeConfig, i.wireGuardConfig)
 	if err != nil {
 		return err
@@ -1200,10 +1200,6 @@ func (i *Initializer) getInterfaceMTU(transportInterface *net.Interface) (int, e
 
 	isIPv6 := i.nodeConfig.NodeIPv6Addr != nil
 	mtu -= i.networkConfig.CalculateMTUDeduction(isIPv6)
-
-	if i.networkConfig.TrafficEncryptionMode == config.TrafficEncryptionModeIPSec {
-		mtu -= config.IPSecESPOverhead
-	}
 	return mtu, nil
 }
 
diff --git a/pkg/agent/config/node_config.go b/pkg/agent/config/node_config.go
index ebba7f3da9e..59de8bdb8c6 100644
--- a/pkg/agent/config/node_config.go
+++ b/pkg/agent/config/node_config.go
@@ -34,12 +34,15 @@ const (
 )
 
 const (
-	vxlanOverhead     = 50
-	geneveOverhead    = 50
-	greOverhead       = 38
+	vxlanOverhead  = 50
+	geneveOverhead = 50
+	// GRE overhead: 14-byte outer MAC, 20-byte outer IPv4, 8-byte GRE header (4-byte standard header + 4-byte key field)
+	greOverhead = 42
+
 	ipv6ExtraOverhead = 20
 
-	WireGuardOverhead = 80
+	// WireGuard overhead: 20-byte outer IPv4, 8-byte UDP header, 4-byte type, 4-byte key index, 8-byte nonce, 16-byte authentication tag
+	WireGuardOverhead = 60
 	// IPsec ESP can add a maximum of 38 bytes to the packet including the ESP
 	// header and trailer.
 	IPSecESPOverhead = 38
@@ -201,14 +204,19 @@ type NetworkConfig struct {
 	TransportIfaceCIDRs   []string
 	IPv4Enabled           bool
 	IPv6Enabled           bool
-	// MTUDeduction only counts IPv4 tunnel overhead, no IPsec and WireGuard overhead.
+	// MTUDeduction is the MTU deduction for encapsulation and encryption in cluster.
 	MTUDeduction int
+	// WireGuardMTUDeduction is the MTU deduction for WireGuard encryption.
+	// It is calculated based on whether IPv6 is used.
+	WireGuardMTUDeduction int
 	// Set by the defaultMTU config option or auto discovered.
 	// Auto discovery will use MTU value of the Node's transport interface.
 	// For Encap and Hybrid mode, InterfaceMTU will be adjusted to account for
 	// encap header.
-	InterfaceMTU         int
-	EnableMulticlusterGW bool
+	InterfaceMTU int
+
+	EnableMulticlusterGW       bool
+	MulticlusterEncryptionMode TrafficEncryptionModeType
 }
 
 // IsIPv4Enabled returns true if the cluster network supports IPv4. Legal cases are:
@@ -264,24 +272,50 @@ func (nc *NetworkConfig) NeedsDirectRoutingToPeer(peerIP net.IP, localIP *net.IP
 	return (nc.TrafficEncapMode == TrafficEncapModeNoEncap || nc.TrafficEncapMode == TrafficEncapModeHybrid) && localIP.Contains(peerIP)
 }
 
+func (nc *NetworkConfig) getEncapMTUDeduction(isIPv6 bool) int {
+	var deduction int
+	if nc.TunnelType == ovsconfig.VXLANTunnel {
+		deduction = vxlanOverhead
+	} else if nc.TunnelType == ovsconfig.GeneveTunnel {
+		deduction = geneveOverhead
+	} else if nc.TunnelType == ovsconfig.GRETunnel {
+		deduction = greOverhead
+	} else {
+		return 0
+	}
+	if isIPv6 {
+		deduction += ipv6ExtraOverhead
+	}
+	return deduction
+}
+
 func (nc *NetworkConfig) CalculateMTUDeduction(isIPv6 bool) int {
-	var mtuDeduction int
-	// When Multi-cluster Gateway is enabled, we need to reduce MTU for potential cross-cluster traffic.
-	if nc.TrafficEncapMode.SupportsEncap() || nc.EnableMulticlusterGW {
-		if nc.TunnelType == ovsconfig.VXLANTunnel {
-			mtuDeduction = vxlanOverhead
-		} else if nc.TunnelType == ovsconfig.GeneveTunnel {
-			mtuDeduction = geneveOverhead
-		} else if nc.TunnelType == ovsconfig.GRETunnel {
-			mtuDeduction = greOverhead
-		}
+	nc.WireGuardMTUDeduction = WireGuardOverhead
+	if isIPv6 {
+		nc.WireGuardMTUDeduction += ipv6ExtraOverhead
 	}
 
-	if nc.TrafficEncapMode.SupportsEncap() && isIPv6 {
-		mtuDeduction += ipv6ExtraOverhead
+	if nc.EnableMulticlusterGW {
+		nc.MTUDeduction = nc.getEncapMTUDeduction(isIPv6)
+		// When multi-cluster WireGuard is enabled, cross-cluster traffic will be encapsulated and encrypted, we need to
+		// reduce MTU for both encapsulation and encryption.
+		if nc.MulticlusterEncryptionMode == TrafficEncryptionModeWireGuard {
+			nc.MTUDeduction += nc.WireGuardMTUDeduction
+		}
+		return nc.MTUDeduction
+	}
+	if nc.TrafficEncapMode.SupportsEncap() {
+		nc.MTUDeduction = nc.getEncapMTUDeduction(isIPv6)
+	}
+	if nc.TrafficEncryptionMode == TrafficEncryptionModeWireGuard {
+		// When WireGuard is enabled, cross-node traffic will only be encrypted, just reduce MTU for encryption.
+		nc.MTUDeduction = nc.WireGuardMTUDeduction
+	} else if nc.TrafficEncryptionMode == TrafficEncryptionModeIPSec {
+		// When IPsec is enabled, cross-node traffic will be encapsulated and encrypted, we need to reduce MTU for both
+		// encapsulation and encryption.
+		nc.MTUDeduction += IPSecESPOverhead
 	}
-	nc.MTUDeduction = mtuDeduction
-	return mtuDeduction
+	return nc.MTUDeduction
 }
 
 // ServiceConfig includes K8s Service CIDR and available IP addresses for NodePort.
diff --git a/pkg/agent/config/node_config_test.go b/pkg/agent/config/node_config_test.go
index 081ce053c7f..bc9791a7468 100644
--- a/pkg/agent/config/node_config_test.go
+++ b/pkg/agent/config/node_config_test.go
@@ -298,7 +298,7 @@ func TestCalculateMTUDeduction(t *testing.T) {
 		{
 			name:                 "GRE encap without IPv6",
 			nc:                   &NetworkConfig{TunnelType: ovsconfig.GRETunnel},
-			expectedMTUDeduction: 38,
+			expectedMTUDeduction: 42,
 		},
 		{
 			name:                 "Default encap with IPv6",
@@ -306,6 +306,52 @@ func TestCalculateMTUDeduction(t *testing.T) {
 			isIPv6:               true,
 			expectedMTUDeduction: 70,
 		},
+		{
+			name:                 "WireGuard enabled",
+			nc:                   &NetworkConfig{TrafficEncryptionMode: TrafficEncryptionModeWireGuard},
+			expectedMTUDeduction: 60,
+		},
+		{
+			name:                 "IPv6 with WireGuard enabled",
+			nc:                   &NetworkConfig{TrafficEncryptionMode: TrafficEncryptionModeWireGuard},
+			isIPv6:               true,
+			expectedMTUDeduction: 80,
+		},
+		{
+			name:                 "Multicluster enabled with Geneve encap",
+			nc:                   &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, EnableMulticlusterGW: true},
+			expectedMTUDeduction: 50,
+		},
+		{
+			name: "Geneve encap with Multicluster WireGuard enabled",
+			nc: &NetworkConfig{
+				TunnelType:                 ovsconfig.GeneveTunnel,
+				EnableMulticlusterGW:       true,
+				MulticlusterEncryptionMode: TrafficEncryptionModeWireGuard,
+			},
+			expectedMTUDeduction: 110,
+		},
+		{
+			name:                 "Geneve encap with IPSec enabled",
+			nc:                   &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
+			expectedMTUDeduction: 88,
+		},
+		{
+			name:                 "Geneve encap with IPSec enabled and IPv6",
+			nc:                   &NetworkConfig{TunnelType: ovsconfig.GeneveTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
+			isIPv6:               true,
+			expectedMTUDeduction: 108,
+		},
+		{
+			name:                 "VXLan encap with IPSec enabled",
+			nc:                   &NetworkConfig{TunnelType: ovsconfig.VXLANTunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
+			expectedMTUDeduction: 88,
+		},
+		{
+			name:                 "GRE encap with IPSec enabled",
+			nc:                   &NetworkConfig{TunnelType: ovsconfig.GRETunnel, TrafficEncryptionMode: TrafficEncryptionModeIPSec},
+			expectedMTUDeduction: 80,
+		},
 	}
 
 	for _, tt := range tests {
diff --git a/pkg/agent/multicluster/mc_route_controller.go b/pkg/agent/multicluster/mc_route_controller.go
index 568371329d5..d13f1b52a10 100644
--- a/pkg/agent/multicluster/mc_route_controller.go
+++ b/pkg/agent/multicluster/mc_route_controller.go
@@ -129,7 +129,9 @@ func NewMCDefaultRouteController(
 		controller.wireGuardConfig = &config.WireGuardConfig{
 			Port: multiclusterConfig.WireGuard.Port,
 			Name: multiclusterWireGuardInterface,
-			MTU:  controller.nodeConfig.NodeTransportInterfaceMTU - controller.networkConfig.MTUDeduction - config.WireGuardOverhead,
+			// Regardless of the tunnel type, the WireGuard device must only reduce MTU for encryption because the
+			// packets it transmits have been encapsulated.
+			MTU: nodeConfig.NodeTransportInterfaceMTU - networkConfig.WireGuardMTUDeduction,
 		}
 	}
 	controller.gwInformer.Informer().AddEventHandlerWithResyncPeriod(
diff --git a/test/e2e/antreaipam_test.go b/test/e2e/antreaipam_test.go
index 5b3ab9acf2c..f337d8d0fed 100644
--- a/test/e2e/antreaipam_test.go
+++ b/test/e2e/antreaipam_test.go
@@ -267,16 +267,16 @@ func testAntreaIPAMPodConnectivitySameNode(t *testing.T, data *TestData) {
 	})
 	workerNode := workerNodeName(1)
 
-	t.Logf("Creating %d agnhost Pods on '%s'", numPods+1, workerNode)
+	t.Logf("Creating %d toolbox Pods on '%s'", numPods+1, workerNode)
 	for i := range podInfos {
 		podInfos[i].os = clusterInfo.nodesOS[workerNode]
-		if err := data.createAgnhostPodOnNodeWithAnnotations(podInfos[i].name, podInfos[i].namespace, workerNode, nil); err != nil {
-			t.Fatalf("Error when creating agnhost test Pod '%s': %v", podInfos[i], err)
+		if err := data.createToolboxPodOnNode(podInfos[i].name, podInfos[i].namespace, workerNode, false); err != nil {
+			t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i], err)
 		}
 		defer deletePodWrapper(t, data, podInfos[i].namespace, podInfos[i].name)
 	}
 
-	data.runPingMesh(t, podInfos, agnhostContainerName)
+	data.runPingMesh(t, podInfos, toolboxContainerName, true)
 }
 
 func testAntreaIPAMPodConnectivityDifferentNodes(t *testing.T, data *TestData) {
@@ -290,7 +290,7 @@ func testAntreaIPAMPodConnectivityDifferentNodes(t *testing.T, data *TestData) {
 		}
 		podInfos = append(podInfos, createdPodInfos...)
 	}
-	data.runPingMesh(t, podInfos, agnhostContainerName)
+	data.runPingMesh(t, podInfos, toolboxContainerName, true)
 }
 
 func testAntreaIPAMStatefulSet(t *testing.T, data *TestData, dedicatedIPPoolKey *string) {
diff --git a/test/e2e/connectivity_test.go b/test/e2e/connectivity_test.go
index f6bbf5e199e..fb94e0337fc 100644
--- a/test/e2e/connectivity_test.go
+++ b/test/e2e/connectivity_test.go
@@ -93,7 +93,9 @@ func waitForPodIPs(t *testing.T, data *TestData, podInfos []podInfo) map[string]
 
 // runPingMesh runs a ping mesh between all the provided Pods after first retrieving their IP
 // addresses.
-func (data *TestData) runPingMesh(t *testing.T, podInfos []podInfo, ctrname string) {
+// When dontFragment is true, it will specify the packet size to the maximum value the MTU allows and set DF flag to
+// validate the MTU is correct.
+func (data *TestData) runPingMesh(t *testing.T, podInfos []podInfo, ctrname string, dontFragment bool) {
 	podIPs := waitForPodIPs(t, data, podInfos)
 
 	t.Logf("Ping mesh test between all Pods")
@@ -110,7 +112,7 @@ func (data *TestData) runPingMesh(t *testing.T, podInfos []podInfo, ctrname stri
 			if pi2.namespace != "" {
 				pod2Namespace = pi2.namespace
 			}
-			if err := data.runPingCommandFromTestPod(pi1, podNamespace, podIPs[pi2.name], ctrname, pingCount, 0); err != nil {
+			if err := data.runPingCommandFromTestPod(pi1, podNamespace, podIPs[pi2.name], ctrname, pingCount, 0, dontFragment); err != nil {
 				t.Errorf("Ping '%s' -> '%s': ERROR (%v)", k8s.NamespacedName(podNamespace, pi1.name), k8s.NamespacedName(pod2Namespace, pi2.name), err)
 			} else {
 				t.Logf("Ping '%s' -> '%s': OK", k8s.NamespacedName(podNamespace, pi1.name), k8s.NamespacedName(pod2Namespace, pi2.name))
@@ -131,16 +133,16 @@ func (data *TestData) testPodConnectivitySameNode(t *testing.T) {
 		workerNode = workerNodeName(clusterInfo.windowsNodes[0])
 	}
 
-	t.Logf("Creating %d agnhost Pods on '%s'", numPods, workerNode)
+	t.Logf("Creating %d toolbox Pods on '%s'", numPods, workerNode)
 	for i := range podInfos {
 		podInfos[i].os = clusterInfo.nodesOS[workerNode]
-		if err := data.createAgnhostPodOnNode(podInfos[i].name, data.testNamespace, workerNode, false); err != nil {
-			t.Fatalf("Error when creating agnhost test Pod '%s': %v", podInfos[i], err)
+		if err := data.createToolboxPodOnNode(podInfos[i].name, data.testNamespace, workerNode, false); err != nil {
+			t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i], err)
 		}
 		defer deletePodWrapper(t, data, data.testNamespace, podInfos[i].name)
 	}
 
-	data.runPingMesh(t, podInfos, agnhostContainerName)
+	data.runPingMesh(t, podInfos, toolboxContainerName, true)
 }
 
 // testPodConnectivityOnSameNode checks that Pods running on the same Node can reach each other, by
@@ -185,13 +187,13 @@ func testHostPortPodConnectivity(t *testing.T, data *TestData) {
 	data.testHostPortPodConnectivity(t, data.testNamespace, data.testNamespace)
 }
 
-// createPodsOnDifferentNodes creates agnhost Pods through a DaemonSet. This function returns information of the created
+// createPodsOnDifferentNodes creates toolbox Pods through a DaemonSet. This function returns information of the created
 // Pods as well as a function which will delete the Pods when called. Since Pods can be on Nodes of different OSes, podInfo
 // slice instead of PodName slice is used to inform caller of correct commands and options. Linux and Windows Pods are
 // alternating in this podInfo slice so that the test can cover different connectivity cases between different OSes.
 func createPodsOnDifferentNodes(t *testing.T, data *TestData, namespace, tag string) (podInfos []podInfo, cleanup func() error) {
 	dsName := "connectivity-test" + tag
-	_, deleteDaemonSet, err := data.createDaemonSet(dsName, namespace, agnhostContainerName, agnhostImage, []string{"sleep", "3600"}, nil)
+	_, deleteDaemonSet, err := data.createDaemonSet(dsName, namespace, toolboxContainerName, toolboxImage, []string{"sleep", "3600"}, nil)
 	if err != nil {
 		t.Fatalf("Error when creating DaemonSet '%s': %v", dsName, err)
 	}
@@ -264,7 +266,7 @@ func (data *TestData) testPodConnectivityDifferentNodes(t *testing.T) {
 	if len(podInfos) > maxPods {
 		podInfos = podInfos[:maxPods]
 	}
-	data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
+	data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)
 }
 
 // testPodConnectivityDifferentNodes checks that Pods running on different Nodes can reach each
@@ -315,11 +317,11 @@ func testPodConnectivityAfterAntreaRestart(t *testing.T, data *TestData, namespa
 	podInfos, deletePods := createPodsOnDifferentNodes(t, data, namespace, "antrearestart")
 	defer deletePods()
 
-	data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
+	data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)
 
 	data.redeployAntrea(t, deployAntreaDefault)
 
-	data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
+	data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)
 }
 
 // testOVSRestartSameNode verifies that datapath flows are not removed when the Antrea Agent Pod is
@@ -396,16 +398,16 @@ func testOVSFlowReplay(t *testing.T, data *TestData, namespace string) {
 	}
 	workerNode := workerNodeName(1)
 
-	t.Logf("Creating %d busybox test Pods on '%s'", numPods, workerNode)
+	t.Logf("Creating %d toolbox test Pods on '%s'", numPods, workerNode)
 	for i := range podInfos {
 		podInfos[i].os = clusterInfo.nodesOS[workerNode]
-		if err := data.createBusyboxPodOnNode(podInfos[i].name, namespace, workerNode, false); err != nil {
-			t.Fatalf("Error when creating busybox test Pod '%s': %v", podInfos[i].name, err)
+		if err := data.createToolboxPodOnNode(podInfos[i].name, namespace, workerNode, false); err != nil {
+			t.Fatalf("Error when creating toolbox test Pod '%s': %v", podInfos[i].name, err)
 		}
 		defer deletePodWrapper(t, data, namespace, podInfos[i].name)
 	}
 
-	data.runPingMesh(t, podInfos, busyboxContainerName)
+	data.runPingMesh(t, podInfos, toolboxContainerName, true)
 
 	var antreaPodName string
 	var err error
@@ -487,7 +489,7 @@ func testOVSFlowReplay(t *testing.T, data *TestData, namespace string) {
 	// This should give Antrea ~10s to restore flows, since we generate 10 "pings" with a 1s
 	// interval.
 	t.Logf("Running second ping mesh to check that flows have been restored")
-	data.runPingMesh(t, podInfos, busyboxContainerName)
+	data.runPingMesh(t, podInfos, toolboxContainerName, true)
 
 	flows2, groups2 := dumpFlows(), dumpGroups()
 	numFlows2, numGroups2 := len(flows2), len(groups2)
@@ -515,7 +517,7 @@ func testPingLargeMTU(t *testing.T, data *TestData) {
 
 	pingSize := 2000
 	t.Logf("Running ping with size %d between Pods %s and %s", pingSize, podInfos[0].name, podInfos[1].name)
-	if err := data.runPingCommandFromTestPod(podInfos[0], data.testNamespace, podIPs[podInfos[1].name], agnhostContainerName, pingCount, pingSize); err != nil {
+	if err := data.runPingCommandFromTestPod(podInfos[0], data.testNamespace, podIPs[podInfos[1].name], toolboxContainerName, pingCount, pingSize, false); err != nil {
 		t.Error(err)
 	}
 }
diff --git a/test/e2e/framework.go b/test/e2e/framework.go
index 1d39f4988e3..d0da3b03cba 100644
--- a/test/e2e/framework.go
+++ b/test/e2e/framework.go
@@ -2176,18 +2176,34 @@ func parseArpingStdout(out string) (sent uint32, received uint32, loss float32,
 	return sent, received, loss, nil
 }
 
-func (data *TestData) runPingCommandFromTestPod(podInfo podInfo, ns string, targetPodIPs *PodIPs, ctrName string, count int, size int) error {
+// RunPingCommandFromTestPod uses ping to check connectivity between the Pod and the given target Pod IPs.
+// If dontFragment is true and size is 0, it will set the size to the maximum value allowed by the Pod's MTU.
+func (data *TestData) runPingCommandFromTestPod(podInfo podInfo, ns string, targetPodIPs *PodIPs, ctrName string, count int, size int, dontFragment bool) error {
 	if podInfo.os != "windows" && podInfo.os != "linux" {
 		return fmt.Errorf("OS of Pod '%s' is not clear", podInfo.name)
 	}
+	var sizeIPv4, sizeIPv6 int
+	// TODO: GetPodInterfaceMTU should work for Windows.
+	if dontFragment && size == 0 && podInfo.os == "linux" {
+		mtu, err := data.GetPodInterfaceMTU(ns, podInfo.name, ctrName)
+		if err != nil {
+			return fmt.Errorf("error when retrieving MTU of Pod '%s': %w", podInfo.name, err)
+		}
+		// 8 ICMP header, 20 IPv4 header, 40 IPv6 header
+		sizeIPv4 = mtu - 28
+		sizeIPv6 = mtu - 48
+	} else {
+		sizeIPv4 = size
+		sizeIPv6 = size
+	}
 	if targetPodIPs.ipv4 != nil {
-		cmdV4 := getPingCommand(count, size, podInfo.os, targetPodIPs.ipv4)
+		cmdV4 := getPingCommand(count, sizeIPv4, podInfo.os, targetPodIPs.ipv4, dontFragment)
 		if stdout, stderr, err := data.RunCommandFromPod(ns, podInfo.name, ctrName, cmdV4); err != nil {
 			return fmt.Errorf("error when running ping command '%s': %v - stdout: %s - stderr: %s", strings.Join(cmdV4, " "), err, stdout, stderr)
 		}
 	}
 	if targetPodIPs.ipv6 != nil {
-		cmdV6 := getPingCommand(count, size, podInfo.os, targetPodIPs.ipv6)
+		cmdV6 := getPingCommand(count, sizeIPv6, podInfo.os, targetPodIPs.ipv6, dontFragment)
 		if stdout, stderr, err := data.RunCommandFromPod(ns, podInfo.name, ctrName, cmdV6); err != nil {
 			return fmt.Errorf("error when running ping command '%s': %v - stdout: %s - stderr: %s", strings.Join(cmdV6, " "), err, stdout, stderr)
 		}
@@ -2425,6 +2441,20 @@ func (data *TestData) GetTransportInterface() (string, error) {
 	return "", fmt.Errorf("no interface was assigned with Node IP %s", nodeIP)
 }
 
+func (data *TestData) GetPodInterfaceMTU(namespace string, podName string, containerName string) (int, error) {
+	cmd := []string{"cat", "/sys/class/net/eth0/mtu"}
+	stdout, stderr, err := data.RunCommandFromPod(namespace, podName, containerName, cmd)
+	if stdout == "" || stderr != "" || err != nil {
+		return 0, fmt.Errorf("failed to get interface MTU, stdout: %s, stderr: %s, err: %v", stdout, stderr, err)
+	}
+
+	mtu, err := strconv.Atoi(strings.TrimSpace(stdout))
+	if err != nil {
+		return 0, fmt.Errorf("failed to convert MTU to int: %v", err)
+	}
+	return mtu, nil
+}
+
 func (data *TestData) GetNodeMACAddress(node, device string) (string, error) {
 	antreaPod, err := data.getAntreaPodOnNode(node)
 	if err != nil {
@@ -2783,10 +2813,6 @@ func (data *TestData) createAgnhostPodWithSAOnNode(name string, ns string, nodeN
 	return NewPodBuilder(name, ns, agnhostImage).OnNode(nodeName).WithCommand([]string{"sleep", "3600"}).WithHostNetwork(hostNetwork).WithServiceAccountName(serviceAccountName).Create(data)
 }
 
-func (data *TestData) createAgnhostPodOnNodeWithAnnotations(name string, ns string, nodeName string, annotations map[string]string) error {
-	return NewPodBuilder(name, ns, agnhostImage).OnNode(nodeName).WithCommand([]string{"sleep", "3600"}).WithAnnotations(annotations).Create(data)
-}
-
 func (data *TestData) createDaemonSet(name string, ns string, ctrName string, image string, cmd []string, args []string) (*appsv1.DaemonSet, func() error, error) {
 	podSpec := corev1.PodSpec{
 		Tolerations: controlPlaneNoScheduleTolerations(),
@@ -3003,7 +3029,7 @@ func (data *TestData) checkAntreaAgentInfo(interval time.Duration, timeout time.
 	return err
 }
 
-func getPingCommand(count int, size int, os string, ip *net.IP) []string {
+func getPingCommand(count int, size int, os string, ip *net.IP, dontFragment bool) []string {
 	countOption, sizeOption := "-c", "-s"
 	if os == "windows" {
 		countOption = "-n"
@@ -3013,6 +3039,10 @@ func getPingCommand(count int, size int, os string, ip *net.IP) []string {
 	if size != 0 {
 		cmd = append(cmd, sizeOption, strconv.Itoa(size))
 	}
+	if dontFragment {
+		cmd = append(cmd, "-M", "do")
+	}
+
 	if ip.To4() != nil {
 		cmd = append(cmd, "-4", ip.String())
 	} else {
diff --git a/test/e2e/ipsec_test.go b/test/e2e/ipsec_test.go
index 268a7976d01..24c148ffb6f 100644
--- a/test/e2e/ipsec_test.go
+++ b/test/e2e/ipsec_test.go
@@ -137,7 +137,9 @@ func testIPSecTunnelConnectivity(t *testing.T, data *TestData, certAuth bool) {
 	podInfos, deletePods := createPodsOnDifferentNodes(t, data, data.testNamespace, tag)
 	defer deletePods()
 	t.Logf("Executing ping tests across Nodes: '%s' <-> '%s'", podInfos[0].nodeName, podInfos[1].nodeName)
-	data.runPingMesh(t, podInfos[:2], agnhostContainerName)
+	// PMTU is wrong when using GRE+IPsec with some Linux kernel versions, do not set DF to work around.
+	// See https://github.com/antrea-io/antrea/issues/5922 for more details.
+	data.runPingMesh(t, podInfos[:2], toolboxContainerName, false)
 
 	// Check that there is at least one 'up' Security Association on the Node
 	nodeName := podInfos[0].nodeName
diff --git a/test/e2e/traceflow_test.go b/test/e2e/traceflow_test.go
index 028fb648fa5..c0df5086f3b 100644
--- a/test/e2e/traceflow_test.go
+++ b/test/e2e/traceflow_test.go
@@ -1147,7 +1147,7 @@ func testTraceflowInterNode(t *testing.T, data *TestData) {
 		podInfos[1].name = node2Pods[2]
 		podInfos[1].namespace = data.testNamespace
 		podInfos[1].os = "windows"
-		data.runPingMesh(t, podInfos, agnhostContainerName)
+		data.runPingMesh(t, podInfos, agnhostContainerName, false)
 	}
 
 	// Setup 2 NetworkPolicies:
@@ -2470,7 +2470,7 @@ func runTestTraceflow(t *testing.T, data *TestData, tc testcase) {
 		// Give a little time for Nodes to install OVS flows.
 		time.Sleep(time.Second * 2)
 		// Send an ICMP echo packet from the source Pod to the destination.
-		if err := data.runPingCommandFromTestPod(podInfo{srcPod, osString, "", ""}, data.testNamespace, dstPodIPs, agnhostContainerName, 2, 0); err != nil {
+		if err := data.runPingCommandFromTestPod(podInfo{srcPod, osString, "", ""}, data.testNamespace, dstPodIPs, agnhostContainerName, 2, 0, false); err != nil {
 			t.Logf("Ping '%s' -> '%v' failed: ERROR (%v)", srcPod, *dstPodIPs, err)
 		}
 	}
diff --git a/test/e2e/upgrade_test.go b/test/e2e/upgrade_test.go
index 255e8153af5..df98aed402c 100644
--- a/test/e2e/upgrade_test.go
+++ b/test/e2e/upgrade_test.go
@@ -54,9 +54,9 @@ func TestUpgrade(t *testing.T) {
 	nodeName := nodeName(0)
 	podName := randName("test-pod-")
 
-	t.Logf("Creating a busybox test Pod on '%s'", nodeName)
-	if err := data.createBusyboxPodOnNode(podName, data.testNamespace, nodeName, false); err != nil {
-		t.Fatalf("Error when creating busybox test Pod: %v", err)
+	t.Logf("Creating a toolbox test Pod on '%s'", nodeName)
+	if err := data.createToolboxPodOnNode(podName, data.testNamespace, nodeName, false); err != nil {
+		t.Fatalf("Error when creating toolbox test Pod: %v", err)
 	}
 	if err := data.podWaitForRunning(defaultTimeout, podName, data.testNamespace); err != nil {
 		t.Fatalf("Error when waiting for Pod '%s' to be in the Running state", podName)
diff --git a/test/e2e/vmagent_test.go b/test/e2e/vmagent_test.go
index 6036a88b2de..9f42eb53d05 100644
--- a/test/e2e/vmagent_test.go
+++ b/test/e2e/vmagent_test.go
@@ -656,7 +656,7 @@ func createANPWithFQDN(t *testing.T, data *TestData, name string, namespace stri
 
 func runPingCommandOnVM(data *TestData, dstVM vmInfo, connected bool) error {
 	dstIP := net.ParseIP(dstVM.ip)
-	cmd := getPingCommand(pingCount, 0, strings.ToLower(linuxOS), &dstIP)
+	cmd := getPingCommand(pingCount, 0, strings.ToLower(linuxOS), &dstIP, false)
 	cmdStr := strings.Join(cmd, " ")
 	expCount := pingCount
 	if !connected {
diff --git a/test/e2e/wireguard_test.go b/test/e2e/wireguard_test.go
index 87f9c786074..f297d030f99 100644
--- a/test/e2e/wireguard_test.go
+++ b/test/e2e/wireguard_test.go
@@ -69,7 +69,7 @@ func testPodConnectivity(t *testing.T, data *TestData) {
 	podInfos, deletePods := createPodsOnDifferentNodes(t, data, data.testNamespace, "differentnodes")
 	defer deletePods()
 	numPods := 2
-	data.runPingMesh(t, podInfos[:numPods], agnhostContainerName)
+	data.runPingMesh(t, podInfos[:numPods], toolboxContainerName, true)
 
 	// Make sure that route to Pod on peer Node and route to peer gateway is targeting the WireGuard device.
 	srcPod, err := data.getAntreaPodOnNode(nodeName(0))