Skip to content

Commit

Permalink
Fix gateway datapath disrupt on update
Browse files Browse the repository at this point in the history
Reimplementing the vswitchd reload using separate start and stop scripts
so that it can be executed partially between consecutives instances of
the pod. To avoid disruptions, it runs ovs save-flows command on the
stop-vswitchd script, saving the resulting backup on a mounted folder
from the host and then loading that backup on the start script. For this
to succeed, the script needs to set the flow-restore-wait flag from ovs
to true while loading the flows.

Note that ovs save-flows command needs to have ovsdb-server running, so
a semaphore like mechanism was added to ensure the ovsdb-server
container is never deleted before ovs-vswitchd.

Closes-Issue: OSPRH-6326
  • Loading branch information
elvgarrui authored and ralonsoh committed Jun 24, 2024
1 parent 867a324 commit 303cbcd
Show file tree
Hide file tree
Showing 15 changed files with 168 additions and 17 deletions.
7 changes: 3 additions & 4 deletions pkg/ovncontroller/daemonset.go
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ func CreateOVSDaemonSet(
Lifecycle: &corev1.Lifecycle{
PreStop: &corev1.LifecycleHandler{
Exec: &corev1.ExecAction{
Command: []string{"/usr/share/openvswitch/scripts/ovs-ctl", "stop", "--no-ovs-vswitchd"},
Command: []string{"/usr/local/bin/container-scripts/stop-ovsdb-server.sh"},
},
},
},
Expand All @@ -199,12 +199,11 @@ func CreateOVSDaemonSet(
},
{
Name: "ovs-vswitchd",
Command: []string{"/bin/bash", "-c"},
Args: []string{"/usr/local/bin/container-scripts/net_setup.sh && /usr/sbin/ovs-vswitchd --pidfile --mlockall"},
Command: []string{"/usr/local/bin/container-scripts/start-vswitchd.sh"},
Lifecycle: &corev1.Lifecycle{
PreStop: &corev1.LifecycleHandler{
Exec: &corev1.ExecAction{
Command: []string{"/usr/share/openvswitch/scripts/ovs-ctl", "stop", "--no-ovsdb-server"},
Command: []string{"/usr/local/bin/container-scripts/stop-vswitchd.sh"},
},
},
},
Expand Down
14 changes: 14 additions & 0 deletions templates/ovncontroller/bin/functions
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,20 @@ EnableChassisAsGateway=${EnableChassisAsGateway:-true}
PhysicalNetworks=${PhysicalNetworks:-""}
OVNHostName=${OVNHostName:-""}

ovs_dir=/var/lib/openvswitch
FLOWS_RESTORE_SCRIPT=$ovs_dir/flows-script
FLOWS_RESTORE_DIR=$ovs_dir/saved-flows
SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE=$ovs_dir/is_safe_to_stop_ovsdb_server

function cleanup_ovsdb_server_semaphore() {
rm -f $SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE 2>&1 > /dev/null
}

function cleanup_flows_backup() {
rm -f $FLOWS_RESTORE_SCRIPT 2>&1 > /dev/null
rm -rf $FLOWS_RESTORE_DIR 2>&1 > /dev/null
}

function wait_for_ovsdb_server {
while true; do
/usr/bin/ovs-vsctl show
Expand Down
6 changes: 5 additions & 1 deletion templates/ovncontroller/bin/start-ovsdb-server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,15 @@
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

set -ex
source $(dirname $0)/functions

CTL_ARGS="--system-id=random --no-ovs-vswitchd"
# Remove the obsolete semaphore file in case it still exists.
cleanup_ovsdb_server_semaphore

# Initialize or upgrade database if needed
CTL_ARGS="--system-id=random --no-ovs-vswitchd"
/usr/share/openvswitch/scripts/ovs-ctl start $CTL_ARGS
/usr/share/openvswitch/scripts/ovs-ctl stop $CTL_ARGS

Expand Down
54 changes: 54 additions & 0 deletions templates/ovncontroller/bin/start-vswitchd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#!/bin/sh
#
# Copyright 2024 Red Hat Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

source $(dirname $0)/functions
wait_for_ovsdb_server

# The order - first wait for db server, then set -ex - is important. Otherwise,
# wait_for_ovsdb_server interrim check would make the script exit.
set -ex

# Configure encap IP.
OVNEncapIP=$(ip -o addr show dev {{ .OVNEncapNIC }} scope global | awk '{print $4}' | cut -d/ -f1)
ovs-vsctl --no-wait set open . external-ids:ovn-encap-ip=${OVNEncapIP}

# Before starting vswitchd, block it from flushing existing datapath flows.
ovs-vsctl --no-wait set open_vswitch . other_config:flow-restore-wait=true

# It's safe to start vswitchd now. Do it.
# --detach to allow the execution to continue to restoring the flows.
/usr/sbin/ovs-vswitchd --pidfile --mlockall --detach

# Restore saved flows.
if [ -f $FLOWS_RESTORE_SCRIPT ]; then
# It's unsafe to leave these files in place if they fail once. Make sure we
# remove them if the eval fails.
trap cleanup_flows_backup EXIT
eval "$(cat $FLOWS_RESTORE_SCRIPT)"
trap - EXIT
fi

# It's also unsafe to leave these files after flow-restore-wait flag is removed
# because the backup will become stale and if a container later crashes, it may
# mistakenly try to restore from this old backup.
cleanup_flows_backup

# Now, inform vswitchd that we are done.
ovs-vsctl remove open_vswitch . other_config flow-restore-wait

# This is container command script. Block it from exiting, otherwise k8s will
# restart the container again.
sleep infinity
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/sh
#
# Copyright 2023 Red Hat Inc.
# Copyright 2024 Red Hat Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
Expand All @@ -14,11 +14,17 @@
# License for the specific language governing permissions and limitations
# under the License.

OVNEncapIP=$(ip -o addr show dev {{ .OVNEncapNIC }} scope global | awk '{print $4}' | cut -d/ -f1)

set -ex
source $(dirname $0)/functions

wait_for_ovsdb_server
# The ovs_vswitchd container has to terminate before ovsdb-server because it
# needs access to db in its preStop script. The preStop script backs up flows
# for restoration during the next startup. This semaphore ensures the vswitchd
# container is not torn down before flows are saved.
while [ ! -f $SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE ]; do
sleep 0.5
done
cleanup_ovsdb_server_semaphore

set -ex
ovs-vsctl --no-wait set open . external-ids:ovn-encap-ip=${OVNEncapIP}
# Now it's safe to stop db server. Do it.
/usr/share/openvswitch/scripts/ovs-ctl stop --no-ovs-vswitchd
36 changes: 36 additions & 0 deletions templates/ovncontroller/bin/stop-vswitchd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/sh
#
# Copyright 2024 Red Hat Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

set -ex
source $(dirname $0)/functions

# Clean up any previously created flow backups to avoid conflict with newly
# generated backup.
cleanup_flows_backup

# Passing --real to mimic what upstream startup scripts do; maybe redundant.
bridges=$(ovs-vsctl -- --real list-br)

# Saving flows to avoid disrupting gateway datapath.
mkdir $FLOWS_RESTORE_DIR
TMPDIR=$FLOWS_RESTORE_DIR /usr/share/openvswitch/scripts/ovs-save save-flows $bridges > $FLOWS_RESTORE_SCRIPT

# Once save-flows logic is complete it no longer needs ovsdb-server, this file
# unlocks the db preStop script, working as a semaphore
touch $SAFE_TO_STOP_OVSDB_SERVER_SEMAPHORE

# Finally, stop vswitchd.
/usr/share/openvswitch/scripts/ovs-ctl stop --no-ovsdb-server
12 changes: 6 additions & 6 deletions tests/functional/ovncontroller_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ var _ = Describe("OVNController controller", func() {
}, timeout, interval).Should(ContainElement("openstack.org/ovncontroller"))
})

It("should create a ConfigMap for net_setup.sh with eth0 as Interface Name", func() {
It("should create a ConfigMap for start-vswitchd.sh with eth0 as Interface Name", func() {
scriptsCM := types.NamespacedName{
Namespace: OVNControllerName.Namespace,
Name: fmt.Sprintf("%s-%s", OVNControllerName.Name, "scripts"),
Expand All @@ -74,7 +74,7 @@ var _ = Describe("OVNController controller", func() {
return *th.GetConfigMap(scriptsCM)
}, timeout, interval).ShouldNot(BeNil())

Expect(th.GetConfigMap(scriptsCM).Data["net_setup.sh"]).Should(
Expect(th.GetConfigMap(scriptsCM).Data["start-vswitchd.sh"]).Should(
ContainSubstring("addr show dev eth0"))

th.ExpectCondition(
Expand Down Expand Up @@ -179,12 +179,12 @@ var _ = Describe("OVNController controller", func() {
th.AssertJobDoesNotExist(configJobOVS)
})

It("should create a ConfigMap for net_setup.sh with eth0 as Interface Name", func() {
It("should create a ConfigMap for start-vswitchd.sh with eth0 as Interface Name", func() {
Eventually(func() corev1.ConfigMap {
return *th.GetConfigMap(scriptsCM)
}, timeout, interval).ShouldNot(BeNil())

Expect(th.GetConfigMap(scriptsCM).Data["net_setup.sh"]).Should(
Expect(th.GetConfigMap(scriptsCM).Data["start-vswitchd.sh"]).Should(
ContainSubstring("addr show dev eth0"))

th.ExpectCondition(
Expand Down Expand Up @@ -445,7 +445,7 @@ var _ = Describe("OVNController controller", func() {

}, timeout, interval).Should(Succeed())
})
It("should create a ConfigMap for net_setup.sh with nic name as Network Attachment and OwnerReferences set", func() {
It("should create a ConfigMap for start-vswitchd.sh with nic name as Network Attachment and OwnerReferences set", func() {

scriptsCM := types.NamespacedName{
Namespace: OVNControllerName.Namespace,
Expand All @@ -461,7 +461,7 @@ var _ = Describe("OVNController controller", func() {
Expect(th.GetConfigMap(scriptsCM).ObjectMeta.OwnerReferences[0].Kind).To(Equal("OVNController"))

ovncontroller := GetOVNController(OVNControllerName)
Expect(th.GetConfigMap(scriptsCM).Data["net_setup.sh"]).Should(
Expect(th.GetConfigMap(scriptsCM).Data["start-vswitchd.sh"]).Should(
ContainSubstring("addr show dev %s", ovncontroller.Spec.NetworkAttachment))
})
It("should create an external ConfigMap with expected key-value pairs and OwnerReferences set", func() {
Expand Down
1 change: 1 addition & 0 deletions tests/kuttl/tests/ovn_restart_flow/01-assert.yaml
1 change: 1 addition & 0 deletions tests/kuttl/tests/ovn_restart_flow/01-deploy-ovn.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiVersion: kuttl.dev/v1beta1
kind: TestStep
commands:
- script: |
node=$(oc get nodes -o name|sort|head -1| sed "s|node/||g")
controller_pod=$(oc get pod -n $NAMESPACE -l service=ovn-controller-ovs --field-selector spec.nodeName=$node -o name | head -1)
oc rsh -n $NAMESPACE --container ovs-vswitchd $controller_pod ovs-vsctl --if-exists del-br br-test-flows || exit 1
oc rsh -n $NAMESPACE --container ovs-vswitchd $controller_pod ovs-vsctl add-br br-test-flows || exit 1
oc rsh -n $NAMESPACE --container ovs-vswitchd $controller_pod ovs-ofctl add-flow br-test-flows "table=100, priority=200 action=drop" || exit 1
1 change: 1 addition & 0 deletions tests/kuttl/tests/ovn_restart_flow/03-assert.yaml
7 changes: 7 additions & 0 deletions tests/kuttl/tests/ovn_restart_flow/03-delete-pods.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
apiVersion: kuttl.dev/v1beta1
kind: TestStep
commands:
- script: |
node=$(oc get nodes -o name|sort|head -1| sed "s|node/||g")
controller_pod=$(oc get pod -n $NAMESPACE -l service=ovn-controller-ovs --field-selector spec.nodeName=$node -o name | head -1)
oc delete -n $NAMESPACE $controller_pod
17 changes: 17 additions & 0 deletions tests/kuttl/tests/ovn_restart_flow/04-assert-flows-present.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#
# Check for:
#
# - Check that the flow added to the test bridge "br-test-flows" is present
# after the ovn-controller has been restarted.
# Expected flow: "table=100, priority=200 actions=drop"

apiVersion: kuttl.dev/v1beta1
kind: TestAssert
timeout: 300
commands:
- script: |
node=$(oc get nodes -o name|sort|head -1| sed "s|node/||g")
controller_pod=$(oc get pod -n $NAMESPACE -l service=ovn-controller-ovs --field-selector spec.nodeName=$node -o name | head -1)
expected_flows="table=100, priority=200 actions=drop"
oc rsh -n $NAMESPACE --container ovs-vswitchd $controller_pod ovs-ofctl dump-flows br-test-flows --no-stats | grep -q "$expected_flows" || exit 1
oc rsh -n $NAMESPACE --container ovs-vswitchd $controller_pod ovs-vsctl --if-exists del-br br-test-flows
1 change: 1 addition & 0 deletions tests/kuttl/tests/ovn_restart_flow/06-cleanup-ovn.yaml
1 change: 1 addition & 0 deletions tests/kuttl/tests/ovn_restart_flow/06-errors.yaml

0 comments on commit 303cbcd

Please sign in to comment.