diff mbox series

[ovs-dev,v2,3/4] patch.c: Avoid patch interface deletion & recreation during restart.

Message ID 20220725213443.646243-4-hzhou@ovn.org
State Changes Requested
Headers show
Series Avoid unnecessary deletion & recreation during restart. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test fail github build: failed
ovsrobot/github-robot-_ovn-kubernetes fail github build: failed

Commit Message

Han Zhou July 25, 2022, 9:34 p.m. UTC
When ovn-controller is restarted, it may need multiple iterations of
main loop before completely download all related data from SB DB,
especially when ovn-monitor-all=false, so after restart, before it sees
the related localnet ports from SB DB, it treats the related patch
ports on the chassis as not needed, and deleted them. Later when it
downloads thoses port-bindings it recreates them.  For a graceful
upgrade, we don't want this to happen, because it would break the
traffic.

This is especially problematic at ovn-k8s setups because the external
side of the patch port is used to program some flows for external
network communication. When the patch ports are recreated, the OF port
number changes and those flows are broken. The CMS would detect and fix
the flows but it would result in even longer downtime.

This patch postpone the deletion of the patch ports, with the assumption
that leaving the unused ports on the chassis for little longer is not
harmful.

Signed-off-by: Han Zhou <hzhou@ovn.org>
---
 controller/patch.c      |  9 +++++-
 tests/ovn-controller.at | 71 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 1 deletion(-)

Comments

Dumitru Ceara Aug. 17, 2022, 3:27 p.m. UTC | #1
On 7/25/22 23:34, Han Zhou wrote:
> When ovn-controller is restarted, it may need multiple iterations of
> main loop before completely download all related data from SB DB,
> especially when ovn-monitor-all=false, so after restart, before it sees
> the related localnet ports from SB DB, it treats the related patch
> ports on the chassis as not needed, and deleted them. Later when it
> downloads thoses port-bindings it recreates them.  For a graceful
> upgrade, we don't want this to happen, because it would break the
> traffic.
> 
> This is especially problematic at ovn-k8s setups because the external
> side of the patch port is used to program some flows for external
> network communication. When the patch ports are recreated, the OF port
> number changes and those flows are broken. The CMS would detect and fix
> the flows but it would result in even longer downtime.
> 
> This patch postpone the deletion of the patch ports, with the assumption
> that leaving the unused ports on the chassis for little longer is not
> harmful.
> 
> Signed-off-by: Han Zhou <hzhou@ovn.org>
> ---

Looks good to me, thanks!

Acked-by: Dumitru Ceara <dceara@redhat.com>
diff mbox series

Patch

diff --git a/controller/patch.c b/controller/patch.c
index ed831302c..12e0b6f7c 100644
--- a/controller/patch.c
+++ b/controller/patch.c
@@ -311,7 +311,14 @@  patch_run(struct ovsdb_idl_txn *ovs_idl_txn,
     SHASH_FOR_EACH_SAFE (port_node, &existing_ports) {
         port = port_node->data;
         shash_delete(&existing_ports, port_node);
-        remove_port(bridge_table, port);
+        /* Wait for some iterations before really deleting any patch ports,
+         * because with conditional monitoring it is possible that related SB
+         * data is not completely downloaded yet after last restart of
+         * ovn-controller.  Otherwise it may cause unncessary dataplane
+         * interruption during restart/upgrade. */
+        if (!daemon_started_recently()) {
+            remove_port(bridge_table, port);
+        }
     }
     shash_destroy(&existing_ports);
 }
diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at
index b8db342b9..66232e79c 100644
--- a/tests/ovn-controller.at
+++ b/tests/ovn-controller.at
@@ -114,6 +114,7 @@  check_patches \
     'br-eth0 patch-localnet2-to-br-int patch-br-int-to-localnet2'
 
 # Delete the mapping and the ovn-bridge-mapping patch ports should go away.
+check ovn-appctl -t ovn-controller debug/ignore-startup-delay
 AT_CHECK([ovs-vsctl remove Open_vSwitch . external-ids ovn-bridge-mappings])
 check_bridge_mappings
 check_patches
@@ -2242,3 +2243,73 @@  OVS_WAIT_UNTIL([test 6 = $(as hv1 ovs-ofctl dump-flows br-int | grep -c $(port_b
 
 OVN_CLEANUP([hv1])
 AT_CLEANUP
+
+AT_SETUP([ovn-controller - restart should not delete patch ports])
+
+ovn_start
+
+net_add n1
+sim_add hv1
+as hv1
+check ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+check ovs-vsctl add-port br-int hv1-vif1 -- \
+    set interface hv1-vif1 external-ids:iface-id=lsp1
+check ovs-vsctl add-br br-ext -- \
+    set open . external-ids:ovn-bridge-mappings=extnet:br-ext
+
+# Create logical topology so that lsp1 has multiple hops to a localnet port,
+# which would require multiple iterations to download the related datapaths and
+# port_bindings from SB DB during startup.
+#
+# lsp1@hv1 -- ls1 -- lr1 -- ls2 -- lr2 -- ls-ext -- lsp-ext (localnet)
+
+check ovn-appctl -t ovn-controller vlog/set file:dbg
+check ovn-nbctl ls-add ls1 \
+    -- lsp-add ls1 lsp1 \
+    -- lsp-set-addresses lsp1 "f0:00:00:00:00:01 10.0.1.2"
+check ovn-nbctl lr-add lr1 \
+    -- lrp-add lr1 lrp-lr1-ls1 f0:00:aa:00:01:01 10.0.1.1/24 \
+    -- lsp-add ls1 lsp-ls1-lr1 \
+    -- lsp-set-type lsp-ls1-lr1 router \
+    -- lsp-set-options lsp-ls1-lr1 router-port=lrp-lr1-ls1 \
+    -- lsp-set-addresses lsp-ls1-lr1 router
+check ovn-nbctl ls-add ls2 \
+    -- lrp-add lr1 lrp-lr1-ls2 f0:00:aa:00:01:02 10.0.2.1/24 \
+    -- lsp-add ls2 lsp-ls2-lr1 \
+    -- lsp-set-type lsp-ls2-lr1 router \
+    -- lsp-set-options lsp-ls2-lr1 router-port=lrp-lr1-ls2 \
+    -- lsp-set-addresses lsp-ls2-lr1 router
+check ovn-nbctl lr-add lr2 \
+    -- lrp-add lr2 lrp-lr2-ls2 f0:00:aa:00:02:02 10.0.2.2/24 \
+    -- lsp-add ls2 lsp-ls2-lr2 \
+    -- lsp-set-type lsp-ls2-lr2 router \
+    -- lsp-set-options lsp-ls2-lr2 router-port=lrp-lr2-ls2 \
+    -- lsp-set-addresses lsp-ls2-lr2 router
+check ovn-nbctl ls-add ls-ext \
+    -- lrp-add lr2 lrp-lr2-ext f0:00:aa:00:02:03 10.0.3.1/24 \
+    -- lsp-add ls-ext lsp-ext-lr2 \
+    -- lsp-set-type lsp-ext-lr2 router \
+    -- lsp-set-options lsp-ext-lr2 router-port=lrp-lr2-ext \
+    -- lsp-set-addresses lsp-ext-lr2 router
+check ovn-nbctl lsp-add ls-ext lsp-ext \
+    -- lsp-set-type lsp-ext localnet \
+    -- lsp-set-options lsp-ext network_name=extnet \
+    -- lsp-set-addresses lsp-ext unknown
+check ovn-nbctl --wait=hv sync
+
+AT_CHECK([ovs-vsctl list-ports br-int | grep patch-br-int-to-lsp-ext], [0], [ignore])
+
+# Stop ovn-controller
+OVS_APP_EXIT_AND_WAIT([ovn-controller])
+
+# Start ovn-controller, which shouldn't cause any patch interface
+# deletion/recreation
+start_daemon ovn-controller
+for i in $(seq 20); do
+    check ovn-nbctl --wait=hv sync
+done
+
+AT_CHECK([grep "deleted interface patch" hv1/ovs-vswitchd.log], [1], [ignore])
+OVN_CLEANUP([hv1])
+AT_CLEANUP