diff mbox series

[ovs-dev,v2,2/2] binding.c: Make sure that localport is removed from local datapath

Message ID 20220525065155.293205-3-amusil@redhat.com
State Changes Requested
Headers show
Series This patch series fixes two issues with localport. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed

Commit Message

Ales Musil May 25, 2022, 6:51 a.m. UTC
When localport is removed from NB, and it is the last port
remaining on the host, it is not part of local datapath
anymore. Which can cause troubles when there is recompute
happening in between the removal from NB and the removal
of interface from host. The localport would stay in lport_ids
set, so any new localport that happens to have the same unique
lport key wouldn't be initiliazed properly thorugh I-P.

Remove the localport port binding from local datapath
if it was part of the that said datapath before.

Reported-at: https://bugzilla.redhat.com/2076604
Signed-off-by: Ales Musil <amusil@redhat.com>
---
v2: Add a test case for the scenario and rebase on newer main
---
 controller/binding.c    |  9 +++++++
 tests/ovn-controller.at | 54 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)

Comments

Numan Siddique May 26, 2022, 5:13 p.m. UTC | #1
On Wed, May 25, 2022 at 2:52 AM Ales Musil <amusil@redhat.com> wrote:
>
> When localport is removed from NB, and it is the last port
> remaining on the host, it is not part of local datapath
> anymore. Which can cause troubles when there is recompute
> happening in between the removal from NB and the removal
> of interface from host. The localport would stay in lport_ids
> set, so any new localport that happens to have the same unique
> lport key wouldn't be initiliazed properly thorugh I-P.
>
> Remove the localport port binding from local datapath
> if it was part of the that said datapath before.
>
> Reported-at: https://bugzilla.redhat.com/2076604
> Signed-off-by: Ales Musil <amusil@redhat.com>

Thanks for the v2.

Looks like either the test case added is flaky or some issue with
ovn-controller.

If you run the added test case in a loop continuously, it fails intermittently.
And when it fails I see the below in testsuite.log

----
../../tests/ovn-controller.at:2212: ovn-nbctl ls-add ls0
../../tests/ovn-controller.at:2213: ovn-nbctl lsp-add ls0 vm0
../../tests/ovn-controller.at:2214: ovn-nbctl lsp-set-addresses vm0
"00:00:00:00:10:10 192.168.10.10"
../../tests/ovn-controller.at:2201: ovn-nbctl lsp-add ls0 metadata
../../tests/ovn-controller.at:2202: ovn-nbctl lsp-set-type metadata localport
../../tests/ovn-controller.at:2203: ovn-nbctl lsp-set-addresses
metadata "00:00:00:00:10:25 192.168.10.25"
../../tests/ovn-controller.at:2207: ovs-vsctl add-port br-int vm0 --
set interface vm0 type=internal external_ids:iface-id=vm0
../../tests/ovn-controller.at:2208: ovs-vsctl add-port br-int metadata
-- set interface metadata type=internal external_ids:iface-id=metadata
ovn-controller.at:2222: waiting until test 6 = $(as hv1 ovs-ofctl
dump-flows br-int | grep -c $pb)...
ovn-controller.at:2222: wait succeeded immediately
../../tests/ovn-controller.at:2225: ovs-vsctl del-port br-int vm0
../../tests/ovn-controller.at:2226: ovn-appctl inc-engine/recompute
ovn-controller.at:2229: waiting until test 0 = $(as hv1 ovs-ofctl
dump-flows br-int | grep -c $pb)...
ovn-controller.at:2229: wait succeeded immediately
../../tests/ovn-controller.at:2232: ovn-nbctl lsp-del metadata
../../tests/ovn-controller.at:2233: ovs-vsctl del-port br-int metadata
../../tests/ovn-controller.at:2201: ovn-nbctl lsp-add ls0 metadata
../../tests/ovn-controller.at:2202: ovn-nbctl lsp-set-type metadata localport
../../tests/ovn-controller.at:2203: ovn-nbctl lsp-set-addresses
metadata "00:00:00:00:10:25 192.168.10.25"
../../tests/ovn-controller.at:2207: ovs-vsctl add-port br-int vm0 --
set interface vm0 type=internal external_ids:iface-id=vm0
../../tests/ovn-controller.at:2208: ovs-vsctl add-port br-int metadata
-- set interface metadata type=internal external_ids:iface-id=metadata
ovn-controller.at:2239: waiting until test 6 = $(as hv1 ovs-ofctl
dump-flows br-int | grep -c $pb)...
ovn-controller.at:2239: wait failed after 30 seconds
../../tests/ovs-macros.at:259: hard failure
2220. ovn-controller.at:2190: 2220. ovn-controller - localport can be
recreated (ovn-controller.at:2190): FAILED (ovs-macros.at:259)

------

Can you try testing this locally and see what's causing this failure ?

Below is my run loop script

----
#!/bin/bash

start_time=$(date)
tst="${TST:-2220}"
while true
do
    make check TESTSUITEFLAGS="$tst" 2>&1 | tee log
    grep successful log
    if [ "$?" = "1" ]; then
        echo "Test failed"
echo "Test started at $start_time"
echo "Exiting on "
date
        exit 1
    fi

    if [ -e tst_exit ]; then
        echo "Exiting  !!!"
        rm -f tst_exit
        break
    fi
done

echo "Done testing"
-------------

Thanks
Numan


> ---
> v2: Add a test case for the scenario and rebase on newer main
> ---
>  controller/binding.c    |  9 +++++++
>  tests/ovn-controller.at | 54 +++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 63 insertions(+)
>
> diff --git a/controller/binding.c b/controller/binding.c
> index 92baebd29..a900c9a50 100644
> --- a/controller/binding.c
> +++ b/controller/binding.c
> @@ -2131,6 +2131,15 @@ handle_deleted_lport(const struct sbrec_port_binding *pb,
>          return;
>      }
>
> +    /*
> +     * Remove localport that was part of local datapath that is not
> +     * considered to be local anymore.
> +     */
> +    if (!ld && !strcmp(pb->type, "localport") &&
> +        sset_find(&b_ctx_out->related_lports->lport_names, pb->logical_port)) {
> +        remove_related_lport(pb, b_ctx_out);
> +    }
> +
>      /* If the binding is not local, if 'pb' is a L3 gateway port, we should
>       * remove its peer, if that one is local.
>       */
> diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at
> index 3ca59f7e0..a34740d85 100644
> --- a/tests/ovn-controller.at
> +++ b/tests/ovn-controller.at
> @@ -2186,3 +2186,57 @@ AT_CHECK([grep "Parsing of ovn-chassis-mac-mappings failed" hv1/ovn-controller.l
>
>  OVN_CLEANUP([hv1])
>  AT_CLEANUP
> +
> +AT_SETUP([ovn-controller - localport can be recreated])
> +
> +ovn_start
> +
> +net_add n1
> +sim_add hv1
> +as hv1
> +ovs-vsctl add-br br-phys
> +ovn_attach n1 br-phys 192.168.0.1
> +
> +create_localport() {
> +    AT_CHECK([ovn-nbctl lsp-add ls0 metadata])
> +    AT_CHECK([ovn-nbctl lsp-set-type metadata localport])
> +    AT_CHECK([ovn-nbctl lsp-set-addresses metadata "00:00:00:00:10:25 192.168.10.25"])
> +}
> +
> +bind_ports() {
> +    AT_CHECK([ovs-vsctl add-port br-int vm0 -- set interface vm0 type=internal external_ids:iface-id=vm0])
> +    AT_CHECK([ovs-vsctl add-port br-int metadata -- set interface metadata type=internal external_ids:iface-id=metadata])
> +}
> +
> +# Create one VIF and localport and bind it to chassis
> +AT_CHECK([ovn-nbctl ls-add ls0])
> +AT_CHECK([ovn-nbctl lsp-add ls0 vm0])
> +AT_CHECK([ovn-nbctl lsp-set-addresses vm0 "00:00:00:00:10:10 192.168.10.10"])
> +create_localport
> +bind_ports
> +
> +pb=$(ovn-sbctl --bare --columns _uuid find port_binding logical_port=metadata | cut -d '-' -f 1 | tr -d '\n')
> +
> +
> +# Check that localport has all physical flows defined
> +OVS_WAIT_UNTIL([test 6 = $(as hv1 ovs-ofctl dump-flows br-int | grep -c $pb)])
> +
> +# Remove ls0 from local datapaths
> +AT_CHECK([ovs-vsctl del-port br-int vm0])
> +AT_CHECK([ovn-appctl inc-engine/recompute])
> +
> +# Check that localports physical flows are removed
> +OVS_WAIT_UNTIL([test 0 = $(as hv1 ovs-ofctl dump-flows br-int | grep -c $pb)])
> +
> +# The order is impotant, if the port is removed first, the bug wouldn't be triggered
> +AT_CHECK([ovn-nbctl lsp-del metadata])
> +AT_CHECK([ovs-vsctl del-port br-int metadata])
> +create_localport
> +bind_ports
> +
> +pb=$(ovn-sbctl --bare --columns _uuid find port_binding logical_port=metadata | cut -d '-' -f 1 | tr -d '\n')
> +# Check that localport has all physical flows re-defined
> +OVS_WAIT_UNTIL([test 6 = $(as hv1 ovs-ofctl dump-flows br-int | grep -c $pb)])
> +
> +OVN_CLEANUP([hv1])
> +AT_CLEANUP
> --
> 2.35.3
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Ales Musil May 27, 2022, 7:30 a.m. UTC | #2
On Thu, May 26, 2022 at 7:14 PM Numan Siddique <numans@ovn.org> wrote:

> On Wed, May 25, 2022 at 2:52 AM Ales Musil <amusil@redhat.com> wrote:
> >
> > When localport is removed from NB, and it is the last port
> > remaining on the host, it is not part of local datapath
> > anymore. Which can cause troubles when there is recompute
> > happening in between the removal from NB and the removal
> > of interface from host. The localport would stay in lport_ids
> > set, so any new localport that happens to have the same unique
> > lport key wouldn't be initiliazed properly thorugh I-P.
> >
> > Remove the localport port binding from local datapath
> > if it was part of the that said datapath before.
> >
> > Reported-at: https://bugzilla.redhat.com/2076604
> > Signed-off-by: Ales Musil <amusil@redhat.com>
>
> Thanks for the v2.
>
> Looks like either the test case added is flaky or some issue with
> ovn-controller.
>

The test was flaky, thank you for checking it out and sorry for the noise.
Should be fixed in v3.

Thanks,
Ales


>
> If you run the added test case in a loop continuously, it fails
> intermittently.
> And when it fails I see the below in testsuite.log
>
> ----
> ../../tests/ovn-controller.at:2212: ovn-nbctl ls-add ls0
> ../../tests/ovn-controller.at:2213: ovn-nbctl lsp-add ls0 vm0
> ../../tests/ovn-controller.at:2214: ovn-nbctl lsp-set-addresses vm0
> "00:00:00:00:10:10 192.168.10.10"
> ../../tests/ovn-controller.at:2201: ovn-nbctl lsp-add ls0 metadata
> ../../tests/ovn-controller.at:2202: ovn-nbctl lsp-set-type metadata
> localport
> ../../tests/ovn-controller.at:2203: ovn-nbctl lsp-set-addresses
> metadata "00:00:00:00:10:25 192.168.10.25"
> ../../tests/ovn-controller.at:2207: ovs-vsctl add-port br-int vm0 --
> set interface vm0 type=internal external_ids:iface-id=vm0
> ../../tests/ovn-controller.at:2208: ovs-vsctl add-port br-int metadata
> -- set interface metadata type=internal external_ids:iface-id=metadata
> ovn-controller.at:2222: waiting until test 6 = $(as hv1 ovs-ofctl
> dump-flows br-int | grep -c $pb)...
> ovn-controller.at:2222: wait succeeded immediately
> ../../tests/ovn-controller.at:2225: ovs-vsctl del-port br-int vm0
> ../../tests/ovn-controller.at:2226: ovn-appctl inc-engine/recompute
> ovn-controller.at:2229: waiting until test 0 = $(as hv1 ovs-ofctl
> dump-flows br-int | grep -c $pb)...
> ovn-controller.at:2229: wait succeeded immediately
> ../../tests/ovn-controller.at:2232: ovn-nbctl lsp-del metadata
> ../../tests/ovn-controller.at:2233: ovs-vsctl del-port br-int metadata
> ../../tests/ovn-controller.at:2201: ovn-nbctl lsp-add ls0 metadata
> ../../tests/ovn-controller.at:2202: ovn-nbctl lsp-set-type metadata
> localport
> ../../tests/ovn-controller.at:2203: ovn-nbctl lsp-set-addresses
> metadata "00:00:00:00:10:25 192.168.10.25"
> ../../tests/ovn-controller.at:2207: ovs-vsctl add-port br-int vm0 --
> set interface vm0 type=internal external_ids:iface-id=vm0
> ../../tests/ovn-controller.at:2208: ovs-vsctl add-port br-int metadata
> -- set interface metadata type=internal external_ids:iface-id=metadata
> ovn-controller.at:2239: waiting until test 6 = $(as hv1 ovs-ofctl
> dump-flows br-int | grep -c $pb)...
> ovn-controller.at:2239: wait failed after 30 seconds
> ../../tests/ovs-macros.at:259: hard failure
> 2220. ovn-controller.at:2190: 2220. ovn-controller - localport can be
> recreated (ovn-controller.at:2190): FAILED (ovs-macros.at:259)
>
> ------
>
> Can you try testing this locally and see what's causing this failure ?
>
> Below is my run loop script
>
> ----
> #!/bin/bash
>
> start_time=$(date)
> tst="${TST:-2220}"
> while true
> do
>     make check TESTSUITEFLAGS="$tst" 2>&1 | tee log
>     grep successful log
>     if [ "$?" = "1" ]; then
>         echo "Test failed"
> echo "Test started at $start_time"
> echo "Exiting on "
> date
>         exit 1
>     fi
>
>     if [ -e tst_exit ]; then
>         echo "Exiting  !!!"
>         rm -f tst_exit
>         break
>     fi
> done
>
> echo "Done testing"
> -------------
>
> Thanks
> Numan
>
>
> > ---
> > v2: Add a test case for the scenario and rebase on newer main
> > ---
> >  controller/binding.c    |  9 +++++++
> >  tests/ovn-controller.at | 54 +++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 63 insertions(+)
> >
> > diff --git a/controller/binding.c b/controller/binding.c
> > index 92baebd29..a900c9a50 100644
> > --- a/controller/binding.c
> > +++ b/controller/binding.c
> > @@ -2131,6 +2131,15 @@ handle_deleted_lport(const struct
> sbrec_port_binding *pb,
> >          return;
> >      }
> >
> > +    /*
> > +     * Remove localport that was part of local datapath that is not
> > +     * considered to be local anymore.
> > +     */
> > +    if (!ld && !strcmp(pb->type, "localport") &&
> > +        sset_find(&b_ctx_out->related_lports->lport_names,
> pb->logical_port)) {
> > +        remove_related_lport(pb, b_ctx_out);
> > +    }
> > +
> >      /* If the binding is not local, if 'pb' is a L3 gateway port, we
> should
> >       * remove its peer, if that one is local.
> >       */
> > diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at
> > index 3ca59f7e0..a34740d85 100644
> > --- a/tests/ovn-controller.at
> > +++ b/tests/ovn-controller.at
> > @@ -2186,3 +2186,57 @@ AT_CHECK([grep "Parsing of
> ovn-chassis-mac-mappings failed" hv1/ovn-controller.l
> >
> >  OVN_CLEANUP([hv1])
> >  AT_CLEANUP
> > +
> > +AT_SETUP([ovn-controller - localport can be recreated])
> > +
> > +ovn_start
> > +
> > +net_add n1
> > +sim_add hv1
> > +as hv1
> > +ovs-vsctl add-br br-phys
> > +ovn_attach n1 br-phys 192.168.0.1
> > +
> > +create_localport() {
> > +    AT_CHECK([ovn-nbctl lsp-add ls0 metadata])
> > +    AT_CHECK([ovn-nbctl lsp-set-type metadata localport])
> > +    AT_CHECK([ovn-nbctl lsp-set-addresses metadata "00:00:00:00:10:25
> 192.168.10.25"])
> > +}
> > +
> > +bind_ports() {
> > +    AT_CHECK([ovs-vsctl add-port br-int vm0 -- set interface vm0
> type=internal external_ids:iface-id=vm0])
> > +    AT_CHECK([ovs-vsctl add-port br-int metadata -- set interface
> metadata type=internal external_ids:iface-id=metadata])
> > +}
> > +
> > +# Create one VIF and localport and bind it to chassis
> > +AT_CHECK([ovn-nbctl ls-add ls0])
> > +AT_CHECK([ovn-nbctl lsp-add ls0 vm0])
> > +AT_CHECK([ovn-nbctl lsp-set-addresses vm0 "00:00:00:00:10:10
> 192.168.10.10"])
> > +create_localport
> > +bind_ports
> > +
> > +pb=$(ovn-sbctl --bare --columns _uuid find port_binding
> logical_port=metadata | cut -d '-' -f 1 | tr -d '\n')
> > +
> > +
> > +# Check that localport has all physical flows defined
> > +OVS_WAIT_UNTIL([test 6 = $(as hv1 ovs-ofctl dump-flows br-int | grep -c
> $pb)])
> > +
> > +# Remove ls0 from local datapaths
> > +AT_CHECK([ovs-vsctl del-port br-int vm0])
> > +AT_CHECK([ovn-appctl inc-engine/recompute])
> > +
> > +# Check that localports physical flows are removed
> > +OVS_WAIT_UNTIL([test 0 = $(as hv1 ovs-ofctl dump-flows br-int | grep -c
> $pb)])
> > +
> > +# The order is impotant, if the port is removed first, the bug wouldn't
> be triggered
> > +AT_CHECK([ovn-nbctl lsp-del metadata])
> > +AT_CHECK([ovs-vsctl del-port br-int metadata])
> > +create_localport
> > +bind_ports
> > +
> > +pb=$(ovn-sbctl --bare --columns _uuid find port_binding
> logical_port=metadata | cut -d '-' -f 1 | tr -d '\n')
> > +# Check that localport has all physical flows re-defined
> > +OVS_WAIT_UNTIL([test 6 = $(as hv1 ovs-ofctl dump-flows br-int | grep -c
> $pb)])
> > +
> > +OVN_CLEANUP([hv1])
> > +AT_CLEANUP
> > --
> > 2.35.3
> >
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >
>
>
diff mbox series

Patch

diff --git a/controller/binding.c b/controller/binding.c
index 92baebd29..a900c9a50 100644
--- a/controller/binding.c
+++ b/controller/binding.c
@@ -2131,6 +2131,15 @@  handle_deleted_lport(const struct sbrec_port_binding *pb,
         return;
     }
 
+    /*
+     * Remove localport that was part of local datapath that is not
+     * considered to be local anymore.
+     */
+    if (!ld && !strcmp(pb->type, "localport") &&
+        sset_find(&b_ctx_out->related_lports->lport_names, pb->logical_port)) {
+        remove_related_lport(pb, b_ctx_out);
+    }
+
     /* If the binding is not local, if 'pb' is a L3 gateway port, we should
      * remove its peer, if that one is local.
      */
diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at
index 3ca59f7e0..a34740d85 100644
--- a/tests/ovn-controller.at
+++ b/tests/ovn-controller.at
@@ -2186,3 +2186,57 @@  AT_CHECK([grep "Parsing of ovn-chassis-mac-mappings failed" hv1/ovn-controller.l
 
 OVN_CLEANUP([hv1])
 AT_CLEANUP
+
+AT_SETUP([ovn-controller - localport can be recreated])
+
+ovn_start
+
+net_add n1
+sim_add hv1
+as hv1
+ovs-vsctl add-br br-phys
+ovn_attach n1 br-phys 192.168.0.1
+
+create_localport() {
+    AT_CHECK([ovn-nbctl lsp-add ls0 metadata])
+    AT_CHECK([ovn-nbctl lsp-set-type metadata localport])
+    AT_CHECK([ovn-nbctl lsp-set-addresses metadata "00:00:00:00:10:25 192.168.10.25"])
+}
+
+bind_ports() {
+    AT_CHECK([ovs-vsctl add-port br-int vm0 -- set interface vm0 type=internal external_ids:iface-id=vm0])
+    AT_CHECK([ovs-vsctl add-port br-int metadata -- set interface metadata type=internal external_ids:iface-id=metadata])
+}
+
+# Create one VIF and localport and bind it to chassis
+AT_CHECK([ovn-nbctl ls-add ls0])
+AT_CHECK([ovn-nbctl lsp-add ls0 vm0])
+AT_CHECK([ovn-nbctl lsp-set-addresses vm0 "00:00:00:00:10:10 192.168.10.10"])
+create_localport
+bind_ports
+
+pb=$(ovn-sbctl --bare --columns _uuid find port_binding logical_port=metadata | cut -d '-' -f 1 | tr -d '\n')
+
+
+# Check that localport has all physical flows defined
+OVS_WAIT_UNTIL([test 6 = $(as hv1 ovs-ofctl dump-flows br-int | grep -c $pb)])
+
+# Remove ls0 from local datapaths
+AT_CHECK([ovs-vsctl del-port br-int vm0])
+AT_CHECK([ovn-appctl inc-engine/recompute])
+
+# Check that localports physical flows are removed
+OVS_WAIT_UNTIL([test 0 = $(as hv1 ovs-ofctl dump-flows br-int | grep -c $pb)])
+
+# The order is impotant, if the port is removed first, the bug wouldn't be triggered
+AT_CHECK([ovn-nbctl lsp-del metadata])
+AT_CHECK([ovs-vsctl del-port br-int metadata])
+create_localport
+bind_ports
+
+pb=$(ovn-sbctl --bare --columns _uuid find port_binding logical_port=metadata | cut -d '-' -f 1 | tr -d '\n')
+# Check that localport has all physical flows re-defined
+OVS_WAIT_UNTIL([test 6 = $(as hv1 ovs-ofctl dump-flows br-int | grep -c $pb)])
+
+OVN_CLEANUP([hv1])
+AT_CLEANUP