diff mbox series

[ovs-dev,v3,ovn,2/2] ovn-northd: Fix tunnel_key allocation for SB Port_Bindings.

Message ID 20200430183225.12099.42798.stgit@dceara.remote.csb
State Accepted
Headers show
Series Make ovn-northd recover from NB/SB inconsistencies. | expand

Commit Message

Dumitru Ceara April 30, 2020, 6:32 p.m. UTC
When generating Port_Binding records ovn-northd tries to reuse the
tunnel_key value from the original SB record, if any available.

However, there's no check for tunnel_keys that would conflict with
newly allocated keys for new records. In order to avoid that, we
don't reuse stale Port_Binding entries, i.e., their "datapath" field
doesn't match the Datapath_Binding record associated with the
logical switch/router they're part of.

One way to reproduce the issue is:
$ ovn-nbctl ls-add ls1
$ ovn-nbctl ls-add ls2
$ ovn-nbctl lsp-add ls1 lsp1
$ ovn-nbctl lsp-add ls2 lsp2
$ ovn-nbctl --wait=sb sync
$ ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2

Another option to reproduce the issue is with HA_Chassis_Group:
$ ovn-nbctl ls-add ls1
$ ovn-nbctl ls-add ls2
$ ovn-nbctl lsp-add ls1 lsp1
$ ovn-nbctl lsp-add ls2 lsp2
$ ovn-nbctl lsp-set-type lsp2 external
$ ovn-nbctl ha-chassis-group-add chg1
$ ovn-nbctl ha-chassis-group-add-chassis chg1 chassis-1 30
$ chg1_uuid=$(ovn-nbctl --bare --columns _uuid list ha_Chassis_Group .)
$ ovn-nbctl set logical_switch_port lsp2 ha_chassis_group=${chg1_uuid}
$ ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2

Reported-by: Dan Williams <dcbw@redhat.com>
Reported-at: https://bugzilla.redhat.com/1828637
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 northd/ovn-northd.c |    6 +++--
 tests/ovn-northd.at |   56 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 59 insertions(+), 3 deletions(-)

Comments

Numan Siddique May 1, 2020, 11:38 a.m. UTC | #1
On Fri, May 1, 2020 at 12:03 AM Dumitru Ceara <dceara@redhat.com> wrote:

> When generating Port_Binding records ovn-northd tries to reuse the
> tunnel_key value from the original SB record, if any available.
>
> However, there's no check for tunnel_keys that would conflict with
> newly allocated keys for new records. In order to avoid that, we
> don't reuse stale Port_Binding entries, i.e., their "datapath" field
> doesn't match the Datapath_Binding record associated with the
> logical switch/router they're part of.
>
> One way to reproduce the issue is:
> $ ovn-nbctl ls-add ls1
> $ ovn-nbctl ls-add ls2
> $ ovn-nbctl lsp-add ls1 lsp1
> $ ovn-nbctl lsp-add ls2 lsp2
> $ ovn-nbctl --wait=sb sync
> $ ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2
>
> Another option to reproduce the issue is with HA_Chassis_Group:
> $ ovn-nbctl ls-add ls1
> $ ovn-nbctl ls-add ls2
> $ ovn-nbctl lsp-add ls1 lsp1
> $ ovn-nbctl lsp-add ls2 lsp2
> $ ovn-nbctl lsp-set-type lsp2 external
> $ ovn-nbctl ha-chassis-group-add chg1
> $ ovn-nbctl ha-chassis-group-add-chassis chg1 chassis-1 30
> $ chg1_uuid=$(ovn-nbctl --bare --columns _uuid list ha_Chassis_Group .)
> $ ovn-nbctl set logical_switch_port lsp2 ha_chassis_group=${chg1_uuid}
> $ ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2
>
> Reported-by: Dan Williams <dcbw@redhat.com>
> Reported-at: https://bugzilla.redhat.com/1828637
> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>

Acked-by: Numan Siddique <numans@ovn.org>

Thanks
Numan


> ---
>  northd/ovn-northd.c |    6 +++--
>  tests/ovn-northd.at |   56
> +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 59 insertions(+), 3 deletions(-)
>
> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> index de59452..e37b346 100644
> --- a/northd/ovn-northd.c
> +++ b/northd/ovn-northd.c
> @@ -2029,7 +2029,7 @@ join_logical_ports(struct northd_context *ctx,
>                  const struct nbrec_logical_switch_port *nbsp
>                      = od->nbs->ports[i];
>                  struct ovn_port *op = ovn_port_find(ports, nbsp->name);
> -                if (op) {
> +                if (op && op->sb->datapath == od->sb) {
>                      if (op->nbsp || op->nbrp) {
>                          static struct vlog_rate_limit rl
>                              = VLOG_RATE_LIMIT_INIT(5, 1);
> @@ -2123,7 +2123,7 @@ join_logical_ports(struct northd_context *ctx,
>                  }
>
>                  struct ovn_port *op = ovn_port_find(ports, nbrp->name);
> -                if (op) {
> +                if (op && op->sb->datapath == od->sb) {
>                      if (op->nbsp || op->nbrp) {
>                          static struct vlog_rate_limit rl
>                              = VLOG_RATE_LIMIT_INIT(5, 1);
> @@ -2175,7 +2175,7 @@ join_logical_ports(struct northd_context *ctx,
>                      char *redirect_name =
>                          ovn_chassis_redirect_name(nbrp->name);
>                      struct ovn_port *crp = ovn_port_find(ports,
> redirect_name);
> -                    if (crp) {
> +                    if (crp && crp->sb->datapath == od->sb) {
>                          crp->derived = true;
>                          ovn_port_set_nb(crp, NULL, nbrp);
>                          ovs_list_remove(&crp->list);
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index a4469c7..a0109ae 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -1414,3 +1414,59 @@ lr_uuid=$(ovn-nbctl --columns _uuid list
> Logical_Router .)
>  AT_CHECK[test ${nb_uuid} = ${lr_uuid}]
>
>  AT_CLEANUP
> +
> +AT_SETUP([ovn -- check reconcile stale tunnel keys])
> +ovn_start
> +
> +ovn-nbctl ls-add ls1
> +ovn-nbctl ls-add ls2
> +ovn-nbctl lsp-add ls1 lsp1
> +ovn-nbctl lsp-add ls2 lsp2
> +AT_CHECK([ovn-nbctl --wait=sb sync], [0])
> +
> +# Ports are bound on different datapaths so it's expected that they both
> +# get tunnel_key == 1.
> +AT_CHECK([test 1 = $(ovn-sbctl --bare --columns tunnel_key find \
> +port_binding logical_port=lsp1)])
> +AT_CHECK([test 1 = $(ovn-sbctl --bare --columns tunnel_key find \
> +port_binding logical_port=lsp2)])
> +
> +ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2
> +AT_CHECK([ovn-nbctl --wait=sb sync], [0])
> +
> +AT_CHECK([test 1 = $(ovn-sbctl --bare --columns tunnel_key find \
> +port_binding logical_port=lsp1)])
> +AT_CHECK([test 2 = $(ovn-sbctl --bare --columns tunnel_key find \
> +port_binding logical_port=lsp2)])
> +
> +# ovn-northd should allocate a new tunnel_key for lsp1 or lsp2 to maintain
> +# unique DB indices.
> +AT_CHECK([test ${pb1_key} != ${pb2_key}])
> +
> +AT_CLEANUP
> +
> +AT_SETUP([ovn -- check reconcile stale Ha_Chassis_Group])
> +ovn_start
> +
> +ovn-nbctl ls-add ls1
> +ovn-nbctl ls-add ls2
> +ovn-nbctl lsp-add ls1 lsp1
> +ovn-nbctl lsp-add ls2 lsp2
> +
> +ovn-nbctl lsp-set-type lsp2 external
> +
> +ovn-nbctl ha-chassis-group-add chg1
> +ovn-nbctl ha-chassis-group-add-chassis chg1 chassis-1 30
> +
> +chg1_uuid=$(ovn-nbctl --bare --columns _uuid list Ha_Chassis_Group .)
> +ovn-nbctl set logical_switch_port lsp2 ha_chassis_group=${chg1_uuid}
> +AT_CHECK([ovn-nbctl --wait=sb sync], [0])
> +
> +# Move lsp2 from ls2 to ls1. This should also remove the SB
> HA_Chassis_Group
> +# record.
> +ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2
> +AT_CHECK([ovn-nbctl --wait=sb sync], [0])
> +
> +AT_CHECK([test 0 = $(ovn-sbctl list Ha_Chassis_Group | wc -l)])
> +
> +AT_CLEANUP
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
>
Girish Moodalbail May 1, 2020, 3:36 p.m. UTC | #2
Great! We were seeing this issue as well and ovn-northd would just stall
when this occurs. Thanks for fixing Dumitru!

On Thu, Apr 30, 2020 at 11:32 AM Dumitru Ceara <dceara@redhat.com> wrote:

> When generating Port_Binding records ovn-northd tries to reuse the
> tunnel_key value from the original SB record, if any available.
>
> However, there's no check for tunnel_keys that would conflict with
> newly allocated keys for new records. In order to avoid that, we
> don't reuse stale Port_Binding entries, i.e., their "datapath" field
> doesn't match the Datapath_Binding record associated with the
> logical switch/router they're part of.
>
> One way to reproduce the issue is:
> $ ovn-nbctl ls-add ls1
> $ ovn-nbctl ls-add ls2
> $ ovn-nbctl lsp-add ls1 lsp1
> $ ovn-nbctl lsp-add ls2 lsp2
> $ ovn-nbctl --wait=sb sync
> $ ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2
>
> Another option to reproduce the issue is with HA_Chassis_Group:
> $ ovn-nbctl ls-add ls1
> $ ovn-nbctl ls-add ls2
> $ ovn-nbctl lsp-add ls1 lsp1
> $ ovn-nbctl lsp-add ls2 lsp2
> $ ovn-nbctl lsp-set-type lsp2 external
> $ ovn-nbctl ha-chassis-group-add chg1
> $ ovn-nbctl ha-chassis-group-add-chassis chg1 chassis-1 30
> $ chg1_uuid=$(ovn-nbctl --bare --columns _uuid list ha_Chassis_Group .)
> $ ovn-nbctl set logical_switch_port lsp2 ha_chassis_group=${chg1_uuid}
> $ ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2
>
> Reported-by: Dan Williams <dcbw@redhat.com>
> Reported-at: https://bugzilla.redhat.com/1828637
> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
> ---
>  northd/ovn-northd.c |    6 +++--
>  tests/ovn-northd.at |   56
> +++++++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 59 insertions(+), 3 deletions(-)
>
> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> index de59452..e37b346 100644
> --- a/northd/ovn-northd.c
> +++ b/northd/ovn-northd.c
> @@ -2029,7 +2029,7 @@ join_logical_ports(struct northd_context *ctx,
>                  const struct nbrec_logical_switch_port *nbsp
>                      = od->nbs->ports[i];
>                  struct ovn_port *op = ovn_port_find(ports, nbsp->name);
> -                if (op) {
> +                if (op && op->sb->datapath == od->sb) {
>                      if (op->nbsp || op->nbrp) {
>                          static struct vlog_rate_limit rl
>                              = VLOG_RATE_LIMIT_INIT(5, 1);
> @@ -2123,7 +2123,7 @@ join_logical_ports(struct northd_context *ctx,
>                  }
>
>                  struct ovn_port *op = ovn_port_find(ports, nbrp->name);
> -                if (op) {
> +                if (op && op->sb->datapath == od->sb) {
>                      if (op->nbsp || op->nbrp) {
>                          static struct vlog_rate_limit rl
>                              = VLOG_RATE_LIMIT_INIT(5, 1);
> @@ -2175,7 +2175,7 @@ join_logical_ports(struct northd_context *ctx,
>                      char *redirect_name =
>                          ovn_chassis_redirect_name(nbrp->name);
>                      struct ovn_port *crp = ovn_port_find(ports,
> redirect_name);
> -                    if (crp) {
> +                    if (crp && crp->sb->datapath == od->sb) {
>                          crp->derived = true;
>                          ovn_port_set_nb(crp, NULL, nbrp);
>                          ovs_list_remove(&crp->list);
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index a4469c7..a0109ae 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -1414,3 +1414,59 @@ lr_uuid=$(ovn-nbctl --columns _uuid list
> Logical_Router .)
>  AT_CHECK[test ${nb_uuid} = ${lr_uuid}]
>
>  AT_CLEANUP
> +
> +AT_SETUP([ovn -- check reconcile stale tunnel keys])
> +ovn_start
> +
> +ovn-nbctl ls-add ls1
> +ovn-nbctl ls-add ls2
> +ovn-nbctl lsp-add ls1 lsp1
> +ovn-nbctl lsp-add ls2 lsp2
> +AT_CHECK([ovn-nbctl --wait=sb sync], [0])
> +
> +# Ports are bound on different datapaths so it's expected that they both
> +# get tunnel_key == 1.
> +AT_CHECK([test 1 = $(ovn-sbctl --bare --columns tunnel_key find \
> +port_binding logical_port=lsp1)])
> +AT_CHECK([test 1 = $(ovn-sbctl --bare --columns tunnel_key find \
> +port_binding logical_port=lsp2)])
> +
> +ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2
> +AT_CHECK([ovn-nbctl --wait=sb sync], [0])
> +
> +AT_CHECK([test 1 = $(ovn-sbctl --bare --columns tunnel_key find \
> +port_binding logical_port=lsp1)])
> +AT_CHECK([test 2 = $(ovn-sbctl --bare --columns tunnel_key find \
> +port_binding logical_port=lsp2)])
> +
> +# ovn-northd should allocate a new tunnel_key for lsp1 or lsp2 to maintain
> +# unique DB indices.
> +AT_CHECK([test ${pb1_key} != ${pb2_key}])
> +
> +AT_CLEANUP
> +
> +AT_SETUP([ovn -- check reconcile stale Ha_Chassis_Group])
> +ovn_start
> +
> +ovn-nbctl ls-add ls1
> +ovn-nbctl ls-add ls2
> +ovn-nbctl lsp-add ls1 lsp1
> +ovn-nbctl lsp-add ls2 lsp2
> +
> +ovn-nbctl lsp-set-type lsp2 external
> +
> +ovn-nbctl ha-chassis-group-add chg1
> +ovn-nbctl ha-chassis-group-add-chassis chg1 chassis-1 30
> +
> +chg1_uuid=$(ovn-nbctl --bare --columns _uuid list Ha_Chassis_Group .)
> +ovn-nbctl set logical_switch_port lsp2 ha_chassis_group=${chg1_uuid}
> +AT_CHECK([ovn-nbctl --wait=sb sync], [0])
> +
> +# Move lsp2 from ls2 to ls1. This should also remove the SB
> HA_Chassis_Group
> +# record.
> +ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2
> +AT_CHECK([ovn-nbctl --wait=sb sync], [0])
> +
> +AT_CHECK([test 0 = $(ovn-sbctl list Ha_Chassis_Group | wc -l)])
> +
> +AT_CLEANUP
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
diff mbox series

Patch

diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index de59452..e37b346 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -2029,7 +2029,7 @@  join_logical_ports(struct northd_context *ctx,
                 const struct nbrec_logical_switch_port *nbsp
                     = od->nbs->ports[i];
                 struct ovn_port *op = ovn_port_find(ports, nbsp->name);
-                if (op) {
+                if (op && op->sb->datapath == od->sb) {
                     if (op->nbsp || op->nbrp) {
                         static struct vlog_rate_limit rl
                             = VLOG_RATE_LIMIT_INIT(5, 1);
@@ -2123,7 +2123,7 @@  join_logical_ports(struct northd_context *ctx,
                 }
 
                 struct ovn_port *op = ovn_port_find(ports, nbrp->name);
-                if (op) {
+                if (op && op->sb->datapath == od->sb) {
                     if (op->nbsp || op->nbrp) {
                         static struct vlog_rate_limit rl
                             = VLOG_RATE_LIMIT_INIT(5, 1);
@@ -2175,7 +2175,7 @@  join_logical_ports(struct northd_context *ctx,
                     char *redirect_name =
                         ovn_chassis_redirect_name(nbrp->name);
                     struct ovn_port *crp = ovn_port_find(ports, redirect_name);
-                    if (crp) {
+                    if (crp && crp->sb->datapath == od->sb) {
                         crp->derived = true;
                         ovn_port_set_nb(crp, NULL, nbrp);
                         ovs_list_remove(&crp->list);
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index a4469c7..a0109ae 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -1414,3 +1414,59 @@  lr_uuid=$(ovn-nbctl --columns _uuid list Logical_Router .)
 AT_CHECK[test ${nb_uuid} = ${lr_uuid}]
 
 AT_CLEANUP
+
+AT_SETUP([ovn -- check reconcile stale tunnel keys])
+ovn_start
+
+ovn-nbctl ls-add ls1
+ovn-nbctl ls-add ls2
+ovn-nbctl lsp-add ls1 lsp1
+ovn-nbctl lsp-add ls2 lsp2
+AT_CHECK([ovn-nbctl --wait=sb sync], [0])
+
+# Ports are bound on different datapaths so it's expected that they both
+# get tunnel_key == 1.
+AT_CHECK([test 1 = $(ovn-sbctl --bare --columns tunnel_key find \
+port_binding logical_port=lsp1)])
+AT_CHECK([test 1 = $(ovn-sbctl --bare --columns tunnel_key find \
+port_binding logical_port=lsp2)])
+
+ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2
+AT_CHECK([ovn-nbctl --wait=sb sync], [0])
+
+AT_CHECK([test 1 = $(ovn-sbctl --bare --columns tunnel_key find \
+port_binding logical_port=lsp1)])
+AT_CHECK([test 2 = $(ovn-sbctl --bare --columns tunnel_key find \
+port_binding logical_port=lsp2)])
+
+# ovn-northd should allocate a new tunnel_key for lsp1 or lsp2 to maintain
+# unique DB indices.
+AT_CHECK([test ${pb1_key} != ${pb2_key}])
+
+AT_CLEANUP
+
+AT_SETUP([ovn -- check reconcile stale Ha_Chassis_Group])
+ovn_start
+
+ovn-nbctl ls-add ls1
+ovn-nbctl ls-add ls2
+ovn-nbctl lsp-add ls1 lsp1
+ovn-nbctl lsp-add ls2 lsp2
+
+ovn-nbctl lsp-set-type lsp2 external
+
+ovn-nbctl ha-chassis-group-add chg1
+ovn-nbctl ha-chassis-group-add-chassis chg1 chassis-1 30
+
+chg1_uuid=$(ovn-nbctl --bare --columns _uuid list Ha_Chassis_Group .)
+ovn-nbctl set logical_switch_port lsp2 ha_chassis_group=${chg1_uuid}
+AT_CHECK([ovn-nbctl --wait=sb sync], [0])
+
+# Move lsp2 from ls2 to ls1. This should also remove the SB HA_Chassis_Group
+# record.
+ovn-nbctl lsp-del lsp2 -- lsp-add ls1 lsp2
+AT_CHECK([ovn-nbctl --wait=sb sync], [0])
+
+AT_CHECK([test 0 = $(ovn-sbctl list Ha_Chassis_Group | wc -l)])
+
+AT_CLEANUP