[ovs-dev] northd: Add lflows to send all pkts to conntrack if LB is configured on a lswitch.

Message ID 20200907124320.830247-1-numans@ovn.org
State Superseded
Series [ovs-dev] northd: Add lflows to send all pkts to conntrack if LB is configured on a lswitch.

Commit Message

Numan Siddique Sept. 7, 2020, 12:43 p.m. UTC
From: Numan Siddique <numans@ovn.org>

Prior to this patch, if a load balancer is configured on a logical switch but no
ACLs with allow-related are configured, then in the ingress pipeline only the
packets with ip.dst == VIP will be sent to conntrack, using the zone id of the
source logical port.

If the backend of the load balancer sends an invalid packet (for example, one
with an invalid TCP sequence number), then such packets will be delivered to the
source logical port VIF without being unDNATted. This causes the source to reset
the connection.

This patch fixes this issue by sending all the packets to conntrack if a load
balancer is configured on the logical switch. Because of this, any invalid
(ct.inv) packets will be dropped in the ingress pipeline itself.

Unfortunately this impacts performance, as there will now be extra
recirculations because of the ct and ct_commit (for new connections) actions.

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1870359
Reported-by: Tim Rozet <trozet@redhat.com>
Signed-off-by: Numan Siddique <numans@ovn.org>
---
 northd/ovn-northd.8.xml | 14 +++---
 northd/ovn-northd.c     | 99 +++++++++++++++++++++++++++--------------
 2 files changed, 72 insertions(+), 41 deletions(-)
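
With this change, a logical switch that has a load balancer VIP gets a single
catch-all pair of Pre-LB flows instead of one flow per VIP. Schematically (an
illustration derived from the diff below, where reg0[0] is
REGBIT_CONNTRACK_DEFRAG; actual ovn-sbctl lflow-list output also carries table
numbers and flow UUIDs):

    table=ls_in_pre_lb,  priority=100, match=(ip), action=(reg0[0] = 1; next;)
    table=ls_out_pre_lb, priority=100, match=(ip), action=(reg0[0] = 1; next;)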

Comments

Dumitru Ceara Sept. 10, 2020, 3:54 p.m. UTC | #1
On 9/7/20 2:43 PM, numans@ovn.org wrote:
> From: Numan Siddique <numans@ovn.org>
> 
> Prior to this patch, if a load balancer is configured on a logical switch but no
> ACLs with allow-related are configured, then in the ingress pipeline only the
> packets with ip.dst == VIP will be sent to conntrack, using the zone id of the
> source logical port.
>
> If the backend of the load balancer sends an invalid packet (for example, one
> with an invalid TCP sequence number), then such packets will be delivered to the
> source logical port VIF without being unDNATted. This causes the source to reset
> the connection.
>
> This patch fixes this issue by sending all the packets to conntrack if a load
> balancer is configured on the logical switch. Because of this, any invalid
> (ct.inv) packets will be dropped in the ingress pipeline itself.
>
> Unfortunately this impacts performance, as there will now be extra
> recirculations because of the ct and ct_commit (for new connections) actions.
> 
> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1870359
> Reported-by: Tim Rozet <trozet@redhat.com>
> Signed-off-by: Numan Siddique <numans@ovn.org>

Hi Numan,

Overall the patch looks ok to me and sadly I think this is the only way
(send everything to conntrack) to fix the issue reported in the bugzilla.

I do have some minor comments below.

> ---
>  northd/ovn-northd.8.xml | 14 +++---
>  northd/ovn-northd.c     | 99 +++++++++++++++++++++++++++--------------
>  2 files changed, 72 insertions(+), 41 deletions(-)
> 
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index 989e3643b8..89711c5a6c 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -340,15 +340,12 @@
>        it contains a priority-110 flow to move IPv6 Neighbor Discovery traffic
>        to the next table. If load balancing rules with virtual IP addresses
>        (and ports) are configured in <code>OVN_Northbound</code> database for a
> -      logical switch datapath, a priority-100 flow is added for each configured
> -      virtual IP address <var>VIP</var>. For IPv4 <var>VIPs</var>, the match is
> -      <code>ip &amp;&amp; ip4.dst == <var>VIP</var></code>. For IPv6
> -      <var>VIPs</var>, the match is <code>ip &amp;&amp;
> -      ip6.dst == <var>VIP</var></code>. The flow sets an action
> +      logical switch datapath, a priority-100 flow is added with the match
> +      <code>ip</code> to match on IP packets and sets the action
>        <code>reg0[0] = 1; next;</code> to act as a hint for table
>        <code>Pre-stateful</code> to send IP packets to the connection tracker
> -      for packet de-fragmentation before eventually advancing to ingress table
> -      <code>LB</code>.
> +      for packet de-fragmentation before eventually advancing to ingress
> +      table <code>LB</code>.
>        If controller_event has been enabled and load balancing rules with
>        empty backends have been added in <code>OVN_Northbound</code>, a 130 flow
>        is added to trigger ovn-controller events whenever the chassis receives a
> @@ -430,7 +427,8 @@
>      <p>
>        This table also contains a priority 0 flow with action
>        <code>next;</code>, so that ACLs allow packets by default.  If the
> -      logical datapath has a statetful ACL, the following flows will
> +      logical datapath has a statetful ACL or a load balancer with VIP
> +      configured, the following flows will
>        also be added:

Nit: this can go on the line above.

>      </p>
>  
> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> index 3de71612b8..c322558051 100644
> --- a/northd/ovn-northd.c
> +++ b/northd/ovn-northd.c
> @@ -5031,6 +5031,23 @@ build_empty_lb_event_flow(struct ovn_datapath *od, struct hmap *lflows,
>      free(action);
>  }
>  
> +static bool
> +has_lb_vip(struct ovn_datapath *od, struct hmap *lbs)
> +{
> +    for (int i = 0; i < od->nbs->n_load_balancer; i++) {
> +        struct nbrec_load_balancer *nb_lb = od->nbs->load_balancer[i];
> +        struct ovn_lb *lb =
> +            ovn_lb_find(lbs, &nb_lb->header_.uuid);
> +        ovs_assert(lb);
> +
> +        if (lb->n_vips) {
> +            return true;
> +        }
> +    }
> +
> +    return false;
> +}
> +
>  static void
>  build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
>               struct shash *meter_groups, struct hmap *lbs)
> @@ -5069,8 +5086,6 @@ build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
>                                   110, lflows);
>      }
>  
> -    struct sset all_ips_v4 = SSET_INITIALIZER(&all_ips_v4);
> -    struct sset all_ips_v6 = SSET_INITIALIZER(&all_ips_v6);
>      bool vip_configured = false;
>      for (int i = 0; i < od->nbs->n_load_balancer; i++) {
>          struct nbrec_load_balancer *nb_lb = od->nbs->load_balancer[i];
> @@ -5080,12 +5095,6 @@ build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
>  
>          for (size_t j = 0; j < lb->n_vips; j++) {
>              struct lb_vip *lb_vip = &lb->vips[j];
> -            if (lb_vip->addr_family == AF_INET) {
> -                sset_add(&all_ips_v4, lb_vip->vip);
> -            } else {
> -                sset_add(&all_ips_v6, lb_vip->vip);
> -            }
> -
>              build_empty_lb_event_flow(od, lflows, lb_vip, nb_lb,
>                                        S_SWITCH_IN_PRE_LB, meter_groups);
>  
> @@ -5099,26 +5108,37 @@ build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
>      }
>  
>      /* 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table send
> -     * packet to conntrack for defragmentation. */
> -    const char *ip_address;
> -    SSET_FOR_EACH (ip_address, &all_ips_v4) {
> -        char *match = xasprintf("ip && ip4.dst == %s", ip_address);
> -        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
> -                      100, match, REGBIT_CONNTRACK_DEFRAG" = 1; next;");
> -        free(match);
> -    }
> -
> -    SSET_FOR_EACH (ip_address, &all_ips_v6) {
> -        char *match = xasprintf("ip && ip6.dst == %s", ip_address);
> -        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
> -                      100, match, REGBIT_CONNTRACK_DEFRAG" = 1; next;");
> -        free(match);
> -    }
> -
> -    sset_destroy(&all_ips_v4);
> -    sset_destroy(&all_ips_v6);
> -
> +     * packet to conntrack for defragmentation.
> +     *
> +     * Send all the packets to conntrack in the ingress pipeline if the
> +     * logical switch has a load balancer with VIP configured. Earlier
> +     * we used to set the REGBIT_CONNTRACK_DEFRAG flag in the ingress pipeline
> +     * if the IP destination matches the VIP. But this causes a few issues
> +     * when a logical switch has no ACLs configured with allow-related.
> +     * To understand the issue, let's take a TCP load balancer -
> +     * 10.0.0.10:80=10.0.0.3:80.
> +     * If a logical port - p1 with IP - 10.0.0.5 opens a TCP connection with
> +     * the VIP - 10.0.0.10, then the packet in the ingress pipeline of 'p1'
> +     * is sent to p1's conntrack zone id and the packet is load balanced
> +     * to the backend - 10.0.0.3. The reply packet from the backend lport is
> +     * not sent to the conntrack of the backend lport's zone id. This is fine
> +     * as long as the packet is valid. Suppose the backend lport sends an
> +     * invalid TCP packet (like an incorrect sequence number); the packet gets
> +     * delivered to the lport 'p1' without unDNATting it to the
> +     * VIP - 10.0.0.10. This causes the connection to be reset by
> +     * lport p1's VIF.
> +     *

Thanks for thoroughly explaining the problem, it's really helpful!

However, I think this would be more useful somewhere else, maybe in the
man page where we describe the flow that sets REGBIT_CONNTRACK_DEFRAG?

> +     * We can't fix this issue by adding a logical flow to drop ct.inv packets
> +     * in the egress pipeline since it will drop all other connections not
> +     * destined to the load balancers.
> +     *
> +     * To fix this issue, we send all the packets to conntrack in the
> +     * ingress pipeline if a load balancer is configured. We can now
> +     * add an lflow to drop ct.inv packets.
> +     */
>      if (vip_configured) {
> +        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
> +                      100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1; next;");
>          ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB,
>                        100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1; next;");
>      }
> @@ -5477,9 +5497,9 @@ build_port_group_lswitches(struct northd_context *ctx, struct hmap *pgs,
>  
>  static void
>  build_acls(struct ovn_datapath *od, struct hmap *lflows,
> -           struct hmap *port_groups)
> +           struct hmap *port_groups, struct hmap *lbs)
>  {
> -    bool has_stateful = has_stateful_acl(od);
> +    bool has_stateful = has_stateful_acl(od) || has_lb_vip(od, lbs);

Nit: I would add parentheses here:

bool has_stateful = (has_stateful_acl(od) || has_lb_vip(od, lbs));
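
For readers following along, the practical effect of widening has_stateful is
that build_acls() now also installs its stateful flows on switches that only
have a load balancer, including the priority-65535 drop of invalid traffic
described in the man page. Schematically (illustrative; the exact match in the
code may differ):

    table=ls_in_acl, priority=65535, match=(ct.inv), action=(drop;)

This is the flow that finally discards the bad packets coming back from the
backend.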

>  
>      /* Ingress and Egress ACL Table (Priority 0): Packets are allowed by
>       * default.  A related rule at priority 1 is added below if there
> @@ -5736,19 +5756,32 @@ build_qos(struct ovn_datapath *od, struct hmap *lflows) {
>  }
>  
>  static void
> -build_lb(struct ovn_datapath *od, struct hmap *lflows)
> +build_lb(struct ovn_datapath *od, struct hmap *lflows, struct hmap *lbs)
>  {
>      /* Ingress and Egress LB Table (Priority 0): Packets are allowed by
>       * default.  */
>      ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, 0, "1", "next;");
>      ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, 0, "1", "next;");
>  
> -    if (od->nbs->load_balancer) {
> +    if (od->nbs->n_load_balancer) {
>          for (size_t i = 0; i < od->n_router_ports; i++) {
>              skip_port_from_conntrack(od, od->router_ports[i],
>                                       S_SWITCH_IN_LB, S_SWITCH_OUT_LB,
>                                       UINT16_MAX, lflows);
>          }
> +    }
> +
> +    bool vip_configured = false;
> +    for (int i = 0; i < od->nbs->n_load_balancer; i++) {
> +        struct nbrec_load_balancer *nb_lb = od->nbs->load_balancer[i];
> +        struct ovn_lb *lb =
> +            ovn_lb_find(lbs, &nb_lb->header_.uuid);
> +        ovs_assert(lb);
> +
> +        vip_configured = (vip_configured || lb->n_vips);
> +    }
> +
> +    if (vip_configured) {

We should use the new function you added, "has_lb_vip()". And almost the
same code is duplicated in build_pre_lb() where we also call
build_empty_lb_event_flow(). It would be nice to unify this a bit.
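
For instance, something along these lines (an untested sketch on top of this
patch, eliding the unchanged flow additions):

    if (has_lb_vip(od, lbs)) {
        /* Ingress and Egress LB Table (Priority 65534).
         *
         * Send established traffic through conntrack for just NAT. */
        ...
    }

and build_pre_lb() could derive its vip_configured from the same helper.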

Thanks,
Dumitru

>          /* Ingress and Egress LB Table (Priority 65534).
>           *
>           * Send established traffic through conntrack for just NAT. */
> @@ -6622,9 +6655,9 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>          build_pre_acls(od, lflows);
>          build_pre_lb(od, lflows, meter_groups, lbs);
>          build_pre_stateful(od, lflows);
> -        build_acls(od, lflows, port_groups);
> +        build_acls(od, lflows, port_groups, lbs);
>          build_qos(od, lflows);
> -        build_lb(od, lflows);
> +        build_lb(od, lflows, lbs);
>          build_stateful(od, lflows, lbs);
>      }
>  
>
Mark Michelson Sept. 10, 2020, 5:09 p.m. UTC | #2
On 9/7/20 8:43 AM, numans@ovn.org wrote:
> From: Numan Siddique <numans@ovn.org>
> 
> Prior to this patch, if a load balancer is configured on a logical switch but no
> ACLs with allow-related are configured, then in the ingress pipeline only the
> packets with ip.dst == VIP will be sent to conntrack, using the zone id of the
> source logical port.
>
> If the backend of the load balancer sends an invalid packet (for example, one
> with an invalid TCP sequence number), then such packets will be delivered to the
> source logical port VIF without being unDNATted. This causes the source to reset
> the connection.
>
> This patch fixes this issue by sending all the packets to conntrack if a load
> balancer is configured on the logical switch. Because of this, any invalid
> (ct.inv) packets will be dropped in the ingress pipeline itself.
>
> Unfortunately this impacts performance, as there will now be extra
> recirculations because of the ct and ct_commit (for new connections) actions.
> 
> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=1870359
> Reported-by: Tim Rozet <trozet@redhat.com>
> Signed-off-by: Numan Siddique <numans@ovn.org>
> ---
>   northd/ovn-northd.8.xml | 14 +++---
>   northd/ovn-northd.c     | 99 +++++++++++++++++++++++++++--------------
>   2 files changed, 72 insertions(+), 41 deletions(-)
> 
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index 989e3643b8..89711c5a6c 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -340,15 +340,12 @@
>         it contains a priority-110 flow to move IPv6 Neighbor Discovery traffic
>         to the next table. If load balancing rules with virtual IP addresses
>         (and ports) are configured in <code>OVN_Northbound</code> database for a
> -      logical switch datapath, a priority-100 flow is added for each configured
> -      virtual IP address <var>VIP</var>. For IPv4 <var>VIPs</var>, the match is
> -      <code>ip &amp;&amp; ip4.dst == <var>VIP</var></code>. For IPv6
> -      <var>VIPs</var>, the match is <code>ip &amp;&amp;
> -      ip6.dst == <var>VIP</var></code>. The flow sets an action
> +      logical switch datapath, a priority-100 flow is added with the match
> +      <code>ip</code> to match on IP packets and sets the action
>         <code>reg0[0] = 1; next;</code> to act as a hint for table
>         <code>Pre-stateful</code> to send IP packets to the connection tracker
> -      for packet de-fragmentation before eventually advancing to ingress table
> -      <code>LB</code>.
> +      for packet de-fragmentation before eventually advancing to ingress
> +      table <code>LB</code>.
>         If controller_event has been enabled and load balancing rules with
>         empty backends have been added in <code>OVN_Northbound</code>, a 130 flow
>         is added to trigger ovn-controller events whenever the chassis receives a
> @@ -430,7 +427,8 @@
>       <p>
>         This table also contains a priority 0 flow with action
>         <code>next;</code>, so that ACLs allow packets by default.  If the
> -      logical datapath has a statetful ACL, the following flows will
> +      logical datapath has a statetful ACL or a load balancer with VIP
> +      configured, the following flows will
>         also be added:
>       </p>
>   
> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> index 3de71612b8..c322558051 100644
> --- a/northd/ovn-northd.c
> +++ b/northd/ovn-northd.c
> @@ -5031,6 +5031,23 @@ build_empty_lb_event_flow(struct ovn_datapath *od, struct hmap *lflows,
>       free(action);
>   }
>   
> +static bool
> +has_lb_vip(struct ovn_datapath *od, struct hmap *lbs)
> +{
> +    for (int i = 0; i < od->nbs->n_load_balancer; i++) {
> +        struct nbrec_load_balancer *nb_lb = od->nbs->load_balancer[i];
> +        struct ovn_lb *lb =
> +            ovn_lb_find(lbs, &nb_lb->header_.uuid);
> +        ovs_assert(lb);
> +
> +        if (lb->n_vips) {
> +            return true;
> +        }

I'm not sure why you need to look up the ovn_lb here. Why not just
return true if smap_count(&nb_lb->vips) > 0?

lb is guaranteed to be found in the lbs hmap, and lb->n_vips is always 
set to smap_count(&nb_lb->vips) (see ovn_lb_create).
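
Something like this, perhaps (untested; it also drops the 'lbs' parameter,
since only the northbound record is consulted):

    static bool
    has_lb_vip(struct ovn_datapath *od)
    {
        for (size_t i = 0; i < od->nbs->n_load_balancer; i++) {
            if (smap_count(&od->nbs->load_balancer[i]->vips)) {
                return true;
            }
        }
        return false;
    }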

> +    }
> +
> +    return false;
> +}
> +
>   static void
>   build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
>                struct shash *meter_groups, struct hmap *lbs)
> @@ -5069,8 +5086,6 @@ build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
>                                    110, lflows);
>       }
>   
> -    struct sset all_ips_v4 = SSET_INITIALIZER(&all_ips_v4);
> -    struct sset all_ips_v6 = SSET_INITIALIZER(&all_ips_v6);
>       bool vip_configured = false;
>       for (int i = 0; i < od->nbs->n_load_balancer; i++) {
>           struct nbrec_load_balancer *nb_lb = od->nbs->load_balancer[i];
> @@ -5080,12 +5095,6 @@ build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
>   
>           for (size_t j = 0; j < lb->n_vips; j++) {
>               struct lb_vip *lb_vip = &lb->vips[j];
> -            if (lb_vip->addr_family == AF_INET) {
> -                sset_add(&all_ips_v4, lb_vip->vip);
> -            } else {
> -                sset_add(&all_ips_v6, lb_vip->vip);
> -            }
> -
>               build_empty_lb_event_flow(od, lflows, lb_vip, nb_lb,
>                                         S_SWITCH_IN_PRE_LB, meter_groups);
>   
> @@ -5099,26 +5108,37 @@ build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
>       }
>   
>       /* 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table send
> -     * packet to conntrack for defragmentation. */
> -    const char *ip_address;
> -    SSET_FOR_EACH (ip_address, &all_ips_v4) {
> -        char *match = xasprintf("ip && ip4.dst == %s", ip_address);
> -        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
> -                      100, match, REGBIT_CONNTRACK_DEFRAG" = 1; next;");
> -        free(match);
> -    }
> -
> -    SSET_FOR_EACH (ip_address, &all_ips_v6) {
> -        char *match = xasprintf("ip && ip6.dst == %s", ip_address);
> -        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
> -                      100, match, REGBIT_CONNTRACK_DEFRAG" = 1; next;");
> -        free(match);
> -    }
> -
> -    sset_destroy(&all_ips_v4);
> -    sset_destroy(&all_ips_v6);
> -
> +     * packet to conntrack for defragmentation.
> +     *
> +     * Send all the packets to conntrack in the ingress pipeline if the
> +     * logical switch has a load balancer with VIP configured. Earlier
> +     * we used to set the REGBIT_CONNTRACK_DEFRAG flag in the ingress pipeline
> +     * if the IP destination matches the VIP. But this causes a few issues
> +     * when a logical switch has no ACLs configured with allow-related.
> +     * To understand the issue, let's take a TCP load balancer -
> +     * 10.0.0.10:80=10.0.0.3:80.
> +     * If a logical port - p1 with IP - 10.0.0.5 opens a TCP connection with
> +     * the VIP - 10.0.0.10, then the packet in the ingress pipeline of 'p1'
> +     * is sent to p1's conntrack zone id and the packet is load balanced
> +     * to the backend - 10.0.0.3. The reply packet from the backend lport is
> +     * not sent to the conntrack of the backend lport's zone id. This is fine
> +     * as long as the packet is valid. Suppose the backend lport sends an
> +     * invalid TCP packet (like an incorrect sequence number); the packet gets
> +     * delivered to the lport 'p1' without unDNATting it to the
> +     * VIP - 10.0.0.10. This causes the connection to be reset by
> +     * lport p1's VIF.
> +     *
> +     * We can't fix this issue by adding a logical flow to drop ct.inv packets
> +     * in the egress pipeline since it will drop all other connections not
> +     * destined to the load balancers.
> +     *
> +     * To fix this issue, we send all the packets to conntrack in the
> +     * ingress pipeline if a load balancer is configured. We can now
> +     * add an lflow to drop ct.inv packets.
> +     */
>       if (vip_configured) {
> +        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
> +                      100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1; next;");
>           ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB,
>                         100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1; next;");
>       }
> @@ -5477,9 +5497,9 @@ build_port_group_lswitches(struct northd_context *ctx, struct hmap *pgs,
>   
>   static void
>   build_acls(struct ovn_datapath *od, struct hmap *lflows,
> -           struct hmap *port_groups)
> +           struct hmap *port_groups, struct hmap *lbs)
>   {
> -    bool has_stateful = has_stateful_acl(od);
> +    bool has_stateful = has_stateful_acl(od) || has_lb_vip(od, lbs);
>   
>       /* Ingress and Egress ACL Table (Priority 0): Packets are allowed by
>        * default.  A related rule at priority 1 is added below if there
> @@ -5736,19 +5756,32 @@ build_qos(struct ovn_datapath *od, struct hmap *lflows) {
>   }
>   
>   static void
> -build_lb(struct ovn_datapath *od, struct hmap *lflows)
> +build_lb(struct ovn_datapath *od, struct hmap *lflows, struct hmap *lbs)
>   {
>       /* Ingress and Egress LB Table (Priority 0): Packets are allowed by
>        * default.  */
>       ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, 0, "1", "next;");
>       ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, 0, "1", "next;");
>   
> -    if (od->nbs->load_balancer) {
> +    if (od->nbs->n_load_balancer) {
>           for (size_t i = 0; i < od->n_router_ports; i++) {
>               skip_port_from_conntrack(od, od->router_ports[i],
>                                        S_SWITCH_IN_LB, S_SWITCH_OUT_LB,
>                                        UINT16_MAX, lflows);
>           }
> +    }
> +
> +    bool vip_configured = false;
> +    for (int i = 0; i < od->nbs->n_load_balancer; i++) {
> +        struct nbrec_load_balancer *nb_lb = od->nbs->load_balancer[i];
> +        struct ovn_lb *lb =
> +            ovn_lb_find(lbs, &nb_lb->header_.uuid);
> +        ovs_assert(lb);
> +
> +        vip_configured = (vip_configured || lb->n_vips);

As Dumitru stated, you can use the new has_lb_vip() function here, but 
it's another instance where ovn_lb_find() seems unnecessary.

> +    }
> +
> +    if (vip_configured) {
>           /* Ingress and Egress LB Table (Priority 65534).
>            *
>            * Send established traffic through conntrack for just NAT. */
> @@ -6622,9 +6655,9 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>           build_pre_acls(od, lflows);
>           build_pre_lb(od, lflows, meter_groups, lbs);
>           build_pre_stateful(od, lflows);
> -        build_acls(od, lflows, port_groups);
> +        build_acls(od, lflows, port_groups, lbs);
>           build_qos(od, lflows);
> -        build_lb(od, lflows);
> +        build_lb(od, lflows, lbs);
>           build_stateful(od, lflows, lbs);
>       }
>   
>

Patch

diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 989e3643b8..89711c5a6c 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -340,15 +340,12 @@ 
       it contains a priority-110 flow to move IPv6 Neighbor Discovery traffic
       to the next table. If load balancing rules with virtual IP addresses
       (and ports) are configured in <code>OVN_Northbound</code> database for a
-      logical switch datapath, a priority-100 flow is added for each configured
-      virtual IP address <var>VIP</var>. For IPv4 <var>VIPs</var>, the match is
-      <code>ip &amp;&amp; ip4.dst == <var>VIP</var></code>. For IPv6
-      <var>VIPs</var>, the match is <code>ip &amp;&amp;
-      ip6.dst == <var>VIP</var></code>. The flow sets an action
+      logical switch datapath, a priority-100 flow is added with the match
+      <code>ip</code> to match on IP packets and sets the action
       <code>reg0[0] = 1; next;</code> to act as a hint for table
       <code>Pre-stateful</code> to send IP packets to the connection tracker
-      for packet de-fragmentation before eventually advancing to ingress table
-      <code>LB</code>.
+      for packet de-fragmentation before eventually advancing to ingress
+      table <code>LB</code>.
       If controller_event has been enabled and load balancing rules with
       empty backends have been added in <code>OVN_Northbound</code>, a 130 flow
       is added to trigger ovn-controller events whenever the chassis receives a
@@ -430,7 +427,8 @@ 
     <p>
       This table also contains a priority 0 flow with action
       <code>next;</code>, so that ACLs allow packets by default.  If the
-      logical datapath has a statetful ACL, the following flows will
+      logical datapath has a statetful ACL or a load balancer with VIP
+      configured, the following flows will
       also be added:
     </p>
 
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index 3de71612b8..c322558051 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -5031,6 +5031,23 @@  build_empty_lb_event_flow(struct ovn_datapath *od, struct hmap *lflows,
     free(action);
 }
 
+static bool
+has_lb_vip(struct ovn_datapath *od, struct hmap *lbs)
+{
+    for (int i = 0; i < od->nbs->n_load_balancer; i++) {
+        struct nbrec_load_balancer *nb_lb = od->nbs->load_balancer[i];
+        struct ovn_lb *lb =
+            ovn_lb_find(lbs, &nb_lb->header_.uuid);
+        ovs_assert(lb);
+
+        if (lb->n_vips) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
 static void
 build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
              struct shash *meter_groups, struct hmap *lbs)
@@ -5069,8 +5086,6 @@  build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
                                  110, lflows);
     }
 
-    struct sset all_ips_v4 = SSET_INITIALIZER(&all_ips_v4);
-    struct sset all_ips_v6 = SSET_INITIALIZER(&all_ips_v6);
     bool vip_configured = false;
     for (int i = 0; i < od->nbs->n_load_balancer; i++) {
         struct nbrec_load_balancer *nb_lb = od->nbs->load_balancer[i];
@@ -5080,12 +5095,6 @@  build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
 
         for (size_t j = 0; j < lb->n_vips; j++) {
             struct lb_vip *lb_vip = &lb->vips[j];
-            if (lb_vip->addr_family == AF_INET) {
-                sset_add(&all_ips_v4, lb_vip->vip);
-            } else {
-                sset_add(&all_ips_v6, lb_vip->vip);
-            }
-
             build_empty_lb_event_flow(od, lflows, lb_vip, nb_lb,
                                       S_SWITCH_IN_PRE_LB, meter_groups);
 
@@ -5099,26 +5108,37 @@  build_pre_lb(struct ovn_datapath *od, struct hmap *lflows,
     }
 
     /* 'REGBIT_CONNTRACK_DEFRAG' is set to let the pre-stateful table send
-     * packet to conntrack for defragmentation. */
-    const char *ip_address;
-    SSET_FOR_EACH (ip_address, &all_ips_v4) {
-        char *match = xasprintf("ip && ip4.dst == %s", ip_address);
-        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
-                      100, match, REGBIT_CONNTRACK_DEFRAG" = 1; next;");
-        free(match);
-    }
-
-    SSET_FOR_EACH (ip_address, &all_ips_v6) {
-        char *match = xasprintf("ip && ip6.dst == %s", ip_address);
-        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
-                      100, match, REGBIT_CONNTRACK_DEFRAG" = 1; next;");
-        free(match);
-    }
-
-    sset_destroy(&all_ips_v4);
-    sset_destroy(&all_ips_v6);
-
+     * packet to conntrack for defragmentation.
+     *
+     * Send all the packets to conntrack in the ingress pipeline if the
+     * logical switch has a load balancer with VIP configured. Earlier
+     * we used to set the REGBIT_CONNTRACK_DEFRAG flag in the ingress pipeline
+     * if the IP destination matches the VIP. But this causes a few issues
+     * when a logical switch has no ACLs configured with allow-related.
+     * To understand the issue, let's take a TCP load balancer -
+     * 10.0.0.10:80=10.0.0.3:80.
+     * If a logical port - p1 with IP - 10.0.0.5 opens a TCP connection with
+     * the VIP - 10.0.0.10, then the packet in the ingress pipeline of 'p1'
+     * is sent to p1's conntrack zone id and the packet is load balanced
+     * to the backend - 10.0.0.3. The reply packet from the backend lport is
+     * not sent to the conntrack of the backend lport's zone id. This is fine
+     * as long as the packet is valid. Suppose the backend lport sends an
+     * invalid TCP packet (like an incorrect sequence number); the packet gets
+     * delivered to the lport 'p1' without unDNATting it to the
+     * VIP - 10.0.0.10. This causes the connection to be reset by
+     * lport p1's VIF.
+     *
+     * We can't fix this issue by adding a logical flow to drop ct.inv packets
+     * in the egress pipeline since it will drop all other connections not
+     * destined to the load balancers.
+     *
+     * To fix this issue, we send all the packets to conntrack in the
+     * ingress pipeline if a load balancer is configured. We can now
+     * add an lflow to drop ct.inv packets.
+     */
     if (vip_configured) {
+        ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB,
+                      100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1; next;");
         ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB,
                       100, "ip", REGBIT_CONNTRACK_DEFRAG" = 1; next;");
     }
@@ -5477,9 +5497,9 @@  build_port_group_lswitches(struct northd_context *ctx, struct hmap *pgs,
 
 static void
 build_acls(struct ovn_datapath *od, struct hmap *lflows,
-           struct hmap *port_groups)
+           struct hmap *port_groups, struct hmap *lbs)
 {
-    bool has_stateful = has_stateful_acl(od);
+    bool has_stateful = has_stateful_acl(od) || has_lb_vip(od, lbs);
 
     /* Ingress and Egress ACL Table (Priority 0): Packets are allowed by
      * default.  A related rule at priority 1 is added below if there
@@ -5736,19 +5756,32 @@  build_qos(struct ovn_datapath *od, struct hmap *lflows) {
 }
 
 static void
-build_lb(struct ovn_datapath *od, struct hmap *lflows)
+build_lb(struct ovn_datapath *od, struct hmap *lflows, struct hmap *lbs)
 {
     /* Ingress and Egress LB Table (Priority 0): Packets are allowed by
      * default.  */
     ovn_lflow_add(lflows, od, S_SWITCH_IN_LB, 0, "1", "next;");
     ovn_lflow_add(lflows, od, S_SWITCH_OUT_LB, 0, "1", "next;");
 
-    if (od->nbs->load_balancer) {
+    if (od->nbs->n_load_balancer) {
         for (size_t i = 0; i < od->n_router_ports; i++) {
             skip_port_from_conntrack(od, od->router_ports[i],
                                      S_SWITCH_IN_LB, S_SWITCH_OUT_LB,
                                      UINT16_MAX, lflows);
         }
+    }
+
+    bool vip_configured = false;
+    for (int i = 0; i < od->nbs->n_load_balancer; i++) {
+        struct nbrec_load_balancer *nb_lb = od->nbs->load_balancer[i];
+        struct ovn_lb *lb =
+            ovn_lb_find(lbs, &nb_lb->header_.uuid);
+        ovs_assert(lb);
+
+        vip_configured = (vip_configured || lb->n_vips);
+    }
+
+    if (vip_configured) {
         /* Ingress and Egress LB Table (Priority 65534).
          *
          * Send established traffic through conntrack for just NAT. */
@@ -6622,9 +6655,9 @@  build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
         build_pre_acls(od, lflows);
         build_pre_lb(od, lflows, meter_groups, lbs);
         build_pre_stateful(od, lflows);
-        build_acls(od, lflows, port_groups);
+        build_acls(od, lflows, port_groups, lbs);
         build_qos(od, lflows);
-        build_lb(od, lflows);
+        build_lb(od, lflows, lbs);
         build_stateful(od, lflows, lbs);
     }