diff mbox series

[ovs-dev] northd: bypass connection tracking for stateless flows when there are LB flows present

Message ID 20221108155044.23112-1-venugopali@nvidia.com
State Changes Requested
Delegated to: Numan Siddique
Headers show
Series [ovs-dev] northd: bypass connection tracking for stateless flows when there are LB flows present | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_ovn-kubernetes success github build: passed
ovsrobot/github-robot-_Build_and_Test fail github build: failed

Commit Message

Venugopal Iyer Nov. 8, 2022, 3:50 p.m. UTC
Currently, even stateless flows are subject to connection tracking when there are
LB rules (for DNAT). However, if a flow needs to be subjected to LB, then it shouldn't
be configured as stateless.

A stateless flow means we should not track it, and this change exempts stateless
flows from being tracked regardless of whether LB rules are present or not.

Signed-off-by: venu.iyer <venugopali@nvidia.com>
Acked-by: Han Zhou <hzhou@ovn.org>
---
 northd/northd.c         | 24 +++++++++++++-----
 northd/ovn-northd.8.xml | 56 ++++++++++++++++++++++-------------------
 ovn-nb.xml              |  3 +++
 tests/ovn-northd.at     | 48 ++++++++++++-----------------------
 tests/ovn.at            |  4 +--
 5 files changed, 69 insertions(+), 66 deletions(-)

Comments

Han Zhou Nov. 9, 2022, 9:11 p.m. UTC | #1
On Tue, Nov 8, 2022 at 7:51 AM venu.iyer <venugopali@nvidia.com> wrote:
>
> Currently, even stateless flows are subject to connection tracking when
there are
> LB rules (for DNAT). However, if a flow needs to be subjected to LB, then
it shouldn't
> be configured as stateless.
>
> A stateless flow means we should not track it, and this change exempts
stateless
> flows from being tracked regardless of whether LB rules are present or
not.
>
> Signed-off-by: venu.iyer <venugopali@nvidia.com>
> Acked-by: Han Zhou <hzhou@ovn.org>
> ---
>  northd/northd.c         | 24 +++++++++++++-----
>  northd/ovn-northd.8.xml | 56 ++++++++++++++++++++++-------------------
>  ovn-nb.xml              |  3 +++
>  tests/ovn-northd.at     | 48 ++++++++++++-----------------------
>  tests/ovn.at            |  4 +--
>  5 files changed, 69 insertions(+), 66 deletions(-)
>
> diff --git a/northd/northd.c b/northd/northd.c
> index b7388afc5..da4beede6 100644
> --- a/northd/northd.c
> +++ b/northd/northd.c
> @@ -137,8 +137,8 @@ enum ovn_stage {
>      PIPELINE_STAGE(SWITCH, IN,  L2_UNKNOWN,    24, "ls_in_l2_unknown")
 \
>
 \
>      /* Logical switch egress stages. */
  \
> -    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       0, "ls_out_pre_lb")
  \
> -    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      1, "ls_out_pre_acl")
 \
> +    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      0, "ls_out_pre_acl")
 \
> +    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       1, "ls_out_pre_lb")
  \
>      PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful")
  \
>      PIPELINE_STAGE(SWITCH, OUT, ACL_HINT,     3, "ls_out_acl_hint")
  \
>      PIPELINE_STAGE(SWITCH, OUT, ACL,          4, "ls_out_acl")
 \
> @@ -210,6 +210,7 @@ enum ovn_stage {
>  #define REGBIT_ACL_LABEL          "reg0[13]"
>  #define REGBIT_FROM_RAMP          "reg0[14]"
>  #define REGBIT_PORT_SEC_DROP      "reg0[15]"
> +#define REGBIT_ACL_STATELESS      "reg0[16]"
>
>  #define REG_ORIG_DIP_IPV4         "reg1"
>  #define REG_ORIG_DIP_IPV6         "xxreg1"
> @@ -271,7 +272,7 @@ enum ovn_stage {
>   * | R0 |     REGBIT_{CONNTRACK/DHCP/DNS}              |   |
     |
>   * |    |     REGBIT_{HAIRPIN/HAIRPIN_REPLY}           |   |
     |
>   * |    | REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} |   |
     |
> - * |    |     REGBIT_ACL_LABEL                         | X |
     |
> + * |    |     REGBIT_ACL_{LABEL/STATELESS}             | X |
     |
>   * +----+----------------------------------------------+ X |
     |
>   * | R1 |         ORIG_DIP_IPV4 (>= IN_PRE_STATEFUL)   | R |
     |
>   * +----+----------------------------------------------+ E |
     |
> @@ -5677,17 +5678,18 @@ build_stateless_filter(struct ovn_datapath *od,
>                         const struct nbrec_acl *acl,
>                         struct hmap *lflows)
>  {
> +    const char *action = REGBIT_ACL_STATELESS" = 1; next;";
>      if (!strcmp(acl->direction, "from-lport")) {
>          ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_PRE_ACL,
>                                  acl->priority + OVN_ACL_PRI_OFFSET,
>                                  acl->match,
> -                                "next;",
> +                                action,
>                                  &acl->header_);
>      } else {
>          ovn_lflow_add_with_hint(lflows, od, S_SWITCH_OUT_PRE_ACL,
>                                  acl->priority + OVN_ACL_PRI_OFFSET,
>                                  acl->match,
> -                                "next;",
> +                                action,
>                                  &acl->header_);
>      }
>  }
> @@ -5779,6 +5781,10 @@ build_pre_acls(struct ovn_datapath *od, const
struct hmap *port_groups,
>                        REGBIT_CONNTRACK_DEFRAG" = 1; next;");
>          ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 100, "ip",
>                        REGBIT_CONNTRACK_DEFRAG" = 1; next;");
> +    } else if (od->has_lb_vip) {
> +        /* We'll build stateless filters if there are LB rules so that
> +         * the stateless flows are not tracked in pre-lb. */
> +        build_stateless_filters(od, port_groups, lflows);
>      }
>  }
>
> @@ -5913,6 +5919,11 @@ build_pre_lb(struct ovn_datapath *od, const struct
shash *meter_groups,
>                                   S_SWITCH_IN_PRE_LB, S_SWITCH_OUT_PRE_LB,
>                                   110, lflows);
>      }
> +    /* Do not sent statless flows via conntrack */
> +    ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
> +                  REGBIT_ACL_STATELESS" == 1", "next;");
> +    ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 110,
> +                  REGBIT_ACL_STATELESS" == 1", "next;");
>
>      /* 'REGBIT_CONNTRACK_NAT' is set to let the pre-stateful table send
>       * packet to conntrack for defragmentation and possibly for
unNATting.
> @@ -6918,7 +6929,8 @@ build_lb_rules_pre_stateful(struct hmap *lflows,
struct ovn_northd_lb *lb,
>          }
>          ds_put_format(action, "%s;", ct_lb_mark ? "ct_lb_mark" :
"ct_lb");
>
> -        ds_put_format(match, "%s.dst == %s", ip_match, lb_vip->vip_str);
> +        ds_put_format(match, REGBIT_CONNTRACK_NAT" == 1 && %s.dst == %s",
> +                      ip_match, lb_vip->vip_str);
>          if (lb_vip->vip_port) {
>              ds_put_format(match, " && %s.dst == %d", proto,
lb_vip->vip_port);
>          }
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index a70f2e678..162ec2b3b 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -440,7 +440,9 @@
>        priority-110 flow is added to skip over stateful ACLs. Multicast,
IPv6
>        Neighbor Discovery and MLD traffic also skips stateful ACLs. For
>        "allow-stateless" ACLs, a flow is added to bypass setting the hint
for
> -      connection tracker processing.
> +      connection tracker processing when there are statelful ACLs or LB
rules;
> +      <code>REGBIT_ACL_STATELESS</code> is set for traffic matching such
> +      flows for this purpose.
>      </p>
>
>      <p>
> @@ -460,8 +462,10 @@
>        in ingress table <code>LB</code> and <code>Stateful</code>.  It
contains
>        a priority-0 flow that simply moves traffic to the next table.
Moreover
>        it contains two priority-110 flows to move multicast, IPv6 Neighbor
> -      Discovery and MLD traffic to the next table. If load balancing
rules with
> -      virtual IP addresses (and ports) are configured in
> +      Discovery and MLD traffic to the next table. It also contains two
> +      priority-110 flows to move stateless traffic, i.e traffic for which
> +      <code>REGBIT_ACL_STATELESS</code> is set, to the next table. If
load
> +      balancing rules with virtual IP addresses (and ports) are
configured in
>        <code>OVN_Northbound</code> database for a logical switch
datapath, a
>        priority-100 flow is added with the match <code>ip</code> to match
on IP
>        packets and sets the action <code>reg0[2] = 1; next;</code> to act
as a
> @@ -1859,19 +1863,11 @@ output;
>        </li>
>      </ul>
>
> -    <h3>Egress Table 0: Pre-LB</h3>
> +    <h3>Egress Table 0: <code>to-lport</code> Pre-ACLs</h3>
>
>      <p>
> -      This table is similar to ingress table <code>Pre-LB</code>.  It
> -      contains a priority-0 flow that simply moves traffic to the next
table.
> -      Moreover it contains two priority-110 flows to move multicast, IPv6
> -      Neighbor Discovery and MLD traffic to the next table. If any load
> -      balancing rules exist for the datapath, a priority-100 flow is
added with
> -      a match of <code>ip</code> and action of <code>reg0[2] = 1;
next;</code>
> -      to act as a hint for table <code>Pre-stateful</code> to send IP
packets
> -      to the connection tracker for packet de-fragmentation and possibly
DNAT
> -      the destination VIP to one of the selected backend for already
committed
> -      load balanced traffic.
> +      This is similar to ingress table <code>Pre-ACLs</code> except for
> +     <code>to-lport</code> traffic.
>      </p>
>
>      <p>
> @@ -1884,11 +1880,28 @@ output;
>        db="OVN_Northbound"/> table.
>      </p>
>
> -    <h3>Egress Table 1: <code>to-lport</code> Pre-ACLs</h3>
> +    <p>
> +      This table also has a priority-110 flow with the match
> +      <code>outport == <var>I</var></code> for all logical switch
> +      datapaths to move traffic to the next table. Where <var>I</var>
> +      is the peer of a logical router port. This flow is added to
> +      skip the connection tracking of packets which will be entering
> +      logical router datapath from logical switch datapath for routing.
> +    </p>
> +
> +    <h3>Egress Table 1: Pre-LB</h3>
>
>      <p>
> -      This is similar to ingress table <code>Pre-ACLs</code> except for
> -     <code>to-lport</code> traffic.
> +      This table is similar to ingress table <code>Pre-LB</code>.  It
> +      contains a priority-0 flow that simply moves traffic to the next
table.
> +      Moreover it contains two priority-110 flows to move multicast, IPv6
> +      Neighbor Discovery and MLD traffic to the next table. If any load
> +      balancing rules exist for the datapath, a priority-100 flow is
added with
> +      a match of <code>ip</code> and action of <code>reg0[2] = 1;
next;</code>
> +      to act as a hint for table <code>Pre-stateful</code> to send IP
packets
> +      to the connection tracker for packet de-fragmentation and possibly
DNAT
> +      the destination VIP to one of the selected backend for already
committed
> +      load balanced traffic.
>      </p>
>
>      <p>
> @@ -1901,15 +1914,6 @@ output;
>        db="OVN_Northbound"/> table.
>      </p>
>
> -    <p>
> -      This table also has a priority-110 flow with the match
> -      <code>outport == <var>I</var></code> for all logical switch
> -      datapaths to move traffic to the next table. Where <var>I</var>
> -      is the peer of a logical router port. This flow is added to
> -      skip the connection tracking of packets which will be entering
> -      logical router datapath from logical switch datapath for routing.
> -    </p>
> -
>      <h3>Egress Table 2: Pre-stateful</h3>
>
>      <p>
> diff --git a/ovn-nb.xml b/ovn-nb.xml
> index f41e9d7c0..140dd9a4f 100644
> --- a/ovn-nb.xml
> +++ b/ovn-nb.xml
> @@ -2063,6 +2063,9 @@
>            outgoing TCP traffic directed to an IP address, then you
probably
>            also want to define another rule to allow incoming TCP traffic
coming
>            from this same IP address.
> +          In addition, traffic that matches stateless ACLs will bypass
> +          load-balancer DNAT/un-DNAT processing. Stateful ACLs should be
> +          used instead if the traffic is supposed to be load-balanced.
>          </li>
>
>          <li>
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index 4f399eccb..85d5acfed 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -2056,27 +2056,27 @@ check ovn-nbctl ls-lb-add sw0 lb1
>  check ovn-nbctl add load_balancer_group $lbg load_balancer $lb3
>  check ovn-nbctl --wait=sb sync
>  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
grep reg0 | sort], [0], [dnl
> -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
> +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
>  ])
>
>  check ovn-nbctl ls-lb-add sw0 lb2
>  check ovn-nbctl add load_balancer_group $lbg load_balancer $lb4
>  check ovn-nbctl --wait=sb sync
>  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
grep reg0 | sort], [0], [dnl
> -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
> +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
>  ])
>
>  check ovn-nbctl clear load_balancer $lb1 vips
>  check ovn-nbctl clear load_balancer $lb3 vips
>  check ovn-nbctl --wait=sb sync
>  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
grep reg0 | sort], [0], [dnl
> -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
> +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
>  ])
>
>  check ovn-nbctl clear load_balancer $lb2 vips
>  check ovn-nbctl --wait=sb sync
>  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
grep reg0 | sort], [0], [dnl
> -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
> +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
>  ])
>
>  check ovn-nbctl clear load_balancer $lb4 vips
> @@ -2091,7 +2091,7 @@ check ovn-nbctl set load_balancer $lb4
vips:"10.0.0.13"="10.0.0.6"
>
>  check ovn-nbctl --wait=sb sync
>  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
grep reg0 | sort], [0], [dnl
> -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
> +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
>  ])
>
>  # Now reverse the order of clearing the vip.
> @@ -2099,13 +2099,13 @@ check ovn-nbctl clear load_balancer $lb2 vips
>  check ovn-nbctl clear load_balancer $lb4 vips
>  check ovn-nbctl --wait=sb sync
>  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
grep reg0 | sort], [0], [dnl
> -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
> +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
>  ])
>
>  check ovn-nbctl clear load_balancer $lb1 vips
>  check ovn-nbctl --wait=sb sync
>  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
grep reg0 | sort], [0], [dnl
> -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
> +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
action=(reg0[[2]] = 1; next;)
>  ])
>
>  check ovn-nbctl clear load_balancer $lb3 vips
> @@ -3044,18 +3044,10 @@ for direction in from to; do
>  done
>  ovn-nbctl --wait=sb sync
>
> -# TCP packets should go to conntrack for load balancing.
> +# TCP packets should not go to conntrack for load balancing.
>  flow="inport == \"lsp1\" && ${flow_eth} && ${flow_ip} && ${flow_tcp}"
>  AT_CHECK_UNQUOTED([ovn_trace --ct new --ct new --minimal ls "${flow}"],
[0], [dnl
> -ct_lb_mark {
> -    ct_lb_mark {
> -        reg0[[6]] = 0;
> -        reg0[[12]] = 0;
> -        ct_lb_mark /* default (use --ct to customize) */ {
> -            output("lsp2");
> -        };
> -    };
> -};
> +output("lsp2");
>  ])
>
>  # UDP packets still go to conntrack.
> @@ -3188,18 +3180,10 @@ for direction in from to; do
>  done
>  ovn-nbctl --wait=sb sync
>
> -# TCP packets should go to conntrack for load balancing.
> +# TCP packets should not go to conntrack for load balancing.
>  flow="inport == \"lsp1\" && ${flow_eth} && ${flow_ip} && ${flow_tcp}"
>  AT_CHECK_UNQUOTED([ovn_trace --ct new --ct new --minimal ls "${flow}"],
[0], [dnl
> -ct_lb_mark {
> -    ct_lb_mark {
> -        reg0[[6]] = 0;
> -        reg0[[12]] = 0;
> -        ct_lb_mark /* default (use --ct to customize) */ {
> -            output("lsp2");
> -        };
> -    };
> -};
> +output("lsp2");
>  ])
>
>  # UDP packets still go to conntrack.
> @@ -4015,8 +3999,8 @@ check_stateful_flows() {
>    table=? (ls_in_pre_stateful ), priority=0    , match=(1),
action=(next;)
>    table=? (ls_in_pre_stateful ), priority=100  , match=(reg0[[0]] == 1),
action=(ct_next;)
>    table=? (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
action=(ct_lb_mark;)
> -  table=? (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80;
ct_lb_mark;)
> -  table=? (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
10.0.0.20 && tcp.dst == 80), action=(reg1 = 10.0.0.20; reg2[[0..15]] = 80;
ct_lb_mark;)
> +  table=? (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
&& ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
reg2[[0..15]] = 80; ct_lb_mark;)
> +  table=? (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
&& ip4.dst == 10.0.0.20 && tcp.dst == 80), action=(reg1 = 10.0.0.20;
reg2[[0..15]] = 80; ct_lb_mark;)
>  ])
>
>      AT_CHECK([grep "ls_in_lb" sw0flows | sort | sed
's/table=../table=??/'], [0], [dnl
> @@ -7650,7 +7634,7 @@ check ovn-nbctl --wait=sb sync
>  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
>    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
reg0 == 66.66.66.66 && ct_mark.natted == 1), action=(next;)
>    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
reg0 == 66.66.66.66), action=(ct_lb_mark(backends=42.42.42.2);)
> -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
&& ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
>    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
action=(ct_lb_mark;)
>    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0;
ct_lb_mark(backends=42.42.42.2);)
>    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
action=(ct_lb_mark;)
> @@ -7662,7 +7646,7 @@ check ovn-nbctl --wait=sb sync
>  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
>    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
reg0 == 66.66.66.66 && ct_label.natted == 1), action=(next;)
>    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
reg0 == 66.66.66.66), action=(ct_lb(backends=42.42.42.2);)
> -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb;)
> +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
&& ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb;)
>    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
action=(ct_lb;)
>    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0; ct_lb(backends=42.42.42.2);)
>    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
action=(ct_lb;)
> @@ -7674,7 +7658,7 @@ check ovn-nbctl --wait=sb sync
>  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
>    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
reg0 == 66.66.66.66 && ct_mark.natted == 1), action=(next;)
>    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
reg0 == 66.66.66.66), action=(ct_lb_mark(backends=42.42.42.2);)
> -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
&& ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
>    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
action=(ct_lb_mark;)
>    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0;
ct_lb_mark(backends=42.42.42.2);)
>    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
action=(ct_lb_mark;)
> diff --git a/tests/ovn.at b/tests/ovn.at
> index f8b8db4df..f43455f60 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -23656,7 +23656,7 @@ OVS_WAIT_FOR_OUTPUT(
>    [ovn-sbctl dump-flows > sbflows
>     ovn-sbctl dump-flows sw0 | grep ct_lb_mark | grep priority=120 | sed
's/table=..//'], 0,
>    [dnl
> -  (ls_in_pre_stateful ), priority=120  , match=(ip4.dst == 10.0.0.10 &&
tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80; ct_lb_mark;)
> +  (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1 &&
ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
reg2[[0..15]] = 80; ct_lb_mark;)
>    (ls_in_lb           ), priority=120  , match=(ct.new && ip4.dst ==
10.0.0.10 && tcp.dst == 80), action=(reg0[[1]] = 0; ct_lb_mark(backends=
10.0.0.3:80,20.0.0.3:80; hash_fields="ip_dst,ip_src,tcp_dst,tcp_src");)
>  ])
>
> @@ -23699,7 +23699,7 @@ ovn-sbctl dump-flows sw0 > sbflows3
>  AT_CHECK(
>    [grep "ip4.dst == 10.0.0.10 && tcp.dst == 80" sbflows3 | grep
priority=120 |\
>     sed 's/table=../table=??/'], [0], [dnl
> -  table=??(ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80;
ct_lb_mark;)
> +  table=??(ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
&& ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
reg2[[0..15]] = 80; ct_lb_mark;)
>    table=??(ls_in_lb           ), priority=120  , match=(ct.new &&
ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(drop;)
>  ])
>
> --
> 2.17.1
>

Thanks Venu. I have reviewed this with Venu internally and gave my Ack, but
it would be good if folks can take another look before merging.
Numan, since you were involved in an earlier discussion, probably you want
to take a look?
I think there are two things to pay attention to:
1. Switching the order of PRE_ACL and PRE_LB in the LS egress pipeline. I
don't see any problem here but not sure if there was any specific reason
behind the original order.
2. If there are VIP traffic matching stateless ACL, they won't work any
more. We think that is ok, because we can't think of a use case that would
add stateless ACLs for VIP traffic because this combination doesn't make
any sense. If such traffic was allowed with CT, it could actually hide a
mistake made by the user - e.g. the intention was to apply stateless ACLs
to bypass CT for certain traffic that was not related to VIP, but by
mistake the match condition added to the ACL touched some VIP traffic. So,
we think the right thing to do is to let it fail when such
*misconfiguration* happens.

Let me know if you have any concerns about the above points.

Thanks,
Han
Numan Siddique Nov. 9, 2022, 9:15 p.m. UTC | #2
On Wed, Nov 9, 2022 at 4:11 PM Han Zhou <hzhou@ovn.org> wrote:
>
> On Tue, Nov 8, 2022 at 7:51 AM venu.iyer <venugopali@nvidia.com> wrote:
> >
> > Currently, even stateless flows are subject to connection tracking when
> there are
> > LB rules (for DNAT). However, if a flow needs to be subjected to LB, then
> it shouldn't
> > be configured as stateless.
> >
> > A stateless flow means we should not track it, and this change exempts
> stateless
> > flows from being tracked regardless of whether LB rules are present or
> not.
> >
> > Signed-off-by: venu.iyer <venugopali@nvidia.com>
> > Acked-by: Han Zhou <hzhou@ovn.org>
> > ---
> >  northd/northd.c         | 24 +++++++++++++-----
> >  northd/ovn-northd.8.xml | 56 ++++++++++++++++++++++-------------------
> >  ovn-nb.xml              |  3 +++
> >  tests/ovn-northd.at     | 48 ++++++++++++-----------------------
> >  tests/ovn.at            |  4 +--
> >  5 files changed, 69 insertions(+), 66 deletions(-)
> >
> > diff --git a/northd/northd.c b/northd/northd.c
> > index b7388afc5..da4beede6 100644
> > --- a/northd/northd.c
> > +++ b/northd/northd.c
> > @@ -137,8 +137,8 @@ enum ovn_stage {
> >      PIPELINE_STAGE(SWITCH, IN,  L2_UNKNOWN,    24, "ls_in_l2_unknown")
>  \
> >
>  \
> >      /* Logical switch egress stages. */
>   \
> > -    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       0, "ls_out_pre_lb")
>   \
> > -    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      1, "ls_out_pre_acl")
>  \
> > +    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      0, "ls_out_pre_acl")
>  \
> > +    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       1, "ls_out_pre_lb")
>   \
> >      PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful")
>   \
> >      PIPELINE_STAGE(SWITCH, OUT, ACL_HINT,     3, "ls_out_acl_hint")
>   \
> >      PIPELINE_STAGE(SWITCH, OUT, ACL,          4, "ls_out_acl")
>  \
> > @@ -210,6 +210,7 @@ enum ovn_stage {
> >  #define REGBIT_ACL_LABEL          "reg0[13]"
> >  #define REGBIT_FROM_RAMP          "reg0[14]"
> >  #define REGBIT_PORT_SEC_DROP      "reg0[15]"
> > +#define REGBIT_ACL_STATELESS      "reg0[16]"
> >
> >  #define REG_ORIG_DIP_IPV4         "reg1"
> >  #define REG_ORIG_DIP_IPV6         "xxreg1"
> > @@ -271,7 +272,7 @@ enum ovn_stage {
> >   * | R0 |     REGBIT_{CONNTRACK/DHCP/DNS}              |   |
>      |
> >   * |    |     REGBIT_{HAIRPIN/HAIRPIN_REPLY}           |   |
>      |
> >   * |    | REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} |   |
>      |
> > - * |    |     REGBIT_ACL_LABEL                         | X |
>      |
> > + * |    |     REGBIT_ACL_{LABEL/STATELESS}             | X |
>      |
> >   * +----+----------------------------------------------+ X |
>      |
> >   * | R1 |         ORIG_DIP_IPV4 (>= IN_PRE_STATEFUL)   | R |
>      |
> >   * +----+----------------------------------------------+ E |
>      |
> > @@ -5677,17 +5678,18 @@ build_stateless_filter(struct ovn_datapath *od,
> >                         const struct nbrec_acl *acl,
> >                         struct hmap *lflows)
> >  {
> > +    const char *action = REGBIT_ACL_STATELESS" = 1; next;";
> >      if (!strcmp(acl->direction, "from-lport")) {
> >          ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_PRE_ACL,
> >                                  acl->priority + OVN_ACL_PRI_OFFSET,
> >                                  acl->match,
> > -                                "next;",
> > +                                action,
> >                                  &acl->header_);
> >      } else {
> >          ovn_lflow_add_with_hint(lflows, od, S_SWITCH_OUT_PRE_ACL,
> >                                  acl->priority + OVN_ACL_PRI_OFFSET,
> >                                  acl->match,
> > -                                "next;",
> > +                                action,
> >                                  &acl->header_);
> >      }
> >  }
> > @@ -5779,6 +5781,10 @@ build_pre_acls(struct ovn_datapath *od, const
> struct hmap *port_groups,
> >                        REGBIT_CONNTRACK_DEFRAG" = 1; next;");
> >          ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 100, "ip",
> >                        REGBIT_CONNTRACK_DEFRAG" = 1; next;");
> > +    } else if (od->has_lb_vip) {
> > +        /* We'll build stateless filters if there are LB rules so that
> > +         * the stateless flows are not tracked in pre-lb. */
> > +        build_stateless_filters(od, port_groups, lflows);
> >      }
> >  }
> >
> > @@ -5913,6 +5919,11 @@ build_pre_lb(struct ovn_datapath *od, const struct
> shash *meter_groups,
> >                                   S_SWITCH_IN_PRE_LB, S_SWITCH_OUT_PRE_LB,
> >                                   110, lflows);
> >      }
> > +    /* Do not sent statless flows via conntrack */
> > +    ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
> > +                  REGBIT_ACL_STATELESS" == 1", "next;");
> > +    ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 110,
> > +                  REGBIT_ACL_STATELESS" == 1", "next;");
> >
> >      /* 'REGBIT_CONNTRACK_NAT' is set to let the pre-stateful table send
> >       * packet to conntrack for defragmentation and possibly for
> unNATting.
> > @@ -6918,7 +6929,8 @@ build_lb_rules_pre_stateful(struct hmap *lflows,
> struct ovn_northd_lb *lb,
> >          }
> >          ds_put_format(action, "%s;", ct_lb_mark ? "ct_lb_mark" :
> "ct_lb");
> >
> > -        ds_put_format(match, "%s.dst == %s", ip_match, lb_vip->vip_str);
> > +        ds_put_format(match, REGBIT_CONNTRACK_NAT" == 1 && %s.dst == %s",
> > +                      ip_match, lb_vip->vip_str);
> >          if (lb_vip->vip_port) {
> >              ds_put_format(match, " && %s.dst == %d", proto,
> lb_vip->vip_port);
> >          }
> > diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> > index a70f2e678..162ec2b3b 100644
> > --- a/northd/ovn-northd.8.xml
> > +++ b/northd/ovn-northd.8.xml
> > @@ -440,7 +440,9 @@
> >        priority-110 flow is added to skip over stateful ACLs. Multicast,
> IPv6
> >        Neighbor Discovery and MLD traffic also skips stateful ACLs. For
> >        "allow-stateless" ACLs, a flow is added to bypass setting the hint
> for
> > -      connection tracker processing.
> > +      connection tracker processing when there are statelful ACLs or LB
> rules;
> > +      <code>REGBIT_ACL_STATELESS</code> is set for traffic matching such
> > +      flows for this purpose.
> >      </p>
> >
> >      <p>
> > @@ -460,8 +462,10 @@
> >        in ingress table <code>LB</code> and <code>Stateful</code>.  It
> contains
> >        a priority-0 flow that simply moves traffic to the next table.
> Moreover
> >        it contains two priority-110 flows to move multicast, IPv6 Neighbor
> > -      Discovery and MLD traffic to the next table. If load balancing
> rules with
> > -      virtual IP addresses (and ports) are configured in
> > +      Discovery and MLD traffic to the next table. It also contains two
> > +      priority-110 flows to move stateless traffic, i.e traffic for which
> > +      <code>REGBIT_ACL_STATELESS</code> is set, to the next table. If
> load
> > +      balancing rules with virtual IP addresses (and ports) are
> configured in
> >        <code>OVN_Northbound</code> database for a logical switch
> datapath, a
> >        priority-100 flow is added with the match <code>ip</code> to match
> on IP
> >        packets and sets the action <code>reg0[2] = 1; next;</code> to act
> as a
> > @@ -1859,19 +1863,11 @@ output;
> >        </li>
> >      </ul>
> >
> > -    <h3>Egress Table 0: Pre-LB</h3>
> > +    <h3>Egress Table 0: <code>to-lport</code> Pre-ACLs</h3>
> >
> >      <p>
> > -      This table is similar to ingress table <code>Pre-LB</code>.  It
> > -      contains a priority-0 flow that simply moves traffic to the next
> table.
> > -      Moreover it contains two priority-110 flows to move multicast, IPv6
> > -      Neighbor Discovery and MLD traffic to the next table. If any load
> > -      balancing rules exist for the datapath, a priority-100 flow is
> added with
> > -      a match of <code>ip</code> and action of <code>reg0[2] = 1;
> next;</code>
> > -      to act as a hint for table <code>Pre-stateful</code> to send IP
> packets
> > -      to the connection tracker for packet de-fragmentation and possibly
> DNAT
> > -      the destination VIP to one of the selected backend for already
> committed
> > -      load balanced traffic.
> > +      This is similar to ingress table <code>Pre-ACLs</code> except for
> > +     <code>to-lport</code> traffic.
> >      </p>
> >
> >      <p>
> > @@ -1884,11 +1880,28 @@ output;
> >        db="OVN_Northbound"/> table.
> >      </p>
> >
> > -    <h3>Egress Table 1: <code>to-lport</code> Pre-ACLs</h3>
> > +    <p>
> > +      This table also has a priority-110 flow with the match
> > +      <code>outport == <var>I</var></code> for all logical switch
> > +      datapaths to move traffic to the next table. Where <var>I</var>
> > +      is the peer of a logical router port. This flow is added to
> > +      skip the connection tracking of packets which will be entering
> > +      logical router datapath from logical switch datapath for routing.
> > +    </p>
> > +
> > +    <h3>Egress Table 1: Pre-LB</h3>
> >
> >      <p>
> > -      This is similar to ingress table <code>Pre-ACLs</code> except for
> > -     <code>to-lport</code> traffic.
> > +      This table is similar to ingress table <code>Pre-LB</code>.  It
> > +      contains a priority-0 flow that simply moves traffic to the next
> table.
> > +      Moreover it contains two priority-110 flows to move multicast, IPv6
> > +      Neighbor Discovery and MLD traffic to the next table. If any load
> > +      balancing rules exist for the datapath, a priority-100 flow is
> added with
> > +      a match of <code>ip</code> and action of <code>reg0[2] = 1;
> next;</code>
> > +      to act as a hint for table <code>Pre-stateful</code> to send IP
> packets
> > +      to the connection tracker for packet de-fragmentation and possibly
> DNAT
> > +      the destination VIP to one of the selected backend for already
> committed
> > +      load balanced traffic.
> >      </p>
> >
> >      <p>
> > @@ -1901,15 +1914,6 @@ output;
> >        db="OVN_Northbound"/> table.
> >      </p>
> >
> > -    <p>
> > -      This table also has a priority-110 flow with the match
> > -      <code>outport == <var>I</var></code> for all logical switch
> > -      datapaths to move traffic to the next table. Where <var>I</var>
> > -      is the peer of a logical router port. This flow is added to
> > -      skip the connection tracking of packets which will be entering
> > -      logical router datapath from logical switch datapath for routing.
> > -    </p>
> > -
> >      <h3>Egress Table 2: Pre-stateful</h3>
> >
> >      <p>
> > diff --git a/ovn-nb.xml b/ovn-nb.xml
> > index f41e9d7c0..140dd9a4f 100644
> > --- a/ovn-nb.xml
> > +++ b/ovn-nb.xml
> > @@ -2063,6 +2063,9 @@
> >            outgoing TCP traffic directed to an IP address, then you
> probably
> >            also want to define another rule to allow incoming TCP traffic
> coming
> >            from this same IP address.
> > +          In addition, traffic that matches stateless ACLs will bypass
> > +          load-balancer DNAT/un-DNAT processing. Stateful ACLs should be
> > +          used instead if the traffic is supposed to be load-balanced.
> >          </li>
> >
> >          <li>
> > diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> > index 4f399eccb..85d5acfed 100644
> > --- a/tests/ovn-northd.at
> > +++ b/tests/ovn-northd.at
> > @@ -2056,27 +2056,27 @@ check ovn-nbctl ls-lb-add sw0 lb1
> >  check ovn-nbctl add load_balancer_group $lbg load_balancer $lb3
> >  check ovn-nbctl --wait=sb sync
> >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> grep reg0 | sort], [0], [dnl
> > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> >  ])
> >
> >  check ovn-nbctl ls-lb-add sw0 lb2
> >  check ovn-nbctl add load_balancer_group $lbg load_balancer $lb4
> >  check ovn-nbctl --wait=sb sync
> >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> grep reg0 | sort], [0], [dnl
> > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> >  ])
> >
> >  check ovn-nbctl clear load_balancer $lb1 vips
> >  check ovn-nbctl clear load_balancer $lb3 vips
> >  check ovn-nbctl --wait=sb sync
> >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> grep reg0 | sort], [0], [dnl
> > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> >  ])
> >
> >  check ovn-nbctl clear load_balancer $lb2 vips
> >  check ovn-nbctl --wait=sb sync
> >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> grep reg0 | sort], [0], [dnl
> > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> >  ])
> >
> >  check ovn-nbctl clear load_balancer $lb4 vips
> > @@ -2091,7 +2091,7 @@ check ovn-nbctl set load_balancer $lb4
> vips:"10.0.0.13"="10.0.0.6"
> >
> >  check ovn-nbctl --wait=sb sync
> >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> grep reg0 | sort], [0], [dnl
> > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> >  ])
> >
> >  # Now reverse the order of clearing the vip.
> > @@ -2099,13 +2099,13 @@ check ovn-nbctl clear load_balancer $lb2 vips
> >  check ovn-nbctl clear load_balancer $lb4 vips
> >  check ovn-nbctl --wait=sb sync
> >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> grep reg0 | sort], [0], [dnl
> > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> >  ])
> >
> >  check ovn-nbctl clear load_balancer $lb1 vips
> >  check ovn-nbctl --wait=sb sync
> >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> grep reg0 | sort], [0], [dnl
> > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> action=(reg0[[2]] = 1; next;)
> >  ])
> >
> >  check ovn-nbctl clear load_balancer $lb3 vips
> > @@ -3044,18 +3044,10 @@ for direction in from to; do
> >  done
> >  ovn-nbctl --wait=sb sync
> >
> > -# TCP packets should go to conntrack for load balancing.
> > +# TCP packets should not go to conntrack for load balancing.
> >  flow="inport == \"lsp1\" && ${flow_eth} && ${flow_ip} && ${flow_tcp}"
> >  AT_CHECK_UNQUOTED([ovn_trace --ct new --ct new --minimal ls "${flow}"],
> [0], [dnl
> > -ct_lb_mark {
> > -    ct_lb_mark {
> > -        reg0[[6]] = 0;
> > -        reg0[[12]] = 0;
> > -        ct_lb_mark /* default (use --ct to customize) */ {
> > -            output("lsp2");
> > -        };
> > -    };
> > -};
> > +output("lsp2");
> >  ])
> >
> >  # UDP packets still go to conntrack.
> > @@ -3188,18 +3180,10 @@ for direction in from to; do
> >  done
> >  ovn-nbctl --wait=sb sync
> >
> > -# TCP packets should go to conntrack for load balancing.
> > +# TCP packets should not go to conntrack for load balancing.
> >  flow="inport == \"lsp1\" && ${flow_eth} && ${flow_ip} && ${flow_tcp}"
> >  AT_CHECK_UNQUOTED([ovn_trace --ct new --ct new --minimal ls "${flow}"],
> [0], [dnl
> > -ct_lb_mark {
> > -    ct_lb_mark {
> > -        reg0[[6]] = 0;
> > -        reg0[[12]] = 0;
> > -        ct_lb_mark /* default (use --ct to customize) */ {
> > -            output("lsp2");
> > -        };
> > -    };
> > -};
> > +output("lsp2");
> >  ])
> >
> >  # UDP packets still go to conntrack.
> > @@ -4015,8 +3999,8 @@ check_stateful_flows() {
> >    table=? (ls_in_pre_stateful ), priority=0    , match=(1),
> action=(next;)
> >    table=? (ls_in_pre_stateful ), priority=100  , match=(reg0[[0]] == 1),
> action=(ct_next;)
> >    table=? (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> action=(ct_lb_mark;)
> > -  table=? (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80;
> ct_lb_mark;)
> > -  table=? (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> 10.0.0.20 && tcp.dst == 80), action=(reg1 = 10.0.0.20; reg2[[0..15]] = 80;
> ct_lb_mark;)
> > +  table=? (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> && ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
> reg2[[0..15]] = 80; ct_lb_mark;)
> > +  table=? (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> && ip4.dst == 10.0.0.20 && tcp.dst == 80), action=(reg1 = 10.0.0.20;
> reg2[[0..15]] = 80; ct_lb_mark;)
> >  ])
> >
> >      AT_CHECK([grep "ls_in_lb" sw0flows | sort | sed
> 's/table=../table=??/'], [0], [dnl
> > @@ -7650,7 +7634,7 @@ check ovn-nbctl --wait=sb sync
> >  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
> >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
> reg0 == 66.66.66.66 && ct_mark.natted == 1), action=(next;)
> >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
> reg0 == 66.66.66.66), action=(ct_lb_mark(backends=42.42.42.2);)
> > -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> > +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> >    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> action=(ct_lb_mark;)
> >    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
> ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0;
> ct_lb_mark(backends=42.42.42.2);)
> >    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
> action=(ct_lb_mark;)
> > @@ -7662,7 +7646,7 @@ check ovn-nbctl --wait=sb sync
> >  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
> >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
> reg0 == 66.66.66.66 && ct_label.natted == 1), action=(next;)
> >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
> reg0 == 66.66.66.66), action=(ct_lb(backends=42.42.42.2);)
> > -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb;)
> > +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb;)
> >    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> action=(ct_lb;)
> >    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
> ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0; ct_lb(backends=42.42.42.2);)
> >    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
> action=(ct_lb;)
> > @@ -7674,7 +7658,7 @@ check ovn-nbctl --wait=sb sync
> >  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
> >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
> reg0 == 66.66.66.66 && ct_mark.natted == 1), action=(next;)
> >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
> reg0 == 66.66.66.66), action=(ct_lb_mark(backends=42.42.42.2);)
> > -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> > +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> >    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> action=(ct_lb_mark;)
> >    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
> ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0;
> ct_lb_mark(backends=42.42.42.2);)
> >    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
> action=(ct_lb_mark;)
> > diff --git a/tests/ovn.at b/tests/ovn.at
> > index f8b8db4df..f43455f60 100644
> > --- a/tests/ovn.at
> > +++ b/tests/ovn.at
> > @@ -23656,7 +23656,7 @@ OVS_WAIT_FOR_OUTPUT(
> >    [ovn-sbctl dump-flows > sbflows
> >     ovn-sbctl dump-flows sw0 | grep ct_lb_mark | grep priority=120 | sed
> 's/table=..//'], 0,
> >    [dnl
> > -  (ls_in_pre_stateful ), priority=120  , match=(ip4.dst == 10.0.0.10 &&
> tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80; ct_lb_mark;)
> > +  (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1 &&
> ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
> reg2[[0..15]] = 80; ct_lb_mark;)
> >    (ls_in_lb           ), priority=120  , match=(ct.new && ip4.dst ==
> 10.0.0.10 && tcp.dst == 80), action=(reg0[[1]] = 0; ct_lb_mark(backends=
> 10.0.0.3:80,20.0.0.3:80; hash_fields="ip_dst,ip_src,tcp_dst,tcp_src");)
> >  ])
> >
> > @@ -23699,7 +23699,7 @@ ovn-sbctl dump-flows sw0 > sbflows3
> >  AT_CHECK(
> >    [grep "ip4.dst == 10.0.0.10 && tcp.dst == 80" sbflows3 | grep
> priority=120 |\
> >     sed 's/table=../table=??/'], [0], [dnl
> > -  table=??(ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80;
> ct_lb_mark;)
> > +  table=??(ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> && ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
> reg2[[0..15]] = 80; ct_lb_mark;)
> >    table=??(ls_in_lb           ), priority=120  , match=(ct.new &&
> ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(drop;)
> >  ])
> >
> > --
> > 2.17.1
> >
>
> Thanks Venu. I have reviewed this with Venu internally and gave my Ack, but
> it would be good if folks can take another look before merging.
> Numan, since you were involved in an earlier discussion, probably you want
> to take a look?

Sure.  I'll take a look and thanks for the comments below.

Numan

> I think there are two things to pay attention to:
> 1. Switching the order of PRE_ACL and PRE_LB in the LS egress pipeline. I
> don't see any problem here but not sure if there was any specific reason
> behind the original order.
> 2. If there are VIP traffic matching stateless ACL, they won't work any
> more. We think that is ok, because we can't think of a use case that would
> add stateless ACLs for VIP traffic because this combination doesn't make
> any sense. If such traffic was allowed with CT, it could actually hide a
> mistake made by the user - e.g. the intention was to apply stateless ACLs
> to bypass CT for certain traffic that was not related to VIP, but by
> mistake the match condition added to the ACL touched some VIP traffic. So,
> we think the right thing to do is to let it fail when such
> *misconfiguration* happens.
>
> Let me know if you have any concerns about the above points.
>
> Thanks,
> Han
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Numan Siddique Nov. 14, 2022, 5:17 p.m. UTC | #3
On Wed, Nov 9, 2022 at 4:15 PM Numan Siddique <numans@ovn.org> wrote:
>
> On Wed, Nov 9, 2022 at 4:11 PM Han Zhou <hzhou@ovn.org> wrote:
> >
> > On Tue, Nov 8, 2022 at 7:51 AM venu.iyer <venugopali@nvidia.com> wrote:
> > >
> > > Currently, even stateless flows are subject to connection tracking when
> > there are
> > > LB rules (for DNAT). However, if a flow needs to be subjected to LB, then
> > it shouldn't
> > > be configured as stateless.
> > >
> > > A stateless flow means we should not track it, and this change exempts
> > stateless
> > > flows from being tracked regardless of whether LB rules are present or
> > not.
> > >
> > > Signed-off-by: venu.iyer <venugopali@nvidia.com>
> > > Acked-by: Han Zhou <hzhou@ovn.org>

I had a quick look and it looks ok to me.
Can you please add system tests to cover some scenarios for this use
case to avoid future regressions ?

Thanks
Numan




> > > ---
> > >  northd/northd.c         | 24 +++++++++++++-----
> > >  northd/ovn-northd.8.xml | 56 ++++++++++++++++++++++-------------------
> > >  ovn-nb.xml              |  3 +++
> > >  tests/ovn-northd.at     | 48 ++++++++++++-----------------------
> > >  tests/ovn.at            |  4 +--
> > >  5 files changed, 69 insertions(+), 66 deletions(-)
> > >
> > > diff --git a/northd/northd.c b/northd/northd.c
> > > index b7388afc5..da4beede6 100644
> > > --- a/northd/northd.c
> > > +++ b/northd/northd.c
> > > @@ -137,8 +137,8 @@ enum ovn_stage {
> > >      PIPELINE_STAGE(SWITCH, IN,  L2_UNKNOWN,    24, "ls_in_l2_unknown")
> >  \
> > >
> >  \
> > >      /* Logical switch egress stages. */
> >   \
> > > -    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       0, "ls_out_pre_lb")
> >   \
> > > -    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      1, "ls_out_pre_acl")
> >  \
> > > +    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      0, "ls_out_pre_acl")
> >  \
> > > +    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       1, "ls_out_pre_lb")
> >   \
> > >      PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful")
> >   \
> > >      PIPELINE_STAGE(SWITCH, OUT, ACL_HINT,     3, "ls_out_acl_hint")
> >   \
> > >      PIPELINE_STAGE(SWITCH, OUT, ACL,          4, "ls_out_acl")
> >  \
> > > @@ -210,6 +210,7 @@ enum ovn_stage {
> > >  #define REGBIT_ACL_LABEL          "reg0[13]"
> > >  #define REGBIT_FROM_RAMP          "reg0[14]"
> > >  #define REGBIT_PORT_SEC_DROP      "reg0[15]"
> > > +#define REGBIT_ACL_STATELESS      "reg0[16]"
> > >
> > >  #define REG_ORIG_DIP_IPV4         "reg1"
> > >  #define REG_ORIG_DIP_IPV6         "xxreg1"
> > > @@ -271,7 +272,7 @@ enum ovn_stage {
> > >   * | R0 |     REGBIT_{CONNTRACK/DHCP/DNS}              |   |
> >      |
> > >   * |    |     REGBIT_{HAIRPIN/HAIRPIN_REPLY}           |   |
> >      |
> > >   * |    | REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} |   |
> >      |
> > > - * |    |     REGBIT_ACL_LABEL                         | X |
> >      |
> > > + * |    |     REGBIT_ACL_{LABEL/STATELESS}             | X |
> >      |
> > >   * +----+----------------------------------------------+ X |
> >      |
> > >   * | R1 |         ORIG_DIP_IPV4 (>= IN_PRE_STATEFUL)   | R |
> >      |
> > >   * +----+----------------------------------------------+ E |
> >      |
> > > @@ -5677,17 +5678,18 @@ build_stateless_filter(struct ovn_datapath *od,
> > >                         const struct nbrec_acl *acl,
> > >                         struct hmap *lflows)
> > >  {
> > > +    const char *action = REGBIT_ACL_STATELESS" = 1; next;";
> > >      if (!strcmp(acl->direction, "from-lport")) {
> > >          ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_PRE_ACL,
> > >                                  acl->priority + OVN_ACL_PRI_OFFSET,
> > >                                  acl->match,
> > > -                                "next;",
> > > +                                action,
> > >                                  &acl->header_);
> > >      } else {
> > >          ovn_lflow_add_with_hint(lflows, od, S_SWITCH_OUT_PRE_ACL,
> > >                                  acl->priority + OVN_ACL_PRI_OFFSET,
> > >                                  acl->match,
> > > -                                "next;",
> > > +                                action,
> > >                                  &acl->header_);
> > >      }
> > >  }
> > > @@ -5779,6 +5781,10 @@ build_pre_acls(struct ovn_datapath *od, const
> > struct hmap *port_groups,
> > >                        REGBIT_CONNTRACK_DEFRAG" = 1; next;");
> > >          ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 100, "ip",
> > >                        REGBIT_CONNTRACK_DEFRAG" = 1; next;");
> > > +    } else if (od->has_lb_vip) {
> > > +        /* We'll build stateless filters if there are LB rules so that
> > > +         * the stateless flows are not tracked in pre-lb. */
> > > +        build_stateless_filters(od, port_groups, lflows);
> > >      }
> > >  }
> > >
> > > @@ -5913,6 +5919,11 @@ build_pre_lb(struct ovn_datapath *od, const struct
> > shash *meter_groups,
> > >                                   S_SWITCH_IN_PRE_LB, S_SWITCH_OUT_PRE_LB,
> > >                                   110, lflows);
> > >      }
> > > +    /* Do not sent statless flows via conntrack */
> > > +    ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
> > > +                  REGBIT_ACL_STATELESS" == 1", "next;");
> > > +    ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 110,
> > > +                  REGBIT_ACL_STATELESS" == 1", "next;");
> > >
> > >      /* 'REGBIT_CONNTRACK_NAT' is set to let the pre-stateful table send
> > >       * packet to conntrack for defragmentation and possibly for
> > unNATting.
> > > @@ -6918,7 +6929,8 @@ build_lb_rules_pre_stateful(struct hmap *lflows,
> > struct ovn_northd_lb *lb,
> > >          }
> > >          ds_put_format(action, "%s;", ct_lb_mark ? "ct_lb_mark" :
> > "ct_lb");
> > >
> > > -        ds_put_format(match, "%s.dst == %s", ip_match, lb_vip->vip_str);
> > > +        ds_put_format(match, REGBIT_CONNTRACK_NAT" == 1 && %s.dst == %s",
> > > +                      ip_match, lb_vip->vip_str);
> > >          if (lb_vip->vip_port) {
> > >              ds_put_format(match, " && %s.dst == %d", proto,
> > lb_vip->vip_port);
> > >          }
> > > diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> > > index a70f2e678..162ec2b3b 100644
> > > --- a/northd/ovn-northd.8.xml
> > > +++ b/northd/ovn-northd.8.xml
> > > @@ -440,7 +440,9 @@
> > >        priority-110 flow is added to skip over stateful ACLs. Multicast,
> > IPv6
> > >        Neighbor Discovery and MLD traffic also skips stateful ACLs. For
> > >        "allow-stateless" ACLs, a flow is added to bypass setting the hint
> > for
> > > -      connection tracker processing.
> > > +      connection tracker processing when there are statelful ACLs or LB
> > rules;
> > > +      <code>REGBIT_ACL_STATELESS</code> is set for traffic matching such
> > > +      flows for this purpose.
> > >      </p>
> > >
> > >      <p>
> > > @@ -460,8 +462,10 @@
> > >        in ingress table <code>LB</code> and <code>Stateful</code>.  It
> > contains
> > >        a priority-0 flow that simply moves traffic to the next table.
> > Moreover
> > >        it contains two priority-110 flows to move multicast, IPv6 Neighbor
> > > -      Discovery and MLD traffic to the next table. If load balancing
> > rules with
> > > -      virtual IP addresses (and ports) are configured in
> > > +      Discovery and MLD traffic to the next table. It also contains two
> > > +      priority-110 flows to move stateless traffic, i.e traffic for which
> > > +      <code>REGBIT_ACL_STATELESS</code> is set, to the next table. If
> > load
> > > +      balancing rules with virtual IP addresses (and ports) are
> > configured in
> > >        <code>OVN_Northbound</code> database for a logical switch
> > datapath, a
> > >        priority-100 flow is added with the match <code>ip</code> to match
> > on IP
> > >        packets and sets the action <code>reg0[2] = 1; next;</code> to act
> > as a
> > > @@ -1859,19 +1863,11 @@ output;
> > >        </li>
> > >      </ul>
> > >
> > > -    <h3>Egress Table 0: Pre-LB</h3>
> > > +    <h3>Egress Table 0: <code>to-lport</code> Pre-ACLs</h3>
> > >
> > >      <p>
> > > -      This table is similar to ingress table <code>Pre-LB</code>.  It
> > > -      contains a priority-0 flow that simply moves traffic to the next
> > table.
> > > -      Moreover it contains two priority-110 flows to move multicast, IPv6
> > > -      Neighbor Discovery and MLD traffic to the next table. If any load
> > > -      balancing rules exist for the datapath, a priority-100 flow is
> > added with
> > > -      a match of <code>ip</code> and action of <code>reg0[2] = 1;
> > next;</code>
> > > -      to act as a hint for table <code>Pre-stateful</code> to send IP
> > packets
> > > -      to the connection tracker for packet de-fragmentation and possibly
> > DNAT
> > > -      the destination VIP to one of the selected backend for already
> > committed
> > > -      load balanced traffic.
> > > +      This is similar to ingress table <code>Pre-ACLs</code> except for
> > > +     <code>to-lport</code> traffic.
> > >      </p>
> > >
> > >      <p>
> > > @@ -1884,11 +1880,28 @@ output;
> > >        db="OVN_Northbound"/> table.
> > >      </p>
> > >
> > > -    <h3>Egress Table 1: <code>to-lport</code> Pre-ACLs</h3>
> > > +    <p>
> > > +      This table also has a priority-110 flow with the match
> > > +      <code>outport == <var>I</var></code> for all logical switch
> > > +      datapaths to move traffic to the next table. Where <var>I</var>
> > > +      is the peer of a logical router port. This flow is added to
> > > +      skip the connection tracking of packets which will be entering
> > > +      logical router datapath from logical switch datapath for routing.
> > > +    </p>
> > > +
> > > +    <h3>Egress Table 1: Pre-LB</h3>
> > >
> > >      <p>
> > > -      This is similar to ingress table <code>Pre-ACLs</code> except for
> > > -     <code>to-lport</code> traffic.
> > > +      This table is similar to ingress table <code>Pre-LB</code>.  It
> > > +      contains a priority-0 flow that simply moves traffic to the next
> > table.
> > > +      Moreover it contains two priority-110 flows to move multicast, IPv6
> > > +      Neighbor Discovery and MLD traffic to the next table. If any load
> > > +      balancing rules exist for the datapath, a priority-100 flow is
> > added with
> > > +      a match of <code>ip</code> and action of <code>reg0[2] = 1;
> > next;</code>
> > > +      to act as a hint for table <code>Pre-stateful</code> to send IP
> > packets
> > > +      to the connection tracker for packet de-fragmentation and possibly
> > DNAT
> > > +      the destination VIP to one of the selected backend for already
> > committed
> > > +      load balanced traffic.
> > >      </p>
> > >
> > >      <p>
> > > @@ -1901,15 +1914,6 @@ output;
> > >        db="OVN_Northbound"/> table.
> > >      </p>
> > >
> > > -    <p>
> > > -      This table also has a priority-110 flow with the match
> > > -      <code>outport == <var>I</var></code> for all logical switch
> > > -      datapaths to move traffic to the next table. Where <var>I</var>
> > > -      is the peer of a logical router port. This flow is added to
> > > -      skip the connection tracking of packets which will be entering
> > > -      logical router datapath from logical switch datapath for routing.
> > > -    </p>
> > > -
> > >      <h3>Egress Table 2: Pre-stateful</h3>
> > >
> > >      <p>
> > > diff --git a/ovn-nb.xml b/ovn-nb.xml
> > > index f41e9d7c0..140dd9a4f 100644
> > > --- a/ovn-nb.xml
> > > +++ b/ovn-nb.xml
> > > @@ -2063,6 +2063,9 @@
> > >            outgoing TCP traffic directed to an IP address, then you
> > probably
> > >            also want to define another rule to allow incoming TCP traffic
> > coming
> > >            from this same IP address.
> > > +          In addition, traffic that matches stateless ACLs will bypass
> > > +          load-balancer DNAT/un-DNAT processing. Stateful ACLs should be
> > > +          used instead if the traffic is supposed to be load-balanced.
> > >          </li>
> > >
> > >          <li>
> > > diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> > > index 4f399eccb..85d5acfed 100644
> > > --- a/tests/ovn-northd.at
> > > +++ b/tests/ovn-northd.at
> > > @@ -2056,27 +2056,27 @@ check ovn-nbctl ls-lb-add sw0 lb1
> > >  check ovn-nbctl add load_balancer_group $lbg load_balancer $lb3
> > >  check ovn-nbctl --wait=sb sync
> > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > grep reg0 | sort], [0], [dnl
> > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > >  ])
> > >
> > >  check ovn-nbctl ls-lb-add sw0 lb2
> > >  check ovn-nbctl add load_balancer_group $lbg load_balancer $lb4
> > >  check ovn-nbctl --wait=sb sync
> > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > grep reg0 | sort], [0], [dnl
> > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > >  ])
> > >
> > >  check ovn-nbctl clear load_balancer $lb1 vips
> > >  check ovn-nbctl clear load_balancer $lb3 vips
> > >  check ovn-nbctl --wait=sb sync
> > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > grep reg0 | sort], [0], [dnl
> > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > >  ])
> > >
> > >  check ovn-nbctl clear load_balancer $lb2 vips
> > >  check ovn-nbctl --wait=sb sync
> > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > grep reg0 | sort], [0], [dnl
> > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > >  ])
> > >
> > >  check ovn-nbctl clear load_balancer $lb4 vips
> > > @@ -2091,7 +2091,7 @@ check ovn-nbctl set load_balancer $lb4
> > vips:"10.0.0.13"="10.0.0.6"
> > >
> > >  check ovn-nbctl --wait=sb sync
> > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > grep reg0 | sort], [0], [dnl
> > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > >  ])
> > >
> > >  # Now reverse the order of clearing the vip.
> > > @@ -2099,13 +2099,13 @@ check ovn-nbctl clear load_balancer $lb2 vips
> > >  check ovn-nbctl clear load_balancer $lb4 vips
> > >  check ovn-nbctl --wait=sb sync
> > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > grep reg0 | sort], [0], [dnl
> > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > >  ])
> > >
> > >  check ovn-nbctl clear load_balancer $lb1 vips
> > >  check ovn-nbctl --wait=sb sync
> > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > grep reg0 | sort], [0], [dnl
> > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > action=(reg0[[2]] = 1; next;)
> > >  ])
> > >
> > >  check ovn-nbctl clear load_balancer $lb3 vips
> > > @@ -3044,18 +3044,10 @@ for direction in from to; do
> > >  done
> > >  ovn-nbctl --wait=sb sync
> > >
> > > -# TCP packets should go to conntrack for load balancing.
> > > +# TCP packets should not go to conntrack for load balancing.
> > >  flow="inport == \"lsp1\" && ${flow_eth} && ${flow_ip} && ${flow_tcp}"
> > >  AT_CHECK_UNQUOTED([ovn_trace --ct new --ct new --minimal ls "${flow}"],
> > [0], [dnl
> > > -ct_lb_mark {
> > > -    ct_lb_mark {
> > > -        reg0[[6]] = 0;
> > > -        reg0[[12]] = 0;
> > > -        ct_lb_mark /* default (use --ct to customize) */ {
> > > -            output("lsp2");
> > > -        };
> > > -    };
> > > -};
> > > +output("lsp2");
> > >  ])
> > >
> > >  # UDP packets still go to conntrack.
> > > @@ -3188,18 +3180,10 @@ for direction in from to; do
> > >  done
> > >  ovn-nbctl --wait=sb sync
> > >
> > > -# TCP packets should go to conntrack for load balancing.
> > > +# TCP packets should not go to conntrack for load balancing.
> > >  flow="inport == \"lsp1\" && ${flow_eth} && ${flow_ip} && ${flow_tcp}"
> > >  AT_CHECK_UNQUOTED([ovn_trace --ct new --ct new --minimal ls "${flow}"],
> > [0], [dnl
> > > -ct_lb_mark {
> > > -    ct_lb_mark {
> > > -        reg0[[6]] = 0;
> > > -        reg0[[12]] = 0;
> > > -        ct_lb_mark /* default (use --ct to customize) */ {
> > > -            output("lsp2");
> > > -        };
> > > -    };
> > > -};
> > > +output("lsp2");
> > >  ])
> > >
> > >  # UDP packets still go to conntrack.
> > > @@ -4015,8 +3999,8 @@ check_stateful_flows() {
> > >    table=? (ls_in_pre_stateful ), priority=0    , match=(1),
> > action=(next;)
> > >    table=? (ls_in_pre_stateful ), priority=100  , match=(reg0[[0]] == 1),
> > action=(ct_next;)
> > >    table=? (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> > action=(ct_lb_mark;)
> > > -  table=? (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80;
> > ct_lb_mark;)
> > > -  table=? (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > 10.0.0.20 && tcp.dst == 80), action=(reg1 = 10.0.0.20; reg2[[0..15]] = 80;
> > ct_lb_mark;)
> > > +  table=? (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > && ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
> > reg2[[0..15]] = 80; ct_lb_mark;)
> > > +  table=? (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > && ip4.dst == 10.0.0.20 && tcp.dst == 80), action=(reg1 = 10.0.0.20;
> > reg2[[0..15]] = 80; ct_lb_mark;)
> > >  ])
> > >
> > >      AT_CHECK([grep "ls_in_lb" sw0flows | sort | sed
> > 's/table=../table=??/'], [0], [dnl
> > > @@ -7650,7 +7634,7 @@ check ovn-nbctl --wait=sb sync
> > >  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
> > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
> > reg0 == 66.66.66.66 && ct_mark.natted == 1), action=(next;)
> > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
> > reg0 == 66.66.66.66), action=(ct_lb_mark(backends=42.42.42.2);)
> > > -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> > > +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> > >    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> > action=(ct_lb_mark;)
> > >    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
> > ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0;
> > ct_lb_mark(backends=42.42.42.2);)
> > >    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
> > action=(ct_lb_mark;)
> > > @@ -7662,7 +7646,7 @@ check ovn-nbctl --wait=sb sync
> > >  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
> > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
> > reg0 == 66.66.66.66 && ct_label.natted == 1), action=(next;)
> > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
> > reg0 == 66.66.66.66), action=(ct_lb(backends=42.42.42.2);)
> > > -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb;)
> > > +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb;)
> > >    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> > action=(ct_lb;)
> > >    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
> > ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0; ct_lb(backends=42.42.42.2);)
> > >    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
> > action=(ct_lb;)
> > > @@ -7674,7 +7658,7 @@ check ovn-nbctl --wait=sb sync
> > >  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
> > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
> > reg0 == 66.66.66.66 && ct_mark.natted == 1), action=(next;)
> > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
> > reg0 == 66.66.66.66), action=(ct_lb_mark(backends=42.42.42.2);)
> > > -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> > > +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> > >    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> > action=(ct_lb_mark;)
> > >    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
> > ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0;
> > ct_lb_mark(backends=42.42.42.2);)
> > >    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
> > action=(ct_lb_mark;)
> > > diff --git a/tests/ovn.at b/tests/ovn.at
> > > index f8b8db4df..f43455f60 100644
> > > --- a/tests/ovn.at
> > > +++ b/tests/ovn.at
> > > @@ -23656,7 +23656,7 @@ OVS_WAIT_FOR_OUTPUT(
> > >    [ovn-sbctl dump-flows > sbflows
> > >     ovn-sbctl dump-flows sw0 | grep ct_lb_mark | grep priority=120 | sed
> > 's/table=..//'], 0,
> > >    [dnl
> > > -  (ls_in_pre_stateful ), priority=120  , match=(ip4.dst == 10.0.0.10 &&
> > tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80; ct_lb_mark;)
> > > +  (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1 &&
> > ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
> > reg2[[0..15]] = 80; ct_lb_mark;)
> > >    (ls_in_lb           ), priority=120  , match=(ct.new && ip4.dst ==
> > 10.0.0.10 && tcp.dst == 80), action=(reg0[[1]] = 0; ct_lb_mark(backends=
> > 10.0.0.3:80,20.0.0.3:80; hash_fields="ip_dst,ip_src,tcp_dst,tcp_src");)
> > >  ])
> > >
> > > @@ -23699,7 +23699,7 @@ ovn-sbctl dump-flows sw0 > sbflows3
> > >  AT_CHECK(
> > >    [grep "ip4.dst == 10.0.0.10 && tcp.dst == 80" sbflows3 | grep
> > priority=120 |\
> > >     sed 's/table=../table=??/'], [0], [dnl
> > > -  table=??(ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80;
> > ct_lb_mark;)
> > > +  table=??(ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > && ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
> > reg2[[0..15]] = 80; ct_lb_mark;)
> > >    table=??(ls_in_lb           ), priority=120  , match=(ct.new &&
> > ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(drop;)
> > >  ])
> > >
> > > --
> > > 2.17.1
> > >
> >
> > Thanks Venu. I have reviewed this with Venu internally and gave my Ack, but
> > it would be good if folks can take another look before merging.
> > Numan, since you were involved in an earlier discussion, probably you want
> > to take a look?
>
> Sure.  I'll take a look and thanks for the comments below.
>
> Numan
>
> > I think there are two things to pay attention to:
> > 1. Switching the order of PRE_ACL and PRE_LB in the LS egress pipeline. I
> > don't see any problem here but not sure if there was any specific reason
> > behind the original order.
> > 2. If there are VIP traffic matching stateless ACL, they won't work any
> > more. We think that is ok, because we can't think of a use case that would
> > add stateless ACLs for VIP traffic because this combination doesn't make
> > any sense. If such traffic was allowed with CT, it could actually hide a
> > mistake made by the user - e.g. the intention was to apply stateless ACLs
> > to bypass CT for certain traffic that was not related to VIP, but by
> > mistake the match condition added to the ACL touched some VIP traffic. So,
> > we think the right thing to do is to let it fail when such
> > *misconfiguration* happens.
> >
> > Let me know if you have any concerns about the above points.
> >
> > Thanks,
> > Han
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >
Venugopal Iyer Nov. 16, 2022, 8:05 p.m. UTC | #4
Thanks for looking at this, Numan. I haven't worked with the system tests before, and am having some
issues getting it to run. So, let me get back to you on this.

regards,

-venu
Numan Siddique Nov. 16, 2022, 9:42 p.m. UTC | #5
On Wed, Nov 16, 2022 at 3:05 PM Venugopal Iyer via dev
<ovs-dev@openvswitch.org> wrote:
>
> Thanks for looking at this, Numan. I haven't worked with the system tests before, and am having some
> issues getting it to run. So, let me get back to you on this.
>

Sure.  Please make sure that the test case checks that there is no
conntrack entry for the stateless traffic.
Existing system tests should give you a good reference.  Let me know
if you have any questions.

Numan

> regards,
>
> -venu
>
> ________________________________________
> From: Numan Siddique <numans@ovn.org>
> Sent: Monday, November 14, 2022 9:17 AM
> To: Han Zhou
> Cc: Venugopal Iyer; dev@openvswitch.org
> Subject: Re: [ovs-dev] [PATCH ovn] northd: bypass connection tracking for stateless flows when there are LB flows present
>
> External email: Use caution opening links or attachments
>
>
> On Wed, Nov 9, 2022 at 4:15 PM Numan Siddique <numans@ovn.org> wrote:
> >
> > On Wed, Nov 9, 2022 at 4:11 PM Han Zhou <hzhou@ovn.org> wrote:
> > >
> > > On Tue, Nov 8, 2022 at 7:51 AM venu.iyer <venugopali@nvidia.com> wrote:
> > > >
> > > > Currently, even stateless flows are subject to connection tracking when
> > > there are
> > > > LB rules (for DNAT). However, if a flow needs to be subjected to LB, then
> > > it shouldn't
> > > > be configured as stateless.
> > > >
> > > > A stateless flow means we should not track it, and this change exempts
> > > stateless
> > > > flows from being tracked regardless of whether LB rules are present or
> > > not.
> > > >
> > > > Signed-off-by: venu.iyer <venugopali@nvidia.com>
> > > > Acked-by: Han Zhou <hzhou@ovn.org>
>
> I had a quick look and it looks ok to me.
> Can you please add system tests to cover some scenarios for this use
> case to avoid future regressions ?
>
> Thanks
> Numan
>
>
>
>
> > > > ---
> > > >  northd/northd.c         | 24 +++++++++++++-----
> > > >  northd/ovn-northd.8.xml | 56 ++++++++++++++++++++++-------------------
> > > >  ovn-nb.xml              |  3 +++
> > > >  tests/ovn-northd.at     | 48 ++++++++++++-----------------------
> > > >  tests/ovn.at            |  4 +--
> > > >  5 files changed, 69 insertions(+), 66 deletions(-)
> > > >
> > > > diff --git a/northd/northd.c b/northd/northd.c
> > > > index b7388afc5..da4beede6 100644
> > > > --- a/northd/northd.c
> > > > +++ b/northd/northd.c
> > > > @@ -137,8 +137,8 @@ enum ovn_stage {
> > > >      PIPELINE_STAGE(SWITCH, IN,  L2_UNKNOWN,    24, "ls_in_l2_unknown")
> > >  \
> > > >
> > >  \
> > > >      /* Logical switch egress stages. */
> > >   \
> > > > -    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       0, "ls_out_pre_lb")
> > >   \
> > > > -    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      1, "ls_out_pre_acl")
> > >  \
> > > > +    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      0, "ls_out_pre_acl")
> > >  \
> > > > +    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       1, "ls_out_pre_lb")
> > >   \
> > > >      PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful")
> > >   \
> > > >      PIPELINE_STAGE(SWITCH, OUT, ACL_HINT,     3, "ls_out_acl_hint")
> > >   \
> > > >      PIPELINE_STAGE(SWITCH, OUT, ACL,          4, "ls_out_acl")
> > >  \
> > > > @@ -210,6 +210,7 @@ enum ovn_stage {
> > > >  #define REGBIT_ACL_LABEL          "reg0[13]"
> > > >  #define REGBIT_FROM_RAMP          "reg0[14]"
> > > >  #define REGBIT_PORT_SEC_DROP      "reg0[15]"
> > > > +#define REGBIT_ACL_STATELESS      "reg0[16]"
> > > >
> > > >  #define REG_ORIG_DIP_IPV4         "reg1"
> > > >  #define REG_ORIG_DIP_IPV6         "xxreg1"
> > > > @@ -271,7 +272,7 @@ enum ovn_stage {
> > > >   * | R0 |     REGBIT_{CONNTRACK/DHCP/DNS}              |   |
> > >      |
> > > >   * |    |     REGBIT_{HAIRPIN/HAIRPIN_REPLY}           |   |
> > >      |
> > > >   * |    | REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} |   |
> > >      |
> > > > - * |    |     REGBIT_ACL_LABEL                         | X |
> > >      |
> > > > + * |    |     REGBIT_ACL_{LABEL/STATELESS}             | X |
> > >      |
> > > >   * +----+----------------------------------------------+ X |
> > >      |
> > > >   * | R1 |         ORIG_DIP_IPV4 (>= IN_PRE_STATEFUL)   | R |
> > >      |
> > > >   * +----+----------------------------------------------+ E |
> > >      |
> > > > @@ -5677,17 +5678,18 @@ build_stateless_filter(struct ovn_datapath *od,
> > > >                         const struct nbrec_acl *acl,
> > > >                         struct hmap *lflows)
> > > >  {
> > > > +    const char *action = REGBIT_ACL_STATELESS" = 1; next;";
> > > >      if (!strcmp(acl->direction, "from-lport")) {
> > > >          ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_PRE_ACL,
> > > >                                  acl->priority + OVN_ACL_PRI_OFFSET,
> > > >                                  acl->match,
> > > > -                                "next;",
> > > > +                                action,
> > > >                                  &acl->header_);
> > > >      } else {
> > > >          ovn_lflow_add_with_hint(lflows, od, S_SWITCH_OUT_PRE_ACL,
> > > >                                  acl->priority + OVN_ACL_PRI_OFFSET,
> > > >                                  acl->match,
> > > > -                                "next;",
> > > > +                                action,
> > > >                                  &acl->header_);
> > > >      }
> > > >  }
> > > > @@ -5779,6 +5781,10 @@ build_pre_acls(struct ovn_datapath *od, const
> > > struct hmap *port_groups,
> > > >                        REGBIT_CONNTRACK_DEFRAG" = 1; next;");
> > > >          ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 100, "ip",
> > > >                        REGBIT_CONNTRACK_DEFRAG" = 1; next;");
> > > > +    } else if (od->has_lb_vip) {
> > > > +        /* We'll build stateless filters if there are LB rules so that
> > > > +         * the stateless flows are not tracked in pre-lb. */
> > > > +        build_stateless_filters(od, port_groups, lflows);
> > > >      }
> > > >  }
> > > >
> > > > @@ -5913,6 +5919,11 @@ build_pre_lb(struct ovn_datapath *od, const struct
> > > shash *meter_groups,
> > > >                                   S_SWITCH_IN_PRE_LB, S_SWITCH_OUT_PRE_LB,
> > > >                                   110, lflows);
> > > >      }
> > > > +    /* Do not sent statless flows via conntrack */
> > > > +    ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
> > > > +                  REGBIT_ACL_STATELESS" == 1", "next;");
> > > > +    ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 110,
> > > > +                  REGBIT_ACL_STATELESS" == 1", "next;");
> > > >
> > > >      /* 'REGBIT_CONNTRACK_NAT' is set to let the pre-stateful table send
> > > >       * packet to conntrack for defragmentation and possibly for
> > > unNATting.
> > > > @@ -6918,7 +6929,8 @@ build_lb_rules_pre_stateful(struct hmap *lflows,
> > > struct ovn_northd_lb *lb,
> > > >          }
> > > >          ds_put_format(action, "%s;", ct_lb_mark ? "ct_lb_mark" :
> > > "ct_lb");
> > > >
> > > > -        ds_put_format(match, "%s.dst == %s", ip_match, lb_vip->vip_str);
> > > > +        ds_put_format(match, REGBIT_CONNTRACK_NAT" == 1 && %s.dst == %s",
> > > > +                      ip_match, lb_vip->vip_str);
> > > >          if (lb_vip->vip_port) {
> > > >              ds_put_format(match, " && %s.dst == %d", proto,
> > > lb_vip->vip_port);
> > > >          }
> > > > diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> > > > index a70f2e678..162ec2b3b 100644
> > > > --- a/northd/ovn-northd.8.xml
> > > > +++ b/northd/ovn-northd.8.xml
> > > > @@ -440,7 +440,9 @@
> > > >        priority-110 flow is added to skip over stateful ACLs. Multicast,
> > > IPv6
> > > >        Neighbor Discovery and MLD traffic also skips stateful ACLs. For
> > > >        "allow-stateless" ACLs, a flow is added to bypass setting the hint
> > > for
> > > > -      connection tracker processing.
> > > > +      connection tracker processing when there are statelful ACLs or LB
> > > rules;
> > > > +      <code>REGBIT_ACL_STATELESS</code> is set for traffic matching such
> > > > +      flows for this purpose.
> > > >      </p>
> > > >
> > > >      <p>
> > > > @@ -460,8 +462,10 @@
> > > >        in ingress table <code>LB</code> and <code>Stateful</code>.  It
> > > contains
> > > >        a priority-0 flow that simply moves traffic to the next table.
> > > Moreover
> > > >        it contains two priority-110 flows to move multicast, IPv6 Neighbor
> > > > -      Discovery and MLD traffic to the next table. If load balancing
> > > rules with
> > > > -      virtual IP addresses (and ports) are configured in
> > > > +      Discovery and MLD traffic to the next table. It also contains two
> > > > +      priority-110 flows to move stateless traffic, i.e traffic for which
> > > > +      <code>REGBIT_ACL_STATELESS</code> is set, to the next table. If
> > > load
> > > > +      balancing rules with virtual IP addresses (and ports) are
> > > configured in
> > > >        <code>OVN_Northbound</code> database for a logical switch
> > > datapath, a
> > > >        priority-100 flow is added with the match <code>ip</code> to match
> > > on IP
> > > >        packets and sets the action <code>reg0[2] = 1; next;</code> to act
> > > as a
> > > > @@ -1859,19 +1863,11 @@ output;
> > > >        </li>
> > > >      </ul>
> > > >
> > > > -    <h3>Egress Table 0: Pre-LB</h3>
> > > > +    <h3>Egress Table 0: <code>to-lport</code> Pre-ACLs</h3>
> > > >
> > > >      <p>
> > > > -      This table is similar to ingress table <code>Pre-LB</code>.  It
> > > > -      contains a priority-0 flow that simply moves traffic to the next
> > > table.
> > > > -      Moreover it contains two priority-110 flows to move multicast, IPv6
> > > > -      Neighbor Discovery and MLD traffic to the next table. If any load
> > > > -      balancing rules exist for the datapath, a priority-100 flow is
> > > added with
> > > > -      a match of <code>ip</code> and action of <code>reg0[2] = 1;
> > > next;</code>
> > > > -      to act as a hint for table <code>Pre-stateful</code> to send IP
> > > packets
> > > > -      to the connection tracker for packet de-fragmentation and possibly
> > > DNAT
> > > > -      the destination VIP to one of the selected backend for already
> > > committed
> > > > -      load balanced traffic.
> > > > +      This is similar to ingress table <code>Pre-ACLs</code> except for
> > > > +     <code>to-lport</code> traffic.
> > > >      </p>
> > > >
> > > >      <p>
> > > > @@ -1884,11 +1880,28 @@ output;
> > > >        db="OVN_Northbound"/> table.
> > > >      </p>
> > > >
> > > > -    <h3>Egress Table 1: <code>to-lport</code> Pre-ACLs</h3>
> > > > +    <p>
> > > > +      This table also has a priority-110 flow with the match
> > > > +      <code>outport == <var>I</var></code> for all logical switch
> > > > +      datapaths to move traffic to the next table. Where <var>I</var>
> > > > +      is the peer of a logical router port. This flow is added to
> > > > +      skip the connection tracking of packets which will be entering
> > > > +      logical router datapath from logical switch datapath for routing.
> > > > +    </p>
> > > > +
> > > > +    <h3>Egress Table 1: Pre-LB</h3>
> > > >
> > > >      <p>
> > > > -      This is similar to ingress table <code>Pre-ACLs</code> except for
> > > > -     <code>to-lport</code> traffic.
> > > > +      This table is similar to ingress table <code>Pre-LB</code>.  It
> > > > +      contains a priority-0 flow that simply moves traffic to the next
> > > table.
> > > > +      Moreover it contains two priority-110 flows to move multicast, IPv6
> > > > +      Neighbor Discovery and MLD traffic to the next table. If any load
> > > > +      balancing rules exist for the datapath, a priority-100 flow is
> > > added with
> > > > +      a match of <code>ip</code> and action of <code>reg0[2] = 1;
> > > next;</code>
> > > > +      to act as a hint for table <code>Pre-stateful</code> to send IP
> > > packets
> > > > +      to the connection tracker for packet de-fragmentation and possibly
> > > DNAT
> > > > +      the destination VIP to one of the selected backend for already
> > > committed
> > > > +      load balanced traffic.
> > > >      </p>
> > > >
> > > >      <p>
> > > > @@ -1901,15 +1914,6 @@ output;
> > > >        db="OVN_Northbound"/> table.
> > > >      </p>
> > > >
> > > > -    <p>
> > > > -      This table also has a priority-110 flow with the match
> > > > -      <code>outport == <var>I</var></code> for all logical switch
> > > > -      datapaths to move traffic to the next table. Where <var>I</var>
> > > > -      is the peer of a logical router port. This flow is added to
> > > > -      skip the connection tracking of packets which will be entering
> > > > -      logical router datapath from logical switch datapath for routing.
> > > > -    </p>
> > > > -
> > > >      <h3>Egress Table 2: Pre-stateful</h3>
> > > >
> > > >      <p>
> > > > diff --git a/ovn-nb.xml b/ovn-nb.xml
> > > > index f41e9d7c0..140dd9a4f 100644
> > > > --- a/ovn-nb.xml
> > > > +++ b/ovn-nb.xml
> > > > @@ -2063,6 +2063,9 @@
> > > >            outgoing TCP traffic directed to an IP address, then you
> > > probably
> > > >            also want to define another rule to allow incoming TCP traffic
> > > coming
> > > >            from this same IP address.
> > > > +          In addition, traffic that matches stateless ACLs will bypass
> > > > +          load-balancer DNAT/un-DNAT processing. Stateful ACLs should be
> > > > +          used instead if the traffic is supposed to be load-balanced.
> > > >          </li>
> > > >
> > > >          <li>
> > > > diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> > > > index 4f399eccb..85d5acfed 100644
> > > > --- a/tests/ovn-northd.at
> > > > +++ b/tests/ovn-northd.at
> > > > @@ -2056,27 +2056,27 @@ check ovn-nbctl ls-lb-add sw0 lb1
> > > >  check ovn-nbctl add load_balancer_group $lbg load_balancer $lb3
> > > >  check ovn-nbctl --wait=sb sync
> > > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > > grep reg0 | sort], [0], [dnl
> > > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > >  ])
> > > >
> > > >  check ovn-nbctl ls-lb-add sw0 lb2
> > > >  check ovn-nbctl add load_balancer_group $lbg load_balancer $lb4
> > > >  check ovn-nbctl --wait=sb sync
> > > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > > grep reg0 | sort], [0], [dnl
> > > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > >  ])
> > > >
> > > >  check ovn-nbctl clear load_balancer $lb1 vips
> > > >  check ovn-nbctl clear load_balancer $lb3 vips
> > > >  check ovn-nbctl --wait=sb sync
> > > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > > grep reg0 | sort], [0], [dnl
> > > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > >  ])
> > > >
> > > >  check ovn-nbctl clear load_balancer $lb2 vips
> > > >  check ovn-nbctl --wait=sb sync
> > > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > > grep reg0 | sort], [0], [dnl
> > > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > >  ])
> > > >
> > > >  check ovn-nbctl clear load_balancer $lb4 vips
> > > > @@ -2091,7 +2091,7 @@ check ovn-nbctl set load_balancer $lb4
> > > vips:"10.0.0.13"="10.0.0.6"
> > > >
> > > >  check ovn-nbctl --wait=sb sync
> > > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > > grep reg0 | sort], [0], [dnl
> > > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > >  ])
> > > >
> > > >  # Now reverse the order of clearing the vip.
> > > > @@ -2099,13 +2099,13 @@ check ovn-nbctl clear load_balancer $lb2 vips
> > > >  check ovn-nbctl clear load_balancer $lb4 vips
> > > >  check ovn-nbctl --wait=sb sync
> > > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > > grep reg0 | sort], [0], [dnl
> > > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > >  ])
> > > >
> > > >  check ovn-nbctl clear load_balancer $lb1 vips
> > > >  check ovn-nbctl --wait=sb sync
> > > >  AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" |
> > > grep reg0 | sort], [0], [dnl
> > > > -  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > > +  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip),
> > > action=(reg0[[2]] = 1; next;)
> > > >  ])
> > > >
> > > >  check ovn-nbctl clear load_balancer $lb3 vips
> > > > @@ -3044,18 +3044,10 @@ for direction in from to; do
> > > >  done
> > > >  ovn-nbctl --wait=sb sync
> > > >
> > > > -# TCP packets should go to conntrack for load balancing.
> > > > +# TCP packets should not go to conntrack for load balancing.
> > > >  flow="inport == \"lsp1\" && ${flow_eth} && ${flow_ip} && ${flow_tcp}"
> > > >  AT_CHECK_UNQUOTED([ovn_trace --ct new --ct new --minimal ls "${flow}"],
> > > [0], [dnl
> > > > -ct_lb_mark {
> > > > -    ct_lb_mark {
> > > > -        reg0[[6]] = 0;
> > > > -        reg0[[12]] = 0;
> > > > -        ct_lb_mark /* default (use --ct to customize) */ {
> > > > -            output("lsp2");
> > > > -        };
> > > > -    };
> > > > -};
> > > > +output("lsp2");
> > > >  ])
> > > >
> > > >  # UDP packets still go to conntrack.
> > > > @@ -3188,18 +3180,10 @@ for direction in from to; do
> > > >  done
> > > >  ovn-nbctl --wait=sb sync
> > > >
> > > > -# TCP packets should go to conntrack for load balancing.
> > > > +# TCP packets should not go to conntrack for load balancing.
> > > >  flow="inport == \"lsp1\" && ${flow_eth} && ${flow_ip} && ${flow_tcp}"
> > > >  AT_CHECK_UNQUOTED([ovn_trace --ct new --ct new --minimal ls "${flow}"],
> > > [0], [dnl
> > > > -ct_lb_mark {
> > > > -    ct_lb_mark {
> > > > -        reg0[[6]] = 0;
> > > > -        reg0[[12]] = 0;
> > > > -        ct_lb_mark /* default (use --ct to customize) */ {
> > > > -            output("lsp2");
> > > > -        };
> > > > -    };
> > > > -};
> > > > +output("lsp2");
> > > >  ])
> > > >
> > > >  # UDP packets still go to conntrack.
> > > > @@ -4015,8 +3999,8 @@ check_stateful_flows() {
> > > >    table=? (ls_in_pre_stateful ), priority=0    , match=(1),
> > > action=(next;)
> > > >    table=? (ls_in_pre_stateful ), priority=100  , match=(reg0[[0]] == 1),
> > > action=(ct_next;)
> > > >    table=? (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> > > action=(ct_lb_mark;)
> > > > -  table=? (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > > 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80;
> > > ct_lb_mark;)
> > > > -  table=? (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > > 10.0.0.20 && tcp.dst == 80), action=(reg1 = 10.0.0.20; reg2[[0..15]] = 80;
> > > ct_lb_mark;)
> > > > +  table=? (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > > && ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
> > > reg2[[0..15]] = 80; ct_lb_mark;)
> > > > +  table=? (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > > && ip4.dst == 10.0.0.20 && tcp.dst == 80), action=(reg1 = 10.0.0.20;
> > > reg2[[0..15]] = 80; ct_lb_mark;)
> > > >  ])
> > > >
> > > >      AT_CHECK([grep "ls_in_lb" sw0flows | sort | sed
> > > 's/table=../table=??/'], [0], [dnl
> > > > @@ -7650,7 +7634,7 @@ check ovn-nbctl --wait=sb sync
> > > >  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
> > > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
> > > reg0 == 66.66.66.66 && ct_mark.natted == 1), action=(next;)
> > > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
> > > reg0 == 66.66.66.66), action=(ct_lb_mark(backends=42.42.42.2);)
> > > > -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > > 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> > > > +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > > && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> > > >    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> > > action=(ct_lb_mark;)
> > > >    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
> > > ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0;
> > > ct_lb_mark(backends=42.42.42.2);)
> > > >    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
> > > action=(ct_lb_mark;)
> > > > @@ -7662,7 +7646,7 @@ check ovn-nbctl --wait=sb sync
> > > >  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
> > > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
> > > reg0 == 66.66.66.66 && ct_label.natted == 1), action=(next;)
> > > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
> > > reg0 == 66.66.66.66), action=(ct_lb(backends=42.42.42.2);)
> > > > -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > > 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb;)
> > > > +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > > && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb;)
> > > >    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> > > action=(ct_lb;)
> > > >    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
> > > ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0; ct_lb(backends=42.42.42.2);)
> > > >    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
> > > action=(ct_lb;)
> > > > @@ -7674,7 +7658,7 @@ check ovn-nbctl --wait=sb sync
> > > >  AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
> > > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 &&
> > > reg0 == 66.66.66.66 && ct_mark.natted == 1), action=(next;)
> > > >    table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 &&
> > > reg0 == 66.66.66.66), action=(ct_lb_mark(backends=42.42.42.2);)
> > > > -  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > > 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> > > > +  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > > && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
> > > >    table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1),
> > > action=(ct_lb_mark;)
> > > >    table=11(ls_in_lb           ), priority=110  , match=(ct.new &&
> > > ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0;
> > > ct_lb_mark(backends=42.42.42.2);)
> > > >    table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1),
> > > action=(ct_lb_mark;)
> > > > diff --git a/tests/ovn.at b/tests/ovn.at
> > > > index f8b8db4df..f43455f60 100644
> > > > --- a/tests/ovn.at
> > > > +++ b/tests/ovn.at
> > > > @@ -23656,7 +23656,7 @@ OVS_WAIT_FOR_OUTPUT(
> > > >    [ovn-sbctl dump-flows > sbflows
> > > >     ovn-sbctl dump-flows sw0 | grep ct_lb_mark | grep priority=120 | sed
> > > 's/table=..//'], 0,
> > > >    [dnl
> > > > -  (ls_in_pre_stateful ), priority=120  , match=(ip4.dst == 10.0.0.10 &&
> > > tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80; ct_lb_mark;)
> > > > +  (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1 &&
> > > ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
> > > reg2[[0..15]] = 80; ct_lb_mark;)
> > > >    (ls_in_lb           ), priority=120  , match=(ct.new && ip4.dst ==
> > > 10.0.0.10 && tcp.dst == 80), action=(reg0[[1]] = 0; ct_lb_mark(backends=
> > > 10.0.0.3:80,20.0.0.3:80; hash_fields="ip_dst,ip_src,tcp_dst,tcp_src");)
> > > >  ])
> > > >
> > > > @@ -23699,7 +23699,7 @@ ovn-sbctl dump-flows sw0 > sbflows3
> > > >  AT_CHECK(
> > > >    [grep "ip4.dst == 10.0.0.10 && tcp.dst == 80" sbflows3 | grep
> > > priority=120 |\
> > > >     sed 's/table=../table=??/'], [0], [dnl
> > > > -  table=??(ls_in_pre_stateful ), priority=120  , match=(ip4.dst ==
> > > 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80;
> > > ct_lb_mark;)
> > > > +  table=??(ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1
> > > && ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10;
> > > reg2[[0..15]] = 80; ct_lb_mark;)
> > > >    table=??(ls_in_lb           ), priority=120  , match=(ct.new &&
> > > ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(drop;)
> > > >  ])
> > > >
> > > > --
> > > > 2.17.1
> > > >
> > >
> > > Thanks Venu. I have reviewed this with Venu internally and gave my Ack, but
> > > it would be good if folks can take another look before merging.
> > > Numan, since you were involved in an earlier discussion, probably you want
> > > to take a look?
> >
> > Sure.  I'll take a look and thanks for the comments below.
> >
> > Numan
> >
> > > I think there are two things to pay attention to:
> > > 1. Switching the order of PRE_ACL and PRE_LB in the LS egress pipeline. I
> > > don't see any problem here but not sure if there was any specific reason
> > > behind the original order.
> > > 2. If there are VIP traffic matching stateless ACL, they won't work any
> > > more. We think that is ok, because we can't think of a use case that would
> > > add stateless ACLs for VIP traffic because this combination doesn't make
> > > any sense. If such traffic was allowed with CT, it could actually hide a
> > > mistake made by the user - e.g. the intention was to apply stateless ACLs
> > > to bypass CT for certain traffic that was not related to VIP, but by
> > > mistake the match condition added to the ACL touched some VIP traffic. So,
> > > we think the right thing to do is to let it fail when such
> > > *misconfiguration* happens.
> > >
> > > Let me know if you have any concerns about the above points.
> > >
> > > Thanks,
> > > Han
> > > _______________________________________________
> > > dev mailing list
> > > dev@openvswitch.org
> > > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmail.openvswitch.org%2Fmailman%2Flistinfo%2Fovs-dev&amp;data=05%7C01%7Cvenugopali%40nvidia.com%7Cb334b9e522434495b9bb08dac6642a5c%7C43083d15727340c1b7db39efd9ccc17a%7C0%7C0%7C638040430761722189%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C3000%7C%7C%7C&amp;sdata=IqTdVyCGJV5deuorgE0VwTpJGtjqbfuwV5TuRXOIAQI%3D&amp;reserved=0
> > >
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
diff mbox series

Patch

diff --git a/northd/northd.c b/northd/northd.c
index b7388afc5..da4beede6 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -137,8 +137,8 @@  enum ovn_stage {
     PIPELINE_STAGE(SWITCH, IN,  L2_UNKNOWN,    24, "ls_in_l2_unknown")    \
                                                                           \
     /* Logical switch egress stages. */                                   \
-    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       0, "ls_out_pre_lb")         \
-    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      1, "ls_out_pre_acl")        \
+    PIPELINE_STAGE(SWITCH, OUT, PRE_ACL,      0, "ls_out_pre_acl")        \
+    PIPELINE_STAGE(SWITCH, OUT, PRE_LB,       1, "ls_out_pre_lb")         \
     PIPELINE_STAGE(SWITCH, OUT, PRE_STATEFUL, 2, "ls_out_pre_stateful")   \
     PIPELINE_STAGE(SWITCH, OUT, ACL_HINT,     3, "ls_out_acl_hint")       \
     PIPELINE_STAGE(SWITCH, OUT, ACL,          4, "ls_out_acl")            \
@@ -210,6 +210,7 @@  enum ovn_stage {
 #define REGBIT_ACL_LABEL          "reg0[13]"
 #define REGBIT_FROM_RAMP          "reg0[14]"
 #define REGBIT_PORT_SEC_DROP      "reg0[15]"
+#define REGBIT_ACL_STATELESS      "reg0[16]"
 
 #define REG_ORIG_DIP_IPV4         "reg1"
 #define REG_ORIG_DIP_IPV6         "xxreg1"
@@ -271,7 +272,7 @@  enum ovn_stage {
  * | R0 |     REGBIT_{CONNTRACK/DHCP/DNS}              |   |                  |
  * |    |     REGBIT_{HAIRPIN/HAIRPIN_REPLY}           |   |                  |
  * |    | REGBIT_ACL_HINT_{ALLOW_NEW/ALLOW/DROP/BLOCK} |   |                  |
- * |    |     REGBIT_ACL_LABEL                         | X |                  |
+ * |    |     REGBIT_ACL_{LABEL/STATELESS}             | X |                  |
  * +----+----------------------------------------------+ X |                  |
  * | R1 |         ORIG_DIP_IPV4 (>= IN_PRE_STATEFUL)   | R |                  |
  * +----+----------------------------------------------+ E |                  |
@@ -5677,17 +5678,18 @@  build_stateless_filter(struct ovn_datapath *od,
                        const struct nbrec_acl *acl,
                        struct hmap *lflows)
 {
+    const char *action = REGBIT_ACL_STATELESS" = 1; next;";
     if (!strcmp(acl->direction, "from-lport")) {
         ovn_lflow_add_with_hint(lflows, od, S_SWITCH_IN_PRE_ACL,
                                 acl->priority + OVN_ACL_PRI_OFFSET,
                                 acl->match,
-                                "next;",
+                                action,
                                 &acl->header_);
     } else {
         ovn_lflow_add_with_hint(lflows, od, S_SWITCH_OUT_PRE_ACL,
                                 acl->priority + OVN_ACL_PRI_OFFSET,
                                 acl->match,
-                                "next;",
+                                action,
                                 &acl->header_);
     }
 }
@@ -5779,6 +5781,10 @@  build_pre_acls(struct ovn_datapath *od, const struct hmap *port_groups,
                       REGBIT_CONNTRACK_DEFRAG" = 1; next;");
         ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_ACL, 100, "ip",
                       REGBIT_CONNTRACK_DEFRAG" = 1; next;");
+    } else if (od->has_lb_vip) {
+        /* We'll build stateless filters if there are LB rules so that
+         * the stateless flows are not tracked in pre-lb. */
+        build_stateless_filters(od, port_groups, lflows);
     }
 }
 
@@ -5913,6 +5919,11 @@  build_pre_lb(struct ovn_datapath *od, const struct shash *meter_groups,
                                  S_SWITCH_IN_PRE_LB, S_SWITCH_OUT_PRE_LB,
                                  110, lflows);
     }
+    /* Do not sent statless flows via conntrack */
+    ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110,
+                  REGBIT_ACL_STATELESS" == 1", "next;");
+    ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 110,
+                  REGBIT_ACL_STATELESS" == 1", "next;");
 
     /* 'REGBIT_CONNTRACK_NAT' is set to let the pre-stateful table send
      * packet to conntrack for defragmentation and possibly for unNATting.
@@ -6918,7 +6929,8 @@  build_lb_rules_pre_stateful(struct hmap *lflows, struct ovn_northd_lb *lb,
         }
         ds_put_format(action, "%s;", ct_lb_mark ? "ct_lb_mark" : "ct_lb");
 
-        ds_put_format(match, "%s.dst == %s", ip_match, lb_vip->vip_str);
+        ds_put_format(match, REGBIT_CONNTRACK_NAT" == 1 && %s.dst == %s",
+                      ip_match, lb_vip->vip_str);
         if (lb_vip->vip_port) {
             ds_put_format(match, " && %s.dst == %d", proto, lb_vip->vip_port);
         }
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index a70f2e678..162ec2b3b 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -440,7 +440,9 @@ 
       priority-110 flow is added to skip over stateful ACLs. Multicast, IPv6
       Neighbor Discovery and MLD traffic also skips stateful ACLs. For
       "allow-stateless" ACLs, a flow is added to bypass setting the hint for
-      connection tracker processing.
+      connection tracker processing when there are statelful ACLs or LB rules;
+      <code>REGBIT_ACL_STATELESS</code> is set for traffic matching such
+      flows for this purpose.
     </p>
 
     <p>
@@ -460,8 +462,10 @@ 
       in ingress table <code>LB</code> and <code>Stateful</code>.  It contains
       a priority-0 flow that simply moves traffic to the next table. Moreover
       it contains two priority-110 flows to move multicast, IPv6 Neighbor
-      Discovery and MLD traffic to the next table. If load balancing rules with
-      virtual IP addresses (and ports) are configured in
+      Discovery and MLD traffic to the next table. It also contains two
+      priority-110 flows to move stateless traffic, i.e traffic for which
+      <code>REGBIT_ACL_STATELESS</code> is set, to the next table. If load
+      balancing rules with virtual IP addresses (and ports) are configured in
       <code>OVN_Northbound</code> database for a logical switch datapath, a
       priority-100 flow is added with the match <code>ip</code> to match on IP
       packets and sets the action <code>reg0[2] = 1; next;</code> to act as a
@@ -1859,19 +1863,11 @@  output;
       </li>
     </ul>
 
-    <h3>Egress Table 0: Pre-LB</h3>
+    <h3>Egress Table 0: <code>to-lport</code> Pre-ACLs</h3>
 
     <p>
-      This table is similar to ingress table <code>Pre-LB</code>.  It
-      contains a priority-0 flow that simply moves traffic to the next table.
-      Moreover it contains two priority-110 flows to move multicast, IPv6
-      Neighbor Discovery and MLD traffic to the next table. If any load
-      balancing rules exist for the datapath, a priority-100 flow is added with
-      a match of <code>ip</code> and action of <code>reg0[2] = 1; next;</code>
-      to act as a hint for table <code>Pre-stateful</code> to send IP packets
-      to the connection tracker for packet de-fragmentation and possibly DNAT
-      the destination VIP to one of the selected backend for already committed
-      load balanced traffic.
+      This is similar to ingress table <code>Pre-ACLs</code> except for
+     <code>to-lport</code> traffic.
     </p>
 
     <p>
@@ -1884,11 +1880,28 @@  output;
       db="OVN_Northbound"/> table.
     </p>
 
-    <h3>Egress Table 1: <code>to-lport</code> Pre-ACLs</h3>
+    <p>
+      This table also has a priority-110 flow with the match
+      <code>outport == <var>I</var></code> for all logical switch
+      datapaths to move traffic to the next table. Where <var>I</var>
+      is the peer of a logical router port. This flow is added to
+      skip the connection tracking of packets which will be entering
+      logical router datapath from logical switch datapath for routing.
+    </p>
+
+    <h3>Egress Table 1: Pre-LB</h3>
 
     <p>
-      This is similar to ingress table <code>Pre-ACLs</code> except for
-     <code>to-lport</code> traffic.
+      This table is similar to ingress table <code>Pre-LB</code>.  It
+      contains a priority-0 flow that simply moves traffic to the next table.
+      Moreover it contains two priority-110 flows to move multicast, IPv6
+      Neighbor Discovery and MLD traffic to the next table. If any load
+      balancing rules exist for the datapath, a priority-100 flow is added with
+      a match of <code>ip</code> and action of <code>reg0[2] = 1; next;</code>
+      to act as a hint for table <code>Pre-stateful</code> to send IP packets
+      to the connection tracker for packet de-fragmentation and possibly DNAT
+      the destination VIP to one of the selected backend for already committed
+      load balanced traffic.
     </p>
 
     <p>
@@ -1901,15 +1914,6 @@  output;
       db="OVN_Northbound"/> table.
     </p>
 
-    <p>
-      This table also has a priority-110 flow with the match
-      <code>outport == <var>I</var></code> for all logical switch
-      datapaths to move traffic to the next table. Where <var>I</var>
-      is the peer of a logical router port. This flow is added to
-      skip the connection tracking of packets which will be entering
-      logical router datapath from logical switch datapath for routing.
-    </p>
-
     <h3>Egress Table 2: Pre-stateful</h3>
 
     <p>
diff --git a/ovn-nb.xml b/ovn-nb.xml
index f41e9d7c0..140dd9a4f 100644
--- a/ovn-nb.xml
+++ b/ovn-nb.xml
@@ -2063,6 +2063,9 @@ 
           outgoing TCP traffic directed to an IP address, then you probably
           also want to define another rule to allow incoming TCP traffic coming
           from this same IP address.
+          In addition, traffic that matches stateless ACLs will bypass
+          load-balancer DNAT/un-DNAT processing. Stateful ACLs should be
+          used instead if the traffic is supposed to be load-balanced.
         </li>
 
         <li>
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index 4f399eccb..85d5acfed 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -2056,27 +2056,27 @@  check ovn-nbctl ls-lb-add sw0 lb1
 check ovn-nbctl add load_balancer_group $lbg load_balancer $lb3
 check ovn-nbctl --wait=sb sync
 AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" | grep reg0 | sort], [0], [dnl
-  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
+  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
 ])
 
 check ovn-nbctl ls-lb-add sw0 lb2
 check ovn-nbctl add load_balancer_group $lbg load_balancer $lb4
 check ovn-nbctl --wait=sb sync
 AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" | grep reg0 | sort], [0], [dnl
-  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
+  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
 ])
 
 check ovn-nbctl clear load_balancer $lb1 vips
 check ovn-nbctl clear load_balancer $lb3 vips
 check ovn-nbctl --wait=sb sync
 AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" | grep reg0 | sort], [0], [dnl
-  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
+  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
 ])
 
 check ovn-nbctl clear load_balancer $lb2 vips
 check ovn-nbctl --wait=sb sync
 AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" | grep reg0 | sort], [0], [dnl
-  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
+  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
 ])
 
 check ovn-nbctl clear load_balancer $lb4 vips
@@ -2091,7 +2091,7 @@  check ovn-nbctl set load_balancer $lb4 vips:"10.0.0.13"="10.0.0.6"
 
 check ovn-nbctl --wait=sb sync
 AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" | grep reg0 | sort], [0], [dnl
-  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
+  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
 ])
 
 # Now reverse the order of clearing the vip.
@@ -2099,13 +2099,13 @@  check ovn-nbctl clear load_balancer $lb2 vips
 check ovn-nbctl clear load_balancer $lb4 vips
 check ovn-nbctl --wait=sb sync
 AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" | grep reg0 | sort], [0], [dnl
-  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
+  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
 ])
 
 check ovn-nbctl clear load_balancer $lb1 vips
 check ovn-nbctl --wait=sb sync
 AT_CHECK([ovn-sbctl lflow-list | grep "ls_out_pre_lb.*priority=100" | grep reg0 | sort], [0], [dnl
-  table=0 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
+  table=1 (ls_out_pre_lb      ), priority=100  , match=(ip), action=(reg0[[2]] = 1; next;)
 ])
 
 check ovn-nbctl clear load_balancer $lb3 vips
@@ -3044,18 +3044,10 @@  for direction in from to; do
 done
 ovn-nbctl --wait=sb sync
 
-# TCP packets should go to conntrack for load balancing.
+# TCP packets should not go to conntrack for load balancing.
 flow="inport == \"lsp1\" && ${flow_eth} && ${flow_ip} && ${flow_tcp}"
 AT_CHECK_UNQUOTED([ovn_trace --ct new --ct new --minimal ls "${flow}"], [0], [dnl
-ct_lb_mark {
-    ct_lb_mark {
-        reg0[[6]] = 0;
-        reg0[[12]] = 0;
-        ct_lb_mark /* default (use --ct to customize) */ {
-            output("lsp2");
-        };
-    };
-};
+output("lsp2");
 ])
 
 # UDP packets still go to conntrack.
@@ -3188,18 +3180,10 @@  for direction in from to; do
 done
 ovn-nbctl --wait=sb sync
 
-# TCP packets should go to conntrack for load balancing.
+# TCP packets should not go to conntrack for load balancing.
 flow="inport == \"lsp1\" && ${flow_eth} && ${flow_ip} && ${flow_tcp}"
 AT_CHECK_UNQUOTED([ovn_trace --ct new --ct new --minimal ls "${flow}"], [0], [dnl
-ct_lb_mark {
-    ct_lb_mark {
-        reg0[[6]] = 0;
-        reg0[[12]] = 0;
-        ct_lb_mark /* default (use --ct to customize) */ {
-            output("lsp2");
-        };
-    };
-};
+output("lsp2");
 ])
 
 # UDP packets still go to conntrack.
@@ -4015,8 +3999,8 @@  check_stateful_flows() {
   table=? (ls_in_pre_stateful ), priority=0    , match=(1), action=(next;)
   table=? (ls_in_pre_stateful ), priority=100  , match=(reg0[[0]] == 1), action=(ct_next;)
   table=? (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1), action=(ct_lb_mark;)
-  table=? (ls_in_pre_stateful ), priority=120  , match=(ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80; ct_lb_mark;)
-  table=? (ls_in_pre_stateful ), priority=120  , match=(ip4.dst == 10.0.0.20 && tcp.dst == 80), action=(reg1 = 10.0.0.20; reg2[[0..15]] = 80; ct_lb_mark;)
+  table=? (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1 && ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80; ct_lb_mark;)
+  table=? (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1 && ip4.dst == 10.0.0.20 && tcp.dst == 80), action=(reg1 = 10.0.0.20; reg2[[0..15]] = 80; ct_lb_mark;)
 ])
 
     AT_CHECK([grep "ls_in_lb" sw0flows | sort | sed 's/table=../table=??/'], [0], [dnl
@@ -7650,7 +7634,7 @@  check ovn-nbctl --wait=sb sync
 AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
   table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 && reg0 == 66.66.66.66 && ct_mark.natted == 1), action=(next;)
   table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 && reg0 == 66.66.66.66), action=(ct_lb_mark(backends=42.42.42.2);)
-  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
+  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1 && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
   table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1), action=(ct_lb_mark;)
   table=11(ls_in_lb           ), priority=110  , match=(ct.new && ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0; ct_lb_mark(backends=42.42.42.2);)
   table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1), action=(ct_lb_mark;)
@@ -7662,7 +7646,7 @@  check ovn-nbctl --wait=sb sync
 AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
   table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 && reg0 == 66.66.66.66 && ct_label.natted == 1), action=(next;)
   table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 && reg0 == 66.66.66.66), action=(ct_lb(backends=42.42.42.2);)
-  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb;)
+  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1 && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb;)
   table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1), action=(ct_lb;)
   table=11(ls_in_lb           ), priority=110  , match=(ct.new && ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0; ct_lb(backends=42.42.42.2);)
   table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1), action=(ct_lb;)
@@ -7674,7 +7658,7 @@  check ovn-nbctl --wait=sb sync
 AT_CHECK([ovn-sbctl lflow-list | grep -e natted -e ct_lb], [0], [dnl
   table=6 (lr_in_dnat         ), priority=110  , match=(ct.est && ip4 && reg0 == 66.66.66.66 && ct_mark.natted == 1), action=(next;)
   table=6 (lr_in_dnat         ), priority=110  , match=(ct.new && ip4 && reg0 == 66.66.66.66), action=(ct_lb_mark(backends=42.42.42.2);)
-  table=6 (ls_in_pre_stateful ), priority=120  , match=(ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
+  table=6 (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1 && ip4.dst == 66.66.66.66), action=(reg1 = 66.66.66.66; ct_lb_mark;)
   table=6 (ls_in_pre_stateful ), priority=110  , match=(reg0[[2]] == 1), action=(ct_lb_mark;)
   table=11(ls_in_lb           ), priority=110  , match=(ct.new && ip4.dst == 66.66.66.66), action=(reg0[[1]] = 0; ct_lb_mark(backends=42.42.42.2);)
   table=2 (ls_out_pre_stateful), priority=110  , match=(reg0[[2]] == 1), action=(ct_lb_mark;)
diff --git a/tests/ovn.at b/tests/ovn.at
index f8b8db4df..f43455f60 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -23656,7 +23656,7 @@  OVS_WAIT_FOR_OUTPUT(
   [ovn-sbctl dump-flows > sbflows
    ovn-sbctl dump-flows sw0 | grep ct_lb_mark | grep priority=120 | sed 's/table=..//'], 0,
   [dnl
-  (ls_in_pre_stateful ), priority=120  , match=(ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80; ct_lb_mark;)
+  (ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1 && ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80; ct_lb_mark;)
   (ls_in_lb           ), priority=120  , match=(ct.new && ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg0[[1]] = 0; ct_lb_mark(backends=10.0.0.3:80,20.0.0.3:80; hash_fields="ip_dst,ip_src,tcp_dst,tcp_src");)
 ])
 
@@ -23699,7 +23699,7 @@  ovn-sbctl dump-flows sw0 > sbflows3
 AT_CHECK(
   [grep "ip4.dst == 10.0.0.10 && tcp.dst == 80" sbflows3 | grep priority=120 |\
    sed 's/table=../table=??/'], [0], [dnl
-  table=??(ls_in_pre_stateful ), priority=120  , match=(ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80; ct_lb_mark;)
+  table=??(ls_in_pre_stateful ), priority=120  , match=(reg0[[2]] == 1 && ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(reg1 = 10.0.0.10; reg2[[0..15]] = 80; ct_lb_mark;)
   table=??(ls_in_lb           ), priority=120  , match=(ct.new && ip4.dst == 10.0.0.10 && tcp.dst == 80), action=(drop;)
 ])