diff mbox series

[ovs-dev,v2,3/3] northd: rely on new actions for ecmp-symmetric routing

Message ID cb0aa7142c8948c60bd42203b8c7375613170f01.1657884720.git.lorenzo.bianconi@redhat.com
State Accepted
Headers show
Series Improve ECMP symmetric routing reliability | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test fail github build: failed
ovsrobot/github-robot-_ovn-kubernetes fail github build: failed

Commit Message

Lorenzo Bianconi July 15, 2022, 11:35 a.m. UTC
Rely on the following new actions for ecmp-symmetric routing:
- chk_ecmp_nh_mac
- chk_ecmp_nh
- commit_ecmp_nh

In this way OVN is able to keep up if for any reason the ECMP traffic
source changes L2 address and keeps old L3 addresses.

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2096233
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
---
 northd/northd.c         | 102 +++++++++++++++++++++++++++++++++++-----
 northd/ovn-northd.8.xml |  25 ++++++----
 tests/ovn-northd.at     |  18 ++++++-
 tests/ovn.at            |   4 +-
 tests/system-ovn.at     |  79 ++++++++++++++++++++++++++-----
 5 files changed, 191 insertions(+), 37 deletions(-)

Comments

Han Zhou Aug. 19, 2022, 3:09 p.m. UTC | #1
On Fri, Jul 15, 2022 at 4:35 AM Lorenzo Bianconi <
lorenzo.bianconi@redhat.com> wrote:
>
> Rely on the following new actions for ecmp-symmetric routing:
> - chk_ecmp_nh_mac
> - chk_ecmp_nh
> - commit_ecmp_nh
>
> In this way OVN is able to keep up if for any reason the ECMP traffic
> source changes L2 address and keeps old L3 addresses.

Hi Lorenzo, do you have more details of the problem scenario? It seems the
bugzilla access is limited.

>
> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2096233
> Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
> ---
>  northd/northd.c         | 102 +++++++++++++++++++++++++++++++++++-----
>  northd/ovn-northd.8.xml |  25 ++++++----
>  tests/ovn-northd.at     |  18 ++++++-
>  tests/ovn.at            |   4 +-
>  tests/system-ovn.at     |  79 ++++++++++++++++++++++++++-----
>  5 files changed, 191 insertions(+), 37 deletions(-)
>
> diff --git a/northd/northd.c b/northd/northd.c
> index d31cb1688..57ec3675b 100644
> --- a/northd/northd.c
> +++ b/northd/northd.c
> @@ -227,6 +227,7 @@ enum ovn_stage {
>  #define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]"
>  #define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]"
>  #define REGBIT_DST_NAT_IP_LOCAL "reg9[4]"
> +#define REGBIT_KNOWN_ECMP_NH    "reg9[5]"
>
>  /* Register to store the eth address associated to a router port for
packets
>   * received in S_ROUTER_IN_ADMISSION.
> @@ -327,7 +328,8 @@ enum ovn_stage {
>   * |     |   EGRESS_LOOPBACK/       | G |     UNUSED      |
>   * | R9  |   PKT_LARGER/            | 4 |                 |
>   * |     |   LOOKUP_NEIGHBOR_RESULT/|   |                 |
> - * |     |   SKIP_LOOKUP_NEIGHBOR}  |   |                 |
> + * |     |   SKIP_LOOKUP_NEIGHBOR/  |   |                 |
> + * |     |   KNOWN_ECMP_NH}         |   |                 |
>   * |     |                          |   |                 |
>   * |     | REG_ORIG_TP_DPORT_ROUTER |   |                 |
>   * |     |                          |   |                 |
> @@ -9437,6 +9439,7 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
>                                 struct ds *route_match)
>  {
>      const struct nbrec_logical_router_static_route *st_route =
route->route;
> +    struct ds base_match = DS_EMPTY_INITIALIZER;
>      struct ds match = DS_EMPTY_INITIALIZER;
>      struct ds actions = DS_EMPTY_INITIALIZER;
>      struct ds ecmp_reply = DS_EMPTY_INITIALIZER;
> @@ -9448,20 +9451,22 @@ add_ecmp_symmetric_reply_flows(struct hmap
*lflows,
>      /* If symmetric ECMP replies are enabled, then packets that arrive
over
>       * an ECMP route need to go through conntrack.
>       */
> -    ds_put_format(&match, "inport == %s && ip%s.%s == %s",
> +    ds_put_format(&base_match, "inport == %s && ip%s.%s == %s",
>                    out_port->json_key,
>                    IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "4" : "6",
>                    route->is_src_route ? "dst" : "src",
>                    cidr);
>      free(cidr);
>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
> -                            ds_cstr(&match), "ct_next;",
> -                            &st_route->header_);
> +            ds_cstr(&base_match),
> +            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh_mac(); ct_next;",
> +            &st_route->header_);
>
>      /* And packets that go out over an ECMP route need conntrack */
>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
> -                            ds_cstr(route_match), "ct_next;",
> -                            &st_route->header_);
> +            ds_cstr(route_match),
> +            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh(); ct_next;",
> +            &st_route->header_);
>
>      /* Save src eth and inport in ct_label for packets that arrive over
>       * an ECMP route.
> @@ -9469,11 +9474,84 @@ add_ecmp_symmetric_reply_flows(struct hmap
*lflows,
>       * NOTE: we purposely are not clearing match before this
>       * ds_put_cstr() call. The previous contents are needed.
>       */
> -    ds_put_cstr(&match, " && (ct.new && !ct.est)");
> +    ds_put_format(&match, "%s && (ct.new && !ct.est) && tcp",
> +                  ds_cstr(&base_match));
> +    ds_put_format(&actions,
> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> +            " %s = %" PRId64 ";}; "
> +            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> +                            ds_cstr(&match), ds_cstr(&actions),
> +                            &st_route->header_);
> +    ds_clear(&match);
> +    ds_put_format(&match, "%s && (ct.new && !ct.est) && udp",
> +                  ds_cstr(&base_match));
> +    ds_clear(&actions);
> +    ds_put_format(&actions,
> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> +            " %s = %" PRId64 ";}; "
> +            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> +                            ds_cstr(&match), ds_cstr(&actions),
> +                            &st_route->header_);
> +    ds_clear(&match);
> +    ds_put_format(&match, "%s && (ct.new && !ct.est) && sctp",
> +                  ds_cstr(&base_match));
> +    ds_clear(&actions);
> +    ds_put_format(&actions,
> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> +            " %s = %" PRId64 ";}; "
> +            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> +                            ds_cstr(&match), ds_cstr(&actions),
> +                            &st_route->header_);
>
> -    ds_put_format(&actions, "ct_commit { ct_label.ecmp_reply_eth =
eth.src;"
> -                  " %s = %" PRId64 ";}; next;",
> -                  ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
> +    ds_clear(&match);
> +    ds_put_format(&match,
> +            "%s && (!ct.rpl && ct.est) && tcp && "REGBIT_KNOWN_ECMP_NH"
== 0",
> +            ds_cstr(&base_match));
> +    ds_clear(&actions);
> +    ds_put_format(&actions,
> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> +            " %s = %" PRId64 ";}; "
> +            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> +                            ds_cstr(&match), ds_cstr(&actions),
> +                            &st_route->header_);
> +
> +    ds_clear(&match);
> +    ds_put_format(&match,
> +            "%s && (!ct.rpl && ct.est) && udp && "REGBIT_KNOWN_ECMP_NH"
== 0",
> +            ds_cstr(&base_match));
> +    ds_clear(&actions);
> +    ds_put_format(&actions,
> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> +            " %s = %" PRId64 ";}; "

Sorry that I haven't reviewed the details yet, but this flow would break HW
offload again, which was fixed in 22.03 and backported to 21.12. cc @Mark
Michelson <mmichels@redhat.com>

Thanks,
Han

> +            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> +                            ds_cstr(&match), ds_cstr(&actions),
> +                            &st_route->header_);
> +    ds_clear(&match);
> +    ds_put_format(&match,
> +            "%s && (!ct.rpl && ct.est) && sctp && "REGBIT_KNOWN_ECMP_NH"
== 0",
> +            ds_cstr(&base_match));
> +    ds_clear(&actions);
> +    ds_put_format(&actions,
> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> +            " %s = %" PRId64 ";}; "
> +            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
>                              ds_cstr(&match), ds_cstr(&actions),
>                              &st_route->header_);
> @@ -9481,7 +9559,8 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
>      /* Bypass ECMP selection if we already have ct_label information
>       * for where to route the packet.
>       */
> -    ds_put_format(&ecmp_reply, "ct.rpl && %s == %"PRId64,
> +    ds_put_format(&ecmp_reply,
> +                  "ct.rpl && "REGBIT_KNOWN_ECMP_NH" == 1 && %s ==
%"PRId64,
>                    ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
>      ds_clear(&match);
>      ds_put_format(&match, "%s && %s", ds_cstr(&ecmp_reply),
> @@ -9517,6 +9596,7 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
>                              200, ds_cstr(&ecmp_reply),
>                              action, &st_route->header_);
>
> +    ds_destroy(&base_match);
>      ds_destroy(&match);
>      ds_destroy(&actions);
>      ds_destroy(&ecmp_reply);
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index a87df24bd..aa7196a92 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -3166,8 +3166,12 @@ icmp6 {
>        configured. The matching logic for these ports essentially
reverses the
>        configured logic of the ECMP route. So for instance, a route with a
>        destination routing policy will instead match if the source IP
address
> -      matches the static route's prefix. The flow uses the action
> -      <code>ct_next</code> to send IP packets to the connection tracker
for
> +      matches the static route's prefix. The flow uses the actions
> +      <code>chk_ecmp_nh_mac(); ct_next</code> or
> +      <code>chk_ecmp_nh(); ct_next</code> to send IP packets to table
> +      <code>OFTABLE_ECMP_NH_MAC</code> or to table
> +      <code>OFTABLE_ECMP_NH</code> in order to check if source info are
already
> +      stored by OVN and then to the connection tracker for
>        packet de-fragmentation and tracking before sending it to the next
table.
>      </p>
>
> @@ -3426,10 +3430,11 @@ icmp6 {
>          route with a destination routing policy will instead match if the
>          source IP address matches the static route's prefix. The flow
uses
>          the action <code>ct_commit { ct_label.ecmp_reply_eth = eth.src;"
> -        " ct_mark.ecmp_reply_port = <var>K</var>;}; next; </code> to
commit
> -        the connection and storing <code>eth.src</code> and the ECMP
> -        reply port binding tunnel key <var>K</var> in the
> -        <code>ct_label</code>.
> +        " ct_mark.ecmp_reply_port = <var>K</var>;}; commit_ecmp_nh();
next;
> +        </code> to commit the connection and storing
<code>eth.src</code> and
> +        the ECMP reply port binding tunnel key <var>K</var> in the
> +        <code>ct_label</code> and the traffic pattern to table
> +        <code>OFTABLE_ECMP_NH_MAC</code> or <code>OFTABLE_ECMP_NH</code>.
>        </li>
>      </ul>
>
> @@ -3568,10 +3573,10 @@ output;
>        which the packet should be sent. The
<code>ct_mark.ecmp_reply_port</code>
>        tells the logical router port on which the packet should be sent.
These
>        values saved to the conntrack fields when the initial ingress
traffic is
> -      received over the ECMP route and committed to conntrack. The
> -      priority-10300 flows in this stage set the <code>outport</code>,
> -      while the <code>eth.dst</code> is set by flows at the ARP/ND
Resolution
> -      stage.
> +      received over the ECMP route and committed to conntrack.
> +      If <code>REGBIT_KNOWN_ECMP_NH</code> is set, the priority-10300
> +      flows in this stage set the <code>outport</code>, while the
> +      <code>eth.dst</code> is set by flows at the ARP/ND Resolution
stage.
>      </p>
>
>      <p>
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index 610a5ce12..0c7b24816 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -5644,10 +5644,24 @@ AT_CHECK([grep -e "lr_in_ip_routing_ecmp"
lr0flows | sed 's/192\.168\.0\..0/192.
>    table=??(lr_in_ip_routing_ecmp), priority=150  , match=(reg8[[0..15]]
== 0), action=(next;)
>  ])
>
> +AT_CHECK([grep -e "lr_in_ecmp_stateful".*commit_ecmp_nh lr0flows | sed
's/table=../table=??/' | sort], [0], [dnl
> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
"lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && sctp &&
reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
 ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
sctp); next;)
> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
"lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && tcp &&
reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
 ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp);
next;)
> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
"lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && udp &&
reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
 ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp);
next;)
> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
"lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && sctp),
action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
 ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
sctp); next;)
> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
"lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && tcp),
action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
 ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp);
next;)
> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
"lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && udp),
action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
 ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp);
next;)
> +])
> +
> +AT_CHECK([grep -e "lr_in_defrag".*chk_ecmp_nh* lr0flows | sed
's/table=../table=??/' | sort], [0], [dnl
> +  table=??(lr_in_defrag       ), priority=100  , match=(inport ==
"lr0-public" && ip4.src == 1.0.0.1), action=(reg9[[5]] = chk_ecmp_nh_mac();
ct_next;)
> +  table=??(lr_in_defrag       ), priority=100  , match=(reg7 == 0 &&
ip4.dst == 1.0.0.1/32), action=(reg9[[5]] = chk_ecmp_nh(); ct_next;)
> +])
> +
>  dnl The chassis was created with other_config:ct-no-masked-label=false,
the flows
>  dnl should be using ct_label.ecmp_reply_port.
>  AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
's/table=../table=??/'], [0], [dnl
> -  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> +  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
reg9[[5]] == 1 && ct_label.ecmp_reply_port == 1), action=(push(xxreg1);
xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
>  ])
>
>  dnl Simulate an ovn-controller upgrade to a version that supports
> @@ -5657,7 +5671,7 @@ check ovn-sbctl set chassis ch1
other_config:ct-no-masked-label=true
>  check ovn-nbctl --wait=sb sync
>  ovn-sbctl dump-flows lr0 > lr0flows
>  AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
's/table=../table=??/'], [0], [dnl
> -  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> +  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
reg9[[5]] == 1 && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1);
xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
>  ])
>
>  # add ecmp route with wrong nexthop
> diff --git a/tests/ovn.at b/tests/ovn.at
> index 91dc3b9d6..12ba3f6d0 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -27173,7 +27173,7 @@ AT_CHECK([
>          grep "priority=200" | \
>          grep -c
"move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
>      done; :], [0], [dnl
> -1
> +6
>  1
>  0
>  0
> @@ -27298,7 +27298,7 @@ AT_CHECK([
>          grep "priority=200" | \
>          grep -c
"move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
>      done; :], [0], [dnl
> -1
> +6
>  1
>  0
>  0
> diff --git a/tests/system-ovn.at b/tests/system-ovn.at
> index 4a8fdede8..b3096deb3 100644
> --- a/tests/system-ovn.at
> +++ b/tests/system-ovn.at
> @@ -5956,11 +5956,8 @@ ovn-nbctl --wait=hv sync
>
>  on_exit 'ovs-ofctl dump-flows br-int'
>
> -# 'bob1' should be able to ping 'alice1' directly.
> -NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 10.0.0.2 |
FORMAT_PING], \
> -[0], [dnl
> -20 packets transmitted, 20 received, 0% packet loss, time 0ms
> -])
> +NETNS_DAEMONIZE([alice1], [nc -l -k 80], [alice1.pid])
> +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
>
>  # Ensure conntrack entry is present. We should not try to predict
>  # the tunnel key for the output port, so we strip it from the labels
> @@ -5968,7 +5965,7 @@ NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15
10.0.0.2 | FORMAT_PING], \
>  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1) | \
>  sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
>  sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
>
-icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
>
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
>  ])
>
>  # Ensure datapaths show conntrack states as expected
> @@ -5981,6 +5978,36 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep
'ct_state(-new+est+rpl+trk).*ct_lab
>  1
>  ])
>
> +# Flush conntrack entries for easier output parsing of next test.
> +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> +
> +# Change bob1 L2 address anche check the reply is properly updated.
> +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> +ovn-nbctl set Logical_Switch_Port r2-ext \
> +     type=router options:router-port=R2_ext
addresses='"00:00:10:01:02:04"'
> +
> +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c], [0],
[dnl
> +1
> +])
> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0], [dnl
> +1
> +])
> +
> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 |
FORMAT_CT(172.16.0.1) | \
> +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> +sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
>
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
> +])
> +
> +# Check entries in table 76 and 77 expires w/o traffic
> +OVS_WAIT_UNTIL([
> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets') -eq 0
> +])
> +OVS_WAIT_UNTIL([
> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets') -eq 0
> +])
> +
>  ovs-ofctl dump-flows br-int
>
>  OVS_APP_EXIT_AND_WAIT([ovn-controller])
> @@ -6122,11 +6149,8 @@ ovn-nbctl --wait=hv sync
>
>  on_exit 'ovs-ofctl dump-flows br-int'
>
> -# 'bob1' should be able to ping 'alice1' directly.
> -NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 fd01::2 |
FORMAT_PING], \
> -[0], [dnl
> -20 packets transmitted, 20 received, 0% packet loss, time 0ms
> -])
> +NETNS_DAEMONIZE([alice1], [nc -6 -l -k 80], [alice1.pid])
> +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
>
>  # Ensure datapaths show conntrack states as expected
>  # Like with conntrack entries, we shouldn't try to predict
> @@ -6145,7 +6169,38 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep
'ct_state(-new+est+rpl+trk).*ct_lab
>  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2) | \
>  sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
>  sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
>
-icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
>
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
> +])
> +
> +# Flush conntrack entries for easier output parsing of next test.
> +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> +
> +# Change bob1 L2 address anche check the reply is properly updated.
> +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> +ovn-nbctl set Logical_Switch_Port r2-ext \
> +     type=router options:router-port=R2_ext
addresses='"00:00:10:01:02:04"'
> +
> +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
> +
> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c], [0],
[dnl
> +1
> +])
> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0], [dnl
> +1
> +])
> +
> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 |
FORMAT_CT(fd01::2) | \
> +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> +sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
>
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
> +])
> +
> +# Check entries in table 76 and 77 expires w/o traffic
> +OVS_WAIT_UNTIL([
> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets') -eq 0
> +])
> +OVS_WAIT_UNTIL([
> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets') -eq 0
>  ])
>
>  ovs-ofctl dump-flows br-int
> --
> 2.36.1
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Lorenzo Bianconi Aug. 19, 2022, 3:37 p.m. UTC | #2
> On Fri, Jul 15, 2022 at 4:35 AM Lorenzo Bianconi <
> lorenzo.bianconi@redhat.com> wrote:
> >
> > Rely on the following new actions for ecmp-symmetric routing:
> > - chk_ecmp_nh_mac
> > - chk_ecmp_nh
> > - commit_ecmp_nh
> >
> > In this way OVN is able to keep up if for any reason the ECMP traffic
> > source changes L2 address and keeps old L3 addresses.
> 
> Hi Lorenzo, do you have more details of the problem scenario? It seems the
> bugzilla access is limited.

Hi Han,

in the current codebase, to implement ecmp symmetric-reply, ovn stores
the next-hop mac address (let's say mac0) and in-port in ct_label and
ct_mark respectively.
An issue occurs whenever the traffic next-hop changes l2 address from
mac0 to mac1 w/o changing l3/l4 info (e.g. src ip address).
In this scenario ovn is not able to keep up and recover from this change.
Agree?

> 
> >
> > Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2096233
> > Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
> > ---
> >  northd/northd.c         | 102 +++++++++++++++++++++++++++++++++++-----
> >  northd/ovn-northd.8.xml |  25 ++++++----
> >  tests/ovn-northd.at     |  18 ++++++-
> >  tests/ovn.at            |   4 +-
> >  tests/system-ovn.at     |  79 ++++++++++++++++++++++++++-----
> >  5 files changed, 191 insertions(+), 37 deletions(-)
> >
> > diff --git a/northd/northd.c b/northd/northd.c
> > index d31cb1688..57ec3675b 100644
> > --- a/northd/northd.c
> > +++ b/northd/northd.c
> > @@ -227,6 +227,7 @@ enum ovn_stage {
> >  #define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]"
> >  #define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]"
> >  #define REGBIT_DST_NAT_IP_LOCAL "reg9[4]"
> > +#define REGBIT_KNOWN_ECMP_NH    "reg9[5]"
> >
> >  /* Register to store the eth address associated to a router port for
> packets
> >   * received in S_ROUTER_IN_ADMISSION.
> > @@ -327,7 +328,8 @@ enum ovn_stage {
> >   * |     |   EGRESS_LOOPBACK/       | G |     UNUSED      |
> >   * | R9  |   PKT_LARGER/            | 4 |                 |
> >   * |     |   LOOKUP_NEIGHBOR_RESULT/|   |                 |
> > - * |     |   SKIP_LOOKUP_NEIGHBOR}  |   |                 |
> > + * |     |   SKIP_LOOKUP_NEIGHBOR/  |   |                 |
> > + * |     |   KNOWN_ECMP_NH}         |   |                 |
> >   * |     |                          |   |                 |
> >   * |     | REG_ORIG_TP_DPORT_ROUTER |   |                 |
> >   * |     |                          |   |                 |
> > @@ -9437,6 +9439,7 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
> >                                 struct ds *route_match)
> >  {
> >      const struct nbrec_logical_router_static_route *st_route =
> route->route;
> > +    struct ds base_match = DS_EMPTY_INITIALIZER;
> >      struct ds match = DS_EMPTY_INITIALIZER;
> >      struct ds actions = DS_EMPTY_INITIALIZER;
> >      struct ds ecmp_reply = DS_EMPTY_INITIALIZER;
> > @@ -9448,20 +9451,22 @@ add_ecmp_symmetric_reply_flows(struct hmap
> *lflows,
> >      /* If symmetric ECMP replies are enabled, then packets that arrive
> over
> >       * an ECMP route need to go through conntrack.
> >       */
> > -    ds_put_format(&match, "inport == %s && ip%s.%s == %s",
> > +    ds_put_format(&base_match, "inport == %s && ip%s.%s == %s",
> >                    out_port->json_key,
> >                    IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "4" : "6",
> >                    route->is_src_route ? "dst" : "src",
> >                    cidr);
> >      free(cidr);
> >      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
> > -                            ds_cstr(&match), "ct_next;",
> > -                            &st_route->header_);
> > +            ds_cstr(&base_match),
> > +            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh_mac(); ct_next;",
> > +            &st_route->header_);
> >
> >      /* And packets that go out over an ECMP route need conntrack */
> >      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
> > -                            ds_cstr(route_match), "ct_next;",
> > -                            &st_route->header_);
> > +            ds_cstr(route_match),
> > +            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh(); ct_next;",
> > +            &st_route->header_);
> >
> >      /* Save src eth and inport in ct_label for packets that arrive over
> >       * an ECMP route.
> > @@ -9469,11 +9474,84 @@ add_ecmp_symmetric_reply_flows(struct hmap
> *lflows,
> >       * NOTE: we purposely are not clearing match before this
> >       * ds_put_cstr() call. The previous contents are needed.
> >       */
> > -    ds_put_cstr(&match, " && (ct.new && !ct.est)");
> > +    ds_put_format(&match, "%s && (ct.new && !ct.est) && tcp",
> > +                  ds_cstr(&base_match));
> > +    ds_put_format(&actions,
> > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > +            " %s = %" PRId64 ";}; "
> > +            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
> > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> > +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> > +                            ds_cstr(&match), ds_cstr(&actions),
> > +                            &st_route->header_);
> > +    ds_clear(&match);
> > +    ds_put_format(&match, "%s && (ct.new && !ct.est) && udp",
> > +                  ds_cstr(&base_match));
> > +    ds_clear(&actions);
> > +    ds_put_format(&actions,
> > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > +            " %s = %" PRId64 ";}; "
> > +            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
> > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> > +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> > +                            ds_cstr(&match), ds_cstr(&actions),
> > +                            &st_route->header_);
> > +    ds_clear(&match);
> > +    ds_put_format(&match, "%s && (ct.new && !ct.est) && sctp",
> > +                  ds_cstr(&base_match));
> > +    ds_clear(&actions);
> > +    ds_put_format(&actions,
> > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > +            " %s = %" PRId64 ";}; "
> > +            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
> > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> > +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> > +                            ds_cstr(&match), ds_cstr(&actions),
> > +                            &st_route->header_);
> >
> > -    ds_put_format(&actions, "ct_commit { ct_label.ecmp_reply_eth =
> eth.src;"
> > -                  " %s = %" PRId64 ";}; next;",
> > -                  ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
> > +    ds_clear(&match);
> > +    ds_put_format(&match,
> > +            "%s && (!ct.rpl && ct.est) && tcp && "REGBIT_KNOWN_ECMP_NH"
> == 0",
> > +            ds_cstr(&base_match));
> > +    ds_clear(&actions);
> > +    ds_put_format(&actions,
> > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > +            " %s = %" PRId64 ";}; "
> > +            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
> > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> > +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> > +                            ds_cstr(&match), ds_cstr(&actions),
> > +                            &st_route->header_);
> > +
> > +    ds_clear(&match);
> > +    ds_put_format(&match,
> > +            "%s && (!ct.rpl && ct.est) && udp && "REGBIT_KNOWN_ECMP_NH"
> == 0",
> > +            ds_cstr(&base_match));
> > +    ds_clear(&actions);
> > +    ds_put_format(&actions,
> > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > +            " %s = %" PRId64 ";}; "
> 
> Sorry that I haven't reviewed the details yet, but this flow would break HW
> offload again, which was fixed in 22.03 and backported to 21.12. cc @Mark
> Michelson <mmichels@redhat.com>

can you please provide more info?

Regards,
Lorenzo

> 
> Thanks,
> Han
> 
> > +            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
> > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> > +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> > +                            ds_cstr(&match), ds_cstr(&actions),
> > +                            &st_route->header_);
> > +    ds_clear(&match);
> > +    ds_put_format(&match,
> > +            "%s && (!ct.rpl && ct.est) && sctp && "REGBIT_KNOWN_ECMP_NH"
> == 0",
> > +            ds_cstr(&base_match));
> > +    ds_clear(&actions);
> > +    ds_put_format(&actions,
> > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > +            " %s = %" PRId64 ";}; "
> > +            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
> > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> >      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
> >                              ds_cstr(&match), ds_cstr(&actions),
> >                              &st_route->header_);
> > @@ -9481,7 +9559,8 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
> >      /* Bypass ECMP selection if we already have ct_label information
> >       * for where to route the packet.
> >       */
> > -    ds_put_format(&ecmp_reply, "ct.rpl && %s == %"PRId64,
> > +    ds_put_format(&ecmp_reply,
> > +                  "ct.rpl && "REGBIT_KNOWN_ECMP_NH" == 1 && %s ==
> %"PRId64,
> >                    ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
> >      ds_clear(&match);
> >      ds_put_format(&match, "%s && %s", ds_cstr(&ecmp_reply),
> > @@ -9517,6 +9596,7 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
> >                              200, ds_cstr(&ecmp_reply),
> >                              action, &st_route->header_);
> >
> > +    ds_destroy(&base_match);
> >      ds_destroy(&match);
> >      ds_destroy(&actions);
> >      ds_destroy(&ecmp_reply);
> > diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> > index a87df24bd..aa7196a92 100644
> > --- a/northd/ovn-northd.8.xml
> > +++ b/northd/ovn-northd.8.xml
> > @@ -3166,8 +3166,12 @@ icmp6 {
> >        configured. The matching logic for these ports essentially
> reverses the
> >        configured logic of the ECMP route. So for instance, a route with a
> >        destination routing policy will instead match if the source IP
> address
> > -      matches the static route's prefix. The flow uses the action
> > -      <code>ct_next</code> to send IP packets to the connection tracker
> for
> > +      matches the static route's prefix. The flow uses the actions
> > +      <code>chk_ecmp_nh_mac(); ct_next</code> or
> > +      <code>chk_ecmp_nh(); ct_next</code> to send IP packets to table
> > +      <code>OFTABLE_ECMP_NH_MAC</code> or to table
> > +      <code>OFTABLE_ECMP_NH</code> in order to check if source info are
> already
> > +      stored by OVN and then to the connection tracker for
> >        packet de-fragmentation and tracking before sending it to the next
> table.
> >      </p>
> >
> > @@ -3426,10 +3430,11 @@ icmp6 {
> >          route with a destination routing policy will instead match if the
> >          source IP address matches the static route's prefix. The flow
> uses
> >          the action <code>ct_commit { ct_label.ecmp_reply_eth = eth.src;"
> > -        " ct_mark.ecmp_reply_port = <var>K</var>;}; next; </code> to
> commit
> > -        the connection and storing <code>eth.src</code> and the ECMP
> > -        reply port binding tunnel key <var>K</var> in the
> > -        <code>ct_label</code>.
> > +        " ct_mark.ecmp_reply_port = <var>K</var>;}; commit_ecmp_nh();
> next;
> > +        </code> to commit the connection and storing
> <code>eth.src</code> and
> > +        the ECMP reply port binding tunnel key <var>K</var> in the
> > +        <code>ct_label</code> and the traffic pattern to table
> > +        <code>OFTABLE_ECMP_NH_MAC</code> or <code>OFTABLE_ECMP_NH</code>.
> >        </li>
> >      </ul>
> >
> > @@ -3568,10 +3573,10 @@ output;
> >        which the packet should be sent. The
> <code>ct_mark.ecmp_reply_port</code>
> >        tells the logical router port on which the packet should be sent.
> These
> >        values saved to the conntrack fields when the initial ingress
> traffic is
> > -      received over the ECMP route and committed to conntrack. The
> > -      priority-10300 flows in this stage set the <code>outport</code>,
> > -      while the <code>eth.dst</code> is set by flows at the ARP/ND
> Resolution
> > -      stage.
> > +      received over the ECMP route and committed to conntrack.
> > +      If <code>REGBIT_KNOWN_ECMP_NH</code> is set, the priority-10300
> > +      flows in this stage set the <code>outport</code>, while the
> > +      <code>eth.dst</code> is set by flows at the ARP/ND Resolution
> stage.
> >      </p>
> >
> >      <p>
> > diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> > index 610a5ce12..0c7b24816 100644
> > --- a/tests/ovn-northd.at
> > +++ b/tests/ovn-northd.at
> > @@ -5644,10 +5644,24 @@ AT_CHECK([grep -e "lr_in_ip_routing_ecmp"
> lr0flows | sed 's/192\.168\.0\..0/192.
> >    table=??(lr_in_ip_routing_ecmp), priority=150  , match=(reg8[[0..15]]
> == 0), action=(next;)
> >  ])
> >
> > +AT_CHECK([grep -e "lr_in_ecmp_stateful".*commit_ecmp_nh lr0flows | sed
> 's/table=../table=??/' | sort], [0], [dnl
> > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && sctp &&
> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
> sctp); next;)
> > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && tcp &&
> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp);
> next;)
> > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && udp &&
> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp);
> next;)
> > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && sctp),
> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
> sctp); next;)
> > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && tcp),
> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp);
> next;)
> > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && udp),
> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp);
> next;)
> > +])
> > +
> > +AT_CHECK([grep -e "lr_in_defrag".*chk_ecmp_nh* lr0flows | sed
> 's/table=../table=??/' | sort], [0], [dnl
> > +  table=??(lr_in_defrag       ), priority=100  , match=(inport ==
> "lr0-public" && ip4.src == 1.0.0.1), action=(reg9[[5]] = chk_ecmp_nh_mac();
> ct_next;)
> > +  table=??(lr_in_defrag       ), priority=100  , match=(reg7 == 0 &&
> ip4.dst == 1.0.0.1/32), action=(reg9[[5]] = chk_ecmp_nh(); ct_next;)
> > +])
> > +
> >  dnl The chassis was created with other_config:ct-no-masked-label=false,
> the flows
> >  dnl should be using ct_label.ecmp_reply_port.
> >  AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
> 's/table=../table=??/'], [0], [dnl
> > -  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
> eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> > +  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> reg9[[5]] == 1 && ct_label.ecmp_reply_port == 1), action=(push(xxreg1);
> xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> >  ])
> >
> >  dnl Simulate an ovn-controller upgrade to a version that supports
> > @@ -5657,7 +5671,7 @@ check ovn-sbctl set chassis ch1
> other_config:ct-no-masked-label=true
> >  check ovn-nbctl --wait=sb sync
> >  ovn-sbctl dump-flows lr0 > lr0flows
> >  AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
> 's/table=../table=??/'], [0], [dnl
> > -  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
> eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> > +  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> reg9[[5]] == 1 && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1);
> xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> >  ])
> >
> >  # add ecmp route with wrong nexthop
> > diff --git a/tests/ovn.at b/tests/ovn.at
> > index 91dc3b9d6..12ba3f6d0 100644
> > --- a/tests/ovn.at
> > +++ b/tests/ovn.at
> > @@ -27173,7 +27173,7 @@ AT_CHECK([
> >          grep "priority=200" | \
> >          grep -c
> "move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
> >      done; :], [0], [dnl
> > -1
> > +6
> >  1
> >  0
> >  0
> > @@ -27298,7 +27298,7 @@ AT_CHECK([
> >          grep "priority=200" | \
> >          grep -c
> "move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
> >      done; :], [0], [dnl
> > -1
> > +6
> >  1
> >  0
> >  0
> > diff --git a/tests/system-ovn.at b/tests/system-ovn.at
> > index 4a8fdede8..b3096deb3 100644
> > --- a/tests/system-ovn.at
> > +++ b/tests/system-ovn.at
> > @@ -5956,11 +5956,8 @@ ovn-nbctl --wait=hv sync
> >
> >  on_exit 'ovs-ofctl dump-flows br-int'
> >
> > -# 'bob1' should be able to ping 'alice1' directly.
> > -NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 10.0.0.2 |
> FORMAT_PING], \
> > -[0], [dnl
> > -20 packets transmitted, 20 received, 0% packet loss, time 0ms
> > -])
> > +NETNS_DAEMONIZE([alice1], [nc -l -k 80], [alice1.pid])
> > +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
> >
> >  # Ensure conntrack entry is present. We should not try to predict
> >  # the tunnel key for the output port, so we strip it from the labels
> > @@ -5968,7 +5965,7 @@ NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15
> 10.0.0.2 | FORMAT_PING], \
> >  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1) | \
> >  sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> >  sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> >
> -icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
> >
> +tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
> >  ])
> >
> >  # Ensure datapaths show conntrack states as expected
> > @@ -5981,6 +5978,36 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> 'ct_state(-new+est+rpl+trk).*ct_lab
> >  1
> >  ])
> >
> > +# Flush conntrack entries for easier output parsing of next test.
> > +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> > +
> > +# Change bob1 L2 address anche check the reply is properly updated.
> > +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> > +ovn-nbctl set Logical_Switch_Port r2-ext \
> > +     type=router options:router-port=R2_ext
> addresses='"00:00:10:01:02:04"'
> > +
> > +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
> > +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> 'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c], [0],
> [dnl
> > +1
> > +])
> > +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> 'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0], [dnl
> > +1
> > +])
> > +
> > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 |
> FORMAT_CT(172.16.0.1) | \
> > +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> > +sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> >
> +tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
> > +])
> > +
> > +# Check entries in table 76 and 77 expires w/o traffic
> > +OVS_WAIT_UNTIL([
> > +test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets') -eq 0
> > +])
> > +OVS_WAIT_UNTIL([
> > +test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets') -eq 0
> > +])
> > +
> >  ovs-ofctl dump-flows br-int
> >
> >  OVS_APP_EXIT_AND_WAIT([ovn-controller])
> > @@ -6122,11 +6149,8 @@ ovn-nbctl --wait=hv sync
> >
> >  on_exit 'ovs-ofctl dump-flows br-int'
> >
> > -# 'bob1' should be able to ping 'alice1' directly.
> > -NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 fd01::2 |
> FORMAT_PING], \
> > -[0], [dnl
> > -20 packets transmitted, 20 received, 0% packet loss, time 0ms
> > -])
> > +NETNS_DAEMONIZE([alice1], [nc -6 -l -k 80], [alice1.pid])
> > +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
> >
> >  # Ensure datapaths show conntrack states as expected
> >  # Like with conntrack entries, we shouldn't try to predict
> > @@ -6145,7 +6169,38 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> 'ct_state(-new+est+rpl+trk).*ct_lab
> >  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2) | \
> >  sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> >  sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> >
> -icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
> >
> +tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
> > +])
> > +
> > +# Flush conntrack entries for easier output parsing of next test.
> > +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> > +
> > +# Change bob1 L2 address anche check the reply is properly updated.
> > +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> > +ovn-nbctl set Logical_Switch_Port r2-ext \
> > +     type=router options:router-port=R2_ext
> addresses='"00:00:10:01:02:04"'
> > +
> > +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
> > +
> > +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> 'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c], [0],
> [dnl
> > +1
> > +])
> > +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> 'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0], [dnl
> > +1
> > +])
> > +
> > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 |
> FORMAT_CT(fd01::2) | \
> > +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> > +sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> >
> +tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
> > +])
> > +
> > +# Check entries in table 76 and 77 expires w/o traffic
> > +OVS_WAIT_UNTIL([
> > +test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets') -eq 0
> > +])
> > +OVS_WAIT_UNTIL([
> > +test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets') -eq 0
> >  ])
> >
> >  ovs-ofctl dump-flows br-int
> > --
> > 2.36.1
> >
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Han Zhou Aug. 19, 2022, 7:23 p.m. UTC | #3
On Fri, Aug 19, 2022 at 8:37 AM Lorenzo Bianconi <
lorenzo.bianconi@redhat.com> wrote:
>
> > On Fri, Jul 15, 2022 at 4:35 AM Lorenzo Bianconi <
> > lorenzo.bianconi@redhat.com> wrote:
> > >
> > > Rely on the following new actions for ecmp-symmetric routing:
> > > - chk_ecmp_nh_mac
> > > - chk_ecmp_nh
> > > - commit_ecmp_nh
> > >
> > > In this way OVN is able to keep up if for any reason the ECMP traffic
> > > source changes L2 address and keeps old L3 addresses.
> >
> > Hi Lorenzo, do you have more details of the problem scenario? It seems
the
> > bugzilla access is limited.
>
> Hi Han,
>
> in the current codebase, to implement ecmp symmetric-reply, ovn stores
> the next-hop mac address (let's say mac0) and in-port in ct_label and
> ct_mark respectively.
> An issue occurs whenever the traffic next-hop changes l2 address from
> mac0 to mac1 w/o changing l3/l4 info (e.g. src ip address).
> In this scenario ovn is not able to keep up and recover from this change.
> Agree?
>
Yes, that's clear. Thanks for explaining.

> >
> > >
> > > Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2096233
> > > Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
> > > ---
> > >  northd/northd.c         | 102
+++++++++++++++++++++++++++++++++++-----
> > >  northd/ovn-northd.8.xml |  25 ++++++----
> > >  tests/ovn-northd.at     |  18 ++++++-
> > >  tests/ovn.at            |   4 +-
> > >  tests/system-ovn.at     |  79 ++++++++++++++++++++++++++-----
> > >  5 files changed, 191 insertions(+), 37 deletions(-)
> > >
> > > diff --git a/northd/northd.c b/northd/northd.c
> > > index d31cb1688..57ec3675b 100644
> > > --- a/northd/northd.c
> > > +++ b/northd/northd.c
> > > @@ -227,6 +227,7 @@ enum ovn_stage {
> > >  #define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]"
> > >  #define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]"
> > >  #define REGBIT_DST_NAT_IP_LOCAL "reg9[4]"
> > > +#define REGBIT_KNOWN_ECMP_NH    "reg9[5]"
> > >
> > >  /* Register to store the eth address associated to a router port for
> > packets
> > >   * received in S_ROUTER_IN_ADMISSION.
> > > @@ -327,7 +328,8 @@ enum ovn_stage {
> > >   * |     |   EGRESS_LOOPBACK/       | G |     UNUSED      |
> > >   * | R9  |   PKT_LARGER/            | 4 |                 |
> > >   * |     |   LOOKUP_NEIGHBOR_RESULT/|   |                 |
> > > - * |     |   SKIP_LOOKUP_NEIGHBOR}  |   |                 |
> > > + * |     |   SKIP_LOOKUP_NEIGHBOR/  |   |                 |
> > > + * |     |   KNOWN_ECMP_NH}         |   |                 |
> > >   * |     |                          |   |                 |
> > >   * |     | REG_ORIG_TP_DPORT_ROUTER |   |                 |
> > >   * |     |                          |   |                 |
> > > @@ -9437,6 +9439,7 @@ add_ecmp_symmetric_reply_flows(struct hmap
*lflows,
> > >                                 struct ds *route_match)
> > >  {
> > >      const struct nbrec_logical_router_static_route *st_route =
> > route->route;
> > > +    struct ds base_match = DS_EMPTY_INITIALIZER;
> > >      struct ds match = DS_EMPTY_INITIALIZER;
> > >      struct ds actions = DS_EMPTY_INITIALIZER;
> > >      struct ds ecmp_reply = DS_EMPTY_INITIALIZER;
> > > @@ -9448,20 +9451,22 @@ add_ecmp_symmetric_reply_flows(struct hmap
> > *lflows,
> > >      /* If symmetric ECMP replies are enabled, then packets that
arrive
> > over
> > >       * an ECMP route need to go through conntrack.
> > >       */
> > > -    ds_put_format(&match, "inport == %s && ip%s.%s == %s",
> > > +    ds_put_format(&base_match, "inport == %s && ip%s.%s == %s",
> > >                    out_port->json_key,
> > >                    IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "4" : "6",
> > >                    route->is_src_route ? "dst" : "src",
> > >                    cidr);
> > >      free(cidr);
> > >      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
> > > -                            ds_cstr(&match), "ct_next;",
> > > -                            &st_route->header_);
> > > +            ds_cstr(&base_match),
> > > +            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh_mac(); ct_next;",
> > > +            &st_route->header_);
> > >
> > >      /* And packets that go out over an ECMP route need conntrack */
> > >      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
> > > -                            ds_cstr(route_match), "ct_next;",
> > > -                            &st_route->header_);
> > > +            ds_cstr(route_match),
> > > +            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh(); ct_next;",
> > > +            &st_route->header_);
> > >
> > >      /* Save src eth and inport in ct_label for packets that arrive
over
> > >       * an ECMP route.
> > > @@ -9469,11 +9474,84 @@ add_ecmp_symmetric_reply_flows(struct hmap
> > *lflows,
> > >       * NOTE: we purposely are not clearing match before this
> > >       * ds_put_cstr() call. The previous contents are needed.
> > >       */
> > > -    ds_put_cstr(&match, " && (ct.new && !ct.est)");
> > > +    ds_put_format(&match, "%s && (ct.new && !ct.est) && tcp",
> > > +                  ds_cstr(&base_match));
> > > +    ds_put_format(&actions,
> > > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > > +            " %s = %" PRId64 ";}; "
> > > +            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
> > > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> > > +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> > > +                            ds_cstr(&match), ds_cstr(&actions),
> > > +                            &st_route->header_);
> > > +    ds_clear(&match);
> > > +    ds_put_format(&match, "%s && (ct.new && !ct.est) && udp",
> > > +                  ds_cstr(&base_match));
> > > +    ds_clear(&actions);
> > > +    ds_put_format(&actions,
> > > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > > +            " %s = %" PRId64 ";}; "
> > > +            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
> > > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> > > +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> > > +                            ds_cstr(&match), ds_cstr(&actions),
> > > +                            &st_route->header_);
> > > +    ds_clear(&match);
> > > +    ds_put_format(&match, "%s && (ct.new && !ct.est) && sctp",
> > > +                  ds_cstr(&base_match));
> > > +    ds_clear(&actions);
> > > +    ds_put_format(&actions,
> > > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > > +            " %s = %" PRId64 ";}; "
> > > +            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
> > > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> > > +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> > > +                            ds_cstr(&match), ds_cstr(&actions),
> > > +                            &st_route->header_);
> > >
> > > -    ds_put_format(&actions, "ct_commit { ct_label.ecmp_reply_eth =
> > eth.src;"
> > > -                  " %s = %" PRId64 ";}; next;",
> > > -                  ct_ecmp_reply_port_match,
out_port->sb->tunnel_key);
> > > +    ds_clear(&match);
> > > +    ds_put_format(&match,
> > > +            "%s && (!ct.rpl && ct.est) && tcp &&
"REGBIT_KNOWN_ECMP_NH"
> > == 0",
> > > +            ds_cstr(&base_match));
> > > +    ds_clear(&actions);
> > > +    ds_put_format(&actions,
> > > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > > +            " %s = %" PRId64 ";}; "
> > > +            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
> > > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> > > +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> > > +                            ds_cstr(&match), ds_cstr(&actions),
> > > +                            &st_route->header_);
> > > +
> > > +    ds_clear(&match);
> > > +    ds_put_format(&match,
> > > +            "%s && (!ct.rpl && ct.est) && udp &&
"REGBIT_KNOWN_ECMP_NH"
> > == 0",
> > > +            ds_cstr(&base_match));
> > > +    ds_clear(&actions);
> > > +    ds_put_format(&actions,
> > > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > > +            " %s = %" PRId64 ";}; "
> >
> > Sorry that I haven't reviewed the details yet, but this flow would
break HW
> > offload again, which was fixed in 22.03 and backported to 21.12. cc
@Mark
> > Michelson <mmichels@redhat.com>
>
> can you please provide more info?

For the HW offload related fix for ct_label I was talking about
https://github.com/ovn-org/ovn/commit/a075230e4a0fcc166251271db1c8ae01b993c9cf
My concern here turned out to be a false alarm. Initially I was caught by
the matching of ct.est and the ct_label.ecmp_reply_eth access in the
actions, thinking this would break HW offloading for packets of an
established connection. After checking more details of the change here, I
realized that the flow would match only for the first packet when a MAC
change happens because it matches "REGBIT_KNOWN_ECMP_NH == 0". It is ok if
it's not offloaded. Once the entry is updated (by the commit action) the
flow won't be matched and following packets should stay in HW.
In addition, commit actions in HW is always ignored and handled as a
regular CT action which would not make any update to the CT entry, because
the assumption is that at least one first packets of the HW flow should
have been handled by the SW and updated the CT entry properly, so updating
the entry from HW is never needed.
In short, this shouldn't impact HW offload. Sorry for the false alarm.

Thanks,
Han

>
> Regards,
> Lorenzo
>
> >
> > Thanks,
> > Han
> >
> > > +            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
> > > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> > > +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> > > +                            ds_cstr(&match), ds_cstr(&actions),
> > > +                            &st_route->header_);
> > > +    ds_clear(&match);
> > > +    ds_put_format(&match,
> > > +            "%s && (!ct.rpl && ct.est) && sctp &&
"REGBIT_KNOWN_ECMP_NH"
> > == 0",
> > > +            ds_cstr(&base_match));
> > > +    ds_clear(&actions);
> > > +    ds_put_format(&actions,
> > > +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> > > +            " %s = %" PRId64 ";}; "
> > > +            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
> > > +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> > > +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> > >      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> > >                              ds_cstr(&match), ds_cstr(&actions),
> > >                              &st_route->header_);
> > > @@ -9481,7 +9559,8 @@ add_ecmp_symmetric_reply_flows(struct hmap
*lflows,
> > >      /* Bypass ECMP selection if we already have ct_label information
> > >       * for where to route the packet.
> > >       */
> > > -    ds_put_format(&ecmp_reply, "ct.rpl && %s == %"PRId64,
> > > +    ds_put_format(&ecmp_reply,
> > > +                  "ct.rpl && "REGBIT_KNOWN_ECMP_NH" == 1 && %s ==
> > %"PRId64,
> > >                    ct_ecmp_reply_port_match,
out_port->sb->tunnel_key);
> > >      ds_clear(&match);
> > >      ds_put_format(&match, "%s && %s", ds_cstr(&ecmp_reply),
> > > @@ -9517,6 +9596,7 @@ add_ecmp_symmetric_reply_flows(struct hmap
*lflows,
> > >                              200, ds_cstr(&ecmp_reply),
> > >                              action, &st_route->header_);
> > >
> > > +    ds_destroy(&base_match);
> > >      ds_destroy(&match);
> > >      ds_destroy(&actions);
> > >      ds_destroy(&ecmp_reply);
> > > diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> > > index a87df24bd..aa7196a92 100644
> > > --- a/northd/ovn-northd.8.xml
> > > +++ b/northd/ovn-northd.8.xml
> > > @@ -3166,8 +3166,12 @@ icmp6 {
> > >        configured. The matching logic for these ports essentially
> > reverses the
> > >        configured logic of the ECMP route. So for instance, a route
with a
> > >        destination routing policy will instead match if the source IP
> > address
> > > -      matches the static route's prefix. The flow uses the action
> > > -      <code>ct_next</code> to send IP packets to the connection
tracker
> > for
> > > +      matches the static route's prefix. The flow uses the actions
> > > +      <code>chk_ecmp_nh_mac(); ct_next</code> or
> > > +      <code>chk_ecmp_nh(); ct_next</code> to send IP packets to table
> > > +      <code>OFTABLE_ECMP_NH_MAC</code> or to table
> > > +      <code>OFTABLE_ECMP_NH</code> in order to check if source info
are
> > already
> > > +      stored by OVN and then to the connection tracker for
> > >        packet de-fragmentation and tracking before sending it to the
next
> > table.
> > >      </p>
> > >
> > > @@ -3426,10 +3430,11 @@ icmp6 {
> > >          route with a destination routing policy will instead match
if the
> > >          source IP address matches the static route's prefix. The flow
> > uses
> > >          the action <code>ct_commit { ct_label.ecmp_reply_eth =
eth.src;"
> > > -        " ct_mark.ecmp_reply_port = <var>K</var>;}; next; </code> to
> > commit
> > > -        the connection and storing <code>eth.src</code> and the ECMP
> > > -        reply port binding tunnel key <var>K</var> in the
> > > -        <code>ct_label</code>.
> > > +        " ct_mark.ecmp_reply_port = <var>K</var>;}; commit_ecmp_nh();
> > next;
> > > +        </code> to commit the connection and storing
> > <code>eth.src</code> and
> > > +        the ECMP reply port binding tunnel key <var>K</var> in the
> > > +        <code>ct_label</code> and the traffic pattern to table
> > > +        <code>OFTABLE_ECMP_NH_MAC</code> or
<code>OFTABLE_ECMP_NH</code>.
> > >        </li>
> > >      </ul>
> > >
> > > @@ -3568,10 +3573,10 @@ output;
> > >        which the packet should be sent. The
> > <code>ct_mark.ecmp_reply_port</code>
> > >        tells the logical router port on which the packet should be
sent.
> > These
> > >        values saved to the conntrack fields when the initial ingress
> > traffic is
> > > -      received over the ECMP route and committed to conntrack. The
> > > -      priority-10300 flows in this stage set the
<code>outport</code>,
> > > -      while the <code>eth.dst</code> is set by flows at the ARP/ND
> > Resolution
> > > -      stage.
> > > +      received over the ECMP route and committed to conntrack.
> > > +      If <code>REGBIT_KNOWN_ECMP_NH</code> is set, the priority-10300
> > > +      flows in this stage set the <code>outport</code>, while the
> > > +      <code>eth.dst</code> is set by flows at the ARP/ND Resolution
> > stage.
> > >      </p>
> > >
> > >      <p>
> > > diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> > > index 610a5ce12..0c7b24816 100644
> > > --- a/tests/ovn-northd.at
> > > +++ b/tests/ovn-northd.at
> > > @@ -5644,10 +5644,24 @@ AT_CHECK([grep -e "lr_in_ip_routing_ecmp"
> > lr0flows | sed 's/192\.168\.0\..0/192.
> > >    table=??(lr_in_ip_routing_ecmp), priority=150  ,
match=(reg8[[0..15]]
> > == 0), action=(next;)
> > >  ])
> > >
> > > +AT_CHECK([grep -e "lr_in_ecmp_stateful".*commit_ecmp_nh lr0flows |
sed
> > 's/table=../table=??/' | sort], [0], [dnl
> > > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> > "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && sctp &&
> > reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
> > sctp); next;)
> > > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> > "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && tcp &&
> > reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
tcp);
> > next;)
> > > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> > "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && udp &&
> > reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
udp);
> > next;)
> > > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> > "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && sctp),
> > action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
> > sctp); next;)
> > > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> > "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && tcp),
> > action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
tcp);
> > next;)
> > > +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> > "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && udp),
> > action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
udp);
> > next;)
> > > +])
> > > +
> > > +AT_CHECK([grep -e "lr_in_defrag".*chk_ecmp_nh* lr0flows | sed
> > 's/table=../table=??/' | sort], [0], [dnl
> > > +  table=??(lr_in_defrag       ), priority=100  , match=(inport ==
> > "lr0-public" && ip4.src == 1.0.0.1), action=(reg9[[5]] =
chk_ecmp_nh_mac();
> > ct_next;)
> > > +  table=??(lr_in_defrag       ), priority=100  , match=(reg7 == 0 &&
> > ip4.dst == 1.0.0.1/32), action=(reg9[[5]] = chk_ecmp_nh(); ct_next;)
> > > +])
> > > +
> > >  dnl The chassis was created with
other_config:ct-no-masked-label=false,
> > the flows
> > >  dnl should be using ct_label.ecmp_reply_port.
> > >  AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
> > 's/table=../table=??/'], [0], [dnl
> > > -  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> > ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
> > eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> > > +  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> > reg9[[5]] == 1 && ct_label.ecmp_reply_port == 1), action=(push(xxreg1);
> > xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> > >  ])
> > >
> > >  dnl Simulate an ovn-controller upgrade to a version that supports
> > > @@ -5657,7 +5671,7 @@ check ovn-sbctl set chassis ch1
> > other_config:ct-no-masked-label=true
> > >  check ovn-nbctl --wait=sb sync
> > >  ovn-sbctl dump-flows lr0 > lr0flows
> > >  AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
> > 's/table=../table=??/'], [0], [dnl
> > > -  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> > ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
> > eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> > > +  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> > reg9[[5]] == 1 && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1);
> > xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> > >  ])
> > >
> > >  # add ecmp route with wrong nexthop
> > > diff --git a/tests/ovn.at b/tests/ovn.at
> > > index 91dc3b9d6..12ba3f6d0 100644
> > > --- a/tests/ovn.at
> > > +++ b/tests/ovn.at
> > > @@ -27173,7 +27173,7 @@ AT_CHECK([
> > >          grep "priority=200" | \
> > >          grep -c
> >
"move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
> > >      done; :], [0], [dnl
> > > -1
> > > +6
> > >  1
> > >  0
> > >  0
> > > @@ -27298,7 +27298,7 @@ AT_CHECK([
> > >          grep "priority=200" | \
> > >          grep -c
> >
"move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
> > >      done; :], [0], [dnl
> > > -1
> > > +6
> > >  1
> > >  0
> > >  0
> > > diff --git a/tests/system-ovn.at b/tests/system-ovn.at
> > > index 4a8fdede8..b3096deb3 100644
> > > --- a/tests/system-ovn.at
> > > +++ b/tests/system-ovn.at
> > > @@ -5956,11 +5956,8 @@ ovn-nbctl --wait=hv sync
> > >
> > >  on_exit 'ovs-ofctl dump-flows br-int'
> > >
> > > -# 'bob1' should be able to ping 'alice1' directly.
> > > -NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 10.0.0.2 |
> > FORMAT_PING], \
> > > -[0], [dnl
> > > -20 packets transmitted, 20 received, 0% packet loss, time 0ms
> > > -])
> > > +NETNS_DAEMONIZE([alice1], [nc -l -k 80], [alice1.pid])
> > > +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
> > >
> > >  # Ensure conntrack entry is present. We should not try to predict
> > >  # the tunnel key for the output port, so we strip it from the labels
> > > @@ -5968,7 +5965,7 @@ NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w
15
> > 10.0.0.2 | FORMAT_PING], \
> > >  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1) | \
> > >  sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> > >  sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> > >
> >
-icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
> > >
> >
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
> > >  ])
> > >
> > >  # Ensure datapaths show conntrack states as expected
> > > @@ -5981,6 +5978,36 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> > 'ct_state(-new+est+rpl+trk).*ct_lab
> > >  1
> > >  ])
> > >
> > > +# Flush conntrack entries for easier output parsing of next test.
> > > +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> > > +
> > > +# Change bob1 L2 address anche check the reply is properly updated.
> > > +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> > > +ovn-nbctl set Logical_Switch_Port r2-ext \
> > > +     type=router options:router-port=R2_ext
> > addresses='"00:00:10:01:02:04"'
> > > +
> > > +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
> > > +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> > 'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c],
[0],
> > [dnl
> > > +1
> > > +])
> > > +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> > 'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0],
[dnl
> > > +1
> > > +])
> > > +
> > > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 |
> > FORMAT_CT(172.16.0.1) | \
> > > +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> > > +sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> > >
> >
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
> > > +])
> > > +
> > > +# Check entries in table 76 and 77 expires w/o traffic
> > > +OVS_WAIT_UNTIL([
> > > +test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets')
-eq 0
> > > +])
> > > +OVS_WAIT_UNTIL([
> > > +test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets')
-eq 0
> > > +])
> > > +
> > >  ovs-ofctl dump-flows br-int
> > >
> > >  OVS_APP_EXIT_AND_WAIT([ovn-controller])
> > > @@ -6122,11 +6149,8 @@ ovn-nbctl --wait=hv sync
> > >
> > >  on_exit 'ovs-ofctl dump-flows br-int'
> > >
> > > -# 'bob1' should be able to ping 'alice1' directly.
> > > -NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 fd01::2 |
> > FORMAT_PING], \
> > > -[0], [dnl
> > > -20 packets transmitted, 20 received, 0% packet loss, time 0ms
> > > -])
> > > +NETNS_DAEMONIZE([alice1], [nc -6 -l -k 80], [alice1.pid])
> > > +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
> > >
> > >  # Ensure datapaths show conntrack states as expected
> > >  # Like with conntrack entries, we shouldn't try to predict
> > > @@ -6145,7 +6169,38 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> > 'ct_state(-new+est+rpl+trk).*ct_lab
> > >  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2) | \
> > >  sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> > >  sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> > >
> >
-icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
> > >
> >
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
> > > +])
> > > +
> > > +# Flush conntrack entries for easier output parsing of next test.
> > > +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> > > +
> > > +# Change bob1 L2 address anche check the reply is properly updated.
> > > +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> > > +ovn-nbctl set Logical_Switch_Port r2-ext \
> > > +     type=router options:router-port=R2_ext
> > addresses='"00:00:10:01:02:04"'
> > > +
> > > +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
> > > +
> > > +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> > 'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c],
[0],
> > [dnl
> > > +1
> > > +])
> > > +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> > 'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0],
[dnl
> > > +1
> > > +])
> > > +
> > > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 |
> > FORMAT_CT(fd01::2) | \
> > > +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> > > +sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> > >
> >
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
> > > +])
> > > +
> > > +# Check entries in table 76 and 77 expires w/o traffic
> > > +OVS_WAIT_UNTIL([
> > > +test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets')
-eq 0
> > > +])
> > > +OVS_WAIT_UNTIL([
> > > +test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets')
-eq 0
> > >  ])
> > >
> > >  ovs-ofctl dump-flows br-int
> > > --
> > > 2.36.1
> > >
> > > _______________________________________________
> > > dev mailing list
> > > dev@openvswitch.org
> > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Dumitru Ceara Aug. 19, 2022, 7:48 p.m. UTC | #4
On 8/19/22 17:37, Lorenzo Bianconi wrote:
>> On Fri, Jul 15, 2022 at 4:35 AM Lorenzo Bianconi <
>> lorenzo.bianconi@redhat.com> wrote:
>>>
>>> Rely on the following new actions for ecmp-symmetric routing:
>>> - chk_ecmp_nh_mac
>>> - chk_ecmp_nh
>>> - commit_ecmp_nh
>>>
>>> In this way OVN is able to keep up if for any reason the ECMP traffic
>>> source changes L2 address and keeps old L3 addresses.
>>
>> Hi Lorenzo, do you have more details of the problem scenario? It seems the
>> bugzilla access is limited.
> 
> Hi Han,
> 
> in the current codebase, to implement ecmp symmetric-reply, ovn stores
> the next-hop mac address (let's say mac0) and in-port in ct_label and
> ct_mark respectively.
> An issue occurs whenever the traffic next-hop changes l2 address from
> mac0 to mac1 w/o changing l3/l4 info (e.g. src ip address).
> In this scenario ovn is not able to keep up and recover from this change.
> Agree?
> 
>>
>>>
>>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2096233
>>> Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
>>> ---
>>>  northd/northd.c         | 102 +++++++++++++++++++++++++++++++++++-----
>>>  northd/ovn-northd.8.xml |  25 ++++++----
>>>  tests/ovn-northd.at     |  18 ++++++-
>>>  tests/ovn.at            |   4 +-
>>>  tests/system-ovn.at     |  79 ++++++++++++++++++++++++++-----
>>>  5 files changed, 191 insertions(+), 37 deletions(-)
>>>
>>> diff --git a/northd/northd.c b/northd/northd.c
>>> index d31cb1688..57ec3675b 100644
>>> --- a/northd/northd.c
>>> +++ b/northd/northd.c
>>> @@ -227,6 +227,7 @@ enum ovn_stage {
>>>  #define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]"
>>>  #define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]"
>>>  #define REGBIT_DST_NAT_IP_LOCAL "reg9[4]"
>>> +#define REGBIT_KNOWN_ECMP_NH    "reg9[5]"
>>>
>>>  /* Register to store the eth address associated to a router port for
>> packets
>>>   * received in S_ROUTER_IN_ADMISSION.
>>> @@ -327,7 +328,8 @@ enum ovn_stage {
>>>   * |     |   EGRESS_LOOPBACK/       | G |     UNUSED      |
>>>   * | R9  |   PKT_LARGER/            | 4 |                 |
>>>   * |     |   LOOKUP_NEIGHBOR_RESULT/|   |                 |
>>> - * |     |   SKIP_LOOKUP_NEIGHBOR}  |   |                 |
>>> + * |     |   SKIP_LOOKUP_NEIGHBOR/  |   |                 |
>>> + * |     |   KNOWN_ECMP_NH}         |   |                 |
>>>   * |     |                          |   |                 |
>>>   * |     | REG_ORIG_TP_DPORT_ROUTER |   |                 |
>>>   * |     |                          |   |                 |
>>> @@ -9437,6 +9439,7 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
>>>                                 struct ds *route_match)
>>>  {
>>>      const struct nbrec_logical_router_static_route *st_route =
>> route->route;
>>> +    struct ds base_match = DS_EMPTY_INITIALIZER;
>>>      struct ds match = DS_EMPTY_INITIALIZER;
>>>      struct ds actions = DS_EMPTY_INITIALIZER;
>>>      struct ds ecmp_reply = DS_EMPTY_INITIALIZER;
>>> @@ -9448,20 +9451,22 @@ add_ecmp_symmetric_reply_flows(struct hmap
>> *lflows,
>>>      /* If symmetric ECMP replies are enabled, then packets that arrive
>> over
>>>       * an ECMP route need to go through conntrack.
>>>       */
>>> -    ds_put_format(&match, "inport == %s && ip%s.%s == %s",
>>> +    ds_put_format(&base_match, "inport == %s && ip%s.%s == %s",
>>>                    out_port->json_key,
>>>                    IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "4" : "6",
>>>                    route->is_src_route ? "dst" : "src",
>>>                    cidr);
>>>      free(cidr);
>>>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
>>> -                            ds_cstr(&match), "ct_next;",
>>> -                            &st_route->header_);
>>> +            ds_cstr(&base_match),
>>> +            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh_mac(); ct_next;",
>>> +            &st_route->header_);
>>>
>>>      /* And packets that go out over an ECMP route need conntrack */
>>>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
>>> -                            ds_cstr(route_match), "ct_next;",
>>> -                            &st_route->header_);
>>> +            ds_cstr(route_match),
>>> +            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh(); ct_next;",
>>> +            &st_route->header_);
>>>
>>>      /* Save src eth and inport in ct_label for packets that arrive over
>>>       * an ECMP route.
>>> @@ -9469,11 +9474,84 @@ add_ecmp_symmetric_reply_flows(struct hmap
>> *lflows,
>>>       * NOTE: we purposely are not clearing match before this
>>>       * ds_put_cstr() call. The previous contents are needed.
>>>       */
>>> -    ds_put_cstr(&match, " && (ct.new && !ct.est)");
>>> +    ds_put_format(&match, "%s && (ct.new && !ct.est) && tcp",
>>> +                  ds_cstr(&base_match));
>>> +    ds_put_format(&actions,
>>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
>>> +            " %s = %" PRId64 ";}; "
>>> +            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
>>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
>>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
>>> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
>>> +                            ds_cstr(&match), ds_cstr(&actions),
>>> +                            &st_route->header_);
>>> +    ds_clear(&match);
>>> +    ds_put_format(&match, "%s && (ct.new && !ct.est) && udp",
>>> +                  ds_cstr(&base_match));
>>> +    ds_clear(&actions);
>>> +    ds_put_format(&actions,
>>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
>>> +            " %s = %" PRId64 ";}; "
>>> +            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
>>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
>>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
>>> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
>>> +                            ds_cstr(&match), ds_cstr(&actions),
>>> +                            &st_route->header_);
>>> +    ds_clear(&match);
>>> +    ds_put_format(&match, "%s && (ct.new && !ct.est) && sctp",
>>> +                  ds_cstr(&base_match));
>>> +    ds_clear(&actions);
>>> +    ds_put_format(&actions,
>>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
>>> +            " %s = %" PRId64 ";}; "
>>> +            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
>>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
>>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
>>> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
>>> +                            ds_cstr(&match), ds_cstr(&actions),
>>> +                            &st_route->header_);
>>>
>>> -    ds_put_format(&actions, "ct_commit { ct_label.ecmp_reply_eth =
>> eth.src;"
>>> -                  " %s = %" PRId64 ";}; next;",
>>> -                  ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
>>> +    ds_clear(&match);
>>> +    ds_put_format(&match,
>>> +            "%s && (!ct.rpl && ct.est) && tcp && "REGBIT_KNOWN_ECMP_NH"
>> == 0",
>>> +            ds_cstr(&base_match));
>>> +    ds_clear(&actions);
>>> +    ds_put_format(&actions,
>>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
>>> +            " %s = %" PRId64 ";}; "
>>> +            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
>>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
>>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
>>> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
>>> +                            ds_cstr(&match), ds_cstr(&actions),
>>> +                            &st_route->header_);
>>> +
>>> +    ds_clear(&match);
>>> +    ds_put_format(&match,
>>> +            "%s && (!ct.rpl && ct.est) && udp && "REGBIT_KNOWN_ECMP_NH"
>> == 0",
>>> +            ds_cstr(&base_match));
>>> +    ds_clear(&actions);
>>> +    ds_put_format(&actions,
>>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
>>> +            " %s = %" PRId64 ";}; "
>>
>> Sorry that I haven't reviewed the details yet, but this flow would break HW
>> offload again, which was fixed in 22.03 and backported to 21.12. cc @Mark
>> Michelson <mmichels@redhat.com>
> 
> can you please provide more info?
> 

I'm curious too how this would break HWOL.  We used to have exactly the
same type of action before this patch too:

ds_put_format(&actions, "ct_commit { ct_label.ecmp_reply_eth = eth.src;"
              " %s = %" PRId64 ";}; next;",

And there was no complaint about it AFAICT.

On a different note, I think it would be extremely beneficial if HW
vendors would plug into the CI automation (ovsrobot + patchwork) and run
a set of tests with OVN patches to ensure that we don't add flows that
are hard to offload.

I think it's very hard (impossible?) to catch all cases during code review.

Regards,
Dumitru

> Regards,
> Lorenzo
> 
>>
>> Thanks,
>> Han
>>
>>> +            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
>>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
>>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
>>> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
>>> +                            ds_cstr(&match), ds_cstr(&actions),
>>> +                            &st_route->header_);
>>> +    ds_clear(&match);
>>> +    ds_put_format(&match,
>>> +            "%s && (!ct.rpl && ct.est) && sctp && "REGBIT_KNOWN_ECMP_NH"
>> == 0",
>>> +            ds_cstr(&base_match));
>>> +    ds_clear(&actions);
>>> +    ds_put_format(&actions,
>>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
>>> +            " %s = %" PRId64 ";}; "
>>> +            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
>>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
>>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
>>>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
>>>                              ds_cstr(&match), ds_cstr(&actions),
>>>                              &st_route->header_);
>>> @@ -9481,7 +9559,8 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
>>>      /* Bypass ECMP selection if we already have ct_label information
>>>       * for where to route the packet.
>>>       */
>>> -    ds_put_format(&ecmp_reply, "ct.rpl && %s == %"PRId64,
>>> +    ds_put_format(&ecmp_reply,
>>> +                  "ct.rpl && "REGBIT_KNOWN_ECMP_NH" == 1 && %s ==
>> %"PRId64,
>>>                    ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
>>>      ds_clear(&match);
>>>      ds_put_format(&match, "%s && %s", ds_cstr(&ecmp_reply),
>>> @@ -9517,6 +9596,7 @@ add_ecmp_symmetric_reply_flows(struct hmap *lflows,
>>>                              200, ds_cstr(&ecmp_reply),
>>>                              action, &st_route->header_);
>>>
>>> +    ds_destroy(&base_match);
>>>      ds_destroy(&match);
>>>      ds_destroy(&actions);
>>>      ds_destroy(&ecmp_reply);
>>> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
>>> index a87df24bd..aa7196a92 100644
>>> --- a/northd/ovn-northd.8.xml
>>> +++ b/northd/ovn-northd.8.xml
>>> @@ -3166,8 +3166,12 @@ icmp6 {
>>>        configured. The matching logic for these ports essentially
>> reverses the
>>>        configured logic of the ECMP route. So for instance, a route with a
>>>        destination routing policy will instead match if the source IP
>> address
>>> -      matches the static route's prefix. The flow uses the action
>>> -      <code>ct_next</code> to send IP packets to the connection tracker
>> for
>>> +      matches the static route's prefix. The flow uses the actions
>>> +      <code>chk_ecmp_nh_mac(); ct_next</code> or
>>> +      <code>chk_ecmp_nh(); ct_next</code> to send IP packets to table
>>> +      <code>OFTABLE_ECMP_NH_MAC</code> or to table
>>> +      <code>OFTABLE_ECMP_NH</code> in order to check if source info are
>> already
>>> +      stored by OVN and then to the connection tracker for
>>>        packet de-fragmentation and tracking before sending it to the next
>> table.
>>>      </p>
>>>
>>> @@ -3426,10 +3430,11 @@ icmp6 {
>>>          route with a destination routing policy will instead match if the
>>>          source IP address matches the static route's prefix. The flow
>> uses
>>>          the action <code>ct_commit { ct_label.ecmp_reply_eth = eth.src;"
>>> -        " ct_mark.ecmp_reply_port = <var>K</var>;}; next; </code> to
>> commit
>>> -        the connection and storing <code>eth.src</code> and the ECMP
>>> -        reply port binding tunnel key <var>K</var> in the
>>> -        <code>ct_label</code>.
>>> +        " ct_mark.ecmp_reply_port = <var>K</var>;}; commit_ecmp_nh();
>> next;
>>> +        </code> to commit the connection and storing
>> <code>eth.src</code> and
>>> +        the ECMP reply port binding tunnel key <var>K</var> in the
>>> +        <code>ct_label</code> and the traffic pattern to table
>>> +        <code>OFTABLE_ECMP_NH_MAC</code> or <code>OFTABLE_ECMP_NH</code>.
>>>        </li>
>>>      </ul>
>>>
>>> @@ -3568,10 +3573,10 @@ output;
>>>        which the packet should be sent. The
>> <code>ct_mark.ecmp_reply_port</code>
>>>        tells the logical router port on which the packet should be sent.
>> These
>>>        values saved to the conntrack fields when the initial ingress
>> traffic is
>>> -      received over the ECMP route and committed to conntrack. The
>>> -      priority-10300 flows in this stage set the <code>outport</code>,
>>> -      while the <code>eth.dst</code> is set by flows at the ARP/ND
>> Resolution
>>> -      stage.
>>> +      received over the ECMP route and committed to conntrack.
>>> +      If <code>REGBIT_KNOWN_ECMP_NH</code> is set, the priority-10300
>>> +      flows in this stage set the <code>outport</code>, while the
>>> +      <code>eth.dst</code> is set by flows at the ARP/ND Resolution
>> stage.
>>>      </p>
>>>
>>>      <p>
>>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
>>> index 610a5ce12..0c7b24816 100644
>>> --- a/tests/ovn-northd.at
>>> +++ b/tests/ovn-northd.at
>>> @@ -5644,10 +5644,24 @@ AT_CHECK([grep -e "lr_in_ip_routing_ecmp"
>> lr0flows | sed 's/192\.168\.0\..0/192.
>>>    table=??(lr_in_ip_routing_ecmp), priority=150  , match=(reg8[[0..15]]
>> == 0), action=(next;)
>>>  ])
>>>
>>> +AT_CHECK([grep -e "lr_in_ecmp_stateful".*commit_ecmp_nh lr0flows | sed
>> 's/table=../table=??/' | sort], [0], [dnl
>>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
>> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && sctp &&
>> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
>> sctp); next;)
>>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
>> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && tcp &&
>> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp);
>> next;)
>>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
>> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && udp &&
>> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp);
>> next;)
>>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
>> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && sctp),
>> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
>> sctp); next;)
>>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
>> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && tcp),
>> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp);
>> next;)
>>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
>> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && udp),
>> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
>>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp);
>> next;)
>>> +])
>>> +
>>> +AT_CHECK([grep -e "lr_in_defrag".*chk_ecmp_nh* lr0flows | sed
>> 's/table=../table=??/' | sort], [0], [dnl
>>> +  table=??(lr_in_defrag       ), priority=100  , match=(inport ==
>> "lr0-public" && ip4.src == 1.0.0.1), action=(reg9[[5]] = chk_ecmp_nh_mac();
>> ct_next;)
>>> +  table=??(lr_in_defrag       ), priority=100  , match=(reg7 == 0 &&
>> ip4.dst == 1.0.0.1/32), action=(reg9[[5]] = chk_ecmp_nh(); ct_next;)
>>> +])
>>> +
>>>  dnl The chassis was created with other_config:ct-no-masked-label=false,
>> the flows
>>>  dnl should be using ct_label.ecmp_reply_port.
>>>  AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
>> 's/table=../table=??/'], [0], [dnl
>>> -  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
>> ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
>> eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
>>> +  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
>> reg9[[5]] == 1 && ct_label.ecmp_reply_port == 1), action=(push(xxreg1);
>> xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
>>>  ])
>>>
>>>  dnl Simulate an ovn-controller upgrade to a version that supports
>>> @@ -5657,7 +5671,7 @@ check ovn-sbctl set chassis ch1
>> other_config:ct-no-masked-label=true
>>>  check ovn-nbctl --wait=sb sync
>>>  ovn-sbctl dump-flows lr0 > lr0flows
>>>  AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
>> 's/table=../table=??/'], [0], [dnl
>>> -  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
>> ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
>> eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
>>> +  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
>> reg9[[5]] == 1 && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1);
>> xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
>>>  ])
>>>
>>>  # add ecmp route with wrong nexthop
>>> diff --git a/tests/ovn.at b/tests/ovn.at
>>> index 91dc3b9d6..12ba3f6d0 100644
>>> --- a/tests/ovn.at
>>> +++ b/tests/ovn.at
>>> @@ -27173,7 +27173,7 @@ AT_CHECK([
>>>          grep "priority=200" | \
>>>          grep -c
>> "move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
>>>      done; :], [0], [dnl
>>> -1
>>> +6
>>>  1
>>>  0
>>>  0
>>> @@ -27298,7 +27298,7 @@ AT_CHECK([
>>>          grep "priority=200" | \
>>>          grep -c
>> "move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
>>>      done; :], [0], [dnl
>>> -1
>>> +6
>>>  1
>>>  0
>>>  0
>>> diff --git a/tests/system-ovn.at b/tests/system-ovn.at
>>> index 4a8fdede8..b3096deb3 100644
>>> --- a/tests/system-ovn.at
>>> +++ b/tests/system-ovn.at
>>> @@ -5956,11 +5956,8 @@ ovn-nbctl --wait=hv sync
>>>
>>>  on_exit 'ovs-ofctl dump-flows br-int'
>>>
>>> -# 'bob1' should be able to ping 'alice1' directly.
>>> -NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 10.0.0.2 |
>> FORMAT_PING], \
>>> -[0], [dnl
>>> -20 packets transmitted, 20 received, 0% packet loss, time 0ms
>>> -])
>>> +NETNS_DAEMONIZE([alice1], [nc -l -k 80], [alice1.pid])
>>> +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
>>>
>>>  # Ensure conntrack entry is present. We should not try to predict
>>>  # the tunnel key for the output port, so we strip it from the labels
>>> @@ -5968,7 +5965,7 @@ NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15
>> 10.0.0.2 | FORMAT_PING], \
>>>  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1) | \
>>>  sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
>>>  sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
>>>
>> -icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
>>>
>> +tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
>>>  ])
>>>
>>>  # Ensure datapaths show conntrack states as expected
>>> @@ -5981,6 +5978,36 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep
>> 'ct_state(-new+est+rpl+trk).*ct_lab
>>>  1
>>>  ])
>>>
>>> +# Flush conntrack entries for easier output parsing of next test.
>>> +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
>>> +
>>> +# Change bob1 L2 address anche check the reply is properly updated.
>>> +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
>>> +ovn-nbctl set Logical_Switch_Port r2-ext \
>>> +     type=router options:router-port=R2_ext
>> addresses='"00:00:10:01:02:04"'
>>> +
>>> +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
>>> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
>> 'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c], [0],
>> [dnl
>>> +1
>>> +])
>>> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
>> 'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0], [dnl
>>> +1
>>> +])
>>> +
>>> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 |
>> FORMAT_CT(172.16.0.1) | \
>>> +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
>>> +sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
>>>
>> +tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
>>> +])
>>> +
>>> +# Check entries in table 76 and 77 expires w/o traffic
>>> +OVS_WAIT_UNTIL([
>>> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets') -eq 0
>>> +])
>>> +OVS_WAIT_UNTIL([
>>> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets') -eq 0
>>> +])
>>> +
>>>  ovs-ofctl dump-flows br-int
>>>
>>>  OVS_APP_EXIT_AND_WAIT([ovn-controller])
>>> @@ -6122,11 +6149,8 @@ ovn-nbctl --wait=hv sync
>>>
>>>  on_exit 'ovs-ofctl dump-flows br-int'
>>>
>>> -# 'bob1' should be able to ping 'alice1' directly.
>>> -NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 fd01::2 |
>> FORMAT_PING], \
>>> -[0], [dnl
>>> -20 packets transmitted, 20 received, 0% packet loss, time 0ms
>>> -])
>>> +NETNS_DAEMONIZE([alice1], [nc -6 -l -k 80], [alice1.pid])
>>> +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
>>>
>>>  # Ensure datapaths show conntrack states as expected
>>>  # Like with conntrack entries, we shouldn't try to predict
>>> @@ -6145,7 +6169,38 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep
>> 'ct_state(-new+est+rpl+trk).*ct_lab
>>>  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2) | \
>>>  sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
>>>  sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
>>>
>> -icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
>>>
>> +tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
>>> +])
>>> +
>>> +# Flush conntrack entries for easier output parsing of next test.
>>> +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
>>> +
>>> +# Change bob1 L2 address anche check the reply is properly updated.
>>> +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
>>> +ovn-nbctl set Logical_Switch_Port r2-ext \
>>> +     type=router options:router-port=R2_ext
>> addresses='"00:00:10:01:02:04"'
>>> +
>>> +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
>>> +
>>> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
>> 'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c], [0],
>> [dnl
>>> +1
>>> +])
>>> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
>> 'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0], [dnl
>>> +1
>>> +])
>>> +
>>> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 |
>> FORMAT_CT(fd01::2) | \
>>> +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
>>> +sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
>>>
>> +tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
>>> +])
>>> +
>>> +# Check entries in table 76 and 77 expires w/o traffic
>>> +OVS_WAIT_UNTIL([
>>> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets') -eq 0
>>> +])
>>> +OVS_WAIT_UNTIL([
>>> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets') -eq 0
>>>  ])
>>>
>>>  ovs-ofctl dump-flows br-int
>>> --
>>> 2.36.1
>>>
>>> _______________________________________________
>>> dev mailing list
>>> dev@openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Han Zhou Aug. 19, 2022, 9:38 p.m. UTC | #5
On Fri, Aug 19, 2022 at 12:48 PM Dumitru Ceara <dceara@redhat.com> wrote:
>
> On 8/19/22 17:37, Lorenzo Bianconi wrote:
> >> On Fri, Jul 15, 2022 at 4:35 AM Lorenzo Bianconi <
> >> lorenzo.bianconi@redhat.com> wrote:
> >>>
> >>> Rely on the following new actions for ecmp-symmetric routing:
> >>> - chk_ecmp_nh_mac
> >>> - chk_ecmp_nh
> >>> - commit_ecmp_nh
> >>>
> >>> In this way OVN is able to keep up if for any reason the ECMP traffic
> >>> source changes L2 address and keeps old L3 addresses.
> >>
> >> Hi Lorenzo, do you have more details of the problem scenario? It seems
the
> >> bugzilla access is limited.
> >
> > Hi Han,
> >
> > in the current codebase, to implement ecmp symmetric-reply, ovn stores
> > the next-hop mac address (let's say mac0) and in-port in ct_label and
> > ct_mark respectively.
> > An issue occurs whenever the traffic next-hop changes l2 address from
> > mac0 to mac1 w/o changing l3/l4 info (e.g. src ip address).
> > In this scenario ovn is not able to keep up and recover from this
change.
> > Agree?
> >
> >>
> >>>
> >>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2096233
> >>> Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
> >>> ---
> >>>  northd/northd.c         | 102
+++++++++++++++++++++++++++++++++++-----
> >>>  northd/ovn-northd.8.xml |  25 ++++++----
> >>>  tests/ovn-northd.at     |  18 ++++++-
> >>>  tests/ovn.at            |   4 +-
> >>>  tests/system-ovn.at     |  79 ++++++++++++++++++++++++++-----
> >>>  5 files changed, 191 insertions(+), 37 deletions(-)
> >>>
> >>> diff --git a/northd/northd.c b/northd/northd.c
> >>> index d31cb1688..57ec3675b 100644
> >>> --- a/northd/northd.c
> >>> +++ b/northd/northd.c
> >>> @@ -227,6 +227,7 @@ enum ovn_stage {
> >>>  #define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]"
> >>>  #define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]"
> >>>  #define REGBIT_DST_NAT_IP_LOCAL "reg9[4]"
> >>> +#define REGBIT_KNOWN_ECMP_NH    "reg9[5]"
> >>>
> >>>  /* Register to store the eth address associated to a router port for
> >> packets
> >>>   * received in S_ROUTER_IN_ADMISSION.
> >>> @@ -327,7 +328,8 @@ enum ovn_stage {
> >>>   * |     |   EGRESS_LOOPBACK/       | G |     UNUSED      |
> >>>   * | R9  |   PKT_LARGER/            | 4 |                 |
> >>>   * |     |   LOOKUP_NEIGHBOR_RESULT/|   |                 |
> >>> - * |     |   SKIP_LOOKUP_NEIGHBOR}  |   |                 |
> >>> + * |     |   SKIP_LOOKUP_NEIGHBOR/  |   |                 |
> >>> + * |     |   KNOWN_ECMP_NH}         |   |                 |
> >>>   * |     |                          |   |                 |
> >>>   * |     | REG_ORIG_TP_DPORT_ROUTER |   |                 |
> >>>   * |     |                          |   |                 |
> >>> @@ -9437,6 +9439,7 @@ add_ecmp_symmetric_reply_flows(struct hmap
*lflows,
> >>>                                 struct ds *route_match)
> >>>  {
> >>>      const struct nbrec_logical_router_static_route *st_route =
> >> route->route;
> >>> +    struct ds base_match = DS_EMPTY_INITIALIZER;
> >>>      struct ds match = DS_EMPTY_INITIALIZER;
> >>>      struct ds actions = DS_EMPTY_INITIALIZER;
> >>>      struct ds ecmp_reply = DS_EMPTY_INITIALIZER;
> >>> @@ -9448,20 +9451,22 @@ add_ecmp_symmetric_reply_flows(struct hmap
> >> *lflows,
> >>>      /* If symmetric ECMP replies are enabled, then packets that
arrive
> >> over
> >>>       * an ECMP route need to go through conntrack.
> >>>       */
> >>> -    ds_put_format(&match, "inport == %s && ip%s.%s == %s",
> >>> +    ds_put_format(&base_match, "inport == %s && ip%s.%s == %s",
> >>>                    out_port->json_key,
> >>>                    IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "4" : "6",
> >>>                    route->is_src_route ? "dst" : "src",
> >>>                    cidr);
> >>>      free(cidr);
> >>>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
> >>> -                            ds_cstr(&match), "ct_next;",
> >>> -                            &st_route->header_);
> >>> +            ds_cstr(&base_match),
> >>> +            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh_mac(); ct_next;",
> >>> +            &st_route->header_);
> >>>
> >>>      /* And packets that go out over an ECMP route need conntrack */
> >>>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
> >>> -                            ds_cstr(route_match), "ct_next;",
> >>> -                            &st_route->header_);
> >>> +            ds_cstr(route_match),
> >>> +            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh(); ct_next;",
> >>> +            &st_route->header_);
> >>>
> >>>      /* Save src eth and inport in ct_label for packets that arrive
over
> >>>       * an ECMP route.
> >>> @@ -9469,11 +9474,84 @@ add_ecmp_symmetric_reply_flows(struct hmap
> >> *lflows,
> >>>       * NOTE: we purposely are not clearing match before this
> >>>       * ds_put_cstr() call. The previous contents are needed.
> >>>       */
> >>> -    ds_put_cstr(&match, " && (ct.new && !ct.est)");
> >>> +    ds_put_format(&match, "%s && (ct.new && !ct.est) && tcp",
> >>> +                  ds_cstr(&base_match));
> >>> +    ds_put_format(&actions,
> >>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> >>> +            " %s = %" PRId64 ";}; "
> >>> +            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
> >>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> >>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> >>> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> >>> +                            ds_cstr(&match), ds_cstr(&actions),
> >>> +                            &st_route->header_);
> >>> +    ds_clear(&match);
> >>> +    ds_put_format(&match, "%s && (ct.new && !ct.est) && udp",
> >>> +                  ds_cstr(&base_match));
> >>> +    ds_clear(&actions);
> >>> +    ds_put_format(&actions,
> >>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> >>> +            " %s = %" PRId64 ";}; "
> >>> +            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
> >>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> >>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> >>> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> >>> +                            ds_cstr(&match), ds_cstr(&actions),
> >>> +                            &st_route->header_);
> >>> +    ds_clear(&match);
> >>> +    ds_put_format(&match, "%s && (ct.new && !ct.est) && sctp",
> >>> +                  ds_cstr(&base_match));
> >>> +    ds_clear(&actions);
> >>> +    ds_put_format(&actions,
> >>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> >>> +            " %s = %" PRId64 ";}; "
> >>> +            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
> >>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> >>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> >>> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> >>> +                            ds_cstr(&match), ds_cstr(&actions),
> >>> +                            &st_route->header_);
> >>>
> >>> -    ds_put_format(&actions, "ct_commit { ct_label.ecmp_reply_eth =
> >> eth.src;"
> >>> -                  " %s = %" PRId64 ";}; next;",
> >>> -                  ct_ecmp_reply_port_match,
out_port->sb->tunnel_key);
> >>> +    ds_clear(&match);
> >>> +    ds_put_format(&match,
> >>> +            "%s && (!ct.rpl && ct.est) && tcp &&
"REGBIT_KNOWN_ECMP_NH"
> >> == 0",
> >>> +            ds_cstr(&base_match));
> >>> +    ds_clear(&actions);
> >>> +    ds_put_format(&actions,
> >>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> >>> +            " %s = %" PRId64 ";}; "
> >>> +            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
> >>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> >>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> >>> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> >>> +                            ds_cstr(&match), ds_cstr(&actions),
> >>> +                            &st_route->header_);
> >>> +
> >>> +    ds_clear(&match);
> >>> +    ds_put_format(&match,
> >>> +            "%s && (!ct.rpl && ct.est) && udp &&
"REGBIT_KNOWN_ECMP_NH"
> >> == 0",
> >>> +            ds_cstr(&base_match));
> >>> +    ds_clear(&actions);
> >>> +    ds_put_format(&actions,
> >>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> >>> +            " %s = %" PRId64 ";}; "
> >>
> >> Sorry that I haven't reviewed the details yet, but this flow would
break HW
> >> offload again, which was fixed in 22.03 and backported to 21.12. cc
@Mark
> >> Michelson <mmichels@redhat.com>
> >
> > can you please provide more info?
> >
>
> I'm curious too how this would break HWOL.  We used to have exactly the
> same type of action before this patch too:
>
> ds_put_format(&actions, "ct_commit { ct_label.ecmp_reply_eth = eth.src;"
>               " %s = %" PRId64 ";}; next;",
>
> And there was no complaint about it AFAICT.

This one is more obvious, as mentioned in the commit message of
https://github.com/ovn-org/ovn/commit/a075230e4a0fcc166251271db1c8ae01b993c9cf
"For ct_label.ecmp_reply_eth, the flow matching it still uses masked
access, but it doesn't matter because *the flow is for new connections*
and requires ct_commit in its actions, so it wouldn't be offloaded
anyway for those NICs."

I was worried because this patch added a flow with "ct.est" that accesses
masked ct_label. But anyway, false alarm as explained in my other email.

>
> On a different note, I think it would be extremely beneficial if HW
> vendors would plug into the CI automation (ovsrobot + patchwork) and run
> a set of tests with OVN patches to ensure that we don't add flows that
> are hard to offload.
>
> I think it's very hard (impossible?) to catch all cases during code
review.
>

It's a good idea, but in practice how can the "set of tests" be ensured to
cover all possible flows generated by OVN pipelines? Even if flows are
generated with good coverage, it would still be hard to check, because some
flows that can't be offloaded would impact only a very small amount of
packets, such as the one mentioned in this patch, which is considered no
impact. Need more thoughts on this.

> Regards,
> Dumitru
>
> > Regards,
> > Lorenzo
> >
> >>
> >> Thanks,
> >> Han
> >>
> >>> +            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
> >>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> >>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> >>> +    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> >>> +                            ds_cstr(&match), ds_cstr(&actions),
> >>> +                            &st_route->header_);
> >>> +    ds_clear(&match);
> >>> +    ds_put_format(&match,
> >>> +            "%s && (!ct.rpl && ct.est) && sctp &&
"REGBIT_KNOWN_ECMP_NH"
> >> == 0",
> >>> +            ds_cstr(&base_match));
> >>> +    ds_clear(&actions);
> >>> +    ds_put_format(&actions,
> >>> +            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
> >>> +            " %s = %" PRId64 ";}; "
> >>> +            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
> >>> +            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
> >>> +            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
> >>>      ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL,
100,
> >>>                              ds_cstr(&match), ds_cstr(&actions),
> >>>                              &st_route->header_);
> >>> @@ -9481,7 +9559,8 @@ add_ecmp_symmetric_reply_flows(struct hmap
*lflows,
> >>>      /* Bypass ECMP selection if we already have ct_label information
> >>>       * for where to route the packet.
> >>>       */
> >>> -    ds_put_format(&ecmp_reply, "ct.rpl && %s == %"PRId64,
> >>> +    ds_put_format(&ecmp_reply,
> >>> +                  "ct.rpl && "REGBIT_KNOWN_ECMP_NH" == 1 && %s ==
> >> %"PRId64,
> >>>                    ct_ecmp_reply_port_match,
out_port->sb->tunnel_key);
> >>>      ds_clear(&match);
> >>>      ds_put_format(&match, "%s && %s", ds_cstr(&ecmp_reply),
> >>> @@ -9517,6 +9596,7 @@ add_ecmp_symmetric_reply_flows(struct hmap
*lflows,
> >>>                              200, ds_cstr(&ecmp_reply),
> >>>                              action, &st_route->header_);
> >>>
> >>> +    ds_destroy(&base_match);
> >>>      ds_destroy(&match);
> >>>      ds_destroy(&actions);
> >>>      ds_destroy(&ecmp_reply);
> >>> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> >>> index a87df24bd..aa7196a92 100644
> >>> --- a/northd/ovn-northd.8.xml
> >>> +++ b/northd/ovn-northd.8.xml
> >>> @@ -3166,8 +3166,12 @@ icmp6 {
> >>>        configured. The matching logic for these ports essentially
> >> reverses the
> >>>        configured logic of the ECMP route. So for instance, a route
with a
> >>>        destination routing policy will instead match if the source IP
> >> address
> >>> -      matches the static route's prefix. The flow uses the action
> >>> -      <code>ct_next</code> to send IP packets to the connection
tracker
> >> for
> >>> +      matches the static route's prefix. The flow uses the actions
> >>> +      <code>chk_ecmp_nh_mac(); ct_next</code> or
> >>> +      <code>chk_ecmp_nh(); ct_next</code> to send IP packets to table
> >>> +      <code>OFTABLE_ECMP_NH_MAC</code> or to table
> >>> +      <code>OFTABLE_ECMP_NH</code> in order to check if source info
are
> >> already
> >>> +      stored by OVN and then to the connection tracker for
> >>>        packet de-fragmentation and tracking before sending it to the
next
> >> table.
> >>>      </p>
> >>>
> >>> @@ -3426,10 +3430,11 @@ icmp6 {
> >>>          route with a destination routing policy will instead match
if the
> >>>          source IP address matches the static route's prefix. The flow
> >> uses
> >>>          the action <code>ct_commit { ct_label.ecmp_reply_eth =
eth.src;"
> >>> -        " ct_mark.ecmp_reply_port = <var>K</var>;}; next; </code> to
> >> commit
> >>> -        the connection and storing <code>eth.src</code> and the ECMP
> >>> -        reply port binding tunnel key <var>K</var> in the
> >>> -        <code>ct_label</code>.
> >>> +        " ct_mark.ecmp_reply_port = <var>K</var>;}; commit_ecmp_nh();
> >> next;
> >>> +        </code> to commit the connection and storing
> >> <code>eth.src</code> and
> >>> +        the ECMP reply port binding tunnel key <var>K</var> in the
> >>> +        <code>ct_label</code> and the traffic pattern to table
> >>> +        <code>OFTABLE_ECMP_NH_MAC</code> or
<code>OFTABLE_ECMP_NH</code>.
> >>>        </li>
> >>>      </ul>
> >>>
> >>> @@ -3568,10 +3573,10 @@ output;
> >>>        which the packet should be sent. The
> >> <code>ct_mark.ecmp_reply_port</code>
> >>>        tells the logical router port on which the packet should be
sent.
> >> These
> >>>        values saved to the conntrack fields when the initial ingress
> >> traffic is
> >>> -      received over the ECMP route and committed to conntrack. The
> >>> -      priority-10300 flows in this stage set the
<code>outport</code>,
> >>> -      while the <code>eth.dst</code> is set by flows at the ARP/ND
> >> Resolution
> >>> -      stage.
> >>> +      received over the ECMP route and committed to conntrack.
> >>> +      If <code>REGBIT_KNOWN_ECMP_NH</code> is set, the priority-10300
> >>> +      flows in this stage set the <code>outport</code>, while the
> >>> +      <code>eth.dst</code> is set by flows at the ARP/ND Resolution
> >> stage.
> >>>      </p>
> >>>
> >>>      <p>
> >>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> >>> index 610a5ce12..0c7b24816 100644
> >>> --- a/tests/ovn-northd.at
> >>> +++ b/tests/ovn-northd.at
> >>> @@ -5644,10 +5644,24 @@ AT_CHECK([grep -e "lr_in_ip_routing_ecmp"
> >> lr0flows | sed 's/192\.168\.0\..0/192.
> >>>    table=??(lr_in_ip_routing_ecmp), priority=150  ,
match=(reg8[[0..15]]
> >> == 0), action=(next;)
> >>>  ])
> >>>
> >>> +AT_CHECK([grep -e "lr_in_ecmp_stateful".*commit_ecmp_nh lr0flows |
sed
> >> 's/table=../table=??/' | sort], [0], [dnl
> >>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> >> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && sctp &&
> >> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
> >> sctp); next;)
> >>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> >> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && tcp &&
> >> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
tcp);
> >> next;)
> >>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> >> "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && udp &&
> >> reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
udp);
> >> next;)
> >>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> >> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && sctp),
> >> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
> >> sctp); next;)
> >>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> >> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && tcp),
> >> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
tcp);
> >> next;)
> >>> +  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport ==
> >> "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && udp),
> >> action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;
> >>  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto =
udp);
> >> next;)
> >>> +])
> >>> +
> >>> +AT_CHECK([grep -e "lr_in_defrag".*chk_ecmp_nh* lr0flows | sed
> >> 's/table=../table=??/' | sort], [0], [dnl
> >>> +  table=??(lr_in_defrag       ), priority=100  , match=(inport ==
> >> "lr0-public" && ip4.src == 1.0.0.1), action=(reg9[[5]] =
chk_ecmp_nh_mac();
> >> ct_next;)
> >>> +  table=??(lr_in_defrag       ), priority=100  , match=(reg7 == 0 &&
> >> ip4.dst == 1.0.0.1/32), action=(reg9[[5]] = chk_ecmp_nh(); ct_next;)
> >>> +])
> >>> +
> >>>  dnl The chassis was created with
other_config:ct-no-masked-label=false,
> >> the flows
> >>>  dnl should be using ct_label.ecmp_reply_port.
> >>>  AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
> >> 's/table=../table=??/'], [0], [dnl
> >>> -  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> >> ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 =
ct_label;
> >> eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> >>> +  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> >> reg9[[5]] == 1 && ct_label.ecmp_reply_port == 1), action=(push(xxreg1);
> >> xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> >>>  ])
> >>>
> >>>  dnl Simulate an ovn-controller upgrade to a version that supports
> >>> @@ -5657,7 +5671,7 @@ check ovn-sbctl set chassis ch1
> >> other_config:ct-no-masked-label=true
> >>>  check ovn-nbctl --wait=sb sync
> >>>  ovn-sbctl dump-flows lr0 > lr0flows
> >>>  AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed
> >> 's/table=../table=??/'], [0], [dnl
> >>> -  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> >> ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label;
> >> eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> >>> +  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl &&
> >> reg9[[5]] == 1 && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1);
> >> xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
> >>>  ])
> >>>
> >>>  # add ecmp route with wrong nexthop
> >>> diff --git a/tests/ovn.at b/tests/ovn.at
> >>> index 91dc3b9d6..12ba3f6d0 100644
> >>> --- a/tests/ovn.at
> >>> +++ b/tests/ovn.at
> >>> @@ -27173,7 +27173,7 @@ AT_CHECK([
> >>>          grep "priority=200" | \
> >>>          grep -c
> >>
"move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
> >>>      done; :], [0], [dnl
> >>> -1
> >>> +6
> >>>  1
> >>>  0
> >>>  0
> >>> @@ -27298,7 +27298,7 @@ AT_CHECK([
> >>>          grep "priority=200" | \
> >>>          grep -c
> >>
"move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
> >>>      done; :], [0], [dnl
> >>> -1
> >>> +6
> >>>  1
> >>>  0
> >>>  0
> >>> diff --git a/tests/system-ovn.at b/tests/system-ovn.at
> >>> index 4a8fdede8..b3096deb3 100644
> >>> --- a/tests/system-ovn.at
> >>> +++ b/tests/system-ovn.at
> >>> @@ -5956,11 +5956,8 @@ ovn-nbctl --wait=hv sync
> >>>
> >>>  on_exit 'ovs-ofctl dump-flows br-int'
> >>>
> >>> -# 'bob1' should be able to ping 'alice1' directly.
> >>> -NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 10.0.0.2 |
> >> FORMAT_PING], \
> >>> -[0], [dnl
> >>> -20 packets transmitted, 20 received, 0% packet loss, time 0ms
> >>> -])
> >>> +NETNS_DAEMONIZE([alice1], [nc -l -k 80], [alice1.pid])
> >>> +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
> >>>
> >>>  # Ensure conntrack entry is present. We should not try to predict
> >>>  # the tunnel key for the output port, so we strip it from the labels
> >>> @@ -5968,7 +5965,7 @@ NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w
15
> >> 10.0.0.2 | FORMAT_PING], \
> >>>  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1) | \
> >>>  sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> >>>  sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> >>>
> >>
-icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
> >>>
> >>
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
> >>>  ])
> >>>
> >>>  # Ensure datapaths show conntrack states as expected
> >>> @@ -5981,6 +5978,36 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> >> 'ct_state(-new+est+rpl+trk).*ct_lab
> >>>  1
> >>>  ])
> >>>
> >>> +# Flush conntrack entries for easier output parsing of next test.
> >>> +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> >>> +
> >>> +# Change bob1 L2 address anche check the reply is properly updated.
> >>> +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> >>> +ovn-nbctl set Logical_Switch_Port r2-ext \
> >>> +     type=router options:router-port=R2_ext
> >> addresses='"00:00:10:01:02:04"'
> >>> +
> >>> +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
> >>> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> >> 'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c],
[0],
> >> [dnl
> >>> +1
> >>> +])
> >>> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> >> 'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0],
[dnl
> >>> +1
> >>> +])
> >>> +
> >>> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 |
> >> FORMAT_CT(172.16.0.1) | \
> >>> +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> >>> +sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> >>>
> >>
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
> >>> +])
> >>> +
> >>> +# Check entries in table 76 and 77 expires w/o traffic
> >>> +OVS_WAIT_UNTIL([
> >>> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets')
-eq 0
> >>> +])
> >>> +OVS_WAIT_UNTIL([
> >>> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets')
-eq 0
> >>> +])
> >>> +
> >>>  ovs-ofctl dump-flows br-int
> >>>
> >>>  OVS_APP_EXIT_AND_WAIT([ovn-controller])
> >>> @@ -6122,11 +6149,8 @@ ovn-nbctl --wait=hv sync
> >>>
> >>>  on_exit 'ovs-ofctl dump-flows br-int'
> >>>
> >>> -# 'bob1' should be able to ping 'alice1' directly.
> >>> -NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 fd01::2 |
> >> FORMAT_PING], \
> >>> -[0], [dnl
> >>> -20 packets transmitted, 20 received, 0% packet loss, time 0ms
> >>> -])
> >>> +NETNS_DAEMONIZE([alice1], [nc -6 -l -k 80], [alice1.pid])
> >>> +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
> >>>
> >>>  # Ensure datapaths show conntrack states as expected
> >>>  # Like with conntrack entries, we shouldn't try to predict
> >>> @@ -6145,7 +6169,38 @@ AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> >> 'ct_state(-new+est+rpl+trk).*ct_lab
> >>>  AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2) | \
> >>>  sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> >>>  sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> >>>
> >>
-icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
> >>>
> >>
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
> >>> +])
> >>> +
> >>> +# Flush conntrack entries for easier output parsing of next test.
> >>> +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> >>> +
> >>> +# Change bob1 L2 address anche check the reply is properly updated.
> >>> +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> >>> +ovn-nbctl set Logical_Switch_Port r2-ext \
> >>> +     type=router options:router-port=R2_ext
> >> addresses='"00:00:10:01:02:04"'
> >>> +
> >>> +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
> >>> +
> >>> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> >> 'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c],
[0],
> >> [dnl
> >>> +1
> >>> +])
> >>> +AT_CHECK([ovs-appctl dpctl/dump-flows | grep
> >> 'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0],
[dnl
> >>> +1
> >>> +])
> >>> +
> >>> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 |
> >> FORMAT_CT(fd01::2) | \
> >>> +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> >>> +sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
> >>>
> >>
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
> >>> +])
> >>> +
> >>> +# Check entries in table 76 and 77 expires w/o traffic
> >>> +OVS_WAIT_UNTIL([
> >>> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets')
-eq 0
> >>> +])
> >>> +OVS_WAIT_UNTIL([
> >>> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets')
-eq 0
> >>>  ])
> >>>
> >>>  ovs-ofctl dump-flows br-int
> >>> --
> >>> 2.36.1
> >>>
> >>> _______________________________________________
> >>> dev mailing list
> >>> dev@openvswitch.org
> >>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
diff mbox series

Patch

diff --git a/northd/northd.c b/northd/northd.c
index d31cb1688..57ec3675b 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -227,6 +227,7 @@  enum ovn_stage {
 #define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]"
 #define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]"
 #define REGBIT_DST_NAT_IP_LOCAL "reg9[4]"
+#define REGBIT_KNOWN_ECMP_NH    "reg9[5]"
 
 /* Register to store the eth address associated to a router port for packets
  * received in S_ROUTER_IN_ADMISSION.
@@ -327,7 +328,8 @@  enum ovn_stage {
  * |     |   EGRESS_LOOPBACK/       | G |     UNUSED      |
  * | R9  |   PKT_LARGER/            | 4 |                 |
  * |     |   LOOKUP_NEIGHBOR_RESULT/|   |                 |
- * |     |   SKIP_LOOKUP_NEIGHBOR}  |   |                 |
+ * |     |   SKIP_LOOKUP_NEIGHBOR/  |   |                 |
+ * |     |   KNOWN_ECMP_NH}         |   |                 |
  * |     |                          |   |                 |
  * |     | REG_ORIG_TP_DPORT_ROUTER |   |                 |
  * |     |                          |   |                 |
@@ -9437,6 +9439,7 @@  add_ecmp_symmetric_reply_flows(struct hmap *lflows,
                                struct ds *route_match)
 {
     const struct nbrec_logical_router_static_route *st_route = route->route;
+    struct ds base_match = DS_EMPTY_INITIALIZER;
     struct ds match = DS_EMPTY_INITIALIZER;
     struct ds actions = DS_EMPTY_INITIALIZER;
     struct ds ecmp_reply = DS_EMPTY_INITIALIZER;
@@ -9448,20 +9451,22 @@  add_ecmp_symmetric_reply_flows(struct hmap *lflows,
     /* If symmetric ECMP replies are enabled, then packets that arrive over
      * an ECMP route need to go through conntrack.
      */
-    ds_put_format(&match, "inport == %s && ip%s.%s == %s",
+    ds_put_format(&base_match, "inport == %s && ip%s.%s == %s",
                   out_port->json_key,
                   IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "4" : "6",
                   route->is_src_route ? "dst" : "src",
                   cidr);
     free(cidr);
     ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
-                            ds_cstr(&match), "ct_next;",
-                            &st_route->header_);
+            ds_cstr(&base_match),
+            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh_mac(); ct_next;",
+            &st_route->header_);
 
     /* And packets that go out over an ECMP route need conntrack */
     ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_DEFRAG, 100,
-                            ds_cstr(route_match), "ct_next;",
-                            &st_route->header_);
+            ds_cstr(route_match),
+            REGBIT_KNOWN_ECMP_NH" = chk_ecmp_nh(); ct_next;",
+            &st_route->header_);
 
     /* Save src eth and inport in ct_label for packets that arrive over
      * an ECMP route.
@@ -9469,11 +9474,84 @@  add_ecmp_symmetric_reply_flows(struct hmap *lflows,
      * NOTE: we purposely are not clearing match before this
      * ds_put_cstr() call. The previous contents are needed.
      */
-    ds_put_cstr(&match, " && (ct.new && !ct.est)");
+    ds_put_format(&match, "%s && (ct.new && !ct.est) && tcp",
+                  ds_cstr(&base_match));
+    ds_put_format(&actions,
+            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
+            " %s = %" PRId64 ";}; "
+            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
+            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
+            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
+    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
+                            ds_cstr(&match), ds_cstr(&actions),
+                            &st_route->header_);
+    ds_clear(&match);
+    ds_put_format(&match, "%s && (ct.new && !ct.est) && udp",
+                  ds_cstr(&base_match));
+    ds_clear(&actions);
+    ds_put_format(&actions,
+            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
+            " %s = %" PRId64 ";}; "
+            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
+            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
+            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
+    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
+                            ds_cstr(&match), ds_cstr(&actions),
+                            &st_route->header_);
+    ds_clear(&match);
+    ds_put_format(&match, "%s && (ct.new && !ct.est) && sctp",
+                  ds_cstr(&base_match));
+    ds_clear(&actions);
+    ds_put_format(&actions,
+            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
+            " %s = %" PRId64 ";}; "
+            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
+            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
+            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
+    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
+                            ds_cstr(&match), ds_cstr(&actions),
+                            &st_route->header_);
 
-    ds_put_format(&actions, "ct_commit { ct_label.ecmp_reply_eth = eth.src;"
-                  " %s = %" PRId64 ";}; next;",
-                  ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
+    ds_clear(&match);
+    ds_put_format(&match,
+            "%s && (!ct.rpl && ct.est) && tcp && "REGBIT_KNOWN_ECMP_NH" == 0",
+            ds_cstr(&base_match));
+    ds_clear(&actions);
+    ds_put_format(&actions,
+            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
+            " %s = %" PRId64 ";}; "
+            "commit_ecmp_nh(ipv6 = %s, proto = tcp); next;",
+            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
+            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
+    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
+                            ds_cstr(&match), ds_cstr(&actions),
+                            &st_route->header_);
+
+    ds_clear(&match);
+    ds_put_format(&match,
+            "%s && (!ct.rpl && ct.est) && udp && "REGBIT_KNOWN_ECMP_NH" == 0",
+            ds_cstr(&base_match));
+    ds_clear(&actions);
+    ds_put_format(&actions,
+            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
+            " %s = %" PRId64 ";}; "
+            "commit_ecmp_nh(ipv6 = %s, proto = udp); next;",
+            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
+            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
+    ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
+                            ds_cstr(&match), ds_cstr(&actions),
+                            &st_route->header_);
+    ds_clear(&match);
+    ds_put_format(&match,
+            "%s && (!ct.rpl && ct.est) && sctp && "REGBIT_KNOWN_ECMP_NH" == 0",
+            ds_cstr(&base_match));
+    ds_clear(&actions);
+    ds_put_format(&actions,
+            "ct_commit { ct_label.ecmp_reply_eth = eth.src; "
+            " %s = %" PRId64 ";}; "
+            "commit_ecmp_nh(ipv6 = %s, proto = sctp); next;",
+            ct_ecmp_reply_port_match, out_port->sb->tunnel_key,
+            IN6_IS_ADDR_V4MAPPED(&route->prefix) ? "false" : "true");
     ovn_lflow_add_with_hint(lflows, od, S_ROUTER_IN_ECMP_STATEFUL, 100,
                             ds_cstr(&match), ds_cstr(&actions),
                             &st_route->header_);
@@ -9481,7 +9559,8 @@  add_ecmp_symmetric_reply_flows(struct hmap *lflows,
     /* Bypass ECMP selection if we already have ct_label information
      * for where to route the packet.
      */
-    ds_put_format(&ecmp_reply, "ct.rpl && %s == %"PRId64,
+    ds_put_format(&ecmp_reply,
+                  "ct.rpl && "REGBIT_KNOWN_ECMP_NH" == 1 && %s == %"PRId64,
                   ct_ecmp_reply_port_match, out_port->sb->tunnel_key);
     ds_clear(&match);
     ds_put_format(&match, "%s && %s", ds_cstr(&ecmp_reply),
@@ -9517,6 +9596,7 @@  add_ecmp_symmetric_reply_flows(struct hmap *lflows,
                             200, ds_cstr(&ecmp_reply),
                             action, &st_route->header_);
 
+    ds_destroy(&base_match);
     ds_destroy(&match);
     ds_destroy(&actions);
     ds_destroy(&ecmp_reply);
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index a87df24bd..aa7196a92 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -3166,8 +3166,12 @@  icmp6 {
       configured. The matching logic for these ports essentially reverses the
       configured logic of the ECMP route. So for instance, a route with a
       destination routing policy will instead match if the source IP address
-      matches the static route's prefix. The flow uses the action
-      <code>ct_next</code> to send IP packets to the connection tracker for
+      matches the static route's prefix. The flow uses the actions
+      <code>chk_ecmp_nh_mac(); ct_next</code> or
+      <code>chk_ecmp_nh(); ct_next</code> to send IP packets to table
+      <code>OFTABLE_ECMP_NH_MAC</code> or to table
+      <code>OFTABLE_ECMP_NH</code> in order to check if source info are already
+      stored by OVN and then to the connection tracker for
       packet de-fragmentation and tracking before sending it to the next table.
     </p>
 
@@ -3426,10 +3430,11 @@  icmp6 {
         route with a destination routing policy will instead match if the
         source IP address matches the static route's prefix. The flow uses
         the action <code>ct_commit { ct_label.ecmp_reply_eth = eth.src;"
-        " ct_mark.ecmp_reply_port = <var>K</var>;}; next; </code> to commit
-        the connection and storing <code>eth.src</code> and the ECMP
-        reply port binding tunnel key <var>K</var> in the
-        <code>ct_label</code>.
+        " ct_mark.ecmp_reply_port = <var>K</var>;}; commit_ecmp_nh(); next;
+        </code> to commit the connection and storing <code>eth.src</code> and
+        the ECMP reply port binding tunnel key <var>K</var> in the
+        <code>ct_label</code> and the traffic pattern to table
+        <code>OFTABLE_ECMP_NH_MAC</code> or <code>OFTABLE_ECMP_NH</code>.
       </li>
     </ul>
 
@@ -3568,10 +3573,10 @@  output;
       which the packet should be sent. The <code>ct_mark.ecmp_reply_port</code>
       tells the logical router port on which the packet should be sent. These
       values saved to the conntrack fields when the initial ingress traffic is
-      received over the ECMP route and committed to conntrack. The
-      priority-10300 flows in this stage set the <code>outport</code>,
-      while the <code>eth.dst</code> is set by flows at the ARP/ND Resolution
-      stage.
+      received over the ECMP route and committed to conntrack.
+      If <code>REGBIT_KNOWN_ECMP_NH</code> is set, the priority-10300
+      flows in this stage set the <code>outport</code>, while the
+      <code>eth.dst</code> is set by flows at the ARP/ND Resolution stage.
     </p>
 
     <p>
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index 610a5ce12..0c7b24816 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -5644,10 +5644,24 @@  AT_CHECK([grep -e "lr_in_ip_routing_ecmp" lr0flows | sed 's/192\.168\.0\..0/192.
   table=??(lr_in_ip_routing_ecmp), priority=150  , match=(reg8[[0..15]] == 0), action=(next;)
 ])
 
+AT_CHECK([grep -e "lr_in_ecmp_stateful".*commit_ecmp_nh lr0flows | sed 's/table=../table=??/' | sort], [0], [dnl
+  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && sctp && reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = sctp); next;)
+  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && tcp && reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp); next;)
+  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (!ct.rpl && ct.est) && udp && reg9[[5]] == 0), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp); next;)
+  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && sctp), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = sctp); next;)
+  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && tcp), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = tcp); next;)
+  table=??(lr_in_ecmp_stateful), priority=100  , match=(inport == "lr0-public" && ip4.src == 1.0.0.1 && (ct.new && !ct.est) && udp), action=(ct_commit { ct_label.ecmp_reply_eth = eth.src;  ct_label.ecmp_reply_port = 1;}; commit_ecmp_nh(ipv6 = false, proto = udp); next;)
+])
+
+AT_CHECK([grep -e "lr_in_defrag".*chk_ecmp_nh* lr0flows | sed 's/table=../table=??/' | sort], [0], [dnl
+  table=??(lr_in_defrag       ), priority=100  , match=(inport == "lr0-public" && ip4.src == 1.0.0.1), action=(reg9[[5]] = chk_ecmp_nh_mac(); ct_next;)
+  table=??(lr_in_defrag       ), priority=100  , match=(reg7 == 0 && ip4.dst == 1.0.0.1/32), action=(reg9[[5]] = chk_ecmp_nh(); ct_next;)
+])
+
 dnl The chassis was created with other_config:ct-no-masked-label=false, the flows
 dnl should be using ct_label.ecmp_reply_port.
 AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed 's/table=../table=??/'], [0], [dnl
-  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl && ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
+  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl && reg9[[5]] == 1 && ct_label.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
 ])
 
 dnl Simulate an ovn-controller upgrade to a version that supports
@@ -5657,7 +5671,7 @@  check ovn-sbctl set chassis ch1 other_config:ct-no-masked-label=true
 check ovn-nbctl --wait=sb sync
 ovn-sbctl dump-flows lr0 > lr0flows
 AT_CHECK([grep -e "lr_in_arp_resolve.*ecmp" lr0flows | sed 's/table=../table=??/'], [0], [dnl
-  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
+  table=??(lr_in_arp_resolve  ), priority=200  , match=(ct.rpl && reg9[[5]] == 1 && ct_mark.ecmp_reply_port == 1), action=(push(xxreg1); xxreg1 = ct_label; eth.dst = xxreg1[[32..79]]; pop(xxreg1); next;)
 ])
 
 # add ecmp route with wrong nexthop
diff --git a/tests/ovn.at b/tests/ovn.at
index 91dc3b9d6..12ba3f6d0 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -27173,7 +27173,7 @@  AT_CHECK([
         grep "priority=200" | \
         grep -c "move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
     done; :], [0], [dnl
-1
+6
 1
 0
 0
@@ -27298,7 +27298,7 @@  AT_CHECK([
         grep "priority=200" | \
         grep -c "move:NXM_NX_CT_LABEL\\[[\\]]->NXM_NX_XXREG1\\[[\\]],move:NXM_NX_XXREG1\\[[32..79\\]]->NXM_OF_ETH_DST"
     done; :], [0], [dnl
-1
+6
 1
 0
 0
diff --git a/tests/system-ovn.at b/tests/system-ovn.at
index 4a8fdede8..b3096deb3 100644
--- a/tests/system-ovn.at
+++ b/tests/system-ovn.at
@@ -5956,11 +5956,8 @@  ovn-nbctl --wait=hv sync
 
 on_exit 'ovs-ofctl dump-flows br-int'
 
-# 'bob1' should be able to ping 'alice1' directly.
-NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 10.0.0.2 | FORMAT_PING], \
-[0], [dnl
-20 packets transmitted, 20 received, 0% packet loss, time 0ms
-])
+NETNS_DAEMONIZE([alice1], [nc -l -k 80], [alice1.pid])
+NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
 
 # Ensure conntrack entry is present. We should not try to predict
 # the tunnel key for the output port, so we strip it from the labels
@@ -5968,7 +5965,7 @@  NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 10.0.0.2 | FORMAT_PING], \
 AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1) | \
 sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
 sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
-icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
 ])
 
 # Ensure datapaths show conntrack states as expected
@@ -5981,6 +5978,36 @@  AT_CHECK([ovs-appctl dpctl/dump-flows | grep 'ct_state(-new+est+rpl+trk).*ct_lab
 1
 ])
 
+# Flush conntrack entries for easier output parsing of next test.
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+
+# Change bob1 L2 address anche check the reply is properly updated.
+ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
+ovn-nbctl set Logical_Switch_Port r2-ext \
+     type=router options:router-port=R2_ext addresses='"00:00:10:01:02:04"'
+
+NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep 'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c], [0], [dnl
+1
+])
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep 'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0], [dnl
+1
+])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 | FORMAT_CT(172.16.0.1) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
+])
+
+# Check entries in table 76 and 77 expires w/o traffic
+OVS_WAIT_UNTIL([
+test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets') -eq 0
+])
+OVS_WAIT_UNTIL([
+test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets') -eq 0
+])
+
 ovs-ofctl dump-flows br-int
 
 OVS_APP_EXIT_AND_WAIT([ovn-controller])
@@ -6122,11 +6149,8 @@  ovn-nbctl --wait=hv sync
 
 on_exit 'ovs-ofctl dump-flows br-int'
 
-# 'bob1' should be able to ping 'alice1' directly.
-NS_CHECK_EXEC([bob1], [ping -q -c 20 -i 0.3 -w 15 fd01::2 | FORMAT_PING], \
-[0], [dnl
-20 packets transmitted, 20 received, 0% packet loss, time 0ms
-])
+NETNS_DAEMONIZE([alice1], [nc -6 -l -k 80], [alice1.pid])
+NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
 
 # Ensure datapaths show conntrack states as expected
 # Like with conntrack entries, we shouldn't try to predict
@@ -6145,7 +6169,38 @@  AT_CHECK([ovs-appctl dpctl/dump-flows | grep 'ct_state(-new+est+rpl+trk).*ct_lab
 AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2) | \
 sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
 sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
-icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x401020400000000
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x401020400000000,protoinfo=(state=<cleared>)
+])
+
+# Flush conntrack entries for easier output parsing of next test.
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+
+# Change bob1 L2 address anche check the reply is properly updated.
+ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
+ovn-nbctl set Logical_Switch_Port r2-ext \
+     type=router options:router-port=R2_ext addresses='"00:00:10:01:02:04"'
+
+NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 80], [0])
+
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep 'ct_state(+new-est-rpl+trk).*ct(.*label=0x1001020400000000/.*)' -c], [0], [dnl
+1
+])
+AT_CHECK([ovs-appctl dpctl/dump-flows | grep 'ct_state(-new+est+rpl+trk).*ct_label(0x1001020400000000)' -c], [0], [dnl
+1
+])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 0x1001020400000000 | FORMAT_CT(fd01::2) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/'], [0], [dnl
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x1001020400000000,protoinfo=(state=<cleared>)
+])
+
+# Check entries in table 76 and 77 expires w/o traffic
+OVS_WAIT_UNTIL([
+test $(ovs-ofctl dump-flows br-int | grep -c 'table=76, n_packets') -eq 0
+])
+OVS_WAIT_UNTIL([
+test $(ovs-ofctl dump-flows br-int | grep -c 'table=77, n_packets') -eq 0
 ])
 
 ovs-ofctl dump-flows br-int