diff mbox series

[ovs-dev,v2,3/3] ofctrl: Introduce ecmp_nexthop_monitor.

Message ID f116acec25ae92e7c280dddd3b6bd4ebf900291d.1710257650.git.lorenzo.bianconi@redhat.com
State Changes Requested
Headers show
Series Introduce ECMP_nexthop monitor in ovn-controller | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed

Commit Message

Lorenzo Bianconi March 12, 2024, 3:59 p.m. UTC
Introduce ecmp_nexthop_monitor in ovn-controller in order to track and
flush ecmp-symmetric reply ct entries when requested by the CMS (e.g.
removing the related static routes).

Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
---
 controller/ofctrl.c         | 101 ++++++++++++++
 controller/ofctrl.h         |   2 +
 controller/ovn-controller.c |   2 +
 tests/system-ovn-kmod.at    | 266 ++++++++++++++++++++++++++++++++++++
 tests/system-ovn.at         |   4 +
 5 files changed, 375 insertions(+)

Comments

Mark Michelson April 4, 2024, 6:46 p.m. UTC | #1
Hi Lorenzo,

The code looks fine to me, but I'm a bit confused by the new test.

My understanding of the new feature is that each ECMP nexthop has an ID 
associated with it. This ID gets placed in the ct.label. If the ECMP 
route is removed, then we can find the associated ID, and remove the 
conntrack entry that has this ID in its label.

In the test, the setup initially seems good. Alice is set up as a 
server, and Bob is set up as a client behind two routers. Bob sends 
traffic via R2 and we can see the conntrack entry that ensures traffic 
from Alice will get routed to R2 instead of R3. So far so good.

After this point, I'm a bit confused. I expected something like:
* Remove the ECMP route from R1 to R2 (or alter its nexthop address)
* Check conntrack to ensure the existing entry is removed.
* If possible, send new traffic from Bob to Alice and ensure that a new 
conntrack entry is created to show the updated ECMP symmetric reply.

Instead, the ECMP routes are never changed. Other configuration changes 
are made, and conntrack is flushed between each stage of the test. Am I 
missing something here?

On 3/12/24 11:59, Lorenzo Bianconi wrote:
> Introduce ecmp_nexthop_monitor in ovn-controller in order to track and
> flush ecmp-symmetric reply ct entries when requested by the CMS (e.g.
> removing the related static routes).
> 
> Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
> ---
>   controller/ofctrl.c         | 101 ++++++++++++++
>   controller/ofctrl.h         |   2 +
>   controller/ovn-controller.c |   2 +
>   tests/system-ovn-kmod.at    | 266 ++++++++++++++++++++++++++++++++++++
>   tests/system-ovn.at         |   4 +
>   5 files changed, 375 insertions(+)
> 
> diff --git a/controller/ofctrl.c b/controller/ofctrl.c
> index f14cd79a8..a10b0f567 100644
> --- a/controller/ofctrl.c
> +++ b/controller/ofctrl.c
> @@ -388,9 +388,24 @@ struct meter_band_entry {
>   
>   static struct shash meter_bands;
>   
> +static struct hmap ecmp_nexthop_map;
> +struct ecmp_nexthop_entry {
> +    struct hmap_node node;
> +    bool erase;
> +
> +    char *nexthop;
> +    int id;
> +};
> +
>   static void ofctrl_meter_bands_destroy(void);
>   static void ofctrl_meter_bands_clear(void);
>   
> +static void ecmp_nexthop_monitor_destroy(void);
> +static void ecmp_nexthop_monitor_run(
> +        const struct sbrec_ecmp_nexthop_table *enh_table,
> +        struct ovs_list *msgs);
> +
> +
>   /* MFF_* field ID for our Geneve option.  In S_TLV_TABLE_MOD_SENT, this is
>    * the option we requested (we don't know whether we obtained it yet).  In
>    * S_CLEAR_FLOWS or S_UPDATE_FLOWS, this is really the option we have. */
> @@ -429,6 +444,7 @@ ofctrl_init(struct ovn_extend_table *group_table,
>       groups = group_table;
>       meters = meter_table;
>       shash_init(&meter_bands);
> +    hmap_init(&ecmp_nexthop_map);
>   }
>   
>   /* S_NEW, for a new connection.
> @@ -883,6 +899,7 @@ ofctrl_destroy(void)
>       expr_symtab_destroy(&symtab);
>       shash_destroy(&symtab);
>       ofctrl_meter_bands_destroy();
> +    ecmp_nexthop_monitor_destroy();
>   }
>   
>   uint64_t
> @@ -2306,6 +2323,87 @@ add_meter(struct ovn_extend_table_info *m_desired,
>       ofctrl_meter_bands_alloc(sb_meter, m_desired, msgs);
>   }
>   
> +static void
> +ecmp_nexthop_monitor_free_entry(struct ecmp_nexthop_entry *e,
> +                                struct ovs_list *msgs)
> +{
> +    if (msgs) {
> +        ovs_u128 mask = {
> +            /* ct_labels.label BITS[96-127] */
> +            .u64.hi = 0xffffffff00000000,
> +        };
> +        uint64_t id = e->id;
> +        ovs_u128 nexthop = {
> +            .u64.hi = id << 32,
> +        };
> +        struct ofp_ct_match match = {
> +            .labels = nexthop,
> +            .labels_mask = mask,
> +        };
> +        struct ofpbuf *msg = ofp_ct_match_encode(&match, NULL,
> +                                                 rconn_get_version(swconn));
> +        ovs_list_push_back(msgs, &msg->list_node);
> +    }
> +    free(e->nexthop);
> +    free(e);
> +}
> +
> +static void
> +ecmp_nexthop_monitor_destroy(void)
> +{
> +    struct ecmp_nexthop_entry *e;
> +    HMAP_FOR_EACH_POP (e, node, &ecmp_nexthop_map) {
> +        ecmp_nexthop_monitor_free_entry(e, NULL);
> +    }
> +    hmap_destroy(&ecmp_nexthop_map);
> +}
> +
> +static struct ecmp_nexthop_entry *
> +ecmp_nexthop_monitor_lookup(char *nexthop)
> +{
> +    uint32_t hash = hash_string(nexthop, 0);
> +    struct ecmp_nexthop_entry *e;
> +
> +    HMAP_FOR_EACH_WITH_HASH (e, node, hash, &ecmp_nexthop_map) {
> +        if (!strcmp(e->nexthop, nexthop)) {
> +            return e;
> +        }
> +    }
> +    return NULL;
> +}
> +
> +static void
> +ecmp_nexthop_monitor_run(const struct sbrec_ecmp_nexthop_table *enh_table,
> +                         struct ovs_list *msgs)
> +{
> +    struct ecmp_nexthop_entry *e;
> +    HMAP_FOR_EACH (e, node, &ecmp_nexthop_map) {
> +        e->erase = true;
> +    }
> +
> +    const struct sbrec_ecmp_nexthop *sbrec_ecmp_nexthop;
> +    SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sbrec_ecmp_nexthop, enh_table) {
> +        e = ecmp_nexthop_monitor_lookup(sbrec_ecmp_nexthop->nexthop);
> +        if (!e) {
> +            e = xzalloc(sizeof *e);
> +            e->nexthop = xstrdup(sbrec_ecmp_nexthop->nexthop);
> +            e->id = sbrec_ecmp_nexthop->id;
> +            uint32_t hash = hash_string(e->nexthop, 0);
> +            hmap_insert(&ecmp_nexthop_map, &e->node, hash);
> +        } else {
> +            e->erase = false;
> +        }
> +    }
> +
> +    HMAP_FOR_EACH_SAFE (e, node, &ecmp_nexthop_map) {
> +        if (e->erase) {
> +            hmap_remove(&ecmp_nexthop_map, &e->node);
> +            ecmp_nexthop_monitor_free_entry(e, msgs);
> +        }
> +    }
> +
> +}
> +
>   static void
>   installed_flow_add(struct ovn_flow *d,
>                      struct ofputil_bundle_ctrl_msg *bc,
> @@ -2664,6 +2762,7 @@ ofctrl_put(struct ovn_desired_flow_table *lflow_table,
>              struct shash *pending_ct_zones,
>              struct hmap *pending_lb_tuples,
>              struct ovsdb_idl_index *sbrec_meter_by_name,
> +           const struct sbrec_ecmp_nexthop_table *enh_table,
>              uint64_t req_cfg,
>              bool lflows_changed,
>              bool pflows_changed)
> @@ -2704,6 +2803,8 @@ ofctrl_put(struct ovn_desired_flow_table *lflow_table,
>       /* OpenFlow messages to send to the switch to bring it up-to-date. */
>       struct ovs_list msgs = OVS_LIST_INITIALIZER(&msgs);
>   
> +    ecmp_nexthop_monitor_run(enh_table, &msgs);
> +
>       /* Iterate through ct zones that need to be flushed. */
>       struct shash_node *iter;
>       SHASH_FOR_EACH(iter, pending_ct_zones) {
> diff --git a/controller/ofctrl.h b/controller/ofctrl.h
> index 502c73da6..e08b354f4 100644
> --- a/controller/ofctrl.h
> +++ b/controller/ofctrl.h
> @@ -31,6 +31,7 @@ struct ofpbuf;
>   struct ovsrec_bridge;
>   struct ovsrec_open_vswitch_table;
>   struct sbrec_meter_table;
> +struct sbrec_ecmp_nexthop_table;
>   struct shash;
>   
>   struct ovn_desired_flow_table {
> @@ -59,6 +60,7 @@ void ofctrl_put(struct ovn_desired_flow_table *lflow_table,
>                   struct shash *pending_ct_zones,
>                   struct hmap *pending_lb_tuples,
>                   struct ovsdb_idl_index *sbrec_meter_by_name,
> +                const struct sbrec_ecmp_nexthop_table *enh_table,
>                   uint64_t nb_cfg,
>                   bool lflow_changed,
>                   bool pflow_changed);
> diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> index 1c9960c70..28cac2683 100644
> --- a/controller/ovn-controller.c
> +++ b/controller/ovn-controller.c
> @@ -5945,6 +5945,8 @@ main(int argc, char *argv[])
>                                      &ct_zones_data->pending,
>                                      &lb_data->removed_tuples,
>                                      sbrec_meter_by_name,
> +                                   sbrec_ecmp_nexthop_table_get(
> +                                        ovnsb_idl_loop.idl),
>                                      ofctrl_seqno_get_req_cfg(),
>                                      engine_node_changed(&en_lflow_output),
>                                      engine_node_changed(&en_pflow_output));
> diff --git a/tests/system-ovn-kmod.at b/tests/system-ovn-kmod.at
> index 14fe4ecec..88c196874 100644
> --- a/tests/system-ovn-kmod.at
> +++ b/tests/system-ovn-kmod.at
> @@ -1054,3 +1054,269 @@ OVS_TRAFFIC_VSWITCHD_STOP(["
>   "])
>   AT_CLEANUP
>   ])
> +
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([ECMP symmetric reply - kmod])
> +AT_KEYWORDS([ecmp])
> +
> +CHECK_CONNTRACK()
> +ovn_start
> +
> +OVS_TRAFFIC_VSWITCHD_START()
> +ADD_BR([br-int])
> +
> +# Set external-ids in br-int needed for ovn-controller
> +ovs-vsctl \
> +        -- set Open_vSwitch . external-ids:system-id=hv1 \
> +        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
> +        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
> +        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
> +        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
> +
> +# Start ovn-controller
> +start_daemon ovn-controller
> +
> +# Logical network:
> +# Alice is connected to gateway router R1. R1 is connected to two "external"
> +# routers, R2 and R3 via an "ext" switch.
> +# Bob is connected to both R2 and R3. R1 contains two ECMP routes, one through R2
> +# and one through R3, to Bob.
> +#
> +#     alice -- R1 -- ext ---- R2
> +#                     |         \
> +#                     |           bob
> +#                     |         /
> +#                     + ----- R3
> +#
> +# For this test, Bob sends request traffic through R2 to Alice. We want to ensure that
> +# all response traffic from Alice is routed through R2 as well.
> +
> +ovn-nbctl create Logical_Router name=R1 options:chassis=hv1
> +ovn-nbctl create Logical_Router name=R2
> +ovn-nbctl create Logical_Router name=R3
> +
> +ovn-nbctl ls-add alice
> +ovn-nbctl ls-add bob
> +ovn-nbctl ls-add ext
> +
> +# connect alice to R1
> +ovn-nbctl lrp-add R1 alice 00:00:01:01:02:03 10.0.0.1/24 fd01::1/64
> +ovn-nbctl lsp-add alice rp-alice -- set Logical_Switch_Port rp-alice \
> +    type=router options:router-port=alice addresses='"00:00:01:01:02:03"'
> +
> +# connect bob to R2
> +ovn-nbctl lrp-add R2 R2_bob 00:00:02:01:02:03 172.16.0.2/16 fd07::2/64
> +ovn-nbctl lsp-add bob rp2-bob -- set Logical_Switch_Port rp2-bob \
> +    type=router options:router-port=R2_bob addresses='"00:00:02:01:02:03"'
> +
> +# connect bob to R3
> +ovn-nbctl lrp-add R3 R3_bob 00:00:02:01:02:04 172.16.0.3/16 fd07::3/64
> +ovn-nbctl lsp-add bob rp3-bob -- set Logical_Switch_Port rp3-bob \
> +    type=router options:router-port=R3_bob addresses='"00:00:02:01:02:04"'
> +
> +# Connect R1 to ext
> +ovn-nbctl lrp-add R1 R1_ext 00:00:04:01:02:03 20.0.0.1/24 fd02::1/64
> +ovn-nbctl lsp-add ext r1-ext -- set Logical_Switch_Port r1-ext \
> +    type=router options:router-port=R1_ext addresses='"00:00:04:01:02:03"'
> +
> +# Connect R2 to ext
> +ovn-nbctl lrp-add R2 R2_ext 00:00:04:01:02:04 20.0.0.2/24 fd02::2/64
> +ovn-nbctl lsp-add ext r2-ext -- set Logical_Switch_Port r2-ext \
> +    type=router options:router-port=R2_ext addresses='"00:00:04:01:02:04"'
> +
> +# Connect R3 to ext
> +ovn-nbctl lrp-add R3 R3_ext 00:00:04:01:02:05 20.0.0.3/24 fd02::3/64
> +ovn-nbctl lsp-add ext r3-ext -- set Logical_Switch_Port r3-ext \
> +    type=router options:router-port=R3_ext addresses='"00:00:04:01:02:05"'
> +
> +# Install ECMP routes for alice.
> +ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 10.0.0.0/24 20.0.0.2
> +ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 10.0.0.0/24 20.0.0.3
> +
> +# Static Routes
> +ovn-nbctl lr-route-add R2 10.0.0.0/24 20.0.0.1
> +ovn-nbctl lr-route-add R3 10.0.0.0/24 20.0.0.1
> +
> +# Logical port 'alice1' in switch 'alice'.
> +ADD_NAMESPACES(alice1)
> +# Only send 1 router solicitation as any additional ones can cause datapath
> +# flows to get evicted, causing unexpected failures below.
> +NS_CHECK_EXEC([alice1], [sysctl -w net.ipv6.conf.default.router_solicitations=1], [0], [dnl
> +net.ipv6.conf.default.router_solicitations = 1
> +])
> +ADD_VETH(alice1, alice1, br-int, "10.0.0.2/24", "f0:00:00:01:02:04", \
> +         "10.0.0.1")
> +NS_CHECK_EXEC([alice1], [ip -6 addr add fd01::2/64 dev alice1 nodad])
> +NS_CHECK_EXEC([alice1], [ip -6 route add default via fd01::1])
> +NS_CHECK_EXEC([alice1], [ip -6 neigh add fd01::1 lladdr 00:00:01:01:02:03 dev alice1], [0])
> +ovn-nbctl lsp-add alice alice1 \
> +-- lsp-set-addresses alice1 "f0:00:00:01:02:04 10.0.0.2 fd01::2"
> +
> +# Logical port 'bob1' in switch 'bob'.
> +ADD_NAMESPACES(bob1)
> +# Only send 1 router solicitation as any additional ones can cause datapath
> +# flows to get evicted, causing unexpected failures below.
> +NS_CHECK_EXEC([bob1], [sysctl -w net.ipv6.conf.default.router_solicitations=1], [0], [dnl
> +net.ipv6.conf.default.router_solicitations = 1
> +])
> +ADD_VETH(bob1, bob1, br-int, "172.16.0.1/16", "f0:00:00:01:02:06", \
> +         "172.16.0.2")
> +NS_CHECK_EXEC([bob1], [ip -6 addr add fd07::1/64 dev bob1 nodad])
> +NS_CHECK_EXEC([bob1], [ip -6 route add default via fd07::2])
> +NS_CHECK_EXEC([bob1], [ip -6 neigh add fd07::2 lladdr 00:00:02:01:02:03 dev bob1])
> +NS_CHECK_EXEC([bob1], [ip -6 neigh add fd07::3 lladdr 00:00:01:01:02:04 dev bob1])
> +
> +# Add neighbour MAC addresses to avoid sending IPv6 NS messages which could
> +# cause datapath flows to be evicted
> +ovn-nbctl lsp-add bob bob1 \
> +-- lsp-set-addresses bob1 "f0:00:00:01:02:06 172.16.0.1 fd07::1"
> +
> +# Ensure ovn-controller is caught up
> +ovn-nbctl --wait=hv sync
> +
> +on_exit 'ovs-ofctl dump-flows br-int'
> +
> +NETNS_DAEMONIZE([alice1], [nc -l -k 80], [alice1.pid])
> +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
> +NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.2 | FORMAT_PING], \
> +[0], [dnl
> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> +])
> +
> +# Ensure conntrack entry is present. We should not try to predict
> +# the tunnel key for the output port, so we strip it from the labels
> +# and just ensure that the known ethernet address is present.
> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1) | \
> +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> +sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
> +sed -e 's/labels=0x[[0-9]]/labels=0x?/'], [0], [dnl
> +icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000000401020400000000
> +tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000000401020400000000,protoinfo=(state=<cleared>)
> +])
> +
> +# Ensure datapaths show conntrack states as expected
> +# Like with conntrack entries, we shouldn't try to predict
> +# port binding tunnel keys. So omit them from expected labels.
> +AT_CHECK([ovs-appctl dpctl/dump-flows | sed -e 's/label=0x[[0-9]]/label=0x?/' | \
> +grep 'ct_state(+new-est-rpl+trk).*ct(.*label=0x?000000000401020400000000/.*)' -c], [0], [dnl
> +2
> +])
> +AT_CHECK([[ovs-appctl dpctl/dump-flows | sed -e 's/ct_label(0x[0-9]/ct_label(0x?/' | \
> +grep 'ct_state(-new+est+rpl+trk).*ct_label(0x?000000000401020400000000)' -c]], [0], [dnl
> +2
> +])
> +
> +# Flush conntrack entries for easier output parsing of next test.
> +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> +# Change bob1 L2 address and check the reply is properly updated.
> +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> +ovn-nbctl set Logical_Switch_Port r2-ext \
> +     type=router options:router-port=R2_ext addresses='"00:00:10:01:02:04"'
> +
> +# Wait for ovn-controller before sending traffic
> +ovn-nbctl --wait=hv sync
> +
> +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
> +NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.2 | FORMAT_PING], \
> +[0], [dnl
> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> +])
> +AT_CHECK([ovs-appctl dpctl/dump-flows | sed -e 's/label=0x[[0-9]]/label=0x?/' | \
> +grep 'ct_state(+new-est-rpl+trk).*ct(.*label=0x?000000001001020400000000/.*)' -c], [0], [dnl
> +2
> +])
> +AT_CHECK([[ovs-appctl dpctl/dump-flows | sed -e 's/ct_label(0x[0-9]/ct_label(0x?/' | \
> +grep 'ct_state(-new+est+rpl+trk).*ct_label(0x?000000001001020400000000)' -c]], [0], [dnl
> +2
> +])
> +
> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 1001020400000000 | FORMAT_CT(172.16.0.1) | \
> +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> +sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
> +sed -e 's/labels=0x[[0-9]]/labels=0x?/' | sort], [0], [dnl
> +icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
> +tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
> +])
> +# Check that entries in tables 76 and 77 expire without traffic
> +OVS_WAIT_UNTIL([
> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH_MAC, n_packets') -eq 0
> +])
> +OVS_WAIT_UNTIL([
> +test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH, n_packets') -eq 0
> +])
> +
> +# Flush connection tracking entries
> +ovn-nbctl --wait=hv lr-route-del R1
> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1)])
> +
> +# Install ECMP routes for alice.
> +ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 fd01::/126 fd02::2
> +ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 fd01::/126 fd02::3
> +
> +# Static Routes
> +ovn-nbctl lr-route-add R2 fd01::/64 fd02::1
> +ovn-nbctl lr-route-add R3 fd01::/64 fd02::1
> +
> +NETNS_DAEMONIZE([alice1], [nc -6 -l -k 8080], [alice2.pid])
> +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 8080], [0])
> +NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 fd01::2 | FORMAT_PING], \
> +[0], [dnl
> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> +])
> +
> +# Ensure conntrack entry is present. We should not try to predict
> +# the tunnel key for the output port, so we strip it from the labels
> +# and just ensure that the known ethernet address is present.
> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2) | \
> +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> +sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
> +sed -e 's/labels=0x[[0-9]]/labels=0x?/' | sort], [0], [dnl
> +icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
> +tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
> +])
> +
> +# Flush conntrack entries for easier output parsing of next test.
> +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> +
> +# Change bob1 L2 address and check the reply is properly updated.
> +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> +ovn-nbctl --wait=hv set Logical_Switch_Port r2-ext \
> +     type=router options:router-port=R2_ext addresses='"00:00:10:01:02:04"'
> +
> +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 8080], [0])
> +NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 fd01::2 | FORMAT_PING], \
> +[0], [dnl
> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> +])
> +
> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 1001020400000000 | FORMAT_CT(fd01::2) | \
> +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> +sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
> +sed -e 's/labels=0x[[0-9]]/labels=0x?/'], [0], [dnl
> +icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
> +tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
> +])
> +
> +# Flush connection tracking entries
> +ovn-nbctl --wait=hv lr-route-del R1
> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2)])
> +
> +ovs-ofctl dump-flows br-int
> +
> +OVS_APP_EXIT_AND_WAIT([ovn-controller])
> +
> +as ovn-sb
> +OVS_APP_EXIT_AND_WAIT([ovsdb-server])
> +
> +as ovn-nb
> +OVS_APP_EXIT_AND_WAIT([ovsdb-server])
> +
> +as northd
> +OVS_APP_EXIT_AND_WAIT([ovn-northd])
> +
> +as
> +OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
> +/connection dropped.*/d"])
> +
> +AT_CLEANUP
> +])
> diff --git a/tests/system-ovn.at b/tests/system-ovn.at
> index 7ae54113a..a0f375141 100644
> --- a/tests/system-ovn.at
> +++ b/tests/system-ovn.at
> @@ -6178,6 +6178,10 @@ OVS_WAIT_UNTIL([
>   test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH, n_packets') -eq 0
>   ])
>   
> +# Flush connection tracking entries
> +ovn-nbctl --wait=hv lr-route-del R1
> +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1)])
> +
>   ovs-ofctl dump-flows br-int
>   
>   OVS_APP_EXIT_AND_WAIT([ovn-controller])
Lorenzo Bianconi April 15, 2024, 10:55 a.m. UTC | #2
> Hi Lorenzo,

Hi Mark,

Thanks for the review.

> 
> The code looks fine to me, but I'm a bit confused by the new test.
> 
> My understanding of the new feature is that each ECMP nexthop has an ID
> associated with it. This ID gets placed in the ct.label. If the ECMP route
> is removed, then we can find the associated ID, and remove the conntrack
> entry that has this ID in its label.

correct.

> 
> In the test, the setup initially seems good. Alice is set up as a server,
> and Bob is set up as a client behind two routers. Bob sends traffic via R2
> and we can see the conntrack entry that ensures traffic from Alice will get
> routed to R2 instead of R3. So far so good.
> 
> After this point, I'm a bit confused. I expected something like:
> * Remove the ECMP route from R1 to R2 (or alter its nexthop address)

I remove all the static routes in R1 at the end of the IPv{4,6} test and then
I check we do not have any related entry in the ct table.

$ovn-nbctl --wait=hv lr-route-del R1
AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1)])

Am I missing something?

I change the R2_ext MAC address during the test. Maybe we can get rid of the ct
flushing there; I guess it is just a leftover from the previous test.

> * Check conntrack to ensure the existing entry is removed.
> * If possible, send new traffic from Bob to Alice and ensure that a new
> conntrack entry is created to show the updated ECMP symmetric reply.

We already do that, or am I missing something?

Regards,
Lorenzo

> 
> Instead, the ECMP routes are never changed. Other configuration changes are
> made, and conntrack is flushed between each stage of the test. Am I missing
> something here?
> 
> On 3/12/24 11:59, Lorenzo Bianconi wrote:
> > Introduce ecmp_nexthop_monitor in ovn-controller in order to track and
> > flush ecmp-symmetric reply ct entries when requested by the CMS (e.g.
> > removing the related static routes).
> > 
> > Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
> > ---
> >   controller/ofctrl.c         | 101 ++++++++++++++
> >   controller/ofctrl.h         |   2 +
> >   controller/ovn-controller.c |   2 +
> >   tests/system-ovn-kmod.at    | 266 ++++++++++++++++++++++++++++++++++++
> >   tests/system-ovn.at         |   4 +
> >   5 files changed, 375 insertions(+)
> > 
> > diff --git a/controller/ofctrl.c b/controller/ofctrl.c
> > index f14cd79a8..a10b0f567 100644
> > --- a/controller/ofctrl.c
> > +++ b/controller/ofctrl.c
> > @@ -388,9 +388,24 @@ struct meter_band_entry {
> >   static struct shash meter_bands;
> > +static struct hmap ecmp_nexthop_map;
> > +struct ecmp_nexthop_entry {
> > +    struct hmap_node node;
> > +    bool erase;
> > +
> > +    char *nexthop;
> > +    int id;
> > +};
> > +
> >   static void ofctrl_meter_bands_destroy(void);
> >   static void ofctrl_meter_bands_clear(void);
> > +static void ecmp_nexthop_monitor_destroy(void);
> > +static void ecmp_nexthop_monitor_run(
> > +        const struct sbrec_ecmp_nexthop_table *enh_table,
> > +        struct ovs_list *msgs);
> > +
> > +
> >   /* MFF_* field ID for our Geneve option.  In S_TLV_TABLE_MOD_SENT, this is
> >    * the option we requested (we don't know whether we obtained it yet).  In
> >    * S_CLEAR_FLOWS or S_UPDATE_FLOWS, this is really the option we have. */
> > @@ -429,6 +444,7 @@ ofctrl_init(struct ovn_extend_table *group_table,
> >       groups = group_table;
> >       meters = meter_table;
> >       shash_init(&meter_bands);
> > +    hmap_init(&ecmp_nexthop_map);
> >   }
> >   
> >   /* S_NEW, for a new connection.
> > @@ -883,6 +899,7 @@ ofctrl_destroy(void)
> >       expr_symtab_destroy(&symtab);
> >       shash_destroy(&symtab);
> >       ofctrl_meter_bands_destroy();
> > +    ecmp_nexthop_monitor_destroy();
> >   }
> >   uint64_t
> > @@ -2306,6 +2323,87 @@ add_meter(struct ovn_extend_table_info *m_desired,
> >       ofctrl_meter_bands_alloc(sb_meter, m_desired, msgs);
> >   }
> > +static void
> > +ecmp_nexthop_monitor_free_entry(struct ecmp_nexthop_entry *e,
> > +                                struct ovs_list *msgs)
> > +{
> > +    if (msgs) {
> > +        ovs_u128 mask = {
> > +            /* ct_labels.label BITS[96-127] */
> > +            .u64.hi = 0xffffffff00000000,
> > +        };
> > +        uint64_t id = e->id;
> > +        ovs_u128 nexthop = {
> > +            .u64.hi = id << 32,
> > +        };
> > +        struct ofp_ct_match match = {
> > +            .labels = nexthop,
> > +            .labels_mask = mask,
> > +        };
> > +        struct ofpbuf *msg = ofp_ct_match_encode(&match, NULL,
> > +                                                 rconn_get_version(swconn));
> > +        ovs_list_push_back(msgs, &msg->list_node);
> > +    }
> > +    free(e->nexthop);
> > +    free(e);
> > +}
> > +
> > +static void
> > +ecmp_nexthop_monitor_destroy(void)
> > +{
> > +    struct ecmp_nexthop_entry *e;
> > +    HMAP_FOR_EACH_POP (e, node, &ecmp_nexthop_map) {
> > +        ecmp_nexthop_monitor_free_entry(e, NULL);
> > +    }
> > +    hmap_destroy(&ecmp_nexthop_map);
> > +}
> > +
> > +static struct ecmp_nexthop_entry *
> > +ecmp_nexthop_monitor_lookup(char *nexthop)
> > +{
> > +    uint32_t hash = hash_string(nexthop, 0);
> > +    struct ecmp_nexthop_entry *e;
> > +
> > +    HMAP_FOR_EACH_WITH_HASH (e, node, hash, &ecmp_nexthop_map) {
> > +        if (!strcmp(e->nexthop, nexthop)) {
> > +            return e;
> > +        }
> > +    }
> > +    return NULL;
> > +}
> > +
> > +static void
> > +ecmp_nexthop_monitor_run(const struct sbrec_ecmp_nexthop_table *enh_table,
> > +                         struct ovs_list *msgs)
> > +{
> > +    struct ecmp_nexthop_entry *e;
> > +    HMAP_FOR_EACH (e, node, &ecmp_nexthop_map) {
> > +        e->erase = true;
> > +    }
> > +
> > +    const struct sbrec_ecmp_nexthop *sbrec_ecmp_nexthop;
> > +    SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sbrec_ecmp_nexthop, enh_table) {
> > +        e = ecmp_nexthop_monitor_lookup(sbrec_ecmp_nexthop->nexthop);
> > +        if (!e) {
> > +            e = xzalloc(sizeof *e);
> > +            e->nexthop = xstrdup(sbrec_ecmp_nexthop->nexthop);
> > +            e->id = sbrec_ecmp_nexthop->id;
> > +            uint32_t hash = hash_string(e->nexthop, 0);
> > +            hmap_insert(&ecmp_nexthop_map, &e->node, hash);
> > +        } else {
> > +            e->erase = false;
> > +        }
> > +    }
> > +
> > +    HMAP_FOR_EACH_SAFE (e, node, &ecmp_nexthop_map) {
> > +        if (e->erase) {
> > +            hmap_remove(&ecmp_nexthop_map, &e->node);
> > +            ecmp_nexthop_monitor_free_entry(e, msgs);
> > +        }
> > +    }
> > +
> > +}
> > +
> >   static void
> >   installed_flow_add(struct ovn_flow *d,
> >                      struct ofputil_bundle_ctrl_msg *bc,
> > @@ -2664,6 +2762,7 @@ ofctrl_put(struct ovn_desired_flow_table *lflow_table,
> >              struct shash *pending_ct_zones,
> >              struct hmap *pending_lb_tuples,
> >              struct ovsdb_idl_index *sbrec_meter_by_name,
> > +           const struct sbrec_ecmp_nexthop_table *enh_table,
> >              uint64_t req_cfg,
> >              bool lflows_changed,
> >              bool pflows_changed)
> > @@ -2704,6 +2803,8 @@ ofctrl_put(struct ovn_desired_flow_table *lflow_table,
> >       /* OpenFlow messages to send to the switch to bring it up-to-date. */
> >       struct ovs_list msgs = OVS_LIST_INITIALIZER(&msgs);
> > +    ecmp_nexthop_monitor_run(enh_table, &msgs);
> > +
> >       /* Iterate through ct zones that need to be flushed. */
> >       struct shash_node *iter;
> >       SHASH_FOR_EACH(iter, pending_ct_zones) {
> > diff --git a/controller/ofctrl.h b/controller/ofctrl.h
> > index 502c73da6..e08b354f4 100644
> > --- a/controller/ofctrl.h
> > +++ b/controller/ofctrl.h
> > @@ -31,6 +31,7 @@ struct ofpbuf;
> >   struct ovsrec_bridge;
> >   struct ovsrec_open_vswitch_table;
> >   struct sbrec_meter_table;
> > +struct sbrec_ecmp_nexthop_table;
> >   struct shash;
> >   struct ovn_desired_flow_table {
> > @@ -59,6 +60,7 @@ void ofctrl_put(struct ovn_desired_flow_table *lflow_table,
> >                   struct shash *pending_ct_zones,
> >                   struct hmap *pending_lb_tuples,
> >                   struct ovsdb_idl_index *sbrec_meter_by_name,
> > +                const struct sbrec_ecmp_nexthop_table *enh_table,
> >                   uint64_t nb_cfg,
> >                   bool lflow_changed,
> >                   bool pflow_changed);
> > diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
> > index 1c9960c70..28cac2683 100644
> > --- a/controller/ovn-controller.c
> > +++ b/controller/ovn-controller.c
> > @@ -5945,6 +5945,8 @@ main(int argc, char *argv[])
> >                                      &ct_zones_data->pending,
> >                                      &lb_data->removed_tuples,
> >                                      sbrec_meter_by_name,
> > +                                   sbrec_ecmp_nexthop_table_get(
> > +                                        ovnsb_idl_loop.idl),
> >                                      ofctrl_seqno_get_req_cfg(),
> >                                      engine_node_changed(&en_lflow_output),
> >                                      engine_node_changed(&en_pflow_output));
> > diff --git a/tests/system-ovn-kmod.at b/tests/system-ovn-kmod.at
> > index 14fe4ecec..88c196874 100644
> > --- a/tests/system-ovn-kmod.at
> > +++ b/tests/system-ovn-kmod.at
> > @@ -1054,3 +1054,269 @@ OVS_TRAFFIC_VSWITCHD_STOP(["
> >   "])
> >   AT_CLEANUP
> >   ])
> > +
> > +OVN_FOR_EACH_NORTHD([
> > +AT_SETUP([ECMP symmetric reply - kmod])
> > +AT_KEYWORDS([ecmp])
> > +
> > +CHECK_CONNTRACK()
> > +ovn_start
> > +
> > +OVS_TRAFFIC_VSWITCHD_START()
> > +ADD_BR([br-int])
> > +
> > +# Set external-ids in br-int needed for ovn-controller
> > +ovs-vsctl \
> > +        -- set Open_vSwitch . external-ids:system-id=hv1 \
> > +        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
> > +        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
> > +        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
> > +        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
> > +
> > +# Start ovn-controller
> > +start_daemon ovn-controller
> > +
> > +# Logical network:
> > +# Alice is connected to gateway router R1. R1 is connected to two "external"
> > +# routers, R2 and R3 via an "ext" switch.
> > +# Bob is connected to both R2 and R3. R1 contains two ECMP routes, one through R2
> > +# and one through R3, to Bob.
> > +#
> > +#     alice -- R1 -- ext ---- R2
> > +#                     |         \
> > +#                     |           bob
> > +#                     |         /
> > +#                     + ----- R3
> > +#
> > +# For this test, Bob sends request traffic through R2 to Alice. We want to ensure that
> > +# all response traffic from Alice is routed through R2 as well.
> > +
> > +ovn-nbctl create Logical_Router name=R1 options:chassis=hv1
> > +ovn-nbctl create Logical_Router name=R2
> > +ovn-nbctl create Logical_Router name=R3
> > +
> > +ovn-nbctl ls-add alice
> > +ovn-nbctl ls-add bob
> > +ovn-nbctl ls-add ext
> > +
> > +# connect alice to R1
> > +ovn-nbctl lrp-add R1 alice 00:00:01:01:02:03 10.0.0.1/24 fd01::1/64
> > +ovn-nbctl lsp-add alice rp-alice -- set Logical_Switch_Port rp-alice \
> > +    type=router options:router-port=alice addresses='"00:00:01:01:02:03"'
> > +
> > +# connect bob to R2
> > +ovn-nbctl lrp-add R2 R2_bob 00:00:02:01:02:03 172.16.0.2/16 fd07::2/64
> > +ovn-nbctl lsp-add bob rp2-bob -- set Logical_Switch_Port rp2-bob \
> > +    type=router options:router-port=R2_bob addresses='"00:00:02:01:02:03"'
> > +
> > +# connect bob to R3
> > +ovn-nbctl lrp-add R3 R3_bob 00:00:02:01:02:04 172.16.0.3/16 fd07::3/64
> > +ovn-nbctl lsp-add bob rp3-bob -- set Logical_Switch_Port rp3-bob \
> > +    type=router options:router-port=R3_bob addresses='"00:00:02:01:02:04"'
> > +
> > +# Connect R1 to ext
> > +ovn-nbctl lrp-add R1 R1_ext 00:00:04:01:02:03 20.0.0.1/24 fd02::1/64
> > +ovn-nbctl lsp-add ext r1-ext -- set Logical_Switch_Port r1-ext \
> > +    type=router options:router-port=R1_ext addresses='"00:00:04:01:02:03"'
> > +
> > +# Connect R2 to ext
> > +ovn-nbctl lrp-add R2 R2_ext 00:00:04:01:02:04 20.0.0.2/24 fd02::2/64
> > +ovn-nbctl lsp-add ext r2-ext -- set Logical_Switch_Port r2-ext \
> > +    type=router options:router-port=R2_ext addresses='"00:00:04:01:02:04"'
> > +
> > +# Connect R3 to ext
> > +ovn-nbctl lrp-add R3 R3_ext 00:00:04:01:02:05 20.0.0.3/24 fd02::3/64
> > +ovn-nbctl lsp-add ext r3-ext -- set Logical_Switch_Port r3-ext \
> > +    type=router options:router-port=R3_ext addresses='"00:00:04:01:02:05"'
> > +
> > +# Install ECMP routes for alice.
> > +ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 10.0.0.0/24 20.0.0.2
> > +ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 10.0.0.0/24 20.0.0.3
> > +
> > +# Static Routes
> > +ovn-nbctl lr-route-add R2 10.0.0.0/24 20.0.0.1
> > +ovn-nbctl lr-route-add R3 10.0.0.0/24 20.0.0.1
> > +
> > +# Logical port 'alice1' in switch 'alice'.
> > +ADD_NAMESPACES(alice1)
> > +# Only send 1 router solicitation as any additional ones can cause datapath
> > +# flows to get evicted, causing unexpected failures below.
> > +NS_CHECK_EXEC([alice1], [sysctl -w net.ipv6.conf.default.router_solicitations=1], [0], [dnl
> > +net.ipv6.conf.default.router_solicitations = 1
> > +])
> > +ADD_VETH(alice1, alice1, br-int, "10.0.0.2/24", "f0:00:00:01:02:04", \
> > +         "10.0.0.1")
> > +NS_CHECK_EXEC([alice1], [ip -6 addr add fd01::2/64 dev alice1 nodad])
> > +NS_CHECK_EXEC([alice1], [ip -6 route add default via fd01::1])
> > +NS_CHECK_EXEC([alice1], [ip -6 neigh add fd01::1 lladdr 00:00:01:01:02:03 dev alice1], [0])
> > +ovn-nbctl lsp-add alice alice1 \
> > +-- lsp-set-addresses alice1 "f0:00:00:01:02:04 10.0.0.2 fd01::2"
> > +
> > +# Logical port 'bob1' in switch 'bob'.
> > +ADD_NAMESPACES(bob1)
> > +# Only send 1 router solicitation as any additional ones can cause datapath
> > +# flows to get evicted, causing unexpected failures below.
> > +NS_CHECK_EXEC([bob1], [sysctl -w net.ipv6.conf.default.router_solicitations=1], [0], [dnl
> > +net.ipv6.conf.default.router_solicitations = 1
> > +])
> > +ADD_VETH(bob1, bob1, br-int, "172.16.0.1/16", "f0:00:00:01:02:06", \
> > +         "172.16.0.2")
> > +NS_CHECK_EXEC([bob1], [ip -6 addr add fd07::1/64 dev bob1 nodad])
> > +NS_CHECK_EXEC([bob1], [ip -6 route add default via fd07::2])
> > +NS_CHECK_EXEC([bob1], [ip -6 neigh add fd07::2 lladdr 00:00:02:01:02:03 dev bob1])
> > +NS_CHECK_EXEC([bob1], [ip -6 neigh add fd07::3 lladdr 00:00:01:01:02:04 dev bob1])
> > +
> > +# Add neighbour MAC addresses to avoid sending IPv6 NS messages which could
> > +# cause datapath flows to be evicted
> > +ovn-nbctl lsp-add bob bob1 \
> > +-- lsp-set-addresses bob1 "f0:00:00:01:02:06 172.16.0.1 fd07::1"
> > +
> > +# Ensure ovn-controller is caught up
> > +ovn-nbctl --wait=hv sync
> > +
> > +on_exit 'ovs-ofctl dump-flows br-int'
> > +
> > +NETNS_DAEMONIZE([alice1], [nc -l -k 80], [alice1.pid])
> > +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
> > +NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.2 | FORMAT_PING], \
> > +[0], [dnl
> > +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> > +])
> > +
> > +# Ensure conntrack entry is present. We should not try to predict
> > +# the tunnel key for the output port, so we strip it from the labels
> > +# and just ensure that the known ethernet address is present.
> > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1) | \
> > +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> > +sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
> > +sed -e 's/labels=0x[[0-9]]/labels=0x?/'], [0], [dnl
> > +icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000000401020400000000
> > +tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000000401020400000000,protoinfo=(state=<cleared>)
> > +])
> > +
> > +# Ensure datapaths show conntrack states as expected
> > +# Like with conntrack entries, we shouldn't try to predict
> > +# port binding tunnel keys. So omit them from expected labels.
> > +AT_CHECK([ovs-appctl dpctl/dump-flows | sed -e 's/label=0x[[0-9]]/label=0x?/' | \
> > +grep 'ct_state(+new-est-rpl+trk).*ct(.*label=0x?000000000401020400000000/.*)' -c], [0], [dnl
> > +2
> > +])
> > +AT_CHECK([[ovs-appctl dpctl/dump-flows | sed -e 's/ct_label(0x[0-9]/ct_label(0x?/' | \
> > +grep 'ct_state(-new+est+rpl+trk).*ct_label(0x?000000000401020400000000)' -c]], [0], [dnl
> > +2
> > +])
> > +
> > +# Flush conntrack entries for easier output parsing of next test.
> > +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> > +# Change bob1 L2 address anche check the reply is properly updated.
> > +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> > +ovn-nbctl set Logical_Switch_Port r2-ext \
> > +     type=router options:router-port=R2_ext addresses='"00:00:10:01:02:04"'
> > +
> > +# Wait for ovn-controller before sending traffic
> > +ovn-nbctl --wait=hv sync
> > +
> > +NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
> > +NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.2 | FORMAT_PING], \
> > +[0], [dnl
> > +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> > +])
> > +AT_CHECK([ovs-appctl dpctl/dump-flows | sed -e 's/label=0x[[0-9]]/label=0x?/' | \
> > +grep 'ct_state(+new-est-rpl+trk).*ct(.*label=0x?000000001001020400000000/.*)' -c], [0], [dnl
> > +2
> > +])
> > +AT_CHECK([[ovs-appctl dpctl/dump-flows | sed -e 's/ct_label(0x[0-9]/ct_label(0x?/' | \
> > +grep 'ct_state(-new+est+rpl+trk).*ct_label(0x?000000001001020400000000)' -c]], [0], [dnl
> > +2
> > +])
> > +
> > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 1001020400000000 | FORMAT_CT(172.16.0.1) | \
> > +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> > +sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
> > +sed -e 's/labels=0x[[0-9]]/labels=0x?/' | sort], [0], [dnl
> > +icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
> > +tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
> > +])
> > +# Check entries in table 76 and 77 expires w/o traffic
> > +OVS_WAIT_UNTIL([
> > +test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH_MAC, n_packets') -eq 0
> > +])
> > +OVS_WAIT_UNTIL([
> > +test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH, n_packets') -eq 0
> > +])
> > +
> > +# Flush connection tracking entries
> > +ovn-nbctl --wait=hv lr-route-del R1
> > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1)])
> > +
> > +# Install ECMP routes for alice.
> > +ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 fd01::/126 fd02::2
> > +ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 fd01::/126 fd02::3
> > +
> > +# Static Routes
> > +ovn-nbctl lr-route-add R2 fd01::/64 fd02::1
> > +ovn-nbctl lr-route-add R3 fd01::/64 fd02::1
> > +
> > +NETNS_DAEMONIZE([alice1], [nc -6 -l -k 8080], [alice2.pid])
> > +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 8080], [0])
> > +NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 fd01::2 | FORMAT_PING], \
> > +[0], [dnl
> > +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> > +])
> > +
> > +# Ensure conntrack entry is present. We should not try to predict
> > +# the tunnel key for the output port, so we strip it from the labels
> > +# and just ensure that the known ethernet address is present.
> > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2) | \
> > +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> > +sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
> > +sed -e 's/labels=0x[[0-9]]/labels=0x?/' | sort], [0], [dnl
> > +icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
> > +tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
> > +])
> > +
> > +# Flush conntrack entries for easier output parsing of next test.
> > +AT_CHECK([ovs-appctl dpctl/flush-conntrack])
> > +
> > +# Change bob1 L2 address anche check the reply is properly updated.
> > +ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
> > +ovn-nbctl --wait=hv set Logical_Switch_Port r2-ext \
> > +     type=router options:router-port=R2_ext addresses='"00:00:10:01:02:04"'
> > +
> > +NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 8080], [0])
> > +NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 fd01::2 | FORMAT_PING], \
> > +[0], [dnl
> > +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> > +])
> > +
> > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 1001020400000000 | FORMAT_CT(fd01::2) | \
> > +sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
> > +sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
> > +sed -e 's/labels=0x[[0-9]]/labels=0x?/'], [0], [dnl
> > +icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
> > +tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
> > +])
> > +
> > +# Flush connection tracking entries
> > +ovn-nbctl --wait=hv lr-route-del R1
> > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2)])
> > +
> > +ovs-ofctl dump-flows br-int
> > +
> > +OVS_APP_EXIT_AND_WAIT([ovn-controller])
> > +
> > +as ovn-sb
> > +OVS_APP_EXIT_AND_WAIT([ovsdb-server])
> > +
> > +as ovn-nb
> > +OVS_APP_EXIT_AND_WAIT([ovsdb-server])
> > +
> > +as northd
> > +OVS_APP_EXIT_AND_WAIT([ovn-northd])
> > +
> > +as
> > +OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
> > +/connection dropped.*/d"])
> > +
> > +AT_CLEANUP
> > +])
> > diff --git a/tests/system-ovn.at b/tests/system-ovn.at
> > index 7ae54113a..a0f375141 100644
> > --- a/tests/system-ovn.at
> > +++ b/tests/system-ovn.at
> > @@ -6178,6 +6178,10 @@ OVS_WAIT_UNTIL([
> >   test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH, n_packets') -eq 0
> >   ])
> > +# Flush connection tracking entries
> > +ovn-nbctl --wait=hv lr-route-del R1
> > +AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1)])
> > +
> >   ovs-ofctl dump-flows br-int
> >   OVS_APP_EXIT_AND_WAIT([ovn-controller])
>
diff mbox series

Patch

diff --git a/controller/ofctrl.c b/controller/ofctrl.c
index f14cd79a8..a10b0f567 100644
--- a/controller/ofctrl.c
+++ b/controller/ofctrl.c
@@ -388,9 +388,24 @@  struct meter_band_entry {
 
 static struct shash meter_bands;
 
+static struct hmap ecmp_nexthop_map;
+struct ecmp_nexthop_entry {
+    struct hmap_node node;
+    bool erase;
+
+    char *nexthop;
+    int id;
+};
+
 static void ofctrl_meter_bands_destroy(void);
 static void ofctrl_meter_bands_clear(void);
 
+static void ecmp_nexthop_monitor_destroy(void);
+static void ecmp_nexthop_monitor_run(
+        const struct sbrec_ecmp_nexthop_table *enh_table,
+        struct ovs_list *msgs);
+
+
 /* MFF_* field ID for our Geneve option.  In S_TLV_TABLE_MOD_SENT, this is
  * the option we requested (we don't know whether we obtained it yet).  In
  * S_CLEAR_FLOWS or S_UPDATE_FLOWS, this is really the option we have. */
@@ -429,6 +444,7 @@  ofctrl_init(struct ovn_extend_table *group_table,
     groups = group_table;
     meters = meter_table;
     shash_init(&meter_bands);
+    hmap_init(&ecmp_nexthop_map);
 }
 
 /* S_NEW, for a new connection.
@@ -883,6 +899,7 @@  ofctrl_destroy(void)
     expr_symtab_destroy(&symtab);
     shash_destroy(&symtab);
     ofctrl_meter_bands_destroy();
+    ecmp_nexthop_monitor_destroy();
 }
 
 uint64_t
@@ -2306,6 +2323,87 @@  add_meter(struct ovn_extend_table_info *m_desired,
     ofctrl_meter_bands_alloc(sb_meter, m_desired, msgs);
 }
 
+static void
+ecmp_nexthop_monitor_free_entry(struct ecmp_nexthop_entry *e,
+                                struct ovs_list *msgs)
+{
+    if (msgs) {
+        ovs_u128 mask = {
+            /* ct_labels.label BITS[96-127] */
+            .u64.hi = 0xffffffff00000000,
+        };
+        uint64_t id = e->id;
+        ovs_u128 nexthop = {
+            .u64.hi = id << 32,
+        };
+        struct ofp_ct_match match = {
+            .labels = nexthop,
+            .labels_mask = mask,
+        };
+        struct ofpbuf *msg = ofp_ct_match_encode(&match, NULL,
+                                                 rconn_get_version(swconn));
+        ovs_list_push_back(msgs, &msg->list_node);
+    }
+    free(e->nexthop);
+    free(e);
+}
+
+static void
+ecmp_nexthop_monitor_destroy(void)
+{
+    struct ecmp_nexthop_entry *e;
+    HMAP_FOR_EACH_POP (e, node, &ecmp_nexthop_map) {
+        ecmp_nexthop_monitor_free_entry(e, NULL);
+    }
+    hmap_destroy(&ecmp_nexthop_map);
+}
+
+static struct ecmp_nexthop_entry *
+ecmp_nexthop_monitor_lookup(char *nexthop)
+{
+    uint32_t hash = hash_string(nexthop, 0);
+    struct ecmp_nexthop_entry *e;
+
+    HMAP_FOR_EACH_WITH_HASH (e, node, hash, &ecmp_nexthop_map) {
+        if (!strcmp(e->nexthop, nexthop)) {
+            return e;
+        }
+    }
+    return NULL;
+}
+
+static void
+ecmp_nexthop_monitor_run(const struct sbrec_ecmp_nexthop_table *enh_table,
+                         struct ovs_list *msgs)
+{
+    struct ecmp_nexthop_entry *e;
+    HMAP_FOR_EACH (e, node, &ecmp_nexthop_map) {
+        e->erase = true;
+    }
+
+    const struct sbrec_ecmp_nexthop *sbrec_ecmp_nexthop;
+    SBREC_ECMP_NEXTHOP_TABLE_FOR_EACH (sbrec_ecmp_nexthop, enh_table) {
+        e = ecmp_nexthop_monitor_lookup(sbrec_ecmp_nexthop->nexthop);
+        if (!e) {
+            e = xzalloc(sizeof *e);
+            e->nexthop = xstrdup(sbrec_ecmp_nexthop->nexthop);
+            e->id = sbrec_ecmp_nexthop->id;
+            uint32_t hash = hash_string(e->nexthop, 0);
+            hmap_insert(&ecmp_nexthop_map, &e->node, hash);
+        } else {
+            e->erase = false;
+        }
+    }
+
+    HMAP_FOR_EACH_SAFE (e, node, &ecmp_nexthop_map) {
+        if (e->erase) {
+            hmap_remove(&ecmp_nexthop_map, &e->node);
+            ecmp_nexthop_monitor_free_entry(e, msgs);
+        }
+    }
+
+}
+
 static void
 installed_flow_add(struct ovn_flow *d,
                    struct ofputil_bundle_ctrl_msg *bc,
@@ -2664,6 +2762,7 @@  ofctrl_put(struct ovn_desired_flow_table *lflow_table,
            struct shash *pending_ct_zones,
            struct hmap *pending_lb_tuples,
            struct ovsdb_idl_index *sbrec_meter_by_name,
+           const struct sbrec_ecmp_nexthop_table *enh_table,
            uint64_t req_cfg,
            bool lflows_changed,
            bool pflows_changed)
@@ -2704,6 +2803,8 @@  ofctrl_put(struct ovn_desired_flow_table *lflow_table,
     /* OpenFlow messages to send to the switch to bring it up-to-date. */
     struct ovs_list msgs = OVS_LIST_INITIALIZER(&msgs);
 
+    ecmp_nexthop_monitor_run(enh_table, &msgs);
+
     /* Iterate through ct zones that need to be flushed. */
     struct shash_node *iter;
     SHASH_FOR_EACH(iter, pending_ct_zones) {
diff --git a/controller/ofctrl.h b/controller/ofctrl.h
index 502c73da6..e08b354f4 100644
--- a/controller/ofctrl.h
+++ b/controller/ofctrl.h
@@ -31,6 +31,7 @@  struct ofpbuf;
 struct ovsrec_bridge;
 struct ovsrec_open_vswitch_table;
 struct sbrec_meter_table;
+struct sbrec_ecmp_nexthop_table;
 struct shash;
 
 struct ovn_desired_flow_table {
@@ -59,6 +60,7 @@  void ofctrl_put(struct ovn_desired_flow_table *lflow_table,
                 struct shash *pending_ct_zones,
                 struct hmap *pending_lb_tuples,
                 struct ovsdb_idl_index *sbrec_meter_by_name,
+                const struct sbrec_ecmp_nexthop_table *enh_table,
                 uint64_t nb_cfg,
                 bool lflow_changed,
                 bool pflow_changed);
diff --git a/controller/ovn-controller.c b/controller/ovn-controller.c
index 1c9960c70..28cac2683 100644
--- a/controller/ovn-controller.c
+++ b/controller/ovn-controller.c
@@ -5945,6 +5945,8 @@  main(int argc, char *argv[])
                                    &ct_zones_data->pending,
                                    &lb_data->removed_tuples,
                                    sbrec_meter_by_name,
+                                   sbrec_ecmp_nexthop_table_get(
+                                        ovnsb_idl_loop.idl),
                                    ofctrl_seqno_get_req_cfg(),
                                    engine_node_changed(&en_lflow_output),
                                    engine_node_changed(&en_pflow_output));
diff --git a/tests/system-ovn-kmod.at b/tests/system-ovn-kmod.at
index 14fe4ecec..88c196874 100644
--- a/tests/system-ovn-kmod.at
+++ b/tests/system-ovn-kmod.at
@@ -1054,3 +1054,269 @@  OVS_TRAFFIC_VSWITCHD_STOP(["
 "])
 AT_CLEANUP
 ])
+
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([ECMP symmetric reply - kmod])
+AT_KEYWORDS([ecmp])
+
+CHECK_CONNTRACK()
+ovn_start
+
+OVS_TRAFFIC_VSWITCHD_START()
+ADD_BR([br-int])
+
+# Set external-ids in br-int needed for ovn-controller
+ovs-vsctl \
+        -- set Open_vSwitch . external-ids:system-id=hv1 \
+        -- set Open_vSwitch . external-ids:ovn-remote=unix:$ovs_base/ovn-sb/ovn-sb.sock \
+        -- set Open_vSwitch . external-ids:ovn-encap-type=geneve \
+        -- set Open_vSwitch . external-ids:ovn-encap-ip=169.0.0.1 \
+        -- set bridge br-int fail-mode=secure other-config:disable-in-band=true
+
+# Start ovn-controller
+start_daemon ovn-controller
+
+# Logical network:
+# Alice is connected to gateway router R1. R1 is connected to two "external"
+# routers, R2 and R3 via an "ext" switch.
+# Bob is connected to both R2 and R3. R1 contains two ECMP routes, one through R2
+# and one through R3, to Bob.
+#
+#     alice -- R1 -- ext ---- R2
+#                     |         \
+#                     |           bob
+#                     |         /
+#                     + ----- R3
+#
+# For this test, Bob sends request traffic through R2 to Alice. We want to ensure that
+# all response traffic from Alice is routed through R2 as well.
+
+ovn-nbctl create Logical_Router name=R1 options:chassis=hv1
+ovn-nbctl create Logical_Router name=R2
+ovn-nbctl create Logical_Router name=R3
+
+ovn-nbctl ls-add alice
+ovn-nbctl ls-add bob
+ovn-nbctl ls-add ext
+
+# connect alice to R1
+ovn-nbctl lrp-add R1 alice 00:00:01:01:02:03 10.0.0.1/24 fd01::1/64
+ovn-nbctl lsp-add alice rp-alice -- set Logical_Switch_Port rp-alice \
+    type=router options:router-port=alice addresses='"00:00:01:01:02:03"'
+
+# connect bob to R2
+ovn-nbctl lrp-add R2 R2_bob 00:00:02:01:02:03 172.16.0.2/16 fd07::2/64
+ovn-nbctl lsp-add bob rp2-bob -- set Logical_Switch_Port rp2-bob \
+    type=router options:router-port=R2_bob addresses='"00:00:02:01:02:03"'
+
+# connect bob to R3
+ovn-nbctl lrp-add R3 R3_bob 00:00:02:01:02:04 172.16.0.3/16 fd07::3/64
+ovn-nbctl lsp-add bob rp3-bob -- set Logical_Switch_Port rp3-bob \
+    type=router options:router-port=R3_bob addresses='"00:00:02:01:02:04"'
+
+# Connect R1 to ext
+ovn-nbctl lrp-add R1 R1_ext 00:00:04:01:02:03 20.0.0.1/24 fd02::1/64
+ovn-nbctl lsp-add ext r1-ext -- set Logical_Switch_Port r1-ext \
+    type=router options:router-port=R1_ext addresses='"00:00:04:01:02:03"'
+
+# Connect R2 to ext
+ovn-nbctl lrp-add R2 R2_ext 00:00:04:01:02:04 20.0.0.2/24 fd02::2/64
+ovn-nbctl lsp-add ext r2-ext -- set Logical_Switch_Port r2-ext \
+    type=router options:router-port=R2_ext addresses='"00:00:04:01:02:04"'
+
+# Connect R3 to ext
+ovn-nbctl lrp-add R3 R3_ext 00:00:04:01:02:05 20.0.0.3/24 fd02::3/64
+ovn-nbctl lsp-add ext r3-ext -- set Logical_Switch_Port r3-ext \
+    type=router options:router-port=R3_ext addresses='"00:00:04:01:02:05"'
+
+# Install ECMP routes for alice.
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 10.0.0.0/24 20.0.0.2
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 10.0.0.0/24 20.0.0.3
+
+# Static Routes
+ovn-nbctl lr-route-add R2 10.0.0.0/24 20.0.0.1
+ovn-nbctl lr-route-add R3 10.0.0.0/24 20.0.0.1
+
+# Logical port 'alice1' in switch 'alice'.
+ADD_NAMESPACES(alice1)
+# Only send 1 router solicitation as any additional ones can cause datapath
+# flows to get evicted, causing unexpected failures below.
+NS_CHECK_EXEC([alice1], [sysctl -w net.ipv6.conf.default.router_solicitations=1], [0], [dnl
+net.ipv6.conf.default.router_solicitations = 1
+])
+ADD_VETH(alice1, alice1, br-int, "10.0.0.2/24", "f0:00:00:01:02:04", \
+         "10.0.0.1")
+NS_CHECK_EXEC([alice1], [ip -6 addr add fd01::2/64 dev alice1 nodad])
+NS_CHECK_EXEC([alice1], [ip -6 route add default via fd01::1])
+NS_CHECK_EXEC([alice1], [ip -6 neigh add fd01::1 lladdr 00:00:01:01:02:03 dev alice1], [0])
+ovn-nbctl lsp-add alice alice1 \
+-- lsp-set-addresses alice1 "f0:00:00:01:02:04 10.0.0.2 fd01::2"
+
+# Logical port 'bob1' in switch 'bob'.
+ADD_NAMESPACES(bob1)
+# Only send 1 router solicitation as any additional ones can cause datapath
+# flows to get evicted, causing unexpected failures below.
+NS_CHECK_EXEC([bob1], [sysctl -w net.ipv6.conf.default.router_solicitations=1], [0], [dnl
+net.ipv6.conf.default.router_solicitations = 1
+])
+ADD_VETH(bob1, bob1, br-int, "172.16.0.1/16", "f0:00:00:01:02:06", \
+         "172.16.0.2")
+NS_CHECK_EXEC([bob1], [ip -6 addr add fd07::1/64 dev bob1 nodad])
+NS_CHECK_EXEC([bob1], [ip -6 route add default via fd07::2])
+NS_CHECK_EXEC([bob1], [ip -6 neigh add fd07::2 lladdr 00:00:02:01:02:03 dev bob1])
+NS_CHECK_EXEC([bob1], [ip -6 neigh add fd07::3 lladdr 00:00:01:01:02:04 dev bob1])
+
+# Add neighbour MAC addresses to avoid sending IPv6 NS messages which could
+# cause datapath flows to be evicted
+ovn-nbctl lsp-add bob bob1 \
+-- lsp-set-addresses bob1 "f0:00:00:01:02:06 172.16.0.1 fd07::1"
+
+# Ensure ovn-controller is caught up
+ovn-nbctl --wait=hv sync
+
+on_exit 'ovs-ofctl dump-flows br-int'
+
+NETNS_DAEMONIZE([alice1], [nc -l -k 80], [alice1.pid])
+NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
+NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.2 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+# Ensure conntrack entry is present. We should not try to predict
+# the tunnel key for the output port, so we strip it from the labels
+# and just ensure that the known ethernet address is present.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
+sed -e 's/labels=0x[[0-9]]/labels=0x?/'], [0], [dnl
+icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000000401020400000000
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000000401020400000000,protoinfo=(state=<cleared>)
+])
+
+# Ensure datapaths show conntrack states as expected
+# Like with conntrack entries, we shouldn't try to predict
+# port binding tunnel keys. So omit them from expected labels.
+AT_CHECK([ovs-appctl dpctl/dump-flows | sed -e 's/label=0x[[0-9]]/label=0x?/' | \
+grep 'ct_state(+new-est-rpl+trk).*ct(.*label=0x?000000000401020400000000/.*)' -c], [0], [dnl
+2
+])
+AT_CHECK([[ovs-appctl dpctl/dump-flows | sed -e 's/ct_label(0x[0-9]/ct_label(0x?/' | \
+grep 'ct_state(-new+est+rpl+trk).*ct_label(0x?000000000401020400000000)' -c]], [0], [dnl
+2
+])
+
+# Flush conntrack entries for easier output parsing of next test.
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+# Change the R2_ext L2 address and check the reply is properly updated.
+ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
+ovn-nbctl set Logical_Switch_Port r2-ext \
+     type=router options:router-port=R2_ext addresses='"00:00:10:01:02:04"'
+
+# Wait for ovn-controller before sending traffic
+ovn-nbctl --wait=hv sync
+
+NS_CHECK_EXEC([bob1], [nc -z 10.0.0.2 80], [0])
+NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.2 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+AT_CHECK([ovs-appctl dpctl/dump-flows | sed -e 's/label=0x[[0-9]]/label=0x?/' | \
+grep 'ct_state(+new-est-rpl+trk).*ct(.*label=0x?000000001001020400000000/.*)' -c], [0], [dnl
+2
+])
+AT_CHECK([[ovs-appctl dpctl/dump-flows | sed -e 's/ct_label(0x[0-9]/ct_label(0x?/' | \
+grep 'ct_state(-new+est+rpl+trk).*ct_label(0x?000000001001020400000000)' -c]], [0], [dnl
+2
+])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 1001020400000000 | FORMAT_CT(172.16.0.1) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
+sed -e 's/labels=0x[[0-9]]/labels=0x?/' | sort], [0], [dnl
+icmp,orig=(src=172.16.0.1,dst=10.0.0.2,id=<cleared>,type=8,code=0),reply=(src=10.0.0.2,dst=172.16.0.1,id=<cleared>,type=0,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
+tcp,orig=(src=172.16.0.1,dst=10.0.0.2,sport=<cleared>,dport=<cleared>),reply=(src=10.0.0.2,dst=172.16.0.1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
+])
+# Check that entries in tables 76 and 77 expire without traffic
+OVS_WAIT_UNTIL([
+test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH_MAC, n_packets') -eq 0
+])
+OVS_WAIT_UNTIL([
+test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH, n_packets') -eq 0
+])
+
+# Delete the routes on R1 and check the related conntrack entries are flushed
+ovn-nbctl --wait=hv lr-route-del R1
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1)])
+
+# Install ECMP routes for alice.
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 fd01::/126 fd02::2
+ovn-nbctl --ecmp-symmetric-reply --policy="src-ip" lr-route-add R1 fd01::/126 fd02::3
+
+# Static Routes
+ovn-nbctl lr-route-add R2 fd01::/64 fd02::1
+ovn-nbctl lr-route-add R3 fd01::/64 fd02::1
+
+NETNS_DAEMONIZE([alice1], [nc -6 -l -k 8080], [alice2.pid])
+NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 8080], [0])
+NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 fd01::2 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+# Ensure conntrack entry is present. We should not try to predict
+# the tunnel key for the output port, so we strip it from the labels
+# and just ensure that the known ethernet address is present.
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
+sed -e 's/labels=0x[[0-9]]/labels=0x?/' | sort], [0], [dnl
+icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
+])
+
+# Flush conntrack entries for easier output parsing of next test.
+AT_CHECK([ovs-appctl dpctl/flush-conntrack])
+
+# Change the R2_ext L2 address and check the reply is properly updated.
+ovn-nbctl set Logical_Router_Port R2_ext mac='"00:00:10:01:02:04"'
+ovn-nbctl --wait=hv set Logical_Switch_Port r2-ext \
+     type=router options:router-port=R2_ext addresses='"00:00:10:01:02:04"'
+
+NS_CHECK_EXEC([bob1], [nc -6 -z fd01::2 8080], [0])
+NS_CHECK_EXEC([bob1], [ping -q -c 3 -i 0.3 -w 2 fd01::2 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | grep 1001020400000000 | FORMAT_CT(fd01::2) | \
+sed -e 's/zone=[[0-9]]*/zone=<cleared>/' |
+sed -e 's/mark=[[0-9]]*/mark=<cleared>/' |
+sed -e 's/labels=0x[[0-9]]/labels=0x?/'], [0], [dnl
+icmpv6,orig=(src=fd07::1,dst=fd01::2,id=<cleared>,type=128,code=0),reply=(src=fd01::2,dst=fd07::1,id=<cleared>,type=129,code=0),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000
+tcp,orig=(src=fd07::1,dst=fd01::2,sport=<cleared>,dport=<cleared>),reply=(src=fd01::2,dst=fd07::1,sport=<cleared>,dport=<cleared>),zone=<cleared>,mark=<cleared>,labels=0x?000000001001020400000000,protoinfo=(state=<cleared>)
+])
+
+# Delete the routes on R1 and check the related conntrack entries are flushed
+ovn-nbctl --wait=hv lr-route-del R1
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(fd01::2)])
+
+ovs-ofctl dump-flows br-int
+
+OVS_APP_EXIT_AND_WAIT([ovn-controller])
+
+as ovn-sb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as ovn-nb
+OVS_APP_EXIT_AND_WAIT([ovsdb-server])
+
+as northd
+OVS_APP_EXIT_AND_WAIT([ovn-northd])
+
+as
+OVS_TRAFFIC_VSWITCHD_STOP(["/failed to query port patch-.*/d
+/connection dropped.*/d"])
+
+AT_CLEANUP
+])
diff --git a/tests/system-ovn.at b/tests/system-ovn.at
index 7ae54113a..a0f375141 100644
--- a/tests/system-ovn.at
+++ b/tests/system-ovn.at
@@ -6178,6 +6178,10 @@  OVS_WAIT_UNTIL([
 test $(ovs-ofctl dump-flows br-int | grep -c 'table=OFTABLE_ECMP_NH, n_packets') -eq 0
 ])
 
+# Delete the routes on R1 and check the related conntrack entries are flushed
+ovn-nbctl --wait=hv lr-route-del R1
+AT_CHECK([ovs-appctl dpctl/dump-conntrack | FORMAT_CT(172.16.0.1)])
+
 ovs-ofctl dump-flows br-int
 
 OVS_APP_EXIT_AND_WAIT([ovn-controller])