diff mbox series

[ovs-dev] northd: Fix pmtud for non routed traffic.

Message ID ae8615cb43df962b949990aee981817b1fd4faf0.1707831032.git.lorenzo.bianconi@redhat.com
State Changes Requested
Headers show
Series [ovs-dev] northd: Fix pmtud for non routed traffic. | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes success github build: passed

Commit Message

Lorenzo Bianconi Feb. 13, 2024, 1:32 p.m. UTC
Similar to what is already implemented for routed e/w traffic,
introduce pmtud support for e/w traffic between two logical switch ports
connected to the same logical switch, but running on two different
hypervisors.

Reported-at: https://issues.redhat.com/browse/FDP-362
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
---
 controller/lflow.h      |   1 +
 controller/physical.c   |  31 ++++++++-
 northd/northd.c         |  35 +++++++---
 northd/ovn-northd.8.xml |  14 +++-
 tests/multinode.at      | 151 ++++++++++++++++++++++++++++++++++++++++
 tests/ovn-northd.at     |  22 ++++--
 6 files changed, 236 insertions(+), 18 deletions(-)

Comments

Mark Michelson March 15, 2024, 7:34 p.m. UTC | #1
Hi Lorenzo,

Thanks for the fix.

Acked-by: Mark Michelson <mmichels@redhat.com>

When this is merged, the following should also be folded in:

---
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 17b414144..0cf1c2bb5 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -338,7 +338,7 @@
      </p>

      <p>
-      This table adds a priority-110 flow that matches 'recirculated' icmp{4,6}
+      This table adds a priority-105 flow that matches 'recirculated' icmp{4,6}
        error 'packet too big' to drop the packet.
      </p>

---

This accounts for the change of priority introduced in this patch.

I also noticed a small spelling mistake that should be corrected. I 
marked it below.

On 2/13/24 08:32, Lorenzo Bianconi wrote:
> Similar to what is already implemented for routed e/w traffic,
> introduce pmtud support for e/w traffic between two logical switch ports
> connected to the same logical switch, but running on two different
> hypervisors.
> 
> Reported-at: https://issues.redhat.com/browse/FDP-362
> Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
> ---
>   controller/lflow.h      |   1 +
>   controller/physical.c   |  31 ++++++++-
>   northd/northd.c         |  35 +++++++---
>   northd/ovn-northd.8.xml |  14 +++-
>   tests/multinode.at      | 151 ++++++++++++++++++++++++++++++++++++++++
>   tests/ovn-northd.at     |  22 ++++--
>   6 files changed, 236 insertions(+), 18 deletions(-)
> 
> diff --git a/controller/lflow.h b/controller/lflow.h
> index 9b7ffa19c..906a26280 100644
> --- a/controller/lflow.h
> +++ b/controller/lflow.h
> @@ -94,6 +94,7 @@ struct uuid;
>   #define OFTABLE_ECMP_NH                  77
>   #define OFTABLE_CHK_LB_AFFINITY          78
>   #define OFTABLE_MAC_CACHE_USE            79
> +#define OFTABLE_CT_ZONE_LOOKUP           80
>   
>   struct lflow_ctx_in {
>       struct ovsdb_idl_index *sbrec_multicast_group_by_name_datapath;
> diff --git a/controller/physical.c b/controller/physical.c
> index c32642d2c..6a9327b8d 100644
> --- a/controller/physical.c
> +++ b/controller/physical.c
> @@ -2451,8 +2451,37 @@ physical_run(struct physical_ctx *p_ctx,
>                                 p_ctx->n_encap_ips,
>                                 p_ctx->encap_ips,
>                                 flow_table, &ofpacts);
> +
> +        if (!local_binding_get_primary_pb(p_ctx->local_bindings,
> +                                          binding->logical_port)) {
> +            continue;
> +        }
> +
> +        /* Table 80, priority 100.
> +         * =======================
> +         *
> +         * Process ICMP{4,6} error packets too big locally generalted from the

s/generalted/generated/

> +         * kernel in order to lookup proper ct_zone. */
> +        struct match match = MATCH_CATCHALL_INITIALIZER;
> +        match_set_metadata(&match, htonll(binding->datapath->tunnel_key));
> +        match_set_reg(&match, MFF_LOG_INPORT - MFF_REG0, binding->tunnel_key);
> +
> +        ofpbuf_clear(&ofpacts);
> +        struct zone_ids zone_ids = get_zone_ids(binding, p_ctx->ct_zones);
> +        put_zones_ofpacts(&zone_ids, &ofpacts);
> +        put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
> +        ofctrl_add_flow(flow_table, OFTABLE_CT_ZONE_LOOKUP, 100, 0,
> +                        &match, &ofpacts, hc_uuid);
>       }
>   
> +    /* Default flow for CT_ZONE_LOOKUP Table. */
> +    struct match ct_look_def_match;
> +    match_init_catchall(&ct_look_def_match);
> +    ofpbuf_clear(&ofpacts);
> +    put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
> +    ofctrl_add_flow(flow_table, OFTABLE_CT_ZONE_LOOKUP, 0, 0,
> +                    &ct_look_def_match, &ofpacts, hc_uuid);
> +
>       /* Handle output to multicast groups, in tables 40 and 41. */
>       const struct sbrec_multicast_group *mc;
>       SBREC_MULTICAST_GROUP_TABLE_FOR_EACH (mc, p_ctx->mc_group_table) {
> @@ -2511,7 +2540,7 @@ physical_run(struct physical_ctx *p_ctx,
>           /* Add specif flows for E/W ICMPv{4,6} packets if tunnelled packets
>            * do not fit path MTU.
>            */
> -        put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
> +        put_resubmit(OFTABLE_CT_ZONE_LOOKUP, &ofpacts);
>   
>           /* IPv4 */
>           match_init_catchall(&match);
> diff --git a/northd/northd.c b/northd/northd.c
> index a174a4dcd..34c56f95e 100644
> --- a/northd/northd.c
> +++ b/northd/northd.c
> @@ -8634,7 +8634,7 @@ build_lswitch_lflows_admission_control(struct ovn_datapath *od,
>       ovs_assert(od->nbs);
>   
>       /* Default action for recirculated ICMP error 'packet too big'. */
> -    ovn_lflow_add(lflows, od, S_SWITCH_IN_CHECK_PORT_SEC, 110,
> +    ovn_lflow_add(lflows, od, S_SWITCH_IN_CHECK_PORT_SEC, 105,
>                     "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
>                     " (ip6 && icmp6.type == 2 && icmp6.code == 0)) &&"
>                     " flags.tunnel_rx == 1", debug_drop_action(), lflow_ref);
> @@ -11822,7 +11822,24 @@ build_lswitch_icmp_packet_toobig_admin_flows(
>   {
>       ovs_assert(op->nbsp);
>   
> +    ds_clear(match);
>       if (!lsp_is_router(op->nbsp)) {
> +        struct eth_addr mac;
> +        if (!op->nbsp->n_addresses ||
> +            !ovs_scan(op->nbsp->addresses[0], ETH_ADDR_SCAN_FMT,
> +                      ETH_ADDR_SCAN_ARGS(mac))) {
> +            return;
> +        }
> +
> +        ds_put_format(match,
> +                      "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
> +                      " (ip6 && icmp6.type == 2 && icmp6.code == 0)) &&"
> +                      " eth.src == "ETH_ADDR_FMT" && outport == %s &&"
> +                      " !is_chassis_resident(%s) && flags.tunnel_rx == 1",
> +                      ETH_ADDR_ARGS(mac), op->json_key, op->json_key);
> +        ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 110,
> +                      ds_cstr(match), "outport <-> inport; next;",
> +                      op->lflow_ref);
>           return;
>       }
>   
> @@ -11831,26 +11848,28 @@ build_lswitch_icmp_packet_toobig_admin_flows(
>           return;
>       }
>   
> -    ds_clear(match);
>       if (peer->od->is_gw_router) {
>           ds_put_format(match,
>                         "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
>                         " (ip6 && icmp6.type == 2 && icmp6.code == 0)) && "
>                         "eth.src == %s && outport == %s && flags.tunnel_rx == 1",
>                         peer->nbrp->mac, op->json_key);
> +        ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 120,
> +                      ds_cstr(match), "outport <-> inport; next;",
> +                      op->lflow_ref);
>       } else {
>           ds_put_format(match,
>                         "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
>                         " (ip6 && icmp6.type == 2 && icmp6.code == 0)) && "
>                         "eth.dst == %s && flags.tunnel_rx == 1",
>                         peer->nbrp->mac);
> +        ds_clear(actions);
> +        ds_put_format(actions,
> +                      "outport <-> inport; next(pipeline=ingress,table=%d);",
> +                      ovn_stage_get_table(S_SWITCH_IN_L2_LKUP));
> +        ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 120,
> +                      ds_cstr(match), ds_cstr(actions), op->lflow_ref);
>       }
> -    ds_clear(actions);
> -    ds_put_format(actions,
> -                  "outport <-> inport; next(pipeline=ingress,table=%d);",
> -                  ovn_stage_get_table(S_SWITCH_IN_L2_LKUP));
> -    ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 120,
> -                  ds_cstr(match), ds_cstr(actions), op->lflow_ref);
>   }
>   
>   static void
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index 9583abeff..840a4f6a3 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -324,8 +324,7 @@
>         'packet too big' and <code>eth.src == <var>D</var> &amp;&amp;
>         outport == <var>P</var> &amp;&amp; flags.tunnel_rx == 1</code> where
>         <var>D</var> is the peer logical router port <var>RP</var> mac address,
> -      swaps inport and outport and applies the action <code>
> -      next(pipeline=S_SWITCH_IN_L2_LKUP)</code>.
> +      swaps inport and outport and applies the action <code>next</code>.
>       </p>
>   
>       <p>
> @@ -338,7 +337,16 @@
>       </p>
>   
>       <p>
> -      This table adds a priority-110 flow that matches 'recirculated' icmp{4,6}
> +      For each logical switch port <var>P</var> a priority-110 flow that
> +      matches 'recirculated' icmp{4,6} error 'packet too big' and <code>
> +      eth.src == <var>D</var> &amp;&amp; outport == <var>P</var> &amp;&amp;
> +      !is_chassis_resident("<var>P</var>") &amp;&amp; flags.tunnel_rx == 1
> +      </code> where <var>D</var> is the logical switch port mac address,
> +      swaps inport and outport and applies the action <code>next</code>.
> +    </p>
> +
> +    <p>
> +      This table adds a priority-105 flow that matches 'recirculated' icmp{4,6}
>         error 'packet too big' to drop the packet.
>       </p>
>   
> diff --git a/tests/multinode.at b/tests/multinode.at
> index 0187382be..ef40db9b7 100644
> --- a/tests/multinode.at
> +++ b/tests/multinode.at
> @@ -154,6 +154,11 @@ check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 20.0.0.0/24
>   check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6'  allow-related
>   check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6'  allow-related
>   
> +# create LB
> +check multinode_nbctl lb-add lb0 10.0.0.1:8080 10.0.0.4:8080 udp
> +check multinode_nbctl ls-lb-add sw0 lb0
> +M_NS_CHECK_EXEC([ovn-chassis-2], [sw0p2], [nc -u -l 8080 >/dev/null 2>&1 &])
> +
>   m_as ovn-gw-1 ip netns add ovn-ext0
>   m_as ovn-gw-1 ovs-vsctl add-port br-ex ext0 -- set interface ext0 type=internal
>   m_as ovn-gw-1 ip link set ext0 netns ovn-ext0
> @@ -207,6 +212,14 @@ M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 172.20.1.2 |
>   3 packets transmitted, 3 received, 0% packet loss, time 0ms
>   ])
>   
> +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
> +for i in $(seq 30); do
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2 if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
> +done
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q 'mtu 942'])
> +
> +killall nc
> +
>   AT_CLEANUP
>   
>   AT_SETUP([ovn multinode pmtu - distributed router - vxlan])
> @@ -696,6 +709,11 @@ check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 20.0.0.0/24
>   check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6'  allow-related
>   check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6'  allow-related
>   
> +# create LB
> +check multinode_nbctl lb-add lb0 10.0.0.1:8080 20.0.0.3:8080 udp
> +check multinode_nbctl lr-lb-add lr0 lb0
> +M_NS_CHECK_EXEC([ovn-chassis-2], [sw1p1], [nc -u -l 8080 >/dev/null 2>&1 &])
> +
>   m_as ovn-gw-1 ip netns add ovn-ext0
>   m_as ovn-gw-1 ovs-vsctl add-port br-ex ext0 -- set interface ext0 type=internal
>   m_as ovn-gw-1 ip link set ext0 netns ovn-ext0
> @@ -751,6 +769,18 @@ M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 172.20.1.2 |
>   M_NS_CHECK_EXEC([ovn-gw-1], [ovn-ext0], [ip link set dev ext1 mtu 1100])
>   M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 20 -i 0.5 -s 1300 -M do 172.20.1.2 2>&1 |grep -q "mtu = 1100"])
>   
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1])
> +
> +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
> +for i in $(seq 30); do
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2 if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
> +done
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q 'mtu 942'])
> +
> +killall nc
> +
>   AT_CLEANUP
>   
>   AT_SETUP([ovn multinode pmtu - gw router - vxlan])
> @@ -834,6 +864,11 @@ check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 20.0.0.0/24
>   check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6'  allow-related
>   check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6'  allow-related
>   
> +# create LB
> +check multinode_nbctl lb-add lb0 10.0.0.1:8080 20.0.0.3:8080 udp
> +check multinode_nbctl lr-lb-add lr0 lb0
> +M_NS_CHECK_EXEC([ovn-chassis-2], [sw1p1], [nc -u -l 8080 >/dev/null 2>&1 &])
> +
>   m_as ovn-gw-1 ip netns add ovn-ext0
>   m_as ovn-gw-1 ovs-vsctl add-port br-ex ext0 -- set interface ext0 type=internal
>   m_as ovn-gw-1 ip link set ext0 netns ovn-ext0
> @@ -882,4 +917,120 @@ M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 172.20.1.2 |
>   
>   M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 20 -i 0.5 -s 1300 -M do 172.20.1.2 2>&1 |grep -q "mtu = 1150"])
>   
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1])
> +
> +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
> +for i in $(seq 30); do
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2 if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
> +done
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q 'mtu 950'])
> +
> +killall nc
> +
> +AT_CLEANUP
> +
> +AT_SETUP([ovn multinode pmtu - logical switch - geneve])
> +
> +# Check that ovn-fake-multinode setup is up and running
> +check_fake_multinode_setup
> +
> +# Delete the multinode NB and OVS resources before starting the test.
> +cleanup_multinode_resources
> +
> +m_as ovn-chassis-1 ip link del sw0p1-p
> +m_as ovn-chassis-2 ip link del sw0p2-p
> +
> +# Reset geneve tunnels
> +for c in ovn-chassis-1 ovn-chassis-2 ovn-gw-1
> +do
> +    m_as $c ovs-vsctl set open . external-ids:ovn-encap-type=geneve
> +done
> +
> +OVS_WAIT_UNTIL([m_as ovn-chassis-1 ip link show | grep -q genev_sys])
> +OVS_WAIT_UNTIL([m_as ovn-chassis-2 ip link show | grep -q genev_sys])
> +OVS_WAIT_UNTIL([m_as ovn-gw-1 ip link show | grep -q genev_sys])
> +
> +# Test East-West switching
> +check multinode_nbctl ls-add sw0
> +check multinode_nbctl lsp-add sw0 sw0-port1
> +check multinode_nbctl lsp-set-addresses sw0-port1 "50:54:00:00:00:03 10.0.0.3 1000::3"
> +check multinode_nbctl lsp-add sw0 sw0-port2
> +check multinode_nbctl lsp-set-addresses sw0-port2 "50:54:00:00:00:04 10.0.0.4 1000::4"
> +
> +m_as ovn-chassis-1 /data/create_fake_vm.sh sw0-port1 sw0p1 50:54:00:00:00:03 10.0.0.3 24 10.0.0.1 1000::3/64 1000::a
> +m_as ovn-chassis-2 /data/create_fake_vm.sh sw0-port2 sw0p2 50:54:00:00:00:04 10.0.0.4 24 10.0.0.1 1000::4/64 1000::a
> +
> +# Create the second logical switch with one port
> +check multinode_nbctl ls-add sw1
> +check multinode_nbctl lsp-add sw1 sw1-port1
> +check multinode_nbctl lsp-set-addresses sw1-port1 "40:54:00:00:00:03 20.0.0.3 2000::3"
> +
> +# Create a logical router and attach both logical switches
> +check multinode_nbctl lr-add lr0
> +check multinode_nbctl lrp-add lr0 lr0-sw0 00:00:00:00:ff:01 10.0.0.1/24 1000::a/64
> +check multinode_nbctl lsp-add sw0 sw0-lr0
> +check multinode_nbctl lsp-set-type sw0-lr0 router
> +check multinode_nbctl lsp-set-addresses sw0-lr0 router
> +check multinode_nbctl lsp-set-options sw0-lr0 router-port=lr0-sw0
> +
> +check multinode_nbctl lrp-add lr0 lr0-sw1 00:00:00:00:ff:02 20.0.0.1/24 2000::a/64
> +check multinode_nbctl lsp-add sw1 sw1-lr0
> +check multinode_nbctl lsp-set-type sw1-lr0 router
> +check multinode_nbctl lsp-set-addresses sw1-lr0 router
> +check multinode_nbctl lsp-set-options sw1-lr0 router-port=lr0-sw1
> +
> +m_as ovn-chassis-2 /data/create_fake_vm.sh sw1-port1 sw1p1 40:54:00:00:00:03 20.0.0.3 24 20.0.0.1 2000::3/64 2000::a
> +
> +check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 10.0.0.0/24
> +check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 20.0.0.0/24
> +
> +check multinode_nbctl lrp-set-gateway-chassis lr0-sw0 ovn-chassis-1 10
> +check multinode_nbctl lrp-set-gateway-chassis lr0-sw1 ovn-chassis-2 10
> +
> +# create some ACLs
> +check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6'  allow-related
> +check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6'  allow-related
> +
> +check multinode_nbctl lb-add lb0 10.0.0.1:8080 10.0.0.4:8080 udp
> +check multinode_nbctl ls-lb-add sw0 lb0
> +M_NS_CHECK_EXEC([ovn-chassis-2], [sw0p2], [nc -u -l 8080 >/dev/null 2>&1 &])
> +
> +m_wait_for_ports_up
> +
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.4 | FORMAT_PING], \
> +[0], [dnl
> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> +])
> +
> +# Change ptmu for the geneve tunnel
> +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1200 dev eth1
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 10.0.0.4 2>&1 |grep -q "message too long, mtu=1142"])
> +
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1])
> +
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 20.0.0.3 | FORMAT_PING], \
> +[0], [dnl
> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
> +])
> +
> +# Change ptmu for the geneve tunnel
> +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1100 dev eth1
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 20.0.0.3 2>&1 |grep -q "message too long, mtu=1042"])
> +
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1])
> +
> +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
> +for i in $(seq 30); do
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2 if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
> +done
> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q 'mtu 942'])
> +
> +killall nc
> +
>   AT_CLEANUP
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index 591ad5aad..b04cc4893 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -8507,7 +8507,7 @@ ovn_strip_lflows ], [0], [dnl
>     table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]] == 1), action=(drop;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>     table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
>     table=??(ls_in_l2_lkup      ), priority=0    , match=(1), action=(outport = get_fdb(eth.dst); next;)
>     table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst == $svc_monitor_mac && (tcp || icmp || icmp6)), action=(handle_svc_check(inport);)
> @@ -8533,7 +8533,9 @@ ovn_strip_lflows ], [0], [dnl
>     table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]] == 1), action=(drop;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport == "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport == "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
>     table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
>     table=??(ls_in_l2_lkup      ), priority=0    , match=(1), action=(outport = get_fdb(eth.dst); next;)
>     table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst == $svc_monitor_mac && (tcp || icmp || icmp6)), action=(handle_svc_check(inport);)
> @@ -8560,7 +8562,9 @@ ovn_strip_lflows ], [0], [dnl
>     table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]] == 1), action=(drop;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport == "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport == "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
>     table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
>     table=??(ls_in_l2_lkup      ), priority=0    , match=(1), action=(outport = get_fdb(eth.dst); next;)
>     table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst == $svc_monitor_mac && (tcp || icmp || icmp6)), action=(handle_svc_check(inport);)
> @@ -8588,7 +8592,9 @@ ovn_strip_lflows ], [0], [dnl
>     table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(inport == "sw0p1"), action=(reg0[[15]] = 1; next;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport == "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport == "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
>     table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
>     table=??(ls_in_l2_lkup      ), priority=0    , match=(1), action=(outport = get_fdb(eth.dst); next;)
>     table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst == $svc_monitor_mac && (tcp || icmp || icmp6)), action=(handle_svc_check(inport);)
> @@ -8615,7 +8621,9 @@ ovn_strip_lflows ], [0], [dnl
>     table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(inport == "sw0p1"), action=(reg0[[15]] = 1; next;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport == "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport == "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
>     table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
>     table=??(ls_in_check_port_sec), priority=70   , match=(inport == "sw0p2"), action=(set_queue(10); reg0[[15]] = check_in_port_sec(); next;)
>     table=??(ls_in_l2_lkup      ), priority=0    , match=(1), action=(outport = get_fdb(eth.dst); next;)
> @@ -8645,7 +8653,9 @@ ovn_strip_lflows ], [0], [dnl
>     table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]] == 1), action=(drop;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
>     table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport == "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport == "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
>     table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
>     table=??(ls_in_check_port_sec), priority=70   , match=(inport == "localnetport"), action=(set_queue(10); reg0[[15]] = check_in_port_sec(); next;)
>     table=??(ls_in_check_port_sec), priority=70   , match=(inport == "sw0p1"), action=(reg0[[14]] = 1; next(pipeline=ingress, table=??);)
Dumitru Ceara March 28, 2024, 11:47 a.m. UTC | #2
On 3/15/24 20:34, Mark Michelson wrote:
> Hi Lorenzo,
> 
> Thanks for the fix.
> 
> Acked-by: Mark Michelson <mmichels@redhat.com>
> 

Hi Lorenzo, Mark,

I'm afraid there's a bug in this patch, please see below.

> When this is merged, the following should also be folded in:
> 
> ---
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index 17b414144..0cf1c2bb5 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -338,7 +338,7 @@
>      </p>
> 
>      <p>
> -      This table adds a priority-110 flow that matches 'recirculated' icmp{4,6}
> +      This table adds a priority-105 flow that matches 'recirculated' icmp{4,6}
>        error 'packet too big' to drop the packet.
>      </p>
> 
> ---
> 
> This accounts for the change of priority introduced in this patch.
> 
> I also noticed a small spelling mistake that should be corrected. I
> marked it below.
> 
> On 2/13/24 08:32, Lorenzo Bianconi wrote:
>> Similar to what is already implemented for routed e/w traffic,
>> introduce pmtud support for e/w traffic between two logical switch ports
>> connected to the same logical switch, but running on two different
>> hypervisors.
>>
>> Reported-at: https://issues.redhat.com/browse/FDP-362
>> Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
>> ---
>>   controller/lflow.h      |   1 +
>>   controller/physical.c   |  31 ++++++++-
>>   northd/northd.c         |  35 +++++++---
>>   northd/ovn-northd.8.xml |  14 +++-
>>   tests/multinode.at      | 151 ++++++++++++++++++++++++++++++++++++++++
>>   tests/ovn-northd.at     |  22 ++++--
>>   6 files changed, 236 insertions(+), 18 deletions(-)
>>
>> diff --git a/controller/lflow.h b/controller/lflow.h
>> index 9b7ffa19c..906a26280 100644
>> --- a/controller/lflow.h
>> +++ b/controller/lflow.h
>> @@ -94,6 +94,7 @@ struct uuid;
>>   #define OFTABLE_ECMP_NH                  77
>>   #define OFTABLE_CHK_LB_AFFINITY          78
>>   #define OFTABLE_MAC_CACHE_USE            79
>> +#define OFTABLE_CT_ZONE_LOOKUP           80
>>     struct lflow_ctx_in {
>>       struct ovsdb_idl_index *sbrec_multicast_group_by_name_datapath;
>> diff --git a/controller/physical.c b/controller/physical.c
>> index c32642d2c..6a9327b8d 100644
>> --- a/controller/physical.c
>> +++ b/controller/physical.c
>> @@ -2451,8 +2451,37 @@ physical_run(struct physical_ctx *p_ctx,
>>                                 p_ctx->n_encap_ips,
>>                                 p_ctx->encap_ips,
>>                                 flow_table, &ofpacts);

This whole chunk below should be part of consider_port_binding().
Otherwise, we fail to incrementally add flows for newly bound interfaces.

physical_run() is only called on recompute, while consider_port_binding()
is called in both paths (recompute and incremental processing of updates).

We should add a test case for this.

One way to hit the bug is to start an OVN sandbox.  Then:
$ ./ovn-setup.sh

$ ovn-sbctl show
Chassis chassis-1
    hostname: sandbox
    Encap geneve
        ip: "127.0.0.1"
        options: {csum="true"}
    Port_Binding sw1-port1
    Port_Binding sw0-port1

# Two ports are bound locally, we expect 2 non-default flows in table 80:
$ ovs-ofctl dump-flows br-int table=80
 cookie=0x0, duration=51.758s, table=80, n_packets=0, n_bytes=0, priority=100,reg14=0x1,metadata=0x1 actions=load:0x3->NXM_NX_REG13[0..15],load:0x2->NXM_NX_REG11[],load:0x7->NXM_NX_REG12[],resubmit(,8)
 cookie=0x0, duration=1.116s, table=80, n_packets=0, n_bytes=0, priority=100,reg14=0x1,metadata=0x2 actions=load:0x8->NXM_NX_REG13[0..15],load:0x6->NXM_NX_REG11[],load:0x5->NXM_NX_REG12[],resubmit(,8)
 cookie=0x0, duration=53.640s, table=80, n_packets=0, n_bytes=0, priority=0 actions=resubmit(,8)

# Add a new locally bound port:
$ ovn-nbctl lsp-add sw0 sw0-bar
$ ovs-vsctl add-port br-int sw0-bar -- set interface sw0-bar external_ids:iface-id=sw0-bar

$ ovn-sbctl show
Chassis chassis-1
    hostname: sandbox
    Encap geneve
        ip: "127.0.0.1"
        options: {csum="true"}
    Port_Binding sw1-port1
    Port_Binding sw0-port1
    Port_Binding sw0-bar

# Three ports are bound locally, we expect 3 non-default flows in table 80:
$ ovs-ofctl dump-flows br-int table=80
 cookie=0x0, duration=123.946s, table=80, n_packets=0, n_bytes=0, priority=100,reg14=0x1,metadata=0x1 actions=load:0x3->NXM_NX_REG13[0..15],load:0x2->NXM_NX_REG11[],load:0x7->NXM_NX_REG12[],resubmit(,8)
 cookie=0x0, duration=73.304s, table=80, n_packets=0, n_bytes=0, priority=100,reg14=0x1,metadata=0x2 actions=load:0x8->NXM_NX_REG13[0..15],load:0x6->NXM_NX_REG11[],load:0x5->NXM_NX_REG12[],resubmit(,8)
 cookie=0x0, duration=125.828s, table=80, n_packets=0, n_bytes=0, priority=0 actions=resubmit(,8)

# Only 2 are there.. trigger a recompute:
$ ovn-appctl recompute
$ ovs-ofctl dump-flows br-int table=80
 cookie=0x0, duration=167.205s, table=80, n_packets=0, n_bytes=0, priority=100,reg14=0x1,metadata=0x1 actions=load:0x3->NXM_NX_REG13[0..15],load:0x2->NXM_NX_REG11[],load:0x7->NXM_NX_REG12[],resubmit(,8)
 cookie=0x0, duration=116.563s, table=80, n_packets=0, n_bytes=0, priority=100,reg14=0x1,metadata=0x2 actions=load:0x8->NXM_NX_REG13[0..15],load:0x6->NXM_NX_REG11[],load:0x5->NXM_NX_REG12[],resubmit(,8)
 cookie=0x0, duration=2.673s, table=80, n_packets=0, n_bytes=0, priority=100,reg14=0x3,metadata=0x1 actions=load:0x9->NXM_NX_REG13[0..15],load:0x2->NXM_NX_REG11[],load:0x7->NXM_NX_REG12[],resubmit(,8)
 cookie=0x0, duration=169.087s, table=80, n_packets=0, n_bytes=0, priority=0 actions=resubmit(,8)

Regards,
Dumitru

>> +
>> +        if (!local_binding_get_primary_pb(p_ctx->local_bindings,
>> +                                          binding->logical_port)) {
>> +            continue;
>> +        }
>> +
>> +        /* Table 80, priority 100.
>> +         * =======================
>> +         *
>> +         * Process ICMP{4,6} error packets too big locally generalted
>> from the
> 
> s/generalted/generated/
> 
>> +         * kernel in order to lookup proper ct_zone. */
>> +        struct match match = MATCH_CATCHALL_INITIALIZER;
>> +        match_set_metadata(&match,
>> htonll(binding->datapath->tunnel_key));
>> +        match_set_reg(&match, MFF_LOG_INPORT - MFF_REG0,
>> binding->tunnel_key);
>> +
>> +        ofpbuf_clear(&ofpacts);
>> +        struct zone_ids zone_ids = get_zone_ids(binding,
>> p_ctx->ct_zones);
>> +        put_zones_ofpacts(&zone_ids, &ofpacts);
>> +        put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
>> +        ofctrl_add_flow(flow_table, OFTABLE_CT_ZONE_LOOKUP, 100, 0,
>> +                        &match, &ofpacts, hc_uuid);
>>       }
>>   +    /* Default flow for CT_ZONE_LOOKUP Table. */
>> +    struct match ct_look_def_match;
>> +    match_init_catchall(&ct_look_def_match);
>> +    ofpbuf_clear(&ofpacts);
>> +    put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
>> +    ofctrl_add_flow(flow_table, OFTABLE_CT_ZONE_LOOKUP, 0, 0,
>> +                    &ct_look_def_match, &ofpacts, hc_uuid);
>> +
>>       /* Handle output to multicast groups, in tables 40 and 41. */
>>       const struct sbrec_multicast_group *mc;
>>       SBREC_MULTICAST_GROUP_TABLE_FOR_EACH (mc, p_ctx->mc_group_table) {
>> @@ -2511,7 +2540,7 @@ physical_run(struct physical_ctx *p_ctx,
>>           /* Add specif flows for E/W ICMPv{4,6} packets if tunnelled
>> packets
>>            * do not fit path MTU.
>>            */
>> -        put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
>> +        put_resubmit(OFTABLE_CT_ZONE_LOOKUP, &ofpacts);
>>             /* IPv4 */
>>           match_init_catchall(&match);
>> diff --git a/northd/northd.c b/northd/northd.c
>> index a174a4dcd..34c56f95e 100644
>> --- a/northd/northd.c
>> +++ b/northd/northd.c
>> @@ -8634,7 +8634,7 @@ build_lswitch_lflows_admission_control(struct
>> ovn_datapath *od,
>>       ovs_assert(od->nbs);
>>         /* Default action for recirculated ICMP error 'packet too
>> big'. */
>> -    ovn_lflow_add(lflows, od, S_SWITCH_IN_CHECK_PORT_SEC, 110,
>> +    ovn_lflow_add(lflows, od, S_SWITCH_IN_CHECK_PORT_SEC, 105,
>>                     "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
>>                     " (ip6 && icmp6.type == 2 && icmp6.code == 0)) &&"
>>                     " flags.tunnel_rx == 1", debug_drop_action(),
>> lflow_ref);
>> @@ -11822,7 +11822,24 @@ build_lswitch_icmp_packet_toobig_admin_flows(
>>   {
>>       ovs_assert(op->nbsp);
>>   +    ds_clear(match);
>>       if (!lsp_is_router(op->nbsp)) {
>> +        struct eth_addr mac;
>> +        if (!op->nbsp->n_addresses ||
>> +            !ovs_scan(op->nbsp->addresses[0], ETH_ADDR_SCAN_FMT,
>> +                      ETH_ADDR_SCAN_ARGS(mac))) {
>> +            return;
>> +        }
>> +
>> +        ds_put_format(match,
>> +                      "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
>> +                      " (ip6 && icmp6.type == 2 && icmp6.code == 0)) &&"
>> +                      " eth.src == "ETH_ADDR_FMT" && outport == %s &&"
>> +                      " !is_chassis_resident(%s) && flags.tunnel_rx
>> == 1",
>> +                      ETH_ADDR_ARGS(mac), op->json_key, op->json_key);
>> +        ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 110,
>> +                      ds_cstr(match), "outport <-> inport; next;",
>> +                      op->lflow_ref);
>>           return;
>>       }
>>   @@ -11831,26 +11848,28 @@ build_lswitch_icmp_packet_toobig_admin_flows(
>>           return;
>>       }
>>   -    ds_clear(match);
>>       if (peer->od->is_gw_router) {
>>           ds_put_format(match,
>>                         "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
>>                         " (ip6 && icmp6.type == 2 && icmp6.code == 0))
>> && "
>>                         "eth.src == %s && outport == %s &&
>> flags.tunnel_rx == 1",
>>                         peer->nbrp->mac, op->json_key);
>> +        ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 120,
>> +                      ds_cstr(match), "outport <-> inport; next;",
>> +                      op->lflow_ref);
>>       } else {
>>           ds_put_format(match,
>>                         "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
>>                         " (ip6 && icmp6.type == 2 && icmp6.code == 0))
>> && "
>>                         "eth.dst == %s && flags.tunnel_rx == 1",
>>                         peer->nbrp->mac);
>> +        ds_clear(actions);
>> +        ds_put_format(actions,
>> +                      "outport <-> inport;
>> next(pipeline=ingress,table=%d);",
>> +                      ovn_stage_get_table(S_SWITCH_IN_L2_LKUP));
>> +        ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 120,
>> +                      ds_cstr(match), ds_cstr(actions), op->lflow_ref);
>>       }
>> -    ds_clear(actions);
>> -    ds_put_format(actions,
>> -                  "outport <-> inport;
>> next(pipeline=ingress,table=%d);",
>> -                  ovn_stage_get_table(S_SWITCH_IN_L2_LKUP));
>> -    ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 120,
>> -                  ds_cstr(match), ds_cstr(actions), op->lflow_ref);
>>   }
>>     static void
>> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
>> index 9583abeff..840a4f6a3 100644
>> --- a/northd/ovn-northd.8.xml
>> +++ b/northd/ovn-northd.8.xml
>> @@ -324,8 +324,7 @@
>>         'packet too big' and <code>eth.src == <var>D</var> &amp;&amp;
>>         outport == <var>P</var> &amp;&amp; flags.tunnel_rx == 1</code>
>> where
>>         <var>D</var> is the peer logical router port <var>RP</var> mac
>> address,
>> -      swaps inport and outport and applies the action <code>
>> -      next(pipeline=S_SWITCH_IN_L2_LKUP)</code>.
>> +      swaps inport and outport and applies the action <code>next</code>.
>>       </p>
>>         <p>
>> @@ -338,7 +337,16 @@
>>       </p>
>>         <p>
>> -      This table adds a priority-110 flow that matches 'recirculated'
>> icmp{4,6}
>> +      For each logical switch port <var>P</var> a priority-110 flow that
>> +      matches 'recirculated' icmp{4,6} error 'packet too big' and <code>
>> +      eth.src == <var>D</var> &amp;&amp; outport == <var>P</var>
>> &amp;&amp;
>> +      !is_chassis_resident("<var>P</var>") &amp;&amp; flags.tunnel_rx
>> == 1
>> +      </code> where <var>D</var> is the logical switch port mac address,
>> +      swaps inport and outport and applies the action <code>next</code>.
>> +    </p>
>> +
>> +    <p>
>> +      This table adds a priority-105 flow that matches 'recirculated'
>> icmp{4,6}
>>         error 'packet too big' to drop the packet.
>>       </p>
>>   diff --git a/tests/multinode.at b/tests/multinode.at
>> index 0187382be..ef40db9b7 100644
>> --- a/tests/multinode.at
>> +++ b/tests/multinode.at
>> @@ -154,6 +154,11 @@ check multinode_nbctl lr-nat-add lr0 snat
>> 172.20.0.100 20.0.0.0/24
>>   check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6' 
>> allow-related
>>   check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6' 
>> allow-related
>>   +# create LB
>> +check multinode_nbctl lb-add lb0 10.0.0.1:8080 10.0.0.4:8080 udp
>> +check multinode_nbctl ls-lb-add sw0 lb0
>> +M_NS_CHECK_EXEC([ovn-chassis-2], [sw0p2], [nc -u -l 8080 >/dev/null
>> 2>&1 &])
>> +
>>   m_as ovn-gw-1 ip netns add ovn-ext0
>>   m_as ovn-gw-1 ovs-vsctl add-port br-ex ext0 -- set interface ext0
>> type=internal
>>   m_as ovn-gw-1 ip link set ext0 netns ovn-ext0
>> @@ -207,6 +212,14 @@ M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping
>> -q -c 3 -i 0.3 -w 2 172.20.1.2 |
>>   3 packets transmitted, 3 received, 0% packet loss, time 0ms
>>   ])
>>   +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
>> +for i in $(seq 30); do
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2
>> if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
>> +done
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev
>> sw0p1 | grep -q 'mtu 942'])
>> +
>> +killall nc
>> +
>>   AT_CLEANUP
>>     AT_SETUP([ovn multinode pmtu - distributed router - vxlan])
>> @@ -696,6 +709,11 @@ check multinode_nbctl lr-nat-add lr0 snat
>> 172.20.0.100 20.0.0.0/24
>>   check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6' 
>> allow-related
>>   check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6' 
>> allow-related
>>   +# create LB
>> +check multinode_nbctl lb-add lb0 10.0.0.1:8080 20.0.0.3:8080 udp
>> +check multinode_nbctl lr-lb-add lr0 lb0
>> +M_NS_CHECK_EXEC([ovn-chassis-2], [sw1p1], [nc -u -l 8080 >/dev/null
>> 2>&1 &])
>> +
>>   m_as ovn-gw-1 ip netns add ovn-ext0
>>   m_as ovn-gw-1 ovs-vsctl add-port br-ex ext0 -- set interface ext0
>> type=internal
>>   m_as ovn-gw-1 ip link set ext0 netns ovn-ext0
>> @@ -751,6 +769,18 @@ M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping
>> -q -c 3 -i 0.3 -w 2 172.20.1.2 |
>>   M_NS_CHECK_EXEC([ovn-gw-1], [ovn-ext0], [ip link set dev ext1 mtu
>> 1100])
>>   M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 20 -i 0.5 -s 1300
>> -M do 172.20.1.2 2>&1 |grep -q "mtu = 1100"])
>>   +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24
>> dev sw0p1])
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via
>> 10.0.0.1 dev sw0p1])
>> +
>> +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
>> +for i in $(seq 30); do
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2
>> if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
>> +done
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev
>> sw0p1 | grep -q 'mtu 942'])
>> +
>> +killall nc
>> +
>>   AT_CLEANUP
>>     AT_SETUP([ovn multinode pmtu - gw router - vxlan])
>> @@ -834,6 +864,11 @@ check multinode_nbctl lr-nat-add lr0 snat
>> 172.20.0.100 20.0.0.0/24
>>   check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6' 
>> allow-related
>>   check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6' 
>> allow-related
>>   +# create LB
>> +check multinode_nbctl lb-add lb0 10.0.0.1:8080 20.0.0.3:8080 udp
>> +check multinode_nbctl lr-lb-add lr0 lb0
>> +M_NS_CHECK_EXEC([ovn-chassis-2], [sw1p1], [nc -u -l 8080 >/dev/null
>> 2>&1 &])
>> +
>>   m_as ovn-gw-1 ip netns add ovn-ext0
>>   m_as ovn-gw-1 ovs-vsctl add-port br-ex ext0 -- set interface ext0
>> type=internal
>>   m_as ovn-gw-1 ip link set ext0 netns ovn-ext0
>> @@ -882,4 +917,120 @@ M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping
>> -q -c 3 -i 0.3 -w 2 172.20.1.2 |
>>     M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 20 -i 0.5 -s
>> 1300 -M do 172.20.1.2 2>&1 |grep -q "mtu = 1150"])
>>   +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24
>> dev sw0p1])
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via
>> 10.0.0.1 dev sw0p1])
>> +
>> +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
>> +for i in $(seq 30); do
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2
>> if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
>> +done
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev
>> sw0p1 | grep -q 'mtu 950'])
>> +
>> +killall nc
>> +
>> +AT_CLEANUP
>> +
>> +AT_SETUP([ovn multinode pmtu - logical switch - geneve])
>> +
>> +# Check that ovn-fake-multinode setup is up and running
>> +check_fake_multinode_setup
>> +
>> +# Delete the multinode NB and OVS resources before starting the test.
>> +cleanup_multinode_resources
>> +
>> +m_as ovn-chassis-1 ip link del sw0p1-p
>> +m_as ovn-chassis-2 ip link del sw0p2-p
>> +
>> +# Reset geneve tunnels
>> +for c in ovn-chassis-1 ovn-chassis-2 ovn-gw-1
>> +do
>> +    m_as $c ovs-vsctl set open . external-ids:ovn-encap-type=geneve
>> +done
>> +
>> +OVS_WAIT_UNTIL([m_as ovn-chassis-1 ip link show | grep -q genev_sys])
>> +OVS_WAIT_UNTIL([m_as ovn-chassis-2 ip link show | grep -q genev_sys])
>> +OVS_WAIT_UNTIL([m_as ovn-gw-1 ip link show | grep -q genev_sys])
>> +
>> +# Test East-West switching
>> +check multinode_nbctl ls-add sw0
>> +check multinode_nbctl lsp-add sw0 sw0-port1
>> +check multinode_nbctl lsp-set-addresses sw0-port1 "50:54:00:00:00:03
>> 10.0.0.3 1000::3"
>> +check multinode_nbctl lsp-add sw0 sw0-port2
>> +check multinode_nbctl lsp-set-addresses sw0-port2 "50:54:00:00:00:04
>> 10.0.0.4 1000::4"
>> +
>> +m_as ovn-chassis-1 /data/create_fake_vm.sh sw0-port1 sw0p1
>> 50:54:00:00:00:03 10.0.0.3 24 10.0.0.1 1000::3/64 1000::a
>> +m_as ovn-chassis-2 /data/create_fake_vm.sh sw0-port2 sw0p2
>> 50:54:00:00:00:04 10.0.0.4 24 10.0.0.1 1000::4/64 1000::a
>> +
>> +# Create the second logical switch with one port
>> +check multinode_nbctl ls-add sw1
>> +check multinode_nbctl lsp-add sw1 sw1-port1
>> +check multinode_nbctl lsp-set-addresses sw1-port1 "40:54:00:00:00:03
>> 20.0.0.3 2000::3"
>> +
>> +# Create a logical router and attach both logical switches
>> +check multinode_nbctl lr-add lr0
>> +check multinode_nbctl lrp-add lr0 lr0-sw0 00:00:00:00:ff:01
>> 10.0.0.1/24 1000::a/64
>> +check multinode_nbctl lsp-add sw0 sw0-lr0
>> +check multinode_nbctl lsp-set-type sw0-lr0 router
>> +check multinode_nbctl lsp-set-addresses sw0-lr0 router
>> +check multinode_nbctl lsp-set-options sw0-lr0 router-port=lr0-sw0
>> +
>> +check multinode_nbctl lrp-add lr0 lr0-sw1 00:00:00:00:ff:02
>> 20.0.0.1/24 2000::a/64
>> +check multinode_nbctl lsp-add sw1 sw1-lr0
>> +check multinode_nbctl lsp-set-type sw1-lr0 router
>> +check multinode_nbctl lsp-set-addresses sw1-lr0 router
>> +check multinode_nbctl lsp-set-options sw1-lr0 router-port=lr0-sw1
>> +
>> +m_as ovn-chassis-2 /data/create_fake_vm.sh sw1-port1 sw1p1
>> 40:54:00:00:00:03 20.0.0.3 24 20.0.0.1 2000::3/64 2000::a
>> +
>> +check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 10.0.0.0/24
>> +check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 20.0.0.0/24
>> +
>> +check multinode_nbctl lrp-set-gateway-chassis lr0-sw0 ovn-chassis-1 10
>> +check multinode_nbctl lrp-set-gateway-chassis lr0-sw1 ovn-chassis-2 10
>> +
>> +# create some ACLs
>> +check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6' 
>> allow-related
>> +check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6' 
>> allow-related
>> +
>> +check multinode_nbctl lb-add lb0 10.0.0.1:8080 10.0.0.4:8080 udp
>> +check multinode_nbctl ls-lb-add sw0 lb0
>> +M_NS_CHECK_EXEC([ovn-chassis-2], [sw0p2], [nc -u -l 8080 >/dev/null
>> 2>&1 &])
>> +
>> +m_wait_for_ports_up
>> +
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2
>> 10.0.0.4 | FORMAT_PING], \
>> +[0], [dnl
>> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
>> +])
>> +
>> +# Change ptmu for the geneve tunnel
>> +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1200 dev eth1
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do
>> 10.0.0.4 2>&1 |grep -q "message too long, mtu=1142"])
>> +
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24
>> dev sw0p1])
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via
>> 10.0.0.1 dev sw0p1])
>> +
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2
>> 20.0.0.3 | FORMAT_PING], \
>> +[0], [dnl
>> +3 packets transmitted, 3 received, 0% packet loss, time 0ms
>> +])
>> +
>> +# Change ptmu for the geneve tunnel
>> +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1100 dev eth1
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do
>> 20.0.0.3 2>&1 |grep -q "message too long, mtu=1042"])
>> +
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24
>> dev sw0p1])
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via
>> 10.0.0.1 dev sw0p1])
>> +
>> +m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
>> +for i in $(seq 30); do
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2
>> if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
>> +done
>> +M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev
>> sw0p1 | grep -q 'mtu 942'])
>> +
>> +killall nc
>> +
>>   AT_CLEANUP
>> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
>> index 591ad5aad..b04cc4893 100644
>> --- a/tests/ovn-northd.at
>> +++ b/tests/ovn-northd.at
>> @@ -8507,7 +8507,7 @@ ovn_strip_lflows ], [0], [dnl
>>     table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]]
>> == 1), action=(drop;)
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(eth.src[[40]]), action=(drop;)
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(vlan.present), action=(drop;)
>> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>>     table=??(ls_in_check_port_sec), priority=50   , match=(1),
>> action=(reg0[[15]] = check_in_port_sec(); next;)
>>     table=??(ls_in_l2_lkup      ), priority=0    , match=(1),
>> action=(outport = get_fdb(eth.dst); next;)
>>     table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst ==
>> $svc_monitor_mac && (tcp || icmp || icmp6)),
>> action=(handle_svc_check(inport);)
>> @@ -8533,7 +8533,9 @@ ovn_strip_lflows ], [0], [dnl
>>     table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]]
>> == 1), action=(drop;)
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(eth.src[[40]]), action=(drop;)
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(vlan.present), action=(drop;)
>> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport ==
>> "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1),
>> action=(outport <-> inport; next;)
>> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport ==
>> "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1),
>> action=(outport <-> inport; next;)
>>     table=??(ls_in_check_port_sec), priority=50   , match=(1),
>> action=(reg0[[15]] = check_in_port_sec(); next;)
>>     table=??(ls_in_l2_lkup      ), priority=0    , match=(1),
>> action=(outport = get_fdb(eth.dst); next;)
>>     table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst ==
>> $svc_monitor_mac && (tcp || icmp || icmp6)),
>> action=(handle_svc_check(inport);)
>> @@ -8560,7 +8562,9 @@ ovn_strip_lflows ], [0], [dnl
>>     table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]]
>> == 1), action=(drop;)
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(eth.src[[40]]), action=(drop;)
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(vlan.present), action=(drop;)
>> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport ==
>> "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1),
>> action=(outport <-> inport; next;)
>> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport ==
>> "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1),
>> action=(outport <-> inport; next;)
>>     table=??(ls_in_check_port_sec), priority=50   , match=(1),
>> action=(reg0[[15]] = check_in_port_sec(); next;)
>>     table=??(ls_in_l2_lkup      ), priority=0    , match=(1),
>> action=(outport = get_fdb(eth.dst); next;)
>>     table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst ==
>> $svc_monitor_mac && (tcp || icmp || icmp6)),
>> action=(handle_svc_check(inport);)
>> @@ -8588,7 +8592,9 @@ ovn_strip_lflows ], [0], [dnl
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(eth.src[[40]]), action=(drop;)
>>     table=??(ls_in_check_port_sec), priority=100  , match=(inport ==
>> "sw0p1"), action=(reg0[[15]] = 1; next;)
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(vlan.present), action=(drop;)
>> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport ==
>> "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1),
>> action=(outport <-> inport; next;)
>> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport ==
>> "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1),
>> action=(outport <-> inport; next;)
>>     table=??(ls_in_check_port_sec), priority=50   , match=(1),
>> action=(reg0[[15]] = check_in_port_sec(); next;)
>>     table=??(ls_in_l2_lkup      ), priority=0    , match=(1),
>> action=(outport = get_fdb(eth.dst); next;)
>>     table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst ==
>> $svc_monitor_mac && (tcp || icmp || icmp6)),
>> action=(handle_svc_check(inport);)
>> @@ -8615,7 +8621,9 @@ ovn_strip_lflows ], [0], [dnl
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(eth.src[[40]]), action=(drop;)
>>     table=??(ls_in_check_port_sec), priority=100  , match=(inport ==
>> "sw0p1"), action=(reg0[[15]] = 1; next;)
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(vlan.present), action=(drop;)
>> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport ==
>> "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1),
>> action=(outport <-> inport; next;)
>> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport ==
>> "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1),
>> action=(outport <-> inport; next;)
>>     table=??(ls_in_check_port_sec), priority=50   , match=(1),
>> action=(reg0[[15]] = check_in_port_sec(); next;)
>>     table=??(ls_in_check_port_sec), priority=70   , match=(inport ==
>> "sw0p2"), action=(set_queue(10); reg0[[15]] = check_in_port_sec(); next;)
>>     table=??(ls_in_l2_lkup      ), priority=0    , match=(1),
>> action=(outport = get_fdb(eth.dst); next;)
>> @@ -8645,7 +8653,9 @@ ovn_strip_lflows ], [0], [dnl
>>     table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]]
>> == 1), action=(drop;)
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(eth.src[[40]]), action=(drop;)
>>     table=??(ls_in_check_port_sec), priority=100  ,
>> match=(vlan.present), action=(drop;)
>> -  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>> +  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
>> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport ==
>> "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1),
>> action=(outport <-> inport; next;)
>> +  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 &&
>> icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 &&
>> icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport ==
>> "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1),
>> action=(outport <-> inport; next;)
>>     table=??(ls_in_check_port_sec), priority=50   , match=(1),
>> action=(reg0[[15]] = check_in_port_sec(); next;)
>>     table=??(ls_in_check_port_sec), priority=70   , match=(inport ==
>> "localnetport"), action=(set_queue(10); reg0[[15]] =
>> check_in_port_sec(); next;)
>>     table=??(ls_in_check_port_sec), priority=70   , match=(inport ==
>> "sw0p1"), action=(reg0[[14]] = 1; next(pipeline=ingress, table=??);)
>
diff mbox series

Patch

diff --git a/controller/lflow.h b/controller/lflow.h
index 9b7ffa19c..906a26280 100644
--- a/controller/lflow.h
+++ b/controller/lflow.h
@@ -94,6 +94,7 @@  struct uuid;
 #define OFTABLE_ECMP_NH                  77
 #define OFTABLE_CHK_LB_AFFINITY          78
 #define OFTABLE_MAC_CACHE_USE            79
+#define OFTABLE_CT_ZONE_LOOKUP           80
 
 struct lflow_ctx_in {
     struct ovsdb_idl_index *sbrec_multicast_group_by_name_datapath;
diff --git a/controller/physical.c b/controller/physical.c
index c32642d2c..6a9327b8d 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -2451,8 +2451,37 @@  physical_run(struct physical_ctx *p_ctx,
                               p_ctx->n_encap_ips,
                               p_ctx->encap_ips,
                               flow_table, &ofpacts);
+
+        if (!local_binding_get_primary_pb(p_ctx->local_bindings,
+                                          binding->logical_port)) {
+            continue;
+        }
+
+        /* Table 80, priority 100.
+         * =======================
+         *
+         * Process ICMP{4,6} error packets too big locally generated from the
+         * kernel in order to lookup proper ct_zone. */
+        struct match match = MATCH_CATCHALL_INITIALIZER;
+        match_set_metadata(&match, htonll(binding->datapath->tunnel_key));
+        match_set_reg(&match, MFF_LOG_INPORT - MFF_REG0, binding->tunnel_key);
+
+        ofpbuf_clear(&ofpacts);
+        struct zone_ids zone_ids = get_zone_ids(binding, p_ctx->ct_zones);
+        put_zones_ofpacts(&zone_ids, &ofpacts);
+        put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
+        ofctrl_add_flow(flow_table, OFTABLE_CT_ZONE_LOOKUP, 100, 0,
+                        &match, &ofpacts, hc_uuid);
     }
 
+    /* Default flow for CT_ZONE_LOOKUP Table. */
+    struct match ct_look_def_match;
+    match_init_catchall(&ct_look_def_match);
+    ofpbuf_clear(&ofpacts);
+    put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
+    ofctrl_add_flow(flow_table, OFTABLE_CT_ZONE_LOOKUP, 0, 0,
+                    &ct_look_def_match, &ofpacts, hc_uuid);
+
     /* Handle output to multicast groups, in tables 40 and 41. */
     const struct sbrec_multicast_group *mc;
     SBREC_MULTICAST_GROUP_TABLE_FOR_EACH (mc, p_ctx->mc_group_table) {
@@ -2511,7 +2540,7 @@  physical_run(struct physical_ctx *p_ctx,
         /* Add specif flows for E/W ICMPv{4,6} packets if tunnelled packets
          * do not fit path MTU.
          */
-        put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &ofpacts);
+        put_resubmit(OFTABLE_CT_ZONE_LOOKUP, &ofpacts);
 
         /* IPv4 */
         match_init_catchall(&match);
diff --git a/northd/northd.c b/northd/northd.c
index a174a4dcd..34c56f95e 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -8634,7 +8634,7 @@  build_lswitch_lflows_admission_control(struct ovn_datapath *od,
     ovs_assert(od->nbs);
 
     /* Default action for recirculated ICMP error 'packet too big'. */
-    ovn_lflow_add(lflows, od, S_SWITCH_IN_CHECK_PORT_SEC, 110,
+    ovn_lflow_add(lflows, od, S_SWITCH_IN_CHECK_PORT_SEC, 105,
                   "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
                   " (ip6 && icmp6.type == 2 && icmp6.code == 0)) &&"
                   " flags.tunnel_rx == 1", debug_drop_action(), lflow_ref);
@@ -11822,7 +11822,24 @@  build_lswitch_icmp_packet_toobig_admin_flows(
 {
     ovs_assert(op->nbsp);
 
+    ds_clear(match);
     if (!lsp_is_router(op->nbsp)) {
+        struct eth_addr mac;
+        if (!op->nbsp->n_addresses ||
+            !ovs_scan(op->nbsp->addresses[0], ETH_ADDR_SCAN_FMT,
+                      ETH_ADDR_SCAN_ARGS(mac))) {
+            return;
+        }
+
+        ds_put_format(match,
+                      "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
+                      " (ip6 && icmp6.type == 2 && icmp6.code == 0)) &&"
+                      " eth.src == "ETH_ADDR_FMT" && outport == %s &&"
+                      " !is_chassis_resident(%s) && flags.tunnel_rx == 1",
+                      ETH_ADDR_ARGS(mac), op->json_key, op->json_key);
+        ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 110,
+                      ds_cstr(match), "outport <-> inport; next;",
+                      op->lflow_ref);
         return;
     }
 
@@ -11831,26 +11848,28 @@  build_lswitch_icmp_packet_toobig_admin_flows(
         return;
     }
 
-    ds_clear(match);
     if (peer->od->is_gw_router) {
         ds_put_format(match,
                       "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
                       " (ip6 && icmp6.type == 2 && icmp6.code == 0)) && "
                       "eth.src == %s && outport == %s && flags.tunnel_rx == 1",
                       peer->nbrp->mac, op->json_key);
+        ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 120,
+                      ds_cstr(match), "outport <-> inport; next;",
+                      op->lflow_ref);
     } else {
         ds_put_format(match,
                       "((ip4 && icmp4.type == 3 && icmp4.code == 4) ||"
                       " (ip6 && icmp6.type == 2 && icmp6.code == 0)) && "
                       "eth.dst == %s && flags.tunnel_rx == 1",
                       peer->nbrp->mac);
+        ds_clear(actions);
+        ds_put_format(actions,
+                      "outport <-> inport; next(pipeline=ingress,table=%d);",
+                      ovn_stage_get_table(S_SWITCH_IN_L2_LKUP));
+        ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 120,
+                      ds_cstr(match), ds_cstr(actions), op->lflow_ref);
     }
-    ds_clear(actions);
-    ds_put_format(actions,
-                  "outport <-> inport; next(pipeline=ingress,table=%d);",
-                  ovn_stage_get_table(S_SWITCH_IN_L2_LKUP));
-    ovn_lflow_add(lflows, op->od, S_SWITCH_IN_CHECK_PORT_SEC, 120,
-                  ds_cstr(match), ds_cstr(actions), op->lflow_ref);
 }
 
 static void
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 9583abeff..840a4f6a3 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -324,8 +324,7 @@ 
       'packet too big' and <code>eth.src == <var>D</var> &amp;&amp;
       outport == <var>P</var> &amp;&amp; flags.tunnel_rx == 1</code> where
       <var>D</var> is the peer logical router port <var>RP</var> mac address,
-      swaps inport and outport and applies the action <code>
-      next(pipeline=S_SWITCH_IN_L2_LKUP)</code>.
+      swaps inport and outport and applies the action <code>next</code>.
     </p>
 
     <p>
@@ -338,7 +337,16 @@ 
     </p>
 
     <p>
-      This table adds a priority-110 flow that matches 'recirculated' icmp{4,6}
+      For each logical switch port <var>P</var> a priority-110 flow that
+      matches 'recirculated' icmp{4,6} error 'packet too big' and <code>
+      eth.src == <var>D</var> &amp;&amp; outport == <var>P</var> &amp;&amp;
+      !is_chassis_resident("<var>P</var>") &amp;&amp; flags.tunnel_rx == 1
+      </code> where <var>D</var> is the logical switch port mac address,
+      swaps inport and outport and applies the action <code>next</code>.
+    </p>
+
+    <p>
+      This table adds a priority-105 flow that matches 'recirculated' icmp{4,6}
       error 'packet too big' to drop the packet.
     </p>
 
diff --git a/tests/multinode.at b/tests/multinode.at
index 0187382be..ef40db9b7 100644
--- a/tests/multinode.at
+++ b/tests/multinode.at
@@ -154,6 +154,11 @@  check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 20.0.0.0/24
 check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6'  allow-related
 check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6'  allow-related
 
+# create LB
+check multinode_nbctl lb-add lb0 10.0.0.1:8080 10.0.0.4:8080 udp
+check multinode_nbctl ls-lb-add sw0 lb0
+M_NS_CHECK_EXEC([ovn-chassis-2], [sw0p2], [nc -u -l 8080 >/dev/null 2>&1 &])
+
 m_as ovn-gw-1 ip netns add ovn-ext0
 m_as ovn-gw-1 ovs-vsctl add-port br-ex ext0 -- set interface ext0 type=internal
 m_as ovn-gw-1 ip link set ext0 netns ovn-ext0
@@ -207,6 +212,14 @@  M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 172.20.1.2 |
 3 packets transmitted, 3 received, 0% packet loss, time 0ms
 ])
 
+m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
+for i in $(seq 30); do
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2 if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
+done
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q 'mtu 942'])
+
+killall nc
+
 AT_CLEANUP
 
 AT_SETUP([ovn multinode pmtu - distributed router - vxlan])
@@ -696,6 +709,11 @@  check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 20.0.0.0/24
 check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6'  allow-related
 check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6'  allow-related
 
+# create LB
+check multinode_nbctl lb-add lb0 10.0.0.1:8080 20.0.0.3:8080 udp
+check multinode_nbctl lr-lb-add lr0 lb0
+M_NS_CHECK_EXEC([ovn-chassis-2], [sw1p1], [nc -u -l 8080 >/dev/null 2>&1 &])
+
 m_as ovn-gw-1 ip netns add ovn-ext0
 m_as ovn-gw-1 ovs-vsctl add-port br-ex ext0 -- set interface ext0 type=internal
 m_as ovn-gw-1 ip link set ext0 netns ovn-ext0
@@ -751,6 +769,18 @@  M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 172.20.1.2 |
 M_NS_CHECK_EXEC([ovn-gw-1], [ovn-ext0], [ip link set dev ext1 mtu 1100])
 M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 20 -i 0.5 -s 1300 -M do 172.20.1.2 2>&1 |grep -q "mtu = 1100"])
 
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1])
+
+m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
+for i in $(seq 30); do
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2 if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
+done
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q 'mtu 942'])
+
+killall nc
+
 AT_CLEANUP
 
 AT_SETUP([ovn multinode pmtu - gw router - vxlan])
@@ -834,6 +864,11 @@  check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 20.0.0.0/24
 check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6'  allow-related
 check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6'  allow-related
 
+# create LB
+check multinode_nbctl lb-add lb0 10.0.0.1:8080 20.0.0.3:8080 udp
+check multinode_nbctl lr-lb-add lr0 lb0
+M_NS_CHECK_EXEC([ovn-chassis-2], [sw1p1], [nc -u -l 8080 >/dev/null 2>&1 &])
+
 m_as ovn-gw-1 ip netns add ovn-ext0
 m_as ovn-gw-1 ovs-vsctl add-port br-ex ext0 -- set interface ext0 type=internal
 m_as ovn-gw-1 ip link set ext0 netns ovn-ext0
@@ -882,4 +917,120 @@  M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 172.20.1.2 |
 
 M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 20 -i 0.5 -s 1300 -M do 172.20.1.2 2>&1 |grep -q "mtu = 1150"])
 
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1])
+
+m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
+for i in $(seq 30); do
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2 if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
+done
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q 'mtu 950'])
+
+killall nc
+
+AT_CLEANUP
+
+AT_SETUP([ovn multinode pmtu - logical switch - geneve])
+
+# Check that ovn-fake-multinode setup is up and running
+check_fake_multinode_setup
+
+# Delete the multinode NB and OVS resources before starting the test.
+cleanup_multinode_resources
+
+m_as ovn-chassis-1 ip link del sw0p1-p
+m_as ovn-chassis-2 ip link del sw0p2-p
+
+# Reset geneve tunnels
+for c in ovn-chassis-1 ovn-chassis-2 ovn-gw-1
+do
+    m_as $c ovs-vsctl set open . external-ids:ovn-encap-type=geneve
+done
+
+OVS_WAIT_UNTIL([m_as ovn-chassis-1 ip link show | grep -q genev_sys])
+OVS_WAIT_UNTIL([m_as ovn-chassis-2 ip link show | grep -q genev_sys])
+OVS_WAIT_UNTIL([m_as ovn-gw-1 ip link show | grep -q genev_sys])
+
+# Test East-West switching
+check multinode_nbctl ls-add sw0
+check multinode_nbctl lsp-add sw0 sw0-port1
+check multinode_nbctl lsp-set-addresses sw0-port1 "50:54:00:00:00:03 10.0.0.3 1000::3"
+check multinode_nbctl lsp-add sw0 sw0-port2
+check multinode_nbctl lsp-set-addresses sw0-port2 "50:54:00:00:00:04 10.0.0.4 1000::4"
+
+m_as ovn-chassis-1 /data/create_fake_vm.sh sw0-port1 sw0p1 50:54:00:00:00:03 10.0.0.3 24 10.0.0.1 1000::3/64 1000::a
+m_as ovn-chassis-2 /data/create_fake_vm.sh sw0-port2 sw0p2 50:54:00:00:00:04 10.0.0.4 24 10.0.0.1 1000::4/64 1000::a
+
+# Create the second logical switch with one port
+check multinode_nbctl ls-add sw1
+check multinode_nbctl lsp-add sw1 sw1-port1
+check multinode_nbctl lsp-set-addresses sw1-port1 "40:54:00:00:00:03 20.0.0.3 2000::3"
+
+# Create a logical router and attach both logical switches
+check multinode_nbctl lr-add lr0
+check multinode_nbctl lrp-add lr0 lr0-sw0 00:00:00:00:ff:01 10.0.0.1/24 1000::a/64
+check multinode_nbctl lsp-add sw0 sw0-lr0
+check multinode_nbctl lsp-set-type sw0-lr0 router
+check multinode_nbctl lsp-set-addresses sw0-lr0 router
+check multinode_nbctl lsp-set-options sw0-lr0 router-port=lr0-sw0
+
+check multinode_nbctl lrp-add lr0 lr0-sw1 00:00:00:00:ff:02 20.0.0.1/24 2000::a/64
+check multinode_nbctl lsp-add sw1 sw1-lr0
+check multinode_nbctl lsp-set-type sw1-lr0 router
+check multinode_nbctl lsp-set-addresses sw1-lr0 router
+check multinode_nbctl lsp-set-options sw1-lr0 router-port=lr0-sw1
+
+m_as ovn-chassis-2 /data/create_fake_vm.sh sw1-port1 sw1p1 40:54:00:00:00:03 20.0.0.3 24 20.0.0.1 2000::3/64 2000::a
+
+check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 10.0.0.0/24
+check multinode_nbctl lr-nat-add lr0 snat 172.20.0.100 20.0.0.0/24
+
+check multinode_nbctl lrp-set-gateway-chassis lr0-sw0 ovn-chassis-1 10
+check multinode_nbctl lrp-set-gateway-chassis lr0-sw1 ovn-chassis-2 10
+
+# create some ACLs
+check multinode_nbctl acl-add sw0 from-lport 1002 'ip4 || ip6'  allow-related
+check multinode_nbctl acl-add sw1 from-lport 1002 'ip4 || ip6'  allow-related
+
+check multinode_nbctl lb-add lb0 10.0.0.1:8080 10.0.0.4:8080 udp
+check multinode_nbctl ls-lb-add sw0 lb0
+M_NS_CHECK_EXEC([ovn-chassis-2], [sw0p2], [nc -u -l 8080 >/dev/null 2>&1 &])
+
+m_wait_for_ports_up
+
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 10.0.0.4 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+# Change pmtu for the geneve tunnel
+m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1200 dev eth1
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 10.0.0.4 2>&1 |grep -q "message too long, mtu=1142"])
+
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1])
+
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -q -c 3 -i 0.3 -w 2 20.0.0.3 | FORMAT_PING], \
+[0], [dnl
+3 packets transmitted, 3 received, 0% packet loss, time 0ms
+])
+
+# Change pmtu for the geneve tunnel
+m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1100 dev eth1
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ping -c 5 -s 1300 -M do 20.0.0.3 2>&1 |grep -q "message too long, mtu=1042"])
+
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route flush dev sw0p1])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add 10.0.0.0/24 dev sw0p1])
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route add default via 10.0.0.1 dev sw0p1])
+
+m_as ovn-chassis-1 ip route change 170.168.0.0/16 mtu 1000 dev eth1
+for i in $(seq 30); do
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [sh -c 'dd bs=512 count=2 if=/dev/urandom |nc -u 10.0.0.1 8080'], [ignore], [ignore], [ignore])
+done
+M_NS_CHECK_EXEC([ovn-chassis-1], [sw0p1], [ip route get 10.0.0.1 dev sw0p1 | grep -q 'mtu 942'])
+
+killall nc
+
 AT_CLEANUP
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index 591ad5aad..b04cc4893 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -8507,7 +8507,7 @@  ovn_strip_lflows ], [0], [dnl
   table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]] == 1), action=(drop;)
   table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
   table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
-  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
+  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
   table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
   table=??(ls_in_l2_lkup      ), priority=0    , match=(1), action=(outport = get_fdb(eth.dst); next;)
   table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst == $svc_monitor_mac && (tcp || icmp || icmp6)), action=(handle_svc_check(inport);)
@@ -8533,7 +8533,9 @@  ovn_strip_lflows ], [0], [dnl
   table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]] == 1), action=(drop;)
   table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
   table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
-  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
+  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
+  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport == "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
+  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport == "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
   table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
   table=??(ls_in_l2_lkup      ), priority=0    , match=(1), action=(outport = get_fdb(eth.dst); next;)
   table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst == $svc_monitor_mac && (tcp || icmp || icmp6)), action=(handle_svc_check(inport);)
@@ -8560,7 +8562,9 @@  ovn_strip_lflows ], [0], [dnl
   table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]] == 1), action=(drop;)
   table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
   table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
-  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
+  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
+  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport == "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
+  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport == "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
   table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
   table=??(ls_in_l2_lkup      ), priority=0    , match=(1), action=(outport = get_fdb(eth.dst); next;)
   table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst == $svc_monitor_mac && (tcp || icmp || icmp6)), action=(handle_svc_check(inport);)
@@ -8588,7 +8592,9 @@  ovn_strip_lflows ], [0], [dnl
   table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
   table=??(ls_in_check_port_sec), priority=100  , match=(inport == "sw0p1"), action=(reg0[[15]] = 1; next;)
   table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
-  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
+  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
+  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport == "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
+  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport == "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
   table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
   table=??(ls_in_l2_lkup      ), priority=0    , match=(1), action=(outport = get_fdb(eth.dst); next;)
   table=??(ls_in_l2_lkup      ), priority=110  , match=(eth.dst == $svc_monitor_mac && (tcp || icmp || icmp6)), action=(handle_svc_check(inport);)
@@ -8615,7 +8621,9 @@  ovn_strip_lflows ], [0], [dnl
   table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
   table=??(ls_in_check_port_sec), priority=100  , match=(inport == "sw0p1"), action=(reg0[[15]] = 1; next;)
   table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
-  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
+  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
+  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport == "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
+  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport == "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
   table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
   table=??(ls_in_check_port_sec), priority=70   , match=(inport == "sw0p2"), action=(set_queue(10); reg0[[15]] = check_in_port_sec(); next;)
   table=??(ls_in_l2_lkup      ), priority=0    , match=(1), action=(outport = get_fdb(eth.dst); next;)
@@ -8645,7 +8653,9 @@  ovn_strip_lflows ], [0], [dnl
   table=??(ls_in_apply_port_sec), priority=50   , match=(reg0[[15]] == 1), action=(drop;)
   table=??(ls_in_check_port_sec), priority=100  , match=(eth.src[[40]]), action=(drop;)
   table=??(ls_in_check_port_sec), priority=100  , match=(vlan.present), action=(drop;)
-  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
+  table=??(ls_in_check_port_sec), priority=105  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && flags.tunnel_rx == 1), action=(drop;)
+  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:01 && outport == "sw0p1" && !is_chassis_resident("sw0p1") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
+  table=??(ls_in_check_port_sec), priority=110  , match=(((ip4 && icmp4.type == 3 && icmp4.code == 4) || (ip6 && icmp6.type == 2 && icmp6.code == 0)) && eth.src == 00:00:00:00:00:02 && outport == "sw0p2" && !is_chassis_resident("sw0p2") && flags.tunnel_rx == 1), action=(outport <-> inport; next;)
   table=??(ls_in_check_port_sec), priority=50   , match=(1), action=(reg0[[15]] = check_in_port_sec(); next;)
   table=??(ls_in_check_port_sec), priority=70   , match=(inport == "localnetport"), action=(set_queue(10); reg0[[15]] = check_in_port_sec(); next;)
   table=??(ls_in_check_port_sec), priority=70   , match=(inport == "sw0p1"), action=(reg0[[14]] = 1; next(pipeline=ingress, table=??);)