[ovs-dev,branch-22.09,5/5] Implement MTU Path Discovery for multichassis ports

Message ID 20230531201041.3541525-5-ihrachys@redhat.com
State Accepted
Series [ovs-dev,branch-22.09,1/5] Track ip version of tunnel in chassis_tunnel struct

Checks

Context Check Description
ovsrobot/apply-robot warning apply and check: warning
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/github-robot-_ovn-kubernetes fail github build: failed

Commit Message

Ihar Hrachyshka May 31, 2023, 8:10 p.m. UTC
When a multichassis port belongs to a switch with a localnet port,
packets originating from or directed to the multichassis port are NOT
sent through the localnet port. Instead, tunneling is enforced in-cluster
to guarantee delivery of all packets to all chassis of the port.

This behavior has an unfortunate side effect: because of the additional
tunnel header added to each packet, the effective path MTU for
multichassis ports differs from what is set as mtu_request. As a result,
OVN black-holes any packets for the port that use the full capacity of
the interface MTU. This breaks common TCP and UDP services (SSH, iperf
sessions, etc.), among other things.

This patch adds flows (sketched below) so that:
- table 38 detects too-big packets, and then
- table 39 sends ICMP Fragmentation Needed / Packet Too Big errors back
  to the offending port.
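
Conceptually, the new per-port flows look roughly as follows (an
illustrative sketch only; the exact matches and register layout come
from controller/physical.c below, and each pair is installed twice,
once matching the logical inport in reg14 and once the logical outport
in reg15):

  table=38, priority=100, ip, metadata=<datapath key>, reg15=<port key>
    actions=check_pkt_larger(<effective MTU>)->reg9[1], resubmit(,39)
  table=39, priority=100, ip, metadata=<datapath key>, reg15=<port key>, reg9[1]=1
    actions=controller(...)   (swap eth/ip addresses, build the ICMP
                               error, resubmit to the ingress pipeline)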

Once the error is received, the sender is expected to adjust the route
MTU accordingly and send subsequent packets with the new path MTU. After
a multichassis port is re-assigned to a single chassis, the effective
path MTU is restored to the usual value. Peers will eventually see their
learned path MTU cache entries expire, which makes them switch back to
the usual MTU.
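
For illustration, with the default 1500-byte interface MTU and the
tunnel overhead constants introduced in lib/ovn-util.h below, the MTU
advertised in the ICMP error for a Geneve tunnel over IPv4 works out to:

  1500 (interface MTU)
  -  38 (GENEVE_TUNNEL_OVERHEAD)
  -  20 (IPv4 tunnel header)
  -  18 (ETHERNET_OVERHEAD: Ethernet header + CRC)
  ------
  1424

which is the value the new ovn.at test cases expect for the ipv4/geneve
combination (1404, 1432 and 1412 for ipv6/geneve, ipv4/vxlan and
ipv6/vxlan respectively).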

Among other scenarios, this patch helps keep existing services working
during live migration of a VM when multichassis ports are used (e.g. in
OpenStack Neutron).

Fixes: 7084cf437421 ("Always funnel multichassis port traffic through tunnels")

Conflicts:
      NEWS
      controller/physical.c
      lib/ovn-util.h

Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
Acked-by: Dumitru Ceara <dceara@redhat.com>
Signed-off-by: Mark Michelson <mmichels@redhat.com>
(cherry picked from commit c519c9551b4d1f48d7ee64abfad2ac266ea83380)
(cherry picked from commit ee8992a1bb556195ec11b73fee0d08806afa48f5)
(cherry picked from commit 9c59a98dd12468cccbb14ca67539825381986949)
---
 NEWS                   |   6 +
 controller/physical.c  | 262 ++++++++++++++++++++++++++++++++-
 include/ovn/actions.h  |   3 +
 lib/actions.c          |   4 +-
 lib/ovn-util.h         |   7 +
 northd/northd.c        |   2 +
 ovn-architecture.7.xml |   9 +-
 tests/ovn.at           | 321 +++++++++++++++++++++++++++++++++++++++++
 8 files changed, 605 insertions(+), 9 deletions(-)

Comments

0-day Robot May 31, 2023, 8:25 p.m. UTC | #1
Bleep bloop.  Greetings Ihar Hrachyshka, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Unexpected sign-offs from developers who are not authors or co-authors or committers: Mark Michelson <mmichels@redhat.com>
Lines checked: 833, Warnings: 1, Errors: 0


Please check this out.  If you feel there has been an error, please email aconole@redhat.com

Thanks,
0-day Robot

Patch

diff --git a/NEWS b/NEWS
index 5c65735f9..d5170786b 100644
--- a/NEWS
+++ b/NEWS
@@ -2,6 +2,12 @@  OVN v22.09.2 - xx xxx xxxx
 --------------------------
   - Always allow IPv6 Router Discovery, Neighbor Discovery, and Multicast
     Listener Discovery protocols, regardless of ACLs defined.
+  - Send ICMP Fragmentation Needed packets back to offending ports when
+    communicating with multichassis ports using frames that don't fit through a
+    tunnel. This is done only for logical switches that are attached to a
+    physical network via a localnet port, in which case multichassis ports may
+    have an effective MTU different from regular ports and hence may need this
+    mechanism to maintain connectivity with other peers in the network.
 
 OVN v22.09.1 - 20 Dec 2022
 --------------------------
diff --git a/controller/physical.c b/controller/physical.c
index 4697895b3..2dce238a8 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -41,6 +41,7 @@ 
 #include "lib/ovn-sb-idl.h"
 #include "lib/ovn-util.h"
 #include "ovn/actions.h"
+#include "if-status.h"
 #include "physical.h"
 #include "pinctrl.h"
 #include "openvswitch/shash.h"
@@ -91,6 +92,7 @@  physical_register_ovs_idl(struct ovsdb_idl *ovs_idl)
 
     ovsdb_idl_add_table(ovs_idl, &ovsrec_table_interface);
     ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_name);
+    ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_mtu);
     ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_ofport);
     ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_external_ids);
 }
@@ -1072,6 +1074,240 @@  setup_activation_strategy(const struct sbrec_port_binding *binding,
     }
 }
 
+/*
+ * Insert a flow to determine if an IP packet is too big for the corresponding
+ * egress interface.
+ */
+static void
+determine_if_pkt_too_big(struct ovn_desired_flow_table *flow_table,
+                         const struct sbrec_port_binding *binding,
+                         const struct sbrec_port_binding *mcp,
+                         uint16_t mtu, bool is_ipv6, int direction)
+{
+    struct ofpbuf ofpacts;
+    ofpbuf_init(&ofpacts, 0);
+
+    /* Store packet too large flag in reg9[1]. */
+    struct match match;
+    match_init_catchall(&match);
+    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 : ETH_TYPE_IP));
+    match_set_metadata(&match, htonll(binding->datapath->tunnel_key));
+    match_set_reg(&match, direction - MFF_REG0, mcp->tunnel_key);
+
+    /* reg9[1] is REGBIT_PKT_LARGER as defined by northd */
+    struct ofpact_check_pkt_larger *pkt_larger =
+        ofpact_put_CHECK_PKT_LARGER(&ofpacts);
+    pkt_larger->pkt_len = mtu;
+    pkt_larger->dst.field = mf_from_id(MFF_REG9);
+    pkt_larger->dst.ofs = 1;
+
+    put_resubmit(OFTABLE_OUTPUT_LARGE_PKT_PROCESS, &ofpacts);
+    ofctrl_add_flow(flow_table, OFTABLE_OUTPUT_LARGE_PKT_DETECT, 100,
+                    binding->header_.uuid.parts[0], &match, &ofpacts,
+                    &binding->header_.uuid);
+    ofpbuf_uninit(&ofpacts);
+}
+
+/*
+ * Insert a flow to reply with ICMP error for IP packets that are too big for
+ * the corresponding egress interface.
+ */
+/*
+ * NOTE(ihrachys) This reimplements icmp_error as found in
+ * build_icmperr_pkt_big_flows. We may look into reusing the existing OVN
+ * action for this flow in the future.
+ */
+static void
+reply_imcp_error_if_pkt_too_big(struct ovn_desired_flow_table *flow_table,
+                                const struct sbrec_port_binding *binding,
+                                const struct sbrec_port_binding *mcp,
+                                uint16_t mtu, bool is_ipv6, int direction)
+{
+    struct match match;
+    match_init_catchall(&match);
+    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 : ETH_TYPE_IP));
+    match_set_metadata(&match, htonll(binding->datapath->tunnel_key));
+    match_set_reg(&match, direction - MFF_REG0, mcp->tunnel_key);
+    match_set_reg_masked(&match, MFF_REG9 - MFF_REG0, 1 << 1, 1 << 1);
+
+    /* Return ICMP error with a part of the original IP packet included. */
+    struct ofpbuf ofpacts;
+    ofpbuf_init(&ofpacts, 0);
+    size_t oc_offset = encode_start_controller_op(
+        ACTION_OPCODE_ICMP, true, NX_CTLR_NO_METER, &ofpacts);
+
+    struct ofpbuf inner_ofpacts;
+    ofpbuf_init(&inner_ofpacts, 0);
+
+    /* The error packet is no longer too large, set REGBIT_PKT_LARGER = 0 */
+    /* reg9[1] is REGBIT_PKT_LARGER as defined by northd */
+    ovs_be32 value = htonl(0);
+    ovs_be32 mask = htonl(1 << 1);
+    ofpact_put_set_field(
+        &inner_ofpacts, mf_from_id(MFF_REG9), &value, &mask);
+
+    /* The new error packet is delivered locally */
+    /* REGBIT_EGRESS_LOOPBACK = 1 */
+    value = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
+    mask = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
+    ofpact_put_set_field(
+        &inner_ofpacts, mf_from_id(MFF_LOG_FLAGS), &value, &mask);
+
+    /* eth.src <-> eth.dst */
+    put_stack(MFF_ETH_DST, ofpact_put_STACK_PUSH(&inner_ofpacts));
+    put_stack(MFF_ETH_SRC, ofpact_put_STACK_PUSH(&inner_ofpacts));
+    put_stack(MFF_ETH_DST, ofpact_put_STACK_POP(&inner_ofpacts));
+    put_stack(MFF_ETH_SRC, ofpact_put_STACK_POP(&inner_ofpacts));
+
+    /* ip.src <-> ip.dst */
+    put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
+        ofpact_put_STACK_PUSH(&inner_ofpacts));
+    put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
+        ofpact_put_STACK_PUSH(&inner_ofpacts));
+    put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
+        ofpact_put_STACK_POP(&inner_ofpacts));
+    put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
+        ofpact_put_STACK_POP(&inner_ofpacts));
+
+    /* ip.ttl = 255 */
+    struct ofpact_ip_ttl *ip_ttl = ofpact_put_SET_IP_TTL(&inner_ofpacts);
+    ip_ttl->ttl = 255;
+
+    uint16_t frag_mtu = mtu - ETHERNET_OVERHEAD;
+    size_t frag_mtu_oc_offset;
+    if (is_ipv6) {
+        /* icmp6.type = 2 (Packet Too Big) */
+        /* icmp6.code = 0 */
+        uint8_t icmp_type = 2;
+        uint8_t icmp_code = 0;
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV6_TYPE), &icmp_type, NULL);
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV6_CODE), &icmp_code, NULL);
+
+        /* icmp6.frag_mtu */
+        frag_mtu_oc_offset = encode_start_controller_op(
+            ACTION_OPCODE_PUT_ICMP6_FRAG_MTU, true, NX_CTLR_NO_METER,
+            &inner_ofpacts);
+        ovs_be32 frag_mtu_ovs = htonl(frag_mtu);
+        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
+    } else {
+        /* icmp4.type = 3 (Destination Unreachable) */
+        /* icmp4.code = 4 (Fragmentation Needed) */
+        uint8_t icmp_type = 3;
+        uint8_t icmp_code = 4;
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV4_TYPE), &icmp_type, NULL);
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV4_CODE), &icmp_code, NULL);
+
+        /* icmp4.frag_mtu = */
+        frag_mtu_oc_offset = encode_start_controller_op(
+            ACTION_OPCODE_PUT_ICMP4_FRAG_MTU, true, NX_CTLR_NO_METER,
+            &inner_ofpacts);
+        ovs_be16 frag_mtu_ovs = htons(frag_mtu);
+        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
+    }
+    encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
+
+    /* Finally, submit the ICMP error back to the ingress pipeline */
+    put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &inner_ofpacts);
+
+    /* Attach nested actions to ICMP error controller handler */
+    ofpacts_put_openflow_actions(inner_ofpacts.data, inner_ofpacts.size,
+                                 &ofpacts, OFP15_VERSION);
+
+    /* Finalize the ICMP error controller handler */
+    encode_finish_controller_op(oc_offset, &ofpacts);
+
+    ofctrl_add_flow(flow_table, OFTABLE_OUTPUT_LARGE_PKT_PROCESS, 100,
+                    binding->header_.uuid.parts[0], &match, &ofpacts,
+                    &binding->header_.uuid);
+
+    ofpbuf_uninit(&inner_ofpacts);
+    ofpbuf_uninit(&ofpacts);
+}
+
+static uint16_t
+get_tunnel_overhead(struct chassis_tunnel const *tun)
+{
+    uint16_t overhead = 0;
+    enum chassis_tunnel_type type = tun->type;
+    if (type == GENEVE) {
+        overhead += GENEVE_TUNNEL_OVERHEAD;
+    } else if (type == STT) {
+        overhead += STT_TUNNEL_OVERHEAD;
+    } else if (type == VXLAN) {
+        overhead += VXLAN_TUNNEL_OVERHEAD;
+    } else {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+        VLOG_WARN_RL(&rl, "Unknown tunnel type %d, can't determine overhead "
+                          "size for Path MTU Discovery", type);
+        return 0;
+    }
+    overhead += tun->is_ipv6? IPV6_HEADER_LEN : IP_HEADER_LEN;
+    return overhead;
+}
+
+static uint16_t
+get_effective_mtu(const struct sbrec_port_binding *mcp,
+                  struct ovs_list *remote_tunnels,
+                  const struct if_status_mgr *if_mgr)
+{
+    /* Use interface MTU as a base for calculation */
+    uint16_t iface_mtu = if_status_mgr_iface_get_mtu(if_mgr,
+                                                     mcp->logical_port);
+    if (!iface_mtu) {
+        return 0;
+    }
+
+    /* Iterate over all peer tunnels and find the biggest tunnel overhead */
+    uint16_t overhead = 0;
+    struct tunnel *tun;
+    LIST_FOR_EACH (tun, list_node, remote_tunnels) {
+        overhead = MAX(overhead, get_tunnel_overhead(tun->tun));
+    }
+    if (!overhead) {
+        return 0;
+    }
+
+    return iface_mtu - overhead;
+}
+
+static void
+handle_pkt_too_big_for_ip_version(struct ovn_desired_flow_table *flow_table,
+                                  const struct sbrec_port_binding *binding,
+                                  const struct sbrec_port_binding *mcp,
+                                  uint16_t mtu, bool is_ipv6)
+{
+    /* ingress */
+    determine_if_pkt_too_big(flow_table, binding, mcp, mtu, is_ipv6,
+                             MFF_LOG_INPORT);
+    reply_imcp_error_if_pkt_too_big(flow_table, binding, mcp, mtu, is_ipv6,
+                                    MFF_LOG_INPORT);
+
+    /* egress */
+    determine_if_pkt_too_big(flow_table, binding, mcp, mtu, is_ipv6,
+                             MFF_LOG_OUTPORT);
+    reply_imcp_error_if_pkt_too_big(flow_table, binding, mcp, mtu, is_ipv6,
+                                    MFF_LOG_OUTPORT);
+}
+
+static void
+handle_pkt_too_big(struct ovn_desired_flow_table *flow_table,
+                   struct ovs_list *remote_tunnels,
+                   const struct sbrec_port_binding *binding,
+                   const struct sbrec_port_binding *mcp,
+                   const struct if_status_mgr *if_mgr)
+{
+    uint16_t mtu = get_effective_mtu(mcp, remote_tunnels, if_mgr);
+    if (!mtu) {
+        return;
+    }
+    handle_pkt_too_big_for_ip_version(flow_table, binding, mcp, mtu, false);
+    handle_pkt_too_big_for_ip_version(flow_table, binding, mcp, mtu, true);
+}
+
 static void
 enforce_tunneling_for_multichassis_ports(
     struct local_datapath *ld,
@@ -1079,7 +1315,8 @@  enforce_tunneling_for_multichassis_ports(
     const struct sbrec_chassis *chassis,
     const struct hmap *chassis_tunnels,
     enum mf_field_id mff_ovn_geneve,
-    struct ovn_desired_flow_table *flow_table)
+    struct ovn_desired_flow_table *flow_table,
+    const struct if_status_mgr *if_mgr)
 {
     if (shash_is_empty(&ld->multichassis_ports)) {
         return;
@@ -1124,6 +1361,8 @@  enforce_tunneling_for_multichassis_ports(
                         binding->header_.uuid.parts[0], &match, &ofpacts,
                         &binding->header_.uuid);
         ofpbuf_uninit(&ofpacts);
+
+        handle_pkt_too_big(flow_table, tuns, binding, mcp, if_mgr);
     }
 
     struct tunnel *tun_elem;
@@ -1144,6 +1383,7 @@  consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name,
                       const struct hmap *chassis_tunnels,
                       const struct sbrec_port_binding *binding,
                       const struct sbrec_chassis *chassis,
+                      const struct if_status_mgr *if_mgr,
                       struct ovn_desired_flow_table *flow_table,
                       struct ofpbuf *ofpacts_p)
 {
@@ -1567,8 +1807,10 @@  consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name,
                         binding->header_.uuid.parts[0],
                         &match, ofpacts_p, &binding->header_.uuid);
 
-        enforce_tunneling_for_multichassis_ports(
-            ld, binding, chassis, chassis_tunnels, mff_ovn_geneve, flow_table);
+        enforce_tunneling_for_multichassis_ports(ld, binding, chassis,
+                                                 chassis_tunnels,
+                                                 mff_ovn_geneve, flow_table,
+                                                 if_mgr);
 
         /* No more tunneling to set up. */
         goto out;
@@ -1872,7 +2114,8 @@  physical_eval_port_binding(struct physical_ctx *p_ctx,
                           p_ctx->local_bindings,
                           p_ctx->patch_ofports,
                           p_ctx->chassis_tunnels,
-                          pb, p_ctx->chassis, flow_table, &ofpacts);
+                          pb, p_ctx->chassis,
+                          p_ctx->if_mgr, flow_table, &ofpacts);
     ofpbuf_uninit(&ofpacts);
 }
 
@@ -1995,7 +2238,8 @@  physical_run(struct physical_ctx *p_ctx,
                               p_ctx->local_bindings,
                               p_ctx->patch_ofports,
                               p_ctx->chassis_tunnels, binding,
-                              p_ctx->chassis, flow_table, &ofpacts);
+                              p_ctx->chassis,
+                              p_ctx->if_mgr, flow_table, &ofpacts);
     }
 
     /* Handle output to multicast groups, in tables 40 and 41. */
@@ -2132,6 +2376,14 @@  physical_run(struct physical_ctx *p_ctx,
     ofctrl_add_flow(flow_table, OFTABLE_OUTPUT_LARGE_PKT_DETECT, 0, 0, &match,
                     &ofpacts, hc_uuid);
 
+    match_init_catchall(&match);
+    match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0,
+                         MLF_ALLOW_LOOPBACK, MLF_ALLOW_LOOPBACK);
+    ofpbuf_clear(&ofpacts);
+    put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
+    ofctrl_add_flow(flow_table, OFTABLE_OUTPUT_LARGE_PKT_PROCESS, 10, 0,
+                    &match, &ofpacts, hc_uuid);
+
     match_init_catchall(&match);
     ofpbuf_clear(&ofpacts);
     put_resubmit(OFTABLE_REMOTE_OUTPUT, &ofpacts);
diff --git a/include/ovn/actions.h b/include/ovn/actions.h
index ccb084356..937337300 100644
--- a/include/ovn/actions.h
+++ b/include/ovn/actions.h
@@ -849,6 +849,9 @@  void ovnacts_free(struct ovnact[], size_t ovnacts_len);
 char *ovnact_op_to_string(uint32_t);
 int encode_ra_dnssl_opt(char *data, char *buf, int buf_len);
 
+size_t encode_start_controller_op(enum action_opcode opcode, bool pause,
+                                  uint32_t meter_id, struct ofpbuf *ofpacts);
+void encode_finish_controller_op(size_t ofs, struct ofpbuf *ofpacts);
 void encode_controller_op(enum action_opcode opcode, uint32_t meter_id,
                           struct ofpbuf *ofpacts);
 
diff --git a/lib/actions.c b/lib/actions.c
index a04d649a7..fec8464a4 100644
--- a/lib/actions.c
+++ b/lib/actions.c
@@ -79,7 +79,7 @@  ovnact_init(struct ovnact *ovnact, enum ovnact_type type, size_t len)
     ovnact->len = len;
 }
 
-static size_t
+size_t
 encode_start_controller_op(enum action_opcode opcode, bool pause,
                            uint32_t meter_id, struct ofpbuf *ofpacts)
 {
@@ -100,7 +100,7 @@  encode_start_controller_op(enum action_opcode opcode, bool pause,
     return ofs;
 }
 
-static void
+void
 encode_finish_controller_op(size_t ofs, struct ofpbuf *ofpacts)
 {
     struct ofpact_controller *oc = ofpbuf_at_assert(ofpacts, ofs, sizeof *oc);
diff --git a/lib/ovn-util.h b/lib/ovn-util.h
index dd73e1938..36f7aec37 100644
--- a/lib/ovn-util.h
+++ b/lib/ovn-util.h
@@ -28,6 +28,13 @@ 
 #define ROUTE_ORIGIN_CONNECTED "connected"
 #define ROUTE_ORIGIN_STATIC "static"
 
+#define ETH_CRC_LENGTH 4
+#define ETHERNET_OVERHEAD (ETH_HEADER_LEN + ETH_CRC_LENGTH)
+
+#define GENEVE_TUNNEL_OVERHEAD 38
+#define STT_TUNNEL_OVERHEAD 18
+#define VXLAN_TUNNEL_OVERHEAD 30
+
 struct nbrec_logical_router_port;
 struct sbrec_logical_flow;
 struct svec;
diff --git a/northd/northd.c b/northd/northd.c
index 6a689dbc7..e8d7c8e99 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -224,6 +224,8 @@  enum ovn_stage {
  * one of the logical router's own IP addresses. */
 #define REGBIT_EGRESS_LOOPBACK  "reg9[0]"
 /* Register to store the result of check_pkt_larger action. */
+/* This register is also used by ovn-controller in
+ * OFTABLE_OUTPUT_LARGE_PKT_DETECT table, for a similar goal. */
 #define REGBIT_PKT_LARGER        "reg9[1]"
 #define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]"
 #define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]"
diff --git a/ovn-architecture.7.xml b/ovn-architecture.7.xml
index c56547e6f..6c979891d 100644
--- a/ovn-architecture.7.xml
+++ b/ovn-architecture.7.xml
@@ -1441,8 +1441,13 @@ 
       <p>
         OpenFlow tables 37 through 42 implement the <code>output</code> action
         in the logical ingress pipeline.  Specifically, table 37 serves as an
-        entry point to egress pipeline. Tables 38 and 39 are, for now,
-        placeholders for Path MTU Discovery implementation.
+        entry point to egress pipeline. Table 38 detects IP packets that are
+        too big for a corresponding interface. Table 39 produces ICMPv4
+        Fragmentation Needed (or ICMPv6 Too Big) errors and delivers them back
+        to the offending port. Table 40 handles packets to remote hypervisors,
+        table 41 handles packets to the local hypervisor, and table 42 checks
+        whether packets whose logical ingress and egress port are the same
+        should be discarded.
       </p>
 
       <p>
diff --git a/tests/ovn.at b/tests/ovn.at
index ac24904a6..74cbea5e5 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -15069,6 +15069,327 @@  OVN_CLEANUP([hv1],[hv2],[hv3])
 AT_CLEANUP
 ])
 
+m4_define([MULTICHASSIS_PATH_MTU_DISCOVERY_TEST],
+  [OVN_FOR_EACH_NORTHD([
+   AT_SETUP([localnet connectivity with multiple requested-chassis, path mtu discovery (ip=$1, tunnel=$2, mtu=$3)])
+   AT_KEYWORDS([multi-chassis])
+   AT_SKIP_IF([test $HAVE_SCAPY = no])
+
+   ovn_start
+
+   net_add n1
+   for i in 1 2; do
+       sim_add hv$i
+       as hv$i
+       check ovs-vsctl add-br br-phys
+       if test "x$1" = "xipv6"; then
+           ovn_attach n1 br-phys fd00::$i 64 $2
+       else
+           ovn_attach n1 br-phys 192.168.0.$i 24 $2
+       fi
+       check ovs-vsctl set open . external-ids:ovn-bridge-mappings=phys:br-phys
+   done
+
+   first_mac=00:00:00:00:00:01
+   second_mac=00:00:00:00:00:02
+   multi1_mac=00:00:00:00:00:f0
+   multi2_mac=00:00:00:00:00:f1
+   first_ip=10.0.0.1
+   second_ip=10.0.0.2
+   multi1_ip=10.0.0.10
+   multi2_ip=10.0.0.20
+   first_ip6=abcd::1
+   second_ip6=abcd::2
+   multi1_ip6=abcd::f0
+   multi2_ip6=abcd::f1
+
+   check ovn-nbctl ls-add ls0
+   check ovn-nbctl lsp-add ls0 first
+   check ovn-nbctl lsp-add ls0 second
+   check ovn-nbctl lsp-add ls0 multi1
+   check ovn-nbctl lsp-add ls0 multi2
+   check ovn-nbctl lsp-set-addresses first "${first_mac} ${first_ip} ${first_ip6}"
+   check ovn-nbctl lsp-set-addresses second "${second_mac} ${second_ip} ${second_ip6}"
+   check ovn-nbctl lsp-set-addresses multi1 "${multi1_mac} ${multi1_ip} ${multi1_ip6}"
+   check ovn-nbctl lsp-set-addresses multi2 "${multi2_mac} ${multi2_ip} ${multi2_ip6}"
+
+   check ovn-nbctl lsp-add ls0 public
+   check ovn-nbctl lsp-set-type public localnet
+   check ovn-nbctl lsp-set-addresses public unknown
+   check ovn-nbctl lsp-set-options public network_name=phys
+
+   check ovn-nbctl lsp-set-options first requested-chassis=hv1
+   check ovn-nbctl lsp-set-options second requested-chassis=hv2
+   check ovn-nbctl lsp-set-options multi1 requested-chassis=hv1,hv2
+   check ovn-nbctl lsp-set-options multi2 requested-chassis=hv1,hv2
+
+   as hv1 check ovs-vsctl -- add-port br-int first -- \
+       set Interface first external-ids:iface-id=first \
+       options:tx_pcap=hv1/first-tx.pcap \
+       options:rxq_pcap=hv1/first-rx.pcap \
+       ofport-request=1
+   as hv2 check ovs-vsctl -- add-port br-int second -- \
+       set Interface second external-ids:iface-id=second \
+       options:tx_pcap=hv2/second-tx.pcap \
+       options:rxq_pcap=hv2/second-rx.pcap \
+       ofport-request=2
+
+   # Create interfaces for multichassis ports on both hv1 and hv2
+   for hv in hv1 hv2; do
+       for i in 1 2; do
+           as $hv check ovs-vsctl -- add-port br-int multi${i} -- \
+               set Interface multi${i} external-ids:iface-id=multi${i} \
+               options:tx_pcap=$hv/multi${i}-tx.pcap \
+               options:rxq_pcap=$hv/multi${i}-rx.pcap \
+               ofport-request=${i}00
+       done
+   done
+
+   send_ip_packet() {
+       local inport=${1} hv=${2} eth_src=${3} eth_dst=${4} ipv4_src=${5} ipv4_dst=${6} data=${7} fail=${8} mtu=${9:-$3}
+       packet=$(fmt_pkt "
+           Ether(dst='${eth_dst}', src='${eth_src}') /
+           IP(src='${ipv4_src}', dst='${ipv4_dst}') /
+           ICMP(type=8) / bytes.fromhex('${data}')
+       ")
+       as hv${hv} ovs-appctl netdev-dummy/receive ${inport} ${packet}
+       if [[ x"${fail}" != x0 ]]; then
+         original_ip_frame=$(fmt_pkt "
+           IP(src='${ipv4_src}', dst='${ipv4_dst}') /
+           ICMP(type=8) / bytes.fromhex('${data}')
+         ")
+         # IP(flags=2) means DF (Don't Fragment) = 1
+         # ICMP(type=3, code=4) means Destination Unreachable, Fragmentation Needed
+         packet=$(fmt_pkt "
+             Ether(dst='${eth_src}', src='${eth_dst}') /
+             IP(src='${ipv4_dst}', dst='${ipv4_src}', ttl=255, flags=2, id=0) /
+             ICMP(type=3, code=4, nexthopmtu=${mtu}) /
+             bytes.fromhex('${original_ip_frame:0:$((534 * 2))}')
+         ")
+       fi
+       echo ${packet}
+   }
+
+   send_ip6_packet() {
+       local inport=${1} hv=${2} eth_src=${3} eth_dst=${4} ipv6_src=${5} ipv6_dst=${6} data=${7} fail=${8} mtu=${9:-$3}
+       packet=$(fmt_pkt "
+           Ether(dst='${eth_dst}', src='${eth_src}') /
+           IPv6(src='${ipv6_src}', dst='${ipv6_dst}') /
+           ICMPv6EchoRequest() / bytes.fromhex('${data}')
+       ")
+       as hv${hv} ovs-appctl netdev-dummy/receive ${inport} ${packet}
+       if [[ x"${fail}" != x0 ]]; then
+         original_ip_frame=$(fmt_pkt "
+           IPv6(src='${ipv6_src}', dst='${ipv6_dst}') /
+           ICMPv6EchoRequest() / bytes.fromhex('${data}')
+         ")
+         packet=$(fmt_pkt "
+             Ether(dst='${eth_src}', src='${eth_dst}') /
+             IPv6(src='${ipv6_dst}', dst='${ipv6_src}', hlim=255) /
+             ICMPv6PacketTooBig(mtu=${mtu}) /
+             bytes.fromhex('${original_ip_frame:0:$((1218 * 2))}')
+         ")
+       fi
+       echo ${packet}
+   }
+
+   reset_env() {
+       for port in first multi1 multi2; do
+           as hv1 reset_pcap_file $port hv1/$port
+       done
+       for port in second multi1 multi2; do
+           as hv2 reset_pcap_file $port hv2/$port
+       done
+       for port in hv1/multi1 hv2/multi1 hv1/multi2 hv2/multi2 hv1/first hv2/second; do
+           : > $port.expected
+       done
+   }
+
+   check_pkts() {
+       for port in hv1/multi1 hv2/multi1 hv1/multi2 hv2/multi2 hv1/first hv2/second; do
+           OVN_CHECK_PACKETS_REMOVE_BROADCAST([${port}-tx.pcap], [${port}.expected])
+       done
+   }
+
+   payload() {
+       echo $(cat /dev/urandom | tr -cd 'a-f0-9' | head -c ${1})
+   }
+
+   wait_for_ports_up
+   OVN_POPULATE_ARP
+
+   reset_env
+
+   AS_BOX([Packets of proper size are delivered from multichassis to regular ports])
+
+   len=1000
+   packet=$(send_ip_packet multi1 1 $multi1_mac $first_mac $multi1_ip $first_ip $(payload $len) 0)
+   echo $packet >> hv1/first.expected
+
+   packet=$(send_ip_packet multi1 1 $multi1_mac $second_mac $multi1_ip $second_ip $(payload $len) 0)
+   echo $packet >> hv2/second.expected
+
+   packet=$(send_ip6_packet multi1 1 $multi1_mac $first_mac $multi1_ip6 $first_ip6 $(payload $len) 0)
+   echo $packet >> hv1/first.expected
+
+   packet=$(send_ip6_packet multi1 1 $multi1_mac $second_mac $multi1_ip6 $second_ip6 $(payload $len) 0)
+   echo $packet >> hv2/second.expected
+
+   check_pkts
+   reset_env
+
+   AS_BOX([Oversized packets are not delivered from multichassis to regular ports])
+
+   len=3000
+   packet=$(send_ip_packet multi1 1 $multi1_mac $first_mac $multi1_ip $first_ip $(payload $len) 1)
+   echo $packet >> hv1/multi1.expected
+
+   packet=$(send_ip_packet multi1 1 $multi1_mac $second_mac $multi1_ip $second_ip $(payload $len) 1)
+   echo $packet >> hv1/multi1.expected
+
+   packet=$(send_ip6_packet multi1 1 $multi1_mac $first_mac $multi1_ip6 $first_ip6 $(payload $len) 1)
+   echo $packet >> hv1/multi1.expected
+
+   packet=$(send_ip6_packet multi1 1 $multi1_mac $second_mac $multi1_ip6 $second_ip6 $(payload $len) 1)
+   echo $packet >> hv1/multi1.expected
+
+   check_pkts
+   reset_env
+
+   AS_BOX([Packets of proper size are delivered from regular to multichassis ports])
+
+   len=1000
+   packet=$(send_ip_packet first 1 $first_mac $multi1_mac $first_ip $multi1_ip $(payload $len) 0)
+   echo $packet >> hv1/multi1.expected
+   echo $packet >> hv2/multi1.expected
+
+   packet=$(send_ip_packet second 2 $second_mac $multi1_mac $second_ip $multi1_ip $(payload $len) 0)
+   echo $packet >> hv1/multi1.expected
+   echo $packet >> hv2/multi1.expected
+
+   packet=$(send_ip6_packet first 1 $first_mac $multi1_mac $first_ip6 $multi1_ip6 $(payload $len) 0)
+   echo $packet >> hv1/multi1.expected
+   echo $packet >> hv2/multi1.expected
+
+   packet=$(send_ip6_packet second 2 $second_mac $multi1_mac $second_ip6 $multi1_ip6 $(payload $len) 0)
+   echo $packet >> hv1/multi1.expected
+   echo $packet >> hv2/multi1.expected
+
+   check_pkts
+   reset_env
+
+   AS_BOX([Oversized packets are not delivered from regular to multichassis ports])
+
+   len=3000
+   packet=$(send_ip_packet first 1 $first_mac $multi1_mac $first_ip $multi1_ip $(payload $len) 1)
+   echo $packet >> hv1/first.expected
+
+   packet=$(send_ip_packet second 2 $second_mac $multi1_mac $second_ip $multi1_ip $(payload $len) 1)
+   echo $packet >> hv2/second.expected
+
+   packet=$(send_ip6_packet first 1 $first_mac $multi1_mac $first_ip6 $multi1_ip6 $(payload $len) 1)
+   echo $packet >> hv1/first.expected
+
+   packet=$(send_ip6_packet second 2 $second_mac $multi1_mac $second_ip6 $multi1_ip6 $(payload $len) 1)
+   echo $packet >> hv2/second.expected
+
+   check_pkts
+   reset_env
+
+   AS_BOX([Packets of proper size are delivered from multichassis to multichassis ports])
+
+   len=1000
+   packet=$(send_ip_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip $multi2_ip $(payload $len) 0)
+   echo $packet >> hv1/multi2.expected
+   echo $packet >> hv2/multi2.expected
+
+   packet=$(send_ip6_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip6 $multi2_ip6 $(payload $len) 0)
+   echo $packet >> hv1/multi2.expected
+   echo $packet >> hv2/multi2.expected
+
+   check_pkts
+   reset_env
+
+   AS_BOX([Oversized packets are not delivered from multichassis to multichassis ports])
+
+   len=3000
+   packet=$(send_ip_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip $multi2_ip $(payload $len) 1)
+   echo $packet >> hv1/multi1.expected
+
+   packet=$(send_ip6_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip6 $multi2_ip6 $(payload $len) 1)
+   echo $packet >> hv1/multi1.expected
+
+   check_pkts
+   reset_env
+
+   AS_BOX([MTU updates are honored in ICMP Path MTU calculation])
+
+   set_mtu() {
+       local hv=${1} iface=${2} new_mtu=${3}
+
+       iface_uuid=$(as ${hv} ovs-vsctl --bare --columns _uuid find Interface name=${iface})
+       check as ${hv} ovs-vsctl set interface ${iface_uuid} mtu_request=${new_mtu}
+   }
+
+   set_mtu_for_all_ports() {
+       for port in multi1 multi2 first; do
+           set_mtu hv1 ${port} ${1}
+       done
+       for port in multi1 multi2 second; do
+           set_mtu hv2 ${port} ${1}
+       done
+   }
+
+   initial_mtu=1500  # all interfaces are 1500 by default
+   new_mtu=1400
+   set_mtu_for_all_ports ${new_mtu}
+   mtu_diff=$((${initial_mtu} - ${new_mtu}))
+
+   len=3000
+   expected_ip_mtu=$(($3 - ${mtu_diff}))
+   packet=$(send_ip_packet first 1 $first_mac $multi1_mac $first_ip $multi1_ip $(payload $len) 1 ${expected_ip_mtu})
+   echo $packet >> hv1/first.expected
+
+   packet=$(send_ip_packet second 2 $second_mac $multi1_mac $second_ip $multi1_ip $(payload $len) 1 ${expected_ip_mtu})
+   echo $packet >> hv2/second.expected
+
+   packet=$(send_ip6_packet first 1 $first_mac $multi1_mac $first_ip6 $multi1_ip6 $(payload $len) 1 ${expected_ip_mtu})
+   echo $packet >> hv1/first.expected
+
+   packet=$(send_ip6_packet second 2 $second_mac $multi1_mac $second_ip6 $multi1_ip6 $(payload $len) 1 ${expected_ip_mtu})
+   echo $packet >> hv2/second.expected
+
+   packet=$(send_ip_packet multi1 1 $multi1_mac $first_mac $multi1_ip $first_ip $(payload $len) 1 ${expected_ip_mtu})
+   echo $packet >> hv1/multi1.expected
+
+   packet=$(send_ip_packet multi1 1 $multi1_mac $second_mac $multi1_ip $second_ip $(payload $len) 1 ${expected_ip_mtu})
+   echo $packet >> hv1/multi1.expected
+
+   packet=$(send_ip6_packet multi1 1 $multi1_mac $first_mac $multi1_ip6 $first_ip6 $(payload $len) 1 ${expected_ip_mtu})
+   echo $packet >> hv1/multi1.expected
+
+   packet=$(send_ip6_packet multi1 1 $multi1_mac $second_mac $multi1_ip6 $second_ip6 $(payload $len) 1 ${expected_ip_mtu})
+   echo $packet >> hv1/multi1.expected
+
+   packet=$(send_ip_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip $multi2_ip $(payload $len) 1 ${expected_ip_mtu})
+   echo $packet >> hv1/multi1.expected
+
+   packet=$(send_ip6_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip6 $multi2_ip6 $(payload $len) 1 ${expected_ip_mtu})
+   echo $packet >> hv1/multi1.expected
+
+   check_pkts
+
+   OVN_CLEANUP([hv1],[hv2])
+
+   AT_CLEANUP
+   ])])
+
+# NOTE(ihar) no STT variants because it's not supported by upstream kernels
+MULTICHASSIS_PATH_MTU_DISCOVERY_TEST([ipv4], [geneve], [1424])
+MULTICHASSIS_PATH_MTU_DISCOVERY_TEST([ipv6], [geneve], [1404])
+MULTICHASSIS_PATH_MTU_DISCOVERY_TEST([ipv4], [vxlan], [1432])
+MULTICHASSIS_PATH_MTU_DISCOVERY_TEST([ipv6], [vxlan], [1412])
+
 OVN_FOR_EACH_NORTHD([
 AT_SETUP([options:activation-strategy for logical port])
 AT_KEYWORDS([multi-chassis])