@@ -2,6 +2,12 @@ OVN v22.09.2 - xx xxx xxxx
--------------------------
- Always allow IPv6 Router Discovery, Neighbor Discovery, and Multicast
Listener Discovery protocols, regardless of ACLs defined.
+ - Send ICMP Fragmentation Needed packets back to offending ports when
+ communicating with multichassis ports using frames that don't fit through a
+ tunnel. This is done only for logical switches that are attached to a
+ physical network via a localnet port, in which case multichassis ports may
+ have an effective MTU different from regular ports and hence may need this
+ mechanism to maintain connectivity with other peers in the network.
OVN v22.09.1 - 20 Dec 2022
--------------------------
@@ -41,6 +41,7 @@
#include "lib/ovn-sb-idl.h"
#include "lib/ovn-util.h"
#include "ovn/actions.h"
+#include "if-status.h"
#include "physical.h"
#include "pinctrl.h"
#include "openvswitch/shash.h"
@@ -91,6 +92,7 @@ physical_register_ovs_idl(struct ovsdb_idl *ovs_idl)
ovsdb_idl_add_table(ovs_idl, &ovsrec_table_interface);
ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_name);
+ ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_mtu);
ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_ofport);
ovsdb_idl_track_add_column(ovs_idl, &ovsrec_interface_col_external_ids);
}
@@ -1072,6 +1074,240 @@ setup_activation_strategy(const struct sbrec_port_binding *binding,
}
}
+/*
+ * Insert a flow to determine if an IP packet is too big for the corresponding
+ * egress interface.
+ */
+static void
+determine_if_pkt_too_big(struct ovn_desired_flow_table *flow_table,
+ const struct sbrec_port_binding *binding,
+ const struct sbrec_port_binding *mcp,
+ uint16_t mtu, bool is_ipv6, int direction)
+{
+ struct ofpbuf ofpacts;
+ ofpbuf_init(&ofpacts, 0);
+
+ /* Store packet too large flag in reg9[1]. */
+ struct match match;
+ match_init_catchall(&match);
+ match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 : ETH_TYPE_IP));
+ match_set_metadata(&match, htonll(binding->datapath->tunnel_key));
+ match_set_reg(&match, direction - MFF_REG0, mcp->tunnel_key);
+
+ /* reg9[1] is REGBIT_PKT_LARGER as defined by northd */
+ struct ofpact_check_pkt_larger *pkt_larger =
+ ofpact_put_CHECK_PKT_LARGER(&ofpacts);
+ pkt_larger->pkt_len = mtu;
+ pkt_larger->dst.field = mf_from_id(MFF_REG9);
+ pkt_larger->dst.ofs = 1;
+
+ put_resubmit(OFTABLE_OUTPUT_LARGE_PKT_PROCESS, &ofpacts);
+ ofctrl_add_flow(flow_table, OFTABLE_OUTPUT_LARGE_PKT_DETECT, 100,
+ binding->header_.uuid.parts[0], &match, &ofpacts,
+ &binding->header_.uuid);
+ ofpbuf_uninit(&ofpacts);
+}
+
+/*
+ * Insert a flow to reply with ICMP error for IP packets that are too big for
+ * the corresponding egress interface.
+ */
+/*
+ * NOTE(ihrachys) This reimplements icmp_error as found in
+ * build_icmperr_pkt_big_flows. We may look into reusing the existing OVN
+ * action for this flow in the future.
+ */
+static void
+reply_imcp_error_if_pkt_too_big(struct ovn_desired_flow_table *flow_table,
+ const struct sbrec_port_binding *binding,
+ const struct sbrec_port_binding *mcp,
+ uint16_t mtu, bool is_ipv6, int direction)
+{
+ struct match match;
+ match_init_catchall(&match);
+ match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 : ETH_TYPE_IP));
+ match_set_metadata(&match, htonll(binding->datapath->tunnel_key));
+ match_set_reg(&match, direction - MFF_REG0, mcp->tunnel_key);
+ match_set_reg_masked(&match, MFF_REG9 - MFF_REG0, 1 << 1, 1 << 1);
+
+ /* Return ICMP error with a part of the original IP packet included. */
+ struct ofpbuf ofpacts;
+ ofpbuf_init(&ofpacts, 0);
+ size_t oc_offset = encode_start_controller_op(
+ ACTION_OPCODE_ICMP, true, NX_CTLR_NO_METER, &ofpacts);
+
+ struct ofpbuf inner_ofpacts;
+ ofpbuf_init(&inner_ofpacts, 0);
+
+ /* The error packet is no longer too large, set REGBIT_PKT_LARGER = 0 */
+ /* reg9[1] is REGBIT_PKT_LARGER as defined by northd */
+ ovs_be32 value = htonl(0);
+ ovs_be32 mask = htonl(1 << 1);
+ ofpact_put_set_field(
+ &inner_ofpacts, mf_from_id(MFF_REG9), &value, &mask);
+
+ /* The new error packet is delivered locally */
+ /* REGBIT_EGRESS_LOOPBACK = 1 */
+ value = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
+ mask = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
+ ofpact_put_set_field(
+ &inner_ofpacts, mf_from_id(MFF_LOG_FLAGS), &value, &mask);
+
+ /* eth.src <-> eth.dst */
+ put_stack(MFF_ETH_DST, ofpact_put_STACK_PUSH(&inner_ofpacts));
+ put_stack(MFF_ETH_SRC, ofpact_put_STACK_PUSH(&inner_ofpacts));
+ put_stack(MFF_ETH_DST, ofpact_put_STACK_POP(&inner_ofpacts));
+ put_stack(MFF_ETH_SRC, ofpact_put_STACK_POP(&inner_ofpacts));
+
+ /* ip.src <-> ip.dst */
+ put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
+ ofpact_put_STACK_PUSH(&inner_ofpacts));
+ put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
+ ofpact_put_STACK_PUSH(&inner_ofpacts));
+ put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
+ ofpact_put_STACK_POP(&inner_ofpacts));
+ put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
+ ofpact_put_STACK_POP(&inner_ofpacts));
+
+ /* ip.ttl = 255 */
+ struct ofpact_ip_ttl *ip_ttl = ofpact_put_SET_IP_TTL(&inner_ofpacts);
+ ip_ttl->ttl = 255;
+
+ uint16_t frag_mtu = mtu - ETHERNET_OVERHEAD;
+ size_t frag_mtu_oc_offset;
+ if (is_ipv6) {
+ /* icmp6.type = 2 (Packet Too Big) */
+ /* icmp6.code = 0 */
+ uint8_t icmp_type = 2;
+ uint8_t icmp_code = 0;
+ ofpact_put_set_field(
+ &inner_ofpacts, mf_from_id(MFF_ICMPV6_TYPE), &icmp_type, NULL);
+ ofpact_put_set_field(
+ &inner_ofpacts, mf_from_id(MFF_ICMPV6_CODE), &icmp_code, NULL);
+
+ /* icmp6.frag_mtu */
+ frag_mtu_oc_offset = encode_start_controller_op(
+ ACTION_OPCODE_PUT_ICMP6_FRAG_MTU, true, NX_CTLR_NO_METER,
+ &inner_ofpacts);
+ ovs_be32 frag_mtu_ovs = htonl(frag_mtu);
+ ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
+ } else {
+ /* icmp4.type = 3 (Destination Unreachable) */
+ /* icmp4.code = 4 (Fragmentation Needed) */
+ uint8_t icmp_type = 3;
+ uint8_t icmp_code = 4;
+ ofpact_put_set_field(
+ &inner_ofpacts, mf_from_id(MFF_ICMPV4_TYPE), &icmp_type, NULL);
+ ofpact_put_set_field(
+ &inner_ofpacts, mf_from_id(MFF_ICMPV4_CODE), &icmp_code, NULL);
+
+        /* icmp4.frag_mtu */
+ frag_mtu_oc_offset = encode_start_controller_op(
+ ACTION_OPCODE_PUT_ICMP4_FRAG_MTU, true, NX_CTLR_NO_METER,
+ &inner_ofpacts);
+ ovs_be16 frag_mtu_ovs = htons(frag_mtu);
+ ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
+ }
+ encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
+
+ /* Finally, submit the ICMP error back to the ingress pipeline */
+ put_resubmit(OFTABLE_LOG_INGRESS_PIPELINE, &inner_ofpacts);
+
+ /* Attach nested actions to ICMP error controller handler */
+ ofpacts_put_openflow_actions(inner_ofpacts.data, inner_ofpacts.size,
+ &ofpacts, OFP15_VERSION);
+
+ /* Finalize the ICMP error controller handler */
+ encode_finish_controller_op(oc_offset, &ofpacts);
+
+ ofctrl_add_flow(flow_table, OFTABLE_OUTPUT_LARGE_PKT_PROCESS, 100,
+ binding->header_.uuid.parts[0], &match, &ofpacts,
+ &binding->header_.uuid);
+
+ ofpbuf_uninit(&inner_ofpacts);
+ ofpbuf_uninit(&ofpacts);
+}
+
+static uint16_t
+get_tunnel_overhead(struct chassis_tunnel const *tun)
+{
+ uint16_t overhead = 0;
+ enum chassis_tunnel_type type = tun->type;
+ if (type == GENEVE) {
+ overhead += GENEVE_TUNNEL_OVERHEAD;
+ } else if (type == STT) {
+ overhead += STT_TUNNEL_OVERHEAD;
+ } else if (type == VXLAN) {
+ overhead += VXLAN_TUNNEL_OVERHEAD;
+ } else {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+ VLOG_WARN_RL(&rl, "Unknown tunnel type %d, can't determine overhead "
+ "size for Path MTU Discovery", type);
+ return 0;
+ }
+ overhead += tun->is_ipv6? IPV6_HEADER_LEN : IP_HEADER_LEN;
+ return overhead;
+}
+
+static uint16_t
+get_effective_mtu(const struct sbrec_port_binding *mcp,
+ struct ovs_list *remote_tunnels,
+ const struct if_status_mgr *if_mgr)
+{
+ /* Use interface MTU as a base for calculation */
+ uint16_t iface_mtu = if_status_mgr_iface_get_mtu(if_mgr,
+ mcp->logical_port);
+ if (!iface_mtu) {
+ return 0;
+ }
+
+ /* Iterate over all peer tunnels and find the biggest tunnel overhead */
+ uint16_t overhead = 0;
+ struct tunnel *tun;
+ LIST_FOR_EACH (tun, list_node, remote_tunnels) {
+ overhead = MAX(overhead, get_tunnel_overhead(tun->tun));
+ }
+ if (!overhead) {
+ return 0;
+ }
+
+ return iface_mtu - overhead;
+}
+
+static void
+handle_pkt_too_big_for_ip_version(struct ovn_desired_flow_table *flow_table,
+ const struct sbrec_port_binding *binding,
+ const struct sbrec_port_binding *mcp,
+ uint16_t mtu, bool is_ipv6)
+{
+ /* ingress */
+ determine_if_pkt_too_big(flow_table, binding, mcp, mtu, is_ipv6,
+ MFF_LOG_INPORT);
+ reply_imcp_error_if_pkt_too_big(flow_table, binding, mcp, mtu, is_ipv6,
+ MFF_LOG_INPORT);
+
+ /* egress */
+ determine_if_pkt_too_big(flow_table, binding, mcp, mtu, is_ipv6,
+ MFF_LOG_OUTPORT);
+ reply_imcp_error_if_pkt_too_big(flow_table, binding, mcp, mtu, is_ipv6,
+ MFF_LOG_OUTPORT);
+}
+
+static void
+handle_pkt_too_big(struct ovn_desired_flow_table *flow_table,
+ struct ovs_list *remote_tunnels,
+ const struct sbrec_port_binding *binding,
+ const struct sbrec_port_binding *mcp,
+ const struct if_status_mgr *if_mgr)
+{
+ uint16_t mtu = get_effective_mtu(mcp, remote_tunnels, if_mgr);
+ if (!mtu) {
+ return;
+ }
+ handle_pkt_too_big_for_ip_version(flow_table, binding, mcp, mtu, false);
+ handle_pkt_too_big_for_ip_version(flow_table, binding, mcp, mtu, true);
+}
+
static void
enforce_tunneling_for_multichassis_ports(
struct local_datapath *ld,
@@ -1079,7 +1315,8 @@ enforce_tunneling_for_multichassis_ports(
const struct sbrec_chassis *chassis,
const struct hmap *chassis_tunnels,
enum mf_field_id mff_ovn_geneve,
- struct ovn_desired_flow_table *flow_table)
+ struct ovn_desired_flow_table *flow_table,
+ const struct if_status_mgr *if_mgr)
{
if (shash_is_empty(&ld->multichassis_ports)) {
return;
@@ -1124,6 +1361,8 @@ enforce_tunneling_for_multichassis_ports(
binding->header_.uuid.parts[0], &match, &ofpacts,
&binding->header_.uuid);
ofpbuf_uninit(&ofpacts);
+
+ handle_pkt_too_big(flow_table, tuns, binding, mcp, if_mgr);
}
struct tunnel *tun_elem;
@@ -1144,6 +1383,7 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name,
const struct hmap *chassis_tunnels,
const struct sbrec_port_binding *binding,
const struct sbrec_chassis *chassis,
+ const struct if_status_mgr *if_mgr,
struct ovn_desired_flow_table *flow_table,
struct ofpbuf *ofpacts_p)
{
@@ -1567,8 +1807,10 @@ consider_port_binding(struct ovsdb_idl_index *sbrec_port_binding_by_name,
binding->header_.uuid.parts[0],
&match, ofpacts_p, &binding->header_.uuid);
- enforce_tunneling_for_multichassis_ports(
- ld, binding, chassis, chassis_tunnels, mff_ovn_geneve, flow_table);
+ enforce_tunneling_for_multichassis_ports(ld, binding, chassis,
+ chassis_tunnels,
+ mff_ovn_geneve, flow_table,
+ if_mgr);
/* No more tunneling to set up. */
goto out;
@@ -1872,7 +2114,8 @@ physical_eval_port_binding(struct physical_ctx *p_ctx,
p_ctx->local_bindings,
p_ctx->patch_ofports,
p_ctx->chassis_tunnels,
- pb, p_ctx->chassis, flow_table, &ofpacts);
+ pb, p_ctx->chassis,
+ p_ctx->if_mgr, flow_table, &ofpacts);
ofpbuf_uninit(&ofpacts);
}
@@ -1995,7 +2238,8 @@ physical_run(struct physical_ctx *p_ctx,
p_ctx->local_bindings,
p_ctx->patch_ofports,
p_ctx->chassis_tunnels, binding,
- p_ctx->chassis, flow_table, &ofpacts);
+ p_ctx->chassis,
+ p_ctx->if_mgr, flow_table, &ofpacts);
}
/* Handle output to multicast groups, in tables 40 and 41. */
@@ -2132,6 +2376,14 @@ physical_run(struct physical_ctx *p_ctx,
ofctrl_add_flow(flow_table, OFTABLE_OUTPUT_LARGE_PKT_DETECT, 0, 0, &match,
&ofpacts, hc_uuid);
+ match_init_catchall(&match);
+ match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0,
+ MLF_ALLOW_LOOPBACK, MLF_ALLOW_LOOPBACK);
+ ofpbuf_clear(&ofpacts);
+ put_resubmit(OFTABLE_LOCAL_OUTPUT, &ofpacts);
+ ofctrl_add_flow(flow_table, OFTABLE_OUTPUT_LARGE_PKT_PROCESS, 10, 0,
+ &match, &ofpacts, hc_uuid);
+
match_init_catchall(&match);
ofpbuf_clear(&ofpacts);
put_resubmit(OFTABLE_REMOTE_OUTPUT, &ofpacts);
@@ -849,6 +849,9 @@ void ovnacts_free(struct ovnact[], size_t ovnacts_len);
char *ovnact_op_to_string(uint32_t);
int encode_ra_dnssl_opt(char *data, char *buf, int buf_len);
+size_t encode_start_controller_op(enum action_opcode opcode, bool pause,
+ uint32_t meter_id, struct ofpbuf *ofpacts);
+void encode_finish_controller_op(size_t ofs, struct ofpbuf *ofpacts);
void encode_controller_op(enum action_opcode opcode, uint32_t meter_id,
struct ofpbuf *ofpacts);
@@ -79,7 +79,7 @@ ovnact_init(struct ovnact *ovnact, enum ovnact_type type, size_t len)
ovnact->len = len;
}
-static size_t
+size_t
encode_start_controller_op(enum action_opcode opcode, bool pause,
uint32_t meter_id, struct ofpbuf *ofpacts)
{
@@ -100,7 +100,7 @@ encode_start_controller_op(enum action_opcode opcode, bool pause,
return ofs;
}
-static void
+void
encode_finish_controller_op(size_t ofs, struct ofpbuf *ofpacts)
{
struct ofpact_controller *oc = ofpbuf_at_assert(ofpacts, ofs, sizeof *oc);
@@ -28,6 +28,13 @@
#define ROUTE_ORIGIN_CONNECTED "connected"
#define ROUTE_ORIGIN_STATIC "static"
+#define ETH_CRC_LENGTH 4
+#define ETHERNET_OVERHEAD (ETH_HEADER_LEN + ETH_CRC_LENGTH)
+
+#define GENEVE_TUNNEL_OVERHEAD 38
+#define STT_TUNNEL_OVERHEAD 18
+#define VXLAN_TUNNEL_OVERHEAD 30
+
struct nbrec_logical_router_port;
struct sbrec_logical_flow;
struct svec;
@@ -224,6 +224,8 @@ enum ovn_stage {
* one of the logical router's own IP addresses. */
#define REGBIT_EGRESS_LOOPBACK "reg9[0]"
/* Register to store the result of check_pkt_larger action. */
+/* This register is also used by ovn-controller in
+ * OFTABLE_OUTPUT_LARGE_PKT_DETECT table, for a similar goal. */
#define REGBIT_PKT_LARGER "reg9[1]"
#define REGBIT_LOOKUP_NEIGHBOR_RESULT "reg9[2]"
#define REGBIT_LOOKUP_NEIGHBOR_IP_RESULT "reg9[3]"
@@ -1441,8 +1441,13 @@
<p>
OpenFlow tables 37 through 42 implement the <code>output</code> action
in the logical ingress pipeline. Specifically, table 37 serves as an
- entry point to egress pipeline. Tables 38 and 39 are, for now,
- placeholders for Path MTU Discovery implementation.
+      entry point to egress pipeline. Table 38 detects IP packets that are
+      too big for a corresponding interface. Table 39 produces ICMPv4
+      Fragmentation Needed (or ICMPv6 Too Big) errors and delivers them back
+      to the offending port. Table 40 handles packets to remote hypervisors,
+      table 41 handles packets to the local hypervisor, and table 42 checks
+      whether packets whose logical ingress and egress port are the same
+      should be discarded.
</p>
<p>
@@ -15069,6 +15069,327 @@ OVN_CLEANUP([hv1],[hv2],[hv3])
AT_CLEANUP
])
+m4_define([MULTICHASSIS_PATH_MTU_DISCOVERY_TEST],
+ [OVN_FOR_EACH_NORTHD([
+ AT_SETUP([localnet connectivity with multiple requested-chassis, path mtu discovery (ip=$1, tunnel=$2, mtu=$3)])
+ AT_KEYWORDS([multi-chassis])
+ AT_SKIP_IF([test $HAVE_SCAPY = no])
+
+ ovn_start
+
+ net_add n1
+ for i in 1 2; do
+ sim_add hv$i
+ as hv$i
+ check ovs-vsctl add-br br-phys
+ if test "x$1" = "xipv6"; then
+ ovn_attach n1 br-phys fd00::$i 64 $2
+ else
+ ovn_attach n1 br-phys 192.168.0.$i 24 $2
+ fi
+ check ovs-vsctl set open . external-ids:ovn-bridge-mappings=phys:br-phys
+ done
+
+ first_mac=00:00:00:00:00:01
+ second_mac=00:00:00:00:00:02
+ multi1_mac=00:00:00:00:00:f0
+ multi2_mac=00:00:00:00:00:f1
+ first_ip=10.0.0.1
+ second_ip=10.0.0.2
+ multi1_ip=10.0.0.10
+ multi2_ip=10.0.0.20
+ first_ip6=abcd::1
+ second_ip6=abcd::2
+ multi1_ip6=abcd::f0
+ multi2_ip6=abcd::f1
+
+ check ovn-nbctl ls-add ls0
+ check ovn-nbctl lsp-add ls0 first
+ check ovn-nbctl lsp-add ls0 second
+ check ovn-nbctl lsp-add ls0 multi1
+ check ovn-nbctl lsp-add ls0 multi2
+ check ovn-nbctl lsp-set-addresses first "${first_mac} ${first_ip} ${first_ip6}"
+ check ovn-nbctl lsp-set-addresses second "${second_mac} ${second_ip} ${second_ip6}"
+ check ovn-nbctl lsp-set-addresses multi1 "${multi1_mac} ${multi1_ip} ${multi1_ip6}"
+ check ovn-nbctl lsp-set-addresses multi2 "${multi2_mac} ${multi2_ip} ${multi2_ip6}"
+
+ check ovn-nbctl lsp-add ls0 public
+ check ovn-nbctl lsp-set-type public localnet
+ check ovn-nbctl lsp-set-addresses public unknown
+ check ovn-nbctl lsp-set-options public network_name=phys
+
+ check ovn-nbctl lsp-set-options first requested-chassis=hv1
+ check ovn-nbctl lsp-set-options second requested-chassis=hv2
+ check ovn-nbctl lsp-set-options multi1 requested-chassis=hv1,hv2
+ check ovn-nbctl lsp-set-options multi2 requested-chassis=hv1,hv2
+
+ as hv1 check ovs-vsctl -- add-port br-int first -- \
+ set Interface first external-ids:iface-id=first \
+ options:tx_pcap=hv1/first-tx.pcap \
+ options:rxq_pcap=hv1/first-rx.pcap \
+ ofport-request=1
+ as hv2 check ovs-vsctl -- add-port br-int second -- \
+ set Interface second external-ids:iface-id=second \
+ options:tx_pcap=hv2/second-tx.pcap \
+ options:rxq_pcap=hv2/second-rx.pcap \
+ ofport-request=2
+
+ # Create interfaces for multichassis ports on both hv1 and hv2
+ for hv in hv1 hv2; do
+ for i in 1 2; do
+ as $hv check ovs-vsctl -- add-port br-int multi${i} -- \
+ set Interface multi${i} external-ids:iface-id=multi${i} \
+ options:tx_pcap=$hv/multi${i}-tx.pcap \
+ options:rxq_pcap=$hv/multi${i}-rx.pcap \
+ ofport-request=${i}00
+ done
+ done
+
+ send_ip_packet() {
+ local inport=${1} hv=${2} eth_src=${3} eth_dst=${4} ipv4_src=${5} ipv4_dst=${6} data=${7} fail=${8} mtu=${9:-$3}
+ packet=$(fmt_pkt "
+ Ether(dst='${eth_dst}', src='${eth_src}') /
+ IP(src='${ipv4_src}', dst='${ipv4_dst}') /
+ ICMP(type=8) / bytes.fromhex('${data}')
+ ")
+ as hv${hv} ovs-appctl netdev-dummy/receive ${inport} ${packet}
+ if [[ x"${fail}" != x0 ]]; then
+ original_ip_frame=$(fmt_pkt "
+ IP(src='${ipv4_src}', dst='${ipv4_dst}') /
+ ICMP(type=8) / bytes.fromhex('${data}')
+ ")
+ # IP(flags=2) means DF (Don't Fragment) = 1
+ # ICMP(type=3, code=4) means Destination Unreachable, Fragmentation Needed
+ packet=$(fmt_pkt "
+ Ether(dst='${eth_src}', src='${eth_dst}') /
+ IP(src='${ipv4_dst}', dst='${ipv4_src}', ttl=255, flags=2, id=0) /
+ ICMP(type=3, code=4, nexthopmtu=${mtu}) /
+ bytes.fromhex('${original_ip_frame:0:$((534 * 2))}')
+ ")
+ fi
+ echo ${packet}
+ }
+
+ send_ip6_packet() {
+ local inport=${1} hv=${2} eth_src=${3} eth_dst=${4} ipv6_src=${5} ipv6_dst=${6} data=${7} fail=${8} mtu=${9:-$3}
+ packet=$(fmt_pkt "
+ Ether(dst='${eth_dst}', src='${eth_src}') /
+ IPv6(src='${ipv6_src}', dst='${ipv6_dst}') /
+ ICMPv6EchoRequest() / bytes.fromhex('${data}')
+ ")
+ as hv${hv} ovs-appctl netdev-dummy/receive ${inport} ${packet}
+ if [[ x"${fail}" != x0 ]]; then
+ original_ip_frame=$(fmt_pkt "
+ IPv6(src='${ipv6_src}', dst='${ipv6_dst}') /
+ ICMPv6EchoRequest() / bytes.fromhex('${data}')
+ ")
+ packet=$(fmt_pkt "
+ Ether(dst='${eth_src}', src='${eth_dst}') /
+ IPv6(src='${ipv6_dst}', dst='${ipv6_src}', hlim=255) /
+ ICMPv6PacketTooBig(mtu=${mtu}) /
+ bytes.fromhex('${original_ip_frame:0:$((1218 * 2))}')
+ ")
+ fi
+ echo ${packet}
+ }
+
+ reset_env() {
+ for port in first multi1 multi2; do
+ as hv1 reset_pcap_file $port hv1/$port
+ done
+ for port in second multi1 multi2; do
+ as hv2 reset_pcap_file $port hv2/$port
+ done
+ for port in hv1/multi1 hv2/multi1 hv1/multi2 hv2/multi2 hv1/first hv2/second; do
+ : > $port.expected
+ done
+ }
+
+ check_pkts() {
+ for port in hv1/multi1 hv2/multi1 hv1/multi2 hv2/multi2 hv1/first hv2/second; do
+ OVN_CHECK_PACKETS_REMOVE_BROADCAST([${port}-tx.pcap], [${port}.expected])
+ done
+ }
+
+ payload() {
+ echo $(cat /dev/urandom | tr -cd 'a-f0-9' | head -c ${1})
+ }
+
+ wait_for_ports_up
+ OVN_POPULATE_ARP
+
+ reset_env
+
+ AS_BOX([Packets of proper size are delivered from multichassis to regular ports])
+
+ len=1000
+ packet=$(send_ip_packet multi1 1 $multi1_mac $first_mac $multi1_ip $first_ip $(payload $len) 0)
+ echo $packet >> hv1/first.expected
+
+ packet=$(send_ip_packet multi1 1 $multi1_mac $second_mac $multi1_ip $second_ip $(payload $len) 0)
+ echo $packet >> hv2/second.expected
+
+ packet=$(send_ip6_packet multi1 1 $multi1_mac $first_mac $multi1_ip6 $first_ip6 $(payload $len) 0)
+ echo $packet >> hv1/first.expected
+
+ packet=$(send_ip6_packet multi1 1 $multi1_mac $second_mac $multi1_ip6 $second_ip6 $(payload $len) 0)
+ echo $packet >> hv2/second.expected
+
+ check_pkts
+ reset_env
+
+ AS_BOX([Oversized packets are not delivered from multichassis to regular ports])
+
+ len=3000
+ packet=$(send_ip_packet multi1 1 $multi1_mac $first_mac $multi1_ip $first_ip $(payload $len) 1)
+ echo $packet >> hv1/multi1.expected
+
+ packet=$(send_ip_packet multi1 1 $multi1_mac $second_mac $multi1_ip $second_ip $(payload $len) 1)
+ echo $packet >> hv1/multi1.expected
+
+ packet=$(send_ip6_packet multi1 1 $multi1_mac $first_mac $multi1_ip6 $first_ip6 $(payload $len) 1)
+ echo $packet >> hv1/multi1.expected
+
+ packet=$(send_ip6_packet multi1 1 $multi1_mac $second_mac $multi1_ip6 $second_ip6 $(payload $len) 1)
+ echo $packet >> hv1/multi1.expected
+
+ check_pkts
+ reset_env
+
+ AS_BOX([Packets of proper size are delivered from regular to multichassis ports])
+
+ len=1000
+ packet=$(send_ip_packet first 1 $first_mac $multi1_mac $first_ip $multi1_ip $(payload $len) 0)
+ echo $packet >> hv1/multi1.expected
+ echo $packet >> hv2/multi1.expected
+
+ packet=$(send_ip_packet second 2 $second_mac $multi1_mac $second_ip $multi1_ip $(payload $len) 0)
+ echo $packet >> hv1/multi1.expected
+ echo $packet >> hv2/multi1.expected
+
+ packet=$(send_ip6_packet first 1 $first_mac $multi1_mac $first_ip6 $multi1_ip6 $(payload $len) 0)
+ echo $packet >> hv1/multi1.expected
+ echo $packet >> hv2/multi1.expected
+
+ packet=$(send_ip6_packet second 2 $second_mac $multi1_mac $second_ip6 $multi1_ip6 $(payload $len) 0)
+ echo $packet >> hv1/multi1.expected
+ echo $packet >> hv2/multi1.expected
+
+ check_pkts
+ reset_env
+
+ AS_BOX([Oversized packets are not delivered from regular to multichassis ports])
+
+ len=3000
+ packet=$(send_ip_packet first 1 $first_mac $multi1_mac $first_ip $multi1_ip $(payload $len) 1)
+ echo $packet >> hv1/first.expected
+
+ packet=$(send_ip_packet second 2 $second_mac $multi1_mac $second_ip $multi1_ip $(payload $len) 1)
+ echo $packet >> hv2/second.expected
+
+ packet=$(send_ip6_packet first 1 $first_mac $multi1_mac $first_ip6 $multi1_ip6 $(payload $len) 1)
+ echo $packet >> hv1/first.expected
+
+ packet=$(send_ip6_packet second 2 $second_mac $multi1_mac $second_ip6 $multi1_ip6 $(payload $len) 1)
+ echo $packet >> hv2/second.expected
+
+ check_pkts
+ reset_env
+
+ AS_BOX([Packets of proper size are delivered from multichassis to multichassis ports])
+
+ len=1000
+ packet=$(send_ip_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip $multi2_ip $(payload $len) 0)
+ echo $packet >> hv1/multi2.expected
+ echo $packet >> hv2/multi2.expected
+
+ packet=$(send_ip6_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip6 $multi2_ip6 $(payload $len) 0)
+ echo $packet >> hv1/multi2.expected
+ echo $packet >> hv2/multi2.expected
+
+ check_pkts
+ reset_env
+
+ AS_BOX([Oversized packets are not delivered from multichassis to multichassis ports])
+
+ len=3000
+ packet=$(send_ip_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip $multi2_ip $(payload $len) 1)
+ echo $packet >> hv1/multi1.expected
+
+ packet=$(send_ip6_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip6 $multi2_ip6 $(payload $len) 1)
+ echo $packet >> hv1/multi1.expected
+
+ check_pkts
+ reset_env
+
+ AS_BOX([MTU updates are honored in ICMP Path MTU calculation])
+
+ set_mtu() {
+ local hv=${1} iface=${2} new_mtu=${3}
+
+ iface_uuid=$(as ${hv} ovs-vsctl --bare --columns _uuid find Interface name=${iface})
+ check as ${hv} ovs-vsctl set interface ${iface_uuid} mtu_request=${new_mtu}
+ }
+
+ set_mtu_for_all_ports() {
+ for port in multi1 multi2 first; do
+ set_mtu hv1 ${port} ${1}
+ done
+ for port in multi1 multi2 second; do
+ set_mtu hv2 ${port} ${1}
+ done
+ }
+
+ initial_mtu=1500 # all interfaces are 1500 by default
+ new_mtu=1400
+ set_mtu_for_all_ports ${new_mtu}
+ mtu_diff=$((${initial_mtu} - ${new_mtu}))
+
+ len=3000
+ expected_ip_mtu=$(($3 - ${mtu_diff}))
+ packet=$(send_ip_packet first 1 $first_mac $multi1_mac $first_ip $multi1_ip $(payload $len) 1 ${expected_ip_mtu})
+ echo $packet >> hv1/first.expected
+
+ packet=$(send_ip_packet second 2 $second_mac $multi1_mac $second_ip $multi1_ip $(payload $len) 1 ${expected_ip_mtu})
+ echo $packet >> hv2/second.expected
+
+ packet=$(send_ip6_packet first 1 $first_mac $multi1_mac $first_ip6 $multi1_ip6 $(payload $len) 1 ${expected_ip_mtu})
+ echo $packet >> hv1/first.expected
+
+ packet=$(send_ip6_packet second 2 $second_mac $multi1_mac $second_ip6 $multi1_ip6 $(payload $len) 1 ${expected_ip_mtu})
+ echo $packet >> hv2/second.expected
+
+ packet=$(send_ip_packet multi1 1 $multi1_mac $first_mac $multi1_ip $first_ip $(payload $len) 1 ${expected_ip_mtu})
+ echo $packet >> hv1/multi1.expected
+
+ packet=$(send_ip_packet multi1 1 $multi1_mac $second_mac $multi1_ip $second_ip $(payload $len) 1 ${expected_ip_mtu})
+ echo $packet >> hv1/multi1.expected
+
+ packet=$(send_ip6_packet multi1 1 $multi1_mac $first_mac $multi1_ip6 $first_ip6 $(payload $len) 1 ${expected_ip_mtu})
+ echo $packet >> hv1/multi1.expected
+
+ packet=$(send_ip6_packet multi1 1 $multi1_mac $second_mac $multi1_ip6 $second_ip6 $(payload $len) 1 ${expected_ip_mtu})
+ echo $packet >> hv1/multi1.expected
+
+ packet=$(send_ip_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip $multi2_ip $(payload $len) 1 ${expected_ip_mtu})
+ echo $packet >> hv1/multi1.expected
+
+ packet=$(send_ip6_packet multi1 1 $multi1_mac $multi2_mac $multi1_ip6 $multi2_ip6 $(payload $len) 1 ${expected_ip_mtu})
+ echo $packet >> hv1/multi1.expected
+
+ check_pkts
+
+ OVN_CLEANUP([hv1],[hv2])
+
+ AT_CLEANUP
+ ])])
+
+# NOTE(ihar) no STT variants because it's not supported by upstream kernels
+MULTICHASSIS_PATH_MTU_DISCOVERY_TEST([ipv4], [geneve], [1424])
+MULTICHASSIS_PATH_MTU_DISCOVERY_TEST([ipv6], [geneve], [1404])
+MULTICHASSIS_PATH_MTU_DISCOVERY_TEST([ipv4], [vxlan], [1432])
+MULTICHASSIS_PATH_MTU_DISCOVERY_TEST([ipv6], [vxlan], [1412])
+
OVN_FOR_EACH_NORTHD([
AT_SETUP([options:activation-strategy for logical port])
AT_KEYWORDS([multi-chassis])