diff mbox series

[ovs-dev] Implement Path MTU Discovery for multichassis ports

Message ID 20221103034554.346944-1-ihrachys@redhat.com
State Changes Requested
Headers show
Series [ovs-dev] Implement Path MTU Discovery for multichassis ports | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test fail github build: failed
ovsrobot/github-robot-_ovn-kubernetes fail github build: failed

Commit Message

Ihar Hrachyshka Nov. 3, 2022, 3:45 a.m. UTC
When multichassis ports attached to localnet switches are forced to use
tunneling to communicate with other ports, MTU of the path between ports
may effectively change (to accommodate tunnel headers or otherwise).

This patch implements Path MTU Discovery for IPv4 and IPv6 for
multichassis ports (both egress and ingress).

This patch adds new flows as follows:

- table 30: store check_pkt_len(mtu)->reg9[1]
- table 37: if reg9[1] is set, then send the packet to controller.

TODO: document behavior.
TODO: dynamically calculate MTU of the interface.
TODO: constrain new flows to communication involving m-chassis ports only.

Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
---
 controller/physical.c | 167 ++++++++++++++++++++++++++++
 tests/ovn.at          | 252 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 419 insertions(+)

Comments

Ales Musil Nov. 16, 2022, 8:55 a.m. UTC | #1
On Thu, Nov 3, 2022 at 4:46 AM Ihar Hrachyshka <ihrachys@redhat.com> wrote:

> When multichassis ports attached to localnet switches are forced to use
> tunneling to communicate with other ports, MTU of the path between ports
> may effectively change (to accommodate tunnel headers or otherwise).
>
> This patch implements Path MTU Discovery for IPv4 and IPv6 for
> multichassis ports (both egress and ingress).
>
> This patch adds new flows as follows:
>
> - table 30: store check_pkt_len(mtu)->reg9[1]
> - table 37: if reg9[1] is set, then send the packet to controller.
>
> TODO: document behavior.
> TODO: dynamically calculate MTU of the interface.
> TODO: constrain new flows to communication involving m-chassis ports only.
>
> Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
>
---
>  controller/physical.c | 167 ++++++++++++++++++++++++++++
>  tests/ovn.at          | 252 ++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 419 insertions(+)
>
> diff --git a/controller/physical.c b/controller/physical.c
> index 705146316..96c0dc31a 100644
> --- a/controller/physical.c
> +++ b/controller/physical.c
> @@ -1072,6 +1072,170 @@ setup_activation_strategy(const struct
> sbrec_port_binding *binding,
>      }
>  }
>
> +static size_t
> +encode_start_controller_op(enum action_opcode opcode, bool pause,
> +                           uint32_t meter_id, struct ofpbuf *ofpacts)
> +{
> +    size_t ofs = ofpacts->size;
> +
> +    struct ofpact_controller *oc = ofpact_put_CONTROLLER(ofpacts);
> +    oc->max_len = UINT16_MAX;
> +    oc->reason = OFPR_ACTION;
> +    oc->pause = pause;
> +    if (!ovs_feature_is_supported(OVS_DP_METER_SUPPORT)) {
> +        meter_id = NX_CTLR_NO_METER;
> +    }
> +    oc->meter_id = meter_id;
> +
> +    struct action_header ah = { .opcode = htonl(opcode) };
> +    ofpbuf_put(ofpacts, &ah, sizeof ah);
> +
> +    return ofs;
> +}
> +
> +static void
> +encode_finish_controller_op(size_t ofs, struct ofpbuf *ofpacts)
> +{
> +    struct ofpact_controller *oc = ofpbuf_at_assert(ofpacts, ofs, sizeof
> *oc);
> +    ofpacts->header = oc;
> +    oc->userdata_len = ofpacts->size - (ofs + sizeof *oc);
> +    ofpact_finish_CONTROLLER(ofpacts, &oc);
> +}
> +
>
+static void
> +handle_oversized_ip_packets(struct ovn_desired_flow_table *flow_table,
> +                            const struct sbrec_port_binding *binding,
> +                            bool is_ipv6)
> +{
> +    uint32_t dp_key = binding->datapath->tunnel_key;
> +
> +    struct match match;
> +    match_init_catchall(&match);
> +    match_set_metadata(&match, htonll(dp_key));
> +
> +    struct ofpbuf ofpacts;
> +    ofpbuf_init(&ofpacts, 0);
> +
> +    /* Store packet too large flag in reg9[1]. */
> +    /* TODO: limit to multichassis traffic? */
> +    match_init_catchall(&match);
> +    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 :
> ETH_TYPE_IP));
> +    match_set_metadata(&match, htonll(dp_key));
> +
> +    /* TODO: get mtu from interface of the tunnel */
> +    uint16_t frag_mtu = 1500;
> +    struct ofpact_check_pkt_larger *pkt_larger =
> +        ofpact_put_CHECK_PKT_LARGER(&ofpacts);
> +    pkt_larger->pkt_len = frag_mtu;
> +    pkt_larger->dst.field = mf_from_id(MFF_REG9);
> +    pkt_larger->dst.ofs = 1;
> +
> +    put_resubmit(31, &ofpacts);
> +    /* TODO: should we tag table=30 as consumed for this job anywhere? */
> +    ofctrl_add_flow(flow_table, 30, 110,
> +                    binding->header_.uuid.parts[0], &match, &ofpacts,
> +                    &binding->header_.uuid);
> +    ofpbuf_uninit(&ofpacts);
>
+
> +    /* Generate ICMP Fragmentation needed for IP packets that are too
> large
> +     * (reg9[1] == 1) */
> +    /* TODO: limit to multichassis traffic? */
> +    match_init_catchall(&match);
> +    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 :
> ETH_TYPE_IP));
> +    match_set_reg_masked(&match, MFF_REG9 - MFF_REG0, 1 << 1, 1 << 1);
> +
> +    /* Return ICMP error with a part of the original IP packet included.
> */
> +    ofpbuf_init(&ofpacts, 0);
> +    size_t oc_offset = encode_start_controller_op(
> +        ACTION_OPCODE_ICMP, true, NX_CTLR_NO_METER, &ofpacts);
> +
> +    /* Before sending the ICMP error packet back to the pipeline, set a
> number
> +     * of fields. */
> +    struct ofpbuf inner_ofpacts;
> +    ofpbuf_init(&inner_ofpacts, 0);
> +
> +    /* The new error packet is no longer too large */
> +    /* REGBIT_PKT_LARGER = 0 */
> +    ovs_be32 value = htonl(0);
> +    ovs_be32 mask = htonl(1 << 1);
> +    ofpact_put_set_field(
> +        &inner_ofpacts, mf_from_id(MFF_REG9), &value, &mask);
> +
> +    /* The new error packet is delivered locally */
> +    /* REGBIT_EGRESS_LOOPBACK = 1 */
> +    value = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
> +    mask = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
> +    ofpact_put_set_field(
> +        &inner_ofpacts, mf_from_id(MFF_LOG_FLAGS), &value, &mask);
> +
> +    /* eth.dst */
> +    put_stack(MFF_ETH_SRC, ofpact_put_STACK_PUSH(&inner_ofpacts));
> +    put_stack(MFF_ETH_DST, ofpact_put_STACK_POP(&inner_ofpacts));
> +
> +    /* ip.dst */
> +    put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
> +              ofpact_put_STACK_PUSH(&inner_ofpacts));
> +    put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
> +              ofpact_put_STACK_POP(&inner_ofpacts));
> +
> +    /* ip.ttl */
> +    struct ofpact_ip_ttl *ip_ttl = ofpact_put_SET_IP_TTL(&inner_ofpacts);
> +    ip_ttl->ttl = 255;
> +
> +    if (is_ipv6) {
> +        /* icmp6.type = 2 (Packet Too Big) */
> +        /* icmp6.code = 0 */
> +        uint8_t icmp_type = 2;
> +        uint8_t icmp_code = 0;
> +        ofpact_put_set_field(
> +            &inner_ofpacts, mf_from_id(MFF_ICMPV6_TYPE), &icmp_type,
> NULL);
> +        ofpact_put_set_field(
> +            &inner_ofpacts, mf_from_id(MFF_ICMPV6_CODE), &icmp_code,
> NULL);
> +
> +        /* icmp6.frag_mtu */
> +        size_t frag_mtu_oc_offset = encode_start_controller_op(
> +            ACTION_OPCODE_PUT_ICMP6_FRAG_MTU, true, NX_CTLR_NO_METER,
> +            &inner_ofpacts);
> +        ovs_be32 frag_mtu_ovs = htonl(frag_mtu);
> +        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
> +        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
> +    } else {
> +        /* icmp4.type = 3 (Destination Unreachable) */
> +        /* icmp4.code = 4 (Fragmentation Needed) */
> +        uint8_t icmp_type = 3;
> +        uint8_t icmp_code = 4;
> +        ofpact_put_set_field(
> +            &inner_ofpacts, mf_from_id(MFF_ICMPV4_TYPE), &icmp_type,
> NULL);
> +        ofpact_put_set_field(
> +            &inner_ofpacts, mf_from_id(MFF_ICMPV4_CODE), &icmp_code,
> NULL);
> +
> +        /* icmp4.frag_mtu = */
> +        size_t frag_mtu_oc_offset = encode_start_controller_op(
> +            ACTION_OPCODE_PUT_ICMP4_FRAG_MTU, true, NX_CTLR_NO_METER,
> +            &inner_ofpacts);
> +        ovs_be16 frag_mtu_ovs = htons(frag_mtu);
> +        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
> +        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
> +    }
> +
> +    /* Finally, submit the ICMP error back to the ingress pipeline */
> +    put_resubmit(8, &inner_ofpacts);
> +
> +    /* Attach nested actions to ICMP error controller handler */
> +    ofpacts_put_openflow_actions(inner_ofpacts.data, inner_ofpacts.size,
> +                                 &ofpacts, OFP15_VERSION);
> +
> +    /* Finalize the ICMP error controller handler */
> +    encode_finish_controller_op(oc_offset, &ofpacts);
> +
> +    ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 1000,
> +                    binding->header_.uuid.parts[0], &match, &ofpacts,
> +                    &binding->header_.uuid);
> +
> +    ofpbuf_uninit(&inner_ofpacts);
> +    ofpbuf_uninit(&ofpacts);
> +}
> +
>  static void
>  enforce_tunneling_for_multichassis_ports(
>      struct local_datapath *ld,
> @@ -1124,6 +1288,9 @@ enforce_tunneling_for_multichassis_ports(
>                          binding->header_.uuid.parts[0], &match, &ofpacts,
>                          &binding->header_.uuid);
>          ofpbuf_uninit(&ofpacts);
> +
> +        handle_oversized_ip_packets(flow_table, binding, false);
> +        handle_oversized_ip_packets(flow_table, binding, true);
>      }
>
>      struct tunnel *tun_elem;
> diff --git a/tests/ovn.at b/tests/ovn.at
> index f8b8db4df..91ecd0814 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -14887,6 +14887,258 @@ OVN_CLEANUP([hv1],[hv2],[hv3])
>  AT_CLEANUP
>  ])
>
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([localnet connectivity with multiple requested-chassis, path mtu
> discovery])
> +AT_KEYWORDS([multi-chassis])
> +ovn_start
> +
> +net_add n1
> +for i in 1 2; do
> +    sim_add hv$i
> +    as hv$i
> +    check ovs-vsctl add-br br-phys
> +    ovn_attach n1 br-phys 192.168.0.$i
> +    check ovs-vsctl set open .
> external-ids:ovn-bridge-mappings=phys:br-phys
> +done
> +
> +check ovn-nbctl ls-add ls0 -- add Logical_Switch ls0 other_config
> vlan-passthru=true
> +check ovn-nbctl lsp-add ls0 first
> +check ovn-nbctl lsp-add ls0 second
> +check ovn-nbctl lsp-add ls0 migrator
> +check ovn-nbctl lsp-set-addresses first "00:00:00:00:00:01 10.0.0.1
> ff::01"
> +check ovn-nbctl lsp-set-addresses second "00:00:00:00:00:02 10.0.0.2
> ff::02"
> +check ovn-nbctl lsp-set-addresses migrator "00:00:00:00:00:ff 10.0.0.100
> ff::ff"
> +
> +check ovn-nbctl lsp-add ls0 public
> +check ovn-nbctl lsp-set-type public localnet
> +check ovn-nbctl lsp-set-addresses public unknown
> +check ovn-nbctl lsp-set-options public network_name=phys
> +
> +# TODO: do we need mtu request here?
> +check ovn-nbctl lsp-set-options first requested-chassis=hv1
> vif-plug-mtu-request=1500
> +check ovn-nbctl lsp-set-options second requested-chassis=hv2
> vif-plug-mtu-request=1500
> +check ovn-nbctl lsp-set-options migrator requested-chassis=hv1,hv2
> vif-plug-mtu-request=1500
> +
> +as hv1 check ovs-vsctl -- add-port br-int first -- \
> +    set Interface first external-ids:iface-id=first \
> +    options:tx_pcap=hv1/first-tx.pcap \
> +    options:rxq_pcap=hv1/first-rx.pcap \
> +    ofport-request=1
> +as hv2 check ovs-vsctl -- add-port br-int second -- \
> +    set Interface second external-ids:iface-id=second \
> +    options:tx_pcap=hv2/second-tx.pcap \
> +    options:rxq_pcap=hv2/second-rx.pcap \
> +    ofport-request=2
> +
> +# Create Migrator interfaces on both hv1 and hv2
> +for hv in hv1 hv2; do
> +    as $hv check ovs-vsctl -- add-port br-int migrator -- \
> +        set Interface migrator external-ids:iface-id=migrator \
> +        options:tx_pcap=$hv/migrator-tx.pcap \
> +        options:rxq_pcap=$hv/migrator-rx.pcap \
> +        ofport-request=100
> +done
> +
> +send_ip_packet() {
> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv4_src=$5 ipv4_dst=$6
> ip_chksum=$7 data=$8 ip_len=$9
> +    local ip_ttl=ff
> +    local
> packet=${eth_dst}${eth_src}08004500${ip_len}00004000${ip_ttl}fd${ip_chksum}${ipv4_src}${ipv4_dst}${data}
> +    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
> +    echo $packet
> +}
> +
> +send_ip6_packet() {
> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv6_src=$5 ipv6_dst=$6
> data=$7 ip_len=$8
> +    local ip_ttl=ff
> +    local
> packet=${eth_dst}${eth_src}86dd60000000${ip_len}fd${ip_ttl}${ipv6_src}${ipv6_dst}${data}
> +    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
> +    echo $packet
> +}
> +
> +reset_env() {
> +    as hv1 reset_pcap_file first hv1/first
> +    as hv2 reset_pcap_file second hv2/second
> +    as hv1 reset_pcap_file migrator hv1/migrator
> +    as hv2 reset_pcap_file migrator hv2/migrator
> +
> +    for port in hv1/migrator hv2/migrator hv1/first hv2/second; do
> +        : > $port.expected
> +    done
> +}
> +
> +first_mac=000000000001
> +second_mac=000000000002
> +migrator_mac=0000000000ff
> +first_ip=$(ip_to_hex 10 0 0 1)
> +second_ip=$(ip_to_hex 10 0 0 2)
> +migrator_ip=$(ip_to_hex 10 0 0 100)
> +first_ip6=ff000000000000000000000000000001
> +second_ip6=ff000000000000000000000000000002
> +migrator_ip6=ff0000000000000000000000000000ff
> +
> +OVN_POPULATE_ARP
> +
> +reset_env
> +
> +# Check that packets of proper size are delivered from multichassis to
> intended ports
> +ip_len=058c
> +len=1400
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac $migrator_ip
> $first_ip 0000 $data $ip_len)
> +echo $packet >> hv1/first.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac $migrator_ip
> $second_ip 0000 $data $ip_len)
> +echo $packet >> hv2/second.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac
> $migrator_ip6 $first_ip6 $data $ip_len)
> +echo $packet >> hv1/first.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac
> $migrator_ip6 $second_ip6 $data $ip_len)
> +echo $packet >> hv2/second.expected
> +
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
> [hv1/first.expected])
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
> [hv2/second.expected])
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
> [hv1/migrator.expected])
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
> [hv2/migrator.expected])
> +
> +reset_env
> +
> +# Now check that oversized packets are not delivered to the intended
> ports, and
> +# that the originating port receives an ICMP error suggesting
> fragmentation
> +
> +ip_len=0bcc
> +len=3000
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac $migrator_ip
> $first_ip 0000 $data $ip_len)
> +echo $packet >> hv1/migrator.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac $migrator_ip
> $second_ip 0000 $data $ip_len)
> +echo $packet >> hv1/migrator.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac
> $migrator_ip6 $first_ip6 $data $ip_len)
> +echo $packet >> hv1/migrator.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac
> $migrator_ip6 $second_ip6 $data $ip_len)
> +echo $packet >> hv1/migrator.expected
> +
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
> [hv1/first.expected])
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
> [hv2/second.expected])
> +
> +# it's harder to construct the original packet that includes geneve
> headers
> +# etc. so check manually
> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/migrator-tx.pcap >
> hv1/migrator.packets
> +sed -i '/ffffffffffff/d' hv1/migrator.packets
> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/migrator-tx.pcap >
> hv2/migrator.packets
> +sed -i '/ffffffffffff/d' hv2/migrator.packets
> +
> +# icmp errors are delivered only to the port binding that originated the
> offending packet
> +AT_CHECK([wc -l hv1/migrator.packets | cut -c 1], [0], [4
> +])
> +AT_CHECK([wc -l hv2/migrator.packets | cut -c 1], [0], [0
> +])
> +
> +for line in $(cat hv1/migrator.expected); do
> +    trimmed=$(echo $line | cut -c 125-225)
> +    echo "checking if packet $trimmed... generated an icmp error"
> +    AT_CHECK([grep ${trimmed} hv1/migrator.packets | wc -l | cut -c 1],
> [0], [1
> +])
> +done
> +
> +reset_env
> +
> +# Check that packets of proper size are delivered from regular to
> multichassis ports
> +ip_len=058c
> +len=1400
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip
> $migrator_ip 0000 $data $ip_len)
> +echo $packet >> hv1/migrator.expected
> +echo $packet >> hv2/migrator.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip
> $migrator_ip 0000 $data $ip_len)
> +echo $packet >> hv1/migrator.expected
> +echo $packet >> hv2/migrator.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6
> $migrator_ip6 $data $ip_len)
> +echo $packet >> hv1/migrator.expected
> +echo $packet >> hv2/migrator.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6
> $migrator_ip6 $data $ip_len)
> +echo $packet >> hv1/migrator.expected
> +echo $packet >> hv2/migrator.expected
> +
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
> [hv1/first.expected])
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
> [hv2/second.expected])
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
> [hv1/migrator.expected])
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
> [hv2/migrator.expected])
> +
> +reset_env
> +
> +# Now check that oversized packets are not delivered to multichassis
> ports, and
> +# that the originating ports receive an ICMP error suggesting
> fragmentation
> +
> +ip_len=0bcc
> +len=3000
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip
> $migrator_ip 0000 $data $ip_len)
> +echo $packet >> hv1/first.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip
> $migrator_ip 0000 $data $ip_len)
> +echo $packet >> hv2/second.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6
> $migrator_ip6 $data $ip_len)
> +echo $packet >> hv1/first.expected
> +
> +data=$(xxd -l $len -c $len -p < /dev/urandom)
> +packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6
> $migrator_ip6 $data $ip_len)
> +echo $packet >> hv2/second.expected
> +
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
> [hv1/migrator.expected])
> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
> [hv2/migrator.expected])
> +
> +# it's harder to construct the original packet that includes geneve
> headers
> +# etc. so check manually
> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/first-tx.pcap >
> hv1/first.packets
> +sed -i '/ffffffffffff/d' hv1/first.packets
> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/second-tx.pcap >
> hv2/second.packets
> +sed -i '/ffffffffffff/d' hv2/second.packets
> +
> +AT_CHECK([wc -l hv1/first.packets | cut -c 1], [0], [2
> +])
> +AT_CHECK([wc -l hv2/second.packets | cut -c 1], [0], [2
> +])
> +
> +for line in $(cat hv1/first.expected); do
> +    trimmed=$(echo $line | cut -c 125-225)
> +    echo "checking if packet $trimmed... generated an icmp error"
> +    AT_CHECK([grep ${trimmed} hv1/first.packets | wc -l | cut -c 1], [0],
> [1
> +])
> +done
> +
> +for line in $(cat hv2/second.expected); do
> +    trimmed=$(echo $line | cut -c 125-225)
> +    echo "checking if packet $trimmed... generated an icmp error"
> +    AT_CHECK([grep ${trimmed} hv2/second.packets | wc -l | cut -c 1],
> [0], [1
> +])
> +done
> +
> +# todo: test multichassis to multichassis
> +
> +OVN_CLEANUP([hv1],[hv2])
> +
> +AT_CLEANUP
> +])
> +
>  OVN_FOR_EACH_NORTHD([
>  AT_SETUP([options:activation-strategy for logical port])
>  AT_KEYWORDS([multi-chassis])
> --
> 2.34.3
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
>
Hi Ihar,
I've started to do the review and came to the conclusion that this approach
does not seem
to be the right way to address something like that. First issue being that
putting something
in the existing logical stages is very error prone (the check_packet_larger
in table 30, which belongs to
"ls_in_external_port" stage).

To me it feels that we should have additional two stages in the switch
pipeline
that would check if the packet is larger, if so it would send icmp4/6_error
in the next stage.
This way we have more control and those stages could be used for something
else in the future.
Also this makes it way easier to overview what is happening in the logical
stage, it also gives us
an easy way to limit this to ports that are bound to multiple chassis and
to remove the flows when that
changes.

Also with the ICMP message there might be an issue: who is actually the
source? Is it the original port?

Dumitru, Mark, any thoughts?
Ihar Hrachyshka Nov. 18, 2022, 9:08 p.m. UTC | #2
On Wed, Nov 16, 2022 at 3:55 AM Ales Musil <amusil@redhat.com> wrote:

>
>
> On Thu, Nov 3, 2022 at 4:46 AM Ihar Hrachyshka <ihrachys@redhat.com>
> wrote:
>
>> When multichassis ports attached to localnet switches are forced to use
>> tunneling to communicate with other ports, MTU of the path between ports
>> may effectively change (to accommodate tunnel headers or otherwise).
>>
>> This patch implements Path MTU Discovery for IPv4 and IPv6 for
>> multichassis ports (both egress and ingress).
>>
>> This patch adds new flows as follows:
>>
>> - table 30: store check_pkt_len(mtu)->reg9[1]
>> - table 37: if reg9[1] is set, then send the packet to controller.
>>
>> TODO: document behavior.
>> TODO: dynamically calculate MTU of the interface.
>> TODO: constrain new flows to communication involving m-chassis ports only.
>>
>> Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
>>
> ---
>>  controller/physical.c | 167 ++++++++++++++++++++++++++++
>>  tests/ovn.at          | 252 ++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 419 insertions(+)
>>
>> diff --git a/controller/physical.c b/controller/physical.c
>> index 705146316..96c0dc31a 100644
>> --- a/controller/physical.c
>> +++ b/controller/physical.c
>> @@ -1072,6 +1072,170 @@ setup_activation_strategy(const struct
>> sbrec_port_binding *binding,
>>      }
>>  }
>>
>> +static size_t
>> +encode_start_controller_op(enum action_opcode opcode, bool pause,
>> +                           uint32_t meter_id, struct ofpbuf *ofpacts)
>> +{
>> +    size_t ofs = ofpacts->size;
>> +
>> +    struct ofpact_controller *oc = ofpact_put_CONTROLLER(ofpacts);
>> +    oc->max_len = UINT16_MAX;
>> +    oc->reason = OFPR_ACTION;
>> +    oc->pause = pause;
>> +    if (!ovs_feature_is_supported(OVS_DP_METER_SUPPORT)) {
>> +        meter_id = NX_CTLR_NO_METER;
>> +    }
>> +    oc->meter_id = meter_id;
>> +
>> +    struct action_header ah = { .opcode = htonl(opcode) };
>> +    ofpbuf_put(ofpacts, &ah, sizeof ah);
>> +
>> +    return ofs;
>> +}
>> +
>> +static void
>> +encode_finish_controller_op(size_t ofs, struct ofpbuf *ofpacts)
>> +{
>> +    struct ofpact_controller *oc = ofpbuf_at_assert(ofpacts, ofs, sizeof
>> *oc);
>> +    ofpacts->header = oc;
>> +    oc->userdata_len = ofpacts->size - (ofs + sizeof *oc);
>> +    ofpact_finish_CONTROLLER(ofpacts, &oc);
>> +}
>> +
>>
> +static void
>> +handle_oversized_ip_packets(struct ovn_desired_flow_table *flow_table,
>> +                            const struct sbrec_port_binding *binding,
>> +                            bool is_ipv6)
>> +{
>> +    uint32_t dp_key = binding->datapath->tunnel_key;
>> +
>> +    struct match match;
>> +    match_init_catchall(&match);
>> +    match_set_metadata(&match, htonll(dp_key));
>> +
>> +    struct ofpbuf ofpacts;
>> +    ofpbuf_init(&ofpacts, 0);
>> +
>> +    /* Store packet too large flag in reg9[1]. */
>> +    /* TODO: limit to multichassis traffic? */
>> +    match_init_catchall(&match);
>> +    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 :
>> ETH_TYPE_IP));
>> +    match_set_metadata(&match, htonll(dp_key));
>> +
>> +    /* TODO: get mtu from interface of the tunnel */
>> +    uint16_t frag_mtu = 1500;
>> +    struct ofpact_check_pkt_larger *pkt_larger =
>> +        ofpact_put_CHECK_PKT_LARGER(&ofpacts);
>> +    pkt_larger->pkt_len = frag_mtu;
>> +    pkt_larger->dst.field = mf_from_id(MFF_REG9);
>> +    pkt_larger->dst.ofs = 1;
>> +
>> +    put_resubmit(31, &ofpacts);
>> +    /* TODO: should we tag table=30 as consumed for this job anywhere? */
>> +    ofctrl_add_flow(flow_table, 30, 110,
>> +                    binding->header_.uuid.parts[0], &match, &ofpacts,
>> +                    &binding->header_.uuid);
>> +    ofpbuf_uninit(&ofpacts);
>>
> +
>> +    /* Generate ICMP Fragmentation needed for IP packets that are too
>> large
>> +     * (reg9[1] == 1) */
>> +    /* TODO: limit to multichassis traffic? */
>> +    match_init_catchall(&match);
>> +    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 :
>> ETH_TYPE_IP));
>> +    match_set_reg_masked(&match, MFF_REG9 - MFF_REG0, 1 << 1, 1 << 1);
>> +
>> +    /* Return ICMP error with a part of the original IP packet included.
>> */
>> +    ofpbuf_init(&ofpacts, 0);
>> +    size_t oc_offset = encode_start_controller_op(
>> +        ACTION_OPCODE_ICMP, true, NX_CTLR_NO_METER, &ofpacts);
>> +
>> +    /* Before sending the ICMP error packet back to the pipeline, set a
>> number
>> +     * of fields. */
>> +    struct ofpbuf inner_ofpacts;
>> +    ofpbuf_init(&inner_ofpacts, 0);
>> +
>> +    /* The new error packet is no longer too large */
>> +    /* REGBIT_PKT_LARGER = 0 */
>> +    ovs_be32 value = htonl(0);
>> +    ovs_be32 mask = htonl(1 << 1);
>> +    ofpact_put_set_field(
>> +        &inner_ofpacts, mf_from_id(MFF_REG9), &value, &mask);
>> +
>> +    /* The new error packet is delivered locally */
>> +    /* REGBIT_EGRESS_LOOPBACK = 1 */
>> +    value = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
>> +    mask = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
>> +    ofpact_put_set_field(
>> +        &inner_ofpacts, mf_from_id(MFF_LOG_FLAGS), &value, &mask);
>> +
>> +    /* eth.dst */
>> +    put_stack(MFF_ETH_SRC, ofpact_put_STACK_PUSH(&inner_ofpacts));
>> +    put_stack(MFF_ETH_DST, ofpact_put_STACK_POP(&inner_ofpacts));
>> +
>> +    /* ip.dst */
>> +    put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
>> +              ofpact_put_STACK_PUSH(&inner_ofpacts));
>> +    put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
>> +              ofpact_put_STACK_POP(&inner_ofpacts));
>> +
>> +    /* ip.ttl */
>> +    struct ofpact_ip_ttl *ip_ttl = ofpact_put_SET_IP_TTL(&inner_ofpacts);
>> +    ip_ttl->ttl = 255;
>> +
>> +    if (is_ipv6) {
>> +        /* icmp6.type = 2 (Packet Too Big) */
>> +        /* icmp6.code = 0 */
>> +        uint8_t icmp_type = 2;
>> +        uint8_t icmp_code = 0;
>> +        ofpact_put_set_field(
>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV6_TYPE), &icmp_type,
>> NULL);
>> +        ofpact_put_set_field(
>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV6_CODE), &icmp_code,
>> NULL);
>> +
>> +        /* icmp6.frag_mtu */
>> +        size_t frag_mtu_oc_offset = encode_start_controller_op(
>> +            ACTION_OPCODE_PUT_ICMP6_FRAG_MTU, true, NX_CTLR_NO_METER,
>> +            &inner_ofpacts);
>> +        ovs_be32 frag_mtu_ovs = htonl(frag_mtu);
>> +        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
>> +        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
>> +    } else {
>> +        /* icmp4.type = 3 (Destination Unreachable) */
>> +        /* icmp4.code = 4 (Fragmentation Needed) */
>> +        uint8_t icmp_type = 3;
>> +        uint8_t icmp_code = 4;
>> +        ofpact_put_set_field(
>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV4_TYPE), &icmp_type,
>> NULL);
>> +        ofpact_put_set_field(
>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV4_CODE), &icmp_code,
>> NULL);
>> +
>> +        /* icmp4.frag_mtu = */
>> +        size_t frag_mtu_oc_offset = encode_start_controller_op(
>> +            ACTION_OPCODE_PUT_ICMP4_FRAG_MTU, true, NX_CTLR_NO_METER,
>> +            &inner_ofpacts);
>> +        ovs_be16 frag_mtu_ovs = htons(frag_mtu);
>> +        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
>> +        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
>> +    }
>> +
>> +    /* Finally, submit the ICMP error back to the ingress pipeline */
>> +    put_resubmit(8, &inner_ofpacts);
>> +
>> +    /* Attach nested actions to ICMP error controller handler */
>> +    ofpacts_put_openflow_actions(inner_ofpacts.data, inner_ofpacts.size,
>> +                                 &ofpacts, OFP15_VERSION);
>> +
>> +    /* Finalize the ICMP error controller handler */
>> +    encode_finish_controller_op(oc_offset, &ofpacts);
>> +
>> +    ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 1000,
>> +                    binding->header_.uuid.parts[0], &match, &ofpacts,
>> +                    &binding->header_.uuid);
>> +
>> +    ofpbuf_uninit(&inner_ofpacts);
>> +    ofpbuf_uninit(&ofpacts);
>> +}
>> +
>>  static void
>>  enforce_tunneling_for_multichassis_ports(
>>      struct local_datapath *ld,
>> @@ -1124,6 +1288,9 @@ enforce_tunneling_for_multichassis_ports(
>>                          binding->header_.uuid.parts[0], &match, &ofpacts,
>>                          &binding->header_.uuid);
>>          ofpbuf_uninit(&ofpacts);
>> +
>> +        handle_oversized_ip_packets(flow_table, binding, false);
>> +        handle_oversized_ip_packets(flow_table, binding, true);
>>      }
>>
>>      struct tunnel *tun_elem;
>> diff --git a/tests/ovn.at b/tests/ovn.at
>> index f8b8db4df..91ecd0814 100644
>> --- a/tests/ovn.at
>> +++ b/tests/ovn.at
>> @@ -14887,6 +14887,258 @@ OVN_CLEANUP([hv1],[hv2],[hv3])
>>  AT_CLEANUP
>>  ])
>>
>> +OVN_FOR_EACH_NORTHD([
>> +AT_SETUP([localnet connectivity with multiple requested-chassis, path
>> mtu discovery])
>> +AT_KEYWORDS([multi-chassis])
>> +ovn_start
>> +
>> +net_add n1
>> +for i in 1 2; do
>> +    sim_add hv$i
>> +    as hv$i
>> +    check ovs-vsctl add-br br-phys
>> +    ovn_attach n1 br-phys 192.168.0.$i
>> +    check ovs-vsctl set open .
>> external-ids:ovn-bridge-mappings=phys:br-phys
>> +done
>> +
>> +check ovn-nbctl ls-add ls0 -- add Logical_Switch ls0 other_config
>> vlan-passthru=true
>> +check ovn-nbctl lsp-add ls0 first
>> +check ovn-nbctl lsp-add ls0 second
>> +check ovn-nbctl lsp-add ls0 migrator
>> +check ovn-nbctl lsp-set-addresses first "00:00:00:00:00:01 10.0.0.1
>> ff::01"
>> +check ovn-nbctl lsp-set-addresses second "00:00:00:00:00:02 10.0.0.2
>> ff::02"
>> +check ovn-nbctl lsp-set-addresses migrator "00:00:00:00:00:ff 10.0.0.100
>> ff::ff"
>> +
>> +check ovn-nbctl lsp-add ls0 public
>> +check ovn-nbctl lsp-set-type public localnet
>> +check ovn-nbctl lsp-set-addresses public unknown
>> +check ovn-nbctl lsp-set-options public network_name=phys
>> +
>> +# TODO: do we need mtu request here?
>> +check ovn-nbctl lsp-set-options first requested-chassis=hv1
>> vif-plug-mtu-request=1500
>> +check ovn-nbctl lsp-set-options second requested-chassis=hv2
>> vif-plug-mtu-request=1500
>> +check ovn-nbctl lsp-set-options migrator requested-chassis=hv1,hv2
>> vif-plug-mtu-request=1500
>> +
>> +as hv1 check ovs-vsctl -- add-port br-int first -- \
>> +    set Interface first external-ids:iface-id=first \
>> +    options:tx_pcap=hv1/first-tx.pcap \
>> +    options:rxq_pcap=hv1/first-rx.pcap \
>> +    ofport-request=1
>> +as hv2 check ovs-vsctl -- add-port br-int second -- \
>> +    set Interface second external-ids:iface-id=second \
>> +    options:tx_pcap=hv2/second-tx.pcap \
>> +    options:rxq_pcap=hv2/second-rx.pcap \
>> +    ofport-request=2
>> +
>> +# Create Migrator interfaces on both hv1 and hv2
>> +for hv in hv1 hv2; do
>> +    as $hv check ovs-vsctl -- add-port br-int migrator -- \
>> +        set Interface migrator external-ids:iface-id=migrator \
>> +        options:tx_pcap=$hv/migrator-tx.pcap \
>> +        options:rxq_pcap=$hv/migrator-rx.pcap \
>> +        ofport-request=100
>> +done
>> +
>> +send_ip_packet() {
>> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv4_src=$5 ipv4_dst=$6
>> ip_chksum=$7 data=$8 ip_len=$9
>> +    local ip_ttl=ff
>> +    local
>> packet=${eth_dst}${eth_src}08004500${ip_len}00004000${ip_ttl}fd${ip_chksum}${ipv4_src}${ipv4_dst}${data}
>> +    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
>> +    echo $packet
>> +}
>> +
>> +send_ip6_packet() {
>> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv6_src=$5 ipv6_dst=$6
>> data=$7 ip_len=$8
>> +    local ip_ttl=ff
>> +    local
>> packet=${eth_dst}${eth_src}86dd60000000${ip_len}fd${ip_ttl}${ipv6_src}${ipv6_dst}${data}
>> +    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
>> +    echo $packet
>> +}
>> +
>> +reset_env() {
>> +    as hv1 reset_pcap_file first hv1/first
>> +    as hv2 reset_pcap_file second hv2/second
>> +    as hv1 reset_pcap_file migrator hv1/migrator
>> +    as hv2 reset_pcap_file migrator hv2/migrator
>> +
>> +    for port in hv1/migrator hv2/migrator hv1/first hv2/second; do
>> +        : > $port.expected
>> +    done
>> +}
>> +
>> +first_mac=000000000001
>> +second_mac=000000000002
>> +migrator_mac=0000000000ff
>> +first_ip=$(ip_to_hex 10 0 0 1)
>> +second_ip=$(ip_to_hex 10 0 0 2)
>> +migrator_ip=$(ip_to_hex 10 0 0 100)
>> +first_ip6=ff000000000000000000000000000001
>> +second_ip6=ff000000000000000000000000000002
>> +migrator_ip6=ff0000000000000000000000000000ff
>> +
>> +OVN_POPULATE_ARP
>> +
>> +reset_env
>> +
>> +# Check that packets of proper size are delivered from multichassis to
>> intended ports
>> +ip_len=058c
>> +len=1400
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac $migrator_ip
>> $first_ip 0000 $data $ip_len)
>> +echo $packet >> hv1/first.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac
>> $migrator_ip $second_ip 0000 $data $ip_len)
>> +echo $packet >> hv2/second.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac
>> $migrator_ip6 $first_ip6 $data $ip_len)
>> +echo $packet >> hv1/first.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac
>> $migrator_ip6 $second_ip6 $data $ip_len)
>> +echo $packet >> hv2/second.expected
>> +
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
>> [hv1/first.expected])
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
>> [hv2/second.expected])
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
>> [hv1/migrator.expected])
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
>> [hv2/migrator.expected])
>> +
>> +reset_env
>> +
>> +# Now check that oversized packets are not delivered to the intended
>> ports, and
>> +# that the originating port receives an ICMP error suggesting
>> fragmentation
>> +
>> +ip_len=0bcc
>> +len=3000
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac $migrator_ip
>> $first_ip 0000 $data $ip_len)
>> +echo $packet >> hv1/migrator.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac
>> $migrator_ip $second_ip 0000 $data $ip_len)
>> +echo $packet >> hv1/migrator.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac
>> $migrator_ip6 $first_ip6 $data $ip_len)
>> +echo $packet >> hv1/migrator.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac
>> $migrator_ip6 $second_ip6 $data $ip_len)
>> +echo $packet >> hv1/migrator.expected
>> +
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
>> [hv1/first.expected])
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
>> [hv2/second.expected])
>> +
>> +# it's harder to construct the original packet that includes geneve
>> headers
>> +# etc. so check manually
>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/migrator-tx.pcap >
>> hv1/migrator.packets
>> +sed -i '/ffffffffffff/d' hv1/migrator.packets
>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/migrator-tx.pcap >
>> hv2/migrator.packets
>> +sed -i '/ffffffffffff/d' hv2/migrator.packets
>> +
>> +# icmp errors are delivered only to the port binding that originated the
>> offending packet
>> +AT_CHECK([wc -l hv1/migrator.packets | cut -c 1], [0], [4
>> +])
>> +AT_CHECK([wc -l hv2/migrator.packets | cut -c 1], [0], [0
>> +])
>> +
>> +for line in $(cat hv1/migrator.expected); do
>> +    trimmed=$(echo $line | cut -c 125-225)
>> +    echo "checking if packet $trimmed... generated an icmp error"
>> +    AT_CHECK([grep ${trimmed} hv1/migrator.packets | wc -l | cut -c 1],
>> [0], [1
>> +])
>> +done
>> +
>> +reset_env
>> +
>> +# Check that packets of proper size are delivered from regular to
>> multichassis ports
>> +ip_len=058c
>> +len=1400
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip
>> $migrator_ip 0000 $data $ip_len)
>> +echo $packet >> hv1/migrator.expected
>> +echo $packet >> hv2/migrator.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip
>> $migrator_ip 0000 $data $ip_len)
>> +echo $packet >> hv1/migrator.expected
>> +echo $packet >> hv2/migrator.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6
>> $migrator_ip6 $data $ip_len)
>> +echo $packet >> hv1/migrator.expected
>> +echo $packet >> hv2/migrator.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6
>> $migrator_ip6 $data $ip_len)
>> +echo $packet >> hv1/migrator.expected
>> +echo $packet >> hv2/migrator.expected
>> +
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
>> [hv1/first.expected])
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
>> [hv2/second.expected])
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
>> [hv1/migrator.expected])
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
>> [hv2/migrator.expected])
>> +
>> +reset_env
>> +
>> +# Now check that oversized packets are not delivered to multichassis
>> ports, and
>> +# that the originating ports receive an ICMP error suggesting
>> fragmentation
>> +
>> +ip_len=0bcc
>> +len=3000
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip
>> $migrator_ip 0000 $data $ip_len)
>> +echo $packet >> hv1/first.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip
>> $migrator_ip 0000 $data $ip_len)
>> +echo $packet >> hv2/second.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6
>> $migrator_ip6 $data $ip_len)
>> +echo $packet >> hv1/first.expected
>> +
>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>> +packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6
>> $migrator_ip6 $data $ip_len)
>> +echo $packet >> hv2/second.expected
>> +
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
>> [hv1/migrator.expected])
>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
>> [hv2/migrator.expected])
>> +
>> +# it's harder to construct the original packet that includes geneve
>> headers
>> +# etc. so check manually
>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/first-tx.pcap >
>> hv1/first.packets
>> +sed -i '/ffffffffffff/d' hv1/first.packets
>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/second-tx.pcap >
>> hv2/second.packets
>> +sed -i '/ffffffffffff/d' hv2/second.packets
>> +
>> +AT_CHECK([wc -l hv1/first.packets | cut -c 1], [0], [2
>> +])
>> +AT_CHECK([wc -l hv2/second.packets | cut -c 1], [0], [2
>> +])
>> +
>> +for line in $(cat hv1/first.expected); do
>> +    trimmed=$(echo $line | cut -c 125-225)
>> +    echo "checking if packet $trimmed... generated an icmp error"
>> +    AT_CHECK([grep ${trimmed} hv1/first.packets | wc -l | cut -c 1],
>> [0], [1
>> +])
>> +done
>> +
>> +for line in $(cat hv2/second.expected); do
>> +    trimmed=$(echo $line | cut -c 125-225)
>> +    echo "checking if packet $trimmed... generated an icmp error"
>> +    AT_CHECK([grep ${trimmed} hv2/second.packets | wc -l | cut -c 1],
>> [0], [1
>> +])
>> +done
>> +
>> +# todo: test multichassis to multichassis
>> +
>> +OVN_CLEANUP([hv1],[hv2])
>> +
>> +AT_CLEANUP
>> +])
>> +
>>  OVN_FOR_EACH_NORTHD([
>>  AT_SETUP([options:activation-strategy for logical port])
>>  AT_KEYWORDS([multi-chassis])
>> --
>> 2.34.3
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>>
> Hi Ihar,
> I've started to do the review and came to the conclusion that this
> approach does not seem
> to be the right way to address something like that. First issue being that
> putting something
> in the existing logical stages is very error prone (the
> check_packet_larger in table 30, which belongs to
> "ls_in_external_port" stage).
>

To confirm, your concern is not about the idea of sending ICMP Too Big
messages, but only about the pipeline design, correct?


>
> To me it feels that we should have additional two stages in the switch
> pipeline
> that would check if the packet is larger, if so it would send
> icmp4/6_error in the next stage.
> This way we have more control and those stages could be used for something
> else in the future.
> Also this makes it way easier to overview what is happening in the logical
> stage, it also gives us
> an easy way to limit this to ports that are bound to multiple chassis and
> to remove the flows when that
> changes.
>
> Also with the ICMP message there might be an issue: who is actually the
> source? Is it the original port?
>
>
This is something I am to test in real life before I polish this patch. The
current implementation here reuses the original port as the source of the
message. There is no other address to use (there's no router). I understand
that this may be unexpected by the client, hence the need to test it in
real life before committing to it. If these messages are honoured by
clients, then we are good to go. Otherwise, we will have to scrape the
patch. On the result, I will report back when I get to testing it.


> Dumitru, Mark, any thoughts?
>
> --
>
> Ales Musil
>
> Senior Software Engineer - OVN Core
>
> Red Hat EMEA <https://www.redhat.com>
>
> amusil@redhat.com    IM: amusil
> <https://red.ht/sig>
>
Ales Musil Nov. 21, 2022, 6:31 a.m. UTC | #3
On Fri, Nov 18, 2022 at 10:08 PM Ihar Hrachyshka <ihrachys@redhat.com>
wrote:

> On Wed, Nov 16, 2022 at 3:55 AM Ales Musil <amusil@redhat.com> wrote:
>
>>
>>
>> On Thu, Nov 3, 2022 at 4:46 AM Ihar Hrachyshka <ihrachys@redhat.com>
>> wrote:
>>
>>> When multichassis ports attached to localnet switches are forced to use
>>> tunneling to communicate with other ports, MTU of the path between ports
>>> may effectively change (to accommodate tunnel headers or otherwise).
>>>
>>> This patch implements Path MTU Discovery for IPv4 and IPv6 for
>>> multichassis ports (both egress and ingress).
>>>
>>> This patch adds new flows as follows:
>>>
>>> - table 30: store check_pkt_len(mtu)->reg9[1]
>>> - table 37: if reg9[1] is set, then send the packet to controller.
>>>
>>> TODO: document behavior.
>>> TODO: dynamically calculate MTU of the interface.
>>> TODO: constrain new flows to communication involving m-chassis ports
>>> only.
>>>
>>> Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
>>>
>> ---
>>>  controller/physical.c | 167 ++++++++++++++++++++++++++++
>>>  tests/ovn.at          | 252 ++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 419 insertions(+)
>>>
>>> diff --git a/controller/physical.c b/controller/physical.c
>>> index 705146316..96c0dc31a 100644
>>> --- a/controller/physical.c
>>> +++ b/controller/physical.c
>>> @@ -1072,6 +1072,170 @@ setup_activation_strategy(const struct
>>> sbrec_port_binding *binding,
>>>      }
>>>  }
>>>
>>> +static size_t
>>> +encode_start_controller_op(enum action_opcode opcode, bool pause,
>>> +                           uint32_t meter_id, struct ofpbuf *ofpacts)
>>> +{
>>> +    size_t ofs = ofpacts->size;
>>> +
>>> +    struct ofpact_controller *oc = ofpact_put_CONTROLLER(ofpacts);
>>> +    oc->max_len = UINT16_MAX;
>>> +    oc->reason = OFPR_ACTION;
>>> +    oc->pause = pause;
>>> +    if (!ovs_feature_is_supported(OVS_DP_METER_SUPPORT)) {
>>> +        meter_id = NX_CTLR_NO_METER;
>>> +    }
>>> +    oc->meter_id = meter_id;
>>> +
>>> +    struct action_header ah = { .opcode = htonl(opcode) };
>>> +    ofpbuf_put(ofpacts, &ah, sizeof ah);
>>> +
>>> +    return ofs;
>>> +}
>>> +
>>> +static void
>>> +encode_finish_controller_op(size_t ofs, struct ofpbuf *ofpacts)
>>> +{
>>> +    struct ofpact_controller *oc = ofpbuf_at_assert(ofpacts, ofs,
>>> sizeof *oc);
>>> +    ofpacts->header = oc;
>>> +    oc->userdata_len = ofpacts->size - (ofs + sizeof *oc);
>>> +    ofpact_finish_CONTROLLER(ofpacts, &oc);
>>> +}
>>> +
>>>
>> +static void
>>> +handle_oversized_ip_packets(struct ovn_desired_flow_table *flow_table,
>>> +                            const struct sbrec_port_binding *binding,
>>> +                            bool is_ipv6)
>>> +{
>>> +    uint32_t dp_key = binding->datapath->tunnel_key;
>>> +
>>> +    struct match match;
>>> +    match_init_catchall(&match);
>>> +    match_set_metadata(&match, htonll(dp_key));
>>> +
>>> +    struct ofpbuf ofpacts;
>>> +    ofpbuf_init(&ofpacts, 0);
>>> +
>>> +    /* Store packet too large flag in reg9[1]. */
>>> +    /* TODO: limit to multichassis traffic? */
>>> +    match_init_catchall(&match);
>>> +    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 :
>>> ETH_TYPE_IP));
>>> +    match_set_metadata(&match, htonll(dp_key));
>>> +
>>> +    /* TODO: get mtu from interface of the tunnel */
>>> +    uint16_t frag_mtu = 1500;
>>> +    struct ofpact_check_pkt_larger *pkt_larger =
>>> +        ofpact_put_CHECK_PKT_LARGER(&ofpacts);
>>> +    pkt_larger->pkt_len = frag_mtu;
>>> +    pkt_larger->dst.field = mf_from_id(MFF_REG9);
>>> +    pkt_larger->dst.ofs = 1;
>>> +
>>> +    put_resubmit(31, &ofpacts);
>>> +    /* TODO: should we tag table=30 as consumed for this job anywhere?
>>> */
>>> +    ofctrl_add_flow(flow_table, 30, 110,
>>> +                    binding->header_.uuid.parts[0], &match, &ofpacts,
>>> +                    &binding->header_.uuid);
>>> +    ofpbuf_uninit(&ofpacts);
>>>
>> +
>>> +    /* Generate ICMP Fragmentation needed for IP packets that are too
>>> large
>>> +     * (reg9[1] == 1) */
>>> +    /* TODO: limit to multichassis traffic? */
>>> +    match_init_catchall(&match);
>>> +    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 :
>>> ETH_TYPE_IP));
>>> +    match_set_reg_masked(&match, MFF_REG9 - MFF_REG0, 1 << 1, 1 << 1);
>>> +
>>> +    /* Return ICMP error with a part of the original IP packet
>>> included. */
>>> +    ofpbuf_init(&ofpacts, 0);
>>> +    size_t oc_offset = encode_start_controller_op(
>>> +        ACTION_OPCODE_ICMP, true, NX_CTLR_NO_METER, &ofpacts);
>>> +
>>> +    /* Before sending the ICMP error packet back to the pipeline, set a
>>> number
>>> +     * of fields. */
>>> +    struct ofpbuf inner_ofpacts;
>>> +    ofpbuf_init(&inner_ofpacts, 0);
>>> +
>>> +    /* The new error packet is no longer too large */
>>> +    /* REGBIT_PKT_LARGER = 0 */
>>> +    ovs_be32 value = htonl(0);
>>> +    ovs_be32 mask = htonl(1 << 1);
>>> +    ofpact_put_set_field(
>>> +        &inner_ofpacts, mf_from_id(MFF_REG9), &value, &mask);
>>> +
>>> +    /* The new error packet is delivered locally */
>>> +    /* REGBIT_EGRESS_LOOPBACK = 1 */
>>> +    value = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
>>> +    mask = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
>>> +    ofpact_put_set_field(
>>> +        &inner_ofpacts, mf_from_id(MFF_LOG_FLAGS), &value, &mask);
>>> +
>>> +    /* eth.dst */
>>> +    put_stack(MFF_ETH_SRC, ofpact_put_STACK_PUSH(&inner_ofpacts));
>>> +    put_stack(MFF_ETH_DST, ofpact_put_STACK_POP(&inner_ofpacts));
>>> +
>>> +    /* ip.dst */
>>> +    put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
>>> +              ofpact_put_STACK_PUSH(&inner_ofpacts));
>>> +    put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
>>> +              ofpact_put_STACK_POP(&inner_ofpacts));
>>> +
>>> +    /* ip.ttl */
>>> +    struct ofpact_ip_ttl *ip_ttl =
>>> ofpact_put_SET_IP_TTL(&inner_ofpacts);
>>> +    ip_ttl->ttl = 255;
>>> +
>>> +    if (is_ipv6) {
>>> +        /* icmp6.type = 2 (Packet Too Big) */
>>> +        /* icmp6.code = 0 */
>>> +        uint8_t icmp_type = 2;
>>> +        uint8_t icmp_code = 0;
>>> +        ofpact_put_set_field(
>>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV6_TYPE), &icmp_type,
>>> NULL);
>>> +        ofpact_put_set_field(
>>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV6_CODE), &icmp_code,
>>> NULL);
>>> +
>>> +        /* icmp6.frag_mtu */
>>> +        size_t frag_mtu_oc_offset = encode_start_controller_op(
>>> +            ACTION_OPCODE_PUT_ICMP6_FRAG_MTU, true, NX_CTLR_NO_METER,
>>> +            &inner_ofpacts);
>>> +        ovs_be32 frag_mtu_ovs = htonl(frag_mtu);
>>> +        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
>>> +        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
>>> +    } else {
>>> +        /* icmp4.type = 3 (Destination Unreachable) */
>>> +        /* icmp4.code = 4 (Fragmentation Needed) */
>>> +        uint8_t icmp_type = 3;
>>> +        uint8_t icmp_code = 4;
>>> +        ofpact_put_set_field(
>>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV4_TYPE), &icmp_type,
>>> NULL);
>>> +        ofpact_put_set_field(
>>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV4_CODE), &icmp_code,
>>> NULL);
>>> +
>>> +        /* icmp4.frag_mtu = */
>>> +        size_t frag_mtu_oc_offset = encode_start_controller_op(
>>> +            ACTION_OPCODE_PUT_ICMP4_FRAG_MTU, true, NX_CTLR_NO_METER,
>>> +            &inner_ofpacts);
>>> +        ovs_be16 frag_mtu_ovs = htons(frag_mtu);
>>> +        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
>>> +        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
>>> +    }
>>> +
>>> +    /* Finally, submit the ICMP error back to the ingress pipeline */
>>> +    put_resubmit(8, &inner_ofpacts);
>>> +
>>> +    /* Attach nested actions to ICMP error controller handler */
>>> +    ofpacts_put_openflow_actions(inner_ofpacts.data, inner_ofpacts.size,
>>> +                                 &ofpacts, OFP15_VERSION);
>>> +
>>> +    /* Finalize the ICMP error controller handler */
>>> +    encode_finish_controller_op(oc_offset, &ofpacts);
>>> +
>>> +    ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 1000,
>>> +                    binding->header_.uuid.parts[0], &match, &ofpacts,
>>> +                    &binding->header_.uuid);
>>> +
>>> +    ofpbuf_uninit(&inner_ofpacts);
>>> +    ofpbuf_uninit(&ofpacts);
>>> +}
>>> +
>>>  static void
>>>  enforce_tunneling_for_multichassis_ports(
>>>      struct local_datapath *ld,
>>> @@ -1124,6 +1288,9 @@ enforce_tunneling_for_multichassis_ports(
>>>                          binding->header_.uuid.parts[0], &match,
>>> &ofpacts,
>>>                          &binding->header_.uuid);
>>>          ofpbuf_uninit(&ofpacts);
>>> +
>>> +        handle_oversized_ip_packets(flow_table, binding, false);
>>> +        handle_oversized_ip_packets(flow_table, binding, true);
>>>      }
>>>
>>>      struct tunnel *tun_elem;
>>> diff --git a/tests/ovn.at b/tests/ovn.at
>>> index f8b8db4df..91ecd0814 100644
>>> --- a/tests/ovn.at
>>> +++ b/tests/ovn.at
>>> @@ -14887,6 +14887,258 @@ OVN_CLEANUP([hv1],[hv2],[hv3])
>>>  AT_CLEANUP
>>>  ])
>>>
>>> +OVN_FOR_EACH_NORTHD([
>>> +AT_SETUP([localnet connectivity with multiple requested-chassis, path
>>> mtu discovery])
>>> +AT_KEYWORDS([multi-chassis])
>>> +ovn_start
>>> +
>>> +net_add n1
>>> +for i in 1 2; do
>>> +    sim_add hv$i
>>> +    as hv$i
>>> +    check ovs-vsctl add-br br-phys
>>> +    ovn_attach n1 br-phys 192.168.0.$i
>>> +    check ovs-vsctl set open .
>>> external-ids:ovn-bridge-mappings=phys:br-phys
>>> +done
>>> +
>>> +check ovn-nbctl ls-add ls0 -- add Logical_Switch ls0 other_config
>>> vlan-passthru=true
>>> +check ovn-nbctl lsp-add ls0 first
>>> +check ovn-nbctl lsp-add ls0 second
>>> +check ovn-nbctl lsp-add ls0 migrator
>>> +check ovn-nbctl lsp-set-addresses first "00:00:00:00:00:01 10.0.0.1
>>> ff::01"
>>> +check ovn-nbctl lsp-set-addresses second "00:00:00:00:00:02 10.0.0.2
>>> ff::02"
>>> +check ovn-nbctl lsp-set-addresses migrator "00:00:00:00:00:ff
>>> 10.0.0.100 ff::ff"
>>> +
>>> +check ovn-nbctl lsp-add ls0 public
>>> +check ovn-nbctl lsp-set-type public localnet
>>> +check ovn-nbctl lsp-set-addresses public unknown
>>> +check ovn-nbctl lsp-set-options public network_name=phys
>>> +
>>> +# TODO: do we need mtu request here?
>>> +check ovn-nbctl lsp-set-options first requested-chassis=hv1
>>> vif-plug-mtu-request=1500
>>> +check ovn-nbctl lsp-set-options second requested-chassis=hv2
>>> vif-plug-mtu-request=1500
>>> +check ovn-nbctl lsp-set-options migrator requested-chassis=hv1,hv2
>>> vif-plug-mtu-request=1500
>>> +
>>> +as hv1 check ovs-vsctl -- add-port br-int first -- \
>>> +    set Interface first external-ids:iface-id=first \
>>> +    options:tx_pcap=hv1/first-tx.pcap \
>>> +    options:rxq_pcap=hv1/first-rx.pcap \
>>> +    ofport-request=1
>>> +as hv2 check ovs-vsctl -- add-port br-int second -- \
>>> +    set Interface second external-ids:iface-id=second \
>>> +    options:tx_pcap=hv2/second-tx.pcap \
>>> +    options:rxq_pcap=hv2/second-rx.pcap \
>>> +    ofport-request=2
>>> +
>>> +# Create Migrator interfaces on both hv1 and hv2
>>> +for hv in hv1 hv2; do
>>> +    as $hv check ovs-vsctl -- add-port br-int migrator -- \
>>> +        set Interface migrator external-ids:iface-id=migrator \
>>> +        options:tx_pcap=$hv/migrator-tx.pcap \
>>> +        options:rxq_pcap=$hv/migrator-rx.pcap \
>>> +        ofport-request=100
>>> +done
>>> +
>>> +send_ip_packet() {
>>> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv4_src=$5 ipv4_dst=$6
>>> ip_chksum=$7 data=$8 ip_len=$9
>>> +    local ip_ttl=ff
>>> +    local
>>> packet=${eth_dst}${eth_src}08004500${ip_len}00004000${ip_ttl}fd${ip_chksum}${ipv4_src}${ipv4_dst}${data}
>>> +    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
>>> +    echo $packet
>>> +}
>>> +
>>> +send_ip6_packet() {
>>> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv6_src=$5 ipv6_dst=$6
>>> data=$7 ip_len=$8
>>> +    local ip_ttl=ff
>>> +    local
>>> packet=${eth_dst}${eth_src}86dd60000000${ip_len}fd${ip_ttl}${ipv6_src}${ipv6_dst}${data}
>>> +    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
>>> +    echo $packet
>>> +}
>>> +
>>> +reset_env() {
>>> +    as hv1 reset_pcap_file first hv1/first
>>> +    as hv2 reset_pcap_file second hv2/second
>>> +    as hv1 reset_pcap_file migrator hv1/migrator
>>> +    as hv2 reset_pcap_file migrator hv2/migrator
>>> +
>>> +    for port in hv1/migrator hv2/migrator hv1/first hv2/second; do
>>> +        : > $port.expected
>>> +    done
>>> +}
>>> +
>>> +first_mac=000000000001
>>> +second_mac=000000000002
>>> +migrator_mac=0000000000ff
>>> +first_ip=$(ip_to_hex 10 0 0 1)
>>> +second_ip=$(ip_to_hex 10 0 0 2)
>>> +migrator_ip=$(ip_to_hex 10 0 0 100)
>>> +first_ip6=ff000000000000000000000000000001
>>> +second_ip6=ff000000000000000000000000000002
>>> +migrator_ip6=ff0000000000000000000000000000ff
>>> +
>>> +OVN_POPULATE_ARP
>>> +
>>> +reset_env
>>> +
>>> +# Check that packets of proper size are delivered from multichassis to
>>> intended ports
>>> +ip_len=058c
>>> +len=1400
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac
>>> $migrator_ip $first_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/first.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac
>>> $migrator_ip $second_ip 0000 $data $ip_len)
>>> +echo $packet >> hv2/second.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac
>>> $migrator_ip6 $first_ip6 $data $ip_len)
>>> +echo $packet >> hv1/first.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac
>>> $migrator_ip6 $second_ip6 $data $ip_len)
>>> +echo $packet >> hv2/second.expected
>>> +
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
>>> [hv1/first.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
>>> [hv2/second.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
>>> [hv1/migrator.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
>>> [hv2/migrator.expected])
>>> +
>>> +reset_env
>>> +
>>> +# Now check that oversized packets are not delivered to the intended
>>> ports, and
>>> +# that the originating port receives an ICMP error suggesting
>>> fragmentation
>>> +
>>> +ip_len=0bcc
>>> +len=3000
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac
>>> $migrator_ip $first_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac
>>> $migrator_ip $second_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac
>>> $migrator_ip6 $first_ip6 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac
>>> $migrator_ip6 $second_ip6 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
>>> [hv1/first.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
>>> [hv2/second.expected])
>>> +
>>> +# it's harder to construct the original packet that includes geneve
>>> headers
>>> +# etc. so check manually
>>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/migrator-tx.pcap >
>>> hv1/migrator.packets
>>> +sed -i '/ffffffffffff/d' hv1/migrator.packets
>>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/migrator-tx.pcap >
>>> hv2/migrator.packets
>>> +sed -i '/ffffffffffff/d' hv2/migrator.packets
>>> +
>>> +# icmp errors are delivered only to the port binding that originated
>>> the offending packet
>>> +AT_CHECK([wc -l hv1/migrator.packets | cut -c 1], [0], [4
>>> +])
>>> +AT_CHECK([wc -l hv2/migrator.packets | cut -c 1], [0], [0
>>> +])
>>> +
>>> +for line in $(cat hv1/migrator.expected); do
>>> +    trimmed=$(echo $line | cut -c 125-225)
>>> +    echo "checking if packet $trimmed... generated an icmp error"
>>> +    AT_CHECK([grep ${trimmed} hv1/migrator.packets | wc -l | cut -c 1],
>>> [0], [1
>>> +])
>>> +done
>>> +
>>> +reset_env
>>> +
>>> +# Check that packets of proper size are delivered from regular to
>>> multichassis ports
>>> +ip_len=058c
>>> +len=1400
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip
>>> $migrator_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +echo $packet >> hv2/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip
>>> $migrator_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +echo $packet >> hv2/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6
>>> $migrator_ip6 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +echo $packet >> hv2/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6
>>> $migrator_ip6 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +echo $packet >> hv2/migrator.expected
>>> +
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
>>> [hv1/first.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
>>> [hv2/second.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
>>> [hv1/migrator.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
>>> [hv2/migrator.expected])
>>> +
>>> +reset_env
>>> +
>>> +# Now check that oversized packets are not delivered to multichassis
>>> ports, and
>>> +# that the originating ports receive an ICMP error suggesting
>>> fragmentation
>>> +
>>> +ip_len=0bcc
>>> +len=3000
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip
>>> $migrator_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/first.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip
>>> $migrator_ip 0000 $data $ip_len)
>>> +echo $packet >> hv2/second.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6
>>> $migrator_ip6 $data $ip_len)
>>> +echo $packet >> hv1/first.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6
>>> $migrator_ip6 $data $ip_len)
>>> +echo $packet >> hv2/second.expected
>>> +
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
>>> [hv1/migrator.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
>>> [hv2/migrator.expected])
>>> +
>>> +# it's harder to construct the original packet that includes geneve
>>> headers
>>> +# etc. so check manually
>>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/first-tx.pcap >
>>> hv1/first.packets
>>> +sed -i '/ffffffffffff/d' hv1/first.packets
>>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/second-tx.pcap >
>>> hv2/second.packets
>>> +sed -i '/ffffffffffff/d' hv2/second.packets
>>> +
>>> +AT_CHECK([wc -l hv1/first.packets | cut -c 1], [0], [2
>>> +])
>>> +AT_CHECK([wc -l hv2/second.packets | cut -c 1], [0], [2
>>> +])
>>> +
>>> +for line in $(cat hv1/first.expected); do
>>> +    trimmed=$(echo $line | cut -c 125-225)
>>> +    echo "checking if packet $trimmed... generated an icmp error"
>>> +    AT_CHECK([grep ${trimmed} hv1/first.packets | wc -l | cut -c 1],
>>> [0], [1
>>> +])
>>> +done
>>> +
>>> +for line in $(cat hv2/second.expected); do
>>> +    trimmed=$(echo $line | cut -c 125-225)
>>> +    echo "checking if packet $trimmed... generated an icmp error"
>>> +    AT_CHECK([grep ${trimmed} hv2/second.packets | wc -l | cut -c 1],
>>> [0], [1
>>> +])
>>> +done
>>> +
>>> +# todo: test multichassis to multichassis
>>> +
>>> +OVN_CLEANUP([hv1],[hv2])
>>> +
>>> +AT_CLEANUP
>>> +])
>>> +
>>>  OVN_FOR_EACH_NORTHD([
>>>  AT_SETUP([options:activation-strategy for logical port])
>>>  AT_KEYWORDS([multi-chassis])
>>> --
>>> 2.34.3
>>>
>>> _______________________________________________
>>> dev mailing list
>>> dev@openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>
>>>
>> Hi Ihar,
>> I've started to do the review and came to the conclusion that this
>> approach does not seem
>> to be the right way to address something like that. First issue being
>> that putting something
>> in the existing logical stages is very error prone (the
>> check_packet_larger in table 30, which belongs to
>> "ls_in_external_port" stage).
>>
>
> To confirm, your concern is not about the idea of sending ICMP Too Big
> messages, but only about the pipeline design, correct?
>

The idea seems fine to me. So yes, it's about the pipeline design.


>
>
>>
>> To me it feels that we should have additional two stages in the switch
>> pipeline
>> that would check if the packet is larger, if so it would send
>> icmp4/6_error in the next stage.
>> This way we have more control and those stages could be used for
>> something else in the future.
>> Also this makes it way easier to overview what is happening in the
>> logical stage, it also gives us
>> an easy way to limit this to ports that are bound to multiple chassis and
>> to remove the flows when that
>> changes.
>>
>> Also with the ICMP message there might be an issue: who is actually the
>> source? Is it the original port?
>>
>>
> This is something I am to test in real life before I polish this patch.
> The current implementation here reuses the original port as the source of
> the message. There is no other address to use (there's no router). I
> understand that this may be unexpected by the client, hence the need to
> test it in real life before committing to it. If these messages are
> honoured by clients, then we are good to go. Otherwise, we will have to
> scrape the patch. On the result, I will report back when I get to testing
> it.
>
>
>> Dumitru, Mark, any thoughts?
>>
>> --
>>
>> Ales Musil
>>
>> Senior Software Engineer - OVN Core
>>
>> Red Hat EMEA <https://www.redhat.com>
>>
>> amusil@redhat.com    IM: amusil
>> <https://red.ht/sig>
>>
>
Ihar Hrachyshka April 19, 2023, 3:50 p.m. UTC | #4
On Fri, Nov 18, 2022 at 4:08 PM Ihar Hrachyshka <ihrachys@redhat.com> wrote:

> On Wed, Nov 16, 2022 at 3:55 AM Ales Musil <amusil@redhat.com> wrote:
>
>>
>>
>> On Thu, Nov 3, 2022 at 4:46 AM Ihar Hrachyshka <ihrachys@redhat.com>
>> wrote:
>>
>>> When multichassis ports attached to localnet switches are forced to use
>>> tunneling to communicate with other ports, MTU of the path between ports
>>> may effectively change (to accommodate tunnel headers or otherwise).
>>>
>>> This patch implements Path MTU Discovery for IPv4 and IPv6 for
>>> multichassis ports (both egress and ingress).
>>>
>>> This patch adds new flows as follows:
>>>
>>> - table 30: store check_pkt_len(mtu)->reg9[1]
>>> - table 37: if reg9[1] is set, then send the packet to controller.
>>>
>>> TODO: document behavior.
>>> TODO: dynamically calculate MTU of the interface.
>>> TODO: constrain new flows to communication involving m-chassis ports
>>> only.
>>>
>>> Signed-off-by: Ihar Hrachyshka <ihrachys@redhat.com>
>>>
>> ---
>>>  controller/physical.c | 167 ++++++++++++++++++++++++++++
>>>  tests/ovn.at          | 252 ++++++++++++++++++++++++++++++++++++++++++
>>>  2 files changed, 419 insertions(+)
>>>
>>> diff --git a/controller/physical.c b/controller/physical.c
>>> index 705146316..96c0dc31a 100644
>>> --- a/controller/physical.c
>>> +++ b/controller/physical.c
>>> @@ -1072,6 +1072,170 @@ setup_activation_strategy(const struct
>>> sbrec_port_binding *binding,
>>>      }
>>>  }
>>>
>>> +static size_t
>>> +encode_start_controller_op(enum action_opcode opcode, bool pause,
>>> +                           uint32_t meter_id, struct ofpbuf *ofpacts)
>>> +{
>>> +    size_t ofs = ofpacts->size;
>>> +
>>> +    struct ofpact_controller *oc = ofpact_put_CONTROLLER(ofpacts);
>>> +    oc->max_len = UINT16_MAX;
>>> +    oc->reason = OFPR_ACTION;
>>> +    oc->pause = pause;
>>> +    if (!ovs_feature_is_supported(OVS_DP_METER_SUPPORT)) {
>>> +        meter_id = NX_CTLR_NO_METER;
>>> +    }
>>> +    oc->meter_id = meter_id;
>>> +
>>> +    struct action_header ah = { .opcode = htonl(opcode) };
>>> +    ofpbuf_put(ofpacts, &ah, sizeof ah);
>>> +
>>> +    return ofs;
>>> +}
>>> +
>>> +static void
>>> +encode_finish_controller_op(size_t ofs, struct ofpbuf *ofpacts)
>>> +{
>>> +    struct ofpact_controller *oc = ofpbuf_at_assert(ofpacts, ofs,
>>> sizeof *oc);
>>> +    ofpacts->header = oc;
>>> +    oc->userdata_len = ofpacts->size - (ofs + sizeof *oc);
>>> +    ofpact_finish_CONTROLLER(ofpacts, &oc);
>>> +}
>>> +
>>>
>> +static void
>>> +handle_oversized_ip_packets(struct ovn_desired_flow_table *flow_table,
>>> +                            const struct sbrec_port_binding *binding,
>>> +                            bool is_ipv6)
>>> +{
>>> +    uint32_t dp_key = binding->datapath->tunnel_key;
>>> +
>>> +    struct match match;
>>> +    match_init_catchall(&match);
>>> +    match_set_metadata(&match, htonll(dp_key));
>>> +
>>> +    struct ofpbuf ofpacts;
>>> +    ofpbuf_init(&ofpacts, 0);
>>> +
>>> +    /* Store packet too large flag in reg9[1]. */
>>> +    /* TODO: limit to multichassis traffic? */
>>> +    match_init_catchall(&match);
>>> +    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 :
>>> ETH_TYPE_IP));
>>> +    match_set_metadata(&match, htonll(dp_key));
>>> +
>>> +    /* TODO: get mtu from interface of the tunnel */
>>> +    uint16_t frag_mtu = 1500;
>>> +    struct ofpact_check_pkt_larger *pkt_larger =
>>> +        ofpact_put_CHECK_PKT_LARGER(&ofpacts);
>>> +    pkt_larger->pkt_len = frag_mtu;
>>> +    pkt_larger->dst.field = mf_from_id(MFF_REG9);
>>> +    pkt_larger->dst.ofs = 1;
>>> +
>>> +    put_resubmit(31, &ofpacts);
>>> +    /* TODO: should we tag table=30 as consumed for this job anywhere?
>>> */
>>> +    ofctrl_add_flow(flow_table, 30, 110,
>>> +                    binding->header_.uuid.parts[0], &match, &ofpacts,
>>> +                    &binding->header_.uuid);
>>> +    ofpbuf_uninit(&ofpacts);
>>>
>> +
>>> +    /* Generate ICMP Fragmentation needed for IP packets that are too
>>> large
>>> +     * (reg9[1] == 1) */
>>> +    /* TODO: limit to multichassis traffic? */
>>> +    match_init_catchall(&match);
>>> +    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 :
>>> ETH_TYPE_IP));
>>> +    match_set_reg_masked(&match, MFF_REG9 - MFF_REG0, 1 << 1, 1 << 1);
>>> +
>>> +    /* Return ICMP error with a part of the original IP packet
>>> included. */
>>> +    ofpbuf_init(&ofpacts, 0);
>>> +    size_t oc_offset = encode_start_controller_op(
>>> +        ACTION_OPCODE_ICMP, true, NX_CTLR_NO_METER, &ofpacts);
>>> +
>>> +    /* Before sending the ICMP error packet back to the pipeline, set a
>>> number
>>> +     * of fields. */
>>> +    struct ofpbuf inner_ofpacts;
>>> +    ofpbuf_init(&inner_ofpacts, 0);
>>> +
>>> +    /* The new error packet is no longer too large */
>>> +    /* REGBIT_PKT_LARGER = 0 */
>>> +    ovs_be32 value = htonl(0);
>>> +    ovs_be32 mask = htonl(1 << 1);
>>> +    ofpact_put_set_field(
>>> +        &inner_ofpacts, mf_from_id(MFF_REG9), &value, &mask);
>>> +
>>> +    /* The new error packet is delivered locally */
>>> +    /* REGBIT_EGRESS_LOOPBACK = 1 */
>>> +    value = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
>>> +    mask = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
>>> +    ofpact_put_set_field(
>>> +        &inner_ofpacts, mf_from_id(MFF_LOG_FLAGS), &value, &mask);
>>> +
>>> +    /* eth.dst */
>>> +    put_stack(MFF_ETH_SRC, ofpact_put_STACK_PUSH(&inner_ofpacts));
>>> +    put_stack(MFF_ETH_DST, ofpact_put_STACK_POP(&inner_ofpacts));
>>> +
>>> +    /* ip.dst */
>>> +    put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
>>> +              ofpact_put_STACK_PUSH(&inner_ofpacts));
>>> +    put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
>>> +              ofpact_put_STACK_POP(&inner_ofpacts));
>>> +
>>> +    /* ip.ttl */
>>> +    struct ofpact_ip_ttl *ip_ttl =
>>> ofpact_put_SET_IP_TTL(&inner_ofpacts);
>>> +    ip_ttl->ttl = 255;
>>> +
>>> +    if (is_ipv6) {
>>> +        /* icmp6.type = 2 (Packet Too Big) */
>>> +        /* icmp6.code = 0 */
>>> +        uint8_t icmp_type = 2;
>>> +        uint8_t icmp_code = 0;
>>> +        ofpact_put_set_field(
>>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV6_TYPE), &icmp_type,
>>> NULL);
>>> +        ofpact_put_set_field(
>>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV6_CODE), &icmp_code,
>>> NULL);
>>> +
>>> +        /* icmp6.frag_mtu */
>>> +        size_t frag_mtu_oc_offset = encode_start_controller_op(
>>> +            ACTION_OPCODE_PUT_ICMP6_FRAG_MTU, true, NX_CTLR_NO_METER,
>>> +            &inner_ofpacts);
>>> +        ovs_be32 frag_mtu_ovs = htonl(frag_mtu);
>>> +        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
>>> +        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
>>> +    } else {
>>> +        /* icmp4.type = 3 (Destination Unreachable) */
>>> +        /* icmp4.code = 4 (Fragmentation Needed) */
>>> +        uint8_t icmp_type = 3;
>>> +        uint8_t icmp_code = 4;
>>> +        ofpact_put_set_field(
>>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV4_TYPE), &icmp_type,
>>> NULL);
>>> +        ofpact_put_set_field(
>>> +            &inner_ofpacts, mf_from_id(MFF_ICMPV4_CODE), &icmp_code,
>>> NULL);
>>> +
>>> +        /* icmp4.frag_mtu = */
>>> +        size_t frag_mtu_oc_offset = encode_start_controller_op(
>>> +            ACTION_OPCODE_PUT_ICMP4_FRAG_MTU, true, NX_CTLR_NO_METER,
>>> +            &inner_ofpacts);
>>> +        ovs_be16 frag_mtu_ovs = htons(frag_mtu);
>>> +        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
>>> +        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
>>> +    }
>>> +
>>> +    /* Finally, submit the ICMP error back to the ingress pipeline */
>>> +    put_resubmit(8, &inner_ofpacts);
>>> +
>>> +    /* Attach nested actions to ICMP error controller handler */
>>> +    ofpacts_put_openflow_actions(inner_ofpacts.data, inner_ofpacts.size,
>>> +                                 &ofpacts, OFP15_VERSION);
>>> +
>>> +    /* Finalize the ICMP error controller handler */
>>> +    encode_finish_controller_op(oc_offset, &ofpacts);
>>> +
>>> +    ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 1000,
>>> +                    binding->header_.uuid.parts[0], &match, &ofpacts,
>>> +                    &binding->header_.uuid);
>>> +
>>> +    ofpbuf_uninit(&inner_ofpacts);
>>> +    ofpbuf_uninit(&ofpacts);
>>> +}
>>> +
>>>  static void
>>>  enforce_tunneling_for_multichassis_ports(
>>>      struct local_datapath *ld,
>>> @@ -1124,6 +1288,9 @@ enforce_tunneling_for_multichassis_ports(
>>>                          binding->header_.uuid.parts[0], &match,
>>> &ofpacts,
>>>                          &binding->header_.uuid);
>>>          ofpbuf_uninit(&ofpacts);
>>> +
>>> +        handle_oversized_ip_packets(flow_table, binding, false);
>>> +        handle_oversized_ip_packets(flow_table, binding, true);
>>>      }
>>>
>>>      struct tunnel *tun_elem;
>>> diff --git a/tests/ovn.at b/tests/ovn.at
>>> index f8b8db4df..91ecd0814 100644
>>> --- a/tests/ovn.at
>>> +++ b/tests/ovn.at
>>> @@ -14887,6 +14887,258 @@ OVN_CLEANUP([hv1],[hv2],[hv3])
>>>  AT_CLEANUP
>>>  ])
>>>
>>> +OVN_FOR_EACH_NORTHD([
>>> +AT_SETUP([localnet connectivity with multiple requested-chassis, path
>>> mtu discovery])
>>> +AT_KEYWORDS([multi-chassis])
>>> +ovn_start
>>> +
>>> +net_add n1
>>> +for i in 1 2; do
>>> +    sim_add hv$i
>>> +    as hv$i
>>> +    check ovs-vsctl add-br br-phys
>>> +    ovn_attach n1 br-phys 192.168.0.$i
>>> +    check ovs-vsctl set open .
>>> external-ids:ovn-bridge-mappings=phys:br-phys
>>> +done
>>> +
>>> +check ovn-nbctl ls-add ls0 -- add Logical_Switch ls0 other_config
>>> vlan-passthru=true
>>> +check ovn-nbctl lsp-add ls0 first
>>> +check ovn-nbctl lsp-add ls0 second
>>> +check ovn-nbctl lsp-add ls0 migrator
>>> +check ovn-nbctl lsp-set-addresses first "00:00:00:00:00:01 10.0.0.1
>>> ff::01"
>>> +check ovn-nbctl lsp-set-addresses second "00:00:00:00:00:02 10.0.0.2
>>> ff::02"
>>> +check ovn-nbctl lsp-set-addresses migrator "00:00:00:00:00:ff
>>> 10.0.0.100 ff::ff"
>>> +
>>> +check ovn-nbctl lsp-add ls0 public
>>> +check ovn-nbctl lsp-set-type public localnet
>>> +check ovn-nbctl lsp-set-addresses public unknown
>>> +check ovn-nbctl lsp-set-options public network_name=phys
>>> +
>>> +# TODO: do we need mtu request here?
>>> +check ovn-nbctl lsp-set-options first requested-chassis=hv1
>>> vif-plug-mtu-request=1500
>>> +check ovn-nbctl lsp-set-options second requested-chassis=hv2
>>> vif-plug-mtu-request=1500
>>> +check ovn-nbctl lsp-set-options migrator requested-chassis=hv1,hv2
>>> vif-plug-mtu-request=1500
>>> +
>>> +as hv1 check ovs-vsctl -- add-port br-int first -- \
>>> +    set Interface first external-ids:iface-id=first \
>>> +    options:tx_pcap=hv1/first-tx.pcap \
>>> +    options:rxq_pcap=hv1/first-rx.pcap \
>>> +    ofport-request=1
>>> +as hv2 check ovs-vsctl -- add-port br-int second -- \
>>> +    set Interface second external-ids:iface-id=second \
>>> +    options:tx_pcap=hv2/second-tx.pcap \
>>> +    options:rxq_pcap=hv2/second-rx.pcap \
>>> +    ofport-request=2
>>> +
>>> +# Create Migrator interfaces on both hv1 and hv2
>>> +for hv in hv1 hv2; do
>>> +    as $hv check ovs-vsctl -- add-port br-int migrator -- \
>>> +        set Interface migrator external-ids:iface-id=migrator \
>>> +        options:tx_pcap=$hv/migrator-tx.pcap \
>>> +        options:rxq_pcap=$hv/migrator-rx.pcap \
>>> +        ofport-request=100
>>> +done
>>> +
>>> +send_ip_packet() {
>>> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv4_src=$5 ipv4_dst=$6
>>> ip_chksum=$7 data=$8 ip_len=$9
>>> +    local ip_ttl=ff
>>> +    local
>>> packet=${eth_dst}${eth_src}08004500${ip_len}00004000${ip_ttl}fd${ip_chksum}${ipv4_src}${ipv4_dst}${data}
>>> +    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
>>> +    echo $packet
>>> +}
>>> +
>>> +send_ip6_packet() {
>>> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv6_src=$5 ipv6_dst=$6
>>> data=$7 ip_len=$8
>>> +    local ip_ttl=ff
>>> +    local
>>> packet=${eth_dst}${eth_src}86dd60000000${ip_len}fd${ip_ttl}${ipv6_src}${ipv6_dst}${data}
>>> +    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
>>> +    echo $packet
>>> +}
>>> +
>>> +reset_env() {
>>> +    as hv1 reset_pcap_file first hv1/first
>>> +    as hv2 reset_pcap_file second hv2/second
>>> +    as hv1 reset_pcap_file migrator hv1/migrator
>>> +    as hv2 reset_pcap_file migrator hv2/migrator
>>> +
>>> +    for port in hv1/migrator hv2/migrator hv1/first hv2/second; do
>>> +        : > $port.expected
>>> +    done
>>> +}
>>> +
>>> +first_mac=000000000001
>>> +second_mac=000000000002
>>> +migrator_mac=0000000000ff
>>> +first_ip=$(ip_to_hex 10 0 0 1)
>>> +second_ip=$(ip_to_hex 10 0 0 2)
>>> +migrator_ip=$(ip_to_hex 10 0 0 100)
>>> +first_ip6=ff000000000000000000000000000001
>>> +second_ip6=ff000000000000000000000000000002
>>> +migrator_ip6=ff0000000000000000000000000000ff
>>> +
>>> +OVN_POPULATE_ARP
>>> +
>>> +reset_env
>>> +
>>> +# Check that packets of proper size are delivered from multichassis to
>>> intended ports
>>> +ip_len=058c
>>> +len=1400
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac
>>> $migrator_ip $first_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/first.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac
>>> $migrator_ip $second_ip 0000 $data $ip_len)
>>> +echo $packet >> hv2/second.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac
>>> $migrator_ip6 $first_ip6 $data $ip_len)
>>> +echo $packet >> hv1/first.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac
>>> $migrator_ip6 $second_ip6 $data $ip_len)
>>> +echo $packet >> hv2/second.expected
>>> +
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
>>> [hv1/first.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
>>> [hv2/second.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
>>> [hv1/migrator.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
>>> [hv2/migrator.expected])
>>> +
>>> +reset_env
>>> +
>>> +# Now check that oversized packets are not delivered to the intended
>>> ports, and
>>> +# that the originating port receives an ICMP error suggesting
>>> fragmentation
>>> +
>>> +ip_len=0bcc
>>> +len=3000
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac
>>> $migrator_ip $first_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac
>>> $migrator_ip $second_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac
>>> $migrator_ip6 $first_ip6 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac
>>> $migrator_ip6 $second_ip6 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
>>> [hv1/first.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
>>> [hv2/second.expected])
>>> +
>>> +# it's harder to construct the original packet that includes geneve
>>> headers
>>> +# etc. so check manually
>>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/migrator-tx.pcap >
>>> hv1/migrator.packets
>>> +sed -i '/ffffffffffff/d' hv1/migrator.packets
>>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/migrator-tx.pcap >
>>> hv2/migrator.packets
>>> +sed -i '/ffffffffffff/d' hv2/migrator.packets
>>> +
>>> +# icmp errors are delivered only to the port binding that originated
>>> the offending packet
>>> +AT_CHECK([wc -l hv1/migrator.packets | cut -c 1], [0], [4
>>> +])
>>> +AT_CHECK([wc -l hv2/migrator.packets | cut -c 1], [0], [0
>>> +])
>>> +
>>> +for line in $(cat hv1/migrator.expected); do
>>> +    trimmed=$(echo $line | cut -c 125-225)
>>> +    echo "checking if packet $trimmed... generated an icmp error"
>>> +    AT_CHECK([grep ${trimmed} hv1/migrator.packets | wc -l | cut -c 1],
>>> [0], [1
>>> +])
>>> +done
>>> +
>>> +reset_env
>>> +
>>> +# Check that packets of proper size are delivered from regular to
>>> multichassis ports
>>> +ip_len=058c
>>> +len=1400
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip
>>> $migrator_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +echo $packet >> hv2/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip
>>> $migrator_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +echo $packet >> hv2/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6
>>> $migrator_ip6 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +echo $packet >> hv2/migrator.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6
>>> $migrator_ip6 $data $ip_len)
>>> +echo $packet >> hv1/migrator.expected
>>> +echo $packet >> hv2/migrator.expected
>>> +
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap],
>>> [hv1/first.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap],
>>> [hv2/second.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
>>> [hv1/migrator.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
>>> [hv2/migrator.expected])
>>> +
>>> +reset_env
>>> +
>>> +# Now check that oversized packets are not delivered to multichassis
>>> ports, and
>>> +# that the originating ports receive an ICMP error suggesting
>>> fragmentation
>>> +
>>> +ip_len=0bcc
>>> +len=3000
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip
>>> $migrator_ip 0000 $data $ip_len)
>>> +echo $packet >> hv1/first.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip
>>> $migrator_ip 0000 $data $ip_len)
>>> +echo $packet >> hv2/second.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6
>>> $migrator_ip6 $data $ip_len)
>>> +echo $packet >> hv1/first.expected
>>> +
>>> +data=$(xxd -l $len -c $len -p < /dev/urandom)
>>> +packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6
>>> $migrator_ip6 $data $ip_len)
>>> +echo $packet >> hv2/second.expected
>>> +
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap],
>>> [hv1/migrator.expected])
>>> +OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap],
>>> [hv2/migrator.expected])
>>> +
>>> +# it's harder to construct the original packet that includes geneve
>>> headers
>>> +# etc. so check manually
>>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/first-tx.pcap >
>>> hv1/first.packets
>>> +sed -i '/ffffffffffff/d' hv1/first.packets
>>> +$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/second-tx.pcap >
>>> hv2/second.packets
>>> +sed -i '/ffffffffffff/d' hv2/second.packets
>>> +
>>> +AT_CHECK([wc -l hv1/first.packets | cut -c 1], [0], [2
>>> +])
>>> +AT_CHECK([wc -l hv2/second.packets | cut -c 1], [0], [2
>>> +])
>>> +
>>> +for line in $(cat hv1/first.expected); do
>>> +    trimmed=$(echo $line | cut -c 125-225)
>>> +    echo "checking if packet $trimmed... generated an icmp error"
>>> +    AT_CHECK([grep ${trimmed} hv1/first.packets | wc -l | cut -c 1],
>>> [0], [1
>>> +])
>>> +done
>>> +
>>> +for line in $(cat hv2/second.expected); do
>>> +    trimmed=$(echo $line | cut -c 125-225)
>>> +    echo "checking if packet $trimmed... generated an icmp error"
>>> +    AT_CHECK([grep ${trimmed} hv2/second.packets | wc -l | cut -c 1],
>>> [0], [1
>>> +])
>>> +done
>>> +
>>> +# todo: test multichassis to multichassis
>>> +
>>> +OVN_CLEANUP([hv1],[hv2])
>>> +
>>> +AT_CLEANUP
>>> +])
>>> +
>>>  OVN_FOR_EACH_NORTHD([
>>>  AT_SETUP([options:activation-strategy for logical port])
>>>  AT_KEYWORDS([multi-chassis])
>>> --
>>> 2.34.3
>>>
>>> _______________________________________________
>>> dev mailing list
>>> dev@openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>
>>>
>> Hi Ihar,
>> I've started to do the review and came to the conclusion that this
>> approach does not seem
>> to be the right way to address something like that. First issue being
>> that putting something
>> in the existing logical stages is very error prone (the
>> check_packet_larger in table 30, which belongs to
>> "ls_in_external_port" stage).
>>
>
> To confirm, your concern is not about the idea of sending ICMP Too Big
> messages, but only about the pipeline design, correct?
>
>
>>
>> To me it feels that we should have additional two stages in the switch
>> pipeline
>> that would check if the packet is larger, if so it would send
>> icmp4/6_error in the next stage.
>> This way we have more control and those stages could be used for
>> something else in the future.
>> Also this makes it way easier to overview what is happening in the
>> logical stage, it also gives us
>> an easy way to limit this to ports that are bound to multiple chassis and
>> to remove the flows when that
>> changes.
>>
>> Also with the ICMP message there might be an issue: who is actually the
>> source? Is it the original port?
>>
>>
> This is something I am to test in real life before I polish this patch.
> The current implementation here reuses the original port as the source of
> the message. There is no other address to use (there's no router). I
> understand that this may be unexpected by the client, hence the need to
> test it in real life before committing to it. If these messages are
> honoured by clients, then we are good to go. Otherwise, we will have to
> scrape the patch. On the result, I will report back when I get to testing
> it.
>

Reviving this thread to report some results on testing the dynamic path mtu
discovery approach for the scenario where the effective mtu is temporarily
reduced for a multichassis port due to tunneling enforcement.

===

[root@ovn-chassis-2 /]# ip netns exec sw0p4 ip route get 10.0.0.3
10.0.0.3 dev sw0p4 src 10.0.0.6 uid 0
    cache


^^^ path mtu for the iperf3 server is not known yet


[root@ovn-chassis-2 /]# ip netns exec sw0p4 iperf3 -c 10.0.0.3 -u
Connecting to host 10.0.0.3, port 5201
[  5] local 10.0.0.6 port 44500 connected to 10.0.0.3 port 5201
iperf3: error - unable to write to stream socket: Message too long


^^^ Here, OVN sent ICMP need fragmentation mtu=1200 back to iperf3. Sadly,
iperf3 is dumb and doesn't adjust to the new information about path MTU, so
it exits.


[root@ovn-chassis-2 /]# ip netns exec sw0p4 ip route get 10.0.0.3
10.0.0.3 dev sw0p4 src 10.0.0.6 uid 0
    cache expires 597sec mtu 1200


^^^ but linux kernel updated route MTU based on need fragmentation packet
sent by OVN


[root@ovn-chassis-2 /]# ip netns exec sw0p4 iperf3 -c 10.0.0.3 -u

Connecting to host 10.0.0.3, port 5201
[  5] local 10.0.0.6 port 51433 connected to 10.0.0.3 port 5201
[ ID] Interval           Transfer     Bitrate         Total Datagrams
[  5]   0.00-1.00   sec   129 KBytes  1.06 Mbits/sec  115
[  5]   1.00-2.00   sec   128 KBytes  1.05 Mbits/sec  114
[  5]   2.00-3.00   sec   128 KBytes  1.05 Mbits/sec  114
[  5]   3.00-4.00   sec   128 KBytes  1.05 Mbits/sec  114
[  5]   4.00-5.00   sec   128 KBytes  1.05 Mbits/sec  114
^C[  5]   5.00-5.38   sec  49.3 KBytes  1.06 Mbits/sec  44
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval           Transfer     Bitrate         Jitter    Lost/Total
Datagrams
[  5]   0.00-5.38   sec   689 KBytes  1.05 Mbits/sec  0.000 ms  0/615 (0%)
 sender
[  5]   0.00-5.38   sec  0.00 Bytes  0.00 bits/sec  0.000 ms  0/0 (0%)
 receiver
iperf3: interrupt - the client has terminated


^^^ so the next iperf3 client attempt works just fine. After the route MTU
expires, it will attempt using the interface MTU again for a new connection.


===


In TCP mode, iperf3 honours mtu=1200 indicated in ICMP fragmentation needed
message gracefully, switching to lower sized frames:


15:46:35.182853 IP (tos 0x0, ttl 64, id 62820, offset 0, flags [DF], proto
TCP (6), length 1200)


===


Evidently, the approach works (though assumes that clients will gracefully
handle "message too long" error, ignoring / repeating the transfer). So
I'll rework the patch to consider Ales' suggestions about pipeline design.




>
>> Dumitru, Mark, any thoughts?
>>
>> --
>>
>> Ales Musil
>>
>> Senior Software Engineer - OVN Core
>>
>> Red Hat EMEA <https://www.redhat.com>
>>
>> amusil@redhat.com    IM: amusil
>> <https://red.ht/sig>
>>
>
diff mbox series

Patch

diff --git a/controller/physical.c b/controller/physical.c
index 705146316..96c0dc31a 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -1072,6 +1072,170 @@  setup_activation_strategy(const struct sbrec_port_binding *binding,
     }
 }
 
+static size_t
+encode_start_controller_op(enum action_opcode opcode, bool pause,
+                           uint32_t meter_id, struct ofpbuf *ofpacts)
+{
+    size_t ofs = ofpacts->size;
+
+    struct ofpact_controller *oc = ofpact_put_CONTROLLER(ofpacts);
+    oc->max_len = UINT16_MAX;
+    oc->reason = OFPR_ACTION;
+    oc->pause = pause;
+    if (!ovs_feature_is_supported(OVS_DP_METER_SUPPORT)) {
+        meter_id = NX_CTLR_NO_METER;
+    }
+    oc->meter_id = meter_id;
+
+    struct action_header ah = { .opcode = htonl(opcode) };
+    ofpbuf_put(ofpacts, &ah, sizeof ah);
+
+    return ofs;
+}
+
+static void
+encode_finish_controller_op(size_t ofs, struct ofpbuf *ofpacts)
+{
+    struct ofpact_controller *oc = ofpbuf_at_assert(ofpacts, ofs, sizeof *oc);
+    ofpacts->header = oc;
+    oc->userdata_len = ofpacts->size - (ofs + sizeof *oc);
+    ofpact_finish_CONTROLLER(ofpacts, &oc);
+}
+
+static void
+handle_oversized_ip_packets(struct ovn_desired_flow_table *flow_table,
+                            const struct sbrec_port_binding *binding,
+                            bool is_ipv6)
+{
+    uint32_t dp_key = binding->datapath->tunnel_key;
+
+    struct match match;
+    match_init_catchall(&match);
+    match_set_metadata(&match, htonll(dp_key));
+
+    struct ofpbuf ofpacts;
+    ofpbuf_init(&ofpacts, 0);
+
+    /* Store packet too large flag in reg9[1]. */
+    /* TODO: limit to multichassis traffic? */
+    match_init_catchall(&match);
+    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 : ETH_TYPE_IP));
+    match_set_metadata(&match, htonll(dp_key));
+
+    /* TODO: get mtu from interface of the tunnel */
+    uint16_t frag_mtu = 1500;
+    struct ofpact_check_pkt_larger *pkt_larger =
+        ofpact_put_CHECK_PKT_LARGER(&ofpacts);
+    pkt_larger->pkt_len = frag_mtu;
+    pkt_larger->dst.field = mf_from_id(MFF_REG9);
+    pkt_larger->dst.ofs = 1;
+
+    put_resubmit(31, &ofpacts);
+    /* TODO: should we tag table=30 as consumed for this job anywhere? */
+    ofctrl_add_flow(flow_table, 30, 110,
+                    binding->header_.uuid.parts[0], &match, &ofpacts,
+                    &binding->header_.uuid);
+    ofpbuf_uninit(&ofpacts);
+
+    /* Generate ICMP Fragmentation needed for IP packets that are too large
+     * (reg9[1] == 1) */
+    /* TODO: limit to multichassis traffic? */
+    match_init_catchall(&match);
+    match_set_dl_type(&match, htons(is_ipv6 ? ETH_TYPE_IPV6 : ETH_TYPE_IP));
+    match_set_reg_masked(&match, MFF_REG9 - MFF_REG0, 1 << 1, 1 << 1);
+
+    /* Return ICMP error with a part of the original IP packet included. */
+    ofpbuf_init(&ofpacts, 0);
+    size_t oc_offset = encode_start_controller_op(
+        ACTION_OPCODE_ICMP, true, NX_CTLR_NO_METER, &ofpacts);
+
+    /* Before sending the ICMP error packet back to the pipeline, set a number
+     * of fields. */
+    struct ofpbuf inner_ofpacts;
+    ofpbuf_init(&inner_ofpacts, 0);
+
+    /* The new error packet is no longer too large */
+    /* REGBIT_PKT_LARGER = 0 */
+    ovs_be32 value = htonl(0);
+    ovs_be32 mask = htonl(1 << 1);
+    ofpact_put_set_field(
+        &inner_ofpacts, mf_from_id(MFF_REG9), &value, &mask);
+
+    /* The new error packet is delivered locally */
+    /* REGBIT_EGRESS_LOOPBACK = 1 */
+    value = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
+    mask = htonl(1 << MLF_ALLOW_LOOPBACK_BIT);
+    ofpact_put_set_field(
+        &inner_ofpacts, mf_from_id(MFF_LOG_FLAGS), &value, &mask);
+
+    /* eth.dst */
+    put_stack(MFF_ETH_SRC, ofpact_put_STACK_PUSH(&inner_ofpacts));
+    put_stack(MFF_ETH_DST, ofpact_put_STACK_POP(&inner_ofpacts));
+
+    /* ip.dst */
+    put_stack(is_ipv6 ? MFF_IPV6_SRC : MFF_IPV4_SRC,
+              ofpact_put_STACK_PUSH(&inner_ofpacts));
+    put_stack(is_ipv6 ? MFF_IPV6_DST : MFF_IPV4_DST,
+              ofpact_put_STACK_POP(&inner_ofpacts));
+
+    /* ip.ttl */
+    struct ofpact_ip_ttl *ip_ttl = ofpact_put_SET_IP_TTL(&inner_ofpacts);
+    ip_ttl->ttl = 255;
+
+    if (is_ipv6) {
+        /* icmp6.type = 2 (Packet Too Big) */
+        /* icmp6.code = 0 */
+        uint8_t icmp_type = 2;
+        uint8_t icmp_code = 0;
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV6_TYPE), &icmp_type, NULL);
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV6_CODE), &icmp_code, NULL);
+
+        /* icmp6.frag_mtu */
+        size_t frag_mtu_oc_offset = encode_start_controller_op(
+            ACTION_OPCODE_PUT_ICMP6_FRAG_MTU, true, NX_CTLR_NO_METER,
+            &inner_ofpacts);
+        ovs_be32 frag_mtu_ovs = htonl(frag_mtu);
+        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
+        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
+    } else {
+        /* icmp4.type = 3 (Destination Unreachable) */
+        /* icmp4.code = 4 (Fragmentation Needed) */
+        uint8_t icmp_type = 3;
+        uint8_t icmp_code = 4;
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV4_TYPE), &icmp_type, NULL);
+        ofpact_put_set_field(
+            &inner_ofpacts, mf_from_id(MFF_ICMPV4_CODE), &icmp_code, NULL);
+
+        /* icmp4.frag_mtu = */
+        size_t frag_mtu_oc_offset = encode_start_controller_op(
+            ACTION_OPCODE_PUT_ICMP4_FRAG_MTU, true, NX_CTLR_NO_METER,
+            &inner_ofpacts);
+        ovs_be16 frag_mtu_ovs = htons(frag_mtu);
+        ofpbuf_put(&inner_ofpacts, &frag_mtu_ovs, sizeof(frag_mtu_ovs));
+        encode_finish_controller_op(frag_mtu_oc_offset, &inner_ofpacts);
+    }
+
+    /* Finally, submit the ICMP error back to the ingress pipeline */
+    put_resubmit(8, &inner_ofpacts);
+
+    /* Attach nested actions to ICMP error controller handler */
+    ofpacts_put_openflow_actions(inner_ofpacts.data, inner_ofpacts.size,
+                                 &ofpacts, OFP15_VERSION);
+
+    /* Finalize the ICMP error controller handler */
+    encode_finish_controller_op(oc_offset, &ofpacts);
+
+    ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 1000,
+                    binding->header_.uuid.parts[0], &match, &ofpacts,
+                    &binding->header_.uuid);
+
+    ofpbuf_uninit(&inner_ofpacts);
+    ofpbuf_uninit(&ofpacts);
+}
+
 static void
 enforce_tunneling_for_multichassis_ports(
     struct local_datapath *ld,
@@ -1124,6 +1288,9 @@  enforce_tunneling_for_multichassis_ports(
                         binding->header_.uuid.parts[0], &match, &ofpacts,
                         &binding->header_.uuid);
         ofpbuf_uninit(&ofpacts);
+
+        handle_oversized_ip_packets(flow_table, binding, false);
+        handle_oversized_ip_packets(flow_table, binding, true);
     }
 
     struct tunnel *tun_elem;
diff --git a/tests/ovn.at b/tests/ovn.at
index f8b8db4df..91ecd0814 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -14887,6 +14887,258 @@  OVN_CLEANUP([hv1],[hv2],[hv3])
 AT_CLEANUP
 ])
 
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([localnet connectivity with multiple requested-chassis, path mtu discovery])
+AT_KEYWORDS([multi-chassis])
+ovn_start
+
+net_add n1
+for i in 1 2; do
+    sim_add hv$i
+    as hv$i
+    check ovs-vsctl add-br br-phys
+    ovn_attach n1 br-phys 192.168.0.$i
+    check ovs-vsctl set open . external-ids:ovn-bridge-mappings=phys:br-phys
+done
+
+check ovn-nbctl ls-add ls0 -- add Logical_Switch ls0 other_config vlan-passthru=true
+check ovn-nbctl lsp-add ls0 first
+check ovn-nbctl lsp-add ls0 second
+check ovn-nbctl lsp-add ls0 migrator
+check ovn-nbctl lsp-set-addresses first "00:00:00:00:00:01 10.0.0.1 ff::01"
+check ovn-nbctl lsp-set-addresses second "00:00:00:00:00:02 10.0.0.2 ff::02"
+check ovn-nbctl lsp-set-addresses migrator "00:00:00:00:00:ff 10.0.0.100 ff::ff"
+
+check ovn-nbctl lsp-add ls0 public
+check ovn-nbctl lsp-set-type public localnet
+check ovn-nbctl lsp-set-addresses public unknown
+check ovn-nbctl lsp-set-options public network_name=phys
+
+# TODO: do we need mtu request here?
+check ovn-nbctl lsp-set-options first requested-chassis=hv1 vif-plug-mtu-request=1500
+check ovn-nbctl lsp-set-options second requested-chassis=hv2 vif-plug-mtu-request=1500
+check ovn-nbctl lsp-set-options migrator requested-chassis=hv1,hv2 vif-plug-mtu-request=1500
+
+as hv1 check ovs-vsctl -- add-port br-int first -- \
+    set Interface first external-ids:iface-id=first \
+    options:tx_pcap=hv1/first-tx.pcap \
+    options:rxq_pcap=hv1/first-rx.pcap \
+    ofport-request=1
+as hv2 check ovs-vsctl -- add-port br-int second -- \
+    set Interface second external-ids:iface-id=second \
+    options:tx_pcap=hv2/second-tx.pcap \
+    options:rxq_pcap=hv2/second-rx.pcap \
+    ofport-request=2
+
+# Create Migrator interfaces on both hv1 and hv2
+for hv in hv1 hv2; do
+    as $hv check ovs-vsctl -- add-port br-int migrator -- \
+        set Interface migrator external-ids:iface-id=migrator \
+        options:tx_pcap=$hv/migrator-tx.pcap \
+        options:rxq_pcap=$hv/migrator-rx.pcap \
+        ofport-request=100
+done
+
+send_ip_packet() {
+    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv4_src=$5 ipv4_dst=$6 ip_chksum=$7 data=$8 ip_len=$9
+    local ip_ttl=ff
+    local packet=${eth_dst}${eth_src}08004500${ip_len}00004000${ip_ttl}fd${ip_chksum}${ipv4_src}${ipv4_dst}${data}
+    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
+    echo $packet
+}
+
+send_ip6_packet() {
+    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ipv6_src=$5 ipv6_dst=$6 data=$7 ip_len=$8
+    local ip_ttl=ff
+    local packet=${eth_dst}${eth_src}86dd60000000${ip_len}fd${ip_ttl}${ipv6_src}${ipv6_dst}${data}
+    as hv$hv ovs-appctl netdev-dummy/receive $inport $packet
+    echo $packet
+}
+
+reset_env() {
+    as hv1 reset_pcap_file first hv1/first
+    as hv2 reset_pcap_file second hv2/second
+    as hv1 reset_pcap_file migrator hv1/migrator
+    as hv2 reset_pcap_file migrator hv2/migrator
+
+    for port in hv1/migrator hv2/migrator hv1/first hv2/second; do
+        : > $port.expected
+    done
+}
+
+first_mac=000000000001
+second_mac=000000000002
+migrator_mac=0000000000ff
+first_ip=$(ip_to_hex 10 0 0 1)
+second_ip=$(ip_to_hex 10 0 0 2)
+migrator_ip=$(ip_to_hex 10 0 0 100)
+first_ip6=ff000000000000000000000000000001
+second_ip6=ff000000000000000000000000000002
+migrator_ip6=ff0000000000000000000000000000ff
+
+OVN_POPULATE_ARP
+
+reset_env
+
+# Check that packets of proper size are delivered from multichassis to intended ports
+ip_len=058c
+len=1400
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac $migrator_ip $first_ip 0000 $data $ip_len)
+echo $packet >> hv1/first.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac $migrator_ip $second_ip 0000 $data $ip_len)
+echo $packet >> hv2/second.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac $migrator_ip6 $first_ip6 $data $ip_len)
+echo $packet >> hv1/first.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac $migrator_ip6 $second_ip6 $data $ip_len)
+echo $packet >> hv2/second.expected
+
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap], [hv1/first.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap], [hv2/second.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap], [hv1/migrator.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap], [hv2/migrator.expected])
+
+reset_env
+
+# Now check that oversized packets are not delivered to the intended ports, and
+# that the originating port receives an ICMP error suggesting fragmentation
+
+ip_len=0bcc
+len=3000
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet migrator 1 $migrator_mac $first_mac $migrator_ip $first_ip 0000 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet migrator 1 $migrator_mac $second_mac $migrator_ip $second_ip 0000 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet migrator 1 $migrator_mac $first_mac $migrator_ip6 $first_ip6 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet migrator 1 $migrator_mac $second_mac $migrator_ip6 $second_ip6 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap], [hv1/first.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap], [hv2/second.expected])
+
+# it's harder to construct the original packet that includes geneve headers
+# etc. so check manually
+$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/migrator-tx.pcap > hv1/migrator.packets
+sed -i '/ffffffffffff/d' hv1/migrator.packets
+$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/migrator-tx.pcap > hv2/migrator.packets
+sed -i '/ffffffffffff/d' hv2/migrator.packets
+
+# icmp errors are delivered only to the port binding that originated the offending packet
+AT_CHECK([wc -l hv1/migrator.packets | cut -c 1], [0], [4
+])
+AT_CHECK([wc -l hv2/migrator.packets | cut -c 1], [0], [0
+])
+
+for line in $(cat hv1/migrator.expected); do
+    trimmed=$(echo $line | cut -c 125-225)
+    echo "checking if packet $trimmed... generated an icmp error"
+    AT_CHECK([grep ${trimmed} hv1/migrator.packets | wc -l | cut -c 1], [0], [1
+])
+done
+
+reset_env
+
+# Check that packets of proper size are delivered from regular to multichassis ports
+ip_len=058c
+len=1400
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip $migrator_ip 0000 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+echo $packet >> hv2/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip $migrator_ip 0000 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+echo $packet >> hv2/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6 $migrator_ip6 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+echo $packet >> hv2/migrator.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6 $migrator_ip6 $data $ip_len)
+echo $packet >> hv1/migrator.expected
+echo $packet >> hv2/migrator.expected
+
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/first-tx.pcap], [hv1/first.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/second-tx.pcap], [hv2/second.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap], [hv1/migrator.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap], [hv2/migrator.expected])
+
+reset_env
+
+# Now check that oversized packets are not delivered to multichassis ports, and
+# that the originating ports receive an ICMP error suggesting fragmentation
+
+ip_len=0bcc
+len=3000
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet first 1 $first_mac $migrator_mac $first_ip $migrator_ip 0000 $data $ip_len)
+echo $packet >> hv1/first.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip_packet second 2 $second_mac $migrator_mac $second_ip $migrator_ip 0000 $data $ip_len)
+echo $packet >> hv2/second.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet first 1 $first_mac $migrator_mac $first_ip6 $migrator_ip6 $data $ip_len)
+echo $packet >> hv1/first.expected
+
+data=$(xxd -l $len -c $len -p < /dev/urandom)
+packet=$(send_ip6_packet second 2 $second_mac $migrator_mac $second_ip6 $migrator_ip6 $data $ip_len)
+echo $packet >> hv2/second.expected
+
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv1/migrator-tx.pcap], [hv1/migrator.expected])
+OVN_CHECK_PACKETS_REMOVE_BROADCAST([hv2/migrator-tx.pcap], [hv2/migrator.expected])
+
+# it's harder to construct the original packet that includes geneve headers
+# etc. so check manually
+$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv1/first-tx.pcap > hv1/first.packets
+sed -i '/ffffffffffff/d' hv1/first.packets
+$PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" hv2/second-tx.pcap > hv2/second.packets
+sed -i '/ffffffffffff/d' hv2/second.packets
+
+AT_CHECK([wc -l hv1/first.packets | cut -c 1], [0], [2
+])
+AT_CHECK([wc -l hv2/second.packets | cut -c 1], [0], [2
+])
+
+for line in $(cat hv1/first.expected); do
+    trimmed=$(echo $line | cut -c 125-225)
+    echo "checking if packet $trimmed... generated an icmp error"
+    AT_CHECK([grep ${trimmed} hv1/first.packets | wc -l | cut -c 1], [0], [1
+])
+done
+
+for line in $(cat hv2/second.expected); do
+    trimmed=$(echo $line | cut -c 125-225)
+    echo "checking if packet $trimmed... generated an icmp error"
+    AT_CHECK([grep ${trimmed} hv2/second.packets | wc -l | cut -c 1], [0], [1
+])
+done
+
+# todo: test multichassis to multichassis
+
+OVN_CLEANUP([hv1],[hv2])
+
+AT_CLEANUP
+])
+
 OVN_FOR_EACH_NORTHD([
 AT_SETUP([options:activation-strategy for logical port])
 AT_KEYWORDS([multi-chassis])