diff mbox series

[ovs-dev] ovn-ic: Support IGMP/MLD in multi-AZ deployments.

Message ID 20220228210640.5458-1-dceara@redhat.com
State Changes Requested
Headers show
Series [ovs-dev] ovn-ic: Support IGMP/MLD in multi-AZ deployments. | expand

Commit Message

Dumitru Ceara Feb. 28, 2022, 9:06 p.m. UTC
Commit 974618c61de8 ("ovn-ic: physical: Support multicast_group flooding
on IC transit switches.") added support for statically forwarding IP
multicast traffic between AZs.  While this works, it's a waste of
resources to flood all IP multicast traffic to all instances of the
transit switch in all AZs.

Instead, we now extend the OVN IGMP/MLD support, in order to make it
function across availability zones.

Until now, all IGMP/MLD control packets were punted to ovn-controller
in the ingress pipeline of the logical switch that had multicast
snooping enabled.  We change this behavior for transit switches and
punt the packets both in the ingress pipeline (in the source AZ) and
in the egress pipeline for remote ports (in the destination AZ).

This ensures that both OVN deployments (in the source and destination
AZs) dynamically learn the same IGMP groups on the transit switch.
There is, however, a catch: there's no guarantee that multicast groups
are learnt in the same order in all AZs, so they might end up having
different tunnel keys.  In order to avoid the potential mismatch between
local and remote AZ multicast group tunnel keys, we change multicast
group processing for "remote" ports and perform the group expansion on
the source node.

In order for the end-to-end solution to work, and for IP multicast
traffic to flow between logical switch ports that are behind gateway
routers in different AZs, we also fix the logical router port static
multicast flooding (options:mcast_flood) implementation, as it was always
dropping IGMP/MLD packets (these are always generated with TTL=1).

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2042952
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
---
 controller/physical.c   |  28 +++++
 northd/northd.c         |  93 ++++++++++++++++-
 northd/ovn-northd.8.xml |  48 +++++++--
 tests/ovn-northd.at     |  43 ++++++++
 tests/ovn.at            | 226 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 430 insertions(+), 8 deletions(-)

Comments

Mark Michelson March 2, 2022, 7:31 p.m. UTC | #1
Hi Dumitru,

This looks good to me. I only have a small nit below.

Acked-by: Mark Michelson <mmichels@redhat.com>

On 2/28/22 16:06, Dumitru Ceara wrote:
> Commit 974618c61de8 ("ovn-ic: physical: Support multicast_group flooding
> on IC transit switches.") added support for statically forwarding IP
> multicast traffic between AZs.  While this works, it's a waste of
> resources to flood all IP multicast traffic to all instances of the
> transit switch in all AZs.
> 
> Instead, we now extend the OVN IGMP/MLD support, in order to make it
> function across availability zones.
> 
> Until now, all IGMP/MLD control packets were punted to ovn-controller
> in the ingress pipeline of the logical switch that had multicast
> snooping enabled.  We change this behavior for transit switches and
> punt the packets both in the ingress pipeline (in the source AZ) but
> also in the egress pipeline for remote ports (in the destination AZ).
> 
> This ensures that both OVN deployments (in the source and destination
> AZs) dynamically learn the same IGMP groups on the transit switch.
> There is, however, a catch: there's no guarantee that multicast groups
> are learnt in the same order on all AZs so they might end up having
> different tunnel keys.  In order to avoid the potential mismatch between
> local and remote AZ multicast group tunnel keys we change multicast
> group processing for "remote" ports and perform the group expansion on
> the source node.
> 
> In order for the end-to-end solution to work, and for IP multicast
> traffic to flow between logical switch ports that are behind gateway
> routers in different AZs we also fix the logical router port static
> multicast flooding (options:mcast_flood) implementation as it was always
> dropping IGMP/MLD packets (these are always generated with TTL=1).
> 
> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2042952
> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
> ---
>   controller/physical.c   |  28 +++++
>   northd/northd.c         |  93 ++++++++++++++++-
>   northd/ovn-northd.8.xml |  48 +++++++--
>   tests/ovn-northd.at     |  43 ++++++++
>   tests/ovn.at            | 226 ++++++++++++++++++++++++++++++++++++++++
>   5 files changed, 430 insertions(+), 8 deletions(-)
> 
> diff --git a/controller/physical.c b/controller/physical.c
> index 033828d577..02fcd5ea8b 100644
> --- a/controller/physical.c
> +++ b/controller/physical.c
> @@ -1382,6 +1382,26 @@ get_vxlan_port_key(int64_t port_key)
>       return port_key;
>   }
>   
> +/* Encapsulate and send to a single remote chassis. */
> +static void
> +tunnel_to_chassis(enum mf_field_id mff_ovn_geneve,
> +                  const char *chassis_name,
> +                  const struct hmap *chassis_tunnels,
> +                  const struct sbrec_datapath_binding *datapath,
> +                  uint16_t outport, struct ofpbuf *remote_ofpacts)
> +{
> +    const struct chassis_tunnel *tun
> +        = chassis_tunnel_find(chassis_tunnels, chassis_name, NULL);
> +    if (!tun) {
> +        return;
> +    }
> +
> +    put_encapsulation(mff_ovn_geneve, tun, datapath, outport, false,
> +                      remote_ofpacts);
> +    ofpact_put_OUTPUT(remote_ofpacts)->port = tun->ofport;
> +}
> +
> +/* Encapsulate and send to a set of remote chassis. */
>   static void
>   fanout_to_chassis(enum mf_field_id mff_ovn_geneve,
>                     struct sset *remote_chassis,
> @@ -1487,6 +1507,14 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>                            &remote_ofpacts);
>                   put_resubmit(OFTABLE_CHECK_LOOPBACK, &remote_ofpacts);
>               }
> +        } if (!strcmp(port->type, "remote")) {
> +            if (port->chassis) {
> +                put_load(port->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
> +                         &remote_ofpacts);
> +                tunnel_to_chassis(mff_ovn_geneve, port->chassis->name,
> +                                  chassis_tunnels, mc->datapath,
> +                                  port->tunnel_key, &remote_ofpacts);
> +            }
>           } else if (!strcmp(port->type, "localport")) {
>               put_load(port->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
>                        &remote_ofpacts);
> diff --git a/northd/northd.c b/northd/northd.c
> index 294a59bd7e..4c95933b87 100644
> --- a/northd/northd.c
> +++ b/northd/northd.c
> @@ -1792,6 +1792,12 @@ lsp_is_router(const struct nbrec_logical_switch_port *nbsp)
>       return !strcmp(nbsp->type, "router");
>   }
>   
> +static bool
> +lsp_is_remote(const struct nbrec_logical_switch_port *nbsp)
> +{
> +    return !strcmp(nbsp->type, "remote");
> +}
> +
>   static bool
>   lrport_is_enabled(const struct nbrec_logical_router_port *lrport)
>   {
> @@ -5838,8 +5844,48 @@ build_empty_lb_event_flow(struct ovn_lb_vip *lb_vip,
>   }
>   
>   static void
> -build_pre_lb(struct ovn_datapath *od, struct hmap *lflows)
> +build_interconn_mcast_snoop_flows(struct ovn_datapath *od,
> +                                  const struct shash *meter_groups,
> +                                  struct hmap *lflows)
> +{
> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
> +    if (!smap_get(&od->nbs->other_config, "interconn-ts")
> +        || !mcast_sw_info->enabled) {
> +        return;
> +    }
> +
> +    struct ovn_port *op;
> +
> +    LIST_FOR_EACH (op, dp_node, &od->port_list) {
> +        if (!lsp_is_remote(op->nbsp)) {
> +            continue;
> +        }
> +        /* Punt IGMP traffic to controller. */
> +        char *match = xasprintf("inport == %s && ip4 && ip.proto == 2",

FYI, you can use "igmp" in place of "ip4 && ip.proto == 2" in logical 
flow matches.

> +                                op->json_key);
> +        ovn_lflow_metered(lflows, od, S_SWITCH_OUT_PRE_LB, 120, match,
> +                          "clone { igmp; }; next;",
> +                          copp_meter_get(COPP_IGMP, od->nbs->copp,
> +                                         meter_groups));
> +        free(match);
> +
> +        /* Punt MLD traffic to controller. */
> +        match = xasprintf("inport == %s && (mldv1 || mldv2)", op->json_key);
> +        ovn_lflow_metered(lflows, od, S_SWITCH_OUT_PRE_LB, 120, match,
> +                          "clone { igmp; }; next;",
> +                          copp_meter_get(COPP_IGMP, od->nbs->copp,
> +                                         meter_groups));
> +        free(match);
> +    }
> +}
> +
> +static void
> +build_pre_lb(struct ovn_datapath *od, const struct shash *meter_groups,
> +             struct hmap *lflows)
>   {
> +    /* Handle IGMP/MLD packets crossing AZs. */
> +    build_interconn_mcast_snoop_flows(od, meter_groups, lflows);
> +
>       /* Do not send multicast packets to conntrack */
>       ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110, "eth.mcast", "next;");
>       ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 110, "eth.mcast", "next;");
> @@ -7585,7 +7631,7 @@ build_lswitch_lflows_pre_acl_and_acl(struct ovn_datapath *od,
>           ls_get_acl_flags(od);
>   
>           build_pre_acls(od, port_groups, lflows);
> -        build_pre_lb(od, lflows);
> +        build_pre_lb(od, meter_groups, lflows);
>           build_pre_stateful(od, lflows);
>           build_acl_hints(od, lflows);
>           build_acls(od, lflows, port_groups, meter_groups);
> @@ -11160,6 +11206,39 @@ build_mcast_lookup_flows_for_lrouter(
>            * ports. Otherwise drop any multicast traffic.
>            */
>           if (od->mcast_info.rtr.flood_static) {
> +            /* MLD and IGMP packets that need to be flooded statically
> +             * should be flooded without decrementing TTL (it's always
> +             * 1).  To prevent packets looping for ever (to some extent),
> +             * drop IGMP/MLD packets that are received from the router's
> +             * own mac addresses.
> +             */
> +            struct ovn_port *op;
> +            LIST_FOR_EACH (op, dp_node, &od->port_list) {
> +                ds_clear(match);
> +                ds_put_format(match, "eth.src == %s && ip4 && ip.proto == 2",
> +                              op->lrp_networks.ea_s);
> +                ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 10550,
> +                              ds_cstr(match), "drop;");
> +
> +                ds_clear(match);
> +                ds_put_format(match, "eth.src == %s && (mldv1 || mldv2)",
> +                              op->lrp_networks.ea_s);
> +                ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 10550,
> +                              ds_cstr(match), "drop;");
> +            }
> +
> +            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 10460,
> +                          "ip4 && ip.proto == 2",
> +                          "clone { "
> +                                "outport = \""MC_STATIC"\"; "
> +                                "next; "
> +                          "};");
> +            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 10460,
> +                          "mldv1 || mldv2",
> +                          "clone { "
> +                                "outport = \""MC_STATIC"\"; "
> +                                "next; "
> +                          "};");
>               ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 10450,
>                             "ip4.mcast || ip6.mcast",
>                             "clone { "
> @@ -11932,6 +12011,16 @@ build_misc_local_traffic_drop_flows_for_lrouter(
>           struct ovn_datapath *od, struct hmap *lflows)
>   {
>       if (od->nbr) {
> +        /* Allow IGMP and MLD packets (with TTL = 1) if the router is
> +         * configured to flood them statically on some ports.
> +         */
> +        if (od->mcast_info.rtr.flood_static) {
> +            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 120,
> +                          "ip4 && ip.proto == 2 && ip.ttl == 1", "next;");
> +            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 120,
> +                          "ip6 && (mldv1 || mldv2) && ip.ttl == 1", "next;");
> +        }
> +
>           /* L3 admission control: drop multicast and broadcast source, localhost
>            * source or destination, and zero network source or destination
>            * (priority 100). */
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index e495db46a0..89afad6289 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -570,6 +570,12 @@
>         configured. We can now add a lflow to drop ct.inv packets.
>       </p>
>   
> +    <p>
> +      This table also has priority-120 flows that punt all IGMP/MLD packets to
> +      <code>ovn-controller</code> if the switch is an interconnect switch
> +      with multicast snooping enabled.
> +    </p>
> +
>       <p>
>         This table also has a priority-110 flow with the match
>         <code>eth.dst == <var>E</var></code> for all logical switch
> @@ -2385,6 +2391,15 @@ icmp6_error {
>           </p>
>         </li>
>   
> +      <li>
> +        <p>
> +          L3 admission control: Priority-120 flows allows IGMP and MLD packets
> +          if the router has logical ports that have
> +          <ref column="options" table="Logical_Router_Port"/>
> +          <code>:mcast_flood='true'</code>.
> +        </p>
> +      </li>
> +
>         <li>
>           <p>
>             L3 admission control: A priority-100 flow drops packets that match
> @@ -3449,14 +3464,23 @@ output;
>       <ul>
>         <li>
>           <p>
> -          Priority-550 flow that drops IPv6 Router Solicitation/Advertisement
> +          Priority-10550 flow that drops IPv6 Router Solicitation/Advertisement
>             packets that were not processed in previous tables.
>           </p>
>         </li>
>   
>         <li>
>           <p>
> -          Priority-500 flows that match IP multicast traffic destined to
> +          Priority-10550 flows that drop IGMP and MLD packets with source MAC
> +          address owned by the router.  These are used to prevent looping
> +          statically forwarded IGMP and MLD packets for which TTL is not
> +          decremented (it is always 1).
> +        </p>
> +      </li>
> +
> +      <li>
> +        <p>
> +          Priority-10500 flows that match IP multicast traffic destined to
>             groups registered on any of the attached switches and sets
>             <code>outport</code> to the associated multicast group that will
>             eventually flood the traffic to all interested attached logical
> @@ -3466,10 +3490,22 @@ output;
>   
>         <li>
>           <p>
> -          Priority-450 flow that matches unregistered IP multicast traffic
> -          and sets <code>outport</code> to the <code>MC_STATIC</code>
> -          multicast group, which <code>ovn-northd</code> populates with the
> -          logical ports that have
> +          Priority-10460 flows that match IGMP and MLD control packets,
> +          set <code>outport</code> to the <code>MC_STATIC</code>
> +          multicast group, which <code>ovn-northd</code>
> +          populates with the logical ports that have
> +          <ref column="options" table="Logical_Router_Port"/>
> +          <code>:mcast_flood='true'</code>. If no router ports are configured
> +          to flood multicast traffic the packets are dropped.
> +        </p>
> +      </li>
> +
> +      <li>
> +        <p>
> +          Priority-10450 flow that matches unregistered IP multicast traffic
> +          decrements TTL and sets <code>outport</code> to the
> +          <code>MC_STATIC</code> multicast group, which <code>ovn-northd</code>
> +          populates with the logical ports that have
>             <ref column="options" table="Logical_Router_Port"/>
>             <code>:mcast_flood='true'</code>. If no router ports are configured
>             to flood multicast traffic the packets are dropped.
> diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
> index fe27869737..72c6d44f60 100644
> --- a/tests/ovn-northd.at
> +++ b/tests/ovn-northd.at
> @@ -6249,3 +6249,46 @@ check_log_flows_count 0 in
>   
>   AT_CLEANUP
>   ])
> +
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([Static IP multicast report forwarding])
> +
> +ovn_start
> +
> +check ovn-nbctl lr-add lr
> +check ovn-nbctl lrp-add lr lrp1 00:00:00:00:00:01 10.10.10.1/24 1010::1/64
> +check ovn-nbctl lrp-add lr lrp2 00:00:00:00:00:02 20.20.20.1/24 2020::1/64
> +check ovn-nbctl set logical_router lr options:mcast_relay="true"
> +check ovn-nbctl set logical_router_port lrp1 options:mcast_flood="true"
> +check ovn-nbctl set logical_router_port lrp2 options:mcast_flood="true"
> +
> +check ovn-nbctl --wait=sb sync
> +
> +ovn-sbctl dump-flows lr > lrflows
> +AT_CAPTURE_FILE([lrflows])
> +
> +dnl Flows to skip TTL == {0, 1} check for IGMP and MLD packets.
> +AT_CHECK([grep -e 'lr_in_ip_input    ' lrflows | grep -e 'ip.proto == 2' -e 'mld' -e 'ip.ttl == {0, 1}' | sed 's/table=../table=??/'], [0], [dnl
> +  table=??(lr_in_ip_input     ), priority=120  , match=(ip4 && ip.proto == 2 && ip.ttl == 1), action=(next;)
> +  table=??(lr_in_ip_input     ), priority=120  , match=(ip6 && (mldv1 || mldv2) && ip.ttl == 1), action=(next;)
> +  table=??(lr_in_ip_input     ), priority=100  , match=(inport == "lrp1" && ip4 && ip.ttl == {0, 1} && !ip.later_frag), action=(icmp4 {eth.dst <-> eth.src; icmp4.type = 11; /* Time exceeded */ icmp4.code = 0; /* TTL exceeded in transit */ ip4.dst = ip4.src; ip4.src = 10.10.10.1 ; ip.ttl = 254; outport = "lrp1"; flags.loopback = 1; output; };)
> +  table=??(lr_in_ip_input     ), priority=100  , match=(inport == "lrp1" && ip6 && ip6.src == 1010::/64 && ip.ttl == {0, 1} && !ip.later_frag), action=(icmp6 {eth.dst <-> eth.src; ip6.dst = ip6.src; ip6.src = 1010::1 ; ip.ttl = 254; icmp6.type = 3; /* Time exceeded */ icmp6.code = 0; /* TTL exceeded in transit */ outport = "lrp1"; flags.loopback = 1; output; };)
> +  table=??(lr_in_ip_input     ), priority=100  , match=(inport == "lrp2" && ip4 && ip.ttl == {0, 1} && !ip.later_frag), action=(icmp4 {eth.dst <-> eth.src; icmp4.type = 11; /* Time exceeded */ icmp4.code = 0; /* TTL exceeded in transit */ ip4.dst = ip4.src; ip4.src = 20.20.20.1 ; ip.ttl = 254; outport = "lrp2"; flags.loopback = 1; output; };)
> +  table=??(lr_in_ip_input     ), priority=100  , match=(inport == "lrp2" && ip6 && ip6.src == 2020::/64 && ip.ttl == {0, 1} && !ip.later_frag), action=(icmp6 {eth.dst <-> eth.src; ip6.dst = ip6.src; ip6.src = 2020::1 ; ip.ttl = 254; icmp6.type = 3; /* Time exceeded */ icmp6.code = 0; /* TTL exceeded in transit */ outport = "lrp2"; flags.loopback = 1; output; };)
> +  table=??(lr_in_ip_input     ), priority=30   , match=(ip4 && ip.ttl == {0, 1}), action=(drop;)
> +])
> +
> +dnl Flows to "route" (statically forward) without decrementing TTL for
> +dnl IGMP and MLD packets.  Also, flows to drop potentially looping IGMP/MLD
> +dnl packets.
> +AT_CHECK([grep -e 'lr_in_ip_routing   ' lrflows | grep -e 'ip.proto == 2' -e 'mld' | sed 's/table=../table=??/'], [0], [dnl
> +  table=??(lr_in_ip_routing   ), priority=10550, match=(eth.src == 00:00:00:00:00:01 && (mldv1 || mldv2)), action=(drop;)
> +  table=??(lr_in_ip_routing   ), priority=10550, match=(eth.src == 00:00:00:00:00:01 && ip4 && ip.proto == 2), action=(drop;)
> +  table=??(lr_in_ip_routing   ), priority=10550, match=(eth.src == 00:00:00:00:00:02 && (mldv1 || mldv2)), action=(drop;)
> +  table=??(lr_in_ip_routing   ), priority=10550, match=(eth.src == 00:00:00:00:00:02 && ip4 && ip.proto == 2), action=(drop;)
> +  table=??(lr_in_ip_routing   ), priority=10460, match=(ip4 && ip.proto == 2), action=(clone { outport = "_MC_static"; next; };)
> +  table=??(lr_in_ip_routing   ), priority=10460, match=(mldv1 || mldv2), action=(clone { outport = "_MC_static"; next; };)
> +])
> +
> +AT_CLEANUP
> +])
> diff --git a/tests/ovn.at b/tests/ovn.at
> index 69270601ab..d7ae7d917b 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -22906,6 +22906,232 @@ OVS_WAIT_UNTIL(
>   AT_CLEANUP
>   ])
>   
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([interconnection - IGMP/MLD multicast])
> +
> +# Logical network:
> +#
> +#       AZ1                     |                     AZ2
> +#   ---------------------------------------------------------------------
> +#                               |
> +#                               |     +-- LR2 --- LS2 --- LSP2 (sender)
> +#                               |    /
> +#     LSP1  --- LS1 --- LR1 --- TS ---
> +#   (receiver)                  |    \
> +#                               |     +-- LR3 --- LS3 --- LSP3 (receiver)
> +#
> +# LS1, LS2, LS3, TS configured to snoop IP multicast.
> +# LR1, LR2, LR3 configured to relay IP multicast.
> +# LR1-TS configured to flood IP multicast traffic unconditionally.
> +# LR2-TS configured to flood IP multicast traffic unconditionally.
> +# LR3-TS configured to flood IP multicast traffic unconditionally.
> +
> +AT_CAPTURE_FILE([exp])
> +AT_CAPTURE_FILE([rcv])
> +check_packets() {
> +    > exp
> +    > rcv
> +    if test "$1" = --uniq; then
> +        sort="sort -u"; shift
> +    else
> +        sort=sort
> +    fi
> +    for tuple in "$@"; do
> +        set $tuple; pcap=$1 type=$2
> +        echo "--- $pcap" | tee -a exp >> rcv
> +        $sort "$type" >> exp
> +        $PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" $pcap | $sort >> rcv
> +        echo | tee -a exp >> rcv
> +    done
> +
> +    $at_diff exp rcv >/dev/null
> +}
> +
> +ovn_init_ic_db
> +ovn_start az1
> +ovn_start az2
> +
> +net_add n1
> +
> +sim_add hv1
> +as hv1
> +check ovs-vsctl add-br br-phys
> +ovn_az_attach az1 n1 br-phys 192.168.1.1 16
> +check ovs-vsctl -- add-port br-int hv1-vif1 \
> +    -- set interface hv1-vif1 external-ids:iface-id=lsp1 \
> +       options:tx_pcap=hv1/vif1-tx.pcap \
> +       options:rxq_pcap=hv1/vif1-rx.pcap
> +check ovs-vsctl set open . external-ids:ovn-is-interconn=true
> +
> +sim_add hv2
> +as hv2
> +check ovs-vsctl add-br br-phys
> +ovn_az_attach az2 n1 br-phys 192.168.2.1 16
> +check ovs-vsctl -- add-port br-int hv2-vif1 \
> +    -- set interface hv2-vif1 external-ids:iface-id=lsp2 \
> +       options:tx_pcap=hv2/vif1-tx.pcap \
> +       options:rxq_pcap=hv2/vif1-rx.pcap
> +check ovs-vsctl -- add-port br-int hv2-vif2 \
> +    -- set interface hv2-vif2 external-ids:iface-id=lsp3 \
> +       options:tx_pcap=hv2/vif2-tx.pcap \
> +       options:rxq_pcap=hv2/vif2-rx.pcap
> +check ovs-vsctl set open . external-ids:ovn-is-interconn=true
> +
> +AT_CHECK([ovn-ic-nbctl create Transit_Switch name=ts], [0], [ignore])
> +check ovn_as az1 ovn-nbctl wait-until logical_switch ts
> +check ovn_as az2 ovn-nbctl wait-until logical_switch ts
> +
> +ovn_as az1
> +check ovn-nbctl lr-add lr1 \
> +    -- lrp-add lr1 lr1-ts 00:00:00:01:00:01 42.42.42.1/24 4242::1/64 \
> +    -- lrp-add lr1 lr1-ls1 00:00:00:01:01:00 43.43.43.1/24 4343::1/64\
> +    -- lrp-set-gateway-chassis lr1-ts hv1
> +check ovn-nbctl ls-add ls1 \
> +    -- lsp-add ls1 ls1-lr1 \
> +    -- lsp-set-addresses ls1-lr1 router \
> +    -- lsp-set-type ls1-lr1 router \
> +    -- lsp-set-options ls1-lr1 router-port=lr1-ls1 \
> +    -- lsp-add ls1 lsp1
> +check ovn-nbctl lsp-add ts ts-lr1 \
> +    -- lsp-set-addresses ts-lr1 router \
> +    -- lsp-set-type ts-lr1 router \
> +    -- lsp-set-options ts-lr1 router-port=lr1-ts
> +wait_for_ports_up
> +
> +ovn_as az2
> +check ovn-nbctl lr-add lr2 \
> +    -- lrp-add lr2 lr2-ts 00:00:00:02:00:01 42.42.42.2/24 4242::2/64 \
> +    -- lrp-add lr2 lr2-ls2 00:00:00:02:01:00 44.44.44.1/24 4444::1/64 \
> +    -- lrp-set-gateway-chassis lr2-ts hv2
> +check ovn-nbctl ls-add ls2 \
> +    -- lsp-add ls2 ls2-lr2 \
> +    -- lsp-set-addresses ls2-lr2 router \
> +    -- lsp-set-type ls2-lr2 router \
> +    -- lsp-set-options ls2-lr2 router-port=lr2-ls2 \
> +    -- lsp-add ls2 lsp2
> +check ovn-nbctl lsp-add ts ts-lr2 \
> +    -- lsp-set-addresses ts-lr2 router \
> +    -- lsp-set-type ts-lr2 router \
> +    -- lsp-set-options ts-lr2 router-port=lr2-ts
> +
> +check ovn-nbctl lr-add lr3 \
> +    -- lrp-add lr3 lr3-ts 00:00:00:02:00:02 42.42.42.3/24 4242::3/64 \
> +    -- lrp-add lr3 lr3-ls3 00:00:00:02:02:00 44.44.45.1/24 4445::1/64 \
> +    -- lrp-set-gateway-chassis lr3-ts hv2
> +check ovn-nbctl ls-add ls3 \
> +    -- lsp-add ls3 ls3-lr3 \
> +    -- lsp-set-addresses ls3-lr3 router \
> +    -- lsp-set-type ls3-lr3 router \
> +    -- lsp-set-options ls3-lr3 router-port=lr3-ls3 \
> +    -- lsp-add ls3 lsp3
> +check ovn-nbctl lsp-add ts ts-lr3 \
> +    -- lsp-set-addresses ts-lr3 router \
> +    -- lsp-set-type ts-lr3 router \
> +    -- lsp-set-options ts-lr3 router-port=lr3-ts
> +
> +wait_for_ports_up
> +
> +dnl Enable IP multicast snooping and IP multicast relay.  Reports are
> +dnl forwarded statically.
> +ovn_as az1
> +check ovn-nbctl set logical_switch ls1 other_config:mcast_snoop="true"
> +check ovn-nbctl set Logical_Switch_Port ls1-lr1 options:mcast_flood_reports="true"
> +check ovn-nbctl set logical_router lr1 options:mcast_relay="true"
> +check ovn-nbctl set logical_router_port lr1-ts options:mcast_flood="true"
> +check ovn-nbctl set logical_switch ts other_config:mcast_snoop="true"
> +check ovn-nbctl set logical_switch_port ts-lr1 options:mcast_flood_reports="true"
> +check ovn-nbctl set logical_switch_port ts-lr2 options:mcast_flood_reports="true"
> +check ovn-nbctl set logical_switch_port ts-lr3 options:mcast_flood_reports="true"
> +
> +ovn_as az2
> +check ovn-nbctl set logical_switch ls2 other_config:mcast_snoop="true"
> +check ovn-nbctl set Logical_Switch_Port ls2-lr2 options:mcast_flood_reports="true"
> +check ovn-nbctl set logical_router lr2 options:mcast_relay="true"
> +check ovn-nbctl set logical_router_port lr2-ts options:mcast_flood="true"
> +check ovn-nbctl set logical_switch ls3 other_config:mcast_snoop="true"
> +check ovn-nbctl set Logical_Switch_Port ls3-lr3 options:mcast_flood_reports="true"
> +check ovn-nbctl set logical_router lr3 options:mcast_relay="true"
> +check ovn-nbctl set logical_router_port lr3-ts options:mcast_flood="true"
> +check ovn-nbctl set logical_switch ts other_config:mcast_snoop="true"
> +check ovn-nbctl set logical_switch_port ts-lr1 options:mcast_flood_reports="true"
> +check ovn-nbctl set logical_switch_port ts-lr2 options:mcast_flood_reports="true"
> +check ovn-nbctl set logical_switch_port ts-lr3 options:mcast_flood_reports="true"
> +
> +check ovn_as az1 ovn-nbctl --wait=hv sync
> +check ovn_as az2 ovn-nbctl --wait=hv sync
> +
> +# Pre-populate the hypervisors' ARP tables so that we don't lose any
> +# packets for ARP resolution (native tunneling doesn't queue packets
> +# for ARP resolution).
> +OVN_POPULATE_ARP
> +
> +# Inject IGMP Join for 239.0.1.68 on LSP1.
> +send_igmp_v3_report hv1-vif1 hv1 \
> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
> +    /dev/null
> +
> +# Inject MLD Join for ff0a:dead:beef::1 on LSP1.
> +send_mld_v2_report hv1-vif1 hv1 \
> +    000000000001 10000000000000000000000000000001 \
> +    ff0adeadbeef00000000000000000001 04 c0e4 \
> +    /dev/null
> +
> +# Inject IGMP Join for 239.0.1.68 on LSP3.
> +send_igmp_v3_report hv2-vif2 hv2 \
> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
> +    /dev/null
> +
> +# Inject MLD Join for ff0a:dead:beef::1 on LSP3.
> +send_mld_v2_report hv2-vif2 hv2 \
> +    000000000001 10000000000000000000000000000001 \
> +    ff0adeadbeef00000000000000000001 04 c0e4 \
> +    /dev/null
> +
> +# Send an IP multicast packet from LSP2, it should be forwarded
> +# to lsp1 and lsp3.
> +> expected_az1
> +> expected_az2
> +send_ip_multicast_pkt hv2-vif1 hv2 \
> +    000000000001 01005e000144 \
> +    $(ip_to_hex 44 44 44 2) $(ip_to_hex 239 0 1 68) 1e 20 7c6b 11 \
> +    e518e518000aed350000
> +store_ip_multicast_pkt \
> +    000000010100 01005e000144 \
> +    $(ip_to_hex 44 44 44 2) $(ip_to_hex 239 0 1 68) 1e 1e 7e6b 11 \
> +    e518e518000aed350000 expected_az1
> +store_ip_multicast_pkt \
> +    000000020200 01005e000144 \
> +    $(ip_to_hex 44 44 44 2) $(ip_to_hex 239 0 1 68) 1e 1e 7e6b 11 \
> +    e518e518000aed350000 expected_az2
> +
> +send_ip6_multicast_pkt hv2-vif1 hv2 \
> +    000000000001 333300000001 \
> +    00100000000000000000000000000042 ff0adeadbeef00000000000000000001 \
> +    000e 40 11 \
> +    93407a69000e2b4e61736461640a
> +store_ip6_multicast_pkt \
> +    000000010100 333300000001 \
> +    00100000000000000000000000000042 ff0adeadbeef00000000000000000001 \
> +    000e 3e 11 \
> +    93407a69000e2b4e61736461640a \
> +    expected_az1
> +store_ip6_multicast_pkt \
> +    000000020200 333300000001 \
> +    00100000000000000000000000000042 ff0adeadbeef00000000000000000001 \
> +    000e 3e 11 \
> +    93407a69000e2b4e61736461640a \
> +    expected_az2
> +
> +OVS_WAIT_UNTIL(
> +  [check_packets 'hv1/vif1-tx.pcap expected_az1' \
> +                 'hv2/vif2-tx.pcap expected_az2'],
> +  [$at_diff -F'^---' exp rcv])
> +
> +AT_CLEANUP
> +])
> +
>   OVN_FOR_EACH_NORTHD([
>   AT_SETUP([ECMP static routes])
>   ovn_start
>
Dumitru Ceara March 3, 2022, 9:32 a.m. UTC | #2
On 3/2/22 20:31, Mark Michelson wrote:
> Hi Dumitru,
> 

Hi Mark,

> This looks good to me. I only have a small nit below.
> 
> Acked-by: Mark Michelson <mmichels@redhat.com>
> 

Thanks!

> On 2/28/22 16:06, Dumitru Ceara wrote:
>> Commit 974618c61de8 ("ovn-ic: physical: Support multicast_group flooding
>> on IC transit switches.") added support for statically forwarding IP
>> multicast traffic between AZs.  While this works, it's a waste of
>> resources to flood all IP multicast traffic to all instances of the
>> transit switch in all AZs.
>>
>> Instead, we now extend the OVN IGMP/MLD support, in order to make it
>> function across availability zones.
>>
>> Until now, all IGMP/MLD control packets were punted to ovn-controller
>> in the ingress pipeline of the logical switch that had multicast
>> snooping enabled.  We change this behavior for transit switches and
>> punt the packets both in the ingress pipeline (in the source AZ) but
>> also in the egress pipeline for remote ports (in the destination AZ).
>>
>> This ensures that both OVN deployments (in the source and destination
>> AZs) dynamically learn the same IGMP groups on the transit switch.
>> There is, however, a catch: there's no guarantee that multicast groups
>> are learnt in the same order on all AZs so they might end up having
>> different tunnel keys.  In order to avoid the potential mismatch between
>> local and remote AZ multicast group tunnel keys we change multicast
>> group processing for "remote" ports and perform the group expansion on
>> the source node.
>>
>> In order for the end-to-end solution to work, and for IP multicast
>> traffic to flow between logical switch ports that are behind gateway
>> routers in different AZs we also fix the logical router port static
>> multicast flooding (options:mcast_flood) implementation as it was always
>> dropping IGMP/MLD packets (these are always generated with TTL=1).
>>
>> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2042952
>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>> ---
>>   controller/physical.c   |  28 +++++
>>   northd/northd.c         |  93 ++++++++++++++++-
>>   northd/ovn-northd.8.xml |  48 +++++++--
>>   tests/ovn-northd.at     |  43 ++++++++
>>   tests/ovn.at            | 226 ++++++++++++++++++++++++++++++++++++++++
>>   5 files changed, 430 insertions(+), 8 deletions(-)
>>
>> diff --git a/controller/physical.c b/controller/physical.c
>> index 033828d577..02fcd5ea8b 100644
>> --- a/controller/physical.c
>> +++ b/controller/physical.c
>> @@ -1382,6 +1382,26 @@ get_vxlan_port_key(int64_t port_key)
>>       return port_key;
>>   }
>>   +/* Encapsulate and send to a single remote chassis. */
>> +static void
>> +tunnel_to_chassis(enum mf_field_id mff_ovn_geneve,
>> +                  const char *chassis_name,
>> +                  const struct hmap *chassis_tunnels,
>> +                  const struct sbrec_datapath_binding *datapath,
>> +                  uint16_t outport, struct ofpbuf *remote_ofpacts)
>> +{
>> +    const struct chassis_tunnel *tun
>> +        = chassis_tunnel_find(chassis_tunnels, chassis_name, NULL);
>> +    if (!tun) {
>> +        return;
>> +    }
>> +
>> +    put_encapsulation(mff_ovn_geneve, tun, datapath, outport, false,
>> +                      remote_ofpacts);
>> +    ofpact_put_OUTPUT(remote_ofpacts)->port = tun->ofport;
>> +}
>> +
>> +/* Encapsulate and send to a set of remote chassis. */
>>   static void
>>   fanout_to_chassis(enum mf_field_id mff_ovn_geneve,
>>                     struct sset *remote_chassis,
>> @@ -1487,6 +1507,14 @@ consider_mc_group(struct ovsdb_idl_index
>> *sbrec_port_binding_by_name,
>>                            &remote_ofpacts);
>>                   put_resubmit(OFTABLE_CHECK_LOOPBACK, &remote_ofpacts);
>>               }
>> +        } if (!strcmp(port->type, "remote")) {
>> +            if (port->chassis) {
>> +                put_load(port->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
>> +                         &remote_ofpacts);
>> +                tunnel_to_chassis(mff_ovn_geneve, port->chassis->name,
>> +                                  chassis_tunnels, mc->datapath,
>> +                                  port->tunnel_key, &remote_ofpacts);
>> +            }
>>           } else if (!strcmp(port->type, "localport")) {
>>               put_load(port->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
>>                        &remote_ofpacts);
>> diff --git a/northd/northd.c b/northd/northd.c
>> index 294a59bd7e..4c95933b87 100644
>> --- a/northd/northd.c
>> +++ b/northd/northd.c
>> @@ -1792,6 +1792,12 @@ lsp_is_router(const struct
>> nbrec_logical_switch_port *nbsp)
>>       return !strcmp(nbsp->type, "router");
>>   }
>>   +static bool
>> +lsp_is_remote(const struct nbrec_logical_switch_port *nbsp)
>> +{
>> +    return !strcmp(nbsp->type, "remote");
>> +}
>> +
>>   static bool
>>   lrport_is_enabled(const struct nbrec_logical_router_port *lrport)
>>   {
>> @@ -5838,8 +5844,48 @@ build_empty_lb_event_flow(struct ovn_lb_vip
>> *lb_vip,
>>   }
>>     static void
>> -build_pre_lb(struct ovn_datapath *od, struct hmap *lflows)
>> +build_interconn_mcast_snoop_flows(struct ovn_datapath *od,
>> +                                  const struct shash *meter_groups,
>> +                                  struct hmap *lflows)
>> +{
>> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>> +    if (!smap_get(&od->nbs->other_config, "interconn-ts")
>> +        || !mcast_sw_info->enabled) {
>> +        return;
>> +    }
>> +
>> +    struct ovn_port *op;
>> +
>> +    LIST_FOR_EACH (op, dp_node, &od->port_list) {
>> +        if (!lsp_is_remote(op->nbsp)) {
>> +            continue;
>> +        }
>> +        /* Punt IGMP traffic to controller. */
>> +        char *match = xasprintf("inport == %s && ip4 && ip.proto == 2",
> 
> FYI, you can use "igmp" in place of "ip4 && ip.proto == 2" in logical
> flow matches.
> 

Good point, I'll clean it up everywhere and post v2.

Thanks,
Dumitru
diff mbox series

Patch

diff --git a/controller/physical.c b/controller/physical.c
index 033828d577..02fcd5ea8b 100644
--- a/controller/physical.c
+++ b/controller/physical.c
@@ -1382,6 +1382,26 @@  get_vxlan_port_key(int64_t port_key)
     return port_key;
 }
 
+/* Encapsulate for 'outport' and output on the tunnel to 'chassis_name'. */
+static void
+tunnel_to_chassis(enum mf_field_id mff_ovn_geneve,
+                  const char *chassis_name,
+                  const struct hmap *chassis_tunnels,
+                  const struct sbrec_datapath_binding *datapath,
+                  uint16_t outport, struct ofpbuf *remote_ofpacts)
+{
+    const struct chassis_tunnel *tun
+        = chassis_tunnel_find(chassis_tunnels, chassis_name, NULL);
+    if (!tun) {
+        return;  /* No tunnel found for 'chassis_name': append nothing. */
+    }
+
+    put_encapsulation(mff_ovn_geneve, tun, datapath, outport, false,
+                      remote_ofpacts);
+    ofpact_put_OUTPUT(remote_ofpacts)->port = tun->ofport;
+}
+
+/* Encapsulate and send to a set of remote chassis. */
 static void
 fanout_to_chassis(enum mf_field_id mff_ovn_geneve,
                   struct sset *remote_chassis,
@@ -1487,6 +1507,14 @@  consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
                          &remote_ofpacts);
                 put_resubmit(OFTABLE_CHECK_LOOPBACK, &remote_ofpacts);
             }
+        } if (!strcmp(port->type, "remote")) {
+            if (port->chassis) {
+                put_load(port->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
+                         &remote_ofpacts);
+                tunnel_to_chassis(mff_ovn_geneve, port->chassis->name,
+                                  chassis_tunnels, mc->datapath,
+                                  port->tunnel_key, &remote_ofpacts);
+            }
         } else if (!strcmp(port->type, "localport")) {
             put_load(port->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
                      &remote_ofpacts);
diff --git a/northd/northd.c b/northd/northd.c
index 294a59bd7e..4c95933b87 100644
--- a/northd/northd.c
+++ b/northd/northd.c
@@ -1792,6 +1792,12 @@  lsp_is_router(const struct nbrec_logical_switch_port *nbsp)
     return !strcmp(nbsp->type, "router");
 }
 
+static bool
+lsp_is_remote(const struct nbrec_logical_switch_port *nbsp)
+{
+    return !strcmp(nbsp->type, "remote");  /* strcmp() == 0: exact match. */
+}
+
 static bool
 lrport_is_enabled(const struct nbrec_logical_router_port *lrport)
 {
@@ -5838,8 +5844,48 @@  build_empty_lb_event_flow(struct ovn_lb_vip *lb_vip,
 }
 
 static void
-build_pre_lb(struct ovn_datapath *od, struct hmap *lflows)
+build_interconn_mcast_snoop_flows(struct ovn_datapath *od,
+                                  const struct shash *meter_groups,
+                                  struct hmap *lflows)
+{
+    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
+    if (!smap_get(&od->nbs->other_config, "interconn-ts")
+        || !mcast_sw_info->enabled) {
+        return;  /* No "interconn-ts" key, or multicast snooping is off. */
+    }
+
+    struct ovn_port *op;
+
+    LIST_FOR_EACH (op, dp_node, &od->port_list) {
+        if (!lsp_is_remote(op->nbsp)) {
+            continue;  /* Only "remote" ports get the egress punt flows. */
+        }
+        /* Punt a copy of IGMP traffic from this port to the controller. */
+        char *match = xasprintf("inport == %s && ip4 && ip.proto == 2",
+                                op->json_key);
+        ovn_lflow_metered(lflows, od, S_SWITCH_OUT_PRE_LB, 120, match,
+                          "clone { igmp; }; next;",
+                          copp_meter_get(COPP_IGMP, od->nbs->copp,
+                                         meter_groups));
+        free(match);
+
+        /* Same for MLD; it reuses the "igmp" action and COPP_IGMP meter. */
+        match = xasprintf("inport == %s && (mldv1 || mldv2)", op->json_key);
+        ovn_lflow_metered(lflows, od, S_SWITCH_OUT_PRE_LB, 120, match,
+                          "clone { igmp; }; next;",
+                          copp_meter_get(COPP_IGMP, od->nbs->copp,
+                                         meter_groups));
+        free(match);
+    }
+}
+
+static void
+build_pre_lb(struct ovn_datapath *od, const struct shash *meter_groups,
+             struct hmap *lflows)
 {
+    /* Handle IGMP/MLD packets crossing AZs. */
+    build_interconn_mcast_snoop_flows(od, meter_groups, lflows);
+
     /* Do not send multicast packets to conntrack */
     ovn_lflow_add(lflows, od, S_SWITCH_IN_PRE_LB, 110, "eth.mcast", "next;");
     ovn_lflow_add(lflows, od, S_SWITCH_OUT_PRE_LB, 110, "eth.mcast", "next;");
@@ -7585,7 +7631,7 @@  build_lswitch_lflows_pre_acl_and_acl(struct ovn_datapath *od,
         ls_get_acl_flags(od);
 
         build_pre_acls(od, port_groups, lflows);
-        build_pre_lb(od, lflows);
+        build_pre_lb(od, meter_groups, lflows);
         build_pre_stateful(od, lflows);
         build_acl_hints(od, lflows);
         build_acls(od, lflows, port_groups, meter_groups);
@@ -11160,6 +11206,39 @@  build_mcast_lookup_flows_for_lrouter(
          * ports. Otherwise drop any multicast traffic.
          */
         if (od->mcast_info.rtr.flood_static) {
+            /* MLD and IGMP packets that need to be flooded statically
+             * should be flooded without decrementing TTL (it's always
+             * 1).  To prevent packets looping for ever (to some extent),
+             * drop IGMP/MLD packets that are received from the router's
+             * own mac addresses.
+             */
+            struct ovn_port *op;
+            LIST_FOR_EACH (op, dp_node, &od->port_list) {
+                ds_clear(match);
+                ds_put_format(match, "eth.src == %s && ip4 && ip.proto == 2",
+                              op->lrp_networks.ea_s);
+                ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 10550,
+                              ds_cstr(match), "drop;");
+
+                ds_clear(match);
+                ds_put_format(match, "eth.src == %s && (mldv1 || mldv2)",
+                              op->lrp_networks.ea_s);
+                ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 10550,
+                              ds_cstr(match), "drop;");
+            }
+
+            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 10460,
+                          "ip4 && ip.proto == 2",
+                          "clone { "
+                                "outport = \""MC_STATIC"\"; "
+                                "next; "
+                          "};");
+            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 10460,
+                          "mldv1 || mldv2",
+                          "clone { "
+                                "outport = \""MC_STATIC"\"; "
+                                "next; "
+                          "};");
             ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 10450,
                           "ip4.mcast || ip6.mcast",
                           "clone { "
@@ -11932,6 +12011,16 @@  build_misc_local_traffic_drop_flows_for_lrouter(
         struct ovn_datapath *od, struct hmap *lflows)
 {
     if (od->nbr) {
+        /* Allow IGMP and MLD packets (with TTL = 1) if the router is
+         * configured to flood them statically on some ports.
+         */
+        if (od->mcast_info.rtr.flood_static) {
+            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 120,
+                          "ip4 && ip.proto == 2 && ip.ttl == 1", "next;");
+            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 120,
+                          "ip6 && (mldv1 || mldv2) && ip.ttl == 1", "next;");
+        }
+
         /* L3 admission control: drop multicast and broadcast source, localhost
          * source or destination, and zero network source or destination
          * (priority 100). */
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index e495db46a0..89afad6289 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -570,6 +570,12 @@ 
       configured. We can now add a lflow to drop ct.inv packets.
     </p>
 
+    <p>
+      This table also has priority-120 flows that punt IGMP/MLD packets
+      received from remote ports to <code>ovn-controller</code> if the switch
+      is an interconnect switch with multicast snooping enabled.
+    </p>
+
     <p>
       This table also has a priority-110 flow with the match
       <code>eth.dst == <var>E</var></code> for all logical switch
@@ -2385,6 +2391,15 @@  icmp6_error {
         </p>
       </li>
 
+      <li>
+        <p>
+          L3 admission control: Priority-120 flows allow IGMP and MLD packets
+          if the router has logical ports that have
+          <ref column="options" table="Logical_Router_Port"/>
+          <code>:mcast_flood='true'</code>.
+        </p>
+      </li>
+
       <li>
         <p>
           L3 admission control: A priority-100 flow drops packets that match
@@ -3449,14 +3464,23 @@  output;
     <ul>
       <li>
         <p>
-          Priority-550 flow that drops IPv6 Router Solicitation/Advertisement
+          Priority-10550 flow that drops IPv6 Router Solicitation/Advertisement
           packets that were not processed in previous tables.
         </p>
       </li>
 
       <li>
         <p>
-          Priority-500 flows that match IP multicast traffic destined to
+          Priority-10550 flows that drop IGMP and MLD packets with source MAC
+          address owned by the router.  These are used to prevent looping
+          statically forwarded IGMP and MLD packets for which TTL is not
+          decremented (it is always 1).
+        </p>
+      </li>
+
+      <li>
+        <p>
+          Priority-10500 flows that match IP multicast traffic destined to
           groups registered on any of the attached switches and sets
           <code>outport</code> to the associated multicast group that will
           eventually flood the traffic to all interested attached logical
@@ -3466,10 +3490,22 @@  output;
 
       <li>
         <p>
-          Priority-450 flow that matches unregistered IP multicast traffic
-          and sets <code>outport</code> to the <code>MC_STATIC</code>
-          multicast group, which <code>ovn-northd</code> populates with the
-          logical ports that have
+          Priority-10460 flows that match IGMP and MLD control packets,
+          set <code>outport</code> to the <code>MC_STATIC</code>
+          multicast group, which <code>ovn-northd</code>
+          populates with the logical ports that have
+          <ref column="options" table="Logical_Router_Port"/>
+          <code>:mcast_flood='true'</code>. If no router ports are configured
+          to flood multicast traffic the packets are dropped.
+        </p>
+      </li>
+
+      <li>
+        <p>
+          Priority-10450 flow that matches unregistered IP multicast traffic,
+          decrements TTL and sets <code>outport</code> to the
+          <code>MC_STATIC</code> multicast group, which <code>ovn-northd</code>
+          populates with the logical ports that have
           <ref column="options" table="Logical_Router_Port"/>
           <code>:mcast_flood='true'</code>. If no router ports are configured
           to flood multicast traffic the packets are dropped.
diff --git a/tests/ovn-northd.at b/tests/ovn-northd.at
index fe27869737..72c6d44f60 100644
--- a/tests/ovn-northd.at
+++ b/tests/ovn-northd.at
@@ -6249,3 +6249,46 @@  check_log_flows_count 0 in
 
 AT_CLEANUP
 ])
+
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([Static IP multicast report forwarding])
+
+ovn_start
+
+check ovn-nbctl lr-add lr
+check ovn-nbctl lrp-add lr lrp1 00:00:00:00:00:01 10.10.10.1/24 1010::1/64
+check ovn-nbctl lrp-add lr lrp2 00:00:00:00:00:02 20.20.20.1/24 2020::1/64
+check ovn-nbctl set logical_router lr options:mcast_relay="true"
+check ovn-nbctl set logical_router_port lrp1 options:mcast_flood="true"
+check ovn-nbctl set logical_router_port lrp2 options:mcast_flood="true"
+
+check ovn-nbctl --wait=sb sync
+
+ovn-sbctl dump-flows lr > lrflows
+AT_CAPTURE_FILE([lrflows])
+
+dnl Flows to skip TTL == {0, 1} check for IGMP and MLD packets.
+AT_CHECK([grep -e 'lr_in_ip_input    ' lrflows | grep -e 'ip.proto == 2' -e 'mld' -e 'ip.ttl == {0, 1}' | sed 's/table=../table=??/'], [0], [dnl
+  table=??(lr_in_ip_input     ), priority=120  , match=(ip4 && ip.proto == 2 && ip.ttl == 1), action=(next;)
+  table=??(lr_in_ip_input     ), priority=120  , match=(ip6 && (mldv1 || mldv2) && ip.ttl == 1), action=(next;)
+  table=??(lr_in_ip_input     ), priority=100  , match=(inport == "lrp1" && ip4 && ip.ttl == {0, 1} && !ip.later_frag), action=(icmp4 {eth.dst <-> eth.src; icmp4.type = 11; /* Time exceeded */ icmp4.code = 0; /* TTL exceeded in transit */ ip4.dst = ip4.src; ip4.src = 10.10.10.1 ; ip.ttl = 254; outport = "lrp1"; flags.loopback = 1; output; };)
+  table=??(lr_in_ip_input     ), priority=100  , match=(inport == "lrp1" && ip6 && ip6.src == 1010::/64 && ip.ttl == {0, 1} && !ip.later_frag), action=(icmp6 {eth.dst <-> eth.src; ip6.dst = ip6.src; ip6.src = 1010::1 ; ip.ttl = 254; icmp6.type = 3; /* Time exceeded */ icmp6.code = 0; /* TTL exceeded in transit */ outport = "lrp1"; flags.loopback = 1; output; };)
+  table=??(lr_in_ip_input     ), priority=100  , match=(inport == "lrp2" && ip4 && ip.ttl == {0, 1} && !ip.later_frag), action=(icmp4 {eth.dst <-> eth.src; icmp4.type = 11; /* Time exceeded */ icmp4.code = 0; /* TTL exceeded in transit */ ip4.dst = ip4.src; ip4.src = 20.20.20.1 ; ip.ttl = 254; outport = "lrp2"; flags.loopback = 1; output; };)
+  table=??(lr_in_ip_input     ), priority=100  , match=(inport == "lrp2" && ip6 && ip6.src == 2020::/64 && ip.ttl == {0, 1} && !ip.later_frag), action=(icmp6 {eth.dst <-> eth.src; ip6.dst = ip6.src; ip6.src = 2020::1 ; ip.ttl = 254; icmp6.type = 3; /* Time exceeded */ icmp6.code = 0; /* TTL exceeded in transit */ outport = "lrp2"; flags.loopback = 1; output; };)
+  table=??(lr_in_ip_input     ), priority=30   , match=(ip4 && ip.ttl == {0, 1}), action=(drop;)
+])
+
+dnl Flows to "route" (statically forward) without decrementing TTL for
+dnl IGMP and MLD packets.  Also, flows to drop potentially looping IGMP/MLD
+dnl packets.
+AT_CHECK([grep -e 'lr_in_ip_routing   ' lrflows | grep -e 'ip.proto == 2' -e 'mld' | sed 's/table=../table=??/'], [0], [dnl
+  table=??(lr_in_ip_routing   ), priority=10550, match=(eth.src == 00:00:00:00:00:01 && (mldv1 || mldv2)), action=(drop;)
+  table=??(lr_in_ip_routing   ), priority=10550, match=(eth.src == 00:00:00:00:00:01 && ip4 && ip.proto == 2), action=(drop;)
+  table=??(lr_in_ip_routing   ), priority=10550, match=(eth.src == 00:00:00:00:00:02 && (mldv1 || mldv2)), action=(drop;)
+  table=??(lr_in_ip_routing   ), priority=10550, match=(eth.src == 00:00:00:00:00:02 && ip4 && ip.proto == 2), action=(drop;)
+  table=??(lr_in_ip_routing   ), priority=10460, match=(ip4 && ip.proto == 2), action=(clone { outport = "_MC_static"; next; };)
+  table=??(lr_in_ip_routing   ), priority=10460, match=(mldv1 || mldv2), action=(clone { outport = "_MC_static"; next; };)
+])
+
+AT_CLEANUP
+])
diff --git a/tests/ovn.at b/tests/ovn.at
index 69270601ab..d7ae7d917b 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -22906,6 +22906,232 @@  OVS_WAIT_UNTIL(
 AT_CLEANUP
 ])
 
+OVN_FOR_EACH_NORTHD([
+AT_SETUP([interconnection - IGMP/MLD multicast])
+
+# Logical network:
+#
+#       AZ1                     |                     AZ2
+#   ---------------------------------------------------------------------
+#                               |
+#                               |     +-- LR2 --- LS2 --- LSP2 (sender)
+#                               |    /
+#     LSP1  --- LS1 --- LR1 --- TS ---
+#   (receiver)                  |    \
+#                               |     +-- LR3 --- LS3 --- LSP3 (receiver)
+#
+# LS1, LS2, LS3, TS configured to snoop IP multicast.
+# LR1, LR2, LR3 configured to relay IP multicast.
+# LR1-TS configured to flood IP multicast traffic unconditionally.
+# LR2-TS configured to flood IP multicast traffic unconditionally.
+# LR3-TS configured to flood IP multicast traffic unconditionally.
+
+AT_CAPTURE_FILE([exp])
+AT_CAPTURE_FILE([rcv])
+check_packets() {
+    > exp
+    > rcv
+    if test "$1" = --uniq; then
+        sort="sort -u"; shift
+    else
+        sort=sort
+    fi
+    for tuple in "$@"; do
+        set $tuple; pcap=$1 type=$2
+        echo "--- $pcap" | tee -a exp >> rcv
+        $sort "$type" >> exp
+        $PYTHON "$ovs_srcdir/utilities/ovs-pcap.in" $pcap | $sort >> rcv
+        echo | tee -a exp >> rcv
+    done
+
+    $at_diff exp rcv >/dev/null
+}
+
+ovn_init_ic_db
+ovn_start az1
+ovn_start az2
+
+net_add n1
+
+sim_add hv1
+as hv1
+check ovs-vsctl add-br br-phys
+ovn_az_attach az1 n1 br-phys 192.168.1.1 16
+check ovs-vsctl -- add-port br-int hv1-vif1 \
+    -- set interface hv1-vif1 external-ids:iface-id=lsp1 \
+       options:tx_pcap=hv1/vif1-tx.pcap \
+       options:rxq_pcap=hv1/vif1-rx.pcap
+check ovs-vsctl set open . external-ids:ovn-is-interconn=true
+
+sim_add hv2
+as hv2
+check ovs-vsctl add-br br-phys
+ovn_az_attach az2 n1 br-phys 192.168.2.1 16
+check ovs-vsctl -- add-port br-int hv2-vif1 \
+    -- set interface hv2-vif1 external-ids:iface-id=lsp2 \
+       options:tx_pcap=hv2/vif1-tx.pcap \
+       options:rxq_pcap=hv2/vif1-rx.pcap
+check ovs-vsctl -- add-port br-int hv2-vif2 \
+    -- set interface hv2-vif2 external-ids:iface-id=lsp3 \
+       options:tx_pcap=hv2/vif2-tx.pcap \
+       options:rxq_pcap=hv2/vif2-rx.pcap
+check ovs-vsctl set open . external-ids:ovn-is-interconn=true
+
+AT_CHECK([ovn-ic-nbctl create Transit_Switch name=ts], [0], [ignore])
+check ovn_as az1 ovn-nbctl wait-until logical_switch ts
+check ovn_as az2 ovn-nbctl wait-until logical_switch ts
+
+ovn_as az1
+check ovn-nbctl lr-add lr1 \
+    -- lrp-add lr1 lr1-ts 00:00:00:01:00:01 42.42.42.1/24 4242::1/64 \
+    -- lrp-add lr1 lr1-ls1 00:00:00:01:01:00 43.43.43.1/24 4343::1/64\
+    -- lrp-set-gateway-chassis lr1-ts hv1
+check ovn-nbctl ls-add ls1 \
+    -- lsp-add ls1 ls1-lr1 \
+    -- lsp-set-addresses ls1-lr1 router \
+    -- lsp-set-type ls1-lr1 router \
+    -- lsp-set-options ls1-lr1 router-port=lr1-ls1 \
+    -- lsp-add ls1 lsp1
+check ovn-nbctl lsp-add ts ts-lr1 \
+    -- lsp-set-addresses ts-lr1 router \
+    -- lsp-set-type ts-lr1 router \
+    -- lsp-set-options ts-lr1 router-port=lr1-ts
+wait_for_ports_up
+
+ovn_as az2
+check ovn-nbctl lr-add lr2 \
+    -- lrp-add lr2 lr2-ts 00:00:00:02:00:01 42.42.42.2/24 4242::2/64 \
+    -- lrp-add lr2 lr2-ls2 00:00:00:02:01:00 44.44.44.1/24 4444::1/64 \
+    -- lrp-set-gateway-chassis lr2-ts hv2
+check ovn-nbctl ls-add ls2 \
+    -- lsp-add ls2 ls2-lr2 \
+    -- lsp-set-addresses ls2-lr2 router \
+    -- lsp-set-type ls2-lr2 router \
+    -- lsp-set-options ls2-lr2 router-port=lr2-ls2 \
+    -- lsp-add ls2 lsp2
+check ovn-nbctl lsp-add ts ts-lr2 \
+    -- lsp-set-addresses ts-lr2 router \
+    -- lsp-set-type ts-lr2 router \
+    -- lsp-set-options ts-lr2 router-port=lr2-ts
+
+check ovn-nbctl lr-add lr3 \
+    -- lrp-add lr3 lr3-ts 00:00:00:02:00:02 42.42.42.3/24 4242::3/64 \
+    -- lrp-add lr3 lr3-ls3 00:00:00:02:02:00 44.44.45.1/24 4445::1/64 \
+    -- lrp-set-gateway-chassis lr3-ts hv2
+check ovn-nbctl ls-add ls3 \
+    -- lsp-add ls3 ls3-lr3 \
+    -- lsp-set-addresses ls3-lr3 router \
+    -- lsp-set-type ls3-lr3 router \
+    -- lsp-set-options ls3-lr3 router-port=lr3-ls3 \
+    -- lsp-add ls3 lsp3
+check ovn-nbctl lsp-add ts ts-lr3 \
+    -- lsp-set-addresses ts-lr3 router \
+    -- lsp-set-type ts-lr3 router \
+    -- lsp-set-options ts-lr3 router-port=lr3-ts
+
+wait_for_ports_up
+
+dnl Enable IP multicast snooping and IP multicast relay.  Reports are
+dnl forwarded statically.
+ovn_as az1
+check ovn-nbctl set logical_switch ls1 other_config:mcast_snoop="true"
+check ovn-nbctl set Logical_Switch_Port ls1-lr1 options:mcast_flood_reports="true"
+check ovn-nbctl set logical_router lr1 options:mcast_relay="true"
+check ovn-nbctl set logical_router_port lr1-ts options:mcast_flood="true"
+check ovn-nbctl set logical_switch ts other_config:mcast_snoop="true"
+check ovn-nbctl set logical_switch_port ts-lr1 options:mcast_flood_reports="true"
+check ovn-nbctl set logical_switch_port ts-lr2 options:mcast_flood_reports="true"
+check ovn-nbctl set logical_switch_port ts-lr3 options:mcast_flood_reports="true"
+
+ovn_as az2
+check ovn-nbctl set logical_switch ls2 other_config:mcast_snoop="true"
+check ovn-nbctl set Logical_Switch_Port ls2-lr2 options:mcast_flood_reports="true"
+check ovn-nbctl set logical_router lr2 options:mcast_relay="true"
+check ovn-nbctl set logical_router_port lr2-ts options:mcast_flood="true"
+check ovn-nbctl set logical_switch ls3 other_config:mcast_snoop="true"
+check ovn-nbctl set Logical_Switch_Port ls3-lr3 options:mcast_flood_reports="true"
+check ovn-nbctl set logical_router lr3 options:mcast_relay="true"
+check ovn-nbctl set logical_router_port lr3-ts options:mcast_flood="true"
+check ovn-nbctl set logical_switch ts other_config:mcast_snoop="true"
+check ovn-nbctl set logical_switch_port ts-lr1 options:mcast_flood_reports="true"
+check ovn-nbctl set logical_switch_port ts-lr2 options:mcast_flood_reports="true"
+check ovn-nbctl set logical_switch_port ts-lr3 options:mcast_flood_reports="true"
+
+check ovn_as az1 ovn-nbctl --wait=hv sync
+check ovn_as az2 ovn-nbctl --wait=hv sync
+
+# Pre-populate the hypervisors' ARP tables so that we don't lose any
+# packets for ARP resolution (native tunneling doesn't queue packets
+# for ARP resolution).
+OVN_POPULATE_ARP
+
+# Inject IGMP Join for 239.0.1.68 on LSP1.
+send_igmp_v3_report hv1-vif1 hv1 \
+    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
+    $(ip_to_hex 239 0 1 68) 04 e9b9 \
+    /dev/null
+
+# Inject MLD Join for ff0a:dead:beef::1 on LSP1.
+send_mld_v2_report hv1-vif1 hv1 \
+    000000000001 10000000000000000000000000000001 \
+    ff0adeadbeef00000000000000000001 04 c0e4 \
+    /dev/null
+
+# Inject IGMP Join for 239.0.1.68 on LSP3.
+send_igmp_v3_report hv2-vif2 hv2 \
+    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
+    $(ip_to_hex 239 0 1 68) 04 e9b9 \
+    /dev/null
+
+# Inject MLD Join for ff0a:dead:beef::1 on LSP3.
+send_mld_v2_report hv2-vif2 hv2 \
+    000000000001 10000000000000000000000000000001 \
+    ff0adeadbeef00000000000000000001 04 c0e4 \
+    /dev/null
+
+# Send an IP multicast packet from LSP2, it should be forwarded
+# to lsp1 and lsp3.
+> expected_az1
+> expected_az2
+send_ip_multicast_pkt hv2-vif1 hv2 \
+    000000000001 01005e000144 \
+    $(ip_to_hex 44 44 44 2) $(ip_to_hex 239 0 1 68) 1e 20 7c6b 11 \
+    e518e518000aed350000
+store_ip_multicast_pkt \
+    000000010100 01005e000144 \
+    $(ip_to_hex 44 44 44 2) $(ip_to_hex 239 0 1 68) 1e 1e 7e6b 11 \
+    e518e518000aed350000 expected_az1
+store_ip_multicast_pkt \
+    000000020200 01005e000144 \
+    $(ip_to_hex 44 44 44 2) $(ip_to_hex 239 0 1 68) 1e 1e 7e6b 11 \
+    e518e518000aed350000 expected_az2
+
+send_ip6_multicast_pkt hv2-vif1 hv2 \
+    000000000001 333300000001 \
+    00100000000000000000000000000042 ff0adeadbeef00000000000000000001 \
+    000e 40 11 \
+    93407a69000e2b4e61736461640a
+store_ip6_multicast_pkt \
+    000000010100 333300000001 \
+    00100000000000000000000000000042 ff0adeadbeef00000000000000000001 \
+    000e 3e 11 \
+    93407a69000e2b4e61736461640a \
+    expected_az1
+store_ip6_multicast_pkt \
+    000000020200 333300000001 \
+    00100000000000000000000000000042 ff0adeadbeef00000000000000000001 \
+    000e 3e 11 \
+    93407a69000e2b4e61736461640a \
+    expected_az2
+
+OVS_WAIT_UNTIL(
+  [check_packets 'hv1/vif1-tx.pcap expected_az1' \
+                 'hv2/vif2-tx.pcap expected_az2'],
+  [$at_diff -F'^---' exp rcv])
+
+AT_CLEANUP
+])
+
 OVN_FOR_EACH_NORTHD([
 AT_SETUP([ECMP static routes])
 ovn_start