diff mbox series

[ovs-dev,ovn,v3] ovn-northd: Add IGMP Relay support

Message ID 1565611228-32122-1-git-send-email-dceara@redhat.com
State Superseded
Headers show
Series [ovs-dev,ovn,v3] ovn-northd: Add IGMP Relay support | expand

Commit Message

Dumitru Ceara Aug. 12, 2019, noon UTC
Add a new configuration option 'mcast_relay' to the Logical_Router:options
in the OVN Northbound database.

If a router is configured with 'mcast_relay' enabled then ovn-northd
will install Logical_Flows to allow IP multicast traffic to be routed
between Logical_Switches. The logical router will aggregate all IGMP
groups from attached logical switches and modify the routing pipeline in
the following way:
- Table S_ROUTER_IN_IP_INPUT: add flow allowing IP multicast traffic
  if mcast_relay is enabled on the datapath.
- Table S_ROUTER_IN_IP_ROUTING: add flow matching the group address,
  update TTL and set outport="<Multicast_Group> associated with the
  IGMP group". Continue to next table.
- Table S_ROUTER_IN_ARP_RESOLVE: bypass ARP resolve for IP multicast
  traffic and continue to next table.
- Table S_ROUTER_OUT_DELIVERY: add flow matching IP multicast traffic
  and set ETH.SRC to the MAC address of the logical port on which
  traffic is forwarded.

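For example (mirroring the new test case; the router name is arbitrary),
relay is enabled per logical router with:

    ovn-nbctl set logical_router rtr options:mcast_relay="true"
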
Signed-off-by: Dumitru Ceara <dceara@redhat.com>
Acked-by: Mark Michelson <mmichels@redhat.com>

---
v3:
- Address Mark's comment and move setting of the outport in the IP
  Routing stage.
- Update commit message.
- Fix some typos.
v2:
- Optimize flooding to multicast router ports.
- Fix check for source IP multicast in router pipeline.
- Use an enum for OVN_MCAST_*_KEY definitions to avoid hard to debug
  errors due to typos when adding new OVN_MCAST_*_KEY values.
- Fix ovn-northd man page for IGMP.
---
 NEWS                    |   1 +
 lib/mcast-group-index.h |  13 +-
 northd/ovn-northd.8.xml |  79 +++++++-
 northd/ovn-northd.c     | 504 ++++++++++++++++++++++++++++++++++++------------
 ovn-nb.xml              |   6 +
 tests/ovn.at            | 199 +++++++++++++++++--
 6 files changed, 651 insertions(+), 151 deletions(-)

Comments

Numan Siddique Aug. 16, 2019, 10:22 a.m. UTC | #1
On Mon, Aug 12, 2019 at 5:32 PM Dumitru Ceara <dceara@redhat.com> wrote:

> Add a new configuration option 'mcast_relay' to the Logical_Router:options
> in the OVN Northbound database.
>
> If a router is configured with 'mcast_relay' enabled then ovn-northd
> will install Logical_Flows to allow IP multicast traffic to be routed
> between Logical_Switches. The logical router will aggregate all IGMP
> groups from attached logical switches and modify the routing pipeline in
> the following way:
> - Table S_ROUTER_IN_IP_INPUT: add flow allowing IP multicast traffic
>   if mcast_relay is enabled on the datapath.
> - Table S_ROUTER_IN_IP_ROUTING: add flow matching the group address,
>   update TTL and set outport="<Multicast_Group> associated with the
>   IGMP group". Continue to next table.
> - Table S_ROUTER_IN_ARP_RESOLVE: bypass ARP resolve for IP multicast
>   traffic and continue to next table.
> - Table S_ROUTER_OUT_DELIVERY: add flow matching IP multicast traffic
>   and set ETH.SRC to the MAC address of the logical port on which
>   traffic is forwarded.
>
> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
> Acked-by: Mark Michelson <mmichels@redhat.com>
>

Hi Dumitru,

Just a minor comment. Please see below. Can you please update the patch ?
LGTM otherwise.

Thanks
Numan


>
> ---
> v3:
> - Address Mark's comment and move setting of the outport in the IP
>   Routing stage.
> - Update commit message.
> - Fix some typos.
> v2:
> - Optimize flooding to multicast router ports.
> - Fix check for source IP multicast in router pipeline.
> - Use an enum for OVN_MCAST_*_KEY definitions to avoid hard to debug
>   errors due to typos when adding new OVN_MCAST_*_KEY values.
> - Fix ovn-northd man page for IGMP.
> ---
>  NEWS                    |   1 +
>  lib/mcast-group-index.h |  13 +-
>  northd/ovn-northd.8.xml |  79 +++++++-
>  northd/ovn-northd.c     | 504 ++++++++++++++++++++++++++++++++++++------------
>  ovn-nb.xml              |   6 +
>  tests/ovn.at            | 199 +++++++++++++++++--
>  6 files changed, 651 insertions(+), 151 deletions(-)
>
> diff --git a/NEWS b/NEWS
> index f476984..73045d6 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -39,6 +39,7 @@ Post-v2.11.0
>         logical groups which results in tunnels only been formed between
>         members of the same transport zone(s).
>       * Support for new logical switch port type - 'virtual'.
> +     * Support for IGMP Snooping/Querier and Relay.
>     - New QoS type "linux-netem" on Linux.
>     - Added support for TLS Server Name Indication (SNI).
>
> diff --git a/lib/mcast-group-index.h b/lib/mcast-group-index.h
> index 15a1592..cb49ad7 100644
> --- a/lib/mcast-group-index.h
> +++ b/lib/mcast-group-index.h
> @@ -20,8 +20,17 @@ struct ovsdb_idl;
>
>  struct sbrec_datapath_binding;
>
> -#define OVN_MCAST_FLOOD_TUNNEL_KEY   65535
> -#define OVN_MCAST_UNKNOWN_TUNNEL_KEY (OVN_MCAST_FLOOD_TUNNEL_KEY - 1)
> +#define OVN_MIN_MULTICAST 32768
> +#define OVN_MAX_MULTICAST 65535
> +
> +enum ovn_mcast_tunnel_keys {
> +
> +    OVN_MCAST_FLOOD_TUNNEL_KEY = OVN_MIN_MULTICAST,
> +    OVN_MCAST_UNKNOWN_TUNNEL_KEY,
> +    OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY,
> +    OVN_MIN_IP_MULTICAST,
> +    OVN_MAX_IP_MULTICAST = OVN_MAX_MULTICAST,
> +};
>
>  struct ovsdb_idl_index *mcast_group_index_create(struct ovsdb_idl *);
>  const struct sbrec_multicast_group *
> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> index 6d2fbe3..d45bb15 100644
> --- a/northd/ovn-northd.8.xml
> +++ b/northd/ovn-northd.8.xml
> @@ -947,10 +947,40 @@ output;
>
>      <ul>
>        <li>
> -        A priority-100 flow that outputs all packets with an Ethernet broadcast
> +        A priority-100 flow that punts all IGMP packets to
> +        <code>ovn-controller</code> if IGMP snooping is enabled on the
> +        logical switch.
> +      </li>
> +
> +      <li>
> +        Priority-90 flows that forward registered IP multicast traffic to
> +        their corresponding multicast group, which <code>ovn-northd</code>
> +        creates based on learnt <ref table="IGMP_Group" db="OVN_Southbound"/>
> +        entries.  The flows also forward packets to the
> +        <code>MC_MROUTER_FLOOD</code> multicast group, which
> +        <code>ovn-northd</code> populates with all the logical ports that
> +        are connected to logical routers with
> +        <ref column="options" table="Logical_Router"/>:mcast_relay='true'.
> +      </li>
> +
> +      <li>
> +        A priority-85 flow that forwards all IP multicast traffic destined to
> +        224.0.0.X to the <code>MC_FLOOD</code> multicast group, which
> +        <code>ovn-northd</code> populates with all enabled logical ports.
> +      </li>
> +
> +      <li>
> +        A priority-80 flow that forwards all unregistered IP multicast traffic
> +        to the <code>MC_MROUTER_FLOOD</code> multicast group, if any.
> +        Otherwise the flow drops all unregistered IP multicast packets.  This
> +        flow is added only if <ref column="other_config"
> +        table="Logical_Switch"/>:mcast_flood_unregistered='false'.
> +      </li>
> +
> +      <li>
> +        A priority-70 flow that outputs all packets with an Ethernet broadcast
>          or multicast <code>eth.dst</code> to the <code>MC_FLOOD</code>
> -        multicast group, which <code>ovn-northd</code> populates with all
> -        enabled logical ports.
> +        multicast group.
>        </li>
>
>        <li>
> @@ -1228,6 +1258,14 @@ output;
>
>        <li>
>          <p>
> +          A priority-95 flow allows IP multicast traffic if
> +          <ref column="options" table="Logical_Router"/>:mcast_relay='true',
> +          otherwise drops it.
> +        </p>
> +      </li>
> +
> +      <li>
> +        <p>
>            ICMP echo reply.  These flows reply to ICMP echo requests received
>            for the router's IP address.  Let <var>A</var> be an IP address
>            owned by a router port.  Then, for each <var>A</var> that is
> @@ -1941,6 +1979,16 @@ output;
>      <ul>
>        <li>
>          <p>
> +          Priority-500 flows that match IP multicast traffic destined to
> +          groups registered on any of the attached switches and set
> +          <code>outport</code> to the associated multicast group that will
> +          eventually flood the traffic to all interested attached logical
> +          switches. The flows also decrement TTL.
> +        </p>
> +      </li>
> +
> +      <li>
> +        <p>
>            For distributed logical routers where one of the logical router
>            ports specifies a <code>redirect-chassis</code>, a priority-400
>            logical flow for each ip source/destination couple that matches the
> @@ -2074,6 +2122,15 @@ next;
>      <ul>
>        <li>
>          <p>
> +          A priority-500 flow that matches IP multicast traffic that was
> +          allowed in the routing pipeline. For this kind of traffic the
> +          <code>outport</code> was already set so the flow just advances to
> +          the next table.
> +        </p>
> +      </li>
> +
> +      <li>
> +        <p>
>            For distributed logical routers where one of the logical router
>            ports specifies a <code>redirect-chassis</code>, a priority-400
>            logical flow with match <code>REGBIT_DISTRIBUTED_NAT == 1</code>
> @@ -2641,9 +2698,19 @@ clone {
>      <h3>Egress Table 3: Delivery</h3>
>
>      <p>
> -      Packets that reach this table are ready for delivery.  It contains
> -      priority-100 logical flows that match packets on each enabled logical
> -      router port, with action <code>output;</code>.
> +      Packets that reach this table are ready for delivery.  It contains:
> +      <ul>
> +        <li>
> +          Priority-110 logical flows that match IP multicast packets on each
> +          enabled logical router port and modify the Ethernet source address
> +          of the packets to the Ethernet address of the port and then execute
> +          action <code>output;</code>.
> +        </li>
> +        <li>
> +          Priority-100 logical flows that match packets on each enabled
> +          logical router port, with action <code>output;</code>.
> +        </li>
> +      </ul>
>      </p>
>
>  </manpage>
> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> index e6953a4..9ee9230 100644
> --- a/northd/ovn-northd.c
> +++ b/northd/ovn-northd.c
> @@ -433,32 +433,52 @@ struct ipam_info {
>      bool mac_only;
>  };
>
> -#define OVN_MIN_MULTICAST 32768
> -#define OVN_MAX_MULTICAST OVN_MCAST_FLOOD_TUNNEL_KEY
> -BUILD_ASSERT_DECL(OVN_MIN_MULTICAST < OVN_MAX_MULTICAST);
> -
> -#define OVN_MIN_IP_MULTICAST OVN_MIN_MULTICAST
> -#define OVN_MAX_IP_MULTICAST (OVN_MCAST_UNKNOWN_TUNNEL_KEY - 1)
> -BUILD_ASSERT_DECL(OVN_MAX_IP_MULTICAST >= OVN_MIN_MULTICAST);
> -
>  /*
>   * Multicast snooping and querier per datapath configuration.
>   */
> +struct mcast_switch_info {
> +
> +    bool enabled;               /* True if snooping enabled. */
> +    bool querier;               /* True if querier enabled. */
> +    bool flood_unregistered;    /* True if unregistered multicast should be
> +                                 * flooded.
> +                                 */
> +    bool flood_relay;           /* True if the switch is connected to a
> +                                 * multicast router and unregistered multicast
> +                                 * should be flooded to the mrouter. Only
> +                                 * applicable if flood_unregistered == false.
> +                                 */
> +
> +    int64_t table_size;         /* Max number of IP multicast groups. */
> +    int64_t idle_timeout;       /* Timeout after which an idle group is
> +                                 * flushed.
> +                                 */
> +    int64_t query_interval;     /* Interval between multicast queries. */
> +    char *eth_src;              /* ETH src address of the multicast queries. */
> +    char *ipv4_src;             /* IP src address of the multicast queries. */
> +    int64_t query_max_response; /* Expected time after which reports should
> +                                 * be received for queries that were sent out.
> +                                 */
> +
> +    uint32_t active_flows;      /* Current number of active IP multicast
> +                                 * flows.
> +                                 */
> +};
> +
> +struct mcast_router_info {
> +    bool relay; /* True if the router should relay IP multicast. */
> +};
> +
>  struct mcast_info {
> -    bool enabled;
> -    bool querier;
> -    bool flood_unregistered;
> -
> -    int64_t table_size;
> -    int64_t idle_timeout;
> -    int64_t query_interval;
> -    char *eth_src;
> -    char *ipv4_src;
> -    int64_t  query_max_response;
> -
> -    struct hmap group_tnlids;
> -    uint32_t group_tnlid_hint;
> -    uint32_t active_flows;
> +
> +    struct hmap group_tnlids;  /* Group tunnel IDs in use on this DP. */
> +    uint32_t group_tnlid_hint; /* Hint for allocating next group tunnel ID. */
> +    struct ovs_list groups;    /* List of groups learnt on this DP. */
> +
> +    union {
> +        struct mcast_switch_info sw;  /* Switch specific multicast info. */
> +        struct mcast_router_info rtr; /* Router specific multicast info. */
> +    };
>  };
>
>  static uint32_t
> @@ -559,6 +579,7 @@ ovn_datapath_create(struct hmap *datapaths, const struct uuid *key,
>  }
>
>  static void ovn_ls_port_group_destroy(struct hmap *nb_pgs);
> +static void destroy_mcast_info_for_datapath(struct ovn_datapath *od);
>
>  static void
>  ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
> @@ -572,12 +593,7 @@ ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
>          bitmap_free(od->ipam_info.allocated_ipv4s);
>          free(od->router_ports);
>          ovn_ls_port_group_destroy(&od->nb_pgs);
> -
> -        if (od->nbs) {
> -            free(od->mcast_info.eth_src);
> -            free(od->mcast_info.ipv4_src);
> -            destroy_tnlids(&od->mcast_info.group_tnlids);
> -        }
> +        destroy_mcast_info_for_datapath(od);
>
>          free(od);
>      }
> @@ -714,23 +730,28 @@ init_ipam_info_for_datapath(struct ovn_datapath *od)
>  }
>
>  static void
> -init_mcast_info_for_datapath(struct ovn_datapath *od)
> +init_mcast_info_for_router_datapath(struct ovn_datapath *od)
>  {
> -    if (!od->nbs) {
> -        return;
> -    }
> +    struct mcast_router_info *mcast_rtr_info = &od->mcast_info.rtr;
>
> -    struct mcast_info *mcast_info = &od->mcast_info;
> +    mcast_rtr_info->relay = smap_get_bool(&od->nbr->options, "mcast_relay",
> +                                          false);
> +}
>
> -    mcast_info->enabled =
> +static void
> +init_mcast_info_for_switch_datapath(struct ovn_datapath *od)
> +{
> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
> +
> +    mcast_sw_info->enabled =
>          smap_get_bool(&od->nbs->other_config, "mcast_snoop", false);
> -    mcast_info->querier =
> +    mcast_sw_info->querier =
>          smap_get_bool(&od->nbs->other_config, "mcast_querier", true);
> -    mcast_info->flood_unregistered =
> +    mcast_sw_info->flood_unregistered =
>          smap_get_bool(&od->nbs->other_config, "mcast_flood_unregistered",
>                        false);
>
> -    mcast_info->table_size =
> +    mcast_sw_info->table_size =
>          smap_get_ullong(&od->nbs->other_config, "mcast_table_size",
>                          OVN_MCAST_DEFAULT_MAX_ENTRIES);
>
> @@ -742,54 +763,94 @@ init_mcast_info_for_datapath(struct ovn_datapath *od)
>      } else if (idle_timeout > OVN_MCAST_MAX_IDLE_TIMEOUT_S) {
>          idle_timeout = OVN_MCAST_MAX_IDLE_TIMEOUT_S;
>      }
> -    mcast_info->idle_timeout = idle_timeout;
> +    mcast_sw_info->idle_timeout = idle_timeout;
>
>      uint32_t query_interval =
>          smap_get_ullong(&od->nbs->other_config, "mcast_query_interval",
> -                        mcast_info->idle_timeout / 2);
> +                        mcast_sw_info->idle_timeout / 2);
>      if (query_interval < OVN_MCAST_MIN_QUERY_INTERVAL_S) {
>          query_interval = OVN_MCAST_MIN_QUERY_INTERVAL_S;
>      } else if (query_interval > OVN_MCAST_MAX_QUERY_INTERVAL_S) {
>          query_interval = OVN_MCAST_MAX_QUERY_INTERVAL_S;
>      }
> -    mcast_info->query_interval = query_interval;
> +    mcast_sw_info->query_interval = query_interval;
>
> -    mcast_info->eth_src =
> +    mcast_sw_info->eth_src =
>          nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_eth_src"));
> -    mcast_info->ipv4_src =
> +    mcast_sw_info->ipv4_src =
>          nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_ip4_src"));
>
> -    mcast_info->query_max_response =
> +    mcast_sw_info->query_max_response =
>          smap_get_ullong(&od->nbs->other_config, "mcast_query_max_response",
>                          OVN_MCAST_DEFAULT_QUERY_MAX_RESPONSE_S);
>
> -    hmap_init(&mcast_info->group_tnlids);
> -    mcast_info->group_tnlid_hint = OVN_MIN_IP_MULTICAST;
> -    mcast_info->active_flows = 0;
> +    mcast_sw_info->active_flows = 0;
> +}
> +
> +static void
> +init_mcast_info_for_datapath(struct ovn_datapath *od)
> +{
> +    if (!od->nbr && !od->nbs) {
> +        return;
> +    }
> +
> +    hmap_init(&od->mcast_info.group_tnlids);
> +    od->mcast_info.group_tnlid_hint = OVN_MIN_IP_MULTICAST;
> +    ovs_list_init(&od->mcast_info.groups);
> +
> +    if (od->nbs) {
> +        init_mcast_info_for_switch_datapath(od);
> +    } else {
> +        init_mcast_info_for_router_datapath(od);
> +    }
> +}
> +
> +static void
> +destroy_mcast_info_for_switch_datapath(struct ovn_datapath *od)
> +{
> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
> +
> +    free(mcast_sw_info->eth_src);
> +    free(mcast_sw_info->ipv4_src);
> +}
> +
> +static void
> +destroy_mcast_info_for_datapath(struct ovn_datapath *od)
> +{
> +    if (!od->nbr && !od->nbs) {
> +        return;
> +    }
> +
> +    if (od->nbs) {
> +        destroy_mcast_info_for_switch_datapath(od);
> +    }
> +
> +    destroy_tnlids(&od->mcast_info.group_tnlids);
>  }
>
>  static void
> -store_mcast_info_for_datapath(const struct sbrec_ip_multicast *sb,
> -                              struct ovn_datapath *od)
> +store_mcast_info_for_switch_datapath(const struct sbrec_ip_multicast *sb,
> +                                     struct ovn_datapath *od)
>  {
> -    struct mcast_info *mcast_info = &od->mcast_info;
> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>
>      sbrec_ip_multicast_set_datapath(sb, od->sb);
> -    sbrec_ip_multicast_set_enabled(sb, &mcast_info->enabled, 1);
> -    sbrec_ip_multicast_set_querier(sb, &mcast_info->querier, 1);
> -    sbrec_ip_multicast_set_table_size(sb, &mcast_info->table_size, 1);
> -    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_info->idle_timeout, 1);
> +    sbrec_ip_multicast_set_enabled(sb, &mcast_sw_info->enabled, 1);
> +    sbrec_ip_multicast_set_querier(sb, &mcast_sw_info->querier, 1);
> +    sbrec_ip_multicast_set_table_size(sb, &mcast_sw_info->table_size, 1);
> +    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_sw_info->idle_timeout, 1);
>      sbrec_ip_multicast_set_query_interval(sb,
> -                                          &mcast_info->query_interval, 1);
> +                                          &mcast_sw_info->query_interval, 1);
>      sbrec_ip_multicast_set_query_max_resp(sb,
> -                                          &mcast_info->query_max_response, 1);
> +                                          &mcast_sw_info->query_max_response,
> +                                          1);
>
> -    if (mcast_info->eth_src) {
> -        sbrec_ip_multicast_set_eth_src(sb, mcast_info->eth_src);
> +    if (mcast_sw_info->eth_src) {
> +        sbrec_ip_multicast_set_eth_src(sb, mcast_sw_info->eth_src);
>      }
>
> -    if (mcast_info->ipv4_src) {
> -        sbrec_ip_multicast_set_ip4_src(sb, mcast_info->ipv4_src);
> +    if (mcast_sw_info->ipv4_src) {
> +        sbrec_ip_multicast_set_ip4_src(sb, mcast_sw_info->ipv4_src);
>      }
>  }
>
> @@ -906,6 +967,7 @@ join_datapaths(struct northd_context *ctx, struct hmap *datapaths,
>                                       NULL, nbr, NULL);
>              ovs_list_push_back(nb_only, &od->list);
>          }
> +        init_mcast_info_for_datapath(od);
>          ovs_list_push_back(lr_list, &od->lr_list);
>      }
>  }
> @@ -1999,6 +2061,13 @@ join_logical_ports(struct northd_context *ctx,
>                      break;
>                  }
>              }
> +
> +            /* If the router is multicast enabled then set relay on the switch
> +             * datapath.
> +             */
> +            if (peer->od && peer->od->mcast_info.rtr.relay) {
> +                op->od->mcast_info.sw.flood_relay = true;
> +            }
>          } else if (op->nbrp && op->nbrp->peer && !op->derived) {
>              struct ovn_port *peer = ovn_port_find(ports, op->nbrp->peer);
>              if (peer) {
> @@ -2846,6 +2915,10 @@ struct multicast_group {
>  static const struct multicast_group mc_flood =
>      { MC_FLOOD, OVN_MCAST_FLOOD_TUNNEL_KEY };
>
> +#define MC_MROUTER_FLOOD "_MC_mrouter_flood"
> +static const struct multicast_group mc_mrouter_flood =
> +    { MC_MROUTER_FLOOD, OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY };
> +
>  #define MC_UNKNOWN "_MC_unknown"
>  static const struct multicast_group mc_unknown =
>      { MC_UNKNOWN, OVN_MCAST_UNKNOWN_TUNNEL_KEY };
> @@ -2955,7 +3028,8 @@ ovn_multicast_update_sbrec(const struct ovn_multicast *mc,
>   */
>  struct ovn_igmp_group_entry {
>      struct ovs_list list_node; /* Linkage in the list of entries. */
> -    const struct sbrec_igmp_group *sb;
> +    size_t n_ports;
> +    struct ovn_port **ports;
>  };
>
>  /*
> @@ -2964,12 +3038,13 @@ struct ovn_igmp_group_entry {
>   */
>  struct ovn_igmp_group {
>      struct hmap_node hmap_node; /* Index on 'datapath' and 'address'. */
> +    struct ovs_list list_node;  /* Linkage in the per-dp igmp group list. */
>
>      struct ovn_datapath *datapath;
>      struct in6_addr address; /* Multicast IPv6-mapped-IPv4 or IPv4 address. */
>      struct multicast_group mcgroup;
>
> -    struct ovs_list sb_entries; /* List of SB entries for this group. */
> +    struct ovs_list entries; /* List of SB entries for this group. */
>  };
>
>  static uint32_t
> @@ -2997,77 +3072,120 @@ ovn_igmp_group_find(struct hmap *igmp_groups,
>      return NULL;
>  }
>
> -static void
> +static struct ovn_igmp_group *
>  ovn_igmp_group_add(struct northd_context *ctx, struct hmap *igmp_groups,
>                     struct ovn_datapath *datapath,
> -                   const struct sbrec_igmp_group *sb_igmp_group)
> +                   const struct in6_addr *address,
> +                   const char *address_s)
>  {
> -    struct in6_addr group_address;
> -    ovs_be32 ipv4;
> -
> -    if (ip_parse(sb_igmp_group->address, &ipv4)) {
> -        group_address = in6_addr_mapped_ipv4(ipv4);
> -    } else if (!ipv6_parse(sb_igmp_group->address, &group_address)) {
> -        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
> -        VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
> -                     sb_igmp_group->address);
> -        return;
> -    }
> -
>      struct ovn_igmp_group *igmp_group =
> -        ovn_igmp_group_find(igmp_groups, datapath, &group_address);
> +        ovn_igmp_group_find(igmp_groups, datapath, address);
>
>      if (!igmp_group) {
>          igmp_group = xmalloc(sizeof *igmp_group);
>
>          const struct sbrec_multicast_group *mcgroup =
> -            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp,
> -                               sb_igmp_group->address, datapath->sb);
> +            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp, address_s,
> +                               datapath->sb);
>
>          igmp_group->datapath = datapath;
> -        igmp_group->address = group_address;
> +        igmp_group->address = *address;
>          if (mcgroup) {
>              igmp_group->mcgroup.key = mcgroup->tunnel_key;
>              add_tnlid(&datapath->mcast_info.group_tnlids, mcgroup->tunnel_key);
>          } else {
>              igmp_group->mcgroup.key = 0;
>          }
> -        igmp_group->mcgroup.name = sb_igmp_group->address;
> -        ovs_list_init(&igmp_group->sb_entries);
> +        igmp_group->mcgroup.name = address_s;
> +        ovs_list_init(&igmp_group->entries);
>
>          hmap_insert(igmp_groups, &igmp_group->hmap_node,
> -                    ovn_igmp_group_hash(datapath, &group_address));
> +                    ovn_igmp_group_hash(datapath, address));
> +        ovs_list_push_back(&datapath->mcast_info.groups,
> +                           &igmp_group->list_node);
> +    }
> +
> +    return igmp_group;
> +}
> +
> +static bool
> +ovn_igmp_group_get_address(const struct sbrec_igmp_group *sb_igmp_group,
> +                           struct in6_addr *address)
> +{
> +    ovs_be32 ipv4;
> +
> +    if (ip_parse(sb_igmp_group->address, &ipv4)) {
> +        *address = in6_addr_mapped_ipv4(ipv4);
> +        return true;
> +    }
> +    if (!ipv6_parse(sb_igmp_group->address, address)) {
> +        return false;
>      }
> +    return true;
> +}
>
> +static struct ovn_port **
> +ovn_igmp_group_get_ports(const struct sbrec_igmp_group *sb_igmp_group,
> +                         size_t *n_ports, struct hmap *ovn_ports)
> +{
> +    struct ovn_port **ports = xmalloc(sb_igmp_group->n_ports * sizeof *ports);
> +
> +     *n_ports = 0;
> +     for (size_t i = 0; i < sb_igmp_group->n_ports; i++) {
> +        ports[(*n_ports)] =
> +            ovn_port_find(ovn_ports, sb_igmp_group->ports[i]->logical_port);
> +        if (ports[(*n_ports)]) {
> +            (*n_ports)++;
> +        }
> +    }
> +
> +    return ports;
> +}
> +
> +static void
> +ovn_igmp_group_add_entry(struct ovn_igmp_group *igmp_group,
> +                         struct ovn_port **ports, size_t n_ports)
> +{
>      struct ovn_igmp_group_entry *entry = xmalloc(sizeof *entry);
>
> -    entry->sb = sb_igmp_group;
> -    ovs_list_push_back(&igmp_group->sb_entries , &entry->list_node);
> +    entry->ports = ports;
> +    entry->n_ports = n_ports;
> +    ovs_list_push_back(&igmp_group->entries, &entry->list_node);
> +}
> +
> +static void
> +ovn_igmp_group_destroy_entry(struct ovn_igmp_group_entry *entry)
> +{
> +    free(entry->ports);
> +}
> +
> +static bool
> +ovn_igmp_group_allocate_id(struct ovn_igmp_group *igmp_group)
> +{
> +    if (igmp_group->mcgroup.key == 0) {
> +        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
> +        igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
> +    }
> +
> +    if (igmp_group->mcgroup.key == 0) {
> +        return false;
> +    }
> +
> +    return true;
>  }
>
>  static void
>  ovn_igmp_group_aggregate_ports(struct ovn_igmp_group *igmp_group,
> -                               struct hmap *ovn_ports,
>                                 struct hmap *mcast_groups)
>  {
>      struct ovn_igmp_group_entry *entry;
>
> -    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
> -        size_t n_oports = 0;
> -        struct ovn_port **oports =
> -            xmalloc(entry->sb->n_ports * sizeof *oports);
> -
> -        for (size_t i = 0; i < entry->sb->n_ports; i++) {
> -            oports[n_oports] =
> -                ovn_port_find(ovn_ports, entry->sb->ports[i]->logical_port);
> -            if (oports[n_oports]) {
> -                n_oports++;
> -            }
> -        }
> -
> +    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
>          ovn_multicast_add_ports(mcast_groups, igmp_group->datapath,
> -                                &igmp_group->mcgroup, oports, n_oports);
> -        free(oports);
> +                                &igmp_group->mcgroup, entry->ports,
> +                                entry->n_ports);
> +
> +        ovn_igmp_group_destroy_entry(entry);
>          free(entry);
>      }
>  }
> @@ -3079,10 +3197,12 @@ ovn_igmp_group_destroy(struct hmap *igmp_groups,
>      if (igmp_group) {
>          struct ovn_igmp_group_entry *entry;
>
> -        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
> +        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
> +            ovn_igmp_group_destroy_entry(entry);
>              free(entry);
>          }
>          hmap_remove(igmp_groups, &igmp_group->hmap_node);
> +        ovs_list_remove(&igmp_group->list_node);
>          free(igmp_group);
>      }
>  }
> @@ -5282,7 +5402,9 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>              continue;
>          }
>
> -        if (od->mcast_info.enabled) {
> +        struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
> +
> +        if (mcast_sw_info->enabled) {
>              /* Punt IGMP traffic to controller. */
>              ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 100,
>                            "ip4 && ip.proto == 2", "igmp;");
> @@ -5295,9 +5417,16 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>                            "outport = \""MC_FLOOD"\"; output;");
>
>              /* Drop unregistered IP multicast if not allowed. */
> -            if (!od->mcast_info.flood_unregistered) {
> -                ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
> -                              "ip4 && ip4.mcast", "drop;");
> +            if (!mcast_sw_info->flood_unregistered) {
> +                /* Forward unregistered IP multicast to mrouter (if any). */
> +                if (mcast_sw_info->flood_relay) {
> +                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
> +                                  "ip4 && ip4.mcast",
> +                                  "outport = \""MC_MROUTER_FLOOD"\"; output;");
> +                } else {
> +                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
> +                                  "ip4 && ip4.mcast", "drop;");
> +                }
>              }
>          }
>
> @@ -5314,18 +5443,26 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>              continue;
>          }
>
> -        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
> +        struct mcast_switch_info *mcast_sw_info =
> +            &igmp_group->datapath->mcast_info.sw;
>
> -        if (mcast_info->active_flows >= mcast_info->table_size) {
> +        if (mcast_sw_info->active_flows >= mcast_sw_info->table_size) {
>              continue;
>          }
> -        mcast_info->active_flows++;
> +        mcast_sw_info->active_flows++;
>
>          ds_clear(&match);
>          ds_clear(&actions);
>
>          ds_put_format(&match, "eth.mcast && ip4 && ip4.dst == %s ",
>                        igmp_group->mcgroup.name);
> +        /* Also flood traffic to all multicast routers with relay enabled. */
> +        if (mcast_sw_info->flood_relay) {
> +            ds_put_cstr(&actions,
> +                        "clone { "
> +                            "outport = \""MC_MROUTER_FLOOD "\"; output; "
> +                        "};");
> +        }
>          ds_put_format(&actions, "outport = \"%s\"; output; ",
>                        igmp_group->mcgroup.name);
>
> @@ -6205,7 +6342,7 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>           * source or destination, and zero network source or destination
>           * (priority 100). */
>          ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 100,
> -                      "ip4.mcast || "
> +                      "ip4.src[28..31] == 0xe ||"
>

Does it make sense to add another predicate for the source mcast addr ?
I am fine either way.
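
For reference, such a predicate would presumably be registered next to the
existing ones in lib/logical-fields.c, along these lines (purely
illustrative; "ip4.src_mcast" is a made-up name, not an existing OVN
predicate):

    expr_symtab_add_predicate(symtab, "ip4.src_mcast",
                              "ip4.src[28..31] == 0xe");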



>                        "ip4.src == 255.255.255.255 || "
>                        "ip4.src == 127.0.0.0/8 || "
>                        "ip4.dst == 127.0.0.0/8 || "
> @@ -6213,6 +6350,16 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>                        "ip4.dst == 0.0.0.0/8",
>                        "drop;");
>
> +        /* Allow multicast if relay enabled (priority 95). */
> +        ds_clear(&actions);
> +        if (od->mcast_info.rtr.relay) {
> +            ds_put_cstr(&actions, "next;");
> +        } else {
> +            ds_put_cstr(&actions, "drop;");
> +        }
> +        ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 95,
> +                      "ip4.dst[28..31] == 0xe", ds_cstr(&actions));
>

Since ip4.mcast is a predicate for "ip4.dst[28..31] == 0xe", can you please
use ip4.mcast here?
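
Something along these lines, keeping the actions as in the patch (just a
sketch of the suggested simplification, not tested):

    ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 95,
                  "ip4.mcast", ds_cstr(&actions));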



> +
>          /* ARP reply handling.  Use ARP replies to populate the logical
>           * router's ARP table. */
>          ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "arp.op == 2",
> @@ -7483,6 +7630,27 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>          }
>      }
>
> +    /* IP Multicast lookup. Here we set the output port, adjust TTL and
> +     * advance to next table (priority 500).
> +     */
> +    HMAP_FOR_EACH (od, key_node, datapaths) {
> +        if (!od->nbr || !od->mcast_info.rtr.relay) {
> +            continue;
> +        }
> +        struct ovn_igmp_group *igmp_group;
> +
> +        LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
> +            ds_clear(&match);
> +            ds_clear(&actions);
> +            ds_put_format(&match, "ip4 && ip4.dst == %s ",
> +                          igmp_group->mcgroup.name);
> +            ds_put_format(&actions, "outport = \"%s\"; ip.ttl--; next;",
> +                          igmp_group->mcgroup.name);
> +            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 500,
> +                          ds_cstr(&match), ds_cstr(&actions));
> +        }
> +    }
> +
>      /* Logical router ingress table 8: Policy.
>       *
>       * A packet that arrives at this table is an IP packet that should be
> @@ -7513,10 +7681,24 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>
>      /* Local router ingress table 9: ARP Resolution.
>       *
> -     * Any packet that reaches this table is an IP packet whose next-hop IP
> -     * address is in reg0. (ip4.dst is the final destination.) This table
> -     * resolves the IP address in reg0 into an output port in outport and an
> -     * Ethernet address in eth.dst. */
> +     * Multicast packets already have the outport set so just advance to next
> +     * table (priority 500). */
> +    HMAP_FOR_EACH (od, key_node, datapaths) {
> +        if (!od->nbr) {
> +            continue;
> +        }
> +
> +        ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 500,
> +                      "ip4.mcast", "next;");
> +    }
> +
> +    /* Local router ingress table 9: ARP Resolution.
> +     *
> +     * Any unicast packet that reaches this table is an IP packet whose
> +     * next-hop IP address is in reg0. (ip4.dst is the final destination.)
> +     * This table resolves the IP address in reg0 into an output port in
> +     * outport and an Ethernet address in eth.dst.
> +     */
>      HMAP_FOR_EACH (op, key_node, ports) {
>          if (op->nbsp && !lsp_is_enabled(op->nbsp)) {
>              continue;
> @@ -7998,9 +8180,13 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>          ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 0, "1", "output;");
>      }
>
> -    /* Logical router egress table 1: Delivery (priority 100).
> +    /* Logical router egress table 1: Delivery (priority 100-110).
>       *
> -     * Priority 100 rules deliver packets to enabled logical ports. */
> +     * Priority 100 rules deliver packets to enabled logical ports.
> +     * Priority 110 rules match multicast packets and update the source
> +     * mac before delivering to enabled logical ports. IP multicast traffic
> +     * bypasses S_ROUTER_IN_IP_ROUTING route lookups.
> +     */
>      HMAP_FOR_EACH (op, key_node, ports) {
>          if (!op->nbrp) {
>              continue;
> @@ -8020,6 +8206,19 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>              continue;
>          }
>
> +        /* If multicast relay is enabled then also adjust source mac for IP
> +         * multicast traffic.
> +         */
> +        if (op->od->mcast_info.rtr.relay) {
> +            ds_clear(&match);
> +            ds_clear(&actions);
> +            ds_put_format(&match, "ip4.mcast && outport == %s", op->json_key);
> +            ds_put_format(&actions, "eth.src = %s; output;",
> +                          op->lrp_networks.ea_s);
> +            ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 110,
> +                        ds_cstr(&match), ds_cstr(&actions));
> +        }
> +
>          ds_clear(&match);
>          ds_put_format(&match, "outport == %s", op->json_key);
>          ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 100,
> @@ -8570,7 +8769,7 @@ build_ip_mcast(struct northd_context *ctx, struct hmap *datapaths)
>          if (!ip_mcast) {
>              ip_mcast = sbrec_ip_multicast_insert(ctx->ovnsb_txn);
>          }
> -        store_mcast_info_for_datapath(ip_mcast, od);
> +        store_mcast_info_for_switch_datapath(ip_mcast, od);
>      }
>
>      /* Delete southbound records without northbound matches. */
> @@ -8602,6 +8801,14 @@ build_mcast_groups(struct northd_context *ctx,
>
>          if (lsp_is_enabled(op->nbsp)) {
>              ovn_multicast_add(mcast_groups, &mc_flood, op);
> +
> +            /* If this port is connected to a multicast router then add it
> +             * to the MC_MROUTER_FLOOD group.
> +             */
> +            if (op->od->mcast_info.sw.flood_relay && op->peer &&
> +                    op->peer->od && op->peer->od->mcast_info.rtr.relay) {
> +                ovn_multicast_add(mcast_groups, &mc_mrouter_flood, op);
> +            }
>          }
>      }
>
> @@ -8624,10 +8831,61 @@ build_mcast_groups(struct northd_context *ctx,
>              continue;
>          }
>
> +        struct in6_addr group_address;
> +        if (!ovn_igmp_group_get_address(sb_igmp, &group_address)) {
> +            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
> +            VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
> +                         sb_igmp->address);
> +            continue;
> +        }
> +
>          /* Add the IGMP group entry. Will also try to allocate an ID for it
>           * if the multicast group already exists.
>           */
> -        ovn_igmp_group_add(ctx, igmp_groups, od, sb_igmp);
> +        struct ovn_igmp_group *igmp_group =
> +            ovn_igmp_group_add(ctx, igmp_groups, od, &group_address,
> +                               sb_igmp->address);
> +
> +        /* Extract the IGMP group ports from the SB entry and store them
> +         * in the IGMP group.
> +         */
> +        size_t n_igmp_ports;
> +        struct ovn_port **igmp_ports =
> +            ovn_igmp_group_get_ports(sb_igmp, &n_igmp_ports, ports);
> +        ovn_igmp_group_add_entry(igmp_group, igmp_ports, n_igmp_ports);
> +    }
> +
> +    /* Build IGMP groups for multicast routers with relay enabled. The router
> +     * IGMP groups are based on the groups learnt by their multicast enabled
> +     * peers.
> +     */
> +    struct ovn_datapath *od;
> +    HMAP_FOR_EACH (od, key_node, datapaths) {
> +
> +        if (ovs_list_is_empty(&od->mcast_info.groups)) {
> +            continue;
> +        }
> +
> +        for (size_t i = 0; i < od->n_router_ports; i++) {
> +            struct ovn_port *router_port = od->router_ports[i]->peer;
> +
> +            if (!router_port || !router_port->od ||
> +                    !router_port->od->mcast_info.rtr.relay) {
> +                continue;
> +            }
> +
> +            struct ovn_igmp_group *igmp_group;
> +            LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
> +                struct ovn_igmp_group *igmp_group_rtr =
> +                    ovn_igmp_group_add(ctx, igmp_groups, router_port->od,
> +                                       &igmp_group->address,
> +                                       igmp_group->mcgroup.name);
> +                struct ovn_port **router_igmp_ports =
> +                    xmalloc(sizeof *router_igmp_ports);
> +                router_igmp_ports[0] = router_port;
> +                ovn_igmp_group_add_entry(igmp_group_rtr, router_igmp_ports, 1);
> +            }
> +        }
>      }
>
>      /* Walk the aggregated IGMP groups and allocate IDs for new entries.
> @@ -8635,21 +8893,17 @@ build_mcast_groups(struct northd_context *ctx,
>       */
>      struct ovn_igmp_group *igmp_group, *igmp_group_next;
>      HMAP_FOR_EACH_SAFE (igmp_group, igmp_group_next, hmap_node, igmp_groups) {
> -        if (igmp_group->mcgroup.key == 0) {
> -            struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
> -            igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
> -        }
>
> -        /* If we ran out of keys just destroy the entry. */
> -        if (igmp_group->mcgroup.key == 0) {
> +        if (!ovn_igmp_group_allocate_id(igmp_group)) {
> +            /* If we ran out of keys just destroy the entry. */
>              ovn_igmp_group_destroy(igmp_groups, igmp_group);
>              continue;
>          }
>
> -        /* Aggregate the ports from all SB entries corresponding to this
> +        /* Aggregate the ports from all entries corresponding to this
>           * group.
>           */
> -        ovn_igmp_group_aggregate_ports(igmp_group, ports, mcast_groups);
> +        ovn_igmp_group_aggregate_ports(igmp_group, mcast_groups);
>      }
>  }
>
> diff --git a/ovn-nb.xml b/ovn-nb.xml
> index f5f10a5..db8cc20 100644
> --- a/ovn-nb.xml
> +++ b/ovn-nb.xml
> @@ -1526,6 +1526,12 @@
>            address.
>          </p>
>        </column>
> +      <column name="options" key="mcast_relay" type='{"type": "boolean"}'>
> +        <p>
> +          Enables/disables IP multicast relay between logical switches
> +          connected to the logical router. Default: False.
> +        </p>
> +      </column>
>      </group>
>
>      <group title="Common Columns">
> diff --git a/tests/ovn.at b/tests/ovn.at
> index 71eb390..52c044c 100644
> --- a/tests/ovn.at
> +++ b/tests/ovn.at
> @@ -14721,12 +14721,12 @@ AT_CHECK([ovn-sbctl get controller_event $uuid seq_num], [0], [dnl
>  OVN_CLEANUP([hv1], [hv2])
>  AT_CLEANUP
>
> -AT_SETUP([ovn -- IGMP snoop/querier])
> +AT_SETUP([ovn -- IGMP snoop/querier/relay])
>  AT_SKIP_IF([test $HAVE_PYTHON = no])
>  ovn_start
>
>  # Logical network:
> -# Two independent logical switches (sw1 and sw2).
> +# Three logical switches (sw1-sw3) connected to a logical router (rtr).
>  # sw1:
>  #   - subnet 10.0.0.0/8
>  #   - 2 ports bound on hv1 (sw1-p11, sw1-p12)
> @@ -14736,6 +14736,10 @@ ovn_start
>  #   - 1 port bound on hv1 (sw2-p1)
>  #   - 1 port bound on hv2 (sw2-p2)
>  #   - IGMP Querier from 20.0.0.254
> +# sw3:
> +#   - subnet 30.0.0.0/8
> +#   - 1 port bound on hv1 (sw3-p1)
> +#   - 1 port bound on hv2 (sw3-p2)
>
>  reset_pcap_file() {
>      local iface=$1
> @@ -14812,29 +14816,47 @@ store_igmp_v3_query() {
>  }
>
>  #
> -# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN
> -#    IP_PROTO DATA OUTFILE
> +# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
> +#    IP_CHKSUM IP_PROTO DATA
>  #
>  # This shell function causes an IP multicast packet to be received on INPORT
>  # of HV.
>  # The hexdump of the packet is stored in OUTFILE.
>  #
>  send_ip_multicast_pkt() {
> -    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ip_src=$5 ip_dst=$6
> -    local ip_len=$7 ip_chksum=$8 proto=$9 data=${10} outfile=${11}
> -
> -    local ip_ttl=20
> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4
> +    local ip_src=$5 ip_dst=$6 ip_len=$7 ip_ttl=$8 ip_chksum=$9 proto=${10}
> +    local data=${11}
>
>      local eth=${eth_dst}${eth_src}0800
>      local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
>      local packet=${eth}${ip}${data}
>
>      as $hv ovs-appctl netdev-dummy/receive ${inport} ${packet}
> +}
> +
> +#
> +# store_ip_multicast_pkt ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
> +#    IP_CHKSUM IP_PROTO DATA OUTFILE
> +#
> +# This shell function builds an IP multicast packet and stores the hexdump of
> +# the packet in OUTFILE.
> +#
> +store_ip_multicast_pkt() {
> +    local eth_src=$1 eth_dst=$2
> +    local ip_src=$3 ip_dst=$4 ip_len=$5 ip_ttl=$6 ip_chksum=$7 proto=$8
> +    local data=$9 outfile=${10}
> +
> +    local eth=${eth_dst}${eth_src}0800
> +    local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
> +    local packet=${eth}${ip}${data}
> +
>      echo ${packet} >> ${outfile}
>  }
>
>  ovn-nbctl ls-add sw1
>  ovn-nbctl ls-add sw2
> +ovn-nbctl ls-add sw3
>
>  ovn-nbctl lsp-add sw1 sw1-p11
>  ovn-nbctl lsp-add sw1 sw1-p12
> @@ -14842,6 +14864,26 @@ ovn-nbctl lsp-add sw1 sw1-p21
>  ovn-nbctl lsp-add sw1 sw1-p22
>  ovn-nbctl lsp-add sw2 sw2-p1
>  ovn-nbctl lsp-add sw2 sw2-p2
> +ovn-nbctl lsp-add sw3 sw3-p1
> +ovn-nbctl lsp-add sw3 sw3-p2
> +
> +ovn-nbctl lr-add rtr
> +ovn-nbctl lrp-add rtr rtr-sw1 00:00:00:00:01:00 10.0.0.254/24
> +ovn-nbctl lrp-add rtr rtr-sw2 00:00:00:00:02:00 20.0.0.254/24
> +ovn-nbctl lrp-add rtr rtr-sw3 00:00:00:00:03:00 30.0.0.254/24
> +
> +ovn-nbctl lsp-add sw1 sw1-rtr                      \
> +    -- lsp-set-type sw1-rtr router                 \
> +    -- lsp-set-addresses sw1-rtr 00:00:00:00:01:00 \
> +    -- lsp-set-options sw1-rtr router-port=rtr-sw1
> +ovn-nbctl lsp-add sw2 sw2-rtr                      \
> +    -- lsp-set-type sw2-rtr router                 \
> +    -- lsp-set-addresses sw2-rtr 00:00:00:00:02:00 \
> +    -- lsp-set-options sw2-rtr router-port=rtr-sw2
> +ovn-nbctl lsp-add sw3 sw3-rtr                      \
> +    -- lsp-set-type sw3-rtr router                 \
> +    -- lsp-set-addresses sw3-rtr 00:00:00:00:03:00 \
> +    -- lsp-set-options sw3-rtr router-port=rtr-sw3
>
>  net_add n1
>  sim_add hv1
> @@ -14863,6 +14905,11 @@ ovs-vsctl -- add-port br-int hv1-vif3 -- \
>      options:tx_pcap=hv1/vif3-tx.pcap \
>      options:rxq_pcap=hv1/vif3-rx.pcap \
>      ofport-request=1
> +ovs-vsctl -- add-port br-int hv1-vif4 -- \
> +    set interface hv1-vif4 external-ids:iface-id=sw3-p1 \
> +    options:tx_pcap=hv1/vif4-tx.pcap \
> +    options:rxq_pcap=hv1/vif4-rx.pcap \
> +    ofport-request=1
>
>  sim_add hv2
>  as hv2
> @@ -14883,12 +14930,18 @@ ovs-vsctl -- add-port br-int hv2-vif3 -- \
>      options:tx_pcap=hv2/vif3-tx.pcap \
>      options:rxq_pcap=hv2/vif3-rx.pcap \
>      ofport-request=1
> +ovs-vsctl -- add-port br-int hv2-vif4 -- \
> +    set interface hv2-vif4 external-ids:iface-id=sw3-p2 \
> +    options:tx_pcap=hv2/vif4-tx.pcap \
> +    options:rxq_pcap=hv2/vif4-rx.pcap \
> +    ofport-request=1
>
>  OVN_POPULATE_ARP
>
>  # Enable IGMP snooping on sw1.
> -ovn-nbctl set Logical_Switch sw1 other_config:mcast_querier="false"
> -ovn-nbctl set Logical_Switch sw1 other_config:mcast_snoop="true"
> +ovn-nbctl set Logical_Switch sw1       \
> +    other_config:mcast_querier="false" \
> +    other_config:mcast_snoop="true"
>
>  # No IGMP query should be generated by sw1 (mcast_querier="false").
>  truncate -s 0 expected
> @@ -14921,9 +14974,12 @@ truncate -s 0 expected
>  truncate -s 0 expected_empty
>  send_ip_multicast_pkt hv1-vif2 hv1 \
>      000000000001 01005e000144 \
> -    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
> -    e518e518000a3b3a0000 \
> -    expected
> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> +    e518e518000a3b3a0000
> +store_ip_multicast_pkt \
> +    000000000001 01005e000144 \
> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> +    e518e518000a3b3a0000 expected
>
>  OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected])
>  OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
> @@ -14944,17 +15000,19 @@ OVS_WAIT_UNTIL([
>      test "${total_entries}" = "1"
>  ])
>
> -# Send traffic traffic and make sure it gets forwarded only on the port that
> -# joined.
> +# Send traffic and make sure it gets forwarded only on the port that joined.
>  as hv1 reset_pcap_file hv1-vif1 hv1/vif1
>  as hv2 reset_pcap_file hv2-vif1 hv2/vif1
>  truncate -s 0 expected
>  truncate -s 0 expected_empty
>  send_ip_multicast_pkt hv1-vif2 hv1 \
>      000000000001 01005e000144 \
> -    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
> -    e518e518000a3b3a0000 \
> -    expected
> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> +    e518e518000a3b3a0000
> +store_ip_multicast_pkt \
> +    000000000001 01005e000144 \
> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> +    e518e518000a3b3a0000 expected
>
>  OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
>  OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
> @@ -14988,6 +15046,111 @@ sleep 1
>  OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected])
>  OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
>
> +# Disable IGMP querier on sw2.
> +ovn-nbctl set Logical_Switch sw2 \
> +    other_config:mcast_querier="false"
> +
> +# Enable IGMP snooping on sw3.
> +ovn-nbctl set Logical_Switch sw3       \
> +    other_config:mcast_querier="false" \
> +    other_config:mcast_snoop="true"
> +
> +# Send traffic from sw3 and make sure rtr doesn't relay it.
> +truncate -s 0 expected_empty
> +
> +as hv1 reset_pcap_file hv1-vif1 hv1/vif1
> +as hv1 reset_pcap_file hv1-vif2 hv1/vif2
> +as hv1 reset_pcap_file hv1-vif3 hv1/vif3
> +as hv1 reset_pcap_file hv1-vif4 hv1/vif4
> +as hv2 reset_pcap_file hv2-vif1 hv2/vif1
> +as hv2 reset_pcap_file hv2-vif2 hv2/vif2
> +as hv2 reset_pcap_file hv2-vif3 hv2/vif3
> +as hv2 reset_pcap_file hv2-vif4 hv2/vif4
> +
> +send_ip_multicast_pkt hv2-vif4 hv2 \
> +    000000000001 01005e000144 \
> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> +    e518e518000a3b3a0000
> +
> +# Sleep a bit to make sure no traffic is received and then check.
> +sleep 1
> +OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
> +OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_empty])
> +OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_empty])
> +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
> +OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
> +OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
> +OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
> +OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
> +
> +# Enable IGMP relay on rtr
> +ovn-nbctl set logical_router rtr \
> +    options:mcast_relay="true"
> +
> +# Inject IGMP Join for 239.0.1.68 on sw1-p11.
> +send_igmp_v3_report hv1-vif1 hv1 \
> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
> +    /dev/null
> +# Inject IGMP Join for 239.0.1.68 on sw2-p2.
> +send_igmp_v3_report hv2-vif3 hv2 \
> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
> +    /dev/null
> +# Inject IGMP Join for 239.0.1.68 on sw3-p1.
> +send_igmp_v3_report hv1-vif4 hv1 \
> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
> +    /dev/null
> +
> +# Check that the IGMP Group is learned by all switches.
> +OVS_WAIT_UNTIL([
> +    total_entries=`ovn-sbctl find IGMP_Group | grep "239.0.1.68" | wc -l`
> +    test "${total_entries}" = "3"
> +])
> +
> +# Send traffic from sw3 and make sure it is relayed by rtr to the
> +# switches and ports that joined.
> +truncate -s 0 expected_routed_sw1
> +truncate -s 0 expected_routed_sw2
> +truncate -s 0 expected_switched
> +truncate -s 0 expected_empty
> +
> +as hv1 reset_pcap_file hv1-vif1 hv1/vif1
> +as hv1 reset_pcap_file hv1-vif2 hv1/vif2
> +as hv1 reset_pcap_file hv1-vif3 hv1/vif3
> +as hv1 reset_pcap_file hv1-vif4 hv1/vif4
> +as hv2 reset_pcap_file hv2-vif1 hv2/vif1
> +as hv2 reset_pcap_file hv2-vif2 hv2/vif2
> +as hv2 reset_pcap_file hv2-vif3 hv2/vif3
> +as hv2 reset_pcap_file hv2-vif4 hv2/vif4
> +
> +send_ip_multicast_pkt hv2-vif4 hv2 \
> +    000000000001 01005e000144 \
> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> +    e518e518000a3b3a0000
> +store_ip_multicast_pkt \
> +    000000000100 01005e000144 \
> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
> +    e518e518000a3b3a0000 expected_routed_sw1
> +store_ip_multicast_pkt \
> +    000000000200 01005e000144 \
> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
> +    e518e518000a3b3a0000 expected_routed_sw2
> +store_ip_multicast_pkt \
> +    000000000001 01005e000144 \
> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> +    e518e518000a3b3a0000 expected_switched
> +
> +OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_routed_sw1])
> +OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_routed_sw2])
> +OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_switched])
> +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
> +OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
> +OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
> +OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
> +OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
> +
>  OVN_CLEANUP([hv1], [hv2])
>  AT_CLEANUP
>
> --
> 1.8.3.1
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Dumitru Ceara Aug. 16, 2019, 11:56 a.m. UTC | #2
On Fri, Aug 16, 2019 at 12:23 PM Numan Siddique <nusiddiq@redhat.com> wrote:
>
>
>
> On Mon, Aug 12, 2019 at 5:32 PM Dumitru Ceara <dceara@redhat.com> wrote:
>>
>> Add a new configuration option 'mcast_relay' to the Logical_Router:options
>> in the OVN Northbound database.
>>
>> If a router is configured with 'mcast_relay' enabled then ovn-northd
>> will install Logical_Flows to allow IP multicast traffic to be routed
>> between Logical_Switches. The logical router will aggregate all IGMP
>> groups from attached logical switches and modify the routing pipeline in
>> the following way:
>> - Table S_ROUTER_IN_IP_INPUT: add flow allowing IP multicast traffic
>>   if mcast_relay is enabled on the datapath.
>> - Table S_ROUTER_IN_IP_ROUTING: add flow matching the group address,
>>   update TTL and set outport="<Multicast_Group> associated with the
>>   IGMP group". Continue to next table.
>> - Table S_ROUTER_IN_ARP_RESOLVE: bypass ARP resolve for IP multicast
>>   traffic and continue to next table.
>> - Table S_ROUTER_OUT_DELIVERY: add flow matching IP multicast traffic
>>   and set ETH.SRC to the MAC address of the logical port on which
>>   traffic is forwarded.
>>
>> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>> Acked-by: Mark Michelson <mmichels@redhat.com>
>
>
> Hi Dumitru,
>
> Just a minor comment. Please see below. Can you please update the patch ?
> LGTM otherwise.

Hi Numan,

Thanks for the review. I'll send a v4, but I have a follow-up to your
comment below.

Thanks,
Dumitru

>
> Thanks
> Numan
>
>>
>>
>> ---
>> v3:
>> - Address Mark's comment and move setting of the outport in the IP
>>   Routing stage.
>> - Update commit message.
>> - Fix some typos.
>> v2:
>> - Optimize flooding to multicast router ports.
>> - Fix check for source IP multicast in router pipeline.
>> - Use an enum for OVN_MCAST_*_KEY definitions to avoid hard to debug
>>   errors due to typos when adding new OVN_MCAST_*_KEY values.
>> - Fix ovn-northd man page for IGMP.
>> ---
>>  NEWS                    |   1 +
>>  lib/mcast-group-index.h |  13 +-
>>  northd/ovn-northd.8.xml |  79 +++++++-
>>  northd/ovn-northd.c     | 504 ++++++++++++++++++++++++++++++++++++------------
>>  ovn-nb.xml              |   6 +
>>  tests/ovn.at            | 199 +++++++++++++++++--
>>  6 files changed, 651 insertions(+), 151 deletions(-)
>>
>> diff --git a/NEWS b/NEWS
>> index f476984..73045d6 100644
>> --- a/NEWS
>> +++ b/NEWS
>> @@ -39,6 +39,7 @@ Post-v2.11.0
>>         logical groups which results in tunnels only been formed between
>>         members of the same transport zone(s).
>>       * Support for new logical switch port type - 'virtual'.
>> +     * Support for IGMP Snooping/Querier and Relay.
>>     - New QoS type "linux-netem" on Linux.
>>     - Added support for TLS Server Name Indication (SNI).
>>
>> diff --git a/lib/mcast-group-index.h b/lib/mcast-group-index.h
>> index 15a1592..cb49ad7 100644
>> --- a/lib/mcast-group-index.h
>> +++ b/lib/mcast-group-index.h
>> @@ -20,8 +20,17 @@ struct ovsdb_idl;
>>
>>  struct sbrec_datapath_binding;
>>
>> -#define OVN_MCAST_FLOOD_TUNNEL_KEY   65535
>> -#define OVN_MCAST_UNKNOWN_TUNNEL_KEY (OVN_MCAST_FLOOD_TUNNEL_KEY - 1)
>> +#define OVN_MIN_MULTICAST 32768
>> +#define OVN_MAX_MULTICAST 65535
>> +
>> +enum ovn_mcast_tunnel_keys {
>> +
>> +    OVN_MCAST_FLOOD_TUNNEL_KEY = OVN_MIN_MULTICAST,
>> +    OVN_MCAST_UNKNOWN_TUNNEL_KEY,
>> +    OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY,
>> +    OVN_MIN_IP_MULTICAST,
>> +    OVN_MAX_IP_MULTICAST = OVN_MAX_MULTICAST,
>> +};
>>
>>  struct ovsdb_idl_index *mcast_group_index_create(struct ovsdb_idl *);
>>  const struct sbrec_multicast_group *
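
For context on the reserved keys: with the enum their ordering holds by
construction, which is why the BUILD_ASSERT_DECL() checks are dropped from
ovn-northd.c further down.  If we ever want to keep a compile-time guard, an
equivalent sketch (using OVS's BUILD_ASSERT_DECL macro) would be:

    BUILD_ASSERT_DECL(OVN_MCAST_FLOOD_TUNNEL_KEY < OVN_MIN_IP_MULTICAST);
    BUILD_ASSERT_DECL(OVN_MIN_IP_MULTICAST <= OVN_MAX_IP_MULTICAST);

Dynamic IGMP group keys are then allocated from what is left of the range,
i.e. [OVN_MIN_IP_MULTICAST, OVN_MAX_IP_MULTICAST].
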
>> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
>> index 6d2fbe3..d45bb15 100644
>> --- a/northd/ovn-northd.8.xml
>> +++ b/northd/ovn-northd.8.xml
>> @@ -947,10 +947,40 @@ output;
>>
>>      <ul>
>>        <li>
>> -        A priority-100 flow that outputs all packets with an Ethernet broadcast
>> +        A priority-100 flow that punts all IGMP packets to
>> +        <code>ovn-controller</code> if IGMP snooping is enabled on the
>> +        logical switch.
>> +      </li>
>> +
>> +      <li>
>> +        Priority-90 flows that forward registered IP multicast traffic to
>> +        their corresponding multicast group, which <code>ovn-northd</code>
>> +        creates based on learnt <ref table="IGMP_Group" db="OVN_Southbound"/>
>> +        entries.  The flows also forward packets to the
>> +        <code>MC_MROUTER_FLOOD</code> multicast group, which
>> +        <code>ovn-northd</code> populates with all the logical ports that
>> +        are connected to logical routers with
>> +        <ref column="options" table="Logical_Router"/>:mcast_relay='true'.
>> +      </li>
>> +
>> +      <li>
>> +        A priority-85 flow that forwards all IP multicast traffic destined to
>> +        224.0.0.X to the <code>MC_FLOOD</code> multicast group, which
>> +        <code>ovn-northd</code> populates with all enabled logical ports.
>> +      </li>
>> +
>> +      <li>
>> +        A priority-80 flow that forwards all unregistered IP multicast traffic
>> +        to the <code>MC_MROUTER_FLOOD</code> multicast group, if any.
>> +        Otherwise the flow drops all unregistered IP multicast packets.  This
>> +        flow is added only if <ref column="other_config"
>> +        table="Logical_Switch"/>:mcast_flood_unregistered='false'.
>> +      </li>
>> +
>> +      <li>
>> +        A priority-70 flow that outputs all packets with an Ethernet broadcast
>>          or multicast <code>eth.dst</code> to the <code>MC_FLOOD</code>
>> -        multicast group, which <code>ovn-northd</code> populates with all
>> -        enabled logical ports.
>> +        multicast group.
>>        </li>
>>
>>        <li>
>> @@ -1228,6 +1258,14 @@ output;
>>
>>        <li>
>>          <p>
>> +          A priority-95 flow allows IP multicast traffic if
>> +          <ref column="options" table="Logical_Router"/>:mcast_relay='true',
>> +          otherwise drops it.
>> +        </p>
>> +      </li>
>> +
>> +      <li>
>> +        <p>
>>            ICMP echo reply.  These flows reply to ICMP echo requests received
>>            for the router's IP address.  Let <var>A</var> be an IP address
>>            owned by a router port.  Then, for each <var>A</var> that is
>> @@ -1941,6 +1979,16 @@ output;
>>      <ul>
>>        <li>
>>          <p>
>> +          Priority-500 flows that match IP multicast traffic destined to
>> +          groups registered on any of the attached switches and set
>> +          <code>outport</code> to the associated multicast group that will
>> +          eventually flood the traffic to all interested attached logical
>> +          switches. The flows also decrement TTL.
>> +        </p>
>> +      </li>
>> +
>> +      <li>
>> +        <p>
>>            For distributed logical routers where one of the logical router
>>            ports specifies a <code>redirect-chassis</code>, a priority-400
>>            logical flow for each ip source/destination couple that matches the
>> @@ -2074,6 +2122,15 @@ next;
>>      <ul>
>>        <li>
>>          <p>
>> +          A priority-500 flow that matches IP multicast traffic that was
>> +          allowed in the routing pipeline. For this kind of traffic the
>> +          <code>outport</code> was already set so the flow just advances to
>> +          the next table.
>> +        </p>
>> +      </li>
>> +
>> +      <li>
>> +        <p>
>>            For distributed logical routers where one of the logical router
>>            ports specifies a <code>redirect-chassis</code>, a priority-400
>>            logical flow with match <code>REGBIT_DISTRIBUTED_NAT == 1</code>
>> @@ -2641,9 +2698,19 @@ clone {
>>      <h3>Egress Table 3: Delivery</h3>
>>
>>      <p>
>> -      Packets that reach this table are ready for delivery.  It contains
>> -      priority-100 logical flows that match packets on each enabled logical
>> -      router port, with action <code>output;</code>.
>> +      Packets that reach this table are ready for delivery.  It contains:
>> +      <ul>
>> +        <li>
>> +          Priority-110 logical flows that match IP multicast packets on each
>> +          enabled logical router port and modify the Ethernet source address
>> +          of the packets to the Ethernet address of the port and then execute
>> +          action <code>output;</code>.
>> +        </li>
>> +        <li>
>> +          Priority-100 logical flows that match packets on each enabled
>> +          logical router port, with action <code>output;</code>.
>> +        </li>
>> +      </ul>
>>      </p>
>>
>>  </manpage>
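
To make the router-pipeline documentation concrete: assuming a group
239.0.1.68 learnt behind router port rtr-sw1 (MAC 00:00:00:00:01:00, the
setup used in the test further down), the sections above correspond to
logical flows along these lines (a sketch assembled from the match/action
strings in the ovn-northd.c changes that follow):

    S_ROUTER_IN_IP_INPUT,    prio 95:  "ip4.dst[28..31] == 0xe"
        -> "next;"
    S_ROUTER_IN_IP_ROUTING,  prio 500: "ip4 && ip4.dst == 239.0.1.68"
        -> "outport = \"239.0.1.68\"; ip.ttl--; next;"
    S_ROUTER_IN_ARP_RESOLVE, prio 500: "ip4.mcast"
        -> "next;"
    S_ROUTER_OUT_DELIVERY,   prio 110: "ip4.mcast && outport == \"rtr-sw1\""
        -> "eth.src = 00:00:00:00:01:00; output;"
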
>> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
>> index e6953a4..9ee9230 100644
>> --- a/northd/ovn-northd.c
>> +++ b/northd/ovn-northd.c
>> @@ -433,32 +433,52 @@ struct ipam_info {
>>      bool mac_only;
>>  };
>>
>> -#define OVN_MIN_MULTICAST 32768
>> -#define OVN_MAX_MULTICAST OVN_MCAST_FLOOD_TUNNEL_KEY
>> -BUILD_ASSERT_DECL(OVN_MIN_MULTICAST < OVN_MAX_MULTICAST);
>> -
>> -#define OVN_MIN_IP_MULTICAST OVN_MIN_MULTICAST
>> -#define OVN_MAX_IP_MULTICAST (OVN_MCAST_UNKNOWN_TUNNEL_KEY - 1)
>> -BUILD_ASSERT_DECL(OVN_MAX_IP_MULTICAST >= OVN_MIN_MULTICAST);
>> -
>>  /*
>>   * Multicast snooping and querier per datapath configuration.
>>   */
>> +struct mcast_switch_info {
>> +
>> +    bool enabled;               /* True if snooping enabled. */
>> +    bool querier;               /* True if querier enabled. */
>> +    bool flood_unregistered;    /* True if unregistered multicast should be
>> +                                 * flooded.
>> +                                 */
>> +    bool flood_relay;           /* True if the switch is connected to a
>> +                                 * multicast router and unregistered multicast
>> +                                 * should be flooded to the mrouter. Only
>> +                                 * applicable if flood_unregistered == false.
>> +                                 */
>> +
>> +    int64_t table_size;         /* Max number of IP multicast groups. */
>> +    int64_t idle_timeout;       /* Timeout after which an idle group is
>> +                                 * flushed.
>> +                                 */
>> +    int64_t query_interval;     /* Interval between multicast queries. */
>> +    char *eth_src;              /* ETH src address of the multicast queries. */
>> +    char *ipv4_src;             /* IP src address of the multicast queries. */
>> +    int64_t query_max_response; /* Expected time after which reports should
>> +                                 * be received for queries that were sent out.
>> +                                 */
>> +
>> +    uint32_t active_flows;      /* Current number of active IP multicast
>> +                                 * flows.
>> +                                 */
>> +};
>> +
>> +struct mcast_router_info {
>> +    bool relay; /* True if the router should relay IP multicast. */
>> +};
>> +
>>  struct mcast_info {
>> -    bool enabled;
>> -    bool querier;
>> -    bool flood_unregistered;
>> -
>> -    int64_t table_size;
>> -    int64_t idle_timeout;
>> -    int64_t query_interval;
>> -    char *eth_src;
>> -    char *ipv4_src;
>> -    int64_t  query_max_response;
>> -
>> -    struct hmap group_tnlids;
>> -    uint32_t group_tnlid_hint;
>> -    uint32_t active_flows;
>> +
>> +    struct hmap group_tnlids;  /* Group tunnel IDs in use on this DP. */
>> +    uint32_t group_tnlid_hint; /* Hint for allocating next group tunnel ID. */
>> +    struct ovs_list groups;    /* List of groups learnt on this DP. */
>> +
>> +    union {
>> +        struct mcast_switch_info sw;  /* Switch specific multicast info. */
>> +        struct mcast_router_info rtr; /* Router specific multicast info. */
>> +    };
>>  };
>>
>>  static uint32_t
>> @@ -559,6 +579,7 @@ ovn_datapath_create(struct hmap *datapaths, const struct uuid *key,
>>  }
>>
>>  static void ovn_ls_port_group_destroy(struct hmap *nb_pgs);
>> +static void destroy_mcast_info_for_datapath(struct ovn_datapath *od);
>>
>>  static void
>>  ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
>> @@ -572,12 +593,7 @@ ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
>>          bitmap_free(od->ipam_info.allocated_ipv4s);
>>          free(od->router_ports);
>>          ovn_ls_port_group_destroy(&od->nb_pgs);
>> -
>> -        if (od->nbs) {
>> -            free(od->mcast_info.eth_src);
>> -            free(od->mcast_info.ipv4_src);
>> -            destroy_tnlids(&od->mcast_info.group_tnlids);
>> -        }
>> +        destroy_mcast_info_for_datapath(od);
>>
>>          free(od);
>>      }
>> @@ -714,23 +730,28 @@ init_ipam_info_for_datapath(struct ovn_datapath *od)
>>  }
>>
>>  static void
>> -init_mcast_info_for_datapath(struct ovn_datapath *od)
>> +init_mcast_info_for_router_datapath(struct ovn_datapath *od)
>>  {
>> -    if (!od->nbs) {
>> -        return;
>> -    }
>> +    struct mcast_router_info *mcast_rtr_info = &od->mcast_info.rtr;
>>
>> -    struct mcast_info *mcast_info = &od->mcast_info;
>> +    mcast_rtr_info->relay = smap_get_bool(&od->nbr->options, "mcast_relay",
>> +                                          false);
>> +}
>>
>> -    mcast_info->enabled =
>> +static void
>> +init_mcast_info_for_switch_datapath(struct ovn_datapath *od)
>> +{
>> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>> +
>> +    mcast_sw_info->enabled =
>>          smap_get_bool(&od->nbs->other_config, "mcast_snoop", false);
>> -    mcast_info->querier =
>> +    mcast_sw_info->querier =
>>          smap_get_bool(&od->nbs->other_config, "mcast_querier", true);
>> -    mcast_info->flood_unregistered =
>> +    mcast_sw_info->flood_unregistered =
>>          smap_get_bool(&od->nbs->other_config, "mcast_flood_unregistered",
>>                        false);
>>
>> -    mcast_info->table_size =
>> +    mcast_sw_info->table_size =
>>          smap_get_ullong(&od->nbs->other_config, "mcast_table_size",
>>                          OVN_MCAST_DEFAULT_MAX_ENTRIES);
>>
>> @@ -742,54 +763,94 @@ init_mcast_info_for_datapath(struct ovn_datapath *od)
>>      } else if (idle_timeout > OVN_MCAST_MAX_IDLE_TIMEOUT_S) {
>>          idle_timeout = OVN_MCAST_MAX_IDLE_TIMEOUT_S;
>>      }
>> -    mcast_info->idle_timeout = idle_timeout;
>> +    mcast_sw_info->idle_timeout = idle_timeout;
>>
>>      uint32_t query_interval =
>>          smap_get_ullong(&od->nbs->other_config, "mcast_query_interval",
>> -                        mcast_info->idle_timeout / 2);
>> +                        mcast_sw_info->idle_timeout / 2);
>>      if (query_interval < OVN_MCAST_MIN_QUERY_INTERVAL_S) {
>>          query_interval = OVN_MCAST_MIN_QUERY_INTERVAL_S;
>>      } else if (query_interval > OVN_MCAST_MAX_QUERY_INTERVAL_S) {
>>          query_interval = OVN_MCAST_MAX_QUERY_INTERVAL_S;
>>      }
>> -    mcast_info->query_interval = query_interval;
>> +    mcast_sw_info->query_interval = query_interval;
>>
>> -    mcast_info->eth_src =
>> +    mcast_sw_info->eth_src =
>>          nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_eth_src"));
>> -    mcast_info->ipv4_src =
>> +    mcast_sw_info->ipv4_src =
>>          nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_ip4_src"));
>>
>> -    mcast_info->query_max_response =
>> +    mcast_sw_info->query_max_response =
>>          smap_get_ullong(&od->nbs->other_config, "mcast_query_max_response",
>>                          OVN_MCAST_DEFAULT_QUERY_MAX_RESPONSE_S);
>>
>> -    hmap_init(&mcast_info->group_tnlids);
>> -    mcast_info->group_tnlid_hint = OVN_MIN_IP_MULTICAST;
>> -    mcast_info->active_flows = 0;
>> +    mcast_sw_info->active_flows = 0;
>> +}
>> +
>> +static void
>> +init_mcast_info_for_datapath(struct ovn_datapath *od)
>> +{
>> +    if (!od->nbr && !od->nbs) {
>> +        return;
>> +    }
>> +
>> +    hmap_init(&od->mcast_info.group_tnlids);
>> +    od->mcast_info.group_tnlid_hint = OVN_MIN_IP_MULTICAST;
>> +    ovs_list_init(&od->mcast_info.groups);
>> +
>> +    if (od->nbs) {
>> +        init_mcast_info_for_switch_datapath(od);
>> +    } else {
>> +        init_mcast_info_for_router_datapath(od);
>> +    }
>> +}
>> +
>> +static void
>> +destroy_mcast_info_for_switch_datapath(struct ovn_datapath *od)
>> +{
>> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>> +
>> +    free(mcast_sw_info->eth_src);
>> +    free(mcast_sw_info->ipv4_src);
>> +}
>> +
>> +static void
>> +destroy_mcast_info_for_datapath(struct ovn_datapath *od)
>> +{
>> +    if (!od->nbr && !od->nbs) {
>> +        return;
>> +    }
>> +
>> +    if (od->nbs) {
>> +        destroy_mcast_info_for_switch_datapath(od);
>> +    }
>> +
>> +    destroy_tnlids(&od->mcast_info.group_tnlids);
>>  }
>>
>>  static void
>> -store_mcast_info_for_datapath(const struct sbrec_ip_multicast *sb,
>> -                              struct ovn_datapath *od)
>> +store_mcast_info_for_switch_datapath(const struct sbrec_ip_multicast *sb,
>> +                                     struct ovn_datapath *od)
>>  {
>> -    struct mcast_info *mcast_info = &od->mcast_info;
>> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>>
>>      sbrec_ip_multicast_set_datapath(sb, od->sb);
>> -    sbrec_ip_multicast_set_enabled(sb, &mcast_info->enabled, 1);
>> -    sbrec_ip_multicast_set_querier(sb, &mcast_info->querier, 1);
>> -    sbrec_ip_multicast_set_table_size(sb, &mcast_info->table_size, 1);
>> -    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_info->idle_timeout, 1);
>> +    sbrec_ip_multicast_set_enabled(sb, &mcast_sw_info->enabled, 1);
>> +    sbrec_ip_multicast_set_querier(sb, &mcast_sw_info->querier, 1);
>> +    sbrec_ip_multicast_set_table_size(sb, &mcast_sw_info->table_size, 1);
>> +    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_sw_info->idle_timeout, 1);
>>      sbrec_ip_multicast_set_query_interval(sb,
>> -                                          &mcast_info->query_interval, 1);
>> +                                          &mcast_sw_info->query_interval, 1);
>>      sbrec_ip_multicast_set_query_max_resp(sb,
>> -                                          &mcast_info->query_max_response, 1);
>> +                                          &mcast_sw_info->query_max_response,
>> +                                          1);
>>
>> -    if (mcast_info->eth_src) {
>> -        sbrec_ip_multicast_set_eth_src(sb, mcast_info->eth_src);
>> +    if (mcast_sw_info->eth_src) {
>> +        sbrec_ip_multicast_set_eth_src(sb, mcast_sw_info->eth_src);
>>      }
>>
>> -    if (mcast_info->ipv4_src) {
>> -        sbrec_ip_multicast_set_ip4_src(sb, mcast_info->ipv4_src);
>> +    if (mcast_sw_info->ipv4_src) {
>> +        sbrec_ip_multicast_set_ip4_src(sb, mcast_sw_info->ipv4_src);
>>      }
>>  }
>>
>> @@ -906,6 +967,7 @@ join_datapaths(struct northd_context *ctx, struct hmap *datapaths,
>>                                       NULL, nbr, NULL);
>>              ovs_list_push_back(nb_only, &od->list);
>>          }
>> +        init_mcast_info_for_datapath(od);
>>          ovs_list_push_back(lr_list, &od->lr_list);
>>      }
>>  }
>> @@ -1999,6 +2061,13 @@ join_logical_ports(struct northd_context *ctx,
>>                      break;
>>                  }
>>              }
>> +
>> +            /* If the router is multicast enabled then set relay on the switch
>> +             * datapath.
>> +             */
>> +            if (peer->od && peer->od->mcast_info.rtr.relay) {
>> +                op->od->mcast_info.sw.flood_relay = true;
>> +            }
>>          } else if (op->nbrp && op->nbrp->peer && !op->derived) {
>>              struct ovn_port *peer = ovn_port_find(ports, op->nbrp->peer);
>>              if (peer) {
>> @@ -2846,6 +2915,10 @@ struct multicast_group {
>>  static const struct multicast_group mc_flood =
>>      { MC_FLOOD, OVN_MCAST_FLOOD_TUNNEL_KEY };
>>
>> +#define MC_MROUTER_FLOOD "_MC_mrouter_flood"
>> +static const struct multicast_group mc_mrouter_flood =
>> +    { MC_MROUTER_FLOOD, OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY };
>> +
>>  #define MC_UNKNOWN "_MC_unknown"
>>  static const struct multicast_group mc_unknown =
>>      { MC_UNKNOWN, OVN_MCAST_UNKNOWN_TUNNEL_KEY };
>> @@ -2955,7 +3028,8 @@ ovn_multicast_update_sbrec(const struct ovn_multicast *mc,
>>   */
>>  struct ovn_igmp_group_entry {
>>      struct ovs_list list_node; /* Linkage in the list of entries. */
>> -    const struct sbrec_igmp_group *sb;
>> +    size_t n_ports;
>> +    struct ovn_port **ports;
>>  };
>>
>>  /*
>> @@ -2964,12 +3038,13 @@ struct ovn_igmp_group_entry {
>>   */
>>  struct ovn_igmp_group {
>>      struct hmap_node hmap_node; /* Index on 'datapath' and 'address'. */
>> +    struct ovs_list list_node;  /* Linkage in the per-dp igmp group list. */
>>
>>      struct ovn_datapath *datapath;
>>      struct in6_addr address; /* Multicast IPv6-mapped-IPv4 or IPv4 address. */
>>      struct multicast_group mcgroup;
>>
>> -    struct ovs_list sb_entries; /* List of SB entries for this group. */
>> +    struct ovs_list entries; /* List of SB entries for this group. */
>>  };
>>
>>  static uint32_t
>> @@ -2997,77 +3072,120 @@ ovn_igmp_group_find(struct hmap *igmp_groups,
>>      return NULL;
>>  }
>>
>> -static void
>> +static struct ovn_igmp_group *
>>  ovn_igmp_group_add(struct northd_context *ctx, struct hmap *igmp_groups,
>>                     struct ovn_datapath *datapath,
>> -                   const struct sbrec_igmp_group *sb_igmp_group)
>> +                   const struct in6_addr *address,
>> +                   const char *address_s)
>>  {
>> -    struct in6_addr group_address;
>> -    ovs_be32 ipv4;
>> -
>> -    if (ip_parse(sb_igmp_group->address, &ipv4)) {
>> -        group_address = in6_addr_mapped_ipv4(ipv4);
>> -    } else if (!ipv6_parse(sb_igmp_group->address, &group_address)) {
>> -        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>> -        VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
>> -                     sb_igmp_group->address);
>> -        return;
>> -    }
>> -
>>      struct ovn_igmp_group *igmp_group =
>> -        ovn_igmp_group_find(igmp_groups, datapath, &group_address);
>> +        ovn_igmp_group_find(igmp_groups, datapath, address);
>>
>>      if (!igmp_group) {
>>          igmp_group = xmalloc(sizeof *igmp_group);
>>
>>          const struct sbrec_multicast_group *mcgroup =
>> -            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp,
>> -                               sb_igmp_group->address, datapath->sb);
>> +            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp, address_s,
>> +                               datapath->sb);
>>
>>          igmp_group->datapath = datapath;
>> -        igmp_group->address = group_address;
>> +        igmp_group->address = *address;
>>          if (mcgroup) {
>>              igmp_group->mcgroup.key = mcgroup->tunnel_key;
>>              add_tnlid(&datapath->mcast_info.group_tnlids, mcgroup->tunnel_key);
>>          } else {
>>              igmp_group->mcgroup.key = 0;
>>          }
>> -        igmp_group->mcgroup.name = sb_igmp_group->address;
>> -        ovs_list_init(&igmp_group->sb_entries);
>> +        igmp_group->mcgroup.name = address_s;
>> +        ovs_list_init(&igmp_group->entries);
>>
>>          hmap_insert(igmp_groups, &igmp_group->hmap_node,
>> -                    ovn_igmp_group_hash(datapath, &group_address));
>> +                    ovn_igmp_group_hash(datapath, address));
>> +        ovs_list_push_back(&datapath->mcast_info.groups,
>> +                           &igmp_group->list_node);
>> +    }
>> +
>> +    return igmp_group;
>> +}
>> +
>> +static bool
>> +ovn_igmp_group_get_address(const struct sbrec_igmp_group *sb_igmp_group,
>> +                           struct in6_addr *address)
>> +{
>> +    ovs_be32 ipv4;
>> +
>> +    if (ip_parse(sb_igmp_group->address, &ipv4)) {
>> +        *address = in6_addr_mapped_ipv4(ipv4);
>> +        return true;
>> +    }
>> +    if (!ipv6_parse(sb_igmp_group->address, address)) {
>> +        return false;
>>      }
>> +    return true;
>> +}
>>
>> +static struct ovn_port **
>> +ovn_igmp_group_get_ports(const struct sbrec_igmp_group *sb_igmp_group,
>> +                         size_t *n_ports, struct hmap *ovn_ports)
>> +{
>> +    struct ovn_port **ports = xmalloc(sb_igmp_group->n_ports * sizeof *ports);
>> +
>> +    *n_ports = 0;
>> +    for (size_t i = 0; i < sb_igmp_group->n_ports; i++) {
>> +        ports[(*n_ports)] =
>> +            ovn_port_find(ovn_ports, sb_igmp_group->ports[i]->logical_port);
>> +        if (ports[(*n_ports)]) {
>> +            (*n_ports)++;
>> +        }
>> +    }
>> +
>> +    return ports;
>> +}
>> +
>> +static void
>> +ovn_igmp_group_add_entry(struct ovn_igmp_group *igmp_group,
>> +                         struct ovn_port **ports, size_t n_ports)
>> +{
>>      struct ovn_igmp_group_entry *entry = xmalloc(sizeof *entry);
>>
>> -    entry->sb = sb_igmp_group;
>> -    ovs_list_push_back(&igmp_group->sb_entries , &entry->list_node);
>> +    entry->ports = ports;
>> +    entry->n_ports = n_ports;
>> +    ovs_list_push_back(&igmp_group->entries, &entry->list_node);
>> +}
>> +
>> +static void
>> +ovn_igmp_group_destroy_entry(struct ovn_igmp_group_entry *entry)
>> +{
>> +    free(entry->ports);
>> +}
>> +
>> +static bool
>> +ovn_igmp_group_allocate_id(struct ovn_igmp_group *igmp_group)
>> +{
>> +    if (igmp_group->mcgroup.key == 0) {
>> +        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
>> +        igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
>> +    }
>> +
>> +    if (igmp_group->mcgroup.key == 0) {
>> +        return false;
>> +    }
>> +
>> +    return true;
>>  }
>>
>>  static void
>>  ovn_igmp_group_aggregate_ports(struct ovn_igmp_group *igmp_group,
>> -                               struct hmap *ovn_ports,
>>                                 struct hmap *mcast_groups)
>>  {
>>      struct ovn_igmp_group_entry *entry;
>>
>> -    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
>> -        size_t n_oports = 0;
>> -        struct ovn_port **oports =
>> -            xmalloc(entry->sb->n_ports * sizeof *oports);
>> -
>> -        for (size_t i = 0; i < entry->sb->n_ports; i++) {
>> -            oports[n_oports] =
>> -                ovn_port_find(ovn_ports, entry->sb->ports[i]->logical_port);
>> -            if (oports[n_oports]) {
>> -                n_oports++;
>> -            }
>> -        }
>> -
>> +    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
>>          ovn_multicast_add_ports(mcast_groups, igmp_group->datapath,
>> -                                &igmp_group->mcgroup, oports, n_oports);
>> -        free(oports);
>> +                                &igmp_group->mcgroup, entry->ports,
>> +                                entry->n_ports);
>> +
>> +        ovn_igmp_group_destroy_entry(entry);
>>          free(entry);
>>      }
>>  }
>> @@ -3079,10 +3197,12 @@ ovn_igmp_group_destroy(struct hmap *igmp_groups,
>>      if (igmp_group) {
>>          struct ovn_igmp_group_entry *entry;
>>
>> -        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
>> +        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
>> +            ovn_igmp_group_destroy_entry(entry);
>>              free(entry);
>>          }
>>          hmap_remove(igmp_groups, &igmp_group->hmap_node);
>> +        ovs_list_remove(&igmp_group->list_node);
>>          free(igmp_group);
>>      }
>>  }
>> @@ -5282,7 +5402,9 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>>              continue;
>>          }
>>
>> -        if (od->mcast_info.enabled) {
>> +        struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>> +
>> +        if (mcast_sw_info->enabled) {
>>              /* Punt IGMP traffic to controller. */
>>              ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 100,
>>                            "ip4 && ip.proto == 2", "igmp;");
>> @@ -5295,9 +5417,16 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>>                            "outport = \""MC_FLOOD"\"; output;");
>>
>>              /* Drop unregistered IP multicast if not allowed. */
>> -            if (!od->mcast_info.flood_unregistered) {
>> -                ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
>> -                              "ip4 && ip4.mcast", "drop;");
>> +            if (!mcast_sw_info->flood_unregistered) {
>> +                /* Forward unregistered IP multicast to mrouter (if any). */
>> +                if (mcast_sw_info->flood_relay) {
>> +                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
>> +                                  "ip4 && ip4.mcast",
>> +                                  "outport = \""MC_MROUTER_FLOOD"\"; output;");
>> +                } else {
>> +                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
>> +                                  "ip4 && ip4.mcast", "drop;");
>> +                }
>>              }
>>          }
>>
>> @@ -5314,18 +5443,26 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>>              continue;
>>          }
>>
>> -        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
>> +        struct mcast_switch_info *mcast_sw_info =
>> +            &igmp_group->datapath->mcast_info.sw;
>>
>> -        if (mcast_info->active_flows >= mcast_info->table_size) {
>> +        if (mcast_sw_info->active_flows >= mcast_sw_info->table_size) {
>>              continue;
>>          }
>> -        mcast_info->active_flows++;
>> +        mcast_sw_info->active_flows++;
>>
>>          ds_clear(&match);
>>          ds_clear(&actions);
>>
>>          ds_put_format(&match, "eth.mcast && ip4 && ip4.dst == %s ",
>>                        igmp_group->mcgroup.name);
>> +        /* Also flood traffic to all multicast routers with relay enabled. */
>> +        if (mcast_sw_info->flood_relay) {
>> +            ds_put_cstr(&actions,
>> +                        "clone { "
>> +                            "outport = \""MC_MROUTER_FLOOD "\"; output; "
>> +                        "};");
>> +        }
>>          ds_put_format(&actions, "outport = \"%s\"; output; ",
>>                        igmp_group->mcgroup.name);
>>
>> @@ -6205,7 +6342,7 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>           * source or destination, and zero network source or destination
>>           * (priority 100). */
>>          ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 100,
>> -                      "ip4.mcast || "
>> +                      "ip4.src[28..31] == 0xe ||"
>
>
> Does it make sense to add another predicate for source mcast addr?
> I am fine either way.

Yes, I'll add "ip4.src_mcast" and send v4. I'm thinking of also
renaming "ip4.mcast" to "ip4.dst_mcast" for uniformity. What do you
think?
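
For reference, what I have in mind is a one-line sketch (assuming the new
predicate is registered next to the existing "ip4.mcast" one in
logical-fields.c):

    expr_symtab_add_predicate(symtab, "ip4.src_mcast",
                              "ip4.src[28..31] == 0xe");

so the priority-100 and priority-95 matches above would become
"ip4.src_mcast" and "ip4.mcast" respectively.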

Thanks,
Dumitru

>
>
>>
>>                        "ip4.src == 255.255.255.255 || "
>>                        "ip4.src == 127.0.0.0/8 || "
>>                        "ip4.dst == 127.0.0.0/8 || "
>> @@ -6213,6 +6350,16 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>                        "ip4.dst == 0.0.0.0/8",
>>                        "drop;");
>>
>> +        /* Allow multicast if relay enabled (priority 95). */
>> +        ds_clear(&actions);
>> +        if (od->mcast_info.rtr.relay) {
>> +            ds_put_cstr(&actions, "next;");
>> +        } else {
>> +            ds_put_cstr(&actions, "drop;");
>> +        }
>> +        ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 95,
>> +                      "ip4.dst[28..31] == 0xe", ds_cstr(&actions));
>
>
> Since ip4.mcast is a predicate for "ip4.dst[28..31] == 0xe", can you please use ip4.mcast here?
>
>
>>
>> +
>>          /* ARP reply handling.  Use ARP replies to populate the logical
>>           * router's ARP table. */
>>          ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "arp.op == 2",
>> @@ -7483,6 +7630,27 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>          }
>>      }
>>
>> +    /* IP Multicast lookup. Here we set the output port, adjust TTL and
>> +     * advance to next table (priority 500).
>> +     */
>> +    HMAP_FOR_EACH (od, key_node, datapaths) {
>> +        if (!od->nbr || !od->mcast_info.rtr.relay) {
>> +            continue;
>> +        }
>> +        struct ovn_igmp_group *igmp_group;
>> +
>> +        LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
>> +            ds_clear(&match);
>> +            ds_clear(&actions);
>> +            ds_put_format(&match, "ip4 && ip4.dst == %s ",
>> +                          igmp_group->mcgroup.name);
>> +            ds_put_format(&actions, "outport = \"%s\"; ip.ttl--; next;",
>> +                          igmp_group->mcgroup.name);
>> +            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 500,
>> +                          ds_cstr(&match), ds_cstr(&actions));
>> +        }
>> +    }
>> +
>>      /* Logical router ingress table 8: Policy.
>>       *
>>       * A packet that arrives at this table is an IP packet that should be
>> @@ -7513,10 +7681,24 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>
>>      /* Local router ingress table 9: ARP Resolution.
>>       *
>> -     * Any packet that reaches this table is an IP packet whose next-hop IP
>> -     * address is in reg0. (ip4.dst is the final destination.) This table
>> -     * resolves the IP address in reg0 into an output port in outport and an
>> -     * Ethernet address in eth.dst. */
>> +     * Multicast packets already have the outport set so just advance to next
>> +     * table (priority 500). */
>> +    HMAP_FOR_EACH (od, key_node, datapaths) {
>> +        if (!od->nbr) {
>> +            continue;
>> +        }
>> +
>> +        ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 500,
>> +                      "ip4.mcast", "next;");
>> +    }
>> +
>> +    /* Local router ingress table 9: ARP Resolution.
>> +     *
>> +     * Any unicast packet that reaches this table is an IP packet whose
>> +     * next-hop IP address is in reg0. (ip4.dst is the final destination.)
>> +     * This table resolves the IP address in reg0 into an output port in
>> +     * outport and an Ethernet address in eth.dst.
>> +     */
>>      HMAP_FOR_EACH (op, key_node, ports) {
>>          if (op->nbsp && !lsp_is_enabled(op->nbsp)) {
>>              continue;
>> @@ -7998,9 +8180,13 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>          ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 0, "1", "output;");
>>      }
>>
>> -    /* Logical router egress table 1: Delivery (priority 100).
>> +    /* Logical router egress table 1: Delivery (priority 100-110).
>>       *
>> -     * Priority 100 rules deliver packets to enabled logical ports. */
>> +     * Priority 100 rules deliver packets to enabled logical ports.
>> +     * Priority 110 rules match multicast packets and update the source
>> +     * mac before delivering to enabled logical ports. IP multicast traffic
>> +     * bypasses S_ROUTER_IN_IP_ROUTING route lookups.
>> +     */
>>      HMAP_FOR_EACH (op, key_node, ports) {
>>          if (!op->nbrp) {
>>              continue;
>> @@ -8020,6 +8206,19 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>>              continue;
>>          }
>>
>> +        /* If multicast relay is enabled then also adjust source mac for IP
>> +         * multicast traffic.
>> +         */
>> +        if (op->od->mcast_info.rtr.relay) {
>> +            ds_clear(&match);
>> +            ds_clear(&actions);
>> +            ds_put_format(&match, "ip4.mcast && outport == %s", op->json_key);
>> +            ds_put_format(&actions, "eth.src = %s; output;",
>> +                          op->lrp_networks.ea_s);
>> +            ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 110,
>> +                        ds_cstr(&match), ds_cstr(&actions));
>> +        }
>> +
>>          ds_clear(&match);
>>          ds_put_format(&match, "outport == %s", op->json_key);
>>          ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 100,
>> @@ -8570,7 +8769,7 @@ build_ip_mcast(struct northd_context *ctx, struct hmap *datapaths)
>>          if (!ip_mcast) {
>>              ip_mcast = sbrec_ip_multicast_insert(ctx->ovnsb_txn);
>>          }
>> -        store_mcast_info_for_datapath(ip_mcast, od);
>> +        store_mcast_info_for_switch_datapath(ip_mcast, od);
>>      }
>>
>>      /* Delete southbound records without northbound matches. */
>> @@ -8602,6 +8801,14 @@ build_mcast_groups(struct northd_context *ctx,
>>
>>          if (lsp_is_enabled(op->nbsp)) {
>>              ovn_multicast_add(mcast_groups, &mc_flood, op);
>> +
>> +            /* If this port is connected to a multicast router then add it
>> +             * to the MC_MROUTER_FLOOD group.
>> +             */
>> +            if (op->od->mcast_info.sw.flood_relay && op->peer &&
>> +                    op->peer->od && op->peer->od->mcast_info.rtr.relay) {
>> +                ovn_multicast_add(mcast_groups, &mc_mrouter_flood, op);
>> +            }
>>          }
>>      }
>>
>> @@ -8624,10 +8831,61 @@ build_mcast_groups(struct northd_context *ctx,
>>              continue;
>>          }
>>
>> +        struct in6_addr group_address;
>> +        if (!ovn_igmp_group_get_address(sb_igmp, &group_address)) {
>> +            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>> +            VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
>> +                         sb_igmp->address);
>> +            continue;
>> +        }
>> +
>>          /* Add the IGMP group entry. Will also try to allocate an ID for it
>>           * if the multicast group already exists.
>>           */
>> -        ovn_igmp_group_add(ctx, igmp_groups, od, sb_igmp);
>> +        struct ovn_igmp_group *igmp_group =
>> +            ovn_igmp_group_add(ctx, igmp_groups, od, &group_address,
>> +                               sb_igmp->address);
>> +
>> +        /* Extract the IGMP group ports from the SB entry and store them
>> +         * in the IGMP group.
>> +         */
>> +        size_t n_igmp_ports;
>> +        struct ovn_port **igmp_ports =
>> +            ovn_igmp_group_get_ports(sb_igmp, &n_igmp_ports, ports);
>> +        ovn_igmp_group_add_entry(igmp_group, igmp_ports, n_igmp_ports);
>> +    }
>> +
>> +    /* Build IGMP groups for multicast routers with relay enabled. The router
>> +     * IGMP groups are based on the groups learnt by their multicast enabled
>> +     * peers.
>> +     */
>> +    struct ovn_datapath *od;
>> +    HMAP_FOR_EACH (od, key_node, datapaths) {
>> +
>> +        if (ovs_list_is_empty(&od->mcast_info.groups)) {
>> +            continue;
>> +        }
>> +
>> +        for (size_t i = 0; i < od->n_router_ports; i++) {
>> +            struct ovn_port *router_port = od->router_ports[i]->peer;
>> +
>> +            if (!router_port || !router_port->od ||
>> +                    !router_port->od->mcast_info.rtr.relay) {
>> +                continue;
>> +            }
>> +
>> +            struct ovn_igmp_group *igmp_group;
>> +            LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
>> +                struct ovn_igmp_group *igmp_group_rtr =
>> +                    ovn_igmp_group_add(ctx, igmp_groups, router_port->od,
>> +                                       &igmp_group->address,
>> +                                       igmp_group->mcgroup.name);
>> +                struct ovn_port **router_igmp_ports =
>> +                    xmalloc(sizeof *router_igmp_ports);
>> +                router_igmp_ports[0] = router_port;
>> +                ovn_igmp_group_add_entry(igmp_group_rtr, router_igmp_ports, 1);
>> +            }
>> +        }
>>      }
>>
>>      /* Walk the aggregated IGMP groups and allocate IDs for new entries.
>> @@ -8635,21 +8893,17 @@ build_mcast_groups(struct northd_context *ctx,
>>       */
>>      struct ovn_igmp_group *igmp_group, *igmp_group_next;
>>      HMAP_FOR_EACH_SAFE (igmp_group, igmp_group_next, hmap_node, igmp_groups) {
>> -        if (igmp_group->mcgroup.key == 0) {
>> -            struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
>> -            igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
>> -        }
>>
>> -        /* If we ran out of keys just destroy the entry. */
>> -        if (igmp_group->mcgroup.key == 0) {
>> +        if (!ovn_igmp_group_allocate_id(igmp_group)) {
>> +            /* If we ran out of keys just destroy the entry. */
>>              ovn_igmp_group_destroy(igmp_groups, igmp_group);
>>              continue;
>>          }
>>
>> -        /* Aggregate the ports from all SB entries corresponding to this
>> +        /* Aggregate the ports from all entries corresponding to this
>>           * group.
>>           */
>> -        ovn_igmp_group_aggregate_ports(igmp_group, ports, mcast_groups);
>> +        ovn_igmp_group_aggregate_ports(igmp_group, mcast_groups);
>>      }
>>  }
>>
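
One subtlety worth noting about the switch-side action added earlier in this
file: the clone { outport = "_MC_mrouter_flood"; output; }; prefix is what
lets a single packet reach both the relay router and the local receivers.
clone runs the nested output on a copy of the packet, so the following
assignment of outport to the IGMP group's own multicast group still operates
on the original, unmodified packet.
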
>> diff --git a/ovn-nb.xml b/ovn-nb.xml
>> index f5f10a5..db8cc20 100644
>> --- a/ovn-nb.xml
>> +++ b/ovn-nb.xml
>> @@ -1526,6 +1526,12 @@
>>            address.
>>          </p>
>>        </column>
>> +      <column name="options" key="mcast_relay" type='{"type": "boolean"}'>
>> +        <p>
>> +          Enables/disables IP multicast relay between logical switches
>> +          connected to the logical router. Default: False.
>> +        </p>
>> +      </column>
>>      </group>
>>
>>      <group title="Common Columns">
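
Operationally, relay stays a single per-router knob, i.e. "ovn-nbctl set
logical_router rtr options:mcast_relay=true", which is exactly what the test
changes below do.  One arithmetic detail worth spelling out in the updated
test expectations: the routed copies are stored with TTL 0x1f and IP checksum
0xcb70 where the original packet had 0x20 and 0xca70; decrementing the TTL
lowers the 16-bit header word that carries it by 0x0100, so the checksum
field stored in the header goes up by the same amount (0xca70 + 0x0100 =
0xcb70).
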
>> diff --git a/tests/ovn.at b/tests/ovn.at
>> index 71eb390..52c044c 100644
>> --- a/tests/ovn.at
>> +++ b/tests/ovn.at
>> @@ -14721,12 +14721,12 @@ AT_CHECK([ovn-sbctl get controller_event $uuid seq_num], [0], [dnl
>>  OVN_CLEANUP([hv1], [hv2])
>>  AT_CLEANUP
>>
>> -AT_SETUP([ovn -- IGMP snoop/querier])
>> +AT_SETUP([ovn -- IGMP snoop/querier/relay])
>>  AT_SKIP_IF([test $HAVE_PYTHON = no])
>>  ovn_start
>>
>>  # Logical network:
>> -# Two independent logical switches (sw1 and sw2).
>> +# Three logical switches (sw1-sw3) connected to a logical router (rtr).
>>  # sw1:
>>  #   - subnet 10.0.0.0/8
>>  #   - 2 ports bound on hv1 (sw1-p11, sw1-p12)
>> @@ -14736,6 +14736,10 @@ ovn_start
>>  #   - 1 port bound on hv1 (sw2-p1)
>>  #   - 1 port bound on hv2 (sw2-p2)
>>  #   - IGMP Querier from 20.0.0.254
>> +# sw3:
>> +#   - subnet 30.0.0.0/8
>> +#   - 1 port bound on hv1 (sw3-p1)
>> +#   - 1 port bound on hv2 (sw3-p2)
>>
>>  reset_pcap_file() {
>>      local iface=$1
>> @@ -14812,29 +14816,47 @@ store_igmp_v3_query() {
>>  }
>>
>>  #
>> -# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN
>> -#    IP_PROTO DATA OUTFILE
>> +# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
>> +#    IP_CHKSUM IP_PROTO DATA
>>  #
>>  # This shell function causes an IP multicast packet to be received on INPORT
>>  # of HV.
>>  # The hexdump of the packet is stored in OUTFILE.
>>  #
>>  send_ip_multicast_pkt() {
>> -    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ip_src=$5 ip_dst=$6
>> -    local ip_len=$7 ip_chksum=$8 proto=$9 data=${10} outfile=${11}
>> -
>> -    local ip_ttl=20
>> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4
>> +    local ip_src=$5 ip_dst=$6 ip_len=$7 ip_ttl=$8 ip_chksum=$9 proto=${10}
>> +    local data=${11}
>>
>>      local eth=${eth_dst}${eth_src}0800
>>      local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
>>      local packet=${eth}${ip}${data}
>>
>>      as $hv ovs-appctl netdev-dummy/receive ${inport} ${packet}
>> +}
>> +
>> +#
>> +# store_ip_multicast_pkt ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
>> +#    IP_CHKSUM IP_PROTO DATA OUTFILE
>> +#
>> +# This shell function builds an IP multicast packet and stores the hexdump of
>> +# the packet in OUTFILE.
>> +#
>> +store_ip_multicast_pkt() {
>> +    local eth_src=$1 eth_dst=$2
>> +    local ip_src=$3 ip_dst=$4 ip_len=$5 ip_ttl=$6 ip_chksum=$7 proto=$8
>> +    local data=$9 outfile=${10}
>> +
>> +    local eth=${eth_dst}${eth_src}0800
>> +    local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
>> +    local packet=${eth}${ip}${data}
>> +
>>      echo ${packet} >> ${outfile}
>>  }
>>
>>  ovn-nbctl ls-add sw1
>>  ovn-nbctl ls-add sw2
>> +ovn-nbctl ls-add sw3
>>
>>  ovn-nbctl lsp-add sw1 sw1-p11
>>  ovn-nbctl lsp-add sw1 sw1-p12
>> @@ -14842,6 +14864,26 @@ ovn-nbctl lsp-add sw1 sw1-p21
>>  ovn-nbctl lsp-add sw1 sw1-p22
>>  ovn-nbctl lsp-add sw2 sw2-p1
>>  ovn-nbctl lsp-add sw2 sw2-p2
>> +ovn-nbctl lsp-add sw3 sw3-p1
>> +ovn-nbctl lsp-add sw3 sw3-p2
>> +
>> +ovn-nbctl lr-add rtr
>> +ovn-nbctl lrp-add rtr rtr-sw1 00:00:00:00:01:00 10.0.0.254/24
>> +ovn-nbctl lrp-add rtr rtr-sw2 00:00:00:00:02:00 20.0.0.254/24
>> +ovn-nbctl lrp-add rtr rtr-sw3 00:00:00:00:03:00 30.0.0.254/24
>> +
>> +ovn-nbctl lsp-add sw1 sw1-rtr                      \
>> +    -- lsp-set-type sw1-rtr router                 \
>> +    -- lsp-set-addresses sw1-rtr 00:00:00:00:01:00 \
>> +    -- lsp-set-options sw1-rtr router-port=rtr-sw1
>> +ovn-nbctl lsp-add sw2 sw2-rtr                      \
>> +    -- lsp-set-type sw2-rtr router                 \
>> +    -- lsp-set-addresses sw2-rtr 00:00:00:00:02:00 \
>> +    -- lsp-set-options sw2-rtr router-port=rtr-sw2
>> +ovn-nbctl lsp-add sw3 sw3-rtr                      \
>> +    -- lsp-set-type sw3-rtr router                 \
>> +    -- lsp-set-addresses sw3-rtr 00:00:00:00:03:00 \
>> +    -- lsp-set-options sw3-rtr router-port=rtr-sw3
>>
>>  net_add n1
>>  sim_add hv1
>> @@ -14863,6 +14905,11 @@ ovs-vsctl -- add-port br-int hv1-vif3 -- \
>>      options:tx_pcap=hv1/vif3-tx.pcap \
>>      options:rxq_pcap=hv1/vif3-rx.pcap \
>>      ofport-request=1
>> +ovs-vsctl -- add-port br-int hv1-vif4 -- \
>> +    set interface hv1-vif4 external-ids:iface-id=sw3-p1 \
>> +    options:tx_pcap=hv1/vif4-tx.pcap \
>> +    options:rxq_pcap=hv1/vif4-rx.pcap \
>> +    ofport-request=1
>>
>>  sim_add hv2
>>  as hv2
>> @@ -14883,12 +14930,18 @@ ovs-vsctl -- add-port br-int hv2-vif3 -- \
>>      options:tx_pcap=hv2/vif3-tx.pcap \
>>      options:rxq_pcap=hv2/vif3-rx.pcap \
>>      ofport-request=1
>> +ovs-vsctl -- add-port br-int hv2-vif4 -- \
>> +    set interface hv2-vif4 external-ids:iface-id=sw3-p2 \
>> +    options:tx_pcap=hv2/vif4-tx.pcap \
>> +    options:rxq_pcap=hv2/vif4-rx.pcap \
>> +    ofport-request=1
>>
>>  OVN_POPULATE_ARP
>>
>>  # Enable IGMP snooping on sw1.
>> -ovn-nbctl set Logical_Switch sw1 other_config:mcast_querier="false"
>> -ovn-nbctl set Logical_Switch sw1 other_config:mcast_snoop="true"
>> +ovn-nbctl set Logical_Switch sw1       \
>> +    other_config:mcast_querier="false" \
>> +    other_config:mcast_snoop="true"
>>
>>  # No IGMP query should be generated by sw1 (mcast_querier="false").
>>  truncate -s 0 expected
>> @@ -14921,9 +14974,12 @@ truncate -s 0 expected
>>  truncate -s 0 expected_empty
>>  send_ip_multicast_pkt hv1-vif2 hv1 \
>>      000000000001 01005e000144 \
>> -    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
>> -    e518e518000a3b3a0000 \
>> -    expected
>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> +    e518e518000a3b3a0000
>> +store_ip_multicast_pkt \
>> +    000000000001 01005e000144 \
>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> +    e518e518000a3b3a0000 expected
>>
>>  OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected])
>>  OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
>> @@ -14944,17 +15000,19 @@ OVS_WAIT_UNTIL([
>>      test "${total_entries}" = "1"
>>  ])
>>
>> -# Send traffic traffic and make sure it gets forwarded only on the port that
>> -# joined.
>> +# Send traffic and make sure it gets forwarded only on the port that joined.
>>  as hv1 reset_pcap_file hv1-vif1 hv1/vif1
>>  as hv2 reset_pcap_file hv2-vif1 hv2/vif1
>>  truncate -s 0 expected
>>  truncate -s 0 expected_empty
>>  send_ip_multicast_pkt hv1-vif2 hv1 \
>>      000000000001 01005e000144 \
>> -    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
>> -    e518e518000a3b3a0000 \
>> -    expected
>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> +    e518e518000a3b3a0000
>> +store_ip_multicast_pkt \
>> +    000000000001 01005e000144 \
>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> +    e518e518000a3b3a0000 expected
>>
>>  OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
>>  OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
>> @@ -14988,6 +15046,111 @@ sleep 1
>>  OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected])
>>  OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
>>
>> +# Disable IGMP querier on sw2.
>> +ovn-nbctl set Logical_Switch sw2 \
>> +    other_config:mcast_querier="false"
>> +
>> +# Enable IGMP snooping on sw3.
>> +ovn-nbctl set Logical_Switch sw3       \
>> +    other_config:mcast_querier="false" \
>> +    other_config:mcast_snoop="true"
>> +
>> +# Send traffic from sw3 and make sure rtr doesn't relay it.
>> +truncate -s 0 expected_empty
>> +
>> +as hv1 reset_pcap_file hv1-vif1 hv1/vif1
>> +as hv1 reset_pcap_file hv1-vif2 hv1/vif2
>> +as hv1 reset_pcap_file hv1-vif3 hv1/vif3
>> +as hv1 reset_pcap_file hv1-vif4 hv1/vif4
>> +as hv2 reset_pcap_file hv2-vif1 hv2/vif1
>> +as hv2 reset_pcap_file hv2-vif2 hv2/vif2
>> +as hv2 reset_pcap_file hv2-vif3 hv2/vif3
>> +as hv2 reset_pcap_file hv2-vif4 hv2/vif4
>> +
>> +send_ip_multicast_pkt hv2-vif4 hv2 \
>> +    000000000001 01005e000144 \
>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> +    e518e518000a3b3a0000
>> +
>> +# Sleep a bit to make sure no traffic is received and then check.
>> +sleep 1
>> +OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
>> +OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_empty])
>> +OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_empty])
>> +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
>> +OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
>> +OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
>> +OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
>> +OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
>> +
>> +# Enable IGMP relay on rtr
>> +ovn-nbctl set logical_router rtr \
>> +    options:mcast_relay="true"
>> +
>> +# Inject IGMP Join for 239.0.1.68 on sw1-p11.
>> +send_igmp_v3_report hv1-vif1 hv1 \
>> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
>> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
>> +    /dev/null
>> +# Inject IGMP Join for 239.0.1.68 on sw2-p2.
>> +send_igmp_v3_report hv2-vif3 hv2 \
>> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
>> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
>> +    /dev/null
>> +# Inject IGMP Join for 239.0.1.68 on sw3-p1.
>> +send_igmp_v3_report hv1-vif4 hv1 \
>> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
>> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
>> +    /dev/null
>> +
>> +# Check that the IGMP Group is learned by all switches.
>> +OVS_WAIT_UNTIL([
>> +    total_entries=`ovn-sbctl find IGMP_Group | grep "239.0.1.68" | wc -l`
>> +    test "${total_entries}" = "3"
>> +])
>> +
>> +# Send traffic from sw3 and make sure it is relayed by rtr to the
>> +# ports that joined.
>> +truncate -s 0 expected_routed_sw1
>> +truncate -s 0 expected_routed_sw2
>> +truncate -s 0 expected_switched
>> +truncate -s 0 expected_empty
>> +
>> +as hv1 reset_pcap_file hv1-vif1 hv1/vif1
>> +as hv1 reset_pcap_file hv1-vif2 hv1/vif2
>> +as hv1 reset_pcap_file hv1-vif3 hv1/vif3
>> +as hv1 reset_pcap_file hv1-vif4 hv1/vif4
>> +as hv2 reset_pcap_file hv2-vif1 hv2/vif1
>> +as hv2 reset_pcap_file hv2-vif2 hv2/vif2
>> +as hv2 reset_pcap_file hv2-vif3 hv2/vif3
>> +as hv2 reset_pcap_file hv2-vif4 hv2/vif4
>> +
>> +send_ip_multicast_pkt hv2-vif4 hv2 \
>> +    000000000001 01005e000144 \
>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> +    e518e518000a3b3a0000
>> +store_ip_multicast_pkt \
>> +    000000000100 01005e000144 \
>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
>> +    e518e518000a3b3a0000 expected_routed_sw1
>> +store_ip_multicast_pkt \
>> +    000000000200 01005e000144 \
>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
>> +    e518e518000a3b3a0000 expected_routed_sw2
>> +store_ip_multicast_pkt \
>> +    000000000001 01005e000144 \
>> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> +    e518e518000a3b3a0000 expected_switched
>> +
>> +OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_routed_sw1])
>> +OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_routed_sw2])
>> +OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_switched])
>> +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
>> +OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
>> +OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
>> +OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
>> +OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
>> +
>>  OVN_CLEANUP([hv1], [hv2])
>>  AT_CLEANUP
>>
>> --
>> 1.8.3.1
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Numan Siddique Aug. 16, 2019, 12:09 p.m. UTC | #3
On Fri, Aug 16, 2019 at 5:26 PM Dumitru Ceara <dceara@redhat.com> wrote:

> On Fri, Aug 16, 2019 at 12:23 PM Numan Siddique <nusiddiq@redhat.com>
> wrote:
> >
> >
> >
> > On Mon, Aug 12, 2019 at 5:32 PM Dumitru Ceara <dceara@redhat.com> wrote:
> >>
> >> Add a new configuration option 'mcast_relay' to the Logical_Router:options
> >> in the OVN Northbound database.
> >>
> >> If a router is configured with 'mcast_relay' enabled then ovn-northd
> >> will install Logical_Flows to allow IP multicast traffic to be routed
> >> between Logical_Switches. The logical router will aggregate all IGMP
> >> groups from attached logical switches and modify the routing pipeline in
> >> the following way:
> >> - Table S_ROUTER_IN_IP_INPUT: add flow allowing IP multicast traffic
> >>   if mcast_relay is enabled on the datapath.
> >> - Table S_ROUTER_IN_IP_ROUTING: add flow matching the group address,
> >>   update TTL and set outport="<Multicast_Group> associated with the
> >>   IGMP group". Continue to next table.
> >> - Table S_ROUTER_IN_ARP_RESOLVE: bypass ARP resolve for IP multicast
> >>   traffic and continue to next table.
> >> - Table S_ROUTER_OUT_DELIVERY: add flow matching IP multicast traffic
> >>   and set ETH.SRC to the MAC address of the logical port on which
> >>   traffic is forwarded.
> >>
> >> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
> >> Acked-by: Mark Michelson <mmichels@redhat.com>
> >
> >
> > Hi Dumitru,
> >
> > Just a minor comment. Please see below. Can you please update the patch?
> > LGTM otherwise.
>
> Hi Numan,
>
> Thanks for the review, I'll send a v4 but I have a follow up to your
> comment below.
>
> Thanks,
> Dumitru
>
> >
> > Thanks
> > Numan
> >
> >>
> >>
> >> ---
> >> v3:
> >> - Address Mark's comment and move setting of the outport in the IP
> >>   Routing stage.
> >> - Update commit message.
> >> - Fix some typos.
> >> v2:
> >> - Optimize flooding to multicast router ports.
> >> - Fix check for source IP multicast in router pipeline.
> >> - Use an enum for OVN_MCAST_*_KEY definitions to avoid hard to debug
> >>   errors due to typos when adding new OVN_MCAST_*_KEY values.
> >> - Fix ovn-northd man page for IGMP.
> >> ---
> >>  NEWS                    |   1 +
> >>  lib/mcast-group-index.h |  13 +-
> >>  northd/ovn-northd.8.xml |  79 +++++++-
> >>  northd/ovn-northd.c     | 504
> ++++++++++++++++++++++++++++++++++++------------
> >>  ovn-nb.xml              |   6 +
> >>  tests/ovn.at            | 199 +++++++++++++++++--
> >>  6 files changed, 651 insertions(+), 151 deletions(-)
> >>
> >> diff --git a/NEWS b/NEWS
> >> index f476984..73045d6 100644
> >> --- a/NEWS
> >> +++ b/NEWS
> >> @@ -39,6 +39,7 @@ Post-v2.11.0
> >>         logical groups which results in tunnels only been formed between
> >>         members of the same transport zone(s).
> >>       * Support for new logical switch port type - 'virtual'.
> >> +     * Support for IGMP Snooping/Querier and Relay.
> >>     - New QoS type "linux-netem" on Linux.
> >>     - Added support for TLS Server Name Indication (SNI).
> >>
> >> diff --git a/lib/mcast-group-index.h b/lib/mcast-group-index.h
> >> index 15a1592..cb49ad7 100644
> >> --- a/lib/mcast-group-index.h
> >> +++ b/lib/mcast-group-index.h
> >> @@ -20,8 +20,17 @@ struct ovsdb_idl;
> >>
> >>  struct sbrec_datapath_binding;
> >>
> >> -#define OVN_MCAST_FLOOD_TUNNEL_KEY   65535
> >> -#define OVN_MCAST_UNKNOWN_TUNNEL_KEY (OVN_MCAST_FLOOD_TUNNEL_KEY - 1)
> >> +#define OVN_MIN_MULTICAST 32768
> >> +#define OVN_MAX_MULTICAST 65535
> >> +
> >> +enum ovn_mcast_tunnel_keys {
> >> +
> >> +    OVN_MCAST_FLOOD_TUNNEL_KEY = OVN_MIN_MULTICAST,
> >> +    OVN_MCAST_UNKNOWN_TUNNEL_KEY,
> >> +    OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY,
> >> +    OVN_MIN_IP_MULTICAST,
> >> +    OVN_MAX_IP_MULTICAST = OVN_MAX_MULTICAST,
> >> +};
> >>
> >>  struct ovsdb_idl_index *mcast_group_index_create(struct ovsdb_idl *);
> >>  const struct sbrec_multicast_group *
> >> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
> >> index 6d2fbe3..d45bb15 100644
> >> --- a/northd/ovn-northd.8.xml
> >> +++ b/northd/ovn-northd.8.xml
> >> @@ -947,10 +947,40 @@ output;
> >>
> >>      <ul>
> >>        <li>
> >> -        A priority-100 flow that outputs all packets with an Ethernet
> broadcast
> >> +        A priority-100 flow that punts all IGMP packets to
> >> +        <code>ovn-controller</code> if IGMP snooping is enabled on the
> >> +        logical switch.
> >> +      </li>
> >> +
> >> +      <li>
> >> +        Priority-90 flows that forward registered IP multicast traffic
> to
> >> +        their corresponding multicast group, which
> <code>ovn-northd</code>
> >> +        creates based on learnt <ref table="IGMP_Group"
> db="OVN_Southbound"/>
> >> +        entries.  The flows also forward packets to the
> >> +        <code>MC_MROUTER_FLOOD</code> multicast group, which
> >> +        <code>ovn-northd</code> populates with all the logical ports
> that
> >> +        are connected to logical routers with
> >> +        <ref column="options"
> table="Logical_Router"/>:mcast_relay='true'.
> >> +      </li>
> >> +
> >> +      <li>
> >> +        A priority-85 flow that forwards all IP multicast traffic
> destined to
> >> +        224.0.0.X to the <code>MC_FLOOD</code> multicast group, which
> >> +        <code>ovn-northd</code> populates with all enabled logical
> ports.
> >> +      </li>
> >> +
> >> +      <li>
> >> +        A priority-80 flow that forwards all unregistered IP multicast
> traffic
> >> +        to the <code>MC_MROUTER_FLOOD</code> multicast group, if any.
> >> +        Otherwise the flow drops all unregistered IP multicast
> packets.  This
> >> +        flow is added only if <ref column="other_config"
> >> +        table="Logical_Switch"/>:mcast_flood_unregistered='false'.
> >> +      </li>
> >> +
> >> +      <li>
> >> +        A priority-70 flow that outputs all packets with an Ethernet
> broadcast
> >>          or multicast <code>eth.dst</code> to the <code>MC_FLOOD</code>
> >> -        multicast group, which <code>ovn-northd</code> populates with
> all
> >> -        enabled logical ports.
> >> +        multicast group.
> >>        </li>
> >>
> >>        <li>
> >> @@ -1228,6 +1258,14 @@ output;
> >>
> >>        <li>
> >>          <p>
> >> +          A priority-95 flow allows IP multicast traffic if
> >> +          <ref column="options"
> table="Logical_Router"/>:mcast_relay='true',
> >> +          otherwise drops it.
> >> +        </p>
> >> +      </li>
> >> +
> >> +      <li>
> >> +        <p>
> >>            ICMP echo reply.  These flows reply to ICMP echo requests
> received
> >>            for the router's IP address.  Let <var>A</var> be an IP
> address
> >>            owned by a router port.  Then, for each <var>A</var> that is
> >> @@ -1941,6 +1979,16 @@ output;
> >>      <ul>
> >>        <li>
> >>          <p>
> >> +          Priority-500 flows that match IP multicast traffic destined to
> >> +          groups registered on any of the attached switches and set
> >> +          <code>outport</code> to the associated multicast group that will
> >> +          eventually flood the traffic to all interested attached logical
> >> +          switches. The flows also decrement TTL.
> >> +        </p>
> >> +      </li>
> >> +
> >> +      <li>
> >> +        <p>
> >>            For distributed logical routers where one of the logical router
> >>            ports specifies a <code>redirect-chassis</code>, a priority-400
> >>            logical flow for each ip source/destination couple that matches the
> >> @@ -2074,6 +2122,15 @@ next;
> >>      <ul>
> >>        <li>
> >>          <p>
> >> +          A priority-500 flow that matches IP multicast traffic that was
> >> +          allowed in the routing pipeline. For this kind of traffic the
> >> +          <code>outport</code> was already set so the flow just advances to
> >> +          the next table.
> >> +        </p>
> >> +      </li>
> >> +
> >> +      <li>
> >> +        <p>
> >>            For distributed logical routers where one of the logical router
> >>            ports specifies a <code>redirect-chassis</code>, a priority-400
> >>            logical flow with match <code>REGBIT_DISTRIBUTED_NAT == 1</code>
> >> @@ -2641,9 +2698,19 @@ clone {
> >>      <h3>Egress Table 3: Delivery</h3>
> >>
> >>      <p>
> >> -      Packets that reach this table are ready for delivery.  It contains
> >> -      priority-100 logical flows that match packets on each enabled logical
> >> -      router port, with action <code>output;</code>.
> >> +      Packets that reach this table are ready for delivery.  It contains:
> >> +      <ul>
> >> +        <li>
> >> +          Priority-110 logical flows that match IP multicast packets on each
> >> +          enabled logical router port and modify the Ethernet source address
> >> +          of the packets to the Ethernet address of the port and then execute
> >> +          action <code>output;</code>.
> >> +        </li>
> >> +        <li>
> >> +          Priority-100 logical flows that match packets on each enabled
> >> +          logical router port, with action <code>output;</code>.
> >> +        </li>
> >> +      </ul>
> >>      </p>
> >>
> >>  </manpage>
> >> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
> >> index e6953a4..9ee9230 100644
> >> --- a/northd/ovn-northd.c
> >> +++ b/northd/ovn-northd.c
> >> @@ -433,32 +433,52 @@ struct ipam_info {
> >>      bool mac_only;
> >>  };
> >>
> >> -#define OVN_MIN_MULTICAST 32768
> >> -#define OVN_MAX_MULTICAST OVN_MCAST_FLOOD_TUNNEL_KEY
> >> -BUILD_ASSERT_DECL(OVN_MIN_MULTICAST < OVN_MAX_MULTICAST);
> >> -
> >> -#define OVN_MIN_IP_MULTICAST OVN_MIN_MULTICAST
> >> -#define OVN_MAX_IP_MULTICAST (OVN_MCAST_UNKNOWN_TUNNEL_KEY - 1)
> >> -BUILD_ASSERT_DECL(OVN_MAX_IP_MULTICAST >= OVN_MIN_MULTICAST);
> >> -
> >>  /*
> >>   * Multicast snooping and querier per datapath configuration.
> >>   */
> >> +struct mcast_switch_info {
> >> +
> >> +    bool enabled;               /* True if snooping enabled. */
> >> +    bool querier;               /* True if querier enabled. */
> >> +    bool flood_unregistered;    /* True if unregistered multicast should be
> >> +                                 * flooded.
> >> +                                 */
> >> +    bool flood_relay;           /* True if the switch is connected to a
> >> +                                 * multicast router and unregistered multicast
> >> +                                 * should be flooded to the mrouter. Only
> >> +                                 * applicable if flood_unregistered == false.
> >> +                                 */
> >> +
> >> +    int64_t table_size;         /* Max number of IP multicast groups. */
> >> +    int64_t idle_timeout;       /* Timeout after which an idle group is
> >> +                                 * flushed.
> >> +                                 */
> >> +    int64_t query_interval;     /* Interval between multicast queries. */
> >> +    char *eth_src;              /* ETH src address of the multicast queries. */
> >> +    char *ipv4_src;             /* IP src address of the multicast queries. */
> >> +    int64_t query_max_response; /* Expected time after which reports should
> >> +                                 * be received for queries that were sent out.
> >> +                                 */
> >> +
> >> +    uint32_t active_flows;      /* Current number of active IP multicast
> >> +                                 * flows.
> >> +                                 */
> >> +};
> >> +
> >> +struct mcast_router_info {
> >> +    bool relay; /* True if the router should relay IP multicast. */
> >> +};
> >> +
> >>  struct mcast_info {
> >> -    bool enabled;
> >> -    bool querier;
> >> -    bool flood_unregistered;
> >> -
> >> -    int64_t table_size;
> >> -    int64_t idle_timeout;
> >> -    int64_t query_interval;
> >> -    char *eth_src;
> >> -    char *ipv4_src;
> >> -    int64_t  query_max_response;
> >> -
> >> -    struct hmap group_tnlids;
> >> -    uint32_t group_tnlid_hint;
> >> -    uint32_t active_flows;
> >> +
> >> +    struct hmap group_tnlids;  /* Group tunnel IDs in use on this DP. */
> >> +    uint32_t group_tnlid_hint; /* Hint for allocating next group tunnel ID. */
> >> +    struct ovs_list groups;    /* List of groups learnt on this DP. */
> >> +
> >> +    union {
> >> +        struct mcast_switch_info sw;  /* Switch specific multicast info. */
> >> +        struct mcast_router_info rtr; /* Router specific multicast info. */
> >> +    };
> >>  };
> >>
> >>  static uint32_t
> >> @@ -559,6 +579,7 @@ ovn_datapath_create(struct hmap *datapaths, const struct uuid *key,
> >>  }
> >>
> >>  static void ovn_ls_port_group_destroy(struct hmap *nb_pgs);
> >> +static void destroy_mcast_info_for_datapath(struct ovn_datapath *od);
> >>
> >>  static void
> >>  ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
> >> @@ -572,12 +593,7 @@ ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
> >>          bitmap_free(od->ipam_info.allocated_ipv4s);
> >>          free(od->router_ports);
> >>          ovn_ls_port_group_destroy(&od->nb_pgs);
> >> -
> >> -        if (od->nbs) {
> >> -            free(od->mcast_info.eth_src);
> >> -            free(od->mcast_info.ipv4_src);
> >> -            destroy_tnlids(&od->mcast_info.group_tnlids);
> >> -        }
> >> +        destroy_mcast_info_for_datapath(od);
> >>
> >>          free(od);
> >>      }
> >> @@ -714,23 +730,28 @@ init_ipam_info_for_datapath(struct ovn_datapath *od)
> >>  }
> >>
> >>  static void
> >> -init_mcast_info_for_datapath(struct ovn_datapath *od)
> >> +init_mcast_info_for_router_datapath(struct ovn_datapath *od)
> >>  {
> >> -    if (!od->nbs) {
> >> -        return;
> >> -    }
> >> +    struct mcast_router_info *mcast_rtr_info = &od->mcast_info.rtr;
> >>
> >> -    struct mcast_info *mcast_info = &od->mcast_info;
> >> +    mcast_rtr_info->relay = smap_get_bool(&od->nbr->options, "mcast_relay",
> >> +                                          false);
> >> +}
> >>
> >> -    mcast_info->enabled =
> >> +static void
> >> +init_mcast_info_for_switch_datapath(struct ovn_datapath *od)
> >> +{
> >> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
> >> +
> >> +    mcast_sw_info->enabled =
> >>          smap_get_bool(&od->nbs->other_config, "mcast_snoop", false);
> >> -    mcast_info->querier =
> >> +    mcast_sw_info->querier =
> >>          smap_get_bool(&od->nbs->other_config, "mcast_querier", true);
> >> -    mcast_info->flood_unregistered =
> >> +    mcast_sw_info->flood_unregistered =
> >>          smap_get_bool(&od->nbs->other_config, "mcast_flood_unregistered",
> >>                        false);
> >>
> >> -    mcast_info->table_size =
> >> +    mcast_sw_info->table_size =
> >>          smap_get_ullong(&od->nbs->other_config, "mcast_table_size",
> >>                          OVN_MCAST_DEFAULT_MAX_ENTRIES);
> >>
> >> @@ -742,54 +763,94 @@ init_mcast_info_for_datapath(struct ovn_datapath *od)
> >>      } else if (idle_timeout > OVN_MCAST_MAX_IDLE_TIMEOUT_S) {
> >>          idle_timeout = OVN_MCAST_MAX_IDLE_TIMEOUT_S;
> >>      }
> >> -    mcast_info->idle_timeout = idle_timeout;
> >> +    mcast_sw_info->idle_timeout = idle_timeout;
> >>
> >>      uint32_t query_interval =
> >>          smap_get_ullong(&od->nbs->other_config, "mcast_query_interval",
> >> -                        mcast_info->idle_timeout / 2);
> >> +                        mcast_sw_info->idle_timeout / 2);
> >>      if (query_interval < OVN_MCAST_MIN_QUERY_INTERVAL_S) {
> >>          query_interval = OVN_MCAST_MIN_QUERY_INTERVAL_S;
> >>      } else if (query_interval > OVN_MCAST_MAX_QUERY_INTERVAL_S) {
> >>          query_interval = OVN_MCAST_MAX_QUERY_INTERVAL_S;
> >>      }
> >> -    mcast_info->query_interval = query_interval;
> >> +    mcast_sw_info->query_interval = query_interval;
> >>
> >> -    mcast_info->eth_src =
> >> +    mcast_sw_info->eth_src =
> >>          nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_eth_src"));
> >> -    mcast_info->ipv4_src =
> >> +    mcast_sw_info->ipv4_src =
> >>          nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_ip4_src"));
> >>
> >> -    mcast_info->query_max_response =
> >> +    mcast_sw_info->query_max_response =
> >>          smap_get_ullong(&od->nbs->other_config, "mcast_query_max_response",
> >>                          OVN_MCAST_DEFAULT_QUERY_MAX_RESPONSE_S);
> >>
> >> -    hmap_init(&mcast_info->group_tnlids);
> >> -    mcast_info->group_tnlid_hint = OVN_MIN_IP_MULTICAST;
> >> -    mcast_info->active_flows = 0;
> >> +    mcast_sw_info->active_flows = 0;
> >> +}
> >> +
> >> +static void
> >> +init_mcast_info_for_datapath(struct ovn_datapath *od)
> >> +{
> >> +    if (!od->nbr && !od->nbs) {
> >> +        return;
> >> +    }
> >> +
> >> +    hmap_init(&od->mcast_info.group_tnlids);
> >> +    od->mcast_info.group_tnlid_hint = OVN_MIN_IP_MULTICAST;
> >> +    ovs_list_init(&od->mcast_info.groups);
> >> +
> >> +    if (od->nbs) {
> >> +        init_mcast_info_for_switch_datapath(od);
> >> +    } else {
> >> +        init_mcast_info_for_router_datapath(od);
> >> +    }
> >> +}
> >> +
> >> +static void
> >> +destroy_mcast_info_for_switch_datapath(struct ovn_datapath *od)
> >> +{
> >> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
> >> +
> >> +    free(mcast_sw_info->eth_src);
> >> +    free(mcast_sw_info->ipv4_src);
> >> +}
> >> +
> >> +static void
> >> +destroy_mcast_info_for_datapath(struct ovn_datapath *od)
> >> +{
> >> +    if (!od->nbr && !od->nbs) {
> >> +        return;
> >> +    }
> >> +
> >> +    if (od->nbs) {
> >> +        destroy_mcast_info_for_switch_datapath(od);
> >> +    }
> >> +
> >> +    destroy_tnlids(&od->mcast_info.group_tnlids);
> >>  }
> >>
> >>  static void
> >> -store_mcast_info_for_datapath(const struct sbrec_ip_multicast *sb,
> >> -                              struct ovn_datapath *od)
> >> +store_mcast_info_for_switch_datapath(const struct sbrec_ip_multicast *sb,
> >> +                                     struct ovn_datapath *od)
> >>  {
> >> -    struct mcast_info *mcast_info = &od->mcast_info;
> >> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
> >>
> >>      sbrec_ip_multicast_set_datapath(sb, od->sb);
> >> -    sbrec_ip_multicast_set_enabled(sb, &mcast_info->enabled, 1);
> >> -    sbrec_ip_multicast_set_querier(sb, &mcast_info->querier, 1);
> >> -    sbrec_ip_multicast_set_table_size(sb, &mcast_info->table_size, 1);
> >> -    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_info->idle_timeout, 1);
> >> +    sbrec_ip_multicast_set_enabled(sb, &mcast_sw_info->enabled, 1);
> >> +    sbrec_ip_multicast_set_querier(sb, &mcast_sw_info->querier, 1);
> >> +    sbrec_ip_multicast_set_table_size(sb, &mcast_sw_info->table_size, 1);
> >> +    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_sw_info->idle_timeout, 1);
> >>      sbrec_ip_multicast_set_query_interval(sb,
> >> -                                          &mcast_info->query_interval, 1);
> >> +                                          &mcast_sw_info->query_interval, 1);
> >>      sbrec_ip_multicast_set_query_max_resp(sb,
> >> -                                          &mcast_info->query_max_response, 1);
> >> +                                          &mcast_sw_info->query_max_response,
> >> +                                          1);
> >>
> >> -    if (mcast_info->eth_src) {
> >> -        sbrec_ip_multicast_set_eth_src(sb, mcast_info->eth_src);
> >> +    if (mcast_sw_info->eth_src) {
> >> +        sbrec_ip_multicast_set_eth_src(sb, mcast_sw_info->eth_src);
> >>      }
> >>
> >> -    if (mcast_info->ipv4_src) {
> >> -        sbrec_ip_multicast_set_ip4_src(sb, mcast_info->ipv4_src);
> >> +    if (mcast_sw_info->ipv4_src) {
> >> +        sbrec_ip_multicast_set_ip4_src(sb, mcast_sw_info->ipv4_src);
> >>      }
> >>  }
> >>
> >> @@ -906,6 +967,7 @@ join_datapaths(struct northd_context *ctx, struct hmap *datapaths,
> >>                                       NULL, nbr, NULL);
> >>              ovs_list_push_back(nb_only, &od->list);
> >>          }
> >> +        init_mcast_info_for_datapath(od);
> >>          ovs_list_push_back(lr_list, &od->lr_list);
> >>      }
> >>  }
> >> @@ -1999,6 +2061,13 @@ join_logical_ports(struct northd_context *ctx,
> >>                      break;
> >>                  }
> >>              }
> >> +
> >> +            /* If the router is multicast enabled then set relay on the switch
> >> +             * datapath.
> >> +             */
> >> +            if (peer->od && peer->od->mcast_info.rtr.relay) {
> >> +                op->od->mcast_info.sw.flood_relay = true;
> >> +            }
> >>          } else if (op->nbrp && op->nbrp->peer && !op->derived) {
> >>              struct ovn_port *peer = ovn_port_find(ports, op->nbrp->peer);
> >>              if (peer) {
> >> @@ -2846,6 +2915,10 @@ struct multicast_group {
> >>  static const struct multicast_group mc_flood =
> >>      { MC_FLOOD, OVN_MCAST_FLOOD_TUNNEL_KEY };
> >>
> >> +#define MC_MROUTER_FLOOD "_MC_mrouter_flood"
> >> +static const struct multicast_group mc_mrouter_flood =
> >> +    { MC_MROUTER_FLOOD, OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY };
> >> +
> >>  #define MC_UNKNOWN "_MC_unknown"
> >>  static const struct multicast_group mc_unknown =
> >>      { MC_UNKNOWN, OVN_MCAST_UNKNOWN_TUNNEL_KEY };
> >> @@ -2955,7 +3028,8 @@ ovn_multicast_update_sbrec(const struct ovn_multicast *mc,
> >>   */
> >>  struct ovn_igmp_group_entry {
> >>      struct ovs_list list_node; /* Linkage in the list of entries. */
> >> -    const struct sbrec_igmp_group *sb;
> >> +    size_t n_ports;
> >> +    struct ovn_port **ports;
> >>  };
> >>
> >>  /*
> >> @@ -2964,12 +3038,13 @@ struct ovn_igmp_group_entry {
> >>   */
> >>  struct ovn_igmp_group {
> >>      struct hmap_node hmap_node; /* Index on 'datapath' and 'address'. */
> >> +    struct ovs_list list_node;  /* Linkage in the per-dp igmp group list. */
> >>
> >>      struct ovn_datapath *datapath;
> >>      struct in6_addr address; /* Multicast IPv6-mapped-IPv4 or IPv4 address. */
> >>      struct multicast_group mcgroup;
> >>
> >> -    struct ovs_list sb_entries; /* List of SB entries for this group. */
> >> +    struct ovs_list entries; /* List of SB entries for this group. */
> >>  };
> >>
> >>  static uint32_t
> >> @@ -2997,77 +3072,120 @@ ovn_igmp_group_find(struct hmap *igmp_groups,
> >>      return NULL;
> >>  }
> >>
> >> -static void
> >> +static struct ovn_igmp_group *
> >>  ovn_igmp_group_add(struct northd_context *ctx, struct hmap *igmp_groups,
> >>                     struct ovn_datapath *datapath,
> >> -                   const struct sbrec_igmp_group *sb_igmp_group)
> >> +                   const struct in6_addr *address,
> >> +                   const char *address_s)
> >>  {
> >> -    struct in6_addr group_address;
> >> -    ovs_be32 ipv4;
> >> -
> >> -    if (ip_parse(sb_igmp_group->address, &ipv4)) {
> >> -        group_address = in6_addr_mapped_ipv4(ipv4);
> >> -    } else if (!ipv6_parse(sb_igmp_group->address, &group_address)) {
> >> -        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
> >> -        VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
> >> -                     sb_igmp_group->address);
> >> -        return;
> >> -    }
> >> -
> >>      struct ovn_igmp_group *igmp_group =
> >> -        ovn_igmp_group_find(igmp_groups, datapath, &group_address);
> >> +        ovn_igmp_group_find(igmp_groups, datapath, address);
> >>
> >>      if (!igmp_group) {
> >>          igmp_group = xmalloc(sizeof *igmp_group);
> >>
> >>          const struct sbrec_multicast_group *mcgroup =
> >> -            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp,
> >> -                               sb_igmp_group->address, datapath->sb);
> >> +            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp, address_s,
> >> +                               datapath->sb);
> >>
> >>          igmp_group->datapath = datapath;
> >> -        igmp_group->address = group_address;
> >> +        igmp_group->address = *address;
> >>          if (mcgroup) {
> >>              igmp_group->mcgroup.key = mcgroup->tunnel_key;
> >>              add_tnlid(&datapath->mcast_info.group_tnlids, mcgroup->tunnel_key);
> >>          } else {
> >>              igmp_group->mcgroup.key = 0;
> >>          }
> >> -        igmp_group->mcgroup.name = sb_igmp_group->address;
> >> -        ovs_list_init(&igmp_group->sb_entries);
> >> +        igmp_group->mcgroup.name = address_s;
> >> +        ovs_list_init(&igmp_group->entries);
> >>
> >>          hmap_insert(igmp_groups, &igmp_group->hmap_node,
> >> -                    ovn_igmp_group_hash(datapath, &group_address));
> >> +                    ovn_igmp_group_hash(datapath, address));
> >> +        ovs_list_push_back(&datapath->mcast_info.groups,
> >> +                           &igmp_group->list_node);
> >> +    }
> >> +
> >> +    return igmp_group;
> >> +}
> >> +
> >> +static bool
> >> +ovn_igmp_group_get_address(const struct sbrec_igmp_group *sb_igmp_group,
> >> +                           struct in6_addr *address)
> >> +{
> >> +    ovs_be32 ipv4;
> >> +
> >> +    if (ip_parse(sb_igmp_group->address, &ipv4)) {
> >> +        *address = in6_addr_mapped_ipv4(ipv4);
> >> +        return true;
> >> +    }
> >> +    if (!ipv6_parse(sb_igmp_group->address, address)) {
> >> +        return false;
> >>      }
> >> +    return true;
> >> +}
> >>
> >> +static struct ovn_port **
> >> +ovn_igmp_group_get_ports(const struct sbrec_igmp_group *sb_igmp_group,
> >> +                         size_t *n_ports, struct hmap *ovn_ports)
> >> +{
> >> +    struct ovn_port **ports = xmalloc(sb_igmp_group->n_ports * sizeof *ports);
> >> +
> >> +     *n_ports = 0;
> >> +     for (size_t i = 0; i < sb_igmp_group->n_ports; i++) {
> >> +        ports[(*n_ports)] =
> >> +            ovn_port_find(ovn_ports, sb_igmp_group->ports[i]->logical_port);
> >> +        if (ports[(*n_ports)]) {
> >> +            (*n_ports)++;
> >> +        }
> >> +    }
> >> +
> >> +    return ports;
> >> +}
> >> +
> >> +static void
> >> +ovn_igmp_group_add_entry(struct ovn_igmp_group *igmp_group,
> >> +                         struct ovn_port **ports, size_t n_ports)
> >> +{
> >>      struct ovn_igmp_group_entry *entry = xmalloc(sizeof *entry);
> >>
> >> -    entry->sb = sb_igmp_group;
> >> -    ovs_list_push_back(&igmp_group->sb_entries , &entry->list_node);
> >> +    entry->ports = ports;
> >> +    entry->n_ports = n_ports;
> >> +    ovs_list_push_back(&igmp_group->entries, &entry->list_node);
> >> +}
> >> +
> >> +static void
> >> +ovn_igmp_group_destroy_entry(struct ovn_igmp_group_entry *entry)
> >> +{
> >> +    free(entry->ports);
> >> +}
> >> +
> >> +static bool
> >> +ovn_igmp_group_allocate_id(struct ovn_igmp_group *igmp_group)
> >> +{
> >> +    if (igmp_group->mcgroup.key == 0) {
> >> +        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
> >> +        igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
> >> +    }
> >> +
> >> +    if (igmp_group->mcgroup.key == 0) {
> >> +        return false;
> >> +    }
> >> +
> >> +    return true;
> >>  }
> >>
> >>  static void
> >>  ovn_igmp_group_aggregate_ports(struct ovn_igmp_group *igmp_group,
> >> -                               struct hmap *ovn_ports,
> >>                                 struct hmap *mcast_groups)
> >>  {
> >>      struct ovn_igmp_group_entry *entry;
> >>
> >> -    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
> >> -        size_t n_oports = 0;
> >> -        struct ovn_port **oports =
> >> -            xmalloc(entry->sb->n_ports * sizeof *oports);
> >> -
> >> -        for (size_t i = 0; i < entry->sb->n_ports; i++) {
> >> -            oports[n_oports] =
> >> -                ovn_port_find(ovn_ports, entry->sb->ports[i]->logical_port);
> >> -            if (oports[n_oports]) {
> >> -                n_oports++;
> >> -            }
> >> -        }
> >> -
> >> +    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
> >>          ovn_multicast_add_ports(mcast_groups, igmp_group->datapath,
> >> -                                &igmp_group->mcgroup, oports, n_oports);
> >> -        free(oports);
> >> +                                &igmp_group->mcgroup, entry->ports,
> >> +                                entry->n_ports);
> >> +
> >> +        ovn_igmp_group_destroy_entry(entry);
> >>          free(entry);
> >>      }
> >>  }
> >> @@ -3079,10 +3197,12 @@ ovn_igmp_group_destroy(struct hmap *igmp_groups,
> >>      if (igmp_group) {
> >>          struct ovn_igmp_group_entry *entry;
> >>
> >> -        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
> >> +        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
> >> +            ovn_igmp_group_destroy_entry(entry);
> >>              free(entry);
> >>          }
> >>          hmap_remove(igmp_groups, &igmp_group->hmap_node);
> >> +        ovs_list_remove(&igmp_group->list_node);
> >>          free(igmp_group);
> >>      }
> >>  }
> >> @@ -5282,7 +5402,9 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
> >>              continue;
> >>          }
> >>
> >> -        if (od->mcast_info.enabled) {
> >> +        struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
> >> +
> >> +        if (mcast_sw_info->enabled) {
> >>              /* Punt IGMP traffic to controller. */
> >>              ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 100,
> >>                            "ip4 && ip.proto == 2", "igmp;");
> >> @@ -5295,9 +5417,16 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
> >>                            "outport = \""MC_FLOOD"\"; output;");
> >>
> >>              /* Drop unregistered IP multicast if not allowed. */
> >> -            if (!od->mcast_info.flood_unregistered) {
> >> -                ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
> >> -                              "ip4 && ip4.mcast", "drop;");
> >> +            if (!mcast_sw_info->flood_unregistered) {
> >> +                /* Forward unregistered IP multicast to mrouter (if any). */
> >> +                if (mcast_sw_info->flood_relay) {
> >> +                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
> >> +                                  "ip4 && ip4.mcast",
> >> +                                  "outport = \""MC_MROUTER_FLOOD"\";
> output;");
> >> +                } else {
> >> +                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
> >> +                                  "ip4 && ip4.mcast", "drop;");
> >> +                }
> >>              }
> >>          }
> >>
> >> @@ -5314,18 +5443,26 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
> >>              continue;
> >>          }
> >>
> >> -        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
> >> +        struct mcast_switch_info *mcast_sw_info =
> >> +            &igmp_group->datapath->mcast_info.sw;
> >>
> >> -        if (mcast_info->active_flows >= mcast_info->table_size) {
> >> +        if (mcast_sw_info->active_flows >= mcast_sw_info->table_size) {
> >>              continue;
> >>          }
> >> -        mcast_info->active_flows++;
> >> +        mcast_sw_info->active_flows++;
> >>
> >>          ds_clear(&match);
> >>          ds_clear(&actions);
> >>
> >>          ds_put_format(&match, "eth.mcast && ip4 && ip4.dst == %s ",
> >>                        igmp_group->mcgroup.name);
> >> +        /* Also flood traffic to all multicast routers with relay enabled. */
> >> +        if (mcast_sw_info->flood_relay) {
> >> +            ds_put_cstr(&actions,
> >> +                        "clone { "
> >> +                            "outport = \""MC_MROUTER_FLOOD "\";
> output; "
> >> +                        "};");
> >> +        }
> >>          ds_put_format(&actions, "outport = \"%s\"; output; ",
> >>                        igmp_group->mcgroup.name);
> >>
> >> @@ -6205,7 +6342,7 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
> >>           * source or destination, and zero network source or destination
> >>           * (priority 100). */
> >>          ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 100,
> >> -                      "ip4.mcast || "
> >> +                      "ip4.src[28..31] == 0xe ||"
> >
> >
> > Does it make sense to add another predicate for source mcast addr ?
> > I am fine either way.
>
> Yes, I'll add "ip4.src_mcast" and send v4. I'm thinking of also
> renaming "ip4.mcast" to "ip4.dst_mcast" for uniformity. What do you
> think?
>
>
For backward compatibility reasons, I would suggest adding a new one -
ip4.dst_mcast - instead of renaming.

I am thinking about the impact when ovn-controller/ovn-northd are upgraded
to a new version.

If we upgrade ovn-controller first (which is the recommended order), then any
logical flow with "ip4.mcast" will be rejected. This could cause datapath
disruption until ovn-northd is upgraded.



> Thanks,
> Dumitru
>
> >
> >
> >>
> >>                        "ip4.src == 255.255.255.255 || "
> >>                        "ip4.src == 127.0.0.0/8 || "
> >>                        "ip4.dst == 127.0.0.0/8 || "
> >> @@ -6213,6 +6350,16 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
> >>                        "ip4.dst == 0.0.0.0/8",
> >>                        "drop;");
> >>
> >> +        /* Allow multicast if relay enabled (priority 95). */
> >> +        ds_clear(&actions);
> >> +        if (od->mcast_info.rtr.relay) {
> >> +            ds_put_cstr(&actions, "next;");
> >> +        } else {
> >> +            ds_put_cstr(&actions, "drop;");
> >> +        }
> >> +        ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 95,
> >> +                      "ip4.dst[28..31] == 0xe", ds_cstr(&actions));
> >
> >
> > Since ip4.mcast is a predicate to "ip4.dst[28..31] == 0xe", can you
> > please use ip4.mcast here ?
> >
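> > (Presumably that boils down to, as a sketch:
> >
> >     ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 95,
> >                   "ip4.mcast", ds_cstr(&actions));
> >
> > since the predicate expands to exactly that match.)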
> >
> >>
> >> +
> >>          /* ARP reply handling.  Use ARP replies to populate the logical
> >>           * router's ARP table. */
> >>          ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "arp.op ==
> 2",
> >> @@ -7483,6 +7630,27 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
> >>          }
> >>      }
> >>
> >> +    /* IP Multicast lookup. Here we set the output port, adjust TTL and
> >> +     * advance to next table (priority 500).
> >> +     */
> >> +    HMAP_FOR_EACH (od, key_node, datapaths) {
> >> +        if (!od->nbr || !od->mcast_info.rtr.relay) {
> >> +            continue;
> >> +        }
> >> +        struct ovn_igmp_group *igmp_group;
> >> +
> >> +        LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
> >> +            ds_clear(&match);
> >> +            ds_clear(&actions);
> >> +            ds_put_format(&match, "ip4 && ip4.dst == %s ",
> >> +                          igmp_group->mcgroup.name);
> >> +            ds_put_format(&actions, "outport = \"%s\"; ip.ttl--;
> next;",
> >> +                          igmp_group->mcgroup.name);
> >> +            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 500,
> >> +                          ds_cstr(&match), ds_cstr(&actions));
> >> +        }
> >> +    }
> >> +
> >>      /* Logical router ingress table 8: Policy.
> >>       *
> >>       * A packet that arrives at this table is an IP packet that should be
> >> @@ -7513,10 +7681,24 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
> >>
> >>      /* Local router ingress table 9: ARP Resolution.
> >>       *
> >> -     * Any packet that reaches this table is an IP packet whose next-hop IP
> >> -     * address is in reg0. (ip4.dst is the final destination.) This table
> >> -     * resolves the IP address in reg0 into an output port in outport and an
> >> -     * Ethernet address in eth.dst. */
> >> +     * Multicast packets already have the outport set so just advance to next
> >> +     * table (priority 500). */
> >> +    HMAP_FOR_EACH (od, key_node, datapaths) {
> >> +        if (!od->nbr) {
> >> +            continue;
> >> +        }
> >> +
> >> +        ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 500,
> >> +                      "ip4.mcast", "next;");
> >> +    }
> >> +
> >> +    /* Local router ingress table 9: ARP Resolution.
> >> +     *
> >> +     * Any unicast packet that reaches this table is an IP packet whose
> >> +     * next-hop IP address is in reg0. (ip4.dst is the final destination.)
> >> +     * This table resolves the IP address in reg0 into an output port in
> >> +     * outport and an Ethernet address in eth.dst.
> >> +     */
> >>      HMAP_FOR_EACH (op, key_node, ports) {
> >>          if (op->nbsp && !lsp_is_enabled(op->nbsp)) {
> >>              continue;
> >> @@ -7998,9 +8180,13 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
> >>          ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 0, "1",
> "output;");
> >>      }
> >>
> >> -    /* Logical router egress table 1: Delivery (priority 100).
> >> +    /* Logical router egress table 1: Delivery (priority 100-110).
> >>       *
> >> -     * Priority 100 rules deliver packets to enabled logical ports. */
> >> +     * Priority 100 rules deliver packets to enabled logical ports.
> >> +     * Priority 110 rules match multicast packets and update the source
> >> +     * mac before delivering to enabled logical ports. IP multicast traffic
> >> +     * bypasses S_ROUTER_IN_IP_ROUTING route lookups.
> >> +     */
> >>      HMAP_FOR_EACH (op, key_node, ports) {
> >>          if (!op->nbrp) {
> >>              continue;
> >> @@ -8020,6 +8206,19 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
> >>              continue;
> >>          }
> >>
> >> +        /* If multicast relay is enabled then also adjust source mac for IP
> >> +         * multicast traffic.
> >> +         */
> >> +        if (op->od->mcast_info.rtr.relay) {
> >> +            ds_clear(&match);
> >> +            ds_clear(&actions);
> >> +            ds_put_format(&match, "ip4.mcast && outport == %s",
> op->json_key);
> >> +            ds_put_format(&actions, "eth.src = %s; output;",
> >> +                          op->lrp_networks.ea_s);
> >> +            ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 110,
> >> +                        ds_cstr(&match), ds_cstr(&actions));
> >> +        }
> >> +
> >>          ds_clear(&match);
> >>          ds_put_format(&match, "outport == %s", op->json_key);
> >>          ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 100,
> >> @@ -8570,7 +8769,7 @@ build_ip_mcast(struct northd_context *ctx, struct hmap *datapaths)
> >>          if (!ip_mcast) {
> >>              ip_mcast = sbrec_ip_multicast_insert(ctx->ovnsb_txn);
> >>          }
> >> -        store_mcast_info_for_datapath(ip_mcast, od);
> >> +        store_mcast_info_for_switch_datapath(ip_mcast, od);
> >>      }
> >>
> >>      /* Delete southbound records without northbound matches. */
> >> @@ -8602,6 +8801,14 @@ build_mcast_groups(struct northd_context *ctx,
> >>
> >>          if (lsp_is_enabled(op->nbsp)) {
> >>              ovn_multicast_add(mcast_groups, &mc_flood, op);
> >> +
> >> +            /* If this port is connected to a multicast router then add it
> >> +             * to the MC_MROUTER_FLOOD group.
> >> +             */
> >> +            if (op->od->mcast_info.sw.flood_relay && op->peer &&
> >> +                    op->peer->od && op->peer->od->mcast_info.rtr.relay) {
> >> +                ovn_multicast_add(mcast_groups, &mc_mrouter_flood, op);
> >> +            }
> >>          }
> >>      }
> >>
> >> @@ -8624,10 +8831,61 @@ build_mcast_groups(struct northd_context *ctx,
> >>              continue;
> >>          }
> >>
> >> +        struct in6_addr group_address;
> >> +        if (!ovn_igmp_group_get_address(sb_igmp, &group_address)) {
> >> +            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
> >> +            VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
> >> +                         sb_igmp->address);
> >> +            continue;
> >> +        }
> >> +
> >>          /* Add the IGMP group entry. Will also try to allocate an ID for it
> >>           * if the multicast group already exists.
> >>           */
> >> -        ovn_igmp_group_add(ctx, igmp_groups, od, sb_igmp);
> >> +        struct ovn_igmp_group *igmp_group =
> >> +            ovn_igmp_group_add(ctx, igmp_groups, od, &group_address,
> >> +                               sb_igmp->address);
> >> +
> >> +        /* Extract the IGMP group ports from the SB entry and store them
> >> +         * in the IGMP group.
> >> +         */
> >> +        size_t n_igmp_ports;
> >> +        struct ovn_port **igmp_ports =
> >> +            ovn_igmp_group_get_ports(sb_igmp, &n_igmp_ports, ports);
> >> +        ovn_igmp_group_add_entry(igmp_group, igmp_ports, n_igmp_ports);
> >> +    }
> >> +
> >> +    /* Build IGMP groups for multicast routers with relay enabled. The router
> >> +     * IGMP groups are based on the groups learnt by their multicast enabled
> >> +     * peers.
> >> +     */
> >> +    struct ovn_datapath *od;
> >> +    HMAP_FOR_EACH (od, key_node, datapaths) {
> >> +
> >> +        if (ovs_list_is_empty(&od->mcast_info.groups)) {
> >> +            continue;
> >> +        }
> >> +
> >> +        for (size_t i = 0; i < od->n_router_ports; i++) {
> >> +            struct ovn_port *router_port = od->router_ports[i]->peer;
> >> +
> >> +            if (!router_port || !router_port->od ||
> >> +                    !router_port->od->mcast_info.rtr.relay) {
> >> +                continue;
> >> +            }
> >> +
> >> +            struct ovn_igmp_group *igmp_group;
> >> +            LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
> >> +                struct ovn_igmp_group *igmp_group_rtr =
> >> +                    ovn_igmp_group_add(ctx, igmp_groups, router_port->od,
> >> +                                       &igmp_group->address,
> >> +                                       igmp_group->mcgroup.name);
> >> +                struct ovn_port **router_igmp_ports =
> >> +                    xmalloc(sizeof *router_igmp_ports);
> >> +                router_igmp_ports[0] = router_port;
> >> +                ovn_igmp_group_add_entry(igmp_group_rtr, router_igmp_ports, 1);
> >> +            }
> >> +        }
> >>      }
> >>
> >>      /* Walk the aggregated IGMP groups and allocate IDs for new entries.
> >> @@ -8635,21 +8893,17 @@ build_mcast_groups(struct northd_context *ctx,
> >>       */
> >>      struct ovn_igmp_group *igmp_group, *igmp_group_next;
> >>      HMAP_FOR_EACH_SAFE (igmp_group, igmp_group_next, hmap_node, igmp_groups) {
> >> -        if (igmp_group->mcgroup.key == 0) {
> >> -            struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
> >> -            igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
> >> -        }
> >>
> >> -        /* If we ran out of keys just destroy the entry. */
> >> -        if (igmp_group->mcgroup.key == 0) {
> >> +        if (!ovn_igmp_group_allocate_id(igmp_group)) {
> >> +            /* If we ran out of keys just destroy the entry. */
> >>              ovn_igmp_group_destroy(igmp_groups, igmp_group);
> >>              continue;
> >>          }
> >>
> >> -        /* Aggregate the ports from all SB entries corresponding to this
> >> +        /* Aggregate the ports from all entries corresponding to this
> >>           * group.
> >>           */
> >> -        ovn_igmp_group_aggregate_ports(igmp_group, ports, mcast_groups);
> >> +        ovn_igmp_group_aggregate_ports(igmp_group, mcast_groups);
> >>      }
> >>  }
> >>
> >> diff --git a/ovn-nb.xml b/ovn-nb.xml
> >> index f5f10a5..db8cc20 100644
> >> --- a/ovn-nb.xml
> >> +++ b/ovn-nb.xml
> >> @@ -1526,6 +1526,12 @@
> >>            address.
> >>          </p>
> >>        </column>
> >> +      <column name="options" key="mcast_relay" type'{"type":
> "boolean"}'>
> >> +        <p>
> >> +          Enables/disables IP multicast relay between logical switches
> >> +          connected to the logical router. Default: False.
> >> +        </p>
> >> +      </column>
> >>      </group>
> >>
> >>      <group title="Common Columns">
> >> diff --git a/tests/ovn.at b/tests/ovn.at
> >> index 71eb390..52c044c 100644
> >> --- a/tests/ovn.at
> >> +++ b/tests/ovn.at
> >> @@ -14721,12 +14721,12 @@ AT_CHECK([ovn-sbctl get controller_event $uuid seq_num], [0], [dnl
> >>  OVN_CLEANUP([hv1], [hv2])
> >>  AT_CLEANUP
> >>
> >> -AT_SETUP([ovn -- IGMP snoop/querier])
> >> +AT_SETUP([ovn -- IGMP snoop/querier/relay])
> >>  AT_SKIP_IF([test $HAVE_PYTHON = no])
> >>  ovn_start
> >>
> >>  # Logical network:
> >> -# Two independent logical switches (sw1 and sw2).
> >> +# Three logical switches (sw1-sw3) connected to a logical router (rtr).
> >>  # sw1:
> >>  #   - subnet 10.0.0.0/8
> >>  #   - 2 ports bound on hv1 (sw1-p11, sw1-p12)
> >> @@ -14736,6 +14736,10 @@ ovn_start
> >>  #   - 1 port bound on hv1 (sw2-p1)
> >>  #   - 1 port bound on hv2 (sw2-p2)
> >>  #   - IGMP Querier from 20.0.0.254
> >> +# sw3:
> >> +#   - subnet 30.0.0.0/8
> >> +#   - 1 port bound on hv1 (sw3-p1)
> >> +#   - 1 port bound on hv2 (sw3-p2)
> >>
> >>  reset_pcap_file() {
> >>      local iface=$1
> >> @@ -14812,29 +14816,47 @@ store_igmp_v3_query() {
> >>  }
> >>
> >>  #
> >> -# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN
> >> -#    IP_PROTO DATA OUTFILE
> >> +# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
> >> +#    IP_CHKSUM IP_PROTO DATA
> >>  #
> >>  # This shell function causes an IP multicast packet to be received on INPORT
> >>  # of HV.
> >>  # The hexdump of the packet is stored in OUTFILE.
> >>  #
> >>  send_ip_multicast_pkt() {
> >> -    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ip_src=$5 ip_dst=$6
> >> -    local ip_len=$7 ip_chksum=$8 proto=$9 data=${10} outfile=${11}
> >> -
> >> -    local ip_ttl=20
> >> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4
> >> +    local ip_src=$5 ip_dst=$6 ip_len=$7 ip_ttl=$8 ip_chksum=$9 proto=${10}
> >> +    local data=${11}
> >>
> >>      local eth=${eth_dst}${eth_src}0800
> >>      local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
> >>      local packet=${eth}${ip}${data}
> >>
> >>      as $hv ovs-appctl netdev-dummy/receive ${inport} ${packet}
> >> +}
> >> +
> >> +#
> >> +# store_ip_multicast_pkt ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
> >> +#    IP_CHKSUM IP_PROTO DATA OUTFILE
> >> +#
> >> +# This shell function builds an IP multicast packet and stores the hexdump of
> >> +# the packet in OUTFILE.
> >> +#
> >> +store_ip_multicast_pkt() {
> >> +    local eth_src=$1 eth_dst=$2
> >> +    local ip_src=$3 ip_dst=$4 ip_len=$5 ip_ttl=$6 ip_chksum=$7 proto=$8
> >> +    local data=$9 outfile=${10}
> >> +
> >> +    local eth=${eth_dst}${eth_src}0800
> >> +    local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
> >> +    local packet=${eth}${ip}${data}
> >> +
> >>      echo ${packet} >> ${outfile}
> >>  }
> >>
> >>  ovn-nbctl ls-add sw1
> >>  ovn-nbctl ls-add sw2
> >> +ovn-nbctl ls-add sw3
> >>
> >>  ovn-nbctl lsp-add sw1 sw1-p11
> >>  ovn-nbctl lsp-add sw1 sw1-p12
> >> @@ -14842,6 +14864,26 @@ ovn-nbctl lsp-add sw1 sw1-p21
> >>  ovn-nbctl lsp-add sw1 sw1-p22
> >>  ovn-nbctl lsp-add sw2 sw2-p1
> >>  ovn-nbctl lsp-add sw2 sw2-p2
> >> +ovn-nbctl lsp-add sw3 sw3-p1
> >> +ovn-nbctl lsp-add sw3 sw3-p2
> >> +
> >> +ovn-nbctl lr-add rtr
> >> +ovn-nbctl lrp-add rtr rtr-sw1 00:00:00:00:01:00 10.0.0.254/24
> >> +ovn-nbctl lrp-add rtr rtr-sw2 00:00:00:00:02:00 20.0.0.254/24
> >> +ovn-nbctl lrp-add rtr rtr-sw3 00:00:00:00:03:00 30.0.0.254/24
> >> +
> >> +ovn-nbctl lsp-add sw1 sw1-rtr                      \
> >> +    -- lsp-set-type sw1-rtr router                 \
> >> +    -- lsp-set-addresses sw1-rtr 00:00:00:00:01:00 \
> >> +    -- lsp-set-options sw1-rtr router-port=rtr-sw1
> >> +ovn-nbctl lsp-add sw2 sw2-rtr                      \
> >> +    -- lsp-set-type sw2-rtr router                 \
> >> +    -- lsp-set-addresses sw2-rtr 00:00:00:00:02:00 \
> >> +    -- lsp-set-options sw2-rtr router-port=rtr-sw2
> >> +ovn-nbctl lsp-add sw3 sw3-rtr                      \
> >> +    -- lsp-set-type sw3-rtr router                 \
> >> +    -- lsp-set-addresses sw3-rtr 00:00:00:00:03:00 \
> >> +    -- lsp-set-options sw3-rtr router-port=rtr-sw3
> >>
> >>  net_add n1
> >>  sim_add hv1
> >> @@ -14863,6 +14905,11 @@ ovs-vsctl -- add-port br-int hv1-vif3 -- \
> >>      options:tx_pcap=hv1/vif3-tx.pcap \
> >>      options:rxq_pcap=hv1/vif3-rx.pcap \
> >>      ofport-request=1
> >> +ovs-vsctl -- add-port br-int hv1-vif4 -- \
> >> +    set interface hv1-vif4 external-ids:iface-id=sw3-p1 \
> >> +    options:tx_pcap=hv1/vif4-tx.pcap \
> >> +    options:rxq_pcap=hv1/vif4-rx.pcap \
> >> +    ofport-request=1
> >>
> >>  sim_add hv2
> >>  as hv2
> >> @@ -14883,12 +14930,18 @@ ovs-vsctl -- add-port br-int hv2-vif3 -- \
> >>      options:tx_pcap=hv2/vif3-tx.pcap \
> >>      options:rxq_pcap=hv2/vif3-rx.pcap \
> >>      ofport-request=1
> >> +ovs-vsctl -- add-port br-int hv2-vif4 -- \
> >> +    set interface hv2-vif4 external-ids:iface-id=sw3-p2 \
> >> +    options:tx_pcap=hv2/vif4-tx.pcap \
> >> +    options:rxq_pcap=hv2/vif4-rx.pcap \
> >> +    ofport-request=1
> >>
> >>  OVN_POPULATE_ARP
> >>
> >>  # Enable IGMP snooping on sw1.
> >> -ovn-nbctl set Logical_Switch sw1 other_config:mcast_querier="false"
> >> -ovn-nbctl set Logical_Switch sw1 other_config:mcast_snoop="true"
> >> +ovn-nbctl set Logical_Switch sw1       \
> >> +    other_config:mcast_querier="false" \
> >> +    other_config:mcast_snoop="true"
> >>
> >>  # No IGMP query should be generated by sw1 (mcast_querier="false").
> >>  truncate -s 0 expected
> >> @@ -14921,9 +14974,12 @@ truncate -s 0 expected
> >>  truncate -s 0 expected_empty
> >>  send_ip_multicast_pkt hv1-vif2 hv1 \
> >>      000000000001 01005e000144 \
> >> -    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
> >> -    e518e518000a3b3a0000 \
> >> -    expected
> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> >> +    e518e518000a3b3a0000
> >> +store_ip_multicast_pkt \
> >> +    000000000001 01005e000144 \
> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> >> +    e518e518000a3b3a0000 expected
> >>
> >>  OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected])
> >>  OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
> >> @@ -14944,17 +15000,19 @@ OVS_WAIT_UNTIL([
> >>      test "${total_entries}" = "1"
> >>  ])
> >>
> >> -# Send traffic traffic and make sure it gets forwarded only on the port that
> >> -# joined.
> >> +# Send traffic and make sure it gets forwarded only on the port that joined.
> >>  as hv1 reset_pcap_file hv1-vif1 hv1/vif1
> >>  as hv2 reset_pcap_file hv2-vif1 hv2/vif1
> >>  truncate -s 0 expected
> >>  truncate -s 0 expected_empty
> >>  send_ip_multicast_pkt hv1-vif2 hv1 \
> >>      000000000001 01005e000144 \
> >> -    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
> >> -    e518e518000a3b3a0000 \
> >> -    expected
> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> >> +    e518e518000a3b3a0000
> >> +store_ip_multicast_pkt \
> >> +    000000000001 01005e000144 \
> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> >> +    e518e518000a3b3a0000 expected
> >>
> >>  OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
> >>  OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
> >> @@ -14988,6 +15046,111 @@ sleep 1
> >>  OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected])
> >>  OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
> >>
> >> +# Disable IGMP querier on sw2.
> >> +ovn-nbctl set Logical_Switch sw2 \
> >> +    other_config:mcast_querier="false"
> >> +
> >> +# Enable IGMP snooping on sw3.
> >> +ovn-nbctl set Logical_Switch sw3       \
> >> +    other_config:mcast_querier="false" \
> >> +    other_config:mcast_snoop="true"
> >> +
> >> +# Send traffic from sw3 and make sure rtr doesn't relay it.
> >> +truncate -s 0 expected_empty
> >> +
> >> +as hv1 reset_pcap_file hv1-vif1 hv1/vif1
> >> +as hv1 reset_pcap_file hv1-vif2 hv1/vif2
> >> +as hv1 reset_pcap_file hv1-vif3 hv1/vif3
> >> +as hv1 reset_pcap_file hv1-vif4 hv1/vif4
> >> +as hv2 reset_pcap_file hv2-vif1 hv2/vif1
> >> +as hv2 reset_pcap_file hv2-vif2 hv2/vif2
> >> +as hv2 reset_pcap_file hv2-vif3 hv2/vif3
> >> +as hv2 reset_pcap_file hv2-vif4 hv2/vif4
> >> +
> >> +send_ip_multicast_pkt hv2-vif4 hv2 \
> >> +    000000000001 01005e000144 \
> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> >> +    e518e518000a3b3a0000
> >> +
> >> +# Sleep a bit to make sure no traffic is received and then check.
> >> +sleep 1
> >> +OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
> >> +OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_empty])
> >> +OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_empty])
> >> +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
> >> +OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
> >> +OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
> >> +OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
> >> +OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
> >> +
> >> +# Enable IGMP relay on rtr
> >> +ovn-nbctl set logical_router rtr \
> >> +    options:mcast_relay="true"
> >> +
> >> +# Inject IGMP Join for 239.0.1.68 on sw1-p11.
> >> +send_igmp_v3_report hv1-vif1 hv1 \
> >> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
> >> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
> >> +    /dev/null
> >> +# Inject IGMP Join for 239.0.1.68 on sw2-p2.
> >> +send_igmp_v3_report hv2-vif3 hv2 \
> >> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
> >> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
> >> +    /dev/null
> >> +# Inject IGMP Join for 239.0.1.68 on sw3-p1.
> >> +send_igmp_v3_report hv1-vif4 hv1 \
> >> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
> >> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
> >> +    /dev/null
> >> +
> >> +# Check that the IGMP Group is learned by all switches.
> >> +OVS_WAIT_UNTIL([
> >> +    total_entries=`ovn-sbctl find IGMP_Group | grep "239.0.1.68" | wc -l`
> >> +    test "${total_entries}" = "3"
> >> +])
> >> +
> >> +# Send traffic from sw3 and make sure it is relayed by rtr to the switches
> >> +# and ports that joined.
> >> +truncate -s 0 expected_routed_sw1
> >> +truncate -s 0 expected_routed_sw2
> >> +truncate -s 0 expected_switched
> >> +truncate -s 0 expected_empty
> >> +
> >> +as hv1 reset_pcap_file hv1-vif1 hv1/vif1
> >> +as hv1 reset_pcap_file hv1-vif2 hv1/vif2
> >> +as hv1 reset_pcap_file hv1-vif3 hv1/vif3
> >> +as hv1 reset_pcap_file hv1-vif4 hv1/vif4
> >> +as hv2 reset_pcap_file hv2-vif1 hv2/vif1
> >> +as hv2 reset_pcap_file hv2-vif2 hv2/vif2
> >> +as hv2 reset_pcap_file hv2-vif3 hv2/vif3
> >> +as hv2 reset_pcap_file hv2-vif4 hv2/vif4
> >> +
> >> +send_ip_multicast_pkt hv2-vif4 hv2 \
> >> +    000000000001 01005e000144 \
> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> >> +    e518e518000a3b3a0000
> >> +store_ip_multicast_pkt \
> >> +    000000000100 01005e000144 \
> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
> >> +    e518e518000a3b3a0000 expected_routed_sw1
> >> +store_ip_multicast_pkt \
> >> +    000000000200 01005e000144 \
> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
> >> +    e518e518000a3b3a0000 expected_routed_sw2
> >> +store_ip_multicast_pkt \
> >> +    000000000001 01005e000144 \
> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
> >> +    e518e518000a3b3a0000 expected_switched
> >> +
> >> +OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_routed_sw1])
> >> +OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_routed_sw2])
> >> +OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_switched])
> >> +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
> >> +OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
> >> +OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
> >> +OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
> >> +OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
> >> +
> >>  OVN_CLEANUP([hv1], [hv2])
> >>  AT_CLEANUP
> >>
> >> --
> >> 1.8.3.1
> >>
> >> _______________________________________________
> >> dev mailing list
> >> dev@openvswitch.org
> >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>
Dumitru Ceara Aug. 16, 2019, 12:15 p.m. UTC | #4
On Fri, Aug 16, 2019 at 2:09 PM Numan Siddique <nusiddiq@redhat.com> wrote:
>
>
>
> On Fri, Aug 16, 2019 at 5:26 PM Dumitru Ceara <dceara@redhat.com> wrote:
>>
>> On Fri, Aug 16, 2019 at 12:23 PM Numan Siddique <nusiddiq@redhat.com> wrote:
>> >
>> >
>> >
>> > On Mon, Aug 12, 2019 at 5:32 PM Dumitru Ceara <dceara@redhat.com> wrote:
>> >>
>> >> Add a new configuration option 'mcast_relay' to the Logical_Router:options
>> >> in the OVN Northbound database.
>> >>
>> >> If a router is configured with 'mcast_relay' enabled then ovn-northd
>> >> will install Logical_Flows to allow IP multicast traffic to be routed
>> >> between Logical_Switches. The logical router will aggregate all IGMP
>> >> groups from attached logical switches and modify the routing pipeline in
>> >> the following way:
>> >> - Table S_ROUTER_IN_IP_INPUT: add flow allowing IP multicast traffic
>> >>   if mcast_relay is enabled on the datapath.
>> >> - Table S_ROUTER_IN_IP_ROUTING: add flow matching the group address,
>> >>   update TTL and set outport="<Multicast_Group> associated with the
>> >>   IGMP group". Continue to next table.
>> >> - Table S_ROUTER_IN_ARP_RESOLVE: bypass ARP resolve for IP multicast
>> >>   traffic and continue to next table.
>> >> - Table S_ROUTER_OUT_DELIVERY: add flow matching IP multicast traffic
>> >>   and set ETH.SRC to the MAC address of the logical port on which
>> >>   traffic is forwarded.
>> >>
>> >> Signed-off-by: Dumitru Ceara <dceara@redhat.com>
>> >> Acked-by: Mark Michelson <mmichels@redhat.com>
>> >
>> >
>> > Hi Dumitru,
>> >
>> > Just a minor comment. Please see below. Can you please update the patch ?
>> > LGTM otherwise.
>>
>> Hi Numan,
>>
>> Thanks for the review, I'll send a v4 but I have a follow up to your
>> comment below.
>>
>> Thanks,
>> Dumitru
>>
>> >
>> > Thanks
>> > Numan
>> >
>> >>
>> >>
>> >> ---
>> >> v3:
>> >> - Address Mark's comment and move setting of the outport in the IP
>> >>   Routing stage.
>> >> - Update commit message.
>> >> - Fix some typos.
>> >> v2:
>> >> - Optimize flooding to multicast router ports.
>> >> - Fix check for source IP multicast in router pipeline.
>> >> - Use an enum for OVN_MCAST_*_KEY definitions to avoid hard to debug
>> >>   errors due to typos when adding new OVN_MCAST_*_KEY values.
>> >> - Fix ovn-northd man page for IGMP.
>> >> ---
>> >>  NEWS                    |   1 +
>> >>  lib/mcast-group-index.h |  13 +-
>> >>  northd/ovn-northd.8.xml |  79 +++++++-
>> >>  northd/ovn-northd.c     | 504 ++++++++++++++++++++++++++++++++++++------------
>> >>  ovn-nb.xml              |   6 +
>> >>  tests/ovn.at            | 199 +++++++++++++++++--
>> >>  6 files changed, 651 insertions(+), 151 deletions(-)
>> >>
>> >> diff --git a/NEWS b/NEWS
>> >> index f476984..73045d6 100644
>> >> --- a/NEWS
>> >> +++ b/NEWS
>> >> @@ -39,6 +39,7 @@ Post-v2.11.0
>> >>         logical groups which results in tunnels only been formed between
>> >>         members of the same transport zone(s).
>> >>       * Support for new logical switch port type - 'virtual'.
>> >> +     * Support for IGMP Snooping/Querier and Relay.
>> >>     - New QoS type "linux-netem" on Linux.
>> >>     - Added support for TLS Server Name Indication (SNI).
>> >>
>> >> diff --git a/lib/mcast-group-index.h b/lib/mcast-group-index.h
>> >> index 15a1592..cb49ad7 100644
>> >> --- a/lib/mcast-group-index.h
>> >> +++ b/lib/mcast-group-index.h
>> >> @@ -20,8 +20,17 @@ struct ovsdb_idl;
>> >>
>> >>  struct sbrec_datapath_binding;
>> >>
>> >> -#define OVN_MCAST_FLOOD_TUNNEL_KEY   65535
>> >> -#define OVN_MCAST_UNKNOWN_TUNNEL_KEY (OVN_MCAST_FLOOD_TUNNEL_KEY - 1)
>> >> +#define OVN_MIN_MULTICAST 32768
>> >> +#define OVN_MAX_MULTICAST 65535
>> >> +
>> >> +enum ovn_mcast_tunnel_keys {
>> >> +
>> >> +    OVN_MCAST_FLOOD_TUNNEL_KEY = OVN_MIN_MULTICAST,
>> >> +    OVN_MCAST_UNKNOWN_TUNNEL_KEY,
>> >> +    OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY,
>> >> +    OVN_MIN_IP_MULTICAST,
>> >> +    OVN_MAX_IP_MULTICAST = OVN_MAX_MULTICAST,
>> >> +};
>> >>
>> >>  struct ovsdb_idl_index *mcast_group_index_create(struct ovsdb_idl *);
>> >>  const struct sbrec_multicast_group *
>> >> diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
>> >> index 6d2fbe3..d45bb15 100644
>> >> --- a/northd/ovn-northd.8.xml
>> >> +++ b/northd/ovn-northd.8.xml
>> >> @@ -947,10 +947,40 @@ output;
>> >>
>> >>      <ul>
>> >>        <li>
>> >> -        A priority-100 flow that outputs all packets with an Ethernet broadcast
>> >> +        A priority-100 flow that punts all IGMP packets to
>> >> +        <code>ovn-controller</code> if IGMP snooping is enabled on the
>> >> +        logical switch.
>> >> +      </li>
>> >> +
>> >> +      <li>
>> >> +        Priority-90 flows that forward registered IP multicast traffic to
>> >> +        their corresponding multicast group, which <code>ovn-northd</code>
>> >> +        creates based on learnt <ref table="IGMP_Group" db="OVN_Southbound"/>
>> >> +        entries.  The flows also forward packets to the
>> >> +        <code>MC_MROUTER_FLOOD</code> multicast group, which
>> >> +        <code>ovn-northd</code> populates with all the logical ports that
>> >> +        are connected to logical routers with
>> >> +        <ref column="options" table="Logical_Router"/>:mcast_relay='true'.
>> >> +      </li>
>> >> +
>> >> +      <li>
>> >> +        A priority-85 flow that forwards all IP multicast traffic destined to
>> >> +        224.0.0.X to the <code>MC_FLOOD</code> multicast group, which
>> >> +        <code>ovn-northd</code> populates with all enabled logical ports.
>> >> +      </li>
>> >> +
>> >> +      <li>
>> >> +        A priority-80 flow that forwards all unregistered IP multicast traffic
>> >> +        to the <code>MC_MROUTER_FLOOD</code> multicast group, if any.
>> >> +        Otherwise the flow drops all unregistered IP multicast packets.  This
>> >> +        flow is added only if <ref column="other_config"
>> >> +        table="Logical_Switch"/>:mcast_flood_unregistered='false'.
>> >> +      </li>
>> >> +
>> >> +      <li>
>> >> +        A priority-70 flow that outputs all packets with an Ethernet broadcast
>> >>          or multicast <code>eth.dst</code> to the <code>MC_FLOOD</code>
>> >> -        multicast group, which <code>ovn-northd</code> populates with all
>> >> -        enabled logical ports.
>> >> +        multicast group.
>> >>        </li>
>> >>
>> >>        <li>
>> >> @@ -1228,6 +1258,14 @@ output;
>> >>
>> >>        <li>
>> >>          <p>
>> >> +          A priority-95 flow allows IP multicast traffic if
>> >> +          <ref column="options" table="Logical_Router"/>:mcast_relay='true',
>> >> +          otherwise drops it.
>> >> +        </p>
>> >> +      </li>
>> >> +
>> >> +      <li>
>> >> +        <p>
>> >>            ICMP echo reply.  These flows reply to ICMP echo requests received
>> >>            for the router's IP address.  Let <var>A</var> be an IP address
>> >>            owned by a router port.  Then, for each <var>A</var> that is
>> >> @@ -1941,6 +1979,16 @@ output;
>> >>      <ul>
>> >>        <li>
>> >>          <p>
>> >> +          Priority-500 flows that match IP multicast traffic destined to
>> >> +          groups registered on any of the attached switches and set
>> >> +          <code>outport</code> to the associated multicast group that will
>> >> +          eventually flood the traffic to all interested attached logical
>> >> +          switches.  The flows also decrement TTL.
>> >> +        </p>
>> >> +      </li>
>> >> +
>> >> +      <li>
>> >> +        <p>
>> >>            For distributed logical routers where one of the logical router
>> >>            ports specifies a <code>redirect-chassis</code>, a priority-400
>> >>            logical flow for each ip source/destination couple that matches the
>> >> @@ -2074,6 +2122,15 @@ next;
>> >>      <ul>
>> >>        <li>
>> >>          <p>
>> >> +          A priority-500 flow that matches IP multicast traffic that was
>> >> +          allowed in the routing pipeline.  For this kind of traffic the
>> >> +          <code>outport</code> was already set, so the flow just advances to
>> >> +          the next table.
>> >> +        </p>
>> >> +      </li>
>> >> +
>> >> +      <li>
>> >> +        <p>
>> >>            For distributed logical routers where one of the logical router
>> >>            ports specifies a <code>redirect-chassis</code>, a priority-400
>> >>            logical flow with match <code>REGBIT_DISTRIBUTED_NAT == 1</code>
>> >> @@ -2641,9 +2698,19 @@ clone {
>> >>      <h3>Egress Table 3: Delivery</h3>
>> >>
>> >>      <p>
>> >> -      Packets that reach this table are ready for delivery.  It contains
>> >> -      priority-100 logical flows that match packets on each enabled logical
>> >> -      router port, with action <code>output;</code>.
>> >> +      Packets that reach this table are ready for delivery.  It contains:
>> >> +      <ul>
>> >> +        <li>
>> >> +          Priority-110 logical flows that match IP multicast packets on each
>> >> +          enabled logical router port and modify the Ethernet source address
>> >> +          of the packets to the Ethernet address of the port and then execute
>> >> +          action <code>output;</code>.
>> >> +        </li>
>> >> +        <li>
>> >> +          Priority-100 logical flows that match packets on each enabled
>> >> +          logical router port, with action <code>output;</code>.
>> >> +        </li>
>> >> +      </ul>
>> >>      </p>
>> >>
>> >>  </manpage>
>> >> diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
>> >> index e6953a4..9ee9230 100644
>> >> --- a/northd/ovn-northd.c
>> >> +++ b/northd/ovn-northd.c
>> >> @@ -433,32 +433,52 @@ struct ipam_info {
>> >>      bool mac_only;
>> >>  };
>> >>
>> >> -#define OVN_MIN_MULTICAST 32768
>> >> -#define OVN_MAX_MULTICAST OVN_MCAST_FLOOD_TUNNEL_KEY
>> >> -BUILD_ASSERT_DECL(OVN_MIN_MULTICAST < OVN_MAX_MULTICAST);
>> >> -
>> >> -#define OVN_MIN_IP_MULTICAST OVN_MIN_MULTICAST
>> >> -#define OVN_MAX_IP_MULTICAST (OVN_MCAST_UNKNOWN_TUNNEL_KEY - 1)
>> >> -BUILD_ASSERT_DECL(OVN_MAX_IP_MULTICAST >= OVN_MIN_MULTICAST);
>> >> -
>> >>  /*
>> >>   * Multicast snooping and querier per datapath configuration.
>> >>   */
>> >> +struct mcast_switch_info {
>> >> +
>> >> +    bool enabled;               /* True if snooping enabled. */
>> >> +    bool querier;               /* True if querier enabled. */
>> >> +    bool flood_unregistered;    /* True if unregistered multicast should be
>> >> +                                 * flooded.
>> >> +                                 */
>> >> +    bool flood_relay;           /* True if the switch is connected to a
>> >> +                                 * multicast router and unregistered multicast
>> >> +                                 * should be flooded to the mrouter. Only
>> >> +                                 * applicable if flood_unregistered == false.
>> >> +                                 */
>> >> +
>> >> +    int64_t table_size;         /* Max number of IP multicast groups. */
>> >> +    int64_t idle_timeout;       /* Timeout after which an idle group is
>> >> +                                 * flushed.
>> >> +                                 */
>> >> +    int64_t query_interval;     /* Interval between multicast queries. */
>> >> +    char *eth_src;              /* ETH src address of the multicast queries. */
>> >> +    char *ipv4_src;             /* IP src address of the multicast queries. */
>> >> +    int64_t query_max_response; /* Expected time after which reports should
>> >> +                                 * be received for queries that were sent out.
>> >> +                                 */
>> >> +
>> >> +    uint32_t active_flows;      /* Current number of active IP multicast
>> >> +                                 * flows.
>> >> +                                 */
>> >> +};
>> >> +
>> >> +struct mcast_router_info {
>> >> +    bool relay; /* True if the router should relay IP multicast. */
>> >> +};
>> >> +
>> >>  struct mcast_info {
>> >> -    bool enabled;
>> >> -    bool querier;
>> >> -    bool flood_unregistered;
>> >> -
>> >> -    int64_t table_size;
>> >> -    int64_t idle_timeout;
>> >> -    int64_t query_interval;
>> >> -    char *eth_src;
>> >> -    char *ipv4_src;
>> >> -    int64_t  query_max_response;
>> >> -
>> >> -    struct hmap group_tnlids;
>> >> -    uint32_t group_tnlid_hint;
>> >> -    uint32_t active_flows;
>> >> +
>> >> +    struct hmap group_tnlids;  /* Group tunnel IDs in use on this DP. */
>> >> +    uint32_t group_tnlid_hint; /* Hint for allocating next group tunnel ID. */
>> >> +    struct ovs_list groups;    /* List of groups learnt on this DP. */
>> >> +
>> >> +    union {
>> >> +        struct mcast_switch_info sw;  /* Switch specific multicast info. */
>> >> +        struct mcast_router_info rtr; /* Router specific multicast info. */
>> >> +    };
>> >>  };
>> >>
>> >>  static uint32_t
>> >> @@ -559,6 +579,7 @@ ovn_datapath_create(struct hmap *datapaths, const struct uuid *key,
>> >>  }
>> >>
>> >>  static void ovn_ls_port_group_destroy(struct hmap *nb_pgs);
>> >> +static void destroy_mcast_info_for_datapath(struct ovn_datapath *od);
>> >>
>> >>  static void
>> >>  ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
>> >> @@ -572,12 +593,7 @@ ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
>> >>          bitmap_free(od->ipam_info.allocated_ipv4s);
>> >>          free(od->router_ports);
>> >>          ovn_ls_port_group_destroy(&od->nb_pgs);
>> >> -
>> >> -        if (od->nbs) {
>> >> -            free(od->mcast_info.eth_src);
>> >> -            free(od->mcast_info.ipv4_src);
>> >> -            destroy_tnlids(&od->mcast_info.group_tnlids);
>> >> -        }
>> >> +        destroy_mcast_info_for_datapath(od);
>> >>
>> >>          free(od);
>> >>      }
>> >> @@ -714,23 +730,28 @@ init_ipam_info_for_datapath(struct ovn_datapath *od)
>> >>  }
>> >>
>> >>  static void
>> >> -init_mcast_info_for_datapath(struct ovn_datapath *od)
>> >> +init_mcast_info_for_router_datapath(struct ovn_datapath *od)
>> >>  {
>> >> -    if (!od->nbs) {
>> >> -        return;
>> >> -    }
>> >> +    struct mcast_router_info *mcast_rtr_info = &od->mcast_info.rtr;
>> >>
>> >> -    struct mcast_info *mcast_info = &od->mcast_info;
>> >> +    mcast_rtr_info->relay = smap_get_bool(&od->nbr->options, "mcast_relay",
>> >> +                                          false);
>> >> +}
>> >>
>> >> -    mcast_info->enabled =
>> >> +static void
>> >> +init_mcast_info_for_switch_datapath(struct ovn_datapath *od)
>> >> +{
>> >> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>> >> +
>> >> +    mcast_sw_info->enabled =
>> >>          smap_get_bool(&od->nbs->other_config, "mcast_snoop", false);
>> >> -    mcast_info->querier =
>> >> +    mcast_sw_info->querier =
>> >>          smap_get_bool(&od->nbs->other_config, "mcast_querier", true);
>> >> -    mcast_info->flood_unregistered =
>> >> +    mcast_sw_info->flood_unregistered =
>> >>          smap_get_bool(&od->nbs->other_config, "mcast_flood_unregistered",
>> >>                        false);
>> >>
>> >> -    mcast_info->table_size =
>> >> +    mcast_sw_info->table_size =
>> >>          smap_get_ullong(&od->nbs->other_config, "mcast_table_size",
>> >>                          OVN_MCAST_DEFAULT_MAX_ENTRIES);
>> >>
>> >> @@ -742,54 +763,94 @@ init_mcast_info_for_datapath(struct ovn_datapath *od)
>> >>      } else if (idle_timeout > OVN_MCAST_MAX_IDLE_TIMEOUT_S) {
>> >>          idle_timeout = OVN_MCAST_MAX_IDLE_TIMEOUT_S;
>> >>      }
>> >> -    mcast_info->idle_timeout = idle_timeout;
>> >> +    mcast_sw_info->idle_timeout = idle_timeout;
>> >>
>> >>      uint32_t query_interval =
>> >>          smap_get_ullong(&od->nbs->other_config, "mcast_query_interval",
>> >> -                        mcast_info->idle_timeout / 2);
>> >> +                        mcast_sw_info->idle_timeout / 2);
>> >>      if (query_interval < OVN_MCAST_MIN_QUERY_INTERVAL_S) {
>> >>          query_interval = OVN_MCAST_MIN_QUERY_INTERVAL_S;
>> >>      } else if (query_interval > OVN_MCAST_MAX_QUERY_INTERVAL_S) {
>> >>          query_interval = OVN_MCAST_MAX_QUERY_INTERVAL_S;
>> >>      }
>> >> -    mcast_info->query_interval = query_interval;
>> >> +    mcast_sw_info->query_interval = query_interval;
>> >>
>> >> -    mcast_info->eth_src =
>> >> +    mcast_sw_info->eth_src =
>> >>          nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_eth_src"));
>> >> -    mcast_info->ipv4_src =
>> >> +    mcast_sw_info->ipv4_src =
>> >>          nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_ip4_src"));
>> >>
>> >> -    mcast_info->query_max_response =
>> >> +    mcast_sw_info->query_max_response =
>> >>          smap_get_ullong(&od->nbs->other_config, "mcast_query_max_response",
>> >>                          OVN_MCAST_DEFAULT_QUERY_MAX_RESPONSE_S);
>> >>
>> >> -    hmap_init(&mcast_info->group_tnlids);
>> >> -    mcast_info->group_tnlid_hint = OVN_MIN_IP_MULTICAST;
>> >> -    mcast_info->active_flows = 0;
>> >> +    mcast_sw_info->active_flows = 0;
>> >> +}
>> >> +
>> >> +static void
>> >> +init_mcast_info_for_datapath(struct ovn_datapath *od)
>> >> +{
>> >> +    if (!od->nbr && !od->nbs) {
>> >> +        return;
>> >> +    }
>> >> +
>> >> +    hmap_init(&od->mcast_info.group_tnlids);
>> >> +    od->mcast_info.group_tnlid_hint = OVN_MIN_IP_MULTICAST;
>> >> +    ovs_list_init(&od->mcast_info.groups);
>> >> +
>> >> +    if (od->nbs) {
>> >> +        init_mcast_info_for_switch_datapath(od);
>> >> +    } else {
>> >> +        init_mcast_info_for_router_datapath(od);
>> >> +    }
>> >> +}
>> >> +
>> >> +static void
>> >> +destroy_mcast_info_for_switch_datapath(struct ovn_datapath *od)
>> >> +{
>> >> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>> >> +
>> >> +    free(mcast_sw_info->eth_src);
>> >> +    free(mcast_sw_info->ipv4_src);
>> >> +}
>> >> +
>> >> +static void
>> >> +destroy_mcast_info_for_datapath(struct ovn_datapath *od)
>> >> +{
>> >> +    if (!od->nbr && !od->nbs) {
>> >> +        return;
>> >> +    }
>> >> +
>> >> +    if (od->nbs) {
>> >> +        destroy_mcast_info_for_switch_datapath(od);
>> >> +    }
>> >> +
>> >> +    destroy_tnlids(&od->mcast_info.group_tnlids);
>> >>  }
>> >>
>> >>  static void
>> >> -store_mcast_info_for_datapath(const struct sbrec_ip_multicast *sb,
>> >> -                              struct ovn_datapath *od)
>> >> +store_mcast_info_for_switch_datapath(const struct sbrec_ip_multicast *sb,
>> >> +                                     struct ovn_datapath *od)
>> >>  {
>> >> -    struct mcast_info *mcast_info = &od->mcast_info;
>> >> +    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>> >>
>> >>      sbrec_ip_multicast_set_datapath(sb, od->sb);
>> >> -    sbrec_ip_multicast_set_enabled(sb, &mcast_info->enabled, 1);
>> >> -    sbrec_ip_multicast_set_querier(sb, &mcast_info->querier, 1);
>> >> -    sbrec_ip_multicast_set_table_size(sb, &mcast_info->table_size, 1);
>> >> -    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_info->idle_timeout, 1);
>> >> +    sbrec_ip_multicast_set_enabled(sb, &mcast_sw_info->enabled, 1);
>> >> +    sbrec_ip_multicast_set_querier(sb, &mcast_sw_info->querier, 1);
>> >> +    sbrec_ip_multicast_set_table_size(sb, &mcast_sw_info->table_size, 1);
>> >> +    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_sw_info->idle_timeout, 1);
>> >>      sbrec_ip_multicast_set_query_interval(sb,
>> >> -                                          &mcast_info->query_interval, 1);
>> >> +                                          &mcast_sw_info->query_interval, 1);
>> >>      sbrec_ip_multicast_set_query_max_resp(sb,
>> >> -                                          &mcast_info->query_max_response, 1);
>> >> +                                          &mcast_sw_info->query_max_response,
>> >> +                                          1);
>> >>
>> >> -    if (mcast_info->eth_src) {
>> >> -        sbrec_ip_multicast_set_eth_src(sb, mcast_info->eth_src);
>> >> +    if (mcast_sw_info->eth_src) {
>> >> +        sbrec_ip_multicast_set_eth_src(sb, mcast_sw_info->eth_src);
>> >>      }
>> >>
>> >> -    if (mcast_info->ipv4_src) {
>> >> -        sbrec_ip_multicast_set_ip4_src(sb, mcast_info->ipv4_src);
>> >> +    if (mcast_sw_info->ipv4_src) {
>> >> +        sbrec_ip_multicast_set_ip4_src(sb, mcast_sw_info->ipv4_src);
>> >>      }
>> >>  }
>> >>
>> >> @@ -906,6 +967,7 @@ join_datapaths(struct northd_context *ctx, struct hmap *datapaths,
>> >>                                       NULL, nbr, NULL);
>> >>              ovs_list_push_back(nb_only, &od->list);
>> >>          }
>> >> +        init_mcast_info_for_datapath(od);
>> >>          ovs_list_push_back(lr_list, &od->lr_list);
>> >>      }
>> >>  }
>> >> @@ -1999,6 +2061,13 @@ join_logical_ports(struct northd_context *ctx,
>> >>                      break;
>> >>                  }
>> >>              }
>> >> +
>> >> +            /* If the router is multicast enabled then set relay on the switch
>> >> +             * datapath.
>> >> +             */
>> >> +            if (peer->od && peer->od->mcast_info.rtr.relay) {
>> >> +                op->od->mcast_info.sw.flood_relay = true;
>> >> +            }
>> >>          } else if (op->nbrp && op->nbrp->peer && !op->derived) {
>> >>              struct ovn_port *peer = ovn_port_find(ports, op->nbrp->peer);
>> >>              if (peer) {
>> >> @@ -2846,6 +2915,10 @@ struct multicast_group {
>> >>  static const struct multicast_group mc_flood =
>> >>      { MC_FLOOD, OVN_MCAST_FLOOD_TUNNEL_KEY };
>> >>
>> >> +#define MC_MROUTER_FLOOD "_MC_mrouter_flood"
>> >> +static const struct multicast_group mc_mrouter_flood =
>> >> +    { MC_MROUTER_FLOOD, OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY };
>> >> +
>> >>  #define MC_UNKNOWN "_MC_unknown"
>> >>  static const struct multicast_group mc_unknown =
>> >>      { MC_UNKNOWN, OVN_MCAST_UNKNOWN_TUNNEL_KEY };
>> >> @@ -2955,7 +3028,8 @@ ovn_multicast_update_sbrec(const struct ovn_multicast *mc,
>> >>   */
>> >>  struct ovn_igmp_group_entry {
>> >>      struct ovs_list list_node; /* Linkage in the list of entries. */
>> >> -    const struct sbrec_igmp_group *sb;
>> >> +    size_t n_ports;
>> >> +    struct ovn_port **ports;
>> >>  };
>> >>
>> >>  /*
>> >> @@ -2964,12 +3038,13 @@ struct ovn_igmp_group_entry {
>> >>   */
>> >>  struct ovn_igmp_group {
>> >>      struct hmap_node hmap_node; /* Index on 'datapath' and 'address'. */
>> >> +    struct ovs_list list_node;  /* Linkage in the per-dp igmp group list. */
>> >>
>> >>      struct ovn_datapath *datapath;
>> >>      struct in6_addr address; /* Multicast IPv6-mapped-IPv4 or IPv4 address. */
>> >>      struct multicast_group mcgroup;
>> >>
>> >> -    struct ovs_list sb_entries; /* List of SB entries for this group. */
>> >> +    struct ovs_list entries; /* List of SB entries for this group. */
>> >>  };
>> >>
>> >>  static uint32_t
>> >> @@ -2997,77 +3072,120 @@ ovn_igmp_group_find(struct hmap *igmp_groups,
>> >>      return NULL;
>> >>  }
>> >>
>> >> -static void
>> >> +static struct ovn_igmp_group *
>> >>  ovn_igmp_group_add(struct northd_context *ctx, struct hmap *igmp_groups,
>> >>                     struct ovn_datapath *datapath,
>> >> -                   const struct sbrec_igmp_group *sb_igmp_group)
>> >> +                   const struct in6_addr *address,
>> >> +                   const char *address_s)
>> >>  {
>> >> -    struct in6_addr group_address;
>> >> -    ovs_be32 ipv4;
>> >> -
>> >> -    if (ip_parse(sb_igmp_group->address, &ipv4)) {
>> >> -        group_address = in6_addr_mapped_ipv4(ipv4);
>> >> -    } else if (!ipv6_parse(sb_igmp_group->address, &group_address)) {
>> >> -        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>> >> -        VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
>> >> -                     sb_igmp_group->address);
>> >> -        return;
>> >> -    }
>> >> -
>> >>      struct ovn_igmp_group *igmp_group =
>> >> -        ovn_igmp_group_find(igmp_groups, datapath, &group_address);
>> >> +        ovn_igmp_group_find(igmp_groups, datapath, address);
>> >>
>> >>      if (!igmp_group) {
>> >>          igmp_group = xmalloc(sizeof *igmp_group);
>> >>
>> >>          const struct sbrec_multicast_group *mcgroup =
>> >> -            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp,
>> >> -                               sb_igmp_group->address, datapath->sb);
>> >> +            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp, address_s,
>> >> +                               datapath->sb);
>> >>
>> >>          igmp_group->datapath = datapath;
>> >> -        igmp_group->address = group_address;
>> >> +        igmp_group->address = *address;
>> >>          if (mcgroup) {
>> >>              igmp_group->mcgroup.key = mcgroup->tunnel_key;
>> >>              add_tnlid(&datapath->mcast_info.group_tnlids, mcgroup->tunnel_key);
>> >>          } else {
>> >>              igmp_group->mcgroup.key = 0;
>> >>          }
>> >> -        igmp_group->mcgroup.name = sb_igmp_group->address;
>> >> -        ovs_list_init(&igmp_group->sb_entries);
>> >> +        igmp_group->mcgroup.name = address_s;
>> >> +        ovs_list_init(&igmp_group->entries);
>> >>
>> >>          hmap_insert(igmp_groups, &igmp_group->hmap_node,
>> >> -                    ovn_igmp_group_hash(datapath, &group_address));
>> >> +                    ovn_igmp_group_hash(datapath, address));
>> >> +        ovs_list_push_back(&datapath->mcast_info.groups,
>> >> +                           &igmp_group->list_node);
>> >> +    }
>> >> +
>> >> +    return igmp_group;
>> >> +}
>> >> +
>> >> +static bool
>> >> +ovn_igmp_group_get_address(const struct sbrec_igmp_group *sb_igmp_group,
>> >> +                           struct in6_addr *address)
>> >> +{
>> >> +    ovs_be32 ipv4;
>> >> +
>> >> +    if (ip_parse(sb_igmp_group->address, &ipv4)) {
>> >> +        *address = in6_addr_mapped_ipv4(ipv4);
>> >> +        return true;
>> >> +    }
>> >> +    if (!ipv6_parse(sb_igmp_group->address, address)) {
>> >> +        return false;
>> >>      }
>> >> +    return true;
>> >> +}
>> >>
>> >> +static struct ovn_port **
>> >> +ovn_igmp_group_get_ports(const struct sbrec_igmp_group *sb_igmp_group,
>> >> +                         size_t *n_ports, struct hmap *ovn_ports)
>> >> +{
>> >> +    struct ovn_port **ports = xmalloc(sb_igmp_group->n_ports * sizeof *ports);
>> >> +
>> >> +    *n_ports = 0;
>> >> +    for (size_t i = 0; i < sb_igmp_group->n_ports; i++) {
>> >> +        ports[(*n_ports)] =
>> >> +            ovn_port_find(ovn_ports, sb_igmp_group->ports[i]->logical_port);
>> >> +        if (ports[(*n_ports)]) {
>> >> +            (*n_ports)++;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    return ports;
>> >> +}
>> >> +
>> >> +static void
>> >> +ovn_igmp_group_add_entry(struct ovn_igmp_group *igmp_group,
>> >> +                         struct ovn_port **ports, size_t n_ports)
>> >> +{
>> >>      struct ovn_igmp_group_entry *entry = xmalloc(sizeof *entry);
>> >>
>> >> -    entry->sb = sb_igmp_group;
>> >> -    ovs_list_push_back(&igmp_group->sb_entries , &entry->list_node);
>> >> +    entry->ports = ports;
>> >> +    entry->n_ports = n_ports;
>> >> +    ovs_list_push_back(&igmp_group->entries, &entry->list_node);
>> >> +}
>> >> +
>> >> +static void
>> >> +ovn_igmp_group_destroy_entry(struct ovn_igmp_group_entry *entry)
>> >> +{
>> >> +    free(entry->ports);
>> >> +}
>> >> +
>> >> +static bool
>> >> +ovn_igmp_group_allocate_id(struct ovn_igmp_group *igmp_group)
>> >> +{
>> >> +    if (igmp_group->mcgroup.key == 0) {
>> >> +        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
>> >> +        igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
>> >> +    }
>> >> +
>> >> +    if (igmp_group->mcgroup.key == 0) {
>> >> +        return false;
>> >> +    }
>> >> +
>> >> +    return true;
>> >>  }
>> >>
>> >>  static void
>> >>  ovn_igmp_group_aggregate_ports(struct ovn_igmp_group *igmp_group,
>> >> -                               struct hmap *ovn_ports,
>> >>                                 struct hmap *mcast_groups)
>> >>  {
>> >>      struct ovn_igmp_group_entry *entry;
>> >>
>> >> -    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
>> >> -        size_t n_oports = 0;
>> >> -        struct ovn_port **oports =
>> >> -            xmalloc(entry->sb->n_ports * sizeof *oports);
>> >> -
>> >> -        for (size_t i = 0; i < entry->sb->n_ports; i++) {
>> >> -            oports[n_oports] =
>> >> -                ovn_port_find(ovn_ports, entry->sb->ports[i]->logical_port);
>> >> -            if (oports[n_oports]) {
>> >> -                n_oports++;
>> >> -            }
>> >> -        }
>> >> -
>> >> +    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
>> >>          ovn_multicast_add_ports(mcast_groups, igmp_group->datapath,
>> >> -                                &igmp_group->mcgroup, oports, n_oports);
>> >> -        free(oports);
>> >> +                                &igmp_group->mcgroup, entry->ports,
>> >> +                                entry->n_ports);
>> >> +
>> >> +        ovn_igmp_group_destroy_entry(entry);
>> >>          free(entry);
>> >>      }
>> >>  }
>> >> @@ -3079,10 +3197,12 @@ ovn_igmp_group_destroy(struct hmap *igmp_groups,
>> >>      if (igmp_group) {
>> >>          struct ovn_igmp_group_entry *entry;
>> >>
>> >> -        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
>> >> +        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
>> >> +            ovn_igmp_group_destroy_entry(entry);
>> >>              free(entry);
>> >>          }
>> >>          hmap_remove(igmp_groups, &igmp_group->hmap_node);
>> >> +        ovs_list_remove(&igmp_group->list_node);
>> >>          free(igmp_group);
>> >>      }
>> >>  }
>> >> @@ -5282,7 +5402,9 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>> >>              continue;
>> >>          }
>> >>
>> >> -        if (od->mcast_info.enabled) {
>> >> +        struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
>> >> +
>> >> +        if (mcast_sw_info->enabled) {
>> >>              /* Punt IGMP traffic to controller. */
>> >>              ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 100,
>> >>                            "ip4 && ip.proto == 2", "igmp;");
>> >> @@ -5295,9 +5417,16 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>> >>                            "outport = \""MC_FLOOD"\"; output;");
>> >>
>> >>              /* Drop unregistered IP multicast if not allowed. */
>> >> -            if (!od->mcast_info.flood_unregistered) {
>> >> -                ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
>> >> -                              "ip4 && ip4.mcast", "drop;");
>> >> +            if (!mcast_sw_info->flood_unregistered) {
>> >> +                /* Forward unregistered IP multicast to mrouter (if any). */
>> >> +                if (mcast_sw_info->flood_relay) {
>> >> +                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
>> >> +                                  "ip4 && ip4.mcast",
>> >> +                                  "outport = \""MC_MROUTER_FLOOD"\"; output;");
>> >> +                } else {
>> >> +                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
>> >> +                                  "ip4 && ip4.mcast", "drop;");
>> >> +                }
>> >>              }
>> >>          }
>> >>
>> >> @@ -5314,18 +5443,26 @@ build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
>> >>              continue;
>> >>          }
>> >>
>> >> -        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
>> >> +        struct mcast_switch_info *mcast_sw_info =
>> >> +            &igmp_group->datapath->mcast_info.sw;
>> >>
>> >> -        if (mcast_info->active_flows >= mcast_info->table_size) {
>> >> +        if (mcast_sw_info->active_flows >= mcast_sw_info->table_size) {
>> >>              continue;
>> >>          }
>> >> -        mcast_info->active_flows++;
>> >> +        mcast_sw_info->active_flows++;
>> >>
>> >>          ds_clear(&match);
>> >>          ds_clear(&actions);
>> >>
>> >>          ds_put_format(&match, "eth.mcast && ip4 && ip4.dst == %s ",
>> >>                        igmp_group->mcgroup.name);
>> >> +        /* Also flood traffic to all multicast routers with relay enabled. */
>> >> +        if (mcast_sw_info->flood_relay) {
>> >> +            ds_put_cstr(&actions,
>> >> +                        "clone { "
>> >> +                            "outport = \""MC_MROUTER_FLOOD "\"; output; "
>> >> +                        "};");
>> >> +        }
>> >>          ds_put_format(&actions, "outport = \"%s\"; output; ",
>> >>                        igmp_group->mcgroup.name);
>> >>
>> >> @@ -6205,7 +6342,7 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>> >>           * source or destination, and zero network source or destination
>> >>           * (priority 100). */
>> >>          ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 100,
>> >> -                      "ip4.mcast || "
>> >> +                      "ip4.src[28..31] == 0xe ||"
>> >
>> >
>> > Does it makes sense to add another predicate for source mcast addr ?
>> > I am fine either way.
>>
>> Yes, I'll add "ip4.src_mcast" and send v4. I'm thinking of also
>> renaming "ip4.mcast" to "ip4.dst_mcast" for uniformity. What do you
>> think?
>>
>
> For backward compatibility reasons, I would suggest adding a new one - ip4.dst_mcast
> instead of renaming.
>
> I am thinking about the impact when ovn-controller/ovn-northd are upgraded to a new version.
>
> If we upgrade ovn-controller first (which is the recommended order), then any logical flow with "ip4.mcast" will
> be rejected. This could cause datapath disruption until ovn-northd is upgraded.
>

Ah, true, thanks for pointing it out.
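
To make the upgrade concern concrete: before moving ovn-controller to a
version that drops or renames a predicate, one could count the southbound
logical flows that still reference the old symbol (a rough sketch, not an
official procedure):

    # Sketch: count SB logical flows still matching on "ip4.mcast"; an
    # ovn-controller that no longer knows the symbol would reject these.
    ovn-sbctl --bare --columns=match find Logical_Flow | grep -c 'ip4\.mcast'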

>
>>
>> Thanks,
>> Dumitru
>>
>> >
>> >
>> >>
>> >>                        "ip4.src == 255.255.255.255 || "
>> >>                        "ip4.src == 127.0.0.0/8 || "
>> >>                        "ip4.dst == 127.0.0.0/8 || "
>> >> @@ -6213,6 +6350,16 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>> >>                        "ip4.dst == 0.0.0.0/8",
>> >>                        "drop;");
>> >>
>> >> +        /* Allow multicast if relay enabled (priority 95). */
>> >> +        ds_clear(&actions);
>> >> +        if (od->mcast_info.rtr.relay) {
>> >> +            ds_put_cstr(&actions, "next;");
>> >> +        } else {
>> >> +            ds_put_cstr(&actions, "drop;");
>> >> +        }
>> >> +        ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 95,
>> >> +                      "ip4.dst[28..31] == 0xe", ds_cstr(&actions));
>> >
>> >
>> > Since ip4.mcast is a predicate to "ip4.dst[28..31] == 0xe", can you please use ip4.mcast here ?
>> >
>> >
>> >>
>> >> +
>> >>          /* ARP reply handling.  Use ARP replies to populate the logical
>> >>           * router's ARP table. */
>> >>          ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "arp.op == 2",
>> >> @@ -7483,6 +7630,27 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>> >>          }
>> >>      }
>> >>
>> >> +    /* IP Multicast lookup. Here we set the output port, adjust TTL and
>> >> +     * advance to next table (priority 500).
>> >> +     */
>> >> +    HMAP_FOR_EACH (od, key_node, datapaths) {
>> >> +        if (!od->nbr || !od->mcast_info.rtr.relay) {
>> >> +            continue;
>> >> +        }
>> >> +        struct ovn_igmp_group *igmp_group;
>> >> +
>> >> +        LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
>> >> +            ds_clear(&match);
>> >> +            ds_clear(&actions);
>> >> +            ds_put_format(&match, "ip4 && ip4.dst == %s ",
>> >> +                          igmp_group->mcgroup.name);
>> >> +            ds_put_format(&actions, "outport = \"%s\"; ip.ttl--; next;",
>> >> +                          igmp_group->mcgroup.name);
>> >> +            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 500,
>> >> +                          ds_cstr(&match), ds_cstr(&actions));
>> >> +        }
>> >> +    }
>> >> +
>> >>      /* Logical router ingress table 8: Policy.
>> >>       *
>> >>       * A packet that arrives at this table is an IP packet that should be
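
As a quick sanity check (a sketch; lflow-list output formatting may differ
between versions), the per-group routing flows added here can be observed
once mcast_relay is enabled and a group has been learnt:

    # Sketch: dump the router datapath's logical flows and look for the
    # priority-500 IP multicast routing entries ("rtr" is the example
    # router name used in the tests below).
    ovn-sbctl lflow-list rtr | grep 'priority=500'
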
>> >> @@ -7513,10 +7681,24 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>> >>
>> >>      /* Local router ingress table 9: ARP Resolution.
>> >>       *
>> >> -     * Any packet that reaches this table is an IP packet whose next-hop IP
>> >> -     * address is in reg0. (ip4.dst is the final destination.) This table
>> >> -     * resolves the IP address in reg0 into an output port in outport and an
>> >> -     * Ethernet address in eth.dst. */
>> >> +     * Multicast packets already have the outport set so just advance to next
>> >> +     * table (priority 500). */
>> >> +    HMAP_FOR_EACH (od, key_node, datapaths) {
>> >> +        if (!od->nbr) {
>> >> +            continue;
>> >> +        }
>> >> +
>> >> +        ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 500,
>> >> +                      "ip4.mcast", "next;");
>> >> +    }
>> >> +
>> >> +    /* Local router ingress table 9: ARP Resolution.
>> >> +     *
>> >> +     * Any unicast packet that reaches this table is an IP packet whose
>> >> +     * next-hop IP address is in reg0. (ip4.dst is the final destination.)
>> >> +     * This table resolves the IP address in reg0 into an output port in
>> >> +     * outport and an Ethernet address in eth.dst.
>> >> +     */
>> >>      HMAP_FOR_EACH (op, key_node, ports) {
>> >>          if (op->nbsp && !lsp_is_enabled(op->nbsp)) {
>> >>              continue;
>> >> @@ -7998,9 +8180,13 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>> >>          ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 0, "1", "output;");
>> >>      }
>> >>
>> >> -    /* Logical router egress table 1: Delivery (priority 100).
>> >> +    /* Logical router egress table 1: Delivery (priority 100-110).
>> >>       *
>> >> -     * Priority 100 rules deliver packets to enabled logical ports. */
>> >> +     * Priority 100 rules deliver packets to enabled logical ports.
>> >> +     * Priority 110 rules match multicast packets and update the source
>> >> +     * mac before delivering to enabled logical ports. IP multicast traffic
>> >> +     * bypasses S_ROUTER_IN_IP_ROUTING route lookups.
>> >> +     */
>> >>      HMAP_FOR_EACH (op, key_node, ports) {
>> >>          if (!op->nbrp) {
>> >>              continue;
>> >> @@ -8020,6 +8206,19 @@ build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
>> >>              continue;
>> >>          }
>> >>
>> >> +        /* If multicast relay is enabled then also adjust source mac for IP
>> >> +         * multicast traffic.
>> >> +         */
>> >> +        if (op->od->mcast_info.rtr.relay) {
>> >> +            ds_clear(&match);
>> >> +            ds_clear(&actions);
>> >> +            ds_put_format(&match, "ip4.mcast && outport == %s", op->json_key);
>> >> +            ds_put_format(&actions, "eth.src = %s; output;",
>> >> +                          op->lrp_networks.ea_s);
>> >> +            ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 110,
>> >> +                        ds_cstr(&match), ds_cstr(&actions));
>> >> +        }
>> >> +
>> >>          ds_clear(&match);
>> >>          ds_put_format(&match, "outport == %s", op->json_key);
>> >>          ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 100,
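
For reference, the test updates later in this patch encode exactly this
delivery behavior; a shell-comment sketch of the transformation they expect:

    # From the relay test below: a packet injected on sw3 with
    # eth.src=00:00:00:00:00:01 and TTL 0x20 is delivered on sw2 with the
    # rtr-sw2 MAC as eth.src and a decremented TTL (IP checksum adjusted):
    #   in:   eth.src=000000000001  ttl=20  ip_chksum=ca70
    #   out:  eth.src=000000000200  ttl=1f  ip_chksum=cb70
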
>> >> @@ -8570,7 +8769,7 @@ build_ip_mcast(struct northd_context *ctx, struct hmap *datapaths)
>> >>          if (!ip_mcast) {
>> >>              ip_mcast = sbrec_ip_multicast_insert(ctx->ovnsb_txn);
>> >>          }
>> >> -        store_mcast_info_for_datapath(ip_mcast, od);
>> >> +        store_mcast_info_for_switch_datapath(ip_mcast, od);
>> >>      }
>> >>
>> >>      /* Delete southbound records without northbound matches. */
>> >> @@ -8602,6 +8801,14 @@ build_mcast_groups(struct northd_context *ctx,
>> >>
>> >>          if (lsp_is_enabled(op->nbsp)) {
>> >>              ovn_multicast_add(mcast_groups, &mc_flood, op);
>> >> +
>> >> +            /* If this port is connected to a multicast router then add it
>> >> +             * to the MC_MROUTER_FLOOD group.
>> >> +             */
>> >> +            if (op->od->mcast_info.sw.flood_relay && op->peer &&
>> >> +                    op->peer->od && op->peer->od->mcast_info.rtr.relay) {
>> >> +                ovn_multicast_add(mcast_groups, &mc_mrouter_flood, op);
>> >> +            }
>> >>          }
>> >>      }
>> >>
>> >> @@ -8624,10 +8831,61 @@ build_mcast_groups(struct northd_context *ctx,
>> >>              continue;
>> >>          }
>> >>
>> >> +        struct in6_addr group_address;
>> >> +        if (!ovn_igmp_group_get_address(sb_igmp, &group_address)) {
>> >> +            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
>> >> +            VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
>> >> +                         sb_igmp->address);
>> >> +            continue;
>> >> +        }
>> >> +
>> >>          /* Add the IGMP group entry. Will also try to allocate an ID for it
>> >>           * if the multicast group already exists.
>> >>           */
>> >> -        ovn_igmp_group_add(ctx, igmp_groups, od, sb_igmp);
>> >> +        struct ovn_igmp_group *igmp_group =
>> >> +            ovn_igmp_group_add(ctx, igmp_groups, od, &group_address,
>> >> +                               sb_igmp->address);
>> >> +
>> >> +        /* Extract the IGMP group ports from the SB entry and store them
>> >> +         * in the IGMP group.
>> >> +         */
>> >> +        size_t n_igmp_ports;
>> >> +        struct ovn_port **igmp_ports =
>> >> +            ovn_igmp_group_get_ports(sb_igmp, &n_igmp_ports, ports);
>> >> +        ovn_igmp_group_add_entry(igmp_group, igmp_ports, n_igmp_ports);
>> >> +    }
>> >> +
>> >> +    /* Build IGMP groups for multicast routers with relay enabled. The router
>> >> +     * IGMP groups are based on the groups learnt by their multicast enabled
>> >> +     * peers.
>> >> +     */
>> >> +    struct ovn_datapath *od;
>> >> +    HMAP_FOR_EACH (od, key_node, datapaths) {
>> >> +
>> >> +        if (ovs_list_is_empty(&od->mcast_info.groups)) {
>> >> +            continue;
>> >> +        }
>> >> +
>> >> +        for (size_t i = 0; i < od->n_router_ports; i++) {
>> >> +            struct ovn_port *router_port = od->router_ports[i]->peer;
>> >> +
>> >> +            if (!router_port || !router_port->od ||
>> >> +                    !router_port->od->mcast_info.rtr.relay) {
>> >> +                continue;
>> >> +            }
>> >> +
>> >> +            struct ovn_igmp_group *igmp_group;
>> >> +            LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
>> >> +                struct ovn_igmp_group *igmp_group_rtr =
>> >> +                    ovn_igmp_group_add(ctx, igmp_groups, router_port->od,
>> >> +                                       &igmp_group->address,
>> >> +                                       igmp_group->mcgroup.name);
>> >> +                struct ovn_port **router_igmp_ports =
>> >> +                    xmalloc(sizeof *router_igmp_ports);
>> >> +                router_igmp_ports[0] = router_port;
>> >> +                ovn_igmp_group_add_entry(igmp_group_rtr, router_igmp_ports, 1);
>> >> +            }
>> >> +        }
>> >>      }
>> >>
>> >>      /* Walk the aggregated IGMP groups and allocate IDs for new entries.
>> >> @@ -8635,21 +8893,17 @@ build_mcast_groups(struct northd_context *ctx,
>> >>       */
>> >>      struct ovn_igmp_group *igmp_group, *igmp_group_next;
>> >>      HMAP_FOR_EACH_SAFE (igmp_group, igmp_group_next, hmap_node, igmp_groups) {
>> >> -        if (igmp_group->mcgroup.key == 0) {
>> >> -            struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
>> >> -            igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
>> >> -        }
>> >>
>> >> -        /* If we ran out of keys just destroy the entry. */
>> >> -        if (igmp_group->mcgroup.key == 0) {
>> >> +        if (!ovn_igmp_group_allocate_id(igmp_group)) {
>> >> +            /* If we ran out of keys just destroy the entry. */
>> >>              ovn_igmp_group_destroy(igmp_groups, igmp_group);
>> >>              continue;
>> >>          }
>> >>
>> >> -        /* Aggregate the ports from all SB entries corresponding to this
>> >> +        /* Aggregate the ports from all entries corresponding to this
>> >>           * group.
>> >>           */
>> >> -        ovn_igmp_group_aggregate_ports(igmp_group, ports, mcast_groups);
>> >> +        ovn_igmp_group_aggregate_ports(igmp_group, mcast_groups);
>> >>      }
>> >>  }
>> >>
>> >> diff --git a/ovn-nb.xml b/ovn-nb.xml
>> >> index f5f10a5..db8cc20 100644
>> >> --- a/ovn-nb.xml
>> >> +++ b/ovn-nb.xml
>> >> @@ -1526,6 +1526,12 @@
>> >>            address.
>> >>          </p>
>> >>        </column>
>> >> +      <column name="options" key="mcast_relay" type'{"type": "boolean"}'>
>> >> +        <p>
>> >> +          Enables/disables IP multicast relay between logical switches
>> >> +          connected to the logical router. Default: False.
>> >> +        </p>
>> >> +      </column>
>> >>      </group>
>> >>
>> >>      <group title="Common Columns">
>> >> diff --git a/tests/ovn.at b/tests/ovn.at
>> >> index 71eb390..52c044c 100644
>> >> --- a/tests/ovn.at
>> >> +++ b/tests/ovn.at
>> >> @@ -14721,12 +14721,12 @@ AT_CHECK([ovn-sbctl get controller_event $uuid seq_num], [0], [dnl
>> >>  OVN_CLEANUP([hv1], [hv2])
>> >>  AT_CLEANUP
>> >>
>> >> -AT_SETUP([ovn -- IGMP snoop/querier])
>> >> +AT_SETUP([ovn -- IGMP snoop/querier/relay])
>> >>  AT_SKIP_IF([test $HAVE_PYTHON = no])
>> >>  ovn_start
>> >>
>> >>  # Logical network:
>> >> -# Two independent logical switches (sw1 and sw2).
>> >> +# Three logical switches (sw1-sw3) connected to a logical router (rtr).
>> >>  # sw1:
>> >>  #   - subnet 10.0.0.0/8
>> >>  #   - 2 ports bound on hv1 (sw1-p11, sw1-p12)
>> >> @@ -14736,6 +14736,10 @@ ovn_start
>> >>  #   - 1 port bound on hv1 (sw2-p1)
>> >>  #   - 1 port bound on hv2 (sw2-p2)
>> >>  #   - IGMP Querier from 20.0.0.254
>> >> +# sw3:
>> >> +#   - subnet 30.0.0.0/8
>> >> +#   - 1 port bound on hv1 (sw3-p1)
>> >> +#   - 1 port bound on hv2 (sw3-p2)
>> >>
>> >>  reset_pcap_file() {
>> >>      local iface=$1
>> >> @@ -14812,29 +14816,47 @@ store_igmp_v3_query() {
>> >>  }
>> >>
>> >>  #
>> >> -# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN
>> >> -#    IP_PROTO DATA OUTFILE
>> >> +# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
>> >> +#    IP_CHKSUM IP_PROTO DATA
>> >>  #
>> >>  # This shell function causes an IP multicast packet to be received on INPORT
>> >>  # of HV.
>> >>  # The hexdump of the packet is stored in OUTFILE.
>> >>  #
>> >>  send_ip_multicast_pkt() {
>> >> -    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ip_src=$5 ip_dst=$6
>> >> -    local ip_len=$7 ip_chksum=$8 proto=$9 data=${10} outfile=${11}
>> >> -
>> >> -    local ip_ttl=20
>> >> +    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4
>> >> +    local ip_src=$5 ip_dst=$6 ip_len=$7 ip_ttl=$8 ip_chksum=$9 proto=${10}
>> >> +    local data=${11}
>> >>
>> >>      local eth=${eth_dst}${eth_src}0800
>> >>      local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
>> >>      local packet=${eth}${ip}${data}
>> >>
>> >>      as $hv ovs-appctl netdev-dummy/receive ${inport} ${packet}
>> >> +}
>> >> +
>> >> +#
>> >> +# store_ip_multicast_pkt ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
>> >> +#    IP_CHKSUM IP_PROTO DATA OUTFILE
>> >> +#
>> >> +# This shell function builds an IP multicast packet and stores the hexdump of
>> >> +# the packet in OUTFILE.
>> >> +#
>> >> +store_ip_multicast_pkt() {
>> >> +    local eth_src=$1 eth_dst=$2
>> >> +    local ip_src=$3 ip_dst=$4 ip_len=$5 ip_ttl=$6 ip_chksum=$7 proto=$8
>> >> +    local data=$9 outfile=${10}
>> >> +
>> >> +    local eth=${eth_dst}${eth_src}0800
>> >> +    local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
>> >> +    local packet=${eth}${ip}${data}
>> >> +
>> >>      echo ${packet} >> ${outfile}
>> >>  }
>> >>
>> >>  ovn-nbctl ls-add sw1
>> >>  ovn-nbctl ls-add sw2
>> >> +ovn-nbctl ls-add sw3
>> >>
>> >>  ovn-nbctl lsp-add sw1 sw1-p11
>> >>  ovn-nbctl lsp-add sw1 sw1-p12
>> >> @@ -14842,6 +14864,26 @@ ovn-nbctl lsp-add sw1 sw1-p21
>> >>  ovn-nbctl lsp-add sw1 sw1-p22
>> >>  ovn-nbctl lsp-add sw2 sw2-p1
>> >>  ovn-nbctl lsp-add sw2 sw2-p2
>> >> +ovn-nbctl lsp-add sw3 sw3-p1
>> >> +ovn-nbctl lsp-add sw3 sw3-p2
>> >> +
>> >> +ovn-nbctl lr-add rtr
>> >> +ovn-nbctl lrp-add rtr rtr-sw1 00:00:00:00:01:00 10.0.0.254/24
>> >> +ovn-nbctl lrp-add rtr rtr-sw2 00:00:00:00:02:00 20.0.0.254/24
>> >> +ovn-nbctl lrp-add rtr rtr-sw3 00:00:00:00:03:00 30.0.0.254/24
>> >> +
>> >> +ovn-nbctl lsp-add sw1 sw1-rtr                      \
>> >> +    -- lsp-set-type sw1-rtr router                 \
>> >> +    -- lsp-set-addresses sw1-rtr 00:00:00:00:01:00 \
>> >> +    -- lsp-set-options sw1-rtr router-port=rtr-sw1
>> >> +ovn-nbctl lsp-add sw2 sw2-rtr                      \
>> >> +    -- lsp-set-type sw2-rtr router                 \
>> >> +    -- lsp-set-addresses sw2-rtr 00:00:00:00:02:00 \
>> >> +    -- lsp-set-options sw2-rtr router-port=rtr-sw2
>> >> +ovn-nbctl lsp-add sw3 sw3-rtr                      \
>> >> +    -- lsp-set-type sw3-rtr router                 \
>> >> +    -- lsp-set-addresses sw3-rtr 00:00:00:00:03:00 \
>> >> +    -- lsp-set-options sw3-rtr router-port=rtr-sw3
>> >>
>> >>  net_add n1
>> >>  sim_add hv1
>> >> @@ -14863,6 +14905,11 @@ ovs-vsctl -- add-port br-int hv1-vif3 -- \
>> >>      options:tx_pcap=hv1/vif3-tx.pcap \
>> >>      options:rxq_pcap=hv1/vif3-rx.pcap \
>> >>      ofport-request=1
>> >> +ovs-vsctl -- add-port br-int hv1-vif4 -- \
>> >> +    set interface hv1-vif4 external-ids:iface-id=sw3-p1 \
>> >> +    options:tx_pcap=hv1/vif4-tx.pcap \
>> >> +    options:rxq_pcap=hv1/vif4-rx.pcap \
>> >> +    ofport-request=1
>> >>
>> >>  sim_add hv2
>> >>  as hv2
>> >> @@ -14883,12 +14930,18 @@ ovs-vsctl -- add-port br-int hv2-vif3 -- \
>> >>      options:tx_pcap=hv2/vif3-tx.pcap \
>> >>      options:rxq_pcap=hv2/vif3-rx.pcap \
>> >>      ofport-request=1
>> >> +ovs-vsctl -- add-port br-int hv2-vif4 -- \
>> >> +    set interface hv2-vif4 external-ids:iface-id=sw3-p2 \
>> >> +    options:tx_pcap=hv2/vif4-tx.pcap \
>> >> +    options:rxq_pcap=hv2/vif4-rx.pcap \
>> >> +    ofport-request=1
>> >>
>> >>  OVN_POPULATE_ARP
>> >>
>> >>  # Enable IGMP snooping on sw1.
>> >> -ovn-nbctl set Logical_Switch sw1 other_config:mcast_querier="false"
>> >> -ovn-nbctl set Logical_Switch sw1 other_config:mcast_snoop="true"
>> >> +ovn-nbctl set Logical_Switch sw1       \
>> >> +    other_config:mcast_querier="false" \
>> >> +    other_config:mcast_snoop="true"
>> >>
>> >>  # No IGMP query should be generated by sw1 (mcast_querier="false").
>> >>  truncate -s 0 expected
>> >> @@ -14921,9 +14974,12 @@ truncate -s 0 expected
>> >>  truncate -s 0 expected_empty
>> >>  send_ip_multicast_pkt hv1-vif2 hv1 \
>> >>      000000000001 01005e000144 \
>> >> -    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
>> >> -    e518e518000a3b3a0000 \
>> >> -    expected
>> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> >> +    e518e518000a3b3a0000
>> >> +store_ip_multicast_pkt \
>> >> +    000000000001 01005e000144 \
>> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> >> +    e518e518000a3b3a0000 expected
>> >>
>> >>  OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected])
>> >>  OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
>> >> @@ -14944,17 +15000,19 @@ OVS_WAIT_UNTIL([
>> >>      test "${total_entries}" = "1"
>> >>  ])
>> >>
>> >> -# Send traffic traffic and make sure it gets forwarded only on the port that
>> >> -# joined.
>> >> +# Send traffic and make sure it gets forwarded only on the port that joined.
>> >>  as hv1 reset_pcap_file hv1-vif1 hv1/vif1
>> >>  as hv2 reset_pcap_file hv2-vif1 hv2/vif1
>> >>  truncate -s 0 expected
>> >>  truncate -s 0 expected_empty
>> >>  send_ip_multicast_pkt hv1-vif2 hv1 \
>> >>      000000000001 01005e000144 \
>> >> -    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
>> >> -    e518e518000a3b3a0000 \
>> >> -    expected
>> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> >> +    e518e518000a3b3a0000
>> >> +store_ip_multicast_pkt \
>> >> +    000000000001 01005e000144 \
>> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> >> +    e518e518000a3b3a0000 expected
>> >>
>> >>  OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
>> >>  OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
>> >> @@ -14988,6 +15046,111 @@ sleep 1
>> >>  OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected])
>> >>  OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
>> >>
>> >> +# Disable IGMP querier on sw2.
>> >> +ovn-nbctl set Logical_Switch sw2 \
>> >> +    other_config:mcast_querier="false"
>> >> +
>> >> +# Enable IGMP snooping on sw3.
>> >> +ovn-nbctl set Logical_Switch sw3       \
>> >> +    other_config:mcast_querier="false" \
>> >> +    other_config:mcast_snoop="true"
>> >> +
>> >> +# Send traffic from sw3 and make sure rtr doesn't relay it.
>> >> +truncate -s 0 expected_empty
>> >> +
>> >> +as hv1 reset_pcap_file hv1-vif1 hv1/vif1
>> >> +as hv1 reset_pcap_file hv1-vif2 hv1/vif2
>> >> +as hv1 reset_pcap_file hv1-vif3 hv1/vif3
>> >> +as hv1 reset_pcap_file hv1-vif4 hv1/vif4
>> >> +as hv2 reset_pcap_file hv2-vif1 hv2/vif1
>> >> +as hv2 reset_pcap_file hv2-vif2 hv2/vif2
>> >> +as hv2 reset_pcap_file hv2-vif3 hv2/vif3
>> >> +as hv2 reset_pcap_file hv2-vif4 hv2/vif4
>> >> +
>> >> +send_ip_multicast_pkt hv2-vif4 hv2 \
>> >> +    000000000001 01005e000144 \
>> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> >> +    e518e518000a3b3a0000
>> >> +
>> >> +# Sleep a bit to make sure no traffic is received and then check.
>> >> +sleep 1
>> >> +OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
>> >> +OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_empty])
>> >> +OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_empty])
>> >> +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
>> >> +OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
>> >> +OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
>> >> +OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
>> >> +OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
>> >> +
>> >> +# Enable IGMP relay on rtr
>> >> +ovn-nbctl set logical_router rtr \
>> >> +    options:mcast_relay="true"
>> >> +
>> >> +# Inject IGMP Join for 239.0.1.68 on sw1-p11.
>> >> +send_igmp_v3_report hv1-vif1 hv1 \
>> >> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
>> >> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
>> >> +    /dev/null
>> >> +# Inject IGMP Join for 239.0.1.68 on sw2-p2.
>> >> +send_igmp_v3_report hv2-vif3 hv2 \
>> >> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
>> >> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
>> >> +    /dev/null
>> >> +# Inject IGMP Join for 239.0.1.68 on sw3-p1.
>> >> +send_igmp_v3_report hv1-vif4 hv1 \
>> >> +    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
>> >> +    $(ip_to_hex 239 0 1 68) 04 e9b9 \
>> >> +    /dev/null
>> >> +
>> >> +# Check that the IGMP Group is learned by all switches.
>> >> +OVS_WAIT_UNTIL([
>> >> +    total_entries=`ovn-sbctl find IGMP_Group | grep "239.0.1.68" | wc -l`
>> >> +    test "${total_entries}" = "3"
>> >> +])
>> >> +
>> >> +# Send traffic from sw3 and make sure it is relayed by rtr to the
>> >> +# switches and ports that joined.
>> >> +truncate -s 0 expected_routed_sw1
>> >> +truncate -s 0 expected_routed_sw2
>> >> +truncate -s 0 expected_switched
>> >> +truncate -s 0 expected_empty
>> >> +
>> >> +as hv1 reset_pcap_file hv1-vif1 hv1/vif1
>> >> +as hv1 reset_pcap_file hv1-vif2 hv1/vif2
>> >> +as hv1 reset_pcap_file hv1-vif3 hv1/vif3
>> >> +as hv1 reset_pcap_file hv1-vif4 hv1/vif4
>> >> +as hv2 reset_pcap_file hv2-vif1 hv2/vif1
>> >> +as hv2 reset_pcap_file hv2-vif2 hv2/vif2
>> >> +as hv2 reset_pcap_file hv2-vif3 hv2/vif3
>> >> +as hv2 reset_pcap_file hv2-vif4 hv2/vif4
>> >> +
>> >> +send_ip_multicast_pkt hv2-vif4 hv2 \
>> >> +    000000000001 01005e000144 \
>> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> >> +    e518e518000a3b3a0000
>> >> +store_ip_multicast_pkt \
>> >> +    000000000100 01005e000144 \
>> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
>> >> +    e518e518000a3b3a0000 expected_routed_sw1
>> >> +store_ip_multicast_pkt \
>> >> +    000000000200 01005e000144 \
>> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
>> >> +    e518e518000a3b3a0000 expected_routed_sw2
>> >> +store_ip_multicast_pkt \
>> >> +    000000000001 01005e000144 \
>> >> +    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
>> >> +    e518e518000a3b3a0000 expected_switched
>> >> +
>> >> +OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_routed_sw1])
>> >> +OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_routed_sw2])
>> >> +OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_switched])
>> >> +OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
>> >> +OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
>> >> +OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
>> >> +OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
>> >> +OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
>> >> +
>> >>  OVN_CLEANUP([hv1], [hv2])
>> >>  AT_CLEANUP
>> >>
>> >> --
>> >> 1.8.3.1
>> >>
diff mbox series

Patch

diff --git a/NEWS b/NEWS
index f476984..73045d6 100644
--- a/NEWS
+++ b/NEWS
@@ -39,6 +39,7 @@  Post-v2.11.0
        logical groups which results in tunnels only been formed between
        members of the same transport zone(s).
      * Support for new logical switch port type - 'virtual'.
+     * Support for IGMP Snooping/Querier and Relay.
    - New QoS type "linux-netem" on Linux.
    - Added support for TLS Server Name Indication (SNI).
 
diff --git a/lib/mcast-group-index.h b/lib/mcast-group-index.h
index 15a1592..cb49ad7 100644
--- a/lib/mcast-group-index.h
+++ b/lib/mcast-group-index.h
@@ -20,8 +20,17 @@  struct ovsdb_idl;
 
 struct sbrec_datapath_binding;
 
-#define OVN_MCAST_FLOOD_TUNNEL_KEY   65535
-#define OVN_MCAST_UNKNOWN_TUNNEL_KEY (OVN_MCAST_FLOOD_TUNNEL_KEY - 1)
+#define OVN_MIN_MULTICAST 32768
+#define OVN_MAX_MULTICAST 65535
+
+enum ovn_mcast_tunnel_keys {
+
+    OVN_MCAST_FLOOD_TUNNEL_KEY = OVN_MIN_MULTICAST,
+    OVN_MCAST_UNKNOWN_TUNNEL_KEY,
+    OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY,
+    OVN_MIN_IP_MULTICAST,
+    OVN_MAX_IP_MULTICAST = OVN_MAX_MULTICAST,
+};
 
 struct ovsdb_idl_index *mcast_group_index_create(struct ovsdb_idl *);
 const struct sbrec_multicast_group *
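
With this layout the multicast tunnel-key space [32768, 65535] starts with
the three fixed groups, and dynamically allocated IGMP groups take keys from
OVN_MIN_IP_MULTICAST (32771) upward.  A sketch for eyeballing the allocation
on a live system:

    # Sketch: list southbound multicast groups with their tunnel keys;
    # _MC_flood, _MC_unknown and _MC_mrouter_flood should sit at
    # 32768-32770, with learnt IGMP groups following from 32771.
    ovn-sbctl --columns=name,tunnel_key find Multicast_Group
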
diff --git a/northd/ovn-northd.8.xml b/northd/ovn-northd.8.xml
index 6d2fbe3..d45bb15 100644
--- a/northd/ovn-northd.8.xml
+++ b/northd/ovn-northd.8.xml
@@ -947,10 +947,40 @@  output;
 
     <ul>
       <li>
-        A priority-100 flow that outputs all packets with an Ethernet broadcast
+        A priority-100 flow that punts all IGMP packets to
+        <code>ovn-controller</code> if IGMP snooping is enabled on the
+        logical switch.
+      </li>
+
+      <li>
+        Priority-90 flows that forward registered IP multicast traffic to
+        their corresponding multicast group, which <code>ovn-northd</code>
+        creates based on learnt <ref table="IGMP_Group" db="OVN_Southbound"/>
+        entries.  The flows also forward packets to the
+        <code>MC_MROUTER_FLOOD</code> multicast group, which
+        <code>ovn-northd</code> populates with all the logical ports that
+        are connected to logical routers with
+        <ref column="options" table="Logical_Router"/>:mcast_relay='true'.
+      </li>
+
+      <li>
+        A priority-85 flow that forwards all IP multicast traffic destined to
+        224.0.0.X to the <code>MC_FLOOD</code> multicast group, which
+        <code>ovn-northd</code> populates with all enabled logical ports.
+      </li>
+
+      <li>
+        A priority-80 flow that forwards all unregistered IP multicast traffic
+        to the <code>MC_MROUTER_FLOOD</code> multicast group, if any.
+        Otherwise the flow drops all unregistered IP multicast packets.  This
+        flow is added only if <ref column="other_config"
+        table="Logical_Switch"/>:mcast_flood_unregistered='false'.
+      </li>
+
+      <li>
+        A priority-70 flow that outputs all packets with an Ethernet broadcast
         or multicast <code>eth.dst</code> to the <code>MC_FLOOD</code>
-        multicast group, which <code>ovn-northd</code> populates with all
-        enabled logical ports.
+        multicast group.
       </li>
 
       <li>
@@ -1228,6 +1258,14 @@  output;
 
       <li>
         <p>
+          A priority-95 flow that allows IP multicast traffic if
+          <ref column="options" table="Logical_Router"/>:mcast_relay='true'
+          and drops it otherwise.
+        </p>
+      </li>
+
+      <li>
+        <p>
           ICMP echo reply.  These flows reply to ICMP echo requests received
           for the router's IP address.  Let <var>A</var> be an IP address
           owned by a router port.  Then, for each <var>A</var> that is
@@ -1941,6 +1979,16 @@  output;
     <ul>
       <li>
         <p>
+          Priority-500 flows that match IP multicast traffic destined to
+          groups registered on any of the attached switches and set
+          <code>outport</code> to the associated multicast group that will
+          eventually flood the traffic to all interested attached logical
+          switches.  The flows also decrement TTL.
+        </p>
+      </li>
+
+      <li>
+        <p>
           For distributed logical routers where one of the logical router
           ports specifies a <code>redirect-chassis</code>, a priority-400
           logical flow for each ip source/destination couple that matches the
@@ -2074,6 +2122,15 @@  next;
     <ul>
       <li>
         <p>
+          A priority-500 flow that matches IP multicast traffic that was
+          allowed in the routing pipeline.  For this kind of traffic the
+          <code>outport</code> was already set, so the flow just advances to
+          the next table.
+        </p>
+      </li>
+
+      <li>
+        <p>
           For distributed logical routers where one of the logical router
           ports specifies a <code>redirect-chassis</code>, a priority-400
           logical flow with match <code>REGBIT_DISTRIBUTED_NAT == 1</code>
@@ -2641,9 +2698,19 @@  clone {
     <h3>Egress Table 3: Delivery</h3>
 
     <p>
-      Packets that reach this table are ready for delivery.  It contains
-      priority-100 logical flows that match packets on each enabled logical
-      router port, with action <code>output;</code>.
+      Packets that reach this table are ready for delivery.  It contains:
+      <ul>
+        <li>
+          Priority-110 logical flows that match IP multicast packets on each
+          enabled logical router port and modify the Ethernet source address
+          of the packets to the Ethernet address of the port and then execute
+          action <code>output;</code>.
+        </li>
+        <li>
+          Priority-100 logical flows that match packets on each enabled
+          logical router port, with action <code>output;</code>.
+        </li>
+      </ul>
     </p>
 
 </manpage>
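
The behavior documented above is driven entirely by northbound
configuration; a minimal sketch (the switch and router names are examples
borrowed from the tests) that exercises the new flows:

    # Enable IGMP snooping on a switch and multicast relay on the router
    # connecting it; this installs the priority-90/85/80 switch flows and
    # the priority-95/500 router flows described in this man page.
    ovn-nbctl set Logical_Switch sw1 other_config:mcast_snoop="true"
    ovn-nbctl set logical_router rtr options:mcast_relay="true"
    # Learnt groups then show up in the southbound database:
    ovn-sbctl find IGMP_Group
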
diff --git a/northd/ovn-northd.c b/northd/ovn-northd.c
index e6953a4..9ee9230 100644
--- a/northd/ovn-northd.c
+++ b/northd/ovn-northd.c
@@ -433,32 +433,52 @@  struct ipam_info {
     bool mac_only;
 };
 
-#define OVN_MIN_MULTICAST 32768
-#define OVN_MAX_MULTICAST OVN_MCAST_FLOOD_TUNNEL_KEY
-BUILD_ASSERT_DECL(OVN_MIN_MULTICAST < OVN_MAX_MULTICAST);
-
-#define OVN_MIN_IP_MULTICAST OVN_MIN_MULTICAST
-#define OVN_MAX_IP_MULTICAST (OVN_MCAST_UNKNOWN_TUNNEL_KEY - 1)
-BUILD_ASSERT_DECL(OVN_MAX_IP_MULTICAST >= OVN_MIN_MULTICAST);
-
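+/* The multicast group tunnel key ranges (previously defined here) now
+ * live in lib/mcast-group-index.h, together with the reserved
+ * OVN_MCAST_*_TUNNEL_KEY values used below.  A sketch of that header,
+ * assuming keys are allocated sequentially from OVN_MIN_MULTICAST
+ * (32768) up to OVN_MAX_MULTICAST (65535); see the real header for the
+ * actual layout:
+ *
+ *     enum ovn_mcast_tunnel_keys {
+ *         OVN_MCAST_FLOOD_TUNNEL_KEY = OVN_MIN_MULTICAST,
+ *         OVN_MCAST_UNKNOWN_TUNNEL_KEY,
+ *         OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY,
+ *         OVN_MIN_IP_MULTICAST,
+ *         OVN_MAX_IP_MULTICAST = OVN_MAX_MULTICAST,
+ *     };
+ */
+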
 /*
  * Multicast snooping and querier per datapath configuration.
  */
+struct mcast_switch_info {
+
+    bool enabled;               /* True if snooping enabled. */
+    bool querier;               /* True if querier enabled. */
+    bool flood_unregistered;    /* True if unregistered multicast should be
+                                 * flooded.
+                                 */
+    bool flood_relay;           /* True if the switch is connected to a
+                                 * multicast router and unregistered multicast
+                                 * should be flooded to the mrouter. Only
+                                 * applicable if flood_unregistered == false.
+                                 */
+
+    int64_t table_size;         /* Max number of IP multicast groups. */
+    int64_t idle_timeout;       /* Timeout after which an idle group is
+                                 * flushed.
+                                 */
+    int64_t query_interval;     /* Interval between multicast queries. */
+    char *eth_src;              /* ETH src address of the multicast queries. */
+    char *ipv4_src;             /* IP src address of the multicast queries. */
+    int64_t query_max_response; /* Expected time after which reports should
+                                 * be received for queries that were sent out.
+                                 */
+
+    uint32_t active_flows;      /* Current number of active IP multicast
+                                 * flows.
+                                 */
+};
+
+struct mcast_router_info {
+    bool relay; /* True if the router should relay IP multicast. */
+};
+
 struct mcast_info {
-    bool enabled;
-    bool querier;
-    bool flood_unregistered;
-
-    int64_t table_size;
-    int64_t idle_timeout;
-    int64_t query_interval;
-    char *eth_src;
-    char *ipv4_src;
-    int64_t  query_max_response;
-
-    struct hmap group_tnlids;
-    uint32_t group_tnlid_hint;
-    uint32_t active_flows;
+
+    struct hmap group_tnlids;  /* Group tunnel IDs in use on this DP. */
+    uint32_t group_tnlid_hint; /* Hint for allocating next group tunnel ID. */
+    struct ovs_list groups;    /* List of groups learnt on this DP. */
+
+    union {
+        struct mcast_switch_info sw;  /* Switch specific multicast info. */
+        struct mcast_router_info rtr; /* Router specific multicast info. */
+    };
 };
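+
+/* Note: mcast_info.sw is valid only for switch datapaths (od->nbs set)
+ * and mcast_info.rtr only for router datapaths (od->nbr set).  A
+ * minimal, hypothetical accessor illustrating the intended discipline:
+ *
+ *     static bool
+ *     datapath_relays_multicast(const struct ovn_datapath *od)
+ *     {
+ *         if (od->nbs) {
+ *             return od->mcast_info.sw.flood_relay;
+ *         } else if (od->nbr) {
+ *             return od->mcast_info.rtr.relay;
+ *         }
+ *         return false;
+ *     }
+ */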
 
 static uint32_t
@@ -559,6 +579,7 @@  ovn_datapath_create(struct hmap *datapaths, const struct uuid *key,
 }
 
 static void ovn_ls_port_group_destroy(struct hmap *nb_pgs);
+static void destroy_mcast_info_for_datapath(struct ovn_datapath *od);
 
 static void
 ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
@@ -572,12 +593,7 @@  ovn_datapath_destroy(struct hmap *datapaths, struct ovn_datapath *od)
         bitmap_free(od->ipam_info.allocated_ipv4s);
         free(od->router_ports);
         ovn_ls_port_group_destroy(&od->nb_pgs);
-
-        if (od->nbs) {
-            free(od->mcast_info.eth_src);
-            free(od->mcast_info.ipv4_src);
-            destroy_tnlids(&od->mcast_info.group_tnlids);
-        }
+        destroy_mcast_info_for_datapath(od);
 
         free(od);
     }
@@ -714,23 +730,28 @@  init_ipam_info_for_datapath(struct ovn_datapath *od)
 }
 
 static void
-init_mcast_info_for_datapath(struct ovn_datapath *od)
+init_mcast_info_for_router_datapath(struct ovn_datapath *od)
 {
-    if (!od->nbs) {
-        return;
-    }
+    struct mcast_router_info *mcast_rtr_info = &od->mcast_info.rtr;
 
-    struct mcast_info *mcast_info = &od->mcast_info;
+    mcast_rtr_info->relay = smap_get_bool(&od->nbr->options, "mcast_relay",
+                                          false);
+}
 
-    mcast_info->enabled =
+static void
+init_mcast_info_for_switch_datapath(struct ovn_datapath *od)
+{
+    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
+
+    mcast_sw_info->enabled =
         smap_get_bool(&od->nbs->other_config, "mcast_snoop", false);
-    mcast_info->querier =
+    mcast_sw_info->querier =
         smap_get_bool(&od->nbs->other_config, "mcast_querier", true);
-    mcast_info->flood_unregistered =
+    mcast_sw_info->flood_unregistered =
         smap_get_bool(&od->nbs->other_config, "mcast_flood_unregistered",
                       false);
 
-    mcast_info->table_size =
+    mcast_sw_info->table_size =
         smap_get_ullong(&od->nbs->other_config, "mcast_table_size",
                         OVN_MCAST_DEFAULT_MAX_ENTRIES);
 
@@ -742,54 +763,94 @@  init_mcast_info_for_datapath(struct ovn_datapath *od)
     } else if (idle_timeout > OVN_MCAST_MAX_IDLE_TIMEOUT_S) {
         idle_timeout = OVN_MCAST_MAX_IDLE_TIMEOUT_S;
     }
-    mcast_info->idle_timeout = idle_timeout;
+    mcast_sw_info->idle_timeout = idle_timeout;
 
     uint32_t query_interval =
         smap_get_ullong(&od->nbs->other_config, "mcast_query_interval",
-                        mcast_info->idle_timeout / 2);
+                        mcast_sw_info->idle_timeout / 2);
     if (query_interval < OVN_MCAST_MIN_QUERY_INTERVAL_S) {
         query_interval = OVN_MCAST_MIN_QUERY_INTERVAL_S;
     } else if (query_interval > OVN_MCAST_MAX_QUERY_INTERVAL_S) {
         query_interval = OVN_MCAST_MAX_QUERY_INTERVAL_S;
     }
-    mcast_info->query_interval = query_interval;
+    mcast_sw_info->query_interval = query_interval;
 
-    mcast_info->eth_src =
+    mcast_sw_info->eth_src =
         nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_eth_src"));
-    mcast_info->ipv4_src =
+    mcast_sw_info->ipv4_src =
         nullable_xstrdup(smap_get(&od->nbs->other_config, "mcast_ip4_src"));
 
-    mcast_info->query_max_response =
+    mcast_sw_info->query_max_response =
         smap_get_ullong(&od->nbs->other_config, "mcast_query_max_response",
                         OVN_MCAST_DEFAULT_QUERY_MAX_RESPONSE_S);
 
-    hmap_init(&mcast_info->group_tnlids);
-    mcast_info->group_tnlid_hint = OVN_MIN_IP_MULTICAST;
-    mcast_info->active_flows = 0;
+    mcast_sw_info->active_flows = 0;
+}
+
+static void
+init_mcast_info_for_datapath(struct ovn_datapath *od)
+{
+    if (!od->nbr && !od->nbs) {
+        return;
+    }
+
+    hmap_init(&od->mcast_info.group_tnlids);
+    od->mcast_info.group_tnlid_hint = OVN_MIN_IP_MULTICAST;
+    ovs_list_init(&od->mcast_info.groups);
+
+    if (od->nbs) {
+        init_mcast_info_for_switch_datapath(od);
+    } else {
+        init_mcast_info_for_router_datapath(od);
+    }
+}
+
+static void
+destroy_mcast_info_for_switch_datapath(struct ovn_datapath *od)
+{
+    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
+
+    free(mcast_sw_info->eth_src);
+    free(mcast_sw_info->ipv4_src);
+}
+
+static void
+destroy_mcast_info_for_datapath(struct ovn_datapath *od)
+{
+    if (!od->nbr && !od->nbs) {
+        return;
+    }
+
+    if (od->nbs) {
+        destroy_mcast_info_for_switch_datapath(od);
+    }
+
+    destroy_tnlids(&od->mcast_info.group_tnlids);
 }
 
 static void
-store_mcast_info_for_datapath(const struct sbrec_ip_multicast *sb,
-                              struct ovn_datapath *od)
+store_mcast_info_for_switch_datapath(const struct sbrec_ip_multicast *sb,
+                                     struct ovn_datapath *od)
 {
-    struct mcast_info *mcast_info = &od->mcast_info;
+    struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
 
     sbrec_ip_multicast_set_datapath(sb, od->sb);
-    sbrec_ip_multicast_set_enabled(sb, &mcast_info->enabled, 1);
-    sbrec_ip_multicast_set_querier(sb, &mcast_info->querier, 1);
-    sbrec_ip_multicast_set_table_size(sb, &mcast_info->table_size, 1);
-    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_info->idle_timeout, 1);
+    sbrec_ip_multicast_set_enabled(sb, &mcast_sw_info->enabled, 1);
+    sbrec_ip_multicast_set_querier(sb, &mcast_sw_info->querier, 1);
+    sbrec_ip_multicast_set_table_size(sb, &mcast_sw_info->table_size, 1);
+    sbrec_ip_multicast_set_idle_timeout(sb, &mcast_sw_info->idle_timeout, 1);
     sbrec_ip_multicast_set_query_interval(sb,
-                                          &mcast_info->query_interval, 1);
+                                          &mcast_sw_info->query_interval, 1);
     sbrec_ip_multicast_set_query_max_resp(sb,
-                                          &mcast_info->query_max_response, 1);
+                                          &mcast_sw_info->query_max_response,
+                                          1);
 
-    if (mcast_info->eth_src) {
-        sbrec_ip_multicast_set_eth_src(sb, mcast_info->eth_src);
+    if (mcast_sw_info->eth_src) {
+        sbrec_ip_multicast_set_eth_src(sb, mcast_sw_info->eth_src);
     }
 
-    if (mcast_info->ipv4_src) {
-        sbrec_ip_multicast_set_ip4_src(sb, mcast_info->ipv4_src);
+    if (mcast_sw_info->ipv4_src) {
+        sbrec_ip_multicast_set_ip4_src(sb, mcast_sw_info->ipv4_src);
     }
 }
 
@@ -906,6 +967,7 @@  join_datapaths(struct northd_context *ctx, struct hmap *datapaths,
                                      NULL, nbr, NULL);
             ovs_list_push_back(nb_only, &od->list);
         }
+        init_mcast_info_for_datapath(od);
         ovs_list_push_back(lr_list, &od->lr_list);
     }
 }
@@ -1999,6 +2061,13 @@  join_logical_ports(struct northd_context *ctx,
                     break;
                 }
             }
+
+            /* If the peer router has multicast relay enabled then set
+             * flood_relay on the switch datapath.
+             */
+            if (peer->od && peer->od->mcast_info.rtr.relay) {
+                op->od->mcast_info.sw.flood_relay = true;
+            }
         } else if (op->nbrp && op->nbrp->peer && !op->derived) {
             struct ovn_port *peer = ovn_port_find(ports, op->nbrp->peer);
             if (peer) {
@@ -2846,6 +2915,10 @@  struct multicast_group {
 static const struct multicast_group mc_flood =
     { MC_FLOOD, OVN_MCAST_FLOOD_TUNNEL_KEY };
 
+#define MC_MROUTER_FLOOD "_MC_mrouter_flood"
+static const struct multicast_group mc_mrouter_flood =
+    { MC_MROUTER_FLOOD, OVN_MCAST_MROUTER_FLOOD_TUNNEL_KEY };
+
 #define MC_UNKNOWN "_MC_unknown"
 static const struct multicast_group mc_unknown =
     { MC_UNKNOWN, OVN_MCAST_UNKNOWN_TUNNEL_KEY };
@@ -2955,7 +3028,8 @@  ovn_multicast_update_sbrec(const struct ovn_multicast *mc,
  */
 struct ovn_igmp_group_entry {
     struct ovs_list list_node; /* Linkage in the list of entries. */
-    const struct sbrec_igmp_group *sb;
+    size_t n_ports;
+    struct ovn_port **ports;
 };
 
 /*
@@ -2964,12 +3038,13 @@  struct ovn_igmp_group_entry {
  */
 struct ovn_igmp_group {
     struct hmap_node hmap_node; /* Index on 'datapath' and 'address'. */
+    struct ovs_list list_node;  /* Linkage in the per-dp igmp group list. */
 
     struct ovn_datapath *datapath;
     struct in6_addr address; /* Multicast IPv6-mapped-IPv4 or IPv4 address. */
     struct multicast_group mcgroup;
 
-    struct ovs_list sb_entries; /* List of SB entries for this group. */
+    struct ovs_list entries; /* List of port entries for this group. */
 };
 
 static uint32_t
@@ -2997,77 +3072,120 @@  ovn_igmp_group_find(struct hmap *igmp_groups,
     return NULL;
 }
 
-static void
+static struct ovn_igmp_group *
 ovn_igmp_group_add(struct northd_context *ctx, struct hmap *igmp_groups,
                    struct ovn_datapath *datapath,
-                   const struct sbrec_igmp_group *sb_igmp_group)
+                   const struct in6_addr *address,
+                   const char *address_s)
 {
-    struct in6_addr group_address;
-    ovs_be32 ipv4;
-
-    if (ip_parse(sb_igmp_group->address, &ipv4)) {
-        group_address = in6_addr_mapped_ipv4(ipv4);
-    } else if (!ipv6_parse(sb_igmp_group->address, &group_address)) {
-        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
-        VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
-                     sb_igmp_group->address);
-        return;
-    }
-
     struct ovn_igmp_group *igmp_group =
-        ovn_igmp_group_find(igmp_groups, datapath, &group_address);
+        ovn_igmp_group_find(igmp_groups, datapath, address);
 
     if (!igmp_group) {
         igmp_group = xmalloc(sizeof *igmp_group);
 
         const struct sbrec_multicast_group *mcgroup =
-            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp,
-                               sb_igmp_group->address, datapath->sb);
+            mcast_group_lookup(ctx->sbrec_mcast_group_by_name_dp, address_s,
+                               datapath->sb);
 
         igmp_group->datapath = datapath;
-        igmp_group->address = group_address;
+        igmp_group->address = *address;
         if (mcgroup) {
             igmp_group->mcgroup.key = mcgroup->tunnel_key;
             add_tnlid(&datapath->mcast_info.group_tnlids, mcgroup->tunnel_key);
         } else {
             igmp_group->mcgroup.key = 0;
         }
-        igmp_group->mcgroup.name = sb_igmp_group->address;
-        ovs_list_init(&igmp_group->sb_entries);
+        igmp_group->mcgroup.name = address_s;
+        ovs_list_init(&igmp_group->entries);
 
         hmap_insert(igmp_groups, &igmp_group->hmap_node,
-                    ovn_igmp_group_hash(datapath, &group_address));
+                    ovn_igmp_group_hash(datapath, address));
+        ovs_list_push_back(&datapath->mcast_info.groups,
+                           &igmp_group->list_node);
+    }
+
+    return igmp_group;
+}
+
+static bool
+ovn_igmp_group_get_address(const struct sbrec_igmp_group *sb_igmp_group,
+                           struct in6_addr *address)
+{
+    ovs_be32 ipv4;
+
+    if (ip_parse(sb_igmp_group->address, &ipv4)) {
+        *address = in6_addr_mapped_ipv4(ipv4);
+        return true;
+    }
+    if (!ipv6_parse(sb_igmp_group->address, address)) {
+        return false;
     }
+    return true;
+}
 
+static struct ovn_port **
+ovn_igmp_group_get_ports(const struct sbrec_igmp_group *sb_igmp_group,
+                         size_t *n_ports, struct hmap *ovn_ports)
+{
+    struct ovn_port **ports = xmalloc(sb_igmp_group->n_ports * sizeof *ports);
+
+    *n_ports = 0;
+    for (size_t i = 0; i < sb_igmp_group->n_ports; i++) {
+        ports[(*n_ports)] =
+            ovn_port_find(ovn_ports, sb_igmp_group->ports[i]->logical_port);
+        if (ports[(*n_ports)]) {
+            (*n_ports)++;
+        }
+    }
+
+    return ports;
+}
+
+static void
+ovn_igmp_group_add_entry(struct ovn_igmp_group *igmp_group,
+                         struct ovn_port **ports, size_t n_ports)
+{
     struct ovn_igmp_group_entry *entry = xmalloc(sizeof *entry);
 
-    entry->sb = sb_igmp_group;
-    ovs_list_push_back(&igmp_group->sb_entries , &entry->list_node);
+    entry->ports = ports;
+    entry->n_ports = n_ports;
+    ovs_list_push_back(&igmp_group->entries, &entry->list_node);
+}
+
+static void
+ovn_igmp_group_destroy_entry(struct ovn_igmp_group_entry *entry)
+{
+    free(entry->ports);
+}
+
+static bool
+ovn_igmp_group_allocate_id(struct ovn_igmp_group *igmp_group)
+{
+    if (igmp_group->mcgroup.key == 0) {
+        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
+        igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
+    }
+
+    if (igmp_group->mcgroup.key == 0) {
+        return false;
+    }
+
+    return true;
 }
 
 static void
 ovn_igmp_group_aggregate_ports(struct ovn_igmp_group *igmp_group,
-                               struct hmap *ovn_ports,
                                struct hmap *mcast_groups)
 {
     struct ovn_igmp_group_entry *entry;
 
-    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
-        size_t n_oports = 0;
-        struct ovn_port **oports =
-            xmalloc(entry->sb->n_ports * sizeof *oports);
-
-        for (size_t i = 0; i < entry->sb->n_ports; i++) {
-            oports[n_oports] =
-                ovn_port_find(ovn_ports, entry->sb->ports[i]->logical_port);
-            if (oports[n_oports]) {
-                n_oports++;
-            }
-        }
-
+    LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
         ovn_multicast_add_ports(mcast_groups, igmp_group->datapath,
-                                &igmp_group->mcgroup, oports, n_oports);
-        free(oports);
+                                &igmp_group->mcgroup, entry->ports,
+                                entry->n_ports);
+
+        ovn_igmp_group_destroy_entry(entry);
         free(entry);
     }
 }
@@ -3079,10 +3197,12 @@  ovn_igmp_group_destroy(struct hmap *igmp_groups,
     if (igmp_group) {
         struct ovn_igmp_group_entry *entry;
 
-        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->sb_entries) {
+        LIST_FOR_EACH_POP (entry, list_node, &igmp_group->entries) {
+            ovn_igmp_group_destroy_entry(entry);
             free(entry);
         }
         hmap_remove(igmp_groups, &igmp_group->hmap_node);
+        ovs_list_remove(&igmp_group->list_node);
         free(igmp_group);
     }
 }
@@ -5282,7 +5402,9 @@  build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
             continue;
         }
 
-        if (od->mcast_info.enabled) {
+        struct mcast_switch_info *mcast_sw_info = &od->mcast_info.sw;
+
+        if (mcast_sw_info->enabled) {
             /* Punt IGMP traffic to controller. */
             ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 100,
                           "ip4 && ip.proto == 2", "igmp;");
@@ -5295,9 +5417,16 @@  build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
                           "outport = \""MC_FLOOD"\"; output;");
 
             /* Drop unregistered IP multicast if not allowed. */
-            if (!od->mcast_info.flood_unregistered) {
-                ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
-                              "ip4 && ip4.mcast", "drop;");
+            if (!mcast_sw_info->flood_unregistered) {
+                /* Forward unregistered IP multicast to mrouter (if any). */
+                if (mcast_sw_info->flood_relay) {
+                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
+                                  "ip4 && ip4.mcast",
+                                  "outport = \""MC_MROUTER_FLOOD"\"; output;");
+                } else {
+                    ovn_lflow_add(lflows, od, S_SWITCH_IN_L2_LKUP, 80,
+                                  "ip4 && ip4.mcast", "drop;");
+                }
             }
         }
 
@@ -5314,18 +5443,26 @@  build_lswitch_flows(struct hmap *datapaths, struct hmap *ports,
             continue;
         }
 
-        struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
+        struct mcast_switch_info *mcast_sw_info =
+            &igmp_group->datapath->mcast_info.sw;
 
-        if (mcast_info->active_flows >= mcast_info->table_size) {
+        if (mcast_sw_info->active_flows >= mcast_sw_info->table_size) {
             continue;
         }
-        mcast_info->active_flows++;
+        mcast_sw_info->active_flows++;
 
         ds_clear(&match);
         ds_clear(&actions);
 
         ds_put_format(&match, "eth.mcast && ip4 && ip4.dst == %s ",
                       igmp_group->mcgroup.name);
+        /* Also flood traffic to all multicast routers with relay enabled. */
+        if (mcast_sw_info->flood_relay) {
+            ds_put_cstr(&actions,
+                        "clone { "
+                            "outport = \""MC_MROUTER_FLOOD "\"; output; "
+                        "};");
+        }
         ds_put_format(&actions, "outport = \"%s\"; output; ",
                       igmp_group->mcgroup.name);
 
@@ -6205,7 +6342,7 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
          * source or destination, and zero network source or destination
          * (priority 100). */
         ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 100,
-                      "ip4.mcast || "
+                      "ip4.src[28..31] == 0xe ||"
                       "ip4.src == 255.255.255.255 || "
                       "ip4.src == 127.0.0.0/8 || "
                       "ip4.dst == 127.0.0.0/8 || "
@@ -6213,6 +6350,16 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
                       "ip4.dst == 0.0.0.0/8",
                       "drop;");
 
+        /* Allow multicast if relay enabled (priority 95). */
+        ds_clear(&actions);
+        if (od->mcast_info.rtr.relay) {
+            ds_put_cstr(&actions, "next;");
+        } else {
+            ds_put_cstr(&actions, "drop;");
+        }
+        ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 95,
+                      "ip4.dst[28..31] == 0xe", ds_cstr(&actions));
+
         /* ARP reply handling.  Use ARP replies to populate the logical
          * router's ARP table. */
         ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_INPUT, 90, "arp.op == 2",
@@ -7483,6 +7630,27 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
         }
     }
 
+    /* IP Multicast lookup. Here we set the output port, adjust TTL and
+     * advance to next table (priority 500).
+     */
+    HMAP_FOR_EACH (od, key_node, datapaths) {
+        if (!od->nbr || !od->mcast_info.rtr.relay) {
+            continue;
+        }
+        struct ovn_igmp_group *igmp_group;
+
+        LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
+            ds_clear(&match);
+            ds_clear(&actions);
+            ds_put_format(&match, "ip4 && ip4.dst == %s ",
+                          igmp_group->mcgroup.name);
+            ds_put_format(&actions, "outport = \"%s\"; ip.ttl--; next;",
+                          igmp_group->mcgroup.name);
+            ovn_lflow_add(lflows, od, S_ROUTER_IN_IP_ROUTING, 500,
+                          ds_cstr(&match), ds_cstr(&actions));
+        }
+    }
+
     /* Logical router ingress table 8: Policy.
      *
      * A packet that arrives at this table is an IP packet that should be
@@ -7513,10 +7681,24 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
 
     /* Local router ingress table 9: ARP Resolution.
      *
-     * Any packet that reaches this table is an IP packet whose next-hop IP
-     * address is in reg0. (ip4.dst is the final destination.) This table
-     * resolves the IP address in reg0 into an output port in outport and an
-     * Ethernet address in eth.dst. */
+     * Multicast packets already have the outport set so just advance to next
+     * table (priority 500). */
+    HMAP_FOR_EACH (od, key_node, datapaths) {
+        if (!od->nbr) {
+            continue;
+        }
+
+        ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_RESOLVE, 500,
+                      "ip4.mcast", "next;");
+    }
+
+    /* Local router ingress table 9: ARP Resolution.
+     *
+     * Any unicast packet that reaches this table is an IP packet whose
+     * next-hop IP address is in reg0. (ip4.dst is the final destination.)
+     * This table resolves the IP address in reg0 into an output port in
+     * outport and an Ethernet address in eth.dst.
+     */
     HMAP_FOR_EACH (op, key_node, ports) {
         if (op->nbsp && !lsp_is_enabled(op->nbsp)) {
             continue;
@@ -7998,9 +8180,13 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
         ovn_lflow_add(lflows, od, S_ROUTER_IN_ARP_REQUEST, 0, "1", "output;");
     }
 
-    /* Logical router egress table 1: Delivery (priority 100).
+    /* Logical router egress table 1: Delivery (priority 100-110).
      *
-     * Priority 100 rules deliver packets to enabled logical ports. */
+     * Priority 100 rules deliver packets to enabled logical ports.
+     * Priority 110 rules match IP multicast packets and update the source
+     * MAC before delivering to enabled logical ports.  IP multicast
+     * traffic bypasses the regular S_ROUTER_IN_IP_ROUTING route lookups
+     * and ARP resolution, so the source MAC has to be set at delivery.
+     */
     HMAP_FOR_EACH (op, key_node, ports) {
         if (!op->nbrp) {
             continue;
@@ -8020,6 +8206,19 @@  build_lrouter_flows(struct hmap *datapaths, struct hmap *ports,
             continue;
         }
 
+        /* If multicast relay is enabled then also adjust the source MAC
+         * for IP multicast traffic.
+         */
+        if (op->od->mcast_info.rtr.relay) {
+            ds_clear(&match);
+            ds_clear(&actions);
+            ds_put_format(&match, "ip4.mcast && outport == %s", op->json_key);
+            ds_put_format(&actions, "eth.src = %s; output;",
+                          op->lrp_networks.ea_s);
+            ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 110,
+                          ds_cstr(&match), ds_cstr(&actions));
+        }
+
         ds_clear(&match);
         ds_put_format(&match, "outport == %s", op->json_key);
         ovn_lflow_add(lflows, op->od, S_ROUTER_OUT_DELIVERY, 100,
@@ -8570,7 +8769,7 @@  build_ip_mcast(struct northd_context *ctx, struct hmap *datapaths)
         if (!ip_mcast) {
             ip_mcast = sbrec_ip_multicast_insert(ctx->ovnsb_txn);
         }
-        store_mcast_info_for_datapath(ip_mcast, od);
+        store_mcast_info_for_switch_datapath(ip_mcast, od);
     }
 
     /* Delete southbound records without northbound matches. */
@@ -8602,6 +8801,14 @@  build_mcast_groups(struct northd_context *ctx,
 
         if (lsp_is_enabled(op->nbsp)) {
             ovn_multicast_add(mcast_groups, &mc_flood, op);
+
+            /* If this port is connected to a multicast router then add it
+             * to the MC_MROUTER_FLOOD group.
+             */
+            if (op->od->mcast_info.sw.flood_relay && op->peer &&
+                    op->peer->od && op->peer->od->mcast_info.rtr.relay) {
+                ovn_multicast_add(mcast_groups, &mc_mrouter_flood, op);
+            }
         }
     }
 
@@ -8624,10 +8831,61 @@  build_mcast_groups(struct northd_context *ctx,
             continue;
         }
 
+        struct in6_addr group_address;
+        if (!ovn_igmp_group_get_address(sb_igmp, &group_address)) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
+            VLOG_WARN_RL(&rl, "invalid IGMP group address: %s",
+                         sb_igmp->address);
+            continue;
+        }
+
         /* Add the IGMP group entry. Will also try to allocate an ID for it
          * if the multicast group already exists.
          */
-        ovn_igmp_group_add(ctx, igmp_groups, od, sb_igmp);
+        struct ovn_igmp_group *igmp_group =
+            ovn_igmp_group_add(ctx, igmp_groups, od, &group_address,
+                               sb_igmp->address);
+
+        /* Extract the IGMP group ports from the SB entry and store them
+         * in the IGMP group.
+         */
+        size_t n_igmp_ports;
+        struct ovn_port **igmp_ports =
+            ovn_igmp_group_get_ports(sb_igmp, &n_igmp_ports, ports);
+        ovn_igmp_group_add_entry(igmp_group, igmp_ports, n_igmp_ports);
+    }
+
+    /* Build IGMP groups for multicast routers with relay enabled. The router
+     * IGMP groups are based on the groups learnt by their multicast enabled
+     * peers.
+     */
+    struct ovn_datapath *od;
+    HMAP_FOR_EACH (od, key_node, datapaths) {
+        if (ovs_list_is_empty(&od->mcast_info.groups)) {
+            continue;
+        }
+
+        for (size_t i = 0; i < od->n_router_ports; i++) {
+            struct ovn_port *router_port = od->router_ports[i]->peer;
+
+            if (!router_port || !router_port->od ||
+                    !router_port->od->mcast_info.rtr.relay) {
+                continue;
+            }
+
+            struct ovn_igmp_group *igmp_group;
+            LIST_FOR_EACH (igmp_group, list_node, &od->mcast_info.groups) {
+                struct ovn_igmp_group *igmp_group_rtr =
+                    ovn_igmp_group_add(ctx, igmp_groups, router_port->od,
+                                       &igmp_group->address,
+                                       igmp_group->mcgroup.name);
+                struct ovn_port **router_igmp_ports =
+                    xmalloc(sizeof *router_igmp_ports);
+                router_igmp_ports[0] = router_port;
+                ovn_igmp_group_add_entry(igmp_group_rtr, router_igmp_ports, 1);
+            }
+        }
     }
 
     /* Walk the aggregated IGMP groups and allocate IDs for new entries.
@@ -8635,21 +8893,17 @@  build_mcast_groups(struct northd_context *ctx,
      */
     struct ovn_igmp_group *igmp_group, *igmp_group_next;
     HMAP_FOR_EACH_SAFE (igmp_group, igmp_group_next, hmap_node, igmp_groups) {
-        if (igmp_group->mcgroup.key == 0) {
-            struct mcast_info *mcast_info = &igmp_group->datapath->mcast_info;
-            igmp_group->mcgroup.key = ovn_mcast_group_allocate_key(mcast_info);
-        }
 
-        /* If we ran out of keys just destroy the entry. */
-        if (igmp_group->mcgroup.key == 0) {
+        if (!ovn_igmp_group_allocate_id(igmp_group)) {
+            /* If we ran out of keys just destroy the entry. */
             ovn_igmp_group_destroy(igmp_groups, igmp_group);
             continue;
         }
 
-        /* Aggregate the ports from all SB entries corresponding to this
+        /* Aggregate the ports from all entries corresponding to this
          * group.
          */
-        ovn_igmp_group_aggregate_ports(igmp_group, ports, mcast_groups);
+        ovn_igmp_group_aggregate_ports(igmp_group, mcast_groups);
     }
 }
 
diff --git a/ovn-nb.xml b/ovn-nb.xml
index f5f10a5..db8cc20 100644
--- a/ovn-nb.xml
+++ b/ovn-nb.xml
@@ -1526,6 +1526,12 @@ 
           address.
         </p>
       </column>
+      <column name="options" key="mcast_relay" type'{"type": "boolean"}'>
+        <p>
+          Enables/disables IP multicast relay between logical switches
+          connected to the logical router.  Default: <code>false</code>.
+        </p>
+      </column>
     </group>
 
     <group title="Common Columns">
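
[For reference, the option is enabled the same way the test below does
it, e.g.:

    ovn-nbctl set Logical_Router rtr options:mcast_relay="true"

where "rtr" is the logical router name.]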
diff --git a/tests/ovn.at b/tests/ovn.at
index 71eb390..52c044c 100644
--- a/tests/ovn.at
+++ b/tests/ovn.at
@@ -14721,12 +14721,12 @@  AT_CHECK([ovn-sbctl get controller_event $uuid seq_num], [0], [dnl
 OVN_CLEANUP([hv1], [hv2])
 AT_CLEANUP
 
-AT_SETUP([ovn -- IGMP snoop/querier])
+AT_SETUP([ovn -- IGMP snoop/querier/relay])
 AT_SKIP_IF([test $HAVE_PYTHON = no])
 ovn_start
 
 # Logical network:
-# Two independent logical switches (sw1 and sw2).
+# Three logical switches (sw1-sw3) connected to a logical router (rtr).
 # sw1:
 #   - subnet 10.0.0.0/8
 #   - 2 ports bound on hv1 (sw1-p11, sw1-p12)
@@ -14736,6 +14736,10 @@  ovn_start
 #   - 1 port bound on hv1 (sw2-p1)
 #   - 1 port bound on hv2 (sw2-p2)
 #   - IGMP Querier from 20.0.0.254
+# sw3:
+#   - subnet 30.0.0.0/8
+#   - 1 port bound on hv1 (sw3-p1)
+#   - 1 port bound on hv2 (sw3-p2)
 
 reset_pcap_file() {
     local iface=$1
@@ -14812,29 +14816,47 @@  store_igmp_v3_query() {
 }
 
 #
-# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN
-#    IP_PROTO DATA OUTFILE
+# send_ip_multicast_pkt INPORT HV ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
+#    IP_CHKSUM IP_PROTO DATA
 #
 # This shell function causes an IP multicast packet to be received on INPORT
 # of HV.
-# The hexdump of the packet is stored in OUTFILE.
 #
 send_ip_multicast_pkt() {
-    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4 ip_src=$5 ip_dst=$6
-    local ip_len=$7 ip_chksum=$8 proto=$9 data=${10} outfile=${11}
-
-    local ip_ttl=20
+    local inport=$1 hv=$2 eth_src=$3 eth_dst=$4
+    local ip_src=$5 ip_dst=$6 ip_len=$7 ip_ttl=$8 ip_chksum=$9 proto=${10}
+    local data=${11}
 
     local eth=${eth_dst}${eth_src}0800
     local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
     local packet=${eth}${ip}${data}
 
     as $hv ovs-appctl netdev-dummy/receive ${inport} ${packet}
+}
+
+#
+# store_ip_multicast_pkt ETH_SRC ETH_DST IP_SRC IP_DST IP_LEN TTL
+#    IP_CHKSUM IP_PROTO DATA OUTFILE
+#
+# This shell function builds an IP multicast packet and stores the hexdump of
+# the packet in OUTFILE.
+#
+store_ip_multicast_pkt() {
+    local eth_src=$1 eth_dst=$2
+    local ip_src=$3 ip_dst=$4 ip_len=$5 ip_ttl=$6 ip_chksum=$7 proto=$8
+    local data=$9 outfile=${10}
+
+    local eth=${eth_dst}${eth_src}0800
+    local ip=450000${ip_len}95f14000${ip_ttl}${proto}${ip_chksum}${ip_src}${ip_dst}
+    local packet=${eth}${ip}${data}
+
     echo ${packet} >> ${outfile}
 }
 
 ovn-nbctl ls-add sw1
 ovn-nbctl ls-add sw2
+ovn-nbctl ls-add sw3
 
 ovn-nbctl lsp-add sw1 sw1-p11
 ovn-nbctl lsp-add sw1 sw1-p12
@@ -14842,6 +14864,26 @@  ovn-nbctl lsp-add sw1 sw1-p21
 ovn-nbctl lsp-add sw1 sw1-p22
 ovn-nbctl lsp-add sw2 sw2-p1
 ovn-nbctl lsp-add sw2 sw2-p2
+ovn-nbctl lsp-add sw3 sw3-p1
+ovn-nbctl lsp-add sw3 sw3-p2
+
+ovn-nbctl lr-add rtr
+ovn-nbctl lrp-add rtr rtr-sw1 00:00:00:00:01:00 10.0.0.254/24
+ovn-nbctl lrp-add rtr rtr-sw2 00:00:00:00:02:00 20.0.0.254/24
+ovn-nbctl lrp-add rtr rtr-sw3 00:00:00:00:03:00 30.0.0.254/24
+
+ovn-nbctl lsp-add sw1 sw1-rtr                      \
+    -- lsp-set-type sw1-rtr router                 \
+    -- lsp-set-addresses sw1-rtr 00:00:00:00:01:00 \
+    -- lsp-set-options sw1-rtr router-port=rtr-sw1
+ovn-nbctl lsp-add sw2 sw2-rtr                      \
+    -- lsp-set-type sw2-rtr router                 \
+    -- lsp-set-addresses sw2-rtr 00:00:00:00:02:00 \
+    -- lsp-set-options sw2-rtr router-port=rtr-sw2
+ovn-nbctl lsp-add sw3 sw3-rtr                      \
+    -- lsp-set-type sw3-rtr router                 \
+    -- lsp-set-addresses sw3-rtr 00:00:00:00:03:00 \
+    -- lsp-set-options sw3-rtr router-port=rtr-sw3
 
 net_add n1
 sim_add hv1
@@ -14863,6 +14905,11 @@  ovs-vsctl -- add-port br-int hv1-vif3 -- \
     options:tx_pcap=hv1/vif3-tx.pcap \
     options:rxq_pcap=hv1/vif3-rx.pcap \
     ofport-request=1
+ovs-vsctl -- add-port br-int hv1-vif4 -- \
+    set interface hv1-vif4 external-ids:iface-id=sw3-p1 \
+    options:tx_pcap=hv1/vif4-tx.pcap \
+    options:rxq_pcap=hv1/vif4-rx.pcap \
+    ofport-request=1
 
 sim_add hv2
 as hv2
@@ -14883,12 +14930,18 @@  ovs-vsctl -- add-port br-int hv2-vif3 -- \
     options:tx_pcap=hv2/vif3-tx.pcap \
     options:rxq_pcap=hv2/vif3-rx.pcap \
     ofport-request=1
+ovs-vsctl -- add-port br-int hv2-vif4 -- \
+    set interface hv2-vif4 external-ids:iface-id=sw3-p2 \
+    options:tx_pcap=hv2/vif4-tx.pcap \
+    options:rxq_pcap=hv2/vif4-rx.pcap \
+    ofport-request=1
 
 OVN_POPULATE_ARP
 
 # Enable IGMP snooping on sw1.
-ovn-nbctl set Logical_Switch sw1 other_config:mcast_querier="false"
-ovn-nbctl set Logical_Switch sw1 other_config:mcast_snoop="true"
+ovn-nbctl set Logical_Switch sw1       \
+    other_config:mcast_querier="false" \
+    other_config:mcast_snoop="true"
 
 # No IGMP query should be generated by sw1 (mcast_querier="false").
 truncate -s 0 expected
@@ -14921,9 +14974,12 @@  truncate -s 0 expected
 truncate -s 0 expected_empty
 send_ip_multicast_pkt hv1-vif2 hv1 \
     000000000001 01005e000144 \
-    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
-    e518e518000a3b3a0000 \
-    expected
+    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
+    e518e518000a3b3a0000
+store_ip_multicast_pkt \
+    000000000001 01005e000144 \
+    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
+    e518e518000a3b3a0000 expected
 
 OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected])
 OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
@@ -14944,17 +15000,19 @@  OVS_WAIT_UNTIL([
     test "${total_entries}" = "1"
 ])
 
-# Send traffic traffic and make sure it gets forwarded only on the port that
-# joined.
+# Send traffic and make sure it gets forwarded only on the port that joined.
 as hv1 reset_pcap_file hv1-vif1 hv1/vif1
 as hv2 reset_pcap_file hv2-vif1 hv2/vif1
 truncate -s 0 expected
 truncate -s 0 expected_empty
 send_ip_multicast_pkt hv1-vif2 hv1 \
     000000000001 01005e000144 \
-    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e ca70 11 \
-    e518e518000a3b3a0000 \
-    expected
+    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
+    e518e518000a3b3a0000
+store_ip_multicast_pkt \
+    000000000001 01005e000144 \
+    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
+    e518e518000a3b3a0000 expected
 
 OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
 OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected])
@@ -14988,6 +15046,111 @@  sleep 1
 OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected])
 OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected])
 
+# Disable IGMP querier on sw2.
+ovn-nbctl set Logical_Switch sw2 \
+    other_config:mcast_querier="false"
+
+# Enable IGMP snooping on sw3.
+ovn-nbctl set Logical_Switch sw3       \
+    other_config:mcast_querier="false" \
+    other_config:mcast_snoop="true"
+
+# Send traffic from sw3 and make sure rtr doesn't relay it.
+truncate -s 0 expected_empty
+
+as hv1 reset_pcap_file hv1-vif1 hv1/vif1
+as hv1 reset_pcap_file hv1-vif2 hv1/vif2
+as hv1 reset_pcap_file hv1-vif3 hv1/vif3
+as hv1 reset_pcap_file hv1-vif4 hv1/vif4
+as hv2 reset_pcap_file hv2-vif1 hv2/vif1
+as hv2 reset_pcap_file hv2-vif2 hv2/vif2
+as hv2 reset_pcap_file hv2-vif3 hv2/vif3
+as hv2 reset_pcap_file hv2-vif4 hv2/vif4
+
+send_ip_multicast_pkt hv2-vif4 hv2 \
+    000000000001 01005e000144 \
+    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
+    e518e518000a3b3a0000
+
+# Sleep a bit to make sure no traffic is received and then check.
+sleep 1
+OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_empty])
+OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_empty])
+OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_empty])
+OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
+OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
+OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
+OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
+OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
+
+# Enable IGMP relay on rtr
+ovn-nbctl set logical_router rtr \
+    options:mcast_relay="true"
+
+# Inject IGMP Join for 239.0.1.68 on sw1-p11.
+send_igmp_v3_report hv1-vif1 hv1 \
+    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
+    $(ip_to_hex 239 0 1 68) 04 e9b9 \
+    /dev/null
+# Inject IGMP Join for 239.0.1.68 on sw2-p2.
+send_igmp_v3_report hv2-vif3 hv2 \
+    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
+    $(ip_to_hex 239 0 1 68) 04 e9b9 \
+    /dev/null
+# Inject IGMP Join for 239.0.1.68 on sw3-p1.
+send_igmp_v3_report hv1-vif4 hv1 \
+    000000000001 $(ip_to_hex 10 0 0 1) f9f8 \
+    $(ip_to_hex 239 0 1 68) 04 e9b9 \
+    /dev/null
+
+# Check that the IGMP Group is learned by all switches.
+OVS_WAIT_UNTIL([
+    total_entries=`ovn-sbctl find IGMP_Group | grep "239.0.1.68" | wc -l`
+    test "${total_entries}" = "3"
+])
+
+# Send traffic from sw3 and make sure it is relayed by rtr to the
+# switches and ports that joined.
+truncate -s 0 expected_routed_sw1
+truncate -s 0 expected_routed_sw2
+truncate -s 0 expected_switched
+truncate -s 0 expected_empty
+
+as hv1 reset_pcap_file hv1-vif1 hv1/vif1
+as hv1 reset_pcap_file hv1-vif2 hv1/vif2
+as hv1 reset_pcap_file hv1-vif3 hv1/vif3
+as hv1 reset_pcap_file hv1-vif4 hv1/vif4
+as hv2 reset_pcap_file hv2-vif1 hv2/vif1
+as hv2 reset_pcap_file hv2-vif2 hv2/vif2
+as hv2 reset_pcap_file hv2-vif3 hv2/vif3
+as hv2 reset_pcap_file hv2-vif4 hv2/vif4
+
+send_ip_multicast_pkt hv2-vif4 hv2 \
+    000000000001 01005e000144 \
+    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
+    e518e518000a3b3a0000
+store_ip_multicast_pkt \
+    000000000100 01005e000144 \
+    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
+    e518e518000a3b3a0000 expected_routed_sw1
+store_ip_multicast_pkt \
+    000000000200 01005e000144 \
+    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 1f cb70 11 \
+    e518e518000a3b3a0000 expected_routed_sw2
+store_ip_multicast_pkt \
+    000000000001 01005e000144 \
+    $(ip_to_hex 10 0 0 42) $(ip_to_hex 239 0 1 68) 1e 20 ca70 11 \
+    e518e518000a3b3a0000 expected_switched
+
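+# Note on the expected_routed_* packets above: the router decrements the
+# TTL (0x20 -> 0x1f) and rewrites the Ethernet source to the router port
+# MAC.  Since the TTL is the high-order byte of the 16-bit TTL/protocol
+# word, the ones-complement IP header checksum field grows by 0x0100
+# (0xca70 -> 0xcb70), per RFC 1624 incremental-update arithmetic.
+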
+OVN_CHECK_PACKETS([hv1/vif1-tx.pcap], [expected_routed_sw1])
+OVN_CHECK_PACKETS([hv2/vif3-tx.pcap], [expected_routed_sw2])
+OVN_CHECK_PACKETS([hv1/vif4-tx.pcap], [expected_switched])
+OVN_CHECK_PACKETS([hv1/vif2-tx.pcap], [expected_empty])
+OVN_CHECK_PACKETS([hv1/vif3-tx.pcap], [expected_empty])
+OVN_CHECK_PACKETS([hv2/vif1-tx.pcap], [expected_empty])
+OVN_CHECK_PACKETS([hv2/vif2-tx.pcap], [expected_empty])
+OVN_CHECK_PACKETS([hv2/vif4-tx.pcap], [expected_empty])
+
 OVN_CLEANUP([hv1], [hv2])
 AT_CLEANUP