[ovs-dev] controller: split mg action in table=39 and 40 to fit kernel netlink buffer size

Message ID: 8f213b51c95c7d13421c3f557d45312b83cd9906.1695291228.git.lorenzo.bianconi@redhat.com
State: Changes Requested
Series: [ovs-dev] controller: split mg action in table=39 and 40 to fit kernel netlink buffer size

Checks

Context                                Check    Description
ovsrobot/apply-robot                   success  apply and check: success
ovsrobot/github-robot-_Build_and_Test  success  github build: passed
ovsrobot/github-robot-_ovn-kubernetes  fail     github build: failed

Commit Message

Lorenzo Bianconi Sept. 21, 2023, 10:15 a.m. UTC
Introduce the capability to split the multicast group OpenFlow actions
created in the consider_mc_group routine into multiple buffers whenever
a single buffer would exceed the netlink buffer size limit.

Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2232152
Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
---
 controller/physical.c   | 272 +++++++++++++++++++++++++++++-----------
 controller/pinctrl.c    |  66 ++++++++++
 include/ovn/actions.h   |   3 +
 tests/ovn-controller.at |  45 +++++++
 4 files changed, 313 insertions(+), 73 deletions(-)

Comments

Mark Michelson Sept. 26, 2023, 5:26 p.m. UTC | #1
Hi Lorenzo,

Overall, I think this has the beginnings of a good solution.

There are a few high-level problems.

1) I think MC_OFPACTS_MAX_MSG_SIZE is too low. The maximum size for an 
OpenFlow message is 65535, and the netlink MAX_ACTIONS_BUFSIZE is 32768. 
Even if we wanted to be conservative, I think MC_OFPACTS_MAX_MSG_SIZE 
could be 8x or even 16x larger than it is in this patch.
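
For instance -- just a sketch using the numbers above, the exact constant
is of course up for discussion -- something like this would still leave
plenty of headroom below the smaller of the two limits:

    /* Max OpenFlow message size is 65535; ovs-vswitchd's netlink
     * MAX_ACTIONS_BUFSIZE is 32768.  16x the current value still fits
     * comfortably under both. */
    #define MC_OFPACTS_MAX_MSG_SIZE     (16 * 1024)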

2) I don't think a controller action is necessary. The controller action 
sets some registers and then resubmits to table 40 or 41. I think this 
could be done directly in OpenFlow instead.

3) There is a lot of near-identical code in consider_mc_group() in 
physical.c. It would be good if the common bits could be extracted to a 
function.
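
Just to illustrate what I mean -- the name and parameter list below are
made up, and I haven't tried building this -- the repeated "flush a
chunk" logic could become a single helper along these lines:

    static void
    mc_ofpacts_flush(struct ovn_desired_flow_table *flow_table,
                     const struct sbrec_multicast_group *mc,
                     struct match *match, struct ofpbuf *ofpacts,
                     const struct ofpbuf *reset_ofpacts, uint8_t table_id,
                     uint16_t prio_first, uint16_t prio_rest,
                     uint32_t *flow_index, bool is_last)
    {
        bool is_first = *flow_index == MC_BUF_START_ID;
        if (!is_first) {
            match_set_reg(match, MFF_REG6 - MFF_REG0, *flow_index);
        }
        if (!is_last) {
            /* Chain to the next chunk through the controller. */
            mc_split_buf_controller_action(ofpacts, ++(*flow_index),
                                           table_id, mc->tunnel_key);
        }
        /* Reset MFF_REG6 before the chunk's own actions run. */
        ofpbuf_insert(ofpacts, 0, reset_ofpacts->data, reset_ofpacts->size);
        ofctrl_add_flow(flow_table, table_id,
                        is_first ? prio_first : prio_rest,
                        mc->header_.uuid.parts[0], match, ofpacts,
                        &mc->header_.uuid);
        ofpbuf_clear(ofpacts);
    }

Each of the three loops would then only have to build its match, append
its terminal actions when it reaches the last port, and call the helper.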

4) This could use further test cases. I'm not sure how viable it is, but 
if we could add a system test that ensures packets are delivered to all 
ports in the multicast group, then that would be good. In addition, we 
should ensure that when the configuration changes (e.g. the number of
ports increases or decreases past the threshold), the installed flows
are what we expect and traffic is still delivered to all the ports.

5) It wouldn't hurt to add a NEWS entry that states that arbitrarily 
large multicast groups can now be used.

I have a few more notes below.

On 9/21/23 06:15, Lorenzo Bianconi wrote:
> Introduce the capability to split the multicast group OpenFlow actions
> created in the consider_mc_group routine into multiple buffers whenever
> a single buffer would exceed the netlink buffer size limit.
> 
> Reported-at: https://bugzilla.redhat.com/show_bug.cgi?id=2232152
> Signed-off-by: Lorenzo Bianconi <lorenzo.bianconi@redhat.com>
> ---
>   controller/physical.c   | 272 +++++++++++++++++++++++++++++-----------
>   controller/pinctrl.c    |  66 ++++++++++
>   include/ovn/actions.h   |   3 +
>   tests/ovn-controller.at |  45 +++++++
>   4 files changed, 313 insertions(+), 73 deletions(-)
> 
> diff --git a/controller/physical.c b/controller/physical.c
> index 75257bc85..663fd1c8d 100644
> --- a/controller/physical.c
> +++ b/controller/physical.c
> @@ -1969,6 +1969,22 @@ local_output_pb(int64_t tunnel_key, struct ofpbuf *ofpacts)
>       put_resubmit(OFTABLE_CHECK_LOOPBACK, ofpacts);
>   }
>   
> +static void
> +mc_split_buf_controller_action(struct ofpbuf *ofpacts, uint32_t index,
> +                               uint8_t table_id, uint32_t port_key)
> +{
> +    size_t oc_offset = encode_start_controller_op(
> +            ACTION_OPCODE_MG_SPLIT_BUF, false, NX_CTLR_NO_METER, ofpacts);
> +    ovs_be32 val = htonl(index);
> +    ofpbuf_put(ofpacts, &val, sizeof val);
> +    val = htonl(port_key);
> +    ofpbuf_put(ofpacts, &val, sizeof val);
> +    ofpbuf_put(ofpacts, &table_id, sizeof table_id);
> +    encode_finish_controller_op(oc_offset, ofpacts);
> +}
> +
> +#define MC_OFPACTS_MAX_MSG_SIZE     1024
> +#define MC_BUF_START_ID             0x9000
>   static void
>   consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>                     enum mf_field_id mff_ovn_geneve,
> @@ -1990,9 +2006,6 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>       struct sset remote_chassis = SSET_INITIALIZER(&remote_chassis);
>       struct sset vtep_chassis = SSET_INITIALIZER(&vtep_chassis);
>   
> -    struct match match;
> -    match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key);
> -
>       /* Go through all of the ports in the multicast group:
>        *
>        *    - For remote ports, add the chassis to 'remote_chassis' or
> @@ -2013,10 +2026,18 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>        *      set the output port to be the router patch port for which
>        *      the redirect port was added.
>        */
> -    struct ofpbuf ofpacts, remote_ofpacts, remote_ofpacts_ramp;
> +    struct ofpbuf ofpacts, remote_ofpacts, remote_ofpacts_ramp, reset_ofpacts;
>       ofpbuf_init(&ofpacts, 0);
>       ofpbuf_init(&remote_ofpacts, 0);
>       ofpbuf_init(&remote_ofpacts_ramp, 0);
> +    ofpbuf_init(&reset_ofpacts, 0);
> +
> +    bool local_ports = false, remote_ports = false, remote_ramp_ports = false;
> +
> +    put_load(0, MFF_REG6, 0, 32, &reset_ofpacts);
> +
> +    /* local port loop. */
> +    uint32_t local_flow_index = MC_BUF_START_ID;
>       for (size_t i = 0; i < mc->n_ports; i++) {
>           struct sbrec_port_binding *port = mc->ports[i];
>   
> @@ -2040,19 +2061,15 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>               if (ldp->is_transit_switch) {
>                   local_output_pb(port->tunnel_key, &ofpacts);
>               } else {
> -                local_output_pb(port->tunnel_key, &remote_ofpacts);
> -                local_output_pb(port->tunnel_key, &remote_ofpacts_ramp);
> +                remote_ramp_ports = true;
> +                remote_ports = true;
>               }
>           } if (!strcmp(port->type, "remote")) {
>               if (port->chassis) {
> -                put_load(port->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
> -                         &remote_ofpacts);
> -                tunnel_to_chassis(mff_ovn_geneve, port->chassis->name,
> -                                  chassis_tunnels, mc->datapath,
> -                                  port->tunnel_key, &remote_ofpacts);
> +                remote_ports = true;
>               }
>           } else if (!strcmp(port->type, "localport")) {
> -            local_output_pb(port->tunnel_key, &remote_ofpacts);
> +            remote_ports = true;
>           } else if ((port->chassis == chassis
>                       || is_additional_chassis(port, chassis))
>                      && (local_binding_get_primary_pb(local_bindings, lport_name)
> @@ -2095,87 +2112,196 @@ consider_mc_group(struct ovsdb_idl_index *sbrec_port_binding_by_name,
>                   }
>               }
>           }
> -    }
>   
> -    /* Table 40, priority 100.
> -     * =======================
> -     *
> -     * Handle output to the local logical ports in the multicast group, if
> -     * any. */
> -    bool local_ports = ofpacts.size > 0;
> -    if (local_ports) {
> -        /* Following delivery to local logical ports, restore the multicast
> -         * group as the logical output port. */
> -        put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts);
> +        local_ports |= !!ofpacts.size;
> +        if (!local_ports) {
> +            continue;
> +        }
>   
> -        ofctrl_add_flow(flow_table, OFTABLE_LOCAL_OUTPUT, 100,
> -                        mc->header_.uuid.parts[0],
> -                        &match, &ofpacts, &mc->header_.uuid);
> +        /* do not overcome max netlink message size used by ovs-vswitchd to
> +         * send netlink configuration to the kernel. */
> +        if (ofpacts.size > MC_OFPACTS_MAX_MSG_SIZE || i == mc->n_ports - 1) {
> +            struct match match;
> +
> +            match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key);
> +
> +            bool is_first = local_flow_index == MC_BUF_START_ID;
> +            if (!is_first) {
> +                match_set_reg(&match, MFF_REG6 - MFF_REG0, local_flow_index);
> +            }
> +
> +            if (i == mc->n_ports - 1) {
> +                /* Following delivery to local logical ports, restore the
> +                 * multicast group as the logical output port. */
> +                put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts);
> +            } else {
> +                mc_split_buf_controller_action(&ofpacts, ++local_flow_index,
> +                                               OFTABLE_LOCAL_OUTPUT,
> +                                               mc->tunnel_key);
> +            }

What happens if the final port in the multicast group is the one that 
makes ofpacts.size greater than MC_OFPACTS_MAX_MSG_SIZE? I don't think 
we would split up the multicast flows in this case, because the 
i == mc->n_ports - 1 branch would trigger instead of the else. In this 
case, we likely want both conditions to trigger.
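
Something along these lines would let both fire (untested sketch, reusing
only helpers already in the patch; the elided parts are the same
reset/install/clear sequence the patch already has):

    if (ofpacts.size > MC_OFPACTS_MAX_MSG_SIZE) {
        /* Buffer full: chain to the next chunk via the controller,
         * install this chunk, and start a fresh buffer. */
        mc_split_buf_controller_action(&ofpacts, ++local_flow_index,
                                       OFTABLE_LOCAL_OUTPUT,
                                       mc->tunnel_key);
        /* ... ofpbuf_insert(reset_ofpacts), ofctrl_add_flow(),
         *     ofpbuf_clear() as in the patch ... */
    }
    if (i == mc->n_ports - 1) {
        /* Last port: terminate the chain by restoring the multicast
         * group as the logical output port, then install the tail
         * flow that the continuation above will jump to. */
        put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, &ofpacts);
        /* ... same reset/install/clear sequence ... */
    }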

> +
> +            /* reset MFF_REG6. */
> +            ofpbuf_insert(&ofpacts, 0, reset_ofpacts.data, reset_ofpacts.size);
> +
> +            /* Table 40, priority 100.
> +             * ==========================
> +             *
> +             * Handle output to the local logical ports in the multicast group,
> +             * if any. */
> +            ofctrl_add_flow(flow_table, OFTABLE_LOCAL_OUTPUT,
> +                            is_first ? 100 : 110,
> +                            mc->header_.uuid.parts[0], &match, &ofpacts,
> +                            &mc->header_.uuid);
> +            ofpbuf_clear(&ofpacts);
> +        }
>       }
>   
> -    /* Table 39, priority 100.
> -     * =======================
> -     *
> -     * Handle output to the remote chassis in the multicast group, if
> -     * any. */
> -    if (!sset_is_empty(&remote_chassis) ||
> -            !sset_is_empty(&vtep_chassis) || remote_ofpacts.size > 0) {
> -        if (remote_ofpacts.size > 0) {
> -            /* Following delivery to logical patch ports, restore the
> -             * multicast group as the logical output port. */
> -            put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
> -                     &remote_ofpacts);
> -
> -            if (get_vtep_port(local_datapaths, mc->datapath->tunnel_key)) {
> -                struct match match_ramp;
> -                match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, 0,
> -                                     MLF_RCV_FROM_RAMP);
> +    /* remote port loop. */
> +    struct ofpbuf remote_ofpacts_last;
> +    ofpbuf_init(&remote_ofpacts_last, 0);
> +    if (remote_ports) {
> +        put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32, &remote_ofpacts_last);
> +    }
> +    fanout_to_chassis(mff_ovn_geneve, &remote_chassis, chassis_tunnels,
> +                      mc->datapath, mc->tunnel_key, false,
> +                      &remote_ofpacts_last);
> +    fanout_to_chassis(mff_ovn_geneve, &vtep_chassis, chassis_tunnels,
> +                      mc->datapath, mc->tunnel_key, true,
> +                      &remote_ofpacts_last);
> +    remote_ports |= !!remote_ofpacts_last.size;
> +    if (remote_ports && local_ports) {
> +        put_resubmit(OFTABLE_LOCAL_OUTPUT, &remote_ofpacts_last);
> +    }
>   
> -                put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
> -                         &remote_ofpacts_ramp);
> +    bool has_vetp = get_vtep_port(local_datapaths, mc->datapath->tunnel_key);

s/has_vetp/has_vtep/

> +    uint32_t reverse_remap_flow_index = MC_BUF_START_ID;

I don't understand the name of this variable. What does "remap" mean 
here? Was it supposed to be "ramp"?

> +    uint32_t reverse_flow_index = MC_BUF_START_ID;
>   
> -                /* MCAST traffic which was originally received from RAMP_SWITCH
> -                 * is not allowed to be re-sent to remote_chassis.
> -                 * Process "patch" port binding for routing in
> -                 * OFTABLE_REMOTE_OUTPUT and resubmit packets to
> -                 * OFTABLE_LOCAL_OUTPUT for local delivery. */
> +    for (size_t i = 0; i < mc->n_ports; i++) {
> +        struct sbrec_port_binding *port = mc->ports[i];
>   
> -                match_outport_dp_and_port_keys(&match_ramp, dp_key,
> -                                               mc->tunnel_key);
> +        if (port->datapath != mc->datapath) {
> +            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 1);
> +            VLOG_WARN_RL(&rl, UUID_FMT": multicast group contains ports "
> +                         "in wrong datapath",
> +                         UUID_ARGS(&mc->header_.uuid));
> +            continue;
> +        }
> +
> +        if (!strcmp(port->type, "patch")) {
> +            if (!ldp->is_transit_switch) {
> +                local_output_pb(port->tunnel_key, &remote_ofpacts);
> +                local_output_pb(port->tunnel_key, &remote_ofpacts_ramp);
> +            }
> +        } if (!strcmp(port->type, "remote")) {
> +            if (port->chassis) {
> +                put_load(port->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
> +                         &remote_ofpacts);
> +                tunnel_to_chassis(mff_ovn_geneve, port->chassis->name,
> +                                  chassis_tunnels, mc->datapath,
> +                                  port->tunnel_key, &remote_ofpacts);
> +            }
> +        } else if (!strcmp(port->type, "localport")) {
> +            local_output_pb(port->tunnel_key, &remote_ofpacts);
> +        }
>   
> -                /* Match packets coming from RAMP_SWITCH and allowed for
> -                * loopback processing (including routing). */
> -                match_set_reg_masked(&match_ramp, MFF_LOG_FLAGS - MFF_REG0,
> -                                     MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK,
> -                                     MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK);
> +        /* do not overcome max netlink message size used by ovs-vswitchd to
> +         * send netlink configuration to the kernel. */
> +        if (remote_ofpacts.size > MC_OFPACTS_MAX_MSG_SIZE ||
> +            i == mc->n_ports - 1) {
> +            struct match match;
> +            match_outport_dp_and_port_keys(&match, dp_key, mc->tunnel_key);
> +            if (has_vetp) {
> +                match_set_reg_masked(&match, MFF_LOG_FLAGS - MFF_REG0, 0,
> +                                     MLF_RCV_FROM_RAMP);
> +            }
>   
> -                put_resubmit(OFTABLE_LOCAL_OUTPUT, &remote_ofpacts_ramp);
> +            bool is_first = reverse_flow_index == MC_BUF_START_ID;
> +            if (!is_first) {
> +                match_set_reg(&match, MFF_REG6 - MFF_REG0, reverse_flow_index);
> +            }
> +
> +            if (i == mc->n_ports - 1) {
> +                ofpbuf_put(&remote_ofpacts, remote_ofpacts_last.data,
> +                           remote_ofpacts_last.size);
> +            } else {
> +                mc_split_buf_controller_action(&remote_ofpacts,
> +                                               ++reverse_flow_index,
> +                                               OFTABLE_REMOTE_OUTPUT,
> +                                               mc->tunnel_key);
> +            }
>   
> -                ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 120,
> -                                mc->header_.uuid.parts[0], &match_ramp,
> -                                &remote_ofpacts_ramp, &mc->header_.uuid);
> +            /* reset MFF_REG6. */
> +            ofpbuf_insert(&remote_ofpacts, 0, reset_ofpacts.data,
> +                          reset_ofpacts.size);
> +            if (remote_ports) {
> +                /* Table 39, priority 100.
> +                 * =======================
> +                 *
> +                 * Handle output to the remote chassis in the multicast group,
> +                 * if any. */
> +                ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT,
> +                                is_first ? 100 : 110,
> +                                mc->header_.uuid.parts[0],
> +                                &match, &remote_ofpacts, &mc->header_.uuid);
>               }
> +            ofpbuf_clear(&remote_ofpacts);
>           }
>   
> -        fanout_to_chassis(mff_ovn_geneve, &remote_chassis, chassis_tunnels,
> -                          mc->datapath, mc->tunnel_key, false,
> -                          &remote_ofpacts);
> -        fanout_to_chassis(mff_ovn_geneve, &vtep_chassis, chassis_tunnels,
> -                          mc->datapath, mc->tunnel_key, true,
> -                          &remote_ofpacts);
> +        if (!remote_ports || !remote_ramp_ports || !has_vetp) {
> +            continue;
> +        }
>   
> -        if (remote_ofpacts.size) {
> -            if (local_ports) {
> -                put_resubmit(OFTABLE_LOCAL_OUTPUT, &remote_ofpacts);
> +        if (remote_ofpacts_ramp.size > MC_OFPACTS_MAX_MSG_SIZE ||
> +            i == mc->n_ports - 1) {
> +            /* MCAST traffic which was originally received from RAMP_SWITCH
> +             * is not allowed to be re-sent to remote_chassis.
> +             * Process "patch" port binding for routing in
> +             * OFTABLE_REMOTE_OUTPUT and resubmit packets to
> +             * OFTABLE_LOCAL_OUTPUT for local delivery. */
> +            struct match match_ramp;
> +            match_outport_dp_and_port_keys(&match_ramp, dp_key,
> +                                           mc->tunnel_key);
> +
> +            /* Match packets coming from RAMP_SWITCH and allowed for
> +            * loopback processing (including routing). */
> +            match_set_reg_masked(&match_ramp, MFF_LOG_FLAGS - MFF_REG0,
> +                                 MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK,
> +                                 MLF_RCV_FROM_RAMP | MLF_ALLOW_LOOPBACK);
> +
> +            bool is_first = reverse_remap_flow_index == MC_BUF_START_ID;
> +            if (!is_first) {
> +                match_set_reg(&match_ramp, MFF_REG6 - MFF_REG0,
> +                              reverse_remap_flow_index);
>               }
> -            ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT, 100,
> -                            mc->header_.uuid.parts[0],
> -                            &match, &remote_ofpacts, &mc->header_.uuid);
> +
> +            if (i == mc->n_ports - 1) {
> +                put_load(mc->tunnel_key, MFF_LOG_OUTPORT, 0, 32,
> +                         &remote_ofpacts_ramp);
> +                put_resubmit(OFTABLE_LOCAL_OUTPUT, &remote_ofpacts_ramp);
> +            } else {
> +                mc_split_buf_controller_action(&remote_ofpacts_ramp,
> +                                               ++reverse_remap_flow_index,
> +                                               OFTABLE_REMOTE_OUTPUT,
> +                                               mc->tunnel_key);
> +            }
> +
> +            /* reset MFF_REG6. */
> +            ofpbuf_insert(&remote_ofpacts_ramp, 0, reset_ofpacts.data,
> +                          reset_ofpacts.size);
> +            ofctrl_add_flow(flow_table, OFTABLE_REMOTE_OUTPUT,
> +                            is_first ? 120 : 130,
> +                            mc->header_.uuid.parts[0], &match_ramp,
> +                            &remote_ofpacts_ramp, &mc->header_.uuid);
> +            ofpbuf_clear(&remote_ofpacts_ramp);
>           }
>       }
> +
>       ofpbuf_uninit(&ofpacts);
>       ofpbuf_uninit(&remote_ofpacts);
> +    ofpbuf_uninit(&remote_ofpacts_last);
>       ofpbuf_uninit(&remote_ofpacts_ramp);
> +    ofpbuf_uninit(&reset_ofpacts);
>       sset_destroy(&remote_chassis);
>       sset_destroy(&vtep_chassis);
>   }
> diff --git a/controller/pinctrl.c b/controller/pinctrl.c
> index ff5a3444c..996f78d7b 100644
> --- a/controller/pinctrl.c
> +++ b/controller/pinctrl.c
> @@ -211,6 +211,10 @@ static void send_mac_binding_buffered_pkts(struct rconn *swconn)
>   
>   static void pinctrl_rarp_activation_strategy_handler(const struct match *md);
>   
> +static void pinctrl_mg_split_buff_handler(
> +        struct rconn *swconn, struct dp_packet *pkt,
> +        const struct match *md, struct ofpbuf *userdata);
> +
>   static void init_activated_ports(void);
>   static void destroy_activated_ports(void);
>   static void wait_activated_ports(void);
> @@ -3283,6 +3287,11 @@ process_packet_in(struct rconn *swconn, const struct ofp_header *msg)
>           ovs_mutex_unlock(&pinctrl_mutex);
>           break;
>   
> +    case ACTION_OPCODE_MG_SPLIT_BUF:
> +        pinctrl_mg_split_buff_handler(swconn, &packet, &pin.flow_metadata,
> +                                      &userdata);
> +        break;
> +
>       default:
>           VLOG_WARN_RL(&rl, "unrecognized packet-in opcode %"PRIu32,
>                        ntohl(ah->opcode));
> @@ -8148,6 +8157,63 @@ pinctrl_rarp_activation_strategy_handler(const struct match *md)
>       notify_pinctrl_main();
>   }
>   
> +static void
> +pinctrl_mg_split_buff_handler(struct rconn *swconn, struct dp_packet *pkt,
> +                              const struct match *md, struct ofpbuf *userdata)
> +{
> +    ovs_be32 *index = ofpbuf_try_pull(userdata, sizeof *index);
> +    if (!index) {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +        VLOG_WARN_RL(&rl, "%s: missing index field", __func__);
> +        return;
> +    }
> +
> +    ovs_be32 *mg = ofpbuf_try_pull(userdata, sizeof *mg);
> +    if (!mg) {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +        VLOG_WARN_RL(&rl, "%s: missing multicast group field", __func__);
> +        return;
> +    }
> +
> +    uint8_t *table_id = ofpbuf_try_pull(userdata, sizeof *table_id);
> +    if (!table_id) {
> +        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
> +        VLOG_WARN_RL(&rl, "%s: missing table_id field", __func__);
> +        return;
> +    }
> +
> +    struct ofpbuf ofpacts;
> +    ofpbuf_init(&ofpacts, 4096);
> +    reload_metadata(&ofpacts, md);
> +
> +    /* reload pkt_mark field */
> +    const struct mf_field *pkt_mark_field = mf_from_id(MFF_PKT_MARK);
> +    union mf_value pkt_mark_value;
> +    mf_get_value(pkt_mark_field, &md->flow, &pkt_mark_value);
> +    ofpact_put_set_field(&ofpacts, pkt_mark_field, &pkt_mark_value, NULL);
> +
> +    put_load(ntohl(*index), MFF_REG6, 0, 32, &ofpacts);
> +    put_load(ntohl(*mg), MFF_LOG_OUTPORT, 0, 32, &ofpacts);
> +
> +    struct ofpact_resubmit *resubmit = ofpact_put_RESUBMIT(&ofpacts);
> +    resubmit->in_port = OFPP_CONTROLLER;
> +    resubmit->table_id = *table_id;
> +
> +    struct ofputil_packet_out po = {
> +        .packet = dp_packet_data(pkt),
> +        .packet_len = dp_packet_size(pkt),
> +        .buffer_id = UINT32_MAX,
> +        .ofpacts = ofpacts.data,
> +        .ofpacts_len = ofpacts.size,
> +    };
> +    match_set_in_port(&po.flow_metadata, OFPP_CONTROLLER);
> +    enum ofp_version version = rconn_get_version(swconn);
> +    enum ofputil_protocol proto = ofputil_protocol_from_ofp_version(version);
> +    queue_msg(swconn, ofputil_encode_packet_out(&po, proto));
> +
> +    ofpbuf_uninit(&ofpacts);
> +}
> +
>   static struct hmap put_fdbs;
>   
>   /* MAC learning (fdb) related functions.  Runs within the main
> diff --git a/include/ovn/actions.h b/include/ovn/actions.h
> index 04bb6ffd0..49cfe0624 100644
> --- a/include/ovn/actions.h
> +++ b/include/ovn/actions.h
> @@ -747,6 +747,9 @@ enum action_opcode {
>   
>       /* activation_strategy_rarp() */
>       ACTION_OPCODE_ACTIVATION_STRATEGY_RARP,
> +
> +    /* multicast group split buffer action. */
> +    ACTION_OPCODE_MG_SPLIT_BUF,
>   };
>   
>   /* Header. */
> diff --git a/tests/ovn-controller.at b/tests/ovn-controller.at
> index 4212d601a..7d1645e53 100644
> --- a/tests/ovn-controller.at
> +++ b/tests/ovn-controller.at
> @@ -2651,3 +2651,48 @@ OVS_WAIT_FOR_OUTPUT([as hv1 ovs-ofctl dump-flows br-int table=0 | grep -c in_por
>   OVN_CLEANUP([hv1])
>   AT_CLEANUP
>   ])
> +
> +OVN_FOR_EACH_NORTHD([
> +AT_SETUP([ovn-controller - multicast group buffer split])
> +AT_KEYWORDS([ovn])
> +ovn_start
> +
> +net_add n1
> +
> +sim_add hv1
> +as hv1
> +ovs-vsctl add-br br-phys
> +ovn_attach n1 br-phys 192.168.0.1
> +
> +for i in $(seq 1 20); do
> +    ovs-vsctl -- add-port br-int hv1-vif$i -- \
> +        set interface hv1-vif$i external-ids:iface-id=sw0-p$i \
> +        options:tx_pcap=hv1/vif${i}-tx.pcap \
> +        options:rxq_pcap=hv1/vif${i}-rx.pcap \
> +        ofport-request=$i
> +done
> +
> +
> +check ovn-nbctl ls-add sw0
> +for i in $(seq 1 20); do
> +    check ovn-nbctl lsp-add sw0 sw0-p$i
> +    check ovn-nbctl lsp-set-addresses sw0-p$i "unknown"
> +done
> +
> +wait_for_ports_up
> +ovn-nbctl --wait=hv sync
> +
> +OVS_WAIT_FOR_OUTPUT([as hv1 ovs-ofctl dump-flows br-int table=40 | grep -c "controller(userdata=00.00.00.1b.00.00.00.00.00.00.90.01"], [0],[dnl
> +3
> +])
> +
> +for i in $(seq 1 10); do
> +    check ovn-nbctl lsp-del sw0-p$i
> +done
> +ovn-nbctl --wait=hv sync
> +
> +AT_CHECK([as hv1 ovs-ofctl dump-flows br-int table=40 | grep -q controller], [1])
> +
> +OVN_CLEANUP([hv1])
> +AT_CLEANUP
> +])
Dumitru Ceara Sept. 27, 2023, 11:15 a.m. UTC | #2
On 9/26/23 19:26, Mark Michelson wrote:
> 
> 2) I don't think a controller action is necessary. The controller action
> sets some registers and then resubmits to table 40 or 41. I think this
> could be done directly in OpenFlow instead.

Hi Mark,

I haven't reviewed the patch at all yet, so I'll just comment on this
specific point:

The controller action is actually necessary in order to "break" the
openflow rule chain that gets executed for packets that need to be
flooded on these large multicast groups.

Otherwise ovs-vswitchd will "combine" those OpenFlow rules into a single
(or a handful of) datapath flow(s) with a huge action set that
sequentially outputs to all ports in the multicast group.

Regards,
Dumitru
Ihar Hrachyshka Sept. 27, 2023, 12:49 p.m. UTC | #3
On Wed, Sep 27, 2023 at 7:16 AM Dumitru Ceara <dceara@redhat.com> wrote:

> On 9/26/23 19:26, Mark Michelson wrote:
> >
> > 2) I don't think a controller action is necessary. The controller action
> > sets some registers and then resubmits to table 40 or 41. I think this
> > could be done directly in OpenFlow instead.
>
> Hi Mark,
>
> I didn't review the patch at all yet so I'll just comment on this
> specific point:
>
> The controller action is actually necessary in order to "break" the
> openflow rule chain that gets executed for packets that need to be
> flooded on these large multicast groups.
>
> Otherwise ovs-vswitchd will "combine" those OpenFlow rules into a single
> (or a handful of) datapath flow(s) with a huge action set that
> sequentially outputs to all ports in the multicast group.
>

+1. But this should probably be clear from the code itself, so additional
comments explaining this are due.
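
For example, something like this next to mc_split_buf_controller_action()
(wording is of course up to Lorenzo):

    /* The detour through the controller is deliberate: it breaks the
     * OpenFlow resubmit chain so that ovs-vswitchd cannot collapse the
     * whole multicast fan-out into a single datapath flow whose action
     * list would overflow the kernel's netlink size limit.  Each chunk
     * re-enters 'table_id' as a fresh packet-out, with MFF_REG6
     * carrying the index of the next chunk to execute. */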


>
> Regards,
> Dumitru
Mark Michelson Sept. 27, 2023, 3:33 p.m. UTC | #4
On 9/27/23 07:15, Dumitru Ceara wrote:
> On 9/26/23 19:26, Mark Michelson wrote:
>>
>> 2) I don't think a controller action is necessary. The controller action
>> sets some registers and then resubmits to table 40 or 41. I think this
>> could be done directly in OpenFlow instead.
> 
> Hi Mark,
> 
> I didn't review the patch at all yet so I'll just comment on this
> specific point:
> 
> The controller action is actually necessary in order to "break" the
> openflow rule chain that gets executed for packets that need to be
> flooded on these large multicast groups.
> 
> Otherwise ovs-vswitchd will "combine" those OpenFlow rules into a single
> (or a handful of) datapath flow(s) with a huge action set that
> sequentially outputs to all ports in the multicast group.

Is the problem that the megaflow's giant action set still overflows the 
netlink maximum action size?

> 
> Regards,
> Dumitru
>
Dumitru Ceara Sept. 27, 2023, 3:52 p.m. UTC | #5
On 9/27/23 17:33, Mark Michelson wrote:
> On 9/27/23 07:15, Dumitru Ceara wrote:
>> On 9/26/23 19:26, Mark Michelson wrote:
>>>
>>> 2) I don't think a controller action is necessary. The controller action
>>> sets some registers and then resubmits to table 40 or 41. I think this
>>> could be done directly in OpenFlow instead.
>>
>> Hi Mark,
>>
>> I didn't review the patch at all yet so I'll just comment on this
>> specific point:
>>
>> The controller action is actually necessary in order to "break" the
>> openflow rule chain that gets executed for packets that need to be
>> flooded on these large multicast groups.
>>
>> Otherwise ovs-vswitchd will "combine" those OpenFlow rules into a single
>> (or a handful of) datapath flow(s) with a huge action set that
>> sequentially outputs to all ports in the multicast group.
> 
> Is the problem that the megaflow's giant action set still overflows the
> netlink maximum action size?
> 

I think, in theory, the resulting megaflow is exactly the same as it
would be without this patch applied.