diff mbox series

[ovs-dev,v4,5/5] dpif-netdev: Support partial-action-offload of VXLAN encap flow

Message ID 20200629095020.8491-6-sriharsha.basavapatna@broadcom.com
State New
Headers show
Series netdev datapath: Partial action offload | expand

Commit Message

Sriharsha Basavapatna June 29, 2020, 9:50 a.m. UTC
In this patch, we support offloading of VXLAN_ENCAP action for a vhost-user
port (aka "partial-action-offload"). At the time of offloading the flow, if
the input port is not offload capable (such as a vhost-user port), we
determine whether the flow can be offloaded to an egress device. We then
offload the flow with a VXLAN_ENCAP RTE action, to the egress device. We do
not add the OUTPUT RTE action, which indicates to the PMD that it is a
partial action offload request. Note that since the action is being
offloaded in egress direction, classification is expected to be done by OVS
SW datapath and hence there's no need to offload a MARK action.

If offload succeeds, we save the information in 'dp_netdev_flow' so that
we skip execution of the corresponding action (previous patch) during SW
datapath processing.

Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
---
 lib/dpif-netdev.c         | 212 ++++++++++++++++++++++++++++++++++++--
 lib/netdev-offload-dpdk.c |  78 ++++++++++----
 lib/netdev-offload.h      |   2 +
 3 files changed, 262 insertions(+), 30 deletions(-)

Comments

Eli Britstein July 5, 2020, 12:59 p.m. UTC | #1
On 6/29/2020 12:50 PM, Sriharsha Basavapatna wrote:
> In this patch, we support offloading of VXLAN_ENCAP action for a vhost-user
> port (aka "partial-action-offload"). At the time of offloading the flow, we
> determine if the flow can be offloaded to an egress device, if the input
> port is not offload capable such as a vhost-user port. We then offload the
> flow with a VXLAN_ENCAP RTE action, to the egress device. We do not add
> the OUTPUT RTE action, which indicates to the PMD that is is a partial
> action offload request. Note that since the action is being offloaded in
> egress direction, classification is expected to be done by OVS SW datapath
> and hence there's no need to offload a MARK action.
>
> If offload succeeds, we save the information in 'dp_netdev_flow' so that
> we skip execution of the corresponding action (previous patch) during SW
> datapath processing.
>
> Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
> ---
>   lib/dpif-netdev.c         | 212 ++++++++++++++++++++++++++++++++++++--
>   lib/netdev-offload-dpdk.c |  78 ++++++++++----
>   lib/netdev-offload.h      |   2 +
>   3 files changed, 262 insertions(+), 30 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index e489e2d90..d289d265d 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -2488,10 +2488,174 @@ dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
>       ovs_mutex_unlock(&dp_flow_offload.mutex);
>   }
>   
> +static int
> +partial_offload_egress_flow_del(struct dp_flow_offload_item *offload)
> +{
> +    struct dp_netdev_pmd_thread *pmd = offload->pmd;
> +    struct dp_netdev_flow *flow = offload->flow;
> +    const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
> +    struct netdev *port;
> +    int ret;
> +
> +    port = netdev_ports_get(flow->egress_offload_port, dpif_type_str);
> +    if (!port) {
> +        return -1;
> +    }
> +
> +    /* Taking a global 'port_mutex' to fulfill thread safety
> +     * restrictions for the netdev-offload-dpdk module. */
> +    ovs_mutex_lock(&pmd->dp->port_mutex);
> +    ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
> +    ovs_mutex_unlock(&pmd->dp->port_mutex);
> +    netdev_close(port);
> +
> +    if (ret) {
> +        return ret;
> +    }
> +
> +    flow->egress_offload_port = NULL;
> +    flow->partial_actions_offloaded = false;
> +
> +    VLOG_DBG_RL("%s: flow: %p mega_ufid: "UUID_FMT" pmd_id: %d\n", __func__,
> +                flow, UUID_ARGS((struct uuid *)&flow->mega_ufid),
> +                offload->flow->pmd_id);
> +    return ret;
> +}
It's almost the same code as mark_to_flow_disassociate. Maybe just 
enhance it?
> +
>   static int
>   dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
>   {
> -    return mark_to_flow_disassociate(offload->pmd, offload->flow);
> +    if (unlikely(offload->flow->partial_actions_offloaded &&
> +        offload->flow->egress_offload_port != ODPP_NONE)) {
> +        return partial_offload_egress_flow_del(offload);
> +    } else {
> +        return mark_to_flow_disassociate(offload->pmd, offload->flow);
> +    }
> +}
> +
> +/* Structure to hold a nl_parsed OVS action */
> +struct action_attr {
> +    int type;                /* OVS action type */
> +    struct nlattr *action;   /* action attribute */
> +};
> +
> +/*
> + * Maxium number of actions to be parsed while selecting a flow for partial
> + * action offload. This number is currently based on the minimum number of
> + * attributes seen with the tunnel encap action (clone, tunnel_push, output).
> + * This number includes output action to a single egress device (uplink) and
> + * supports neither multiple clone() actions nor multiple output actions.
> + * This number could change if and when we support other actions or
> + * combinations of actions for partial offload.
> + */
> +#define MAX_ACTION_ATTRS    3 /* Max # action attributes supported */
> +
> +/*
> + * This function parses the list of OVS "actions" of length "actions_len",
> + * and returns them in an array of action "attrs", of size "max_attrs".
> + * The parsed number of actions is returned in "num_attrs". If the number
> + * of actions exceeds "max_attrs", parsing is stopped and E2BIG is returned.
> + * Otherwise, returns success (0).
> + */
> +static int
> +parse_nlattr_actions(struct nlattr *actions, size_t actions_len,
> +                     struct action_attr *attrs, int max_attrs, int *num_attrs)
> +{
> +    const struct nlattr *a;
> +    unsigned int left;
> +    int num_actions = 0;
> +    int n_attrs = 0;
> +    int rc = 0;
> +    int type;
> +
> +    *num_attrs = 0;
> +
> +    NL_ATTR_FOR_EACH (a, left, actions, actions_len) {
> +        type = nl_attr_type(a);
> +
> +        if (num_actions >= max_attrs) {
> +            *num_attrs = num_actions;
> +            return E2BIG;
> +        }
> +
> +        attrs[num_actions].type = type;
> +        attrs[num_actions].action = a;
> +        num_actions++;
> +        if (type == OVS_ACTION_ATTR_CLONE) {
> +            rc = parse_nlattr_actions(nl_attr_get(a), nl_attr_get_size(a),
> +                                      &attrs[num_actions],
> +                                      (max_attrs - num_actions), &n_attrs);
> +            num_actions += n_attrs;
> +            if (rc == E2BIG) {
> +                *num_attrs = num_actions;
> +                return rc;
> +            }
> +        }
> +    }
> +
> +    *num_attrs = num_actions;
> +    return 0;
> +}
> +
> +/* This function determines if the given flow should be partially offloaded
> + * on the egress device, when the in-port is not offload-capable like a
> + * vhost-user port. The function currently supports offloading of only
> + * tunnel encap action.
> + */
> +static bool
> +should_partial_offload_egress(struct netdev *in_netdev,
> +                              struct dp_flow_offload_item *offload,
> +                              struct netdev **egress_netdev)
> +{
> +    const char *dpif_type_str =
> +        dpif_normalize_type(offload->pmd->dp->class->type);
> +    struct action_attr attrs[MAX_ACTION_ATTRS];
> +    odp_port_t out_port = ODPP_NONE;
> +    struct netdev *out_netdev;
> +    int num_attrs = 0;
> +    int type;
> +    int rc;
> +
> +    /* Support egress partial-offload only when in-port is vhost-user. */
> +    if (!is_dpdk_vhost_netdev(in_netdev)) {
> +        return false;
> +    }
> +
> +    rc = parse_nlattr_actions(offload->actions, offload->actions_len, attrs,
> +                              MAX_ACTION_ATTRS, &num_attrs);
> +    if (rc == E2BIG) {
> +        /* Action list too big; decline partial offload */
> +        return false;
> +    }
> +
> +    /* Number of attrs expected with tunnel encap action */
> +    if (num_attrs < MAX_ACTION_ATTRS) {
> +        return false;
> +    }
> +
> +    /* Only support clone sub-actions for now, tnl-push specifically. */
> +    if (attrs[0].type != OVS_ACTION_ATTR_CLONE ||
> +        attrs[1].type != OVS_ACTION_ATTR_TUNNEL_PUSH ||
> +        attrs[2].type != OVS_ACTION_ATTR_OUTPUT) {
> +        return false;
> +    }
I think it's better to validate it in lib/netdev-offload-dpdk.c and not 
here. This will also remove the need for the parse_nlattr_actions helper here.
> +
> +    /* Egress partial-offload needs an output action at the end. */
> +    out_port = nl_attr_get_odp_port(attrs[2].action);
> +    if (out_port == ODPP_NONE) {
> +        return false;
> +    }
> +
> +    /* Support egress partial-offload only when out-port is offload capable. */
> +    out_netdev = netdev_ports_get(out_port, dpif_type_str);
> +    if (!out_netdev || !netdev_dpdk_flow_api_supported(out_netdev)) {
> +        return false;
> +    }
> +
> +    /* Flow can be egress partial-offloaded. */
> +    *egress_netdev = out_netdev;
> +    offload->flow->egress_offload_port = out_port;
> +    return true;
>   }
>   
>   static int
> @@ -2552,7 +2716,9 @@ dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
>       bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
>       struct offload_info info;
>       struct netdev *port;
> -    uint32_t mark;
> +    struct netdev *egress_port = NULL;
> +    bool alloc_mark = true;
> +    uint32_t mark = INVALID_FLOW_MARK;
>       int ret;
>   
>       if (flow->dead) {
> @@ -2564,11 +2730,25 @@ dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
>           return -1;
>       }
>   
> -    if (dp_netdev_alloc_flow_mark(flow, modification, &mark)) {
> -            /* flow already offloaded */
> +    info.attr_egress = 0;
> +    info.partial_actions = 0;
> +
> +    if (unlikely(should_partial_offload_egress(port, offload, &egress_port))) {
> +        if (egress_port) {
>               netdev_close(port);
> -            return 0;
> +            port = egress_port;
> +            info.attr_egress = 1;
> +            alloc_mark = false;
> +        }
> +        info.partial_actions = 1;
> +    }
> +
> +    if (alloc_mark && dp_netdev_alloc_flow_mark(flow, modification, &mark)) {
> +            /* flow already offloaded */
> +        netdev_close(port);
> +        return 0;
>       }
> +
>       info.flow_mark = mark;
>   
>       /* Taking a global 'port_mutex' to fulfill thread safety restrictions for
> @@ -2585,17 +2765,24 @@ dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
>           goto err_free;
>       }
>   
> -    if (!modification) {
> +    if (unlikely(info.partial_actions && egress_port)) {
> +        VLOG_DBG_RL("%s: flow: %p mega_ufid: "UUID_FMT" pmd_id: %d\n",
> +                    __func__, flow, UUID_ARGS((struct uuid *)&flow->mega_ufid),
> +                    flow->pmd_id);
> +        flow->partial_actions_offloaded = true;
> +    } else if (!modification) {
>           megaflow_to_mark_associate(&flow->mega_ufid, mark);
>           mark_to_flow_associate(mark, flow);
>       }
>       return 0;
>   
>   err_free:
> -    if (!modification) {
> -        flow_mark_free(mark);
> -    } else {
> -        mark_to_flow_disassociate(pmd, flow);
> +    if (mark != INVALID_FLOW_MARK) {
> +        if (!modification) {
> +            flow_mark_free(mark);
> +        } else {
> +            mark_to_flow_disassociate(pmd, flow);
> +        }
>       }
>       return -1;
>   }
> @@ -2711,7 +2898,8 @@ dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
>       ovs_assert(cls != NULL);
>       dpcls_remove(cls, &flow->cr);
>       cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
> -    if (flow->mark != INVALID_FLOW_MARK) {
> +    if (flow->mark != INVALID_FLOW_MARK || (flow->partial_actions_offloaded
> +        && flow->egress_offload_port != ODPP_NONE)) {
>           queue_netdev_flow_del(pmd, flow);
>       }
>       flow->dead = true;
> @@ -3469,6 +3657,8 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
>       flow->dead = false;
>       flow->batch = NULL;
>       flow->mark = INVALID_FLOW_MARK;
> +    flow->partial_actions_offloaded = false;
> +    flow->egress_offload_port = ODPP_NONE;
>       *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
>       *CONST_CAST(struct flow *, &flow->flow) = match->flow;
>       *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
> diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
> index 2ed3cb125..ad384e101 100644
> --- a/lib/netdev-offload-dpdk.c
> +++ b/lib/netdev-offload-dpdk.c
> @@ -57,6 +57,7 @@ static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
>   struct ufid_to_rte_flow_data {
>       struct cmap_node node;
>       ovs_u128 ufid;
> +    uint32_t refcnt;
>       struct rte_flow *rte_flow;
>       bool actions_offloaded;
>       struct dpif_flow_stats stats;
> @@ -97,6 +98,7 @@ ufid_to_rte_flow_associate(const ovs_u128 *ufid,
>           ovs_assert(data_prev->rte_flow == NULL);
>       }
>   
> +    data->refcnt = 1;
>       data->ufid = *ufid;
>       data->rte_flow = rte_flow;
>       data->actions_offloaded = actions_offloaded;
> @@ -1494,7 +1496,8 @@ static int
>   parse_clone_actions(struct netdev *netdev,
>                       struct flow_actions *actions,
>                       const struct nlattr *clone_actions,
> -                    const size_t clone_actions_len)
> +                    const size_t clone_actions_len,
> +                    struct offload_info *info)
>   {
>       const struct nlattr *ca;
>       unsigned int cleft;
> @@ -1519,8 +1522,11 @@ parse_clone_actions(struct netdev *netdev,
>               add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RAW_ENCAP,
>                               raw_encap);
>           } else if (clone_type == OVS_ACTION_ATTR_OUTPUT) {
> -            if (add_output_action(netdev, actions, ca)) {
> -                return -1;
> +            /* add output action only if full-offload */
> +            if (!info->partial_actions) {
> +                if (add_output_action(netdev, actions, ca)) {
> +                    return -1;
> +                }
>               }
>           } else {
>               VLOG_DBG_RL(&rl,
> @@ -1537,12 +1543,15 @@ static int
>   parse_flow_actions(struct netdev *netdev,
>                      struct flow_actions *actions,
>                      struct nlattr *nl_actions,
> -                   size_t nl_actions_len)
> +                   size_t nl_actions_len,
> +                   struct offload_info *info)
>   {
>       struct nlattr *nla;
>       size_t left;
>   
> -    add_count_action(actions);
> +    if (!info->partial_actions) {
> +        add_count_action(actions);
> +    }
>       NL_ATTR_FOR_EACH_UNSAFE (nla, left, nl_actions, nl_actions_len) {
>           if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
>               if (add_output_action(netdev, actions, nla)) {
> @@ -1573,7 +1582,7 @@ parse_flow_actions(struct netdev *netdev,
>               size_t clone_actions_len = nl_attr_get_size(nla);
>   
>               if (parse_clone_actions(netdev, actions, clone_actions,
> -                                    clone_actions_len)) {
> +                                    clone_actions_len, info)) {
>                   return -1;
>               }
>           } else {
> @@ -1595,15 +1604,22 @@ static struct rte_flow *
>   netdev_offload_dpdk_actions(struct netdev *netdev,
>                               struct flow_patterns *patterns,
>                               struct nlattr *nl_actions,
> -                            size_t actions_len)
> +                            size_t actions_len,
> +                            struct offload_info *info)
>   {
> -    const struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1 };
> +    struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1 };
>       struct flow_actions actions = { .actions = NULL, .cnt = 0 };
>       struct rte_flow *flow = NULL;
>       struct rte_flow_error error;
>       int ret;
>   
> -    ret = parse_flow_actions(netdev, &actions, nl_actions, actions_len);
> +    if (info->attr_egress) {
> +        flow_attr.ingress = 0;
> +        flow_attr.egress = 1;
> +        flow_attr.transfer = 0;
> +    }
> +
> +    ret = parse_flow_actions(netdev, &actions, nl_actions, actions_len,info);
>       if (ret) {
>           goto out;
>       }
> @@ -1635,8 +1651,15 @@ netdev_offload_dpdk_add_flow(struct netdev *netdev,
>       }
>   
>       flow = netdev_offload_dpdk_actions(netdev, &patterns, nl_actions,
> -                                       actions_len);
> -    if (!flow) {
> +                                       actions_len, info);
> +    if (flow) {
> +        if (info->partial_actions && info->attr_egress) {
> +            /* actions_offloaded should be set to false with partial actions,
> +             * since it is still considered as partial-offload and not
> +             * full-offload. */
> +            actions_offloaded = false;
> +        }
> +    } else if (!(info->partial_actions && info->attr_egress)) {
>           /* If we failed to offload the rule actions fallback to MARK+RSS
>            * actions.
>            */
> @@ -1686,18 +1709,29 @@ netdev_offload_dpdk_flow_put(struct netdev *netdev, struct match *match,
>                                struct dpif_flow_stats *stats)
>   {
>       struct ufid_to_rte_flow_data *rte_flow_data;
> -    int ret;
> +    int ret = 0;
>   
> -    /*
> -     * If an old rte_flow exists, it means it's a flow modification.
> -     * Here destroy the old rte flow first before adding a new one.
> -     */
>       rte_flow_data = ufid_to_rte_flow_data_find(ufid);
>       if (rte_flow_data && rte_flow_data->rte_flow) {
> -        ret = netdev_offload_dpdk_destroy_flow(netdev, ufid,
> -                                               rte_flow_data->rte_flow);
> -        if (ret < 0) {
> +        if (unlikely(info->partial_actions && info->attr_egress)) {
> +            /* In the case of partial action offload, the same mega-flow
> +             * could be offloaded by multiple PMD threads. Avoid creating
> +             * multiple rte_flows and just update the refcnt.
> +             */
> +            VLOG_DBG_RL("%s: mega_ufid: "UUID_FMT" refcnt: %d\n", __func__,
> +                        UUID_ARGS((struct uuid *)ufid), rte_flow_data->refcnt);
> +            rte_flow_data->refcnt++;
>               return ret;
> +        } else {
> +            /*
> +             * If an old rte_flow exists, it means it's a flow modification.
> +             * Here destroy the old rte flow first before adding a new one.
> +             */
> +            ret = netdev_offload_dpdk_destroy_flow(netdev, ufid,
> +                                                   rte_flow_data->rte_flow);
> +            if (ret < 0) {
> +                return ret;
> +            }
>           }
>       }
>   
> @@ -1719,6 +1753,12 @@ netdev_offload_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
>           return -1;
>       }
>   
> +    VLOG_DBG_RL("%s: mega_ufid: "UUID_FMT" refcnt: %d\n", __func__,
> +                UUID_ARGS((struct uuid *)ufid), rte_flow_data->refcnt);
> +    if (rte_flow_data->refcnt-- > 1) {
> +        return 0;
> +    }
> +
>       if (stats) {
>           memset(stats, 0, sizeof *stats);
>       }
> diff --git a/lib/netdev-offload.h b/lib/netdev-offload.h
> index 4c0ed2ae8..55fcc711c 100644
> --- a/lib/netdev-offload.h
> +++ b/lib/netdev-offload.h
> @@ -67,6 +67,8 @@ struct offload_info {
>   
>       bool recirc_id_shared_with_tc;  /* Indicates whever tc chains will be in
>                                        * sync with datapath recirc ids. */
> +    uint8_t attr_egress;      /* Egress direction offload */
> +    uint8_t partial_actions;  /* Partial action offload; no forward action */
>   
>       /*
>        * The flow mark id assigened to the flow. If any pkts hit the flow,
Sriharsha Basavapatna July 6, 2020, 3:49 a.m. UTC | #2
On Sun, Jul 5, 2020 at 6:29 PM Eli Britstein <elibr@mellanox.com> wrote:
>
>
> On 6/29/2020 12:50 PM, Sriharsha Basavapatna wrote:
> > In this patch, we support offloading of VXLAN_ENCAP action for a vhost-user
> > port (aka "partial-action-offload"). At the time of offloading the flow, we
> > determine if the flow can be offloaded to an egress device, if the input
> > port is not offload capable such as a vhost-user port. We then offload the
> > flow with a VXLAN_ENCAP RTE action, to the egress device. We do not add
> > the OUTPUT RTE action, which indicates to the PMD that is is a partial
> > action offload request. Note that since the action is being offloaded in
> > egress direction, classification is expected to be done by OVS SW datapath
> > and hence there's no need to offload a MARK action.
> >
> > If offload succeeds, we save the information in 'dp_netdev_flow' so that
> > we skip execution of the corresponding action (previous patch) during SW
> > datapath processing.
> >
> > Signed-off-by: Sriharsha Basavapatna <sriharsha.basavapatna@broadcom.com>
> > ---
> >   lib/dpif-netdev.c         | 212 ++++++++++++++++++++++++++++++++++++--
> >   lib/netdev-offload-dpdk.c |  78 ++++++++++----
> >   lib/netdev-offload.h      |   2 +
> >   3 files changed, 262 insertions(+), 30 deletions(-)
> >
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index e489e2d90..d289d265d 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -2488,10 +2488,174 @@ dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
> >       ovs_mutex_unlock(&dp_flow_offload.mutex);
> >   }
> >
> > +static int
> > +partial_offload_egress_flow_del(struct dp_flow_offload_item *offload)
> > +{
> > +    struct dp_netdev_pmd_thread *pmd = offload->pmd;
> > +    struct dp_netdev_flow *flow = offload->flow;
> > +    const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
> > +    struct netdev *port;
> > +    int ret;
> > +
> > +    port = netdev_ports_get(flow->egress_offload_port, dpif_type_str);
> > +    if (!port) {
> > +        return -1;
> > +    }
> > +
> > +    /* Taking a global 'port_mutex' to fulfill thread safety
> > +     * restrictions for the netdev-offload-dpdk module. */
> > +    ovs_mutex_lock(&pmd->dp->port_mutex);
> > +    ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
> > +    ovs_mutex_unlock(&pmd->dp->port_mutex);
> > +    netdev_close(port);
> > +
> > +    if (ret) {
> > +        return ret;
> > +    }
> > +
> > +    flow->egress_offload_port = NULL;
> > +    flow->partial_actions_offloaded = false;
> > +
> > +    VLOG_DBG_RL("%s: flow: %p mega_ufid: "UUID_FMT" pmd_id: %d\n", __func__,
> > +                flow, UUID_ARGS((struct uuid *)&flow->mega_ufid),
> > +                offload->flow->pmd_id);
> > +    return ret;
> > +}
> It's almost the same code as mark_to_flow_disassociate. Maybe just
> enhance it?

This is in the egress direction, whereas the mark applies to ingress. It is
better to keep them separate.

> > +
> >   static int
> >   dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
> >   {
> > -    return mark_to_flow_disassociate(offload->pmd, offload->flow);
> > +    if (unlikely(offload->flow->partial_actions_offloaded &&
> > +        offload->flow->egress_offload_port != ODPP_NONE)) {
> > +        return partial_offload_egress_flow_del(offload);
> > +    } else {
> > +        return mark_to_flow_disassociate(offload->pmd, offload->flow);
> > +    }
> > +}
> > +
> > +/* Structure to hold a nl_parsed OVS action */
> > +struct action_attr {
> > +    int type;                /* OVS action type */
> > +    struct nlattr *action;   /* action attribute */
> > +};
> > +
> > +/*
> > + * Maxium number of actions to be parsed while selecting a flow for partial
> > + * action offload. This number is currently based on the minimum number of
> > + * attributes seen with the tunnel encap action (clone, tunnel_push, output).
> > + * This number includes output action to a single egress device (uplink) and
> > + * supports neither multiple clone() actions nor multiple output actions.
> > + * This number could change if and when we support other actions or
> > + * combinations of actions for partial offload.
> > + */
> > +#define MAX_ACTION_ATTRS    3 /* Max # action attributes supported */
> > +
> > +/*
> > + * This function parses the list of OVS "actions" of length "actions_len",
> > + * and returns them in an array of action "attrs", of size "max_attrs".
> > + * The parsed number of actions is returned in "num_attrs". If the number
> > + * of actions exceeds "max_attrs", parsing is stopped and E2BIG is returned.
> > + * Otherwise, returns success (0).
> > + */
> > +static int
> > +parse_nlattr_actions(struct nlattr *actions, size_t actions_len,
> > +                     struct action_attr *attrs, int max_attrs, int *num_attrs)
> > +{
> > +    const struct nlattr *a;
> > +    unsigned int left;
> > +    int num_actions = 0;
> > +    int n_attrs = 0;
> > +    int rc = 0;
> > +    int type;
> > +
> > +    *num_attrs = 0;
> > +
> > +    NL_ATTR_FOR_EACH (a, left, actions, actions_len) {
> > +        type = nl_attr_type(a);
> > +
> > +        if (num_actions >= max_attrs) {
> > +            *num_attrs = num_actions;
> > +            return E2BIG;
> > +        }
> > +
> > +        attrs[num_actions].type = type;
> > +        attrs[num_actions].action = a;
> > +        num_actions++;
> > +        if (type == OVS_ACTION_ATTR_CLONE) {
> > +            rc = parse_nlattr_actions(nl_attr_get(a), nl_attr_get_size(a),
> > +                                      &attrs[num_actions],
> > +                                      (max_attrs - num_actions), &n_attrs);
> > +            num_actions += n_attrs;
> > +            if (rc == E2BIG) {
> > +                *num_attrs = num_actions;
> > +                return rc;
> > +            }
> > +        }
> > +    }
> > +
> > +    *num_attrs = num_actions;
> > +    return 0;
> > +}
> > +
> > +/* This function determines if the given flow should be partially offloaded
> > + * on the egress device, when the in-port is not offload-capable like a
> > + * vhost-user port. The function currently supports offloading of only
> > + * tunnel encap action.
> > + */
> > +static bool
> > +should_partial_offload_egress(struct netdev *in_netdev,
> > +                              struct dp_flow_offload_item *offload,
> > +                              struct netdev **egress_netdev)
> > +{
> > +    const char *dpif_type_str =
> > +        dpif_normalize_type(offload->pmd->dp->class->type);
> > +    struct action_attr attrs[MAX_ACTION_ATTRS];
> > +    odp_port_t out_port = ODPP_NONE;
> > +    struct netdev *out_netdev;
> > +    int num_attrs = 0;
> > +    int type;
> > +    int rc;
> > +
> > +    /* Support egress partial-offload only when in-port is vhost-user. */
> > +    if (!is_dpdk_vhost_netdev(in_netdev)) {
> > +        return false;
> > +    }
> > +
> > +    rc = parse_nlattr_actions(offload->actions, offload->actions_len, attrs,
> > +                              MAX_ACTION_ATTRS, &num_attrs);
> > +    if (rc == E2BIG) {
> > +        /* Action list too big; decline partial offload */
> > +        return false;
> > +    }
> > +
> > +    /* Number of attrs expected with tunnel encap action */
> > +    if (num_attrs < MAX_ACTION_ATTRS) {
> > +        return false;
> > +    }
> > +
> > +    /* Only support clone sub-actions for now, tnl-push specifically. */
> > +    if (attrs[0].type != OVS_ACTION_ATTR_CLONE ||
> > +        attrs[1].type != OVS_ACTION_ATTR_TUNNEL_PUSH ||
> > +        attrs[2].type != OVS_ACTION_ATTR_OUTPUT) {
> > +        return false;
> > +    }
> I think it's better to validate it in lib/netdev-offload-dpdk.c and not
> here. This will also nullify parse_nlattr_actions helper here.

We need to parse and get the output port here to determine if it
supports offloads (since we have already seen that the in-port is a
vhost-user port). Otherwise, there's no need to select this flow for
partial-action offload. This is the place where we make this decision
and hence it is better to eliminate actions that are not supported
also here. Basically, this function contains the entire
partial-action-offload selection logic for the flow. It is better not
to split it across files. Apart from this, we need to get the egress
netdev in this function since we invoke netdev_flow_put() on it (and
not on the ingress netdev).

> > +
> > +    /* Egress partial-offload needs an output action at the end. */
> > +    out_port = nl_attr_get_odp_port(attrs[2].action);
> > +    if (out_port == ODPP_NONE) {
> > +        return false;
> > +    }
> > +
> > +    /* Support egress partial-offload only when out-port is offload capable. */
> > +    out_netdev = netdev_ports_get(out_port, dpif_type_str);
> > +    if (!out_netdev || !netdev_dpdk_flow_api_supported(out_netdev)) {
> > +        return false;
> > +    }
> > +
> > +    /* Flow can be egress partial-offloaded. */
> > +    *egress_netdev = out_netdev;
> > +    offload->flow->egress_offload_port = out_port;
> > +    return true;
> >   }
> >
> >   static int
> > @@ -2552,7 +2716,9 @@ dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
> >       bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
> >       struct offload_info info;
> >       struct netdev *port;
> > -    uint32_t mark;
> > +    struct netdev *egress_port = NULL;
> > +    bool alloc_mark = true;
> > +    uint32_t mark = INVALID_FLOW_MARK;
> >       int ret;
> >
> >       if (flow->dead) {
> > @@ -2564,11 +2730,25 @@ dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
> >           return -1;
> >       }
> >
> > -    if (dp_netdev_alloc_flow_mark(flow, modification, &mark)) {
> > -            /* flow already offloaded */
> > +    info.attr_egress = 0;
> > +    info.partial_actions = 0;
> > +
> > +    if (unlikely(should_partial_offload_egress(port, offload, &egress_port))) {
> > +        if (egress_port) {
> >               netdev_close(port);
> > -            return 0;
> > +            port = egress_port;
> > +            info.attr_egress = 1;
> > +            alloc_mark = false;
> > +        }
> > +        info.partial_actions = 1;
> > +    }
> > +
> > +    if (alloc_mark && dp_netdev_alloc_flow_mark(flow, modification, &mark)) {
> > +            /* flow already offloaded */
> > +        netdev_close(port);
> > +        return 0;
> >       }
> > +
> >       info.flow_mark = mark;
> >
> >       /* Taking a global 'port_mutex' to fulfill thread safety restrictions for
> > @@ -2585,17 +2765,24 @@ dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
> >           goto err_free;
> >       }
> >
> > -    if (!modification) {
> > +    if (unlikely(info.partial_actions && egress_port)) {
> > +        VLOG_DBG_RL("%s: flow: %p mega_ufid: "UUID_FMT" pmd_id: %d\n",
> > +                    __func__, flow, UUID_ARGS((struct uuid *)&flow->mega_ufid),
> > +                    flow->pmd_id);
> > +        flow->partial_actions_offloaded = true;
> > +    } else if (!modification) {
> >           megaflow_to_mark_associate(&flow->mega_ufid, mark);
> >           mark_to_flow_associate(mark, flow);
> >       }
> >       return 0;
> >
> >   err_free:
> > -    if (!modification) {
> > -        flow_mark_free(mark);
> > -    } else {
> > -        mark_to_flow_disassociate(pmd, flow);
> > +    if (mark != INVALID_FLOW_MARK) {
> > +        if (!modification) {
> > +            flow_mark_free(mark);
> > +        } else {
> > +            mark_to_flow_disassociate(pmd, flow);
> > +        }
> >       }
> >       return -1;
> >   }
> > @@ -2711,7 +2898,8 @@ dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
> >       ovs_assert(cls != NULL);
> >       dpcls_remove(cls, &flow->cr);
> >       cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
> > -    if (flow->mark != INVALID_FLOW_MARK) {
> > +    if (flow->mark != INVALID_FLOW_MARK || (flow->partial_actions_offloaded
> > +        && flow->egress_offload_port != ODPP_NONE)) {
> >           queue_netdev_flow_del(pmd, flow);
> >       }
> >       flow->dead = true;
> > @@ -3469,6 +3657,8 @@ dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
> >       flow->dead = false;
> >       flow->batch = NULL;
> >       flow->mark = INVALID_FLOW_MARK;
> > +    flow->partial_actions_offloaded = false;
> > +    flow->egress_offload_port = ODPP_NONE;
> >       *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
> >       *CONST_CAST(struct flow *, &flow->flow) = match->flow;
> >       *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
> > diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
> > index 2ed3cb125..ad384e101 100644
> > --- a/lib/netdev-offload-dpdk.c
> > +++ b/lib/netdev-offload-dpdk.c
> > @@ -57,6 +57,7 @@ static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
> >   struct ufid_to_rte_flow_data {
> >       struct cmap_node node;
> >       ovs_u128 ufid;
> > +    uint32_t refcnt;
> >       struct rte_flow *rte_flow;
> >       bool actions_offloaded;
> >       struct dpif_flow_stats stats;
> > @@ -97,6 +98,7 @@ ufid_to_rte_flow_associate(const ovs_u128 *ufid,
> >           ovs_assert(data_prev->rte_flow == NULL);
> >       }
> >
> > +    data->refcnt = 1;
> >       data->ufid = *ufid;
> >       data->rte_flow = rte_flow;
> >       data->actions_offloaded = actions_offloaded;
> > @@ -1494,7 +1496,8 @@ static int
> >   parse_clone_actions(struct netdev *netdev,
> >                       struct flow_actions *actions,
> >                       const struct nlattr *clone_actions,
> > -                    const size_t clone_actions_len)
> > +                    const size_t clone_actions_len,
> > +                    struct offload_info *info)
> >   {
> >       const struct nlattr *ca;
> >       unsigned int cleft;
> > @@ -1519,8 +1522,11 @@ parse_clone_actions(struct netdev *netdev,
> >               add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RAW_ENCAP,
> >                               raw_encap);
> >           } else if (clone_type == OVS_ACTION_ATTR_OUTPUT) {
> > -            if (add_output_action(netdev, actions, ca)) {
> > -                return -1;
> > +            /* add output action only if full-offload */
> > +            if (!info->partial_actions) {
> > +                if (add_output_action(netdev, actions, ca)) {
> > +                    return -1;
> > +                }
> >               }
> >           } else {
> >               VLOG_DBG_RL(&rl,
> > @@ -1537,12 +1543,15 @@ static int
> >   parse_flow_actions(struct netdev *netdev,
> >                      struct flow_actions *actions,
> >                      struct nlattr *nl_actions,
> > -                   size_t nl_actions_len)
> > +                   size_t nl_actions_len,
> > +                   struct offload_info *info)
> >   {
> >       struct nlattr *nla;
> >       size_t left;
> >
> > -    add_count_action(actions);
> > +    if (!info->partial_actions) {
> > +        add_count_action(actions);
> > +    }
> >       NL_ATTR_FOR_EACH_UNSAFE (nla, left, nl_actions, nl_actions_len) {
> >           if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
> >               if (add_output_action(netdev, actions, nla)) {
> > @@ -1573,7 +1582,7 @@ parse_flow_actions(struct netdev *netdev,
> >               size_t clone_actions_len = nl_attr_get_size(nla);
> >
> >               if (parse_clone_actions(netdev, actions, clone_actions,
> > -                                    clone_actions_len)) {
> > +                                    clone_actions_len, info)) {
> >                   return -1;
> >               }
> >           } else {
> > @@ -1595,15 +1604,22 @@ static struct rte_flow *
> >   netdev_offload_dpdk_actions(struct netdev *netdev,
> >                               struct flow_patterns *patterns,
> >                               struct nlattr *nl_actions,
> > -                            size_t actions_len)
> > +                            size_t actions_len,
> > +                            struct offload_info *info)
> >   {
> > -    const struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1 };
> > +    struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1 };
> >       struct flow_actions actions = { .actions = NULL, .cnt = 0 };
> >       struct rte_flow *flow = NULL;
> >       struct rte_flow_error error;
> >       int ret;
> >
> > -    ret = parse_flow_actions(netdev, &actions, nl_actions, actions_len);
> > +    if (info->attr_egress) {
> > +        flow_attr.ingress = 0;
> > +        flow_attr.egress = 1;
> > +        flow_attr.transfer = 0;
> > +    }
> > +
> > +    ret = parse_flow_actions(netdev, &actions, nl_actions, actions_len,info);
> >       if (ret) {
> >           goto out;
> >       }
> > @@ -1635,8 +1651,15 @@ netdev_offload_dpdk_add_flow(struct netdev *netdev,
> >       }
> >
> >       flow = netdev_offload_dpdk_actions(netdev, &patterns, nl_actions,
> > -                                       actions_len);
> > -    if (!flow) {
> > +                                       actions_len, info);
> > +    if (flow) {
> > +        if (info->partial_actions && info->attr_egress) {
> > +            /* actions_offloaded should be set to false with partial actions,
> > +             * since it is still considered as partial-offload and not
> > +             * full-offload. */
> > +            actions_offloaded = false;
> > +        }
> > +    } else if (!(info->partial_actions && info->attr_egress)) {
> >           /* If we failed to offload the rule actions fallback to MARK+RSS
> >            * actions.
> >            */
> > @@ -1686,18 +1709,29 @@ netdev_offload_dpdk_flow_put(struct netdev *netdev, struct match *match,
> >                                struct dpif_flow_stats *stats)
> >   {
> >       struct ufid_to_rte_flow_data *rte_flow_data;
> > -    int ret;
> > +    int ret = 0;
> >
> > -    /*
> > -     * If an old rte_flow exists, it means it's a flow modification.
> > -     * Here destroy the old rte flow first before adding a new one.
> > -     */
> >       rte_flow_data = ufid_to_rte_flow_data_find(ufid);
> >       if (rte_flow_data && rte_flow_data->rte_flow) {
> > -        ret = netdev_offload_dpdk_destroy_flow(netdev, ufid,
> > -                                               rte_flow_data->rte_flow);
> > -        if (ret < 0) {
> > +        if (unlikely(info->partial_actions && info->attr_egress)) {
> > +            /* In the case of partial action offload, the same mega-flow
> > +             * could be offloaded by multiple PMD threads. Avoid creating
> > +             * multiple rte_flows and just update the refcnt.
> > +             */
> > +            VLOG_DBG_RL("%s: mega_ufid: "UUID_FMT" refcnt: %d\n", __func__,
> > +                        UUID_ARGS((struct uuid *)ufid), rte_flow_data->refcnt);
> > +            rte_flow_data->refcnt++;
> >               return ret;
> > +        } else {
> > +            /*
> > +             * If an old rte_flow exists, it means it's a flow modification.
> > +             * Here destroy the old rte flow first before adding a new one.
> > +             */
> > +            ret = netdev_offload_dpdk_destroy_flow(netdev, ufid,
> > +                                                   rte_flow_data->rte_flow);
> > +            if (ret < 0) {
> > +                return ret;
> > +            }
> >           }
> >       }
> >
> > @@ -1719,6 +1753,12 @@ netdev_offload_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
> >           return -1;
> >       }
> >
> > +    VLOG_DBG_RL("%s: mega_ufid: "UUID_FMT" refcnt: %d\n", __func__,
> > +                UUID_ARGS((struct uuid *)ufid), rte_flow_data->refcnt);
> > +    if (rte_flow_data->refcnt-- > 1) {
> > +        return 0;
> > +    }
> > +
> >       if (stats) {
> >           memset(stats, 0, sizeof *stats);
> >       }
> > diff --git a/lib/netdev-offload.h b/lib/netdev-offload.h
> > index 4c0ed2ae8..55fcc711c 100644
> > --- a/lib/netdev-offload.h
> > +++ b/lib/netdev-offload.h
> > @@ -67,6 +67,8 @@ struct offload_info {
> >
> >       bool recirc_id_shared_with_tc;  /* Indicates whever tc chains will be in
> >                                        * sync with datapath recirc ids. */
> > +    uint8_t attr_egress;      /* Egress direction offload */
> > +    uint8_t partial_actions;  /* Partial action offload; no forward action */
> >
> >       /*
> >        * The flow mark id assigened to the flow. If any pkts hit the flow,
diff mbox series

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e489e2d90..d289d265d 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2488,10 +2488,195 @@  dp_netdev_append_flow_offload(struct dp_flow_offload_item *offload)
     ovs_mutex_unlock(&dp_flow_offload.mutex);
 }
 
+/* Removes the egress partial-action offload of 'offload->flow' from the
+ * netdev identified by 'flow->egress_offload_port'.
+ *
+ * Returns 0 on success, -1 if the egress port can no longer be found, or
+ * the nonzero value returned by netdev_flow_del().  The flow's partial
+ * offload state is reset only on success. */
+static int
+partial_offload_egress_flow_del(struct dp_flow_offload_item *offload)
+{
+    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+    struct dp_netdev_pmd_thread *pmd = offload->pmd;
+    struct dp_netdev_flow *flow = offload->flow;
+    const char *dpif_type_str = dpif_normalize_type(pmd->dp->class->type);
+    struct netdev *port;
+    int ret;
+
+    port = netdev_ports_get(flow->egress_offload_port, dpif_type_str);
+    if (!port) {
+        return -1;
+    }
+
+    /* Taking a global 'port_mutex' to fulfill thread safety
+     * restrictions for the netdev-offload-dpdk module. */
+    ovs_mutex_lock(&pmd->dp->port_mutex);
+    ret = netdev_flow_del(port, &flow->mega_ufid, NULL);
+    ovs_mutex_unlock(&pmd->dp->port_mutex);
+    netdev_close(port);
+
+    if (ret) {
+        return ret;
+    }
+
+    /* 'egress_offload_port' is an odp_port_t, so "no port" is ODPP_NONE. */
+    flow->egress_offload_port = ODPP_NONE;
+    flow->partial_actions_offloaded = false;
+
+    VLOG_DBG_RL(&rl, "%s: flow: %p mega_ufid: "UUID_FMT" pmd_id: %u\n",
+                __func__, flow, UUID_ARGS((struct uuid *)&flow->mega_ufid),
+                flow->pmd_id);
+    return 0;
+}
+
 static int
 dp_netdev_flow_offload_del(struct dp_flow_offload_item *offload)
 {
-    return mark_to_flow_disassociate(offload->pmd, offload->flow);
+    struct dp_netdev_flow *flow = offload->flow;
+
+    /* Egress partial-action offloads are installed without a flow mark,
+     * so they are torn down through their own path. */
+    if (unlikely(flow->partial_actions_offloaded &&
+                 flow->egress_offload_port != ODPP_NONE)) {
+        return partial_offload_egress_flow_del(offload);
+    }
+    return mark_to_flow_disassociate(offload->pmd, flow);
+}
+
+/* Structure to hold a nl_parsed OVS action */
+struct action_attr {
+    int type;                     /* OVS action type */
+    const struct nlattr *action;  /* action attribute */
+};
+
+/*
+ * Maximum number of actions to be parsed while selecting a flow for partial
+ * action offload. This number is currently based on the minimum number of
+ * attributes seen with the tunnel encap action (clone, tunnel_push, output).
+ * This number includes output action to a single egress device (uplink) and
+ * supports neither multiple clone() actions nor multiple output actions.
+ * This number could change if and when we support other actions or
+ * combinations of actions for partial offload.
+ */
+#define MAX_ACTION_ATTRS    3 /* Max # action attributes supported */
+
+/*
+ * This function parses the list of OVS "actions" of length "actions_len",
+ * and returns them in an array of action "attrs", of size "max_attrs".
+ * The parsed number of actions is returned in "num_attrs". If the number
+ * of actions exceeds "max_attrs", parsing is stopped and E2BIG is returned.
+ * Otherwise, returns success (0).
+ * Actions nested inside a clone() are flattened into "attrs" immediately
+ * after the clone attribute itself.
+ */
+static int
+parse_nlattr_actions(const struct nlattr *actions, size_t actions_len,
+                     struct action_attr *attrs, int max_attrs, int *num_attrs)
+{
+    const struct nlattr *a;
+    unsigned int left;
+    int num_actions = 0;
+
+    *num_attrs = 0;
+
+    NL_ATTR_FOR_EACH (a, left, actions, actions_len) {
+        int type = nl_attr_type(a);
+
+        if (num_actions >= max_attrs) {
+            *num_attrs = num_actions;
+            return E2BIG;
+        }
+
+        attrs[num_actions].type = type;
+        attrs[num_actions].action = a;
+        num_actions++;
+        if (type == OVS_ACTION_ATTR_CLONE) {
+            int n_nested = 0;
+            int rc;
+
+            rc = parse_nlattr_actions(nl_attr_get(a), nl_attr_get_size(a),
+                                      &attrs[num_actions],
+                                      max_attrs - num_actions, &n_nested);
+            num_actions += n_nested;
+            if (rc) {
+                *num_attrs = num_actions;
+                return rc;
+            }
+        }
+    }
+
+    *num_attrs = num_actions;
+    return 0;
+}
+
+/* This function determines if the given flow should be partially offloaded
+ * on the egress device, when the in-port is not offload-capable like a
+ * vhost-user port. The function currently supports offloading of only
+ * tunnel encap action.
+ *
+ * On success, returns true, stores in '*egress_netdev' the netdev obtained
+ * from netdev_ports_get() (the caller is expected to netdev_close() it) and
+ * records the egress port in 'offload->flow->egress_offload_port'.
+ */
+static bool
+should_partial_offload_egress(struct netdev *in_netdev,
+                              struct dp_flow_offload_item *offload,
+                              struct netdev **egress_netdev)
+{
+    const char *dpif_type_str =
+        dpif_normalize_type(offload->pmd->dp->class->type);
+    struct action_attr attrs[MAX_ACTION_ATTRS];
+    odp_port_t out_port = ODPP_NONE;
+    struct netdev *out_netdev;
+    int num_attrs = 0;
+    int rc;
+
+    /* Support egress partial-offload only when in-port is vhost-user. */
+    if (!is_dpdk_vhost_netdev(in_netdev)) {
+        return false;
+    }
+
+    rc = parse_nlattr_actions(offload->actions, offload->actions_len, attrs,
+                              MAX_ACTION_ATTRS, &num_attrs);
+    if (rc == E2BIG) {
+        /* Action list too big; decline partial offload */
+        return false;
+    }
+
+    /* Number of attrs expected with tunnel encap action */
+    if (num_attrs < MAX_ACTION_ATTRS) {
+        return false;
+    }
+
+    /* Only support clone sub-actions for now, tnl-push specifically. */
+    if (attrs[0].type != OVS_ACTION_ATTR_CLONE ||
+        attrs[1].type != OVS_ACTION_ATTR_TUNNEL_PUSH ||
+        attrs[2].type != OVS_ACTION_ATTR_OUTPUT) {
+        return false;
+    }
+
+    /* Egress partial-offload needs an output action at the end. */
+    out_port = nl_attr_get_odp_port(attrs[2].action);
+    if (out_port == ODPP_NONE) {
+        return false;
+    }
+
+    /* Support egress partial-offload only when out-port is offload capable. */
+    out_netdev = netdev_ports_get(out_port, dpif_type_str);
+    if (!out_netdev) {
+        return false;
+    }
+    if (!netdev_dpdk_flow_api_supported(out_netdev)) {
+        /* Drop the reference taken by netdev_ports_get(). */
+        netdev_close(out_netdev);
+        return false;
+    }
+
+    /* Flow can be egress partial-offloaded. */
+    *egress_netdev = out_netdev;
+    offload->flow->egress_offload_port = out_port;
+    return true;
 }
 
 static int
@@ -2552,7 +2716,9 @@  dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
     bool modification = offload->op == DP_NETDEV_FLOW_OFFLOAD_OP_MOD;
     struct offload_info info;
     struct netdev *port;
-    uint32_t mark;
+    struct netdev *egress_port = NULL;
+    bool alloc_mark = true;
+    uint32_t mark = INVALID_FLOW_MARK;
     int ret;
 
     if (flow->dead) {
@@ -2564,11 +2730,25 @@  dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
         return -1;
     }
 
-    if (dp_netdev_alloc_flow_mark(flow, modification, &mark)) {
-            /* flow already offloaded */
+    info.attr_egress = 0;
+    info.partial_actions = 0;
+
+    if (unlikely(should_partial_offload_egress(port, offload, &egress_port))) {
+        if (egress_port) {
             netdev_close(port);
-            return 0;
+            port = egress_port;
+            info.attr_egress = 1;
+            alloc_mark = false;
+        }
+        info.partial_actions = 1;
+    }
+
+    if (alloc_mark && dp_netdev_alloc_flow_mark(flow, modification, &mark)) {
+            /* flow already offloaded */
+        netdev_close(port);
+        return 0;
     }
+
     info.flow_mark = mark;
 
     /* Taking a global 'port_mutex' to fulfill thread safety restrictions for
@@ -2585,17 +2765,27 @@  dp_netdev_flow_offload_put(struct dp_flow_offload_item *offload)
         goto err_free;
     }
 
-    if (!modification) {
+    if (unlikely(info.partial_actions && egress_port)) {
+        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+
+        VLOG_DBG_RL(&rl, "%s: flow: %p mega_ufid: "UUID_FMT" pmd_id: %u\n",
+                    __func__, flow, UUID_ARGS((struct uuid *)&flow->mega_ufid),
+                    flow->pmd_id);
+        /* Egress partial offload: no mark was allocated, only flag the flow. */
+        flow->partial_actions_offloaded = true;
+    } else if (!modification) {
         megaflow_to_mark_associate(&flow->mega_ufid, mark);
         mark_to_flow_associate(mark, flow);
     }
     return 0;
 
 err_free:
-    if (!modification) {
-        flow_mark_free(mark);
-    } else {
-        mark_to_flow_disassociate(pmd, flow);
+    if (mark != INVALID_FLOW_MARK) {
+        if (!modification) {
+            flow_mark_free(mark);
+        } else {
+            mark_to_flow_disassociate(pmd, flow);
+        }
     }
     return -1;
 }
@@ -2711,7 +2898,8 @@  dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
     ovs_assert(cls != NULL);
     dpcls_remove(cls, &flow->cr);
     cmap_remove(&pmd->flow_table, node, dp_netdev_flow_hash(&flow->ufid));
-    if (flow->mark != INVALID_FLOW_MARK) {
+    if (flow->mark != INVALID_FLOW_MARK || (flow->partial_actions_offloaded
+        && flow->egress_offload_port != ODPP_NONE)) {
         queue_netdev_flow_del(pmd, flow);
     }
     flow->dead = true;
@@ -3469,6 +3657,8 @@  dp_netdev_flow_add(struct dp_netdev_pmd_thread *pmd,
     flow->dead = false;
     flow->batch = NULL;
     flow->mark = INVALID_FLOW_MARK;
+    flow->partial_actions_offloaded = false;
+    flow->egress_offload_port = ODPP_NONE;
     *CONST_CAST(unsigned *, &flow->pmd_id) = pmd->core_id;
     *CONST_CAST(struct flow *, &flow->flow) = match->flow;
     *CONST_CAST(ovs_u128 *, &flow->ufid) = *ufid;
diff --git a/lib/netdev-offload-dpdk.c b/lib/netdev-offload-dpdk.c
index 2ed3cb125..ad384e101 100644
--- a/lib/netdev-offload-dpdk.c
+++ b/lib/netdev-offload-dpdk.c
@@ -57,6 +57,7 @@  static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
 struct ufid_to_rte_flow_data {
     struct cmap_node node;
     ovs_u128 ufid;
+    uint32_t refcnt;
     struct rte_flow *rte_flow;
     bool actions_offloaded;
     struct dpif_flow_stats stats;
@@ -97,6 +98,7 @@  ufid_to_rte_flow_associate(const ovs_u128 *ufid,
         ovs_assert(data_prev->rte_flow == NULL);
     }
 
+    data->refcnt = 1;
     data->ufid = *ufid;
     data->rte_flow = rte_flow;
     data->actions_offloaded = actions_offloaded;
@@ -1494,7 +1496,8 @@  static int
 parse_clone_actions(struct netdev *netdev,
                     struct flow_actions *actions,
                     const struct nlattr *clone_actions,
-                    const size_t clone_actions_len)
+                    const size_t clone_actions_len,
+                    struct offload_info *info)
 {
     const struct nlattr *ca;
     unsigned int cleft;
@@ -1519,8 +1522,11 @@  parse_clone_actions(struct netdev *netdev,
             add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RAW_ENCAP,
                             raw_encap);
         } else if (clone_type == OVS_ACTION_ATTR_OUTPUT) {
-            if (add_output_action(netdev, actions, ca)) {
-                return -1;
+            /* add output action only if full-offload */
+            if (!info->partial_actions) {
+                if (add_output_action(netdev, actions, ca)) {
+                    return -1;
+                }
             }
         } else {
             VLOG_DBG_RL(&rl,
@@ -1537,12 +1543,15 @@  static int
 parse_flow_actions(struct netdev *netdev,
                    struct flow_actions *actions,
                    struct nlattr *nl_actions,
-                   size_t nl_actions_len)
+                   size_t nl_actions_len,
+                   struct offload_info *info)
 {
     struct nlattr *nla;
     size_t left;
 
-    add_count_action(actions);
+    if (!info->partial_actions) {
+        add_count_action(actions);
+    }
     NL_ATTR_FOR_EACH_UNSAFE (nla, left, nl_actions, nl_actions_len) {
         if (nl_attr_type(nla) == OVS_ACTION_ATTR_OUTPUT) {
             if (add_output_action(netdev, actions, nla)) {
@@ -1573,7 +1582,7 @@  parse_flow_actions(struct netdev *netdev,
             size_t clone_actions_len = nl_attr_get_size(nla);
 
             if (parse_clone_actions(netdev, actions, clone_actions,
-                                    clone_actions_len)) {
+                                    clone_actions_len, info)) {
                 return -1;
             }
         } else {
@@ -1595,15 +1604,22 @@  static struct rte_flow *
 netdev_offload_dpdk_actions(struct netdev *netdev,
                             struct flow_patterns *patterns,
                             struct nlattr *nl_actions,
-                            size_t actions_len)
+                            size_t actions_len,
+                            struct offload_info *info)
 {
-    const struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1 };
+    struct rte_flow_attr flow_attr = { .ingress = 1, .transfer = 1 };
     struct flow_actions actions = { .actions = NULL, .cnt = 0 };
     struct rte_flow *flow = NULL;
     struct rte_flow_error error;
     int ret;
 
-    ret = parse_flow_actions(netdev, &actions, nl_actions, actions_len);
+    if (info->attr_egress) {
+        flow_attr.ingress = 0;
+        flow_attr.egress = 1;
+        flow_attr.transfer = 0;
+    }
+
+    ret = parse_flow_actions(netdev, &actions, nl_actions, actions_len,info);
     if (ret) {
         goto out;
     }
@@ -1635,8 +1651,15 @@  netdev_offload_dpdk_add_flow(struct netdev *netdev,
     }
 
     flow = netdev_offload_dpdk_actions(netdev, &patterns, nl_actions,
-                                       actions_len);
-    if (!flow) {
+                                       actions_len, info);
+    if (flow) {
+        if (info->partial_actions && info->attr_egress) {
+            /* actions_offloaded should be set to false with partial actions,
+             * since it is still considered as partial-offload and not
+             * full-offload. */
+            actions_offloaded = false;
+        }
+    } else if (!(info->partial_actions && info->attr_egress)) {
         /* If we failed to offload the rule actions fallback to MARK+RSS
          * actions.
          */
@@ -1686,18 +1709,30 @@  netdev_offload_dpdk_flow_put(struct netdev *netdev, struct match *match,
                              struct dpif_flow_stats *stats)
 {
     struct ufid_to_rte_flow_data *rte_flow_data;
-    int ret;
+    int ret = 0;
 
-    /*
-     * If an old rte_flow exists, it means it's a flow modification.
-     * Here destroy the old rte flow first before adding a new one.
-     */
     rte_flow_data = ufid_to_rte_flow_data_find(ufid);
     if (rte_flow_data && rte_flow_data->rte_flow) {
-        ret = netdev_offload_dpdk_destroy_flow(netdev, ufid,
-                                               rte_flow_data->rte_flow);
-        if (ret < 0) {
+        if (unlikely(info->partial_actions && info->attr_egress)) {
+            /* In the case of partial action offload, the same mega-flow
+             * could be offloaded by multiple PMD threads. Avoid creating
+             * multiple rte_flows and just update the refcnt.
+             */
+            VLOG_DBG_RL(&rl, "%s: mega_ufid: "UUID_FMT" refcnt: %u\n",
+                        __func__, UUID_ARGS((struct uuid *)ufid),
+                        rte_flow_data->refcnt);
+            rte_flow_data->refcnt++;
             return ret;
+        } else {
+            /*
+             * If an old rte_flow exists, it means it's a flow modification.
+             * Here destroy the old rte flow first before adding a new one.
+             */
+            ret = netdev_offload_dpdk_destroy_flow(netdev, ufid,
+                                                   rte_flow_data->rte_flow);
+            if (ret < 0) {
+                return ret;
+            }
         }
     }
 
@@ -1719,6 +1753,12 @@  netdev_offload_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
         return -1;
     }
 
+    VLOG_DBG_RL(&rl, "%s: mega_ufid: "UUID_FMT" refcnt: %u\n", __func__,
+                UUID_ARGS((struct uuid *)ufid), rte_flow_data->refcnt);
+    if (rte_flow_data->refcnt-- > 1) {
+        return 0;
+    }
+
     if (stats) {
         memset(stats, 0, sizeof *stats);
     }
diff --git a/lib/netdev-offload.h b/lib/netdev-offload.h
index 4c0ed2ae8..55fcc711c 100644
--- a/lib/netdev-offload.h
+++ b/lib/netdev-offload.h
@@ -67,6 +67,8 @@  struct offload_info {
 
     bool recirc_id_shared_with_tc;  /* Indicates whever tc chains will be in
                                      * sync with datapath recirc ids. */
+    uint8_t attr_egress;      /* Egress direction offload */
+    uint8_t partial_actions;  /* Partial action offload; no forward action */
 
     /*
      * The flow mark id assigened to the flow. If any pkts hit the flow,