[ovs-dev,v8,3/6] netdev-dpdk: implement flow offload with rte flow

Message ID ec0794cf62a37fdb5a1d5c4496b206203afe9a9c.1522136948.git.shahafs@mellanox.com
State Superseded
Headers show
Series
  • OVS-DPDK flow offload with rte_flow
Related show

Commit Message

Shahaf Shuler March 27, 2018, 7:54 a.m.
From: Finn Christensen <fc@napatech.com>

The basic yet the major part of this patch is to translate the "match"
to rte flow patterns. And then, we create a rte flow with MARK + RSS
actions. Afterwards, all packets matching the flow will have the mark id in
the mbuf.

The reason RSS is needed is, for most NICs, a MARK only action is not
allowed. It has to be used together with some other actions, such as
QUEUE, RSS, etc. However, QUEUE action can specify one queue only, which
may break the rss. Likely, RSS action is currently the best we could
do now. Thus, RSS action is chosen.

For any unsupported flows, such as MPLS, -1 is returned, meaning the
flow offload has failed and is then skipped.

Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
Signed-off-by: Finn Christensen <fc@napatech.com>
Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
---
 lib/netdev-dpdk.c | 563 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 562 insertions(+), 1 deletion(-)

Comments

Ian Stokes April 10, 2018, 7:58 p.m. | #1
> The basic yet the major part of this patch is to translate the "match"
> to rte flow patterns. And then, we create a rte flow with MARK + RSS
> actions. Afterwards, all packets match the flow will have the mark id in
> the mbuf.
> 
> The reason RSS is needed is, for most NICs, a MARK only action is not
> allowed. It has to be used together with some other actions, such as
> QUEUE, RSS, etc. However, QUEUE action can specify one queue only, which
> may break the rss. Likely, RSS action is currently the best we could now.
> Thus, RSS action is choosen.
> 
> For any unsupported flows, such as MPLS, -1 is returned, meaning the flow
> offload is failed and then skipped.
> 
> Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
> Signed-off-by: Finn Christensen <fc@napatech.com>
> Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
> Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
> ---
>  lib/netdev-dpdk.c | 563 ++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 562 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index af9843a..df4d480
> 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -38,7 +38,9 @@
>  #include <rte_pci.h>
>  #include <rte_vhost.h>
>  #include <rte_version.h>
> +#include <rte_flow.h>
> 
> +#include "cmap.h"
>  #include "dirs.h"
>  #include "dp-packet.h"
>  #include "dpdk.h"
> @@ -51,6 +53,7 @@
>  #include "openvswitch/list.h"
>  #include "openvswitch/ofp-print.h"
>  #include "openvswitch/vlog.h"
> +#include "openvswitch/match.h"
>  #include "ovs-numa.h"
>  #include "ovs-thread.h"
>  #include "ovs-rcu.h"
> @@ -60,6 +63,7 @@
>  #include "sset.h"
>  #include "unaligned.h"
>  #include "timeval.h"
> +#include "uuid.h"
>  #include "unixctl.h"
> 
>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; @@ -170,6 +174,17 @@ static
> const struct rte_eth_conf port_conf = {  };
> 
>  /*
> + * A mapping from ufid to dpdk rte_flow.
> + */
> +static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
> +
> +struct ufid_to_rte_flow_data {
> +    struct cmap_node node;
> +    ovs_u128 ufid;
> +    struct rte_flow *rte_flow;
> +};
> +
> +/*
>   * These callbacks allow virtio-net devices to be added to vhost ports
> when
>   * configuration has been fully completed.
>   */
> @@ -3709,6 +3724,552 @@ unlock:
>      return err;
>  }
> 
> +
> +/* Find rte_flow with @ufid */
> +static struct rte_flow *
> +ufid_to_rte_flow_find(const ovs_u128 *ufid) {
> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> +    struct ufid_to_rte_flow_data *data;
> +
> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
> +        if (ovs_u128_equals(*ufid, data->ufid)) {
> +            return data->rte_flow;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +static inline void
> +ufid_to_rte_flow_associate(const ovs_u128 *ufid,
> +                           struct rte_flow *rte_flow) {
> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> +    struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));
> +
> +    /*
> +     * We should not simply overwrite an existing rte flow.
> +     * We should have deleted it first before re-adding it.
> +     * Thus, if following assert triggers, something is wrong:
> +     * the rte_flow is not destroyed.
> +     */
> +    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);
> +
> +    data->ufid = *ufid;
> +    data->rte_flow = rte_flow;
> +
> +    cmap_insert(&ufid_to_rte_flow,
> +                CONST_CAST(struct cmap_node *, &data->node), hash); }
> +
> +static inline void
> +ufid_to_rte_flow_disassociate(const ovs_u128 *ufid) {
> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> +    struct ufid_to_rte_flow_data *data;
> +
> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
> +        if (ovs_u128_equals(*ufid, data->ufid)) {
> +            cmap_remove(&ufid_to_rte_flow,
> +                        CONST_CAST(struct cmap_node *, &data->node),
> hash);
> +            free(data);
> +            return;
> +        }
> +    }
> +
> +    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte flow\n",
> +              UUID_ARGS((struct uuid *)ufid)); }
> +
> +/*
> + * To avoid individual xrealloc calls for each new element, a
> 'curent_max'
> + * is used to keep track of current allocated number of elements.
> +Starts
> + * by 8 and doubles on each xrealloc call  */ struct flow_patterns {
> +    struct rte_flow_item *items;
> +    int cnt;
> +    int current_max;
> +};
> +
> +struct flow_actions {
> +    struct rte_flow_action *actions;
> +    int cnt;
> +    int current_max;
> +};
> +
> +static void
> +add_flow_pattern(struct flow_patterns *patterns, enum rte_flow_item_type
> type,
> +                 const void *spec, const void *mask) {
> +    int cnt = patterns->cnt;
> +
> +    if (cnt == 0) {
> +        patterns->current_max = 8;
> +        patterns->items = xcalloc(patterns->current_max, sizeof(struct
> rte_flow_item));

Above line exceeds the 79 character limit.

> +    } else if (cnt == patterns->current_max) {
> +        patterns->current_max *= 2;
> +        patterns->items = xrealloc(patterns->items, patterns->current_max
> *
> +                                   sizeof(struct rte_flow_item));
> +    }
> +
> +    patterns->items[cnt].type = type;
> +    patterns->items[cnt].spec = spec;
> +    patterns->items[cnt].mask = mask;
> +    patterns->items[cnt].last = NULL;
> +    patterns->cnt++;
> +}
> +
> +static void
> +add_flow_action(struct flow_actions *actions, enum rte_flow_action_type
> type,
> +                const void *conf)
> +{
> +    int cnt = actions->cnt;
> +
> +    if (cnt == 0) {
> +        actions->current_max = 8;
> +        actions->actions = xcalloc(actions->current_max,
> +                                   sizeof(struct rte_flow_action));
> +    } else if (cnt == actions->current_max) {
> +        actions->current_max *= 2;
> +        actions->actions = xrealloc(actions->actions, actions-
> >current_max *
> +                                    sizeof(struct rte_flow_action));
> +    }
> +
> +    actions->actions[cnt].type = type;
> +    actions->actions[cnt].conf = conf;
> +    actions->cnt++;
> +}
> +
> +static struct rte_flow_action_rss *
> +add_flow_rss_action(struct flow_actions *actions,
> +                    struct netdev *netdev) {
> +    int i;
> +    struct rte_flow_action_rss *rss;
> +
> +    rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
> +    /*
> +     * Setting it to NULL will let the driver use the default RSS
> +     * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
> +     */
> +    rss->rss_conf = NULL;
> +    rss->num = netdev->n_rxq;
> +
> +    for (i = 0; i < rss->num; i++) {
> +        rss->queue[i] = i;
> +    }
> +
> +    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);
> +
> +    return rss;
> +}
> +
> +static int
> +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
> +                                 const struct match *match,
> +                                 struct nlattr *nl_actions OVS_UNUSED,
> +                                 size_t actions_len OVS_UNUSED,
> +                                 const ovs_u128 *ufid,
> +                                 struct offload_info *info) {
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    const struct rte_flow_attr flow_attr = {
> +        .group = 0,
> +        .priority = 0,
> +        .ingress = 1,
> +        .egress = 0
> +    };
> +    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
> +    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
> +    struct rte_flow *flow;
> +    struct rte_flow_error error;
> +    uint8_t *ipv4_next_proto_mask = NULL;
> +    int ret = 0;
> +
> +    /* Eth */
> +    struct rte_flow_item_eth eth_spec;
> +    struct rte_flow_item_eth eth_mask;
> +    memset(&eth_spec, 0, sizeof(eth_spec));
> +    memset(&eth_mask, 0, sizeof(eth_mask));
> +    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
> +        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
> +        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst,
> sizeof(eth_spec.dst));
> +        rte_memcpy(&eth_spec.src, &match->flow.dl_src,
> sizeof(eth_spec.src));
> +        eth_spec.type = match->flow.dl_type;
> +
> +        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
> +                   sizeof(eth_mask.dst));
> +        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
> +                   sizeof(eth_mask.src));
> +        eth_mask.type = match->wc.masks.dl_type;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
> +                         &eth_spec, &eth_mask);
> +    } else {
> +        /*
> +         * If user specifies a flow (like UDP flow) without L2 patterns,
> +         * OVS will at least set the dl_type. Normally, it's enough to
> +         * create an eth pattern just with it. Unluckily, some Intel's
> +         * NIC (such as XL710) doesn't support that. Below is a
> workaround,
> +         * which simply matches any L2 pkts.
> +         */
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL, NULL);
> +    }
> +
> +    /* VLAN */
> +    struct rte_flow_item_vlan vlan_spec;
> +    struct rte_flow_item_vlan vlan_mask;
> +    memset(&vlan_spec, 0, sizeof(vlan_spec));
> +    memset(&vlan_mask, 0, sizeof(vlan_mask));
> +    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
> +        vlan_spec.tci  = match->flow.vlans[0].tci;
> +        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
> +
> +        /* match any protocols */
> +        vlan_mask.tpid = 0;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
> +                         &vlan_spec, &vlan_mask);
> +    }
> +
> +    /* IP v4 */
> +    uint8_t proto = 0;
> +    struct rte_flow_item_ipv4 ipv4_spec;
> +    struct rte_flow_item_ipv4 ipv4_mask;
> +    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
> +    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
> +    if (match->flow.dl_type == ntohs(ETH_TYPE_IP) &&
> +        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
> +         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
> +         match->wc.masks.nw_proto)) {
> +        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
> +        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
> +        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
> +        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
> +        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
> +
> +        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
> +        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
> +        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
> +        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
> +        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
> +                         &ipv4_spec, &ipv4_mask);
> +
> +        /* Save proto for L4 protocol setup */
> +        proto = ipv4_spec.hdr.next_proto_id &
> +                ipv4_mask.hdr.next_proto_id;
> +
> +        /* Remember proto mask address for later modification */
> +        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
> +    }
> +
> +    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
> +        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
> +        (match->wc.masks.tp_src ||
> +         match->wc.masks.tp_dst ||
> +         match->wc.masks.tcp_flags)) {
> +        VLOG_DBG("L4 Protocol (%u) not supported", proto);
> +        ret = -1;
> +        goto out;
> +    }
> +
> +    if ((match->wc.masks.tp_src && match->wc.masks.tp_src != 0xffff) ||
> +        (match->wc.masks.tp_dst && match->wc.masks.tp_dst != 0xffff)) {
> +        ret = -1;
> +        goto out;
> +    }
> +
> +    struct rte_flow_item_tcp tcp_spec;
> +    struct rte_flow_item_tcp tcp_mask;
> +    memset(&tcp_spec, 0, sizeof(tcp_spec));
> +    memset(&tcp_mask, 0, sizeof(tcp_mask));
> +    if (proto == IPPROTO_TCP &&
> +        (match->wc.masks.tp_src ||
> +         match->wc.masks.tp_dst ||
> +         match->wc.masks.tcp_flags)) {
> +        tcp_spec.hdr.src_port  = match->flow.tp_src;
> +        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
> +        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
> +        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
> +
> +        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
> +        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
> +        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
> +        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) &
> + 0xff;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
> +                         &tcp_spec, &tcp_mask);
> +
> +        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match
> */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +        goto end_proto_check;
> +    }
> +
> +    struct rte_flow_item_udp udp_spec;
> +    struct rte_flow_item_udp udp_mask;
> +    memset(&udp_spec, 0, sizeof(udp_spec));
> +    memset(&udp_mask, 0, sizeof(udp_mask));
> +    if (proto == IPPROTO_UDP &&
> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> +        udp_spec.hdr.src_port = match->flow.tp_src;
> +        udp_spec.hdr.dst_port = match->flow.tp_dst;
> +
> +        udp_mask.hdr.src_port = match->wc.masks.tp_src;
> +        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
> +                         &udp_spec, &udp_mask);
> +
> +        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match
> */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +        goto end_proto_check;
> +    }
> +
> +    struct rte_flow_item_sctp sctp_spec;
> +    struct rte_flow_item_sctp sctp_mask;
> +    memset(&sctp_spec, 0, sizeof(sctp_spec));
> +    memset(&sctp_mask, 0, sizeof(sctp_mask));
> +    if (proto == IPPROTO_SCTP &&
> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> +        sctp_spec.hdr.src_port = match->flow.tp_src;
> +        sctp_spec.hdr.dst_port = match->flow.tp_dst;
> +
> +        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
> +        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
> +                         &sctp_spec, &sctp_mask);
> +
> +        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match
> */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +        goto end_proto_check;
> +    }
> +
> +    struct rte_flow_item_icmp icmp_spec;
> +    struct rte_flow_item_icmp icmp_mask;
> +    memset(&icmp_spec, 0, sizeof(icmp_spec));
> +    memset(&icmp_mask, 0, sizeof(icmp_mask));
> +    if (proto == IPPROTO_ICMP &&
> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> +        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
> +        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
> +
> +        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
> +        icmp_mask.hdr.icmp_code =
> + (uint8_t)ntohs(match->wc.masks.tp_dst);
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
> +                         &icmp_spec, &icmp_mask);
> +
> +        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match
> */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +        goto end_proto_check;
> +    }
> +
> +end_proto_check:
> +
> +    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
> +
> +    struct rte_flow_action_mark mark;
> +    mark.id = info->flow_mark;
> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);
> +
> +    struct rte_flow_action_rss *rss;
> +    rss = add_flow_rss_action(&actions, netdev);
> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
> +
> +    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
> +                           actions.actions, &error);
> +    free(rss);
> +    if (!flow) {
> +        VLOG_ERR("rte flow creat error: %u : message : %s\n",
> +                 error.type, error.message);
> +        ret = -1;
> +        goto out;
> +    }
> +    ufid_to_rte_flow_associate(ufid, flow);
> +    VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
> +             flow, UUID_ARGS((struct uuid *)ufid));
> +
> +out:
> +    free(patterns.items);
> +    free(actions.actions);
> +    return ret;
> +}

There's a lot going on above; had you considered refactoring the function? Would there be an impact on the performance at which the flows could be added to the device? We can look to do this in the future rather than in this patchset, but I was curious what your thoughts are WRT it.

> +
> +static bool
> +is_all_zero(const void *addr, size_t n) {
> +    size_t i = 0;
> +    const uint8_t *p = (uint8_t *)addr;
> +
> +    for (i = 0; i < n; i++) {
> +        if (p[i] != 0) {
> +            return false;
> +        }
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * Check if any unsupported flow patterns are specified.
> + */
> +static int
> +netdev_dpdk_validate_flow(const struct match *match) {
> +    struct match match_zero_wc;
> +
> +    /* Create a wc-zeroed version of flow */
> +    match_init(&match_zero_wc, &match->flow, &match->wc);
> +
> +    if (!is_all_zero(&match_zero_wc.flow.tunnel,
> +                     sizeof(match_zero_wc.flow.tunnel))) {
> +        goto err;
> +    }
> +
> +    if (match->wc.masks.metadata ||
> +        match->wc.masks.skb_priority ||
> +        match->wc.masks.pkt_mark ||
> +        match->wc.masks.dp_hash) {
> +        goto err;
> +    }
> +
> +    /* recirc id must be zero */
> +    if (match_zero_wc.flow.recirc_id) {
> +        goto err;
> +    }
> +
> +    if (match->wc.masks.ct_state ||
> +        match->wc.masks.ct_nw_proto ||
> +        match->wc.masks.ct_zone ||
> +        match->wc.masks.ct_mark ||
> +        match->wc.masks.ct_label.u64.hi ||
> +        match->wc.masks.ct_label.u64.lo) {
> +        goto err;
> +    }
> +
> +    if (match->wc.masks.conj_id ||
> +        match->wc.masks.actset_output) {
> +        goto err;
> +    }
> +
> +    /* unsupported L2 */
> +    if (!is_all_zero(&match->wc.masks.mpls_lse,
> +                     sizeof(match_zero_wc.flow.mpls_lse))) {
> +        goto err;
> +    }
> +
> +    /* unsupported L3 */
> +    if (match->wc.masks.ipv6_label ||
> +        match->wc.masks.ct_nw_src ||
> +        match->wc.masks.ct_nw_dst ||
> +        !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct in6_addr))
> ||
> +        !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr))
> ||
> +        !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct
> in6_addr)) ||
> +        !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct
> in6_addr)) ||
> +        !is_all_zero(&match->wc.masks.nd_target, sizeof(struct in6_addr))
> ||
> +        !is_all_zero(&match->wc.masks.nsh, sizeof(struct ovs_key_nsh)) ||
> +        !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct eth_addr))
> ||
> +        !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct eth_addr)))
> {
> +        goto err;
> +    }
> +
> +    /* If fragmented, then don't HW accelerate - for now */
> +    if (match_zero_wc.flow.nw_frag) {
> +        goto err;
> +    }
> +
> +    /* unsupported L4 */
> +    if (match->wc.masks.igmp_group_ip4 ||
> +        match->wc.masks.ct_tp_src ||
> +        match->wc.masks.ct_tp_dst) {
> +        goto err;
> +    }
> +
> +    return 0;
> +
> +err:
> +    VLOG_ERR("cannot HW accelerate this flow due to unsupported
> protocols");
> +    return -1;
> +}
> +
> +static int
> +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
> +                             const ovs_u128 *ufid,
> +                             struct rte_flow *rte_flow) {
> +    struct rte_flow_error error;
> +    int ret;
> +
> +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
> +    if (ret == 0) {
> +        ufid_to_rte_flow_disassociate(ufid);
> +        VLOG_DBG("removed rte flow %p associated with ufid " UUID_FMT
> "\n",
> +                 rte_flow, UUID_ARGS((struct uuid *)ufid));
> +    } else {
> +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
> +                 error.type, error.message);
> +    }
> +
> +    return ret;
> +}
> +
> +static int
> +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
> +                     struct nlattr *actions, size_t actions_len,
> +                     const ovs_u128 *ufid, struct offload_info *info,
> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
> +    struct rte_flow *rte_flow;
> +    int ret;
> +
> +    /*
> +     * If an old rte_flow exists, it means it's a flow modification.
> +     * Here destroy the old rte flow first before adding a new one.
> +     */
> +    rte_flow = ufid_to_rte_flow_find(ufid);
> +    if (rte_flow) {
> +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
> +                                           ufid, rte_flow);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +    }
> +
> +    ret = netdev_dpdk_validate_flow(match);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
> +                                            actions_len, ufid, info); }
> +
> +static int
> +netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
> +
> +    struct rte_flow *rte_flow = ufid_to_rte_flow_find(ufid);
> +
> +    if (!rte_flow) {
> +        return -1;
> +    }
> +
> +    return netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
> +                                        ufid, rte_flow); }
> +
> +#define DPDK_FLOW_OFFLOAD_API                                 \
> +    NULL,                   /* flow_flush */                  \
> +    NULL,                   /* flow_dump_create */            \
> +    NULL,                   /* flow_dump_destroy */           \
> +    NULL,                   /* flow_dump_next */              \
> +    netdev_dpdk_flow_put,                                     \
> +    NULL,                   /* flow_get */                    \
> +    netdev_dpdk_flow_del,                                     \
> +    NULL                    /* init_flow_api */
> +
> +
>  #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
>                            SET_CONFIG, SET_TX_MULTIQ, SEND,    \
>                            GET_CARRIER, GET_STATS,			  \
> @@ -3783,7 +4344,7 @@ unlock:
>      RXQ_RECV,                                                 \
>      NULL,                       /* rx_wait */                 \
>      NULL,                       /* rxq_drain */               \
> -    NO_OFFLOAD_API                                            \
> +    DPDK_FLOW_OFFLOAD_API                                     \
>  }
> 
>  static const struct netdev_class dpdk_class =
> --
> 2.7.4
Finn Christensen April 11, 2018, 3 p.m. | #2
>-----Original Message-----
>From: Stokes, Ian <ian.stokes@intel.com>
>Sent: 10. april 2018 21:59
>To: Shahaf Shuler <shahafs@mellanox.com>; Finn Christensen
><fc@napatech.com>
>Cc: simon.horman@netronome.com; ovs-dev@openvswitch.org; Chandran,
>Sugesh <sugesh.chandran@intel.com>; dball@vmware.com; Yuanhan Liu
><yliu@fridaylinux.org>
>Subject: RE: [PATCH v8 3/6] netdev-dpdk: implement flow offload with rte
>flow
>
>> The basic yet the major part of this patch is to translate the "match"
>> to rte flow patterns. And then, we create a rte flow with MARK + RSS
>> actions. Afterwards, all packets match the flow will have the mark id
>> in the mbuf.
>>
>> The reason RSS is needed is, for most NICs, a MARK only action is not
>> allowed. It has to be used together with some other actions, such as
>> QUEUE, RSS, etc. However, QUEUE action can specify one queue only,
>> which may break the rss. Likely, RSS action is currently the best we could
>now.
>> Thus, RSS action is choosen.
>>
>> For any unsupported flows, such as MPLS, -1 is returned, meaning the
>> flow offload is failed and then skipped.
>>
>> Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
>> Signed-off-by: Finn Christensen <fc@napatech.com>
>> Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
>> Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
>> ---
>>  lib/netdev-dpdk.c | 563
>> ++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 562 insertions(+), 1 deletion(-)
>>
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>> af9843a..df4d480
>> 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -38,7 +38,9 @@
>>  #include <rte_pci.h>
>>  #include <rte_vhost.h>
>>  #include <rte_version.h>
>> +#include <rte_flow.h>
>>
>> +#include "cmap.h"
>>  #include "dirs.h"
>>  #include "dp-packet.h"
>>  #include "dpdk.h"
>> @@ -51,6 +53,7 @@
>>  #include "openvswitch/list.h"
>>  #include "openvswitch/ofp-print.h"
>>  #include "openvswitch/vlog.h"
>> +#include "openvswitch/match.h"
>>  #include "ovs-numa.h"
>>  #include "ovs-thread.h"
>>  #include "ovs-rcu.h"
>> @@ -60,6 +63,7 @@
>>  #include "sset.h"
>>  #include "unaligned.h"
>>  #include "timeval.h"
>> +#include "uuid.h"
>>  #include "unixctl.h"
>>
>>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; @@ -170,6 +174,17 @@
>> static const struct rte_eth_conf port_conf = {  };
>>
>>  /*
>> + * A mapping from ufid to dpdk rte_flow.
>> + */
>> +static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
>> +
>> +struct ufid_to_rte_flow_data {
>> +    struct cmap_node node;
>> +    ovs_u128 ufid;
>> +    struct rte_flow *rte_flow;
>> +};
>> +
>> +/*
>>   * These callbacks allow virtio-net devices to be added to vhost
>> ports when
>>   * configuration has been fully completed.
>>   */
>> @@ -3709,6 +3724,552 @@ unlock:
>>      return err;
>>  }
>>
>> +
>> +/* Find rte_flow with @ufid */
>> +static struct rte_flow *
>> +ufid_to_rte_flow_find(const ovs_u128 *ufid) {
>> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
>> +    struct ufid_to_rte_flow_data *data;
>> +
>> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
>> +        if (ovs_u128_equals(*ufid, data->ufid)) {
>> +            return data->rte_flow;
>> +        }
>> +    }
>> +
>> +    return NULL;
>> +}
>> +
>> +static inline void
>> +ufid_to_rte_flow_associate(const ovs_u128 *ufid,
>> +                           struct rte_flow *rte_flow) {
>> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
>> +    struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));
>> +
>> +    /*
>> +     * We should not simply overwrite an existing rte flow.
>> +     * We should have deleted it first before re-adding it.
>> +     * Thus, if following assert triggers, something is wrong:
>> +     * the rte_flow is not destroyed.
>> +     */
>> +    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);
>> +
>> +    data->ufid = *ufid;
>> +    data->rte_flow = rte_flow;
>> +
>> +    cmap_insert(&ufid_to_rte_flow,
>> +                CONST_CAST(struct cmap_node *, &data->node), hash); }
>> +
>> +static inline void
>> +ufid_to_rte_flow_disassociate(const ovs_u128 *ufid) {
>> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
>> +    struct ufid_to_rte_flow_data *data;
>> +
>> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
>> +        if (ovs_u128_equals(*ufid, data->ufid)) {
>> +            cmap_remove(&ufid_to_rte_flow,
>> +                        CONST_CAST(struct cmap_node *, &data->node),
>> hash);
>> +            free(data);
>> +            return;
>> +        }
>> +    }
>> +
>> +    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte flow\n",
>> +              UUID_ARGS((struct uuid *)ufid)); }
>> +
>> +/*
>> + * To avoid individual xrealloc calls for each new element, a
>> 'curent_max'
>> + * is used to keep track of current allocated number of elements.
>> +Starts
>> + * by 8 and doubles on each xrealloc call  */ struct flow_patterns {
>> +    struct rte_flow_item *items;
>> +    int cnt;
>> +    int current_max;
>> +};
>> +
>> +struct flow_actions {
>> +    struct rte_flow_action *actions;
>> +    int cnt;
>> +    int current_max;
>> +};
>> +
>> +static void
>> +add_flow_pattern(struct flow_patterns *patterns, enum
>> +rte_flow_item_type
>> type,
>> +                 const void *spec, const void *mask) {
>> +    int cnt = patterns->cnt;
>> +
>> +    if (cnt == 0) {
>> +        patterns->current_max = 8;
>> +        patterns->items = xcalloc(patterns->current_max,
>> + sizeof(struct
>> rte_flow_item));
>
>Above line exceeds the 79 character limit.
>
>> +    } else if (cnt == patterns->current_max) {
>> +        patterns->current_max *= 2;
>> +        patterns->items = xrealloc(patterns->items,
>> + patterns->current_max
>> *
>> +                                   sizeof(struct rte_flow_item));
>> +    }
>> +
>> +    patterns->items[cnt].type = type;
>> +    patterns->items[cnt].spec = spec;
>> +    patterns->items[cnt].mask = mask;
>> +    patterns->items[cnt].last = NULL;
>> +    patterns->cnt++;
>> +}
>> +
>> +static void
>> +add_flow_action(struct flow_actions *actions, enum
>> +rte_flow_action_type
>> type,
>> +                const void *conf)
>> +{
>> +    int cnt = actions->cnt;
>> +
>> +    if (cnt == 0) {
>> +        actions->current_max = 8;
>> +        actions->actions = xcalloc(actions->current_max,
>> +                                   sizeof(struct rte_flow_action));
>> +    } else if (cnt == actions->current_max) {
>> +        actions->current_max *= 2;
>> +        actions->actions = xrealloc(actions->actions, actions-
>> >current_max *
>> +                                    sizeof(struct rte_flow_action));
>> +    }
>> +
>> +    actions->actions[cnt].type = type;
>> +    actions->actions[cnt].conf = conf;
>> +    actions->cnt++;
>> +}
>> +
>> +static struct rte_flow_action_rss *
>> +add_flow_rss_action(struct flow_actions *actions,
>> +                    struct netdev *netdev) {
>> +    int i;
>> +    struct rte_flow_action_rss *rss;
>> +
>> +    rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
>> +    /*
>> +     * Setting it to NULL will let the driver use the default RSS
>> +     * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
>> +     */
>> +    rss->rss_conf = NULL;
>> +    rss->num = netdev->n_rxq;
>> +
>> +    for (i = 0; i < rss->num; i++) {
>> +        rss->queue[i] = i;
>> +    }
>> +
>> +    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);
>> +
>> +    return rss;
>> +}
>> +
>> +static int
>> +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
>> +                                 const struct match *match,
>> +                                 struct nlattr *nl_actions OVS_UNUSED,
>> +                                 size_t actions_len OVS_UNUSED,
>> +                                 const ovs_u128 *ufid,
>> +                                 struct offload_info *info) {
>> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>> +    const struct rte_flow_attr flow_attr = {
>> +        .group = 0,
>> +        .priority = 0,
>> +        .ingress = 1,
>> +        .egress = 0
>> +    };
>> +    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
>> +    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
>> +    struct rte_flow *flow;
>> +    struct rte_flow_error error;
>> +    uint8_t *ipv4_next_proto_mask = NULL;
>> +    int ret = 0;
>> +
>> +    /* Eth */
>> +    struct rte_flow_item_eth eth_spec;
>> +    struct rte_flow_item_eth eth_mask;
>> +    memset(&eth_spec, 0, sizeof(eth_spec));
>> +    memset(&eth_mask, 0, sizeof(eth_mask));
>> +    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
>> +        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
>> +        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst,
>> sizeof(eth_spec.dst));
>> +        rte_memcpy(&eth_spec.src, &match->flow.dl_src,
>> sizeof(eth_spec.src));
>> +        eth_spec.type = match->flow.dl_type;
>> +
>> +        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
>> +                   sizeof(eth_mask.dst));
>> +        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
>> +                   sizeof(eth_mask.src));
>> +        eth_mask.type = match->wc.masks.dl_type;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
>> +                         &eth_spec, &eth_mask);
>> +    } else {
>> +        /*
>> +         * If user specifies a flow (like UDP flow) without L2 patterns,
>> +         * OVS will at least set the dl_type. Normally, it's enough to
>> +         * create an eth pattern just with it. Unluckily, some Intel's
>> +         * NIC (such as XL710) doesn't support that. Below is a
>> workaround,
>> +         * which simply matches any L2 pkts.
>> +         */
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL,
>NULL);
>> +    }
>> +
>> +    /* VLAN */
>> +    struct rte_flow_item_vlan vlan_spec;
>> +    struct rte_flow_item_vlan vlan_mask;
>> +    memset(&vlan_spec, 0, sizeof(vlan_spec));
>> +    memset(&vlan_mask, 0, sizeof(vlan_mask));
>> +    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
>> +        vlan_spec.tci  = match->flow.vlans[0].tci;
>> +        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
>> +
>> +        /* match any protocols */
>> +        vlan_mask.tpid = 0;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
>> +                         &vlan_spec, &vlan_mask);
>> +    }
>> +
>> +    /* IP v4 */
>> +    uint8_t proto = 0;
>> +    struct rte_flow_item_ipv4 ipv4_spec;
>> +    struct rte_flow_item_ipv4 ipv4_mask;
>> +    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
>> +    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
>> +    if (match->flow.dl_type == ntohs(ETH_TYPE_IP) &&
>> +        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
>> +         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
>> +         match->wc.masks.nw_proto)) {
>> +        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
>> +        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
>> +        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
>> +        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
>> +        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
>> +
>> +        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
>> +        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
>> +        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
>> +        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
>> +        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
>> +                         &ipv4_spec, &ipv4_mask);
>> +
>> +        /* Save proto for L4 protocol setup */
>> +        proto = ipv4_spec.hdr.next_proto_id &
>> +                ipv4_mask.hdr.next_proto_id;
>> +
>> +        /* Remember proto mask address for later modification */
>> +        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
>> +    }
>> +
>> +    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
>> +        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
>> +        (match->wc.masks.tp_src ||
>> +         match->wc.masks.tp_dst ||
>> +         match->wc.masks.tcp_flags)) {
>> +        VLOG_DBG("L4 Protocol (%u) not supported", proto);
>> +        ret = -1;
>> +        goto out;
>> +    }
>> +
>> +    if ((match->wc.masks.tp_src && match->wc.masks.tp_src != 0xffff) ||
>> +        (match->wc.masks.tp_dst && match->wc.masks.tp_dst != 0xffff)) {
>> +        ret = -1;
>> +        goto out;
>> +    }
>> +
>> +    struct rte_flow_item_tcp tcp_spec;
>> +    struct rte_flow_item_tcp tcp_mask;
>> +    memset(&tcp_spec, 0, sizeof(tcp_spec));
>> +    memset(&tcp_mask, 0, sizeof(tcp_mask));
>> +    if (proto == IPPROTO_TCP &&
>> +        (match->wc.masks.tp_src ||
>> +         match->wc.masks.tp_dst ||
>> +         match->wc.masks.tcp_flags)) {
>> +        tcp_spec.hdr.src_port  = match->flow.tp_src;
>> +        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
>> +        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
>> +        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
>> +
>> +        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
>> +        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
>> +        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
>> +        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) &
>> + 0xff;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
>> +                         &tcp_spec, &tcp_mask);
>> +
>> +        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto
>> + match
>> */
>> +        if (ipv4_next_proto_mask) {
>> +            *ipv4_next_proto_mask = 0;
>> +        }
>> +        goto end_proto_check;
>> +    }
>> +
>> +    struct rte_flow_item_udp udp_spec;
>> +    struct rte_flow_item_udp udp_mask;
>> +    memset(&udp_spec, 0, sizeof(udp_spec));
>> +    memset(&udp_mask, 0, sizeof(udp_mask));
>> +    if (proto == IPPROTO_UDP &&
>> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>> +        udp_spec.hdr.src_port = match->flow.tp_src;
>> +        udp_spec.hdr.dst_port = match->flow.tp_dst;
>> +
>> +        udp_mask.hdr.src_port = match->wc.masks.tp_src;
>> +        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
>> +                         &udp_spec, &udp_mask);
>> +
>> +        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto
>> + match
>> */
>> +        if (ipv4_next_proto_mask) {
>> +            *ipv4_next_proto_mask = 0;
>> +        }
>> +        goto end_proto_check;
>> +    }
>> +
>> +    struct rte_flow_item_sctp sctp_spec;
>> +    struct rte_flow_item_sctp sctp_mask;
>> +    memset(&sctp_spec, 0, sizeof(sctp_spec));
>> +    memset(&sctp_mask, 0, sizeof(sctp_mask));
>> +    if (proto == IPPROTO_SCTP &&
>> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>> +        sctp_spec.hdr.src_port = match->flow.tp_src;
>> +        sctp_spec.hdr.dst_port = match->flow.tp_dst;
>> +
>> +        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
>> +        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
>> +                         &sctp_spec, &sctp_mask);
>> +
>> +        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto
>> + match
>> */
>> +        if (ipv4_next_proto_mask) {
>> +            *ipv4_next_proto_mask = 0;
>> +        }
>> +        goto end_proto_check;
>> +    }
>> +
>> +    struct rte_flow_item_icmp icmp_spec;
>> +    struct rte_flow_item_icmp icmp_mask;
>> +    memset(&icmp_spec, 0, sizeof(icmp_spec));
>> +    memset(&icmp_mask, 0, sizeof(icmp_mask));
>> +    if (proto == IPPROTO_ICMP &&
>> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>> +        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
>> +        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
>> +
>> +        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
>> +        icmp_mask.hdr.icmp_code =
>> + (uint8_t)ntohs(match->wc.masks.tp_dst);
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
>> +                         &icmp_spec, &icmp_mask);
>> +
>> +        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto
>> + match
>> */
>> +        if (ipv4_next_proto_mask) {
>> +            *ipv4_next_proto_mask = 0;
>> +        }
>> +        goto end_proto_check;
>> +    }
>> +
>> +end_proto_check:
>> +
>> +    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL,
>NULL);
>> +
>> +    struct rte_flow_action_mark mark;
>> +    mark.id = info->flow_mark;
>> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);
>> +
>> +    struct rte_flow_action_rss *rss;
>> +    rss = add_flow_rss_action(&actions, netdev);
>> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
>> +
>> +    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
>> +                           actions.actions, &error);
>> +    free(rss);
>> +    if (!flow) {
>> +        VLOG_ERR("rte flow creat error: %u : message : %s\n",
>> +                 error.type, error.message);
>> +        ret = -1;
>> +        goto out;
>> +    }
>> +    ufid_to_rte_flow_associate(ufid, flow);
>> +    VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
>> +             flow, UUID_ARGS((struct uuid *)ufid));
>> +
>> +out:
>> +    free(patterns.items);
>> +    free(actions.actions);
>> +    return ret;
>> +}
>
>There's a lot going on above; had you considered refactoring the
>function? Would there be an impact on the performance at which the flows
>could be added to the device? We can look to do this in the future rather than
>in this patchset, but I was curious what your thoughts are WRT it.

Yes, I completely agree. It should be refactored so that each RTE FLOW item
has its own function. That would make it easier to read and extend.
Since the next upgrade to this functionality would most likely extend the RTE FLOW
item offload support, I thought the refactoring could be done at that point.
I see no performance penalty in doing this.

>
>> +
>> +static bool
>> +is_all_zero(const void *addr, size_t n) {
>> +    size_t i = 0;
>> +    const uint8_t *p = (uint8_t *)addr;
>> +
>> +    for (i = 0; i < n; i++) {
>> +        if (p[i] != 0) {
>> +            return false;
>> +        }
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +/*
>> + * Check if any unsupported flow patterns are specified.
>> + */
>> +static int
>> +netdev_dpdk_validate_flow(const struct match *match) {
>> +    struct match match_zero_wc;
>> +
>> +    /* Create a wc-zeroed version of flow */
>> +    match_init(&match_zero_wc, &match->flow, &match->wc);
>> +
>> +    if (!is_all_zero(&match_zero_wc.flow.tunnel,
>> +                     sizeof(match_zero_wc.flow.tunnel))) {
>> +        goto err;
>> +    }
>> +
>> +    if (match->wc.masks.metadata ||
>> +        match->wc.masks.skb_priority ||
>> +        match->wc.masks.pkt_mark ||
>> +        match->wc.masks.dp_hash) {
>> +        goto err;
>> +    }
>> +
>> +    /* recirc id must be zero */
>> +    if (match_zero_wc.flow.recirc_id) {
>> +        goto err;
>> +    }
>> +
>> +    if (match->wc.masks.ct_state ||
>> +        match->wc.masks.ct_nw_proto ||
>> +        match->wc.masks.ct_zone ||
>> +        match->wc.masks.ct_mark ||
>> +        match->wc.masks.ct_label.u64.hi ||
>> +        match->wc.masks.ct_label.u64.lo) {
>> +        goto err;
>> +    }
>> +
>> +    if (match->wc.masks.conj_id ||
>> +        match->wc.masks.actset_output) {
>> +        goto err;
>> +    }
>> +
>> +    /* unsupported L2 */
>> +    if (!is_all_zero(&match->wc.masks.mpls_lse,
>> +                     sizeof(match_zero_wc.flow.mpls_lse))) {
>> +        goto err;
>> +    }
>> +
>> +    /* unsupported L3 */
>> +    if (match->wc.masks.ipv6_label ||
>> +        match->wc.masks.ct_nw_src ||
>> +        match->wc.masks.ct_nw_dst ||
>> +        !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct
>> + in6_addr))
>> ||
>> +        !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct
>> + in6_addr))
>> ||
>> +        !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct
>> in6_addr)) ||
>> +        !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct
>> in6_addr)) ||
>> +        !is_all_zero(&match->wc.masks.nd_target, sizeof(struct
>> + in6_addr))
>> ||
>> +        !is_all_zero(&match->wc.masks.nsh, sizeof(struct ovs_key_nsh)) ||
>> +        !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct
>> + eth_addr))
>> ||
>> +        !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct
>> + eth_addr)))
>> {
>> +        goto err;
>> +    }
>> +
>> +    /* If fragmented, then don't HW accelerate - for now */
>> +    if (match_zero_wc.flow.nw_frag) {
>> +        goto err;
>> +    }
>> +
>> +    /* unsupported L4 */
>> +    if (match->wc.masks.igmp_group_ip4 ||
>> +        match->wc.masks.ct_tp_src ||
>> +        match->wc.masks.ct_tp_dst) {
>> +        goto err;
>> +    }
>> +
>> +    return 0;
>> +
>> +err:
>> +    VLOG_ERR("cannot HW accelerate this flow due to unsupported
>> protocols");
>> +    return -1;
>> +}
>> +
>> +static int
>> +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
>> +                             const ovs_u128 *ufid,
>> +                             struct rte_flow *rte_flow) {
>> +    struct rte_flow_error error;
>> +    int ret;
>> +
>> +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
>> +    if (ret == 0) {
>> +        ufid_to_rte_flow_disassociate(ufid);
>> +        VLOG_DBG("removed rte flow %p associated with ufid " UUID_FMT
>> "\n",
>> +                 rte_flow, UUID_ARGS((struct uuid *)ufid));
>> +    } else {
>> +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
>> +                 error.type, error.message);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static int
>> +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
>> +                     struct nlattr *actions, size_t actions_len,
>> +                     const ovs_u128 *ufid, struct offload_info *info,
>> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
>> +    struct rte_flow *rte_flow;
>> +    int ret;
>> +
>> +    /*
>> +     * If an old rte_flow exists, it means it's a flow modification.
>> +     * Here destroy the old rte flow first before adding a new one.
>> +     */
>> +    rte_flow = ufid_to_rte_flow_find(ufid);
>> +    if (rte_flow) {
>> +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
>> +                                           ufid, rte_flow);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    ret = netdev_dpdk_validate_flow(match);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
>> +                                            actions_len, ufid, info);
>> + }
>> +
>> +static int
>> +netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
>> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
>> +
>> +    struct rte_flow *rte_flow = ufid_to_rte_flow_find(ufid);
>> +
>> +    if (!rte_flow) {
>> +        return -1;
>> +    }
>> +
>> +    return netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
>> +                                        ufid, rte_flow); }
>> +
>> +#define DPDK_FLOW_OFFLOAD_API                                 \
>> +    NULL,                   /* flow_flush */                  \
>> +    NULL,                   /* flow_dump_create */            \
>> +    NULL,                   /* flow_dump_destroy */           \
>> +    NULL,                   /* flow_dump_next */              \
>> +    netdev_dpdk_flow_put,                                     \
>> +    NULL,                   /* flow_get */                    \
>> +    netdev_dpdk_flow_del,                                     \
>> +    NULL                    /* init_flow_api */
>> +
>> +
>>  #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
>>                            SET_CONFIG, SET_TX_MULTIQ, SEND,    \
>>                            GET_CARRIER, GET_STATS,			  \
>> @@ -3783,7 +4344,7 @@ unlock:
>>      RXQ_RECV,                                                 \
>>      NULL,                       /* rx_wait */                 \
>>      NULL,                       /* rxq_drain */               \
>> -    NO_OFFLOAD_API                                            \
>> +    DPDK_FLOW_OFFLOAD_API                                     \
>>  }
>>
>>  static const struct netdev_class dpdk_class =
>> --
>> 2.7.4

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index af9843a..df4d480 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -38,7 +38,9 @@ 
 #include <rte_pci.h>
 #include <rte_vhost.h>
 #include <rte_version.h>
+#include <rte_flow.h>
 
+#include "cmap.h"
 #include "dirs.h"
 #include "dp-packet.h"
 #include "dpdk.h"
@@ -51,6 +53,7 @@ 
 #include "openvswitch/list.h"
 #include "openvswitch/ofp-print.h"
 #include "openvswitch/vlog.h"
+#include "openvswitch/match.h"
 #include "ovs-numa.h"
 #include "ovs-thread.h"
 #include "ovs-rcu.h"
@@ -60,6 +63,7 @@ 
 #include "sset.h"
 #include "unaligned.h"
 #include "timeval.h"
+#include "uuid.h"
 #include "unixctl.h"
 
 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
@@ -170,6 +174,17 @@  static const struct rte_eth_conf port_conf = {
 };
 
 /*
+ * A mapping from ufid to dpdk rte_flow.
+ */
+static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
+
+struct ufid_to_rte_flow_data {
+    struct cmap_node node;
+    ovs_u128 ufid;
+    struct rte_flow *rte_flow;
+};
+
+/*
  * These callbacks allow virtio-net devices to be added to vhost ports when
  * configuration has been fully completed.
  */
@@ -3709,6 +3724,552 @@  unlock:
     return err;
 }
 
+
+/* Find rte_flow with @ufid */
+static struct rte_flow *
+ufid_to_rte_flow_find(const ovs_u128 *ufid) {
+    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
+    struct ufid_to_rte_flow_data *data;
+
+    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
+        if (ovs_u128_equals(*ufid, data->ufid)) {
+            return data->rte_flow;
+        }
+    }
+
+    return NULL;
+}
+
+static inline void
+ufid_to_rte_flow_associate(const ovs_u128 *ufid,
+                           struct rte_flow *rte_flow) {
+    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
+    struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));
+
+    /*
+     * We should not simply overwrite an existing rte flow.
+     * We should have deleted it first before re-adding it.
+     * Thus, if following assert triggers, something is wrong:
+     * the rte_flow is not destroyed.
+     */
+    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);
+
+    data->ufid = *ufid;
+    data->rte_flow = rte_flow;
+
+    cmap_insert(&ufid_to_rte_flow,
+                CONST_CAST(struct cmap_node *, &data->node), hash);
+}
+
+static inline void
+ufid_to_rte_flow_disassociate(const ovs_u128 *ufid) {
+    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
+    struct ufid_to_rte_flow_data *data;
+
+    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
+        if (ovs_u128_equals(*ufid, data->ufid)) {
+            cmap_remove(&ufid_to_rte_flow,
+                        CONST_CAST(struct cmap_node *, &data->node), hash);
+            free(data);
+            return;
+        }
+    }
+
+    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte flow\n",
+              UUID_ARGS((struct uuid *)ufid));
+}
+
+/*
+ * To avoid individual xrealloc calls for each new element, a 'curent_max'
+ * is used to keep track of current allocated number of elements. Starts
+ * by 8 and doubles on each xrealloc call
+ */
+struct flow_patterns {
+    struct rte_flow_item *items;
+    int cnt;
+    int current_max;
+};
+
+struct flow_actions {
+    struct rte_flow_action *actions;
+    int cnt;
+    int current_max;
+};
+
+static void
+add_flow_pattern(struct flow_patterns *patterns, enum rte_flow_item_type type,
+                 const void *spec, const void *mask) {
+    int cnt = patterns->cnt;
+
+    if (cnt == 0) {
+        patterns->current_max = 8;
+        patterns->items = xcalloc(patterns->current_max, sizeof(struct rte_flow_item));
+    } else if (cnt == patterns->current_max) {
+        patterns->current_max *= 2;
+        patterns->items = xrealloc(patterns->items, patterns->current_max *
+                                   sizeof(struct rte_flow_item));
+    }
+
+    patterns->items[cnt].type = type;
+    patterns->items[cnt].spec = spec;
+    patterns->items[cnt].mask = mask;
+    patterns->items[cnt].last = NULL;
+    patterns->cnt++;
+}
+
+static void
+add_flow_action(struct flow_actions *actions, enum rte_flow_action_type type,
+                const void *conf)
+{
+    int cnt = actions->cnt;
+
+    if (cnt == 0) {
+        actions->current_max = 8;
+        actions->actions = xcalloc(actions->current_max,
+                                   sizeof(struct rte_flow_action));
+    } else if (cnt == actions->current_max) {
+        actions->current_max *= 2;
+        actions->actions = xrealloc(actions->actions, actions->current_max *
+                                    sizeof(struct rte_flow_action));
+    }
+
+    actions->actions[cnt].type = type;
+    actions->actions[cnt].conf = conf;
+    actions->cnt++;
+}
+
+static struct rte_flow_action_rss *
+add_flow_rss_action(struct flow_actions *actions,
+                    struct netdev *netdev) {
+    int i;
+    struct rte_flow_action_rss *rss;
+
+    rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
+    /*
+     * Setting it to NULL will let the driver use the default RSS
+     * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
+     */
+    rss->rss_conf = NULL;
+    rss->num = netdev->n_rxq;
+
+    for (i = 0; i < rss->num; i++) {
+        rss->queue[i] = i;
+    }
+
+    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);
+
+    return rss;
+}
+
+static int
+netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
+                                 const struct match *match,
+                                 struct nlattr *nl_actions OVS_UNUSED,
+                                 size_t actions_len OVS_UNUSED,
+                                 const ovs_u128 *ufid,
+                                 struct offload_info *info) {
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+    const struct rte_flow_attr flow_attr = {
+        .group = 0,
+        .priority = 0,
+        .ingress = 1,
+        .egress = 0
+    };
+    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
+    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
+    struct rte_flow *flow;
+    struct rte_flow_error error;
+    uint8_t *ipv4_next_proto_mask = NULL;
+    int ret = 0;
+
+    /* Eth */
+    struct rte_flow_item_eth eth_spec;
+    struct rte_flow_item_eth eth_mask;
+    memset(&eth_spec, 0, sizeof(eth_spec));
+    memset(&eth_mask, 0, sizeof(eth_mask));
+    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
+        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
+        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst));
+        rte_memcpy(&eth_spec.src, &match->flow.dl_src, sizeof(eth_spec.src));
+        eth_spec.type = match->flow.dl_type;
+
+        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
+                   sizeof(eth_mask.dst));
+        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
+                   sizeof(eth_mask.src));
+        eth_mask.type = match->wc.masks.dl_type;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
+                         &eth_spec, &eth_mask);
+    } else {
+        /*
+         * If user specifies a flow (like UDP flow) without L2 patterns,
+         * OVS will at least set the dl_type. Normally, it's enough to
+         * create an eth pattern just with it. Unluckily, some Intel's
+         * NIC (such as XL710) doesn't support that. Below is a workaround,
+         * which simply matches any L2 pkts.
+         */
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL, NULL);
+    }
+
+    /* VLAN */
+    struct rte_flow_item_vlan vlan_spec;
+    struct rte_flow_item_vlan vlan_mask;
+    memset(&vlan_spec, 0, sizeof(vlan_spec));
+    memset(&vlan_mask, 0, sizeof(vlan_mask));
+    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
+        vlan_spec.tci  = match->flow.vlans[0].tci;
+        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
+
+        /* match any protocols */
+        vlan_mask.tpid = 0;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
+                         &vlan_spec, &vlan_mask);
+    }
+
+    /* IP v4 */
+    uint8_t proto = 0;
+    struct rte_flow_item_ipv4 ipv4_spec;
+    struct rte_flow_item_ipv4 ipv4_mask;
+    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
+    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
+    if (match->flow.dl_type == ntohs(ETH_TYPE_IP) &&
+        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
+         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
+         match->wc.masks.nw_proto)) {
+        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
+        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
+        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
+        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
+        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
+
+        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
+        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
+        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
+        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
+        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
+                         &ipv4_spec, &ipv4_mask);
+
+        /* Save proto for L4 protocol setup */
+        proto = ipv4_spec.hdr.next_proto_id &
+                ipv4_mask.hdr.next_proto_id;
+
+        /* Remember proto mask address for later modification */
+        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
+    }
+
+    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
+        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
+        (match->wc.masks.tp_src ||
+         match->wc.masks.tp_dst ||
+         match->wc.masks.tcp_flags)) {
+        VLOG_DBG("L4 Protocol (%u) not supported", proto);
+        ret = -1;
+        goto out;
+    }
+
+    if ((match->wc.masks.tp_src && match->wc.masks.tp_src != 0xffff) ||
+        (match->wc.masks.tp_dst && match->wc.masks.tp_dst != 0xffff)) {
+        ret = -1;
+        goto out;
+    }
+
+    struct rte_flow_item_tcp tcp_spec;
+    struct rte_flow_item_tcp tcp_mask;
+    memset(&tcp_spec, 0, sizeof(tcp_spec));
+    memset(&tcp_mask, 0, sizeof(tcp_mask));
+    if (proto == IPPROTO_TCP &&
+        (match->wc.masks.tp_src ||
+         match->wc.masks.tp_dst ||
+         match->wc.masks.tcp_flags)) {
+        tcp_spec.hdr.src_port  = match->flow.tp_src;
+        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
+        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
+        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
+
+        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
+        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
+        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
+        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
+                         &tcp_spec, &tcp_mask);
+
+        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+        goto end_proto_check;
+    }
+
+    struct rte_flow_item_udp udp_spec;
+    struct rte_flow_item_udp udp_mask;
+    memset(&udp_spec, 0, sizeof(udp_spec));
+    memset(&udp_mask, 0, sizeof(udp_mask));
+    if (proto == IPPROTO_UDP &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        udp_spec.hdr.src_port = match->flow.tp_src;
+        udp_spec.hdr.dst_port = match->flow.tp_dst;
+
+        udp_mask.hdr.src_port = match->wc.masks.tp_src;
+        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
+                         &udp_spec, &udp_mask);
+
+        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+        goto end_proto_check;
+    }
+
+    struct rte_flow_item_sctp sctp_spec;
+    struct rte_flow_item_sctp sctp_mask;
+    memset(&sctp_spec, 0, sizeof(sctp_spec));
+    memset(&sctp_mask, 0, sizeof(sctp_mask));
+    if (proto == IPPROTO_SCTP &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        sctp_spec.hdr.src_port = match->flow.tp_src;
+        sctp_spec.hdr.dst_port = match->flow.tp_dst;
+
+        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
+        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
+                         &sctp_spec, &sctp_mask);
+
+        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+        goto end_proto_check;
+    }
+
+    struct rte_flow_item_icmp icmp_spec;
+    struct rte_flow_item_icmp icmp_mask;
+    memset(&icmp_spec, 0, sizeof(icmp_spec));
+    memset(&icmp_mask, 0, sizeof(icmp_mask));
+    if (proto == IPPROTO_ICMP &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
+        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
+
+        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
+        icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst);
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
+                         &icmp_spec, &icmp_mask);
+
+        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+        goto end_proto_check;
+    }
+
+end_proto_check:
+
+    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
+
+    struct rte_flow_action_mark mark;
+    mark.id = info->flow_mark;
+    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);
+
+    struct rte_flow_action_rss *rss;
+    rss = add_flow_rss_action(&actions, netdev);
+    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
+
+    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
+                           actions.actions, &error);
+    free(rss);
+    if (!flow) {
+        VLOG_ERR("rte flow creat error: %u : message : %s\n",
+                 error.type, error.message);
+        ret = -1;
+        goto out;
+    }
+    ufid_to_rte_flow_associate(ufid, flow);
+    VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
+             flow, UUID_ARGS((struct uuid *)ufid));
+
+out:
+    free(patterns.items);
+    free(actions.actions);
+    return ret;
+}
+
+/* Returns true if every byte of the 'n'-byte region starting at 'addr'
+ * is zero, false otherwise.  An empty region (n == 0) is all-zero. */
+static bool
+is_all_zero(const void *addr, size_t n)
+{
+    const uint8_t *bytes = addr;   /* no cast: keeps const-correctness */
+
+    for (size_t i = 0; i < n; i++) {
+        if (bytes[i] != 0) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/*
+ * Check if any unsupported flow patterns are specified.
+ */
+/*
+ * Check if any unsupported flow patterns are specified.
+ *
+ * Returns 0 if 'match' only uses fields that can be translated to rte_flow
+ * patterns, -1 if it matches on anything the offload path cannot express
+ * (tunnels, MPLS, conntrack state, IPv6, fragments, ...).
+ */
+static int
+netdev_dpdk_validate_flow(const struct match *match)
+{
+    struct match match_zero_wc;
+
+    /* Create a wc-zeroed version of flow, so exact-match fields can be
+     * tested through 'match_zero_wc.flow' below. */
+    match_init(&match_zero_wc, &match->flow, &match->wc);
+
+    if (!is_all_zero(&match_zero_wc.flow.tunnel,
+                     sizeof match_zero_wc.flow.tunnel)) {
+        goto err;
+    }
+
+    /* Metadata fields have no rte_flow counterpart. */
+    if (match->wc.masks.metadata ||
+        match->wc.masks.skb_priority ||
+        match->wc.masks.pkt_mark ||
+        match->wc.masks.dp_hash) {
+        goto err;
+    }
+
+    /* recirc id must be zero. */
+    if (match_zero_wc.flow.recirc_id) {
+        goto err;
+    }
+
+    /* Conntrack state cannot be matched in hardware. */
+    if (match->wc.masks.ct_state ||
+        match->wc.masks.ct_nw_proto ||
+        match->wc.masks.ct_zone ||
+        match->wc.masks.ct_mark ||
+        match->wc.masks.ct_label.u64.hi ||
+        match->wc.masks.ct_label.u64.lo) {
+        goto err;
+    }
+
+    if (match->wc.masks.conj_id ||
+        match->wc.masks.actset_output) {
+        goto err;
+    }
+
+    /* Unsupported L2. */
+    if (!is_all_zero(&match->wc.masks.mpls_lse,
+                     sizeof match->wc.masks.mpls_lse)) {
+        goto err;
+    }
+
+    /* Unsupported L3. */
+    if (match->wc.masks.ipv6_label ||
+        match->wc.masks.ct_nw_src ||
+        match->wc.masks.ct_nw_dst ||
+        !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.nd_target, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.nsh, sizeof(struct ovs_key_nsh)) ||
+        !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct eth_addr)) ||
+        !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct eth_addr))) {
+        goto err;
+    }
+
+    /* If fragmented, then don't HW accelerate - for now. */
+    if (match_zero_wc.flow.nw_frag) {
+        goto err;
+    }
+
+    /* Unsupported L4. */
+    if (match->wc.masks.igmp_group_ip4 ||
+        match->wc.masks.ct_tp_src ||
+        match->wc.masks.ct_tp_dst) {
+        goto err;
+    }
+
+    return 0;
+
+err:
+    VLOG_ERR("cannot HW accelerate this flow due to unsupported protocols");
+    return -1;
+}
+
+/*
+ * Destroy the hardware rte_flow 'rte_flow' on 'dev' and, on success, drop
+ * its ufid-to-rte_flow mapping.
+ *
+ * Returns the rte_flow_destroy() return value: 0 on success, negative on
+ * failure (in which case the ufid mapping is left in place).
+ */
+static int
+netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
+                             const ovs_u128 *ufid,
+                             struct rte_flow *rte_flow)
+{
+    struct rte_flow_error error;
+    int ret;
+
+    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
+    if (ret == 0) {
+        ufid_to_rte_flow_disassociate(ufid);
+        VLOG_DBG("removed rte flow %p associated with ufid " UUID_FMT "\n",
+                 rte_flow, UUID_ARGS((struct uuid *)ufid));
+    } else {
+        /* 'error.message' may be left NULL by the PMD; don't pass NULL
+         * to "%s". */
+        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
+                 error.type, error.message ? error.message : "(none)");
+    }
+
+    return ret;
+}
+
+/*
+ * netdev flow API 'flow_put' implementation: offload 'match' + 'actions'
+ * to hardware as an rte_flow identified by 'ufid'.  A modification of an
+ * already-offloaded flow is handled as destroy-then-add.
+ *
+ * Returns 0 on success, negative on failure.  'stats' is ignored.
+ */
+static int
+netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
+                     struct nlattr *actions, size_t actions_len,
+                     const ovs_u128 *ufid, struct offload_info *info,
+                     struct dpif_flow_stats *stats OVS_UNUSED)
+{
+    struct rte_flow *rte_flow;
+    int ret;
+
+    /*
+     * If an old rte_flow exists, it means it's a flow modification.
+     * Here destroy the old rte flow first before adding a new one.
+     */
+    rte_flow = ufid_to_rte_flow_find(ufid);
+    if (rte_flow) {
+        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
+                                           ufid, rte_flow);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    /* Bail out early for matches rte_flow cannot express. */
+    ret = netdev_dpdk_validate_flow(match);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
+                                            actions_len, ufid, info);
+}
+
+/*
+ * netdev flow API 'flow_del' implementation: remove the hardware offload
+ * of the flow identified by 'ufid'.
+ *
+ * Returns 0 on success, -1 if no rte_flow is associated with 'ufid',
+ * or the rte_flow_destroy() error otherwise.  'stats' is ignored.
+ */
+static int
+netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
+                     struct dpif_flow_stats *stats OVS_UNUSED)
+{
+    struct rte_flow *rte_flow = ufid_to_rte_flow_find(ufid);
+
+    if (!rte_flow) {
+        return -1;
+    }
+
+    return netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
+                                        ufid, rte_flow);
+}
+
+/* Flow offload callbacks for DPDK netdev classes, in the order expected by
+ * NETDEV_DPDK_CLASS (replacing NO_OFFLOAD_API).  Only flow_put and flow_del
+ * are implemented; the remaining slots are left NULL, as annotated. */
+#define DPDK_FLOW_OFFLOAD_API                                 \
+    NULL,                   /* flow_flush */                  \
+    NULL,                   /* flow_dump_create */            \
+    NULL,                   /* flow_dump_destroy */           \
+    NULL,                   /* flow_dump_next */              \
+    netdev_dpdk_flow_put,                                     \
+    NULL,                   /* flow_get */                    \
+    netdev_dpdk_flow_del,                                     \
+    NULL                    /* init_flow_api */
+
+
 #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
                           SET_CONFIG, SET_TX_MULTIQ, SEND,    \
                           GET_CARRIER, GET_STATS,			  \
@@ -3783,7 +4344,7 @@  unlock:
     RXQ_RECV,                                                 \
     NULL,                       /* rx_wait */                 \
     NULL,                       /* rxq_drain */               \
-    NO_OFFLOAD_API                                            \
+    DPDK_FLOW_OFFLOAD_API                                     \
 }
 
 static const struct netdev_class dpdk_class =