[ovs-dev,v7,3/6] netdev-dpdk: implement flow offload with rte flow

Message ID 1517209188-16608-4-git-send-email-yliu@fridaylinux.org
State Changes Requested
Delegated to: Ian Stokes
Headers show
Series
  • OVS-DPDK flow offload with rte_flow
Related show

Commit Message

Yuanhan Liu Jan. 29, 2018, 6:59 a.m.
From: Finn Christensen <fc@napatech.com>

The basic yet the major part of this patch is to translate the "match"
to rte flow patterns. And then, we create a rte flow with MARK + RSS
actions. Afterwards, all packets matching the flow will have the mark id in
the mbuf.

The reason RSS is needed is, for most NICs, a MARK only action is not
allowed. It has to be used together with some other actions, such as
QUEUE, RSS, etc. However, QUEUE action can specify one queue only, which
may break the rss. As such, the RSS action is currently the best option
available. Thus, the RSS action is chosen.

For any unsupported flows, such as MPLS, -1 is returned, meaning the
flow offload has failed and the flow is then skipped.

Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
Signed-off-by: Finn Christensen <fc@napatech.com>
Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
---

v7: - set the rss_conf for rss action to NULL, to workaround a mlx5 change
      in DPDK v17.11. Note that it will obey the rss settings OVS-DPDK has
      set in the beginning. Thus, nothing should be affected.

---
 lib/netdev-dpdk.c | 559 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 558 insertions(+), 1 deletion(-)

Comments

Ian Stokes March 12, 2018, 3 p.m. | #1
> -----Original Message-----
> From: Yuanhan Liu [mailto:yliu@fridaylinux.org]
> Sent: Monday, January 29, 2018 7:00 AM
> To: dev@openvswitch.org
> Cc: Stokes, Ian <ian.stokes@intel.com>; Finn Christensen
> <fc@napatech.com>; Darrell Ball <dball@vmware.com>; Chandran, Sugesh
> <sugesh.chandran@intel.com>; Simon Horman <simon.horman@netronome.com>;
> Yuanhan Liu <yliu@fridaylinux.org>
> Subject: [PATCH v7 3/6] netdev-dpdk: implement flow offload with rte flow
> 
> From: Finn Christensen <fc@napatech.com>
> 
> The basic yet the major part of this patch is to translate the "match"
> to rte flow patterns. And then, we create a rte flow with MARK + RSS
> actions. Afterwards, all packets match the flow will have the mark id in
> the mbuf.
> 
> The reason RSS is needed is, for most NICs, a MARK only action is not
> allowed. It has to be used together with some other actions, such as
> QUEUE, RSS, etc. However, QUEUE action can specify one queue only, which
> may break the rss. Likely, RSS action is currently the best we could now.
> Thus, RSS action is choosen.
> 
> For any unsupported flows, such as MPLS, -1 is returned, meaning the flow
> offload is failed and then skipped.
> 
> Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
> Signed-off-by: Finn Christensen <fc@napatech.com>
> Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>

Hi Finn,

Thanks for working on this. There are compilation errors introduced by this commit, details below along with other comments.

For completeness here is the link to the OVS Travis build with associated failures

https://travis-ci.org/istokes/ovs/builds/352283122


Thanks
Ian

> ---
> 
> v7: - set the rss_conf for rss action to NULL, to workaround a mlx5 change
>       in DPDK v17.11. Note that it will obey the rss settings OVS-DPDK has
>       set in the beginning. Thus, nothing should be effected.
> 
> ---
>  lib/netdev-dpdk.c | 559
> +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 558 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index ac2e38e..4bd0503
> 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -38,7 +38,9 @@
>  #include <rte_pci.h>
>  #include <rte_vhost.h>
>  #include <rte_version.h>
> +#include <rte_flow.h>
> 
> +#include "cmap.h"
>  #include "dirs.h"
>  #include "dp-packet.h"
>  #include "dpdk.h"
> @@ -60,6 +62,7 @@
>  #include "sset.h"
>  #include "unaligned.h"
>  #include "timeval.h"
> +#include "uuid.h"
>  #include "unixctl.h"
> 
>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; @@ -3633,6 +3636,560 @@
> unlock:
>      return err;
>  }
> 
> +/*
> + * A mapping from ufid to dpdk rte_flow.
> + */
> +static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
> +
> +struct ufid_to_rte_flow_data {
> +    struct cmap_node node;
> +    ovs_u128 ufid;
> +    struct rte_flow *rte_flow;
> +};

Might be cleaner to declare above along with the other structs/maps specific to netdev-dpdk.c towards the beginning of this file.

> +
> +/* Find rte_flow with @ufid */
> +static struct rte_flow *
> +ufid_to_rte_flow_find(const ovs_u128 *ufid) {
> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> +    struct ufid_to_rte_flow_data *data;
> +
> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
> +        if (ovs_u128_equals(*ufid, data->ufid)) {
> +            return data->rte_flow;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
> +static inline void
> +ufid_to_rte_flow_associate(const ovs_u128 *ufid, struct rte_flow
> +*rte_flow) {
> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> +    struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));
> +
> +    /*
> +     * We should not simply overwrite an existing rte flow.
> +     * We should have deleted it first before re-adding it.
> +     * Thus, if following assert triggers, something is wrong:
> +     * the rte_flow is not destroyed.
> +     */
> +    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);
> +
> +    data->ufid = *ufid;
> +    data->rte_flow = rte_flow;
> +
> +    cmap_insert(&ufid_to_rte_flow,
> +                CONST_CAST(struct cmap_node *, &data->node), hash); }
> +
> +static inline void
> +ufid_to_rte_flow_disassociate(const ovs_u128 *ufid) {
> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> +    struct ufid_to_rte_flow_data *data;
> +
> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
> +        if (ovs_u128_equals(*ufid, data->ufid)) {
> +            cmap_remove(&ufid_to_rte_flow,
> +                        CONST_CAST(struct cmap_node *, &data->node),
> hash);
> +            free(data);
> +            return;
> +        }
> +    }
> +
> +    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte flow\n",
> +              UUID_ARGS((struct uuid *)ufid)); }
> +
> +struct flow_patterns {
> +    struct rte_flow_item *items;
> +    int cnt;
> +    int max;
> +};
> +
> +struct flow_actions {
> +    struct rte_flow_action *actions;
> +    int cnt;
> +    int max;
> +};
> +
> +static void
> +add_flow_pattern(struct flow_patterns *patterns, enum rte_flow_item_type
> type,
> +                 const void *spec, const void *mask) {
> +    int cnt = patterns->cnt;
> +
> +    if (cnt == 0) {
> +        patterns->max = 8;
> +        patterns->items = xcalloc(patterns->max, sizeof(struct
> rte_flow_item));
> +    } else if (cnt == patterns->max) {
> +        patterns->max *= 2;
> +        patterns->items = xrealloc(patterns->items, patterns->max *
> +                                   sizeof(struct rte_flow_item));
> +    }

Just a general query about the max value above: if cnt == 0 you set max to 8. However, if cnt == max you then double the value of max.

What is the purpose of max and what is its limit? Maybe it becomes clearer later in the patch, but at the moment it seems 8 is the default max although it can be higher; I just find the behavior a little misleading.

> +
> +    patterns->items[cnt].type = type;
> +    patterns->items[cnt].spec = spec;
> +    patterns->items[cnt].mask = mask;
> +    patterns->items[cnt].last = NULL;
> +    patterns->cnt++;
> +}
> +
> +static void
> +add_flow_action(struct flow_actions *actions, enum rte_flow_action_type
> type,
> +                const void *conf)
> +{
> +    int cnt = actions->cnt;
> +
> +    if (cnt == 0) {
> +        actions->max = 8;
> +        actions->actions = xcalloc(actions->max,
> +                                   sizeof(struct rte_flow_action));
> +    } else if (cnt == actions->max) {
> +        actions->max *= 2;
> +        actions->actions = xrealloc(actions->actions, actions->max *
> +                                    sizeof(struct rte_flow_action));
> +    }
> +
> +    actions->actions[cnt].type = type;
> +    actions->actions[cnt].conf = conf;
> +    actions->cnt++;
> +}
> +
> +static struct rte_flow_action_rss *
> +add_flow_rss_action(struct flow_actions *actions, struct netdev
> +*netdev) {
> +    int i;
> +    struct rte_flow_action_rss *rss;
> +
> +    rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
> +    /*
> +     * Setting it to NULL will let the driver use the default RSS
> +     * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
> +     */
> +    rss->rss_conf = NULL;
> +    rss->num = netdev->n_rxq;
> +
> +    for (i = 0; i < rss->num; i++) {
> +        rss->queue[i] = i;
> +    }
> +
> +    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);
> +
> +    return rss;
> +}
> +
> +static int
> +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
> +                                 const struct match *match,
> +                                 struct nlattr *nl_actions OVS_UNUSED,
> +                                 size_t actions_len OVS_UNUSED,
> +                                 const ovs_u128 *ufid,
> +                                 struct offload_info *info) {
> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> +    const struct rte_flow_attr flow_attr = {
> +        .group = 0,
> +        .priority = 0,
> +        .ingress = 1,
> +        .egress = 0
> +    };
> +    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
> +    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
> +    struct rte_flow *flow;
> +    struct rte_flow_error error;
> +    uint8_t *ipv4_next_proto_mask = NULL;
> +    int ret = 0;
> +
> +    /* Eth */
> +    struct rte_flow_item_eth eth_spec;
> +    struct rte_flow_item_eth eth_mask;
> +    memset(&eth_spec, 0, sizeof(eth_spec));
> +    memset(&eth_mask, 0, sizeof(eth_mask));
> +    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||

The above line will cause a compilation error: "error: dereferencing pointer to incomplete type 'const struct match'".

> +        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
> +        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst,
> sizeof(eth_spec.dst));
> +        rte_memcpy(&eth_spec.src, &match->flow.dl_src,
> sizeof(eth_spec.src));
> +        eth_spec.type = match->flow.dl_type;
> +
> +        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
> +                   sizeof(eth_mask.dst));
> +        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
> +                   sizeof(eth_mask.src));
> +        eth_mask.type = match->wc.masks.dl_type;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
> +                         &eth_spec, &eth_mask);
> +    } else {
> +        /*
> +         * If user specifies a flow (like UDP flow) without L2 patterns,
> +         * OVS will at least set the dl_type. Normally, it's enough to
> +         * create an eth pattern just with it. Unluckily, some Intel's
> +         * NIC (such as XL710) doesn't support that. Below is a
> workaround,
> +         * which simply matches any L2 pkts.
> +         */
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL, NULL);
> +    }
> +
> +    /* VLAN */
> +    struct rte_flow_item_vlan vlan_spec;
> +    struct rte_flow_item_vlan vlan_mask;
> +    memset(&vlan_spec, 0, sizeof(vlan_spec));
> +    memset(&vlan_mask, 0, sizeof(vlan_mask));
> +    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
> +        vlan_spec.tci  = match->flow.vlans[0].tci;
> +        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
> +
> +        /* match any protocols */
> +        vlan_mask.tpid = 0;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
> +                         &vlan_spec, &vlan_mask);
> +    }
> +
> +    /* IP v4 */
> +    uint8_t proto = 0;
> +    struct rte_flow_item_ipv4 ipv4_spec;
> +    struct rte_flow_item_ipv4 ipv4_mask;
> +    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
> +    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
> +    if (match->flow.dl_type == ntohs(ETH_TYPE_IP) &&
> +        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
> +         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
> +         match->wc.masks.nw_proto)) {
> +        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
> +        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
> +        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
> +        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
> +        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
> +
> +        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
> +        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
> +        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
> +        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
> +        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
> +                         &ipv4_spec, &ipv4_mask);
> +
> +        /* Save proto for L4 protocol setup */
> +        proto = ipv4_spec.hdr.next_proto_id &
> + ipv4_mask.hdr.next_proto_id;
> +
> +        /* Remember proto mask address for later modification */
> +        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
> +    }
> +
> +    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
> +        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
> +        (match->wc.masks.tp_src ||
> +         match->wc.masks.tp_dst ||
> +         match->wc.masks.tcp_flags)) {
> +        VLOG_DBG("L4 Protocol (%u) not supported", proto);
> +        ret = -1;
> +        goto out;
> +    }
> +
> +    if ((match->wc.masks.tp_src && match->wc.masks.tp_src != 0xffff) ||
> +        (match->wc.masks.tp_dst && match->wc.masks.tp_dst != 0xffff)) {
> +        ret = -1;
> +        goto out;
> +    }
> +
> +    struct rte_flow_item_udp udp_spec;
> +    struct rte_flow_item_udp udp_mask;
> +    memset(&udp_spec, 0, sizeof(udp_spec));
> +    memset(&udp_mask, 0, sizeof(udp_mask));
> +    if (proto == IPPROTO_UDP &&
> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> +        udp_spec.hdr.src_port = match->flow.tp_src;
> +        udp_spec.hdr.dst_port = match->flow.tp_dst;
> +
> +        udp_mask.hdr.src_port = match->wc.masks.tp_src;
> +        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
> +                         &udp_spec, &udp_mask);
> +
> +        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match
> */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +    }
> +
> +    struct rte_flow_item_sctp sctp_spec;
> +    struct rte_flow_item_sctp sctp_mask;
> +    memset(&sctp_spec, 0, sizeof(sctp_spec));
> +    memset(&sctp_mask, 0, sizeof(sctp_mask));
> +    if (proto == IPPROTO_SCTP &&
> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> +        sctp_spec.hdr.src_port = match->flow.tp_src;
> +        sctp_spec.hdr.dst_port = match->flow.tp_dst;
> +
> +        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
> +        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
> +                         &sctp_spec, &sctp_mask);
> +
> +        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match
> */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +    }
> +
> +    struct rte_flow_item_icmp icmp_spec;
> +    struct rte_flow_item_icmp icmp_mask;
> +    memset(&icmp_spec, 0, sizeof(icmp_spec));
> +    memset(&icmp_mask, 0, sizeof(icmp_mask));
> +    if (proto == IPPROTO_ICMP &&
> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> +        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
> +        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
> +
> +        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
> +        icmp_mask.hdr.icmp_code =
> + (uint8_t)ntohs(match->wc.masks.tp_dst);
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
> +                         &icmp_spec, &icmp_mask);
> +
> +        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match
> */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +    }
> +
> +    struct rte_flow_item_tcp tcp_spec;
> +    struct rte_flow_item_tcp tcp_mask;
> +    memset(&tcp_spec, 0, sizeof(tcp_spec));
> +    memset(&tcp_mask, 0, sizeof(tcp_mask));
> +    if (proto == IPPROTO_TCP &&
> +        (match->wc.masks.tp_src ||
> +         match->wc.masks.tp_dst ||
> +         match->wc.masks.tcp_flags)) {
> +        tcp_spec.hdr.src_port  = match->flow.tp_src;
> +        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
> +        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
> +        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
> +
> +        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
> +        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
> +        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
> +        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) &
> + 0xff;
> +
> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
> +                         &tcp_spec, &tcp_mask);
> +
> +        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match
> */
> +        if (ipv4_next_proto_mask) {
> +            *ipv4_next_proto_mask = 0;
> +        }
> +    }
> +    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
> +
> +    struct rte_flow_action_mark mark;
> +    mark.id = info->flow_mark;
> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);
> +
> +    struct rte_flow_action_rss *rss;
> +    rss = add_flow_rss_action(&actions, netdev);
> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
> +
> +    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
> +                           actions.actions, &error);
> +    free(rss);
> +    if (!flow) {
> +        VLOG_ERR("rte flow creat error: %u : message : %s\n",
> +                 error.type, error.message);
> +        ret = -1;
> +        goto out;
> +    }
> +    ufid_to_rte_flow_associate(ufid, flow);
> +    VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
> +             flow, UUID_ARGS((struct uuid *)ufid));

A general comment: right now, for a matching flow, this function checks all supported options. E.g. if the protocol is identified as UDP, this will still check for TCP etc.

Could it be refactored similarly to how miniflow extract parses info? I.e. check for the most common protocols first; once a protocol is identified and the relevant info extracted, jump to the end of the function instead of checking against the remaining protocols?

> +
> +out:
> +    free(patterns.items);
> +    free(actions.actions);
> +    return ret;
> +}
> +
> +static bool
> +is_all_zero(const void *addr, size_t n) {
> +    size_t i = 0;
> +    const uint8_t *p = (uint8_t *)addr;
> +
> +    for (i = 0; i < n; i++) {
> +        if (p[i] != 0) {
> +            return false;
> +        }
> +    }
> +
> +    return true;
> +}
> +
> +/*
> + * Check if any unsupported flow patterns are specified.
> + */
> +static int
> +netdev_dpdk_validate_flow(const struct match *match) {
> +    struct match match_zero_wc;

Causes a compilation error: "error: storage size of 'match_zero_wc' isn't known". GCC also complains about the unused variable 'match_zero_wc'.

> +
> +    /* Create a wc-zeroed version of flow */
> +    match_init(&match_zero_wc, &match->flow, &match->wc);

Compilation fails with implicit declaration of function 'match_init'.

> +
> +    if (!is_all_zero(&match_zero_wc.flow.tunnel,
> +                     sizeof(match_zero_wc.flow.tunnel))) {
> +        goto err;
> +    }
> +
> +    if (match->wc.masks.metadata ||
> +        match->wc.masks.skb_priority ||
> +        match->wc.masks.pkt_mark ||
> +        match->wc.masks.dp_hash) {
> +        goto err;
> +    }
> +
> +    /* recirc id must be zero */
> +    if (match_zero_wc.flow.recirc_id) {
> +        goto err;
> +    }
> +
> +    if (match->wc.masks.ct_state ||
> +        match->wc.masks.ct_nw_proto ||
> +        match->wc.masks.ct_zone ||
> +        match->wc.masks.ct_mark ||
> +        match->wc.masks.ct_label.u64.hi ||
> +        match->wc.masks.ct_label.u64.lo) {
> +        goto err;
> +    }
> +
> +    if (match->wc.masks.conj_id ||
> +        match->wc.masks.actset_output) {
> +        goto err;
> +    }
> +
> +    /* unsupported L2 */
> +    if (!is_all_zero(&match->wc.masks.mpls_lse,
> +                     sizeof(match_zero_wc.flow.mpls_lse))) {
> +        goto err;
> +    }
> +
> +    /* unsupported L3 */
> +    if (match->wc.masks.ipv6_label ||
> +        match->wc.masks.ct_nw_src ||
> +        match->wc.masks.ct_nw_dst ||
> +        !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct in6_addr))
> ||
> +        !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr))
> ||
> +        !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct
> in6_addr)) ||
> +        !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct
> in6_addr)) ||
> +        !is_all_zero(&match->wc.masks.nd_target, sizeof(struct in6_addr))
> ||
> +        !is_all_zero(&match->wc.masks.nsh, sizeof(struct ovs_key_nsh)) ||
> +        !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct eth_addr))
> ||
> +        !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct eth_addr)))
> {
> +        goto err;
> +    }
> +
> +    /* If fragmented, then don't HW accelerate - for now */
> +    if (match_zero_wc.flow.nw_frag) {
> +        goto err;
> +    }
> +
> +    /* unsupported L4 */
> +    if (match->wc.masks.igmp_group_ip4 ||
> +        match->wc.masks.ct_tp_src ||
> +        match->wc.masks.ct_tp_dst) {
> +        goto err;
> +    }
> +
> +    return 0;
> +
> +err:
> +    VLOG_ERR("cannot HW accelerate this flow due to unsupported
> protocols");
> +    return -1;
> +}
> +
> +static int
> +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
> +                             const ovs_u128 *ufid,
> +                             struct rte_flow *rte_flow) {
> +    struct rte_flow_error error;
> +    int ret;
> +
> +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
> +    if (ret == 0) {
> +        ufid_to_rte_flow_disassociate(ufid);
> +        VLOG_DBG("removed rte flow %p associated with ufid " UUID_FMT
> "\n",
> +                 rte_flow, UUID_ARGS((struct uuid *)ufid));
> +    } else {
> +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
> +                 error.type, error.message);
> +    }
> +
> +    return ret;
> +}
> +
> +static int
> +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
> +                     struct nlattr *actions, size_t actions_len,
> +                     const ovs_u128 *ufid, struct offload_info *info,
> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
> +    struct rte_flow *rte_flow;
> +    int ret;
> +
> +    /*
> +     * If an old rte_flow exists, it means it's a flow modification.
> +     * Here destroy the old rte flow first before adding a new one.
> +     */
> +    rte_flow = ufid_to_rte_flow_find(ufid);
> +    if (rte_flow) {
> +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
> +                                           ufid, rte_flow);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +    }
> +
> +    ret = netdev_dpdk_validate_flow(match);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
> +                                            actions_len, ufid, info); }
> +
> +static int
> +netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
> +
> +    struct rte_flow *rte_flow = ufid_to_rte_flow_find(ufid);
> +
> +    if (!rte_flow) {
> +        return -1;
> +    }
> +
> +    return netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
> +                                        ufid, rte_flow); }
> +
> +#define DPDK_FLOW_OFFLOAD_API                                 \
> +    NULL,                   /* flow_flush */                  \
> +    NULL,                   /* flow_dump_create */            \
> +    NULL,                   /* flow_dump_destroy */           \
> +    NULL,                   /* flow_dump_next */              \
> +    netdev_dpdk_flow_put,                                     \
> +    NULL,                   /* flow_get */                    \
> +    netdev_dpdk_flow_del,                                     \
> +    NULL                    /* init_flow_api */
> +
> +
>  #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
>                            SET_CONFIG, SET_TX_MULTIQ, SEND,    \
>                            GET_CARRIER, GET_STATS,			  \
> @@ -3707,7 +4264,7 @@ unlock:
>      RXQ_RECV,                                                 \
>      NULL,                       /* rx_wait */                 \
>      NULL,                       /* rxq_drain */               \
> -    NO_OFFLOAD_API                                            \
> +    DPDK_FLOW_OFFLOAD_API                                     \
>  }
> 
>  static const struct netdev_class dpdk_class =
> --
> 2.7.4
Ian Stokes March 15, 2018, 11:56 a.m. | #2
> -----Original Message-----
> From: Stokes, Ian
> Sent: Monday, March 12, 2018 3:00 PM
> To: 'Yuanhan Liu' <yliu@fridaylinux.org>; dev@openvswitch.org
> Cc: Finn Christensen <fc@napatech.com>; Darrell Ball <dball@vmware.com>;
> Chandran, Sugesh <sugesh.chandran@intel.com>; Simon Horman
> <simon.horman@netronome.com>
> Subject: RE: [PATCH v7 3/6] netdev-dpdk: implement flow offload with rte
> flow
> 

Adding Shahaf Shuler.
> > -----Original Message-----
> > From: Yuanhan Liu [mailto:yliu@fridaylinux.org]
> > Sent: Monday, January 29, 2018 7:00 AM
> > To: dev@openvswitch.org
> > Cc: Stokes, Ian <ian.stokes@intel.com>; Finn Christensen
> > <fc@napatech.com>; Darrell Ball <dball@vmware.com>; Chandran, Sugesh
> > <sugesh.chandran@intel.com>; Simon Horman
> > <simon.horman@netronome.com>; Yuanhan Liu <yliu@fridaylinux.org>
> > Subject: [PATCH v7 3/6] netdev-dpdk: implement flow offload with rte
> > flow
> >
> > From: Finn Christensen <fc@napatech.com>
> >
> > The basic yet the major part of this patch is to translate the "match"
> > to rte flow patterns. And then, we create a rte flow with MARK + RSS
> > actions. Afterwards, all packets match the flow will have the mark id
> > in the mbuf.
> >
> > The reason RSS is needed is, for most NICs, a MARK only action is not
> > allowed. It has to be used together with some other actions, such as
> > QUEUE, RSS, etc. However, QUEUE action can specify one queue only,
> > which may break the rss. Likely, RSS action is currently the best we
> could now.
> > Thus, RSS action is choosen.
> >
> > For any unsupported flows, such as MPLS, -1 is returned, meaning the
> > flow offload is failed and then skipped.
> >
> > Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
> > Signed-off-by: Finn Christensen <fc@napatech.com>
> > Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
> 
> Hi Finn,
> 
> Thanks for working on this. There are compilation errors introduced by
> this commit, details below along with other comments.
> 
> For completeness here is the link to the OVS Travis build with associated
> failures
> 
> https://travis-ci.org/istokes/ovs/builds/352283122
> 
> 
> Thanks
> Ian
> 
> > ---
> >
> > v7: - set the rss_conf for rss action to NULL, to workaround a mlx5
> change
> >       in DPDK v17.11. Note that it will obey the rss settings OVS-DPDK
> has
> >       set in the beginning. Thus, nothing should be effected.
> >
> > ---
> >  lib/netdev-dpdk.c | 559
> > +++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 558 insertions(+), 1 deletion(-)
> >
> > diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
> > ac2e38e..4bd0503
> > 100644
> > --- a/lib/netdev-dpdk.c
> > +++ b/lib/netdev-dpdk.c
> > @@ -38,7 +38,9 @@
> >  #include <rte_pci.h>
> >  #include <rte_vhost.h>
> >  #include <rte_version.h>
> > +#include <rte_flow.h>
> >
> > +#include "cmap.h"
> >  #include "dirs.h"
> >  #include "dp-packet.h"
> >  #include "dpdk.h"
> > @@ -60,6 +62,7 @@
> >  #include "sset.h"
> >  #include "unaligned.h"
> >  #include "timeval.h"
> > +#include "uuid.h"
> >  #include "unixctl.h"
> >
> >  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; @@ -3633,6 +3636,560 @@
> > unlock:
> >      return err;
> >  }
> >
> > +/*
> > + * A mapping from ufid to dpdk rte_flow.
> > + */
> > +static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
> > +
> > +struct ufid_to_rte_flow_data {
> > +    struct cmap_node node;
> > +    ovs_u128 ufid;
> > +    struct rte_flow *rte_flow;
> > +};
> 
> Might be cleaner to declare above along with the other structs/maps
> specific to netdev-dpdk.c towards the beginning of this file.
> 
> > +
> > +/* Find rte_flow with @ufid */
> > +static struct rte_flow *
> > +ufid_to_rte_flow_find(const ovs_u128 *ufid) {
> > +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> > +    struct ufid_to_rte_flow_data *data;
> > +
> > +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
> > +        if (ovs_u128_equals(*ufid, data->ufid)) {
> > +            return data->rte_flow;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> > +static inline void
> > +ufid_to_rte_flow_associate(const ovs_u128 *ufid, struct rte_flow
> > +*rte_flow) {
> > +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> > +    struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));
> > +
> > +    /*
> > +     * We should not simply overwrite an existing rte flow.
> > +     * We should have deleted it first before re-adding it.
> > +     * Thus, if following assert triggers, something is wrong:
> > +     * the rte_flow is not destroyed.
> > +     */
> > +    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);
> > +
> > +    data->ufid = *ufid;
> > +    data->rte_flow = rte_flow;
> > +
> > +    cmap_insert(&ufid_to_rte_flow,
> > +                CONST_CAST(struct cmap_node *, &data->node), hash); }
> > +
> > +static inline void
> > +ufid_to_rte_flow_disassociate(const ovs_u128 *ufid) {
> > +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> > +    struct ufid_to_rte_flow_data *data;
> > +
> > +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
> > +        if (ovs_u128_equals(*ufid, data->ufid)) {
> > +            cmap_remove(&ufid_to_rte_flow,
> > +                        CONST_CAST(struct cmap_node *, &data->node),
> > hash);
> > +            free(data);
> > +            return;
> > +        }
> > +    }
> > +
> > +    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte flow\n",
> > +              UUID_ARGS((struct uuid *)ufid)); }
> > +
> > +struct flow_patterns {
> > +    struct rte_flow_item *items;
> > +    int cnt;
> > +    int max;
> > +};
> > +
> > +struct flow_actions {
> > +    struct rte_flow_action *actions;
> > +    int cnt;
> > +    int max;
> > +};
> > +
> > +static void
> > +add_flow_pattern(struct flow_patterns *patterns, enum
> > +rte_flow_item_type
> > type,
> > +                 const void *spec, const void *mask) {
> > +    int cnt = patterns->cnt;
> > +
> > +    if (cnt == 0) {
> > +        patterns->max = 8;
> > +        patterns->items = xcalloc(patterns->max, sizeof(struct
> > rte_flow_item));
> > +    } else if (cnt == patterns->max) {
> > +        patterns->max *= 2;
> > +        patterns->items = xrealloc(patterns->items, patterns->max *
> > +                                   sizeof(struct rte_flow_item));
> > +    }
> 
> Just a general query about max value above, so if cnt == 0 you set the max
> to 8. However if cnt is == max you then double the value of max.
> 
> What is the purpose of max and what is it's limit? Maybe it becomes
> clearer later in the patch but at the moment it seems 8 is the default max
> however it can be higher, I just find the behavior a little misleading.
> 
> > +
> > +    patterns->items[cnt].type = type;
> > +    patterns->items[cnt].spec = spec;
> > +    patterns->items[cnt].mask = mask;
> > +    patterns->items[cnt].last = NULL;
> > +    patterns->cnt++;
> > +}
> > +
> > +static void
> > +add_flow_action(struct flow_actions *actions, enum
> > +rte_flow_action_type
> > type,
> > +                const void *conf)
> > +{
> > +    int cnt = actions->cnt;
> > +
> > +    if (cnt == 0) {
> > +        actions->max = 8;
> > +        actions->actions = xcalloc(actions->max,
> > +                                   sizeof(struct rte_flow_action));
> > +    } else if (cnt == actions->max) {
> > +        actions->max *= 2;
> > +        actions->actions = xrealloc(actions->actions, actions->max *
> > +                                    sizeof(struct rte_flow_action));
> > +    }
> > +
> > +    actions->actions[cnt].type = type;
> > +    actions->actions[cnt].conf = conf;
> > +    actions->cnt++;
> > +}
> > +
> > +static struct rte_flow_action_rss *
> > +add_flow_rss_action(struct flow_actions *actions, struct netdev
> > +*netdev) {
> > +    int i;
> > +    struct rte_flow_action_rss *rss;
> > +
> > +    rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
> > +    /*
> > +     * Setting it to NULL will let the driver use the default RSS
> > +     * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
> > +     */
> > +    rss->rss_conf = NULL;
> > +    rss->num = netdev->n_rxq;
> > +
> > +    for (i = 0; i < rss->num; i++) {
> > +        rss->queue[i] = i;
> > +    }
> > +
> > +    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);
> > +
> > +    return rss;
> > +}
> > +
> > +static int
> > +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
> > +                                 const struct match *match,
> > +                                 struct nlattr *nl_actions OVS_UNUSED,
> > +                                 size_t actions_len OVS_UNUSED,
> > +                                 const ovs_u128 *ufid,
> > +                                 struct offload_info *info) {
> > +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> > +    const struct rte_flow_attr flow_attr = {
> > +        .group = 0,
> > +        .priority = 0,
> > +        .ingress = 1,
> > +        .egress = 0
> > +    };
> > +    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
> > +    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
> > +    struct rte_flow *flow;
> > +    struct rte_flow_error error;
> > +    uint8_t *ipv4_next_proto_mask = NULL;
> > +    int ret = 0;
> > +
> > +    /* Eth */
> > +    struct rte_flow_item_eth eth_spec;
> > +    struct rte_flow_item_eth eth_mask;
> > +    memset(&eth_spec, 0, sizeof(eth_spec));
> > +    memset(&eth_mask, 0, sizeof(eth_mask));
> > +    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
> 
> Above line will cause compilation error error: dereferencing pointer to
> incomplete type 'const struct match'
> 
> > +        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
> > +        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst,
> > sizeof(eth_spec.dst));
> > +        rte_memcpy(&eth_spec.src, &match->flow.dl_src,
> > sizeof(eth_spec.src));
> > +        eth_spec.type = match->flow.dl_type;
> > +
> > +        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
> > +                   sizeof(eth_mask.dst));
> > +        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
> > +                   sizeof(eth_mask.src));
> > +        eth_mask.type = match->wc.masks.dl_type;
> > +
> > +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
> > +                         &eth_spec, &eth_mask);
> > +    } else {
> > +        /*
> > +         * If user specifies a flow (like UDP flow) without L2
> patterns,
> > +         * OVS will at least set the dl_type. Normally, it's enough to
> > +         * create an eth pattern just with it. Unluckily, some Intel's
> > +         * NIC (such as XL710) doesn't support that. Below is a
> > workaround,
> > +         * which simply matches any L2 pkts.
> > +         */
> > +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL,
> NULL);
> > +    }
> > +
> > +    /* VLAN */
> > +    struct rte_flow_item_vlan vlan_spec;
> > +    struct rte_flow_item_vlan vlan_mask;
> > +    memset(&vlan_spec, 0, sizeof(vlan_spec));
> > +    memset(&vlan_mask, 0, sizeof(vlan_mask));
> > +    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
> > +        vlan_spec.tci  = match->flow.vlans[0].tci;
> > +        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
> > +
> > +        /* match any protocols */
> > +        vlan_mask.tpid = 0;
> > +
> > +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
> > +                         &vlan_spec, &vlan_mask);
> > +    }
> > +
> > +    /* IP v4 */
> > +    uint8_t proto = 0;
> > +    struct rte_flow_item_ipv4 ipv4_spec;
> > +    struct rte_flow_item_ipv4 ipv4_mask;
> > +    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
> > +    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
> > +    if (match->flow.dl_type == ntohs(ETH_TYPE_IP) &&
> > +        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
> > +         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
> > +         match->wc.masks.nw_proto)) {
> > +        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
> > +        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
> > +        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
> > +        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
> > +        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
> > +
> > +        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
> > +        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
> > +        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
> > +        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
> > +        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
> > +
> > +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
> > +                         &ipv4_spec, &ipv4_mask);
> > +
> > +        /* Save proto for L4 protocol setup */
> > +        proto = ipv4_spec.hdr.next_proto_id &
> > + ipv4_mask.hdr.next_proto_id;
> > +
> > +        /* Remember proto mask address for later modification */
> > +        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
> > +    }
> > +
> > +    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
> > +        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
> > +        (match->wc.masks.tp_src ||
> > +         match->wc.masks.tp_dst ||
> > +         match->wc.masks.tcp_flags)) {
> > +        VLOG_DBG("L4 Protocol (%u) not supported", proto);
> > +        ret = -1;
> > +        goto out;
> > +    }
> > +
> > +    if ((match->wc.masks.tp_src && match->wc.masks.tp_src != 0xffff) ||
> > +        (match->wc.masks.tp_dst && match->wc.masks.tp_dst != 0xffff)) {
> > +        ret = -1;
> > +        goto out;
> > +    }
> > +
> > +    struct rte_flow_item_udp udp_spec;
> > +    struct rte_flow_item_udp udp_mask;
> > +    memset(&udp_spec, 0, sizeof(udp_spec));
> > +    memset(&udp_mask, 0, sizeof(udp_mask));
> > +    if (proto == IPPROTO_UDP &&
> > +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> > +        udp_spec.hdr.src_port = match->flow.tp_src;
> > +        udp_spec.hdr.dst_port = match->flow.tp_dst;
> > +
> > +        udp_mask.hdr.src_port = match->wc.masks.tp_src;
> > +        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> > +
> > +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
> > +                         &udp_spec, &udp_mask);
> > +
> > +        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto
> > + match
> > */
> > +        if (ipv4_next_proto_mask) {
> > +            *ipv4_next_proto_mask = 0;
> > +        }
> > +    }
> > +
> > +    struct rte_flow_item_sctp sctp_spec;
> > +    struct rte_flow_item_sctp sctp_mask;
> > +    memset(&sctp_spec, 0, sizeof(sctp_spec));
> > +    memset(&sctp_mask, 0, sizeof(sctp_mask));
> > +    if (proto == IPPROTO_SCTP &&
> > +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> > +        sctp_spec.hdr.src_port = match->flow.tp_src;
> > +        sctp_spec.hdr.dst_port = match->flow.tp_dst;
> > +
> > +        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
> > +        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> > +
> > +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
> > +                         &sctp_spec, &sctp_mask);
> > +
> > +        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto
> > + match
> > */
> > +        if (ipv4_next_proto_mask) {
> > +            *ipv4_next_proto_mask = 0;
> > +        }
> > +    }
> > +
> > +    struct rte_flow_item_icmp icmp_spec;
> > +    struct rte_flow_item_icmp icmp_mask;
> > +    memset(&icmp_spec, 0, sizeof(icmp_spec));
> > +    memset(&icmp_mask, 0, sizeof(icmp_mask));
> > +    if (proto == IPPROTO_ICMP &&
> > +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> > +        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
> > +        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
> > +
> > +        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match-
> >wc.masks.tp_src);
> > +        icmp_mask.hdr.icmp_code =
> > + (uint8_t)ntohs(match->wc.masks.tp_dst);
> > +
> > +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
> > +                         &icmp_spec, &icmp_mask);
> > +
> > +        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto
> > + match
> > */
> > +        if (ipv4_next_proto_mask) {
> > +            *ipv4_next_proto_mask = 0;
> > +        }
> > +    }
> > +
> > +    struct rte_flow_item_tcp tcp_spec;
> > +    struct rte_flow_item_tcp tcp_mask;
> > +    memset(&tcp_spec, 0, sizeof(tcp_spec));
> > +    memset(&tcp_mask, 0, sizeof(tcp_mask));
> > +    if (proto == IPPROTO_TCP &&
> > +        (match->wc.masks.tp_src ||
> > +         match->wc.masks.tp_dst ||
> > +         match->wc.masks.tcp_flags)) {
> > +        tcp_spec.hdr.src_port  = match->flow.tp_src;
> > +        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
> > +        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
> > +        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
> > +
> > +        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
> > +        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
> > +        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
> > +        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) &
> > + 0xff;
> > +
> > +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
> > +                         &tcp_spec, &tcp_mask);
> > +
> > +        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto
> > + match
> > */
> > +        if (ipv4_next_proto_mask) {
> > +            *ipv4_next_proto_mask = 0;
> > +        }
> > +    }
> > +    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
> > +
> > +    struct rte_flow_action_mark mark;
> > +    mark.id = info->flow_mark;
> > +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);
> > +
> > +    struct rte_flow_action_rss *rss;
> > +    rss = add_flow_rss_action(&actions, netdev);
> > +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
> > +
> > +    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
> > +                           actions.actions, &error);
> > +    free(rss);
> > +    if (!flow) {
> > +        VLOG_ERR("rte flow creat error: %u : message : %s\n",
> > +                 error.type, error.message);
> > +        ret = -1;
> > +        goto out;
> > +    }
> > +    ufid_to_rte_flow_associate(ufid, flow);
> > +    VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
> > +             flow, UUID_ARGS((struct uuid *)ufid));
> 
> A general comment, right now for a matching flow this function checks all
> supported options. E.g. if the protocol is identified as UDP, this will
> still check for TCP etc.
> 
> Could it be refactored similar to how miniflow extract parses info? i.e.
> check for most common protocols first, once a protocol is identified and
> relevant info extracted then jump to the end of the function instead of
> checking against remaining protocols?
> 
> > +
> > +out:
> > +    free(patterns.items);
> > +    free(actions.actions);
> > +    return ret;
> > +}
> > +
> > +static bool
> > +is_all_zero(const void *addr, size_t n) {
> > +    size_t i = 0;
> > +    const uint8_t *p = (uint8_t *)addr;
> > +
> > +    for (i = 0; i < n; i++) {
> > +        if (p[i] != 0) {
> > +            return false;
> > +        }
> > +    }
> > +
> > +    return true;
> > +}
> > +
> > +/*
> > + * Check if any unsupported flow patterns are specified.
> > + */
> > +static int
> > +netdev_dpdk_validate_flow(const struct match *match) {
> > +    struct match match_zero_wc;
> 
> Causes compilation error error: storage size of 'match_zero_wc' isn't
> known. Also gcc complains with unused variable 'match_zero_wc'.
> 
> > +
> > +    /* Create a wc-zeroed version of flow */
> > +    match_init(&match_zero_wc, &match->flow, &match->wc);
> 
> Compilation fails with implicit declaration of function 'match_init'.
> 
> > +
> > +    if (!is_all_zero(&match_zero_wc.flow.tunnel,
> > +                     sizeof(match_zero_wc.flow.tunnel))) {
> > +        goto err;
> > +    }
> > +
> > +    if (match->wc.masks.metadata ||
> > +        match->wc.masks.skb_priority ||
> > +        match->wc.masks.pkt_mark ||
> > +        match->wc.masks.dp_hash) {
> > +        goto err;
> > +    }
> > +
> > +    /* recirc id must be zero */
> > +    if (match_zero_wc.flow.recirc_id) {
> > +        goto err;
> > +    }
> > +
> > +    if (match->wc.masks.ct_state ||
> > +        match->wc.masks.ct_nw_proto ||
> > +        match->wc.masks.ct_zone ||
> > +        match->wc.masks.ct_mark ||
> > +        match->wc.masks.ct_label.u64.hi ||
> > +        match->wc.masks.ct_label.u64.lo) {
> > +        goto err;
> > +    }
> > +
> > +    if (match->wc.masks.conj_id ||
> > +        match->wc.masks.actset_output) {
> > +        goto err;
> > +    }
> > +
> > +    /* unsupported L2 */
> > +    if (!is_all_zero(&match->wc.masks.mpls_lse,
> > +                     sizeof(match_zero_wc.flow.mpls_lse))) {
> > +        goto err;
> > +    }
> > +
> > +    /* unsupported L3 */
> > +    if (match->wc.masks.ipv6_label ||
> > +        match->wc.masks.ct_nw_src ||
> > +        match->wc.masks.ct_nw_dst ||
> > +        !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct
> > + in6_addr))
> > ||
> > +        !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct
> > + in6_addr))
> > ||
> > +        !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct
> > in6_addr)) ||
> > +        !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct
> > in6_addr)) ||
> > +        !is_all_zero(&match->wc.masks.nd_target, sizeof(struct
> > + in6_addr))
> > ||
> > +        !is_all_zero(&match->wc.masks.nsh, sizeof(struct ovs_key_nsh))
> ||
> > +        !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct
> > + eth_addr))
> > ||
> > +        !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct
> > + eth_addr)))
> > {
> > +        goto err;
> > +    }
> > +
> > +    /* If fragmented, then don't HW accelerate - for now */
> > +    if (match_zero_wc.flow.nw_frag) {
> > +        goto err;
> > +    }
> > +
> > +    /* unsupported L4 */
> > +    if (match->wc.masks.igmp_group_ip4 ||
> > +        match->wc.masks.ct_tp_src ||
> > +        match->wc.masks.ct_tp_dst) {
> > +        goto err;
> > +    }
> > +
> > +    return 0;
> > +
> > +err:
> > +    VLOG_ERR("cannot HW accelerate this flow due to unsupported
> > protocols");
> > +    return -1;
> > +}
> > +
> > +static int
> > +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
> > +                             const ovs_u128 *ufid,
> > +                             struct rte_flow *rte_flow) {
> > +    struct rte_flow_error error;
> > +    int ret;
> > +
> > +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
> > +    if (ret == 0) {
> > +        ufid_to_rte_flow_disassociate(ufid);
> > +        VLOG_DBG("removed rte flow %p associated with ufid " UUID_FMT
> > "\n",
> > +                 rte_flow, UUID_ARGS((struct uuid *)ufid));
> > +    } else {
> > +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
> > +                 error.type, error.message);
> > +    }
> > +
> > +    return ret;
> > +}
> > +
> > +static int
> > +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
> > +                     struct nlattr *actions, size_t actions_len,
> > +                     const ovs_u128 *ufid, struct offload_info *info,
> > +                     struct dpif_flow_stats *stats OVS_UNUSED) {
> > +    struct rte_flow *rte_flow;
> > +    int ret;
> > +
> > +    /*
> > +     * If an old rte_flow exists, it means it's a flow modification.
> > +     * Here destroy the old rte flow first before adding a new one.
> > +     */
> > +    rte_flow = ufid_to_rte_flow_find(ufid);
> > +    if (rte_flow) {
> > +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
> > +                                           ufid, rte_flow);
> > +        if (ret < 0) {
> > +            return ret;
> > +        }
> > +    }
> > +
> > +    ret = netdev_dpdk_validate_flow(match);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
> > +                                            actions_len, ufid, info);
> > + }
> > +
> > +static int
> > +netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
> > +                     struct dpif_flow_stats *stats OVS_UNUSED) {
> > +
> > +    struct rte_flow *rte_flow = ufid_to_rte_flow_find(ufid);
> > +
> > +    if (!rte_flow) {
> > +        return -1;
> > +    }
> > +
> > +    return netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
> > +                                        ufid, rte_flow); }
> > +
> > +#define DPDK_FLOW_OFFLOAD_API                                 \
> > +    NULL,                   /* flow_flush */                  \
> > +    NULL,                   /* flow_dump_create */            \
> > +    NULL,                   /* flow_dump_destroy */           \
> > +    NULL,                   /* flow_dump_next */              \
> > +    netdev_dpdk_flow_put,                                     \
> > +    NULL,                   /* flow_get */                    \
> > +    netdev_dpdk_flow_del,                                     \
> > +    NULL                    /* init_flow_api */
> > +
> > +
> >  #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
> >                            SET_CONFIG, SET_TX_MULTIQ, SEND,    \
> >                            GET_CARRIER, GET_STATS,			  \
> > @@ -3707,7 +4264,7 @@ unlock:
> >      RXQ_RECV,                                                 \
> >      NULL,                       /* rx_wait */                 \
> >      NULL,                       /* rxq_drain */               \
> > -    NO_OFFLOAD_API                                            \
> > +    DPDK_FLOW_OFFLOAD_API                                     \
> >  }
> >
> >  static const struct netdev_class dpdk_class =
> > --
> > 2.7.4
Finn Christensen March 16, 2018, 12:44 p.m. | #3
Hi Ian,

Thanks for the review.
I'll fix the compilation issues and address the comments you raised below.
Please see my answers below.

Thanks
Finn

>-----Original Message-----
>From: Stokes, Ian <ian.stokes@intel.com>
>Sent: 12. marts 2018 16:00
>To: 'Yuanhan Liu' <yliu@fridaylinux.org>; dev@openvswitch.org
>Cc: Finn Christensen <fc@napatech.com>; Darrell Ball <dball@vmware.com>;
>Chandran, Sugesh <sugesh.chandran@intel.com>; Simon Horman
><simon.horman@netronome.com>
>Subject: RE: [PATCH v7 3/6] netdev-dpdk: implement flow offload with rte
>flow
>
>> -----Original Message-----
>> From: Yuanhan Liu [mailto:yliu@fridaylinux.org]
>> Sent: Monday, January 29, 2018 7:00 AM
>> To: dev@openvswitch.org
>> Cc: Stokes, Ian <ian.stokes@intel.com>; Finn Christensen
>> <fc@napatech.com>; Darrell Ball <dball@vmware.com>; Chandran, Sugesh
>> <sugesh.chandran@intel.com>; Simon Horman
>> <simon.horman@netronome.com>; Yuanhan Liu <yliu@fridaylinux.org>
>> Subject: [PATCH v7 3/6] netdev-dpdk: implement flow offload with rte
>> flow
>>
>> From: Finn Christensen <fc@napatech.com>
>>
>> The basic yet the major part of this patch is to translate the "match"
>> to rte flow patterns. And then, we create a rte flow with MARK + RSS
>> actions. Afterwards, all packets match the flow will have the mark id
>> in the mbuf.
>>
>> The reason RSS is needed is, for most NICs, a MARK only action is not
>> allowed. It has to be used together with some other actions, such as
>> QUEUE, RSS, etc. However, QUEUE action can specify one queue only,
>> which may break the rss. Likely, RSS action is currently the best we could
>now.
>> Thus, RSS action is choosen.
>>
>> For any unsupported flows, such as MPLS, -1 is returned, meaning the
>> flow offload is failed and then skipped.
>>
>> Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
>> Signed-off-by: Finn Christensen <fc@napatech.com>
>> Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
>
>Hi Finn,
>
>Thanks for working on this. There are compilation errors introduced by this
>commit, details below along with other comments.
>
>For completeness here is the link to the OVS Travis build with associated
>failures
>
>https://travis-ci.org/istokes/ovs/builds/352283122
>
>
>Thanks
>Ian
>
>> ---
>>
>> v7: - set the rss_conf for rss action to NULL, to workaround a mlx5 change
>>       in DPDK v17.11. Note that it will obey the rss settings OVS-DPDK has
>>       set in the beginning. Thus, nothing should be effected.
>>
>> ---
>>  lib/netdev-dpdk.c | 559
>> +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 558 insertions(+), 1 deletion(-)
>>
>> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>> ac2e38e..4bd0503
>> 100644
>> --- a/lib/netdev-dpdk.c
>> +++ b/lib/netdev-dpdk.c
>> @@ -38,7 +38,9 @@
>>  #include <rte_pci.h>
>>  #include <rte_vhost.h>
>>  #include <rte_version.h>
>> +#include <rte_flow.h>
>>
>> +#include "cmap.h"
>>  #include "dirs.h"
>>  #include "dp-packet.h"
>>  #include "dpdk.h"
>> @@ -60,6 +62,7 @@
>>  #include "sset.h"
>>  #include "unaligned.h"
>>  #include "timeval.h"
>> +#include "uuid.h"
>>  #include "unixctl.h"
>>
>>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; @@ -3633,6 +3636,560
>@@
>> unlock:
>>      return err;
>>  }
>>
>> +/*
>> + * A mapping from ufid to dpdk rte_flow.
>> + */
>> +static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
>> +
>> +struct ufid_to_rte_flow_data {
>> +    struct cmap_node node;
>> +    ovs_u128 ufid;
>> +    struct rte_flow *rte_flow;
>> +};
>
>Might be cleaner to declare above along with the other structs/maps specific
>to netdev-dpdk.c towards the beginning of this file.

Ok done.

>
>> +
>> +/* Find rte_flow with @ufid */
>> +static struct rte_flow *
>> +ufid_to_rte_flow_find(const ovs_u128 *ufid) {
>> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
>> +    struct ufid_to_rte_flow_data *data;
>> +
>> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
>> +        if (ovs_u128_equals(*ufid, data->ufid)) {
>> +            return data->rte_flow;
>> +        }
>> +    }
>> +
>> +    return NULL;
>> +}
>> +
>> +static inline void
>> +ufid_to_rte_flow_associate(const ovs_u128 *ufid, struct rte_flow
>> +*rte_flow) {
>> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
>> +    struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));
>> +
>> +    /*
>> +     * We should not simply overwrite an existing rte flow.
>> +     * We should have deleted it first before re-adding it.
>> +     * Thus, if following assert triggers, something is wrong:
>> +     * the rte_flow is not destroyed.
>> +     */
>> +    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);
>> +
>> +    data->ufid = *ufid;
>> +    data->rte_flow = rte_flow;
>> +
>> +    cmap_insert(&ufid_to_rte_flow,
>> +                CONST_CAST(struct cmap_node *, &data->node), hash); }
>> +
>> +static inline void
>> +ufid_to_rte_flow_disassociate(const ovs_u128 *ufid) {
>> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
>> +    struct ufid_to_rte_flow_data *data;
>> +
>> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
>> +        if (ovs_u128_equals(*ufid, data->ufid)) {
>> +            cmap_remove(&ufid_to_rte_flow,
>> +                        CONST_CAST(struct cmap_node *, &data->node),
>> hash);
>> +            free(data);
>> +            return;
>> +        }
>> +    }
>> +
>> +    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte flow\n",
>> +              UUID_ARGS((struct uuid *)ufid)); }
>> +
>> +struct flow_patterns {
>> +    struct rte_flow_item *items;
>> +    int cnt;
>> +    int max;
>> +};
>> +
>> +struct flow_actions {
>> +    struct rte_flow_action *actions;
>> +    int cnt;
>> +    int max;
>> +};
>> +
>> +static void
>> +add_flow_pattern(struct flow_patterns *patterns, enum
>> +rte_flow_item_type
>> type,
>> +                 const void *spec, const void *mask) {
>> +    int cnt = patterns->cnt;
>> +
>> +    if (cnt == 0) {
>> +        patterns->max = 8;
>> +        patterns->items = xcalloc(patterns->max, sizeof(struct
>> rte_flow_item));
>> +    } else if (cnt == patterns->max) {
>> +        patterns->max *= 2;
>> +        patterns->items = xrealloc(patterns->items, patterns->max *
>> +                                   sizeof(struct rte_flow_item));
>> +    }
>
>Just a general query about max value above, so if cnt == 0 you set the max to
>8. However if cnt is == max you then double the value of max.
>
>What is the purpose of max and what is it's limit? Maybe it becomes clearer
>later in the patch but at the moment it seems 8 is the default max however it
>can be higher, I just find the behavior a little misleading.

The rte flow pattern items are kept in a single allocation that grows with
xrealloc when needed. It starts out with 8 elements and then grows by doubling
the number of elements. This is done to avoid imposing a hard maximum on the
number of items, and also to avoid a separate allocation for each individual
item element — basically to simplify the code.
There is no limit other than when the allocation fails, in which case ovs_abort
is called.
If the "max" wording is a bit misleading, we can change it. Maybe
"current_max" is better?

>
>> +
>> +    patterns->items[cnt].type = type;
>> +    patterns->items[cnt].spec = spec;
>> +    patterns->items[cnt].mask = mask;
>> +    patterns->items[cnt].last = NULL;
>> +    patterns->cnt++;
>> +}
>> +
>> +static void
>> +add_flow_action(struct flow_actions *actions, enum
>> +rte_flow_action_type
>> type,
>> +                const void *conf)
>> +{
>> +    int cnt = actions->cnt;
>> +
>> +    if (cnt == 0) {
>> +        actions->max = 8;
>> +        actions->actions = xcalloc(actions->max,
>> +                                   sizeof(struct rte_flow_action));
>> +    } else if (cnt == actions->max) {
>> +        actions->max *= 2;
>> +        actions->actions = xrealloc(actions->actions, actions->max *
>> +                                    sizeof(struct rte_flow_action));
>> +    }
>> +
>> +    actions->actions[cnt].type = type;
>> +    actions->actions[cnt].conf = conf;
>> +    actions->cnt++;
>> +}
>> +
>> +static struct rte_flow_action_rss *
>> +add_flow_rss_action(struct flow_actions *actions, struct netdev
>> +*netdev) {
>> +    int i;
>> +    struct rte_flow_action_rss *rss;
>> +
>> +    rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
>> +    /*
>> +     * Setting it to NULL will let the driver use the default RSS
>> +     * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
>> +     */
>> +    rss->rss_conf = NULL;
>> +    rss->num = netdev->n_rxq;
>> +
>> +    for (i = 0; i < rss->num; i++) {
>> +        rss->queue[i] = i;
>> +    }
>> +
>> +    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);
>> +
>> +    return rss;
>> +}
>> +
>> +static int
>> +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
>> +                                 const struct match *match,
>> +                                 struct nlattr *nl_actions OVS_UNUSED,
>> +                                 size_t actions_len OVS_UNUSED,
>> +                                 const ovs_u128 *ufid,
>> +                                 struct offload_info *info) {
>> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>> +    const struct rte_flow_attr flow_attr = {
>> +        .group = 0,
>> +        .priority = 0,
>> +        .ingress = 1,
>> +        .egress = 0
>> +    };
>> +    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
>> +    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
>> +    struct rte_flow *flow;
>> +    struct rte_flow_error error;
>> +    uint8_t *ipv4_next_proto_mask = NULL;
>> +    int ret = 0;
>> +
>> +    /* Eth */
>> +    struct rte_flow_item_eth eth_spec;
>> +    struct rte_flow_item_eth eth_mask;
>> +    memset(&eth_spec, 0, sizeof(eth_spec));
>> +    memset(&eth_mask, 0, sizeof(eth_mask));
>> +    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
>
>Above line will cause compilation error error: dereferencing pointer to
>incomplete type 'const struct match'

OK. Modified after rebasing.

>
>> +        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
>> +        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst,
>> sizeof(eth_spec.dst));
>> +        rte_memcpy(&eth_spec.src, &match->flow.dl_src,
>> sizeof(eth_spec.src));
>> +        eth_spec.type = match->flow.dl_type;
>> +
>> +        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
>> +                   sizeof(eth_mask.dst));
>> +        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
>> +                   sizeof(eth_mask.src));
>> +        eth_mask.type = match->wc.masks.dl_type;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
>> +                         &eth_spec, &eth_mask);
>> +    } else {
>> +        /*
>> +         * If user specifies a flow (like UDP flow) without L2 patterns,
>> +         * OVS will at least set the dl_type. Normally, it's enough to
>> +         * create an eth pattern just with it. Unluckily, some Intel's
>> +         * NIC (such as XL710) doesn't support that. Below is a
>> workaround,
>> +         * which simply matches any L2 pkts.
>> +         */
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL,
>NULL);
>> +    }
>> +
>> +    /* VLAN */
>> +    struct rte_flow_item_vlan vlan_spec;
>> +    struct rte_flow_item_vlan vlan_mask;
>> +    memset(&vlan_spec, 0, sizeof(vlan_spec));
>> +    memset(&vlan_mask, 0, sizeof(vlan_mask));
>> +    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
>> +        vlan_spec.tci  = match->flow.vlans[0].tci;
>> +        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
>> +
>> +        /* match any protocols */
>> +        vlan_mask.tpid = 0;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
>> +                         &vlan_spec, &vlan_mask);
>> +    }
>> +
>> +    /* IP v4 */
>> +    uint8_t proto = 0;
>> +    struct rte_flow_item_ipv4 ipv4_spec;
>> +    struct rte_flow_item_ipv4 ipv4_mask;
>> +    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
>> +    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
>> +    if (match->flow.dl_type == ntohs(ETH_TYPE_IP) &&
>> +        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
>> +         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
>> +         match->wc.masks.nw_proto)) {
>> +        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
>> +        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
>> +        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
>> +        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
>> +        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
>> +
>> +        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
>> +        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
>> +        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
>> +        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
>> +        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
>> +                         &ipv4_spec, &ipv4_mask);
>> +
>> +        /* Save proto for L4 protocol setup */
>> +        proto = ipv4_spec.hdr.next_proto_id &
>> + ipv4_mask.hdr.next_proto_id;
>> +
>> +        /* Remember proto mask address for later modification */
>> +        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
>> +    }
>> +
>> +    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
>> +        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
>> +        (match->wc.masks.tp_src ||
>> +         match->wc.masks.tp_dst ||
>> +         match->wc.masks.tcp_flags)) {
>> +        VLOG_DBG("L4 Protocol (%u) not supported", proto);
>> +        ret = -1;
>> +        goto out;
>> +    }
>> +
>> +    if ((match->wc.masks.tp_src && match->wc.masks.tp_src != 0xffff) ||
>> +        (match->wc.masks.tp_dst && match->wc.masks.tp_dst != 0xffff)) {
>> +        ret = -1;
>> +        goto out;
>> +    }
>> +
>> +    struct rte_flow_item_udp udp_spec;
>> +    struct rte_flow_item_udp udp_mask;
>> +    memset(&udp_spec, 0, sizeof(udp_spec));
>> +    memset(&udp_mask, 0, sizeof(udp_mask));
>> +    if (proto == IPPROTO_UDP &&
>> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>> +        udp_spec.hdr.src_port = match->flow.tp_src;
>> +        udp_spec.hdr.dst_port = match->flow.tp_dst;
>> +
>> +        udp_mask.hdr.src_port = match->wc.masks.tp_src;
>> +        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
>> +                         &udp_spec, &udp_mask);
>> +
>> +        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto
>> + match
>> */
>> +        if (ipv4_next_proto_mask) {
>> +            *ipv4_next_proto_mask = 0;
>> +        }
>> +    }
>> +
>> +    struct rte_flow_item_sctp sctp_spec;
>> +    struct rte_flow_item_sctp sctp_mask;
>> +    memset(&sctp_spec, 0, sizeof(sctp_spec));
>> +    memset(&sctp_mask, 0, sizeof(sctp_mask));
>> +    if (proto == IPPROTO_SCTP &&
>> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>> +        sctp_spec.hdr.src_port = match->flow.tp_src;
>> +        sctp_spec.hdr.dst_port = match->flow.tp_dst;
>> +
>> +        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
>> +        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
>> +                         &sctp_spec, &sctp_mask);
>> +
>> +        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto
>> + match
>> */
>> +        if (ipv4_next_proto_mask) {
>> +            *ipv4_next_proto_mask = 0;
>> +        }
>> +    }
>> +
>> +    struct rte_flow_item_icmp icmp_spec;
>> +    struct rte_flow_item_icmp icmp_mask;
>> +    memset(&icmp_spec, 0, sizeof(icmp_spec));
>> +    memset(&icmp_mask, 0, sizeof(icmp_mask));
>> +    if (proto == IPPROTO_ICMP &&
>> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>> +        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
>> +        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
>> +
>> +        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
>> +        icmp_mask.hdr.icmp_code =
>> + (uint8_t)ntohs(match->wc.masks.tp_dst);
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
>> +                         &icmp_spec, &icmp_mask);
>> +
>> +        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto
>> + match
>> */
>> +        if (ipv4_next_proto_mask) {
>> +            *ipv4_next_proto_mask = 0;
>> +        }
>> +    }
>> +
>> +    struct rte_flow_item_tcp tcp_spec;
>> +    struct rte_flow_item_tcp tcp_mask;
>> +    memset(&tcp_spec, 0, sizeof(tcp_spec));
>> +    memset(&tcp_mask, 0, sizeof(tcp_mask));
>> +    if (proto == IPPROTO_TCP &&
>> +        (match->wc.masks.tp_src ||
>> +         match->wc.masks.tp_dst ||
>> +         match->wc.masks.tcp_flags)) {
>> +        tcp_spec.hdr.src_port  = match->flow.tp_src;
>> +        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
>> +        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
>> +        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
>> +
>> +        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
>> +        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
>> +        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
>> +        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) &
>> + 0xff;
>> +
>> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
>> +                         &tcp_spec, &tcp_mask);
>> +
>> +        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto
>> + match
>> */
>> +        if (ipv4_next_proto_mask) {
>> +            *ipv4_next_proto_mask = 0;
>> +        }
>> +    }
>> +    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL,
>NULL);
>> +
>> +    struct rte_flow_action_mark mark;
>> +    mark.id = info->flow_mark;
>> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);
>> +
>> +    struct rte_flow_action_rss *rss;
>> +    rss = add_flow_rss_action(&actions, netdev);
>> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
>> +
>> +    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
>> +                           actions.actions, &error);
>> +    free(rss);
>> +    if (!flow) {
>> +        VLOG_ERR("rte flow creat error: %u : message : %s\n",
>> +                 error.type, error.message);
>> +        ret = -1;
>> +        goto out;
>> +    }
>> +    ufid_to_rte_flow_associate(ufid, flow);
>> +    VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
>> +             flow, UUID_ARGS((struct uuid *)ufid));
>
>A general comment, right now for a matching flow this function checks all
>supported options. E.g. if the protocol is identified as UDP, this will still check
>for TCP etc.
>
>Could it be refactored similar to how miniflow extract parses info? i.e. check
>for most common protocols first, once a protocol is identified and relevant
>info extracted then jump to the end of the function instead of checking
>against remaining protocols?

Yes, that makes sense. 

>
>> +
>> +out:
>> +    free(patterns.items);
>> +    free(actions.actions);
>> +    return ret;
>> +}
>> +
>> +static bool
>> +is_all_zero(const void *addr, size_t n) {
>> +    size_t i = 0;
>> +    const uint8_t *p = (uint8_t *)addr;
>> +
>> +    for (i = 0; i < n; i++) {
>> +        if (p[i] != 0) {
>> +            return false;
>> +        }
>> +    }
>> +
>> +    return true;
>> +}
>> +
>> +/*
>> + * Check if any unsupported flow patterns are specified.
>> + */
>> +static int
>> +netdev_dpdk_validate_flow(const struct match *match) {
>> +    struct match match_zero_wc;
>
>Causes compilation error error: storage size of 'match_zero_wc' isn't known.
>Also gcc complains with unused variable 'match_zero_wc'.
>
>> +
>> +    /* Create a wc-zeroed version of flow */
>> +    match_init(&match_zero_wc, &match->flow, &match->wc);
>
>Compilation fails with implicit declaration of function 'match_init'.

Same as above compilation error.
>
>> +
>> +    if (!is_all_zero(&match_zero_wc.flow.tunnel,
>> +                     sizeof(match_zero_wc.flow.tunnel))) {
>> +        goto err;
>> +    }
>> +
>> +    if (match->wc.masks.metadata ||
>> +        match->wc.masks.skb_priority ||
>> +        match->wc.masks.pkt_mark ||
>> +        match->wc.masks.dp_hash) {
>> +        goto err;
>> +    }
>> +
>> +    /* recirc id must be zero */
>> +    if (match_zero_wc.flow.recirc_id) {
>> +        goto err;
>> +    }
>> +
>> +    if (match->wc.masks.ct_state ||
>> +        match->wc.masks.ct_nw_proto ||
>> +        match->wc.masks.ct_zone ||
>> +        match->wc.masks.ct_mark ||
>> +        match->wc.masks.ct_label.u64.hi ||
>> +        match->wc.masks.ct_label.u64.lo) {
>> +        goto err;
>> +    }
>> +
>> +    if (match->wc.masks.conj_id ||
>> +        match->wc.masks.actset_output) {
>> +        goto err;
>> +    }
>> +
>> +    /* unsupported L2 */
>> +    if (!is_all_zero(&match->wc.masks.mpls_lse,
>> +                     sizeof(match_zero_wc.flow.mpls_lse))) {
>> +        goto err;
>> +    }
>> +
>> +    /* unsupported L3 */
>> +    if (match->wc.masks.ipv6_label ||
>> +        match->wc.masks.ct_nw_src ||
>> +        match->wc.masks.ct_nw_dst ||
>> +        !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct
>> + in6_addr))
>> ||
>> +        !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct
>> + in6_addr))
>> ||
>> +        !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct
>> in6_addr)) ||
>> +        !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct
>> in6_addr)) ||
>> +        !is_all_zero(&match->wc.masks.nd_target, sizeof(struct
>> + in6_addr))
>> ||
>> +        !is_all_zero(&match->wc.masks.nsh, sizeof(struct ovs_key_nsh)) ||
>> +        !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct
>> + eth_addr))
>> ||
>> +        !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct
>> + eth_addr)))
>> {
>> +        goto err;
>> +    }
>> +
>> +    /* If fragmented, then don't HW accelerate - for now */
>> +    if (match_zero_wc.flow.nw_frag) {
>> +        goto err;
>> +    }
>> +
>> +    /* unsupported L4 */
>> +    if (match->wc.masks.igmp_group_ip4 ||
>> +        match->wc.masks.ct_tp_src ||
>> +        match->wc.masks.ct_tp_dst) {
>> +        goto err;
>> +    }
>> +
>> +    return 0;
>> +
>> +err:
>> +    VLOG_ERR("cannot HW accelerate this flow due to unsupported
>> protocols");
>> +    return -1;
>> +}
>> +
>> +static int
>> +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
>> +                             const ovs_u128 *ufid,
>> +                             struct rte_flow *rte_flow) {
>> +    struct rte_flow_error error;
>> +    int ret;
>> +
>> +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
>> +    if (ret == 0) {
>> +        ufid_to_rte_flow_disassociate(ufid);
>> +        VLOG_DBG("removed rte flow %p associated with ufid " UUID_FMT
>> "\n",
>> +                 rte_flow, UUID_ARGS((struct uuid *)ufid));
>> +    } else {
>> +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
>> +                 error.type, error.message);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static int
>> +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
>> +                     struct nlattr *actions, size_t actions_len,
>> +                     const ovs_u128 *ufid, struct offload_info *info,
>> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
>> +    struct rte_flow *rte_flow;
>> +    int ret;
>> +
>> +    /*
>> +     * If an old rte_flow exists, it means it's a flow modification.
>> +     * Here destroy the old rte flow first before adding a new one.
>> +     */
>> +    rte_flow = ufid_to_rte_flow_find(ufid);
>> +    if (rte_flow) {
>> +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
>> +                                           ufid, rte_flow);
>> +        if (ret < 0) {
>> +            return ret;
>> +        }
>> +    }
>> +
>> +    ret = netdev_dpdk_validate_flow(match);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
>> +                                            actions_len, ufid, info);
>> + }
>> +
>> +static int
>> +netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
>> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
>> +
>> +    struct rte_flow *rte_flow = ufid_to_rte_flow_find(ufid);
>> +
>> +    if (!rte_flow) {
>> +        return -1;
>> +    }
>> +
>> +    return netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
>> +                                        ufid, rte_flow); }
>> +
>> +#define DPDK_FLOW_OFFLOAD_API                                 \
>> +    NULL,                   /* flow_flush */                  \
>> +    NULL,                   /* flow_dump_create */            \
>> +    NULL,                   /* flow_dump_destroy */           \
>> +    NULL,                   /* flow_dump_next */              \
>> +    netdev_dpdk_flow_put,                                     \
>> +    NULL,                   /* flow_get */                    \
>> +    netdev_dpdk_flow_del,                                     \
>> +    NULL                    /* init_flow_api */
>> +
>> +
>>  #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
>>                            SET_CONFIG, SET_TX_MULTIQ, SEND,    \
>>                            GET_CARRIER, GET_STATS,			  \
>> @@ -3707,7 +4264,7 @@ unlock:
>>      RXQ_RECV,                                                 \
>>      NULL,                       /* rx_wait */                 \
>>      NULL,                       /* rxq_drain */               \
>> -    NO_OFFLOAD_API                                            \
>> +    DPDK_FLOW_OFFLOAD_API                                     \
>>  }
>>
>>  static const struct netdev_class dpdk_class =
>> --
>> 2.7.4
Ian Stokes March 21, 2018, 1:37 p.m. | #4
> -----Original Message-----
> From: Finn Christensen [mailto:fc@napatech.com]
> Sent: Friday, March 16, 2018 12:44 PM
> To: Stokes, Ian <ian.stokes@intel.com>; 'Yuanhan Liu'
> <yliu@fridaylinux.org>; dev@openvswitch.org; Shahaf Shuler
> <shahafs@mellanox.com>
> Cc: Darrell Ball <dball@vmware.com>; Chandran, Sugesh
> <sugesh.chandran@intel.com>; Simon Horman <simon.horman@netronome.com>
> Subject: RE: [PATCH v7 3/6] netdev-dpdk: implement flow offload with rte
> flow
> 
> Hi Ian,
> 
> Thanks for the review.
> I'll fix the compilation issues and the comments you have below.
> Please see my answers below.
> 
> Thanks
> Finn
> 
> >-----Original Message-----
> >From: Stokes, Ian <ian.stokes@intel.com>
> >Sent: 12. marts 2018 16:00
> >To: 'Yuanhan Liu' <yliu@fridaylinux.org>; dev@openvswitch.org
> >Cc: Finn Christensen <fc@napatech.com>; Darrell Ball
> ><dball@vmware.com>; Chandran, Sugesh <sugesh.chandran@intel.com>; Simon
> >Horman <simon.horman@netronome.com>
> >Subject: RE: [PATCH v7 3/6] netdev-dpdk: implement flow offload with
> >rte flow
> >
> >> -----Original Message-----
> >> From: Yuanhan Liu [mailto:yliu@fridaylinux.org]
> >> Sent: Monday, January 29, 2018 7:00 AM
> >> To: dev@openvswitch.org
> >> Cc: Stokes, Ian <ian.stokes@intel.com>; Finn Christensen
> >> <fc@napatech.com>; Darrell Ball <dball@vmware.com>; Chandran, Sugesh
> >> <sugesh.chandran@intel.com>; Simon Horman
> >> <simon.horman@netronome.com>; Yuanhan Liu <yliu@fridaylinux.org>
> >> Subject: [PATCH v7 3/6] netdev-dpdk: implement flow offload with rte
> >> flow
> >>
> >> From: Finn Christensen <fc@napatech.com>
> >>
> >> The basic yet the major part of this patch is to translate the "match"
> >> to rte flow patterns. And then, we create a rte flow with MARK + RSS
> >> actions. Afterwards, all packets match the flow will have the mark id
> >> in the mbuf.
> >>
> >> The reason RSS is needed is, for most NICs, a MARK only action is not
> >> allowed. It has to be used together with some other actions, such as
> >> QUEUE, RSS, etc. However, QUEUE action can specify one queue only,
> >> which may break the rss. Likely, RSS action is currently the best we
> >> could
> >now.
> >> Thus, RSS action is chosen.
> >>
> >> For any unsupported flows, such as MPLS, -1 is returned, meaning the
> >> flow offload is failed and then skipped.
> >>
> >> Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
> >> Signed-off-by: Finn Christensen <fc@napatech.com>
> >> Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
> >
> >Hi Finn,
> >
> >Thanks for working on this. There are compilation errors introduced by
> >this commit, details below along with other comments.
> >
> >For completeness here is the link to the OVS Travis build with
> >associated failures
> >
> >https://travis-ci.org/istokes/ovs/builds/352283122
> >
> >
> >Thanks
> >Ian
> >
> >> ---
> >>
> >> v7: - set the rss_conf for rss action to NULL, to workaround a mlx5
> change
> >>       in DPDK v17.11. Note that it will obey the rss settings OVS-DPDK
> has
> >>       set in the beginning. Thus, nothing should be effected.
> >>
> >> ---
> >>  lib/netdev-dpdk.c | 559
> >> +++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >>  1 file changed, 558 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
> >> ac2e38e..4bd0503
> >> 100644
> >> --- a/lib/netdev-dpdk.c
> >> +++ b/lib/netdev-dpdk.c
> >> @@ -38,7 +38,9 @@
> >>  #include <rte_pci.h>
> >>  #include <rte_vhost.h>
> >>  #include <rte_version.h>
> >> +#include <rte_flow.h>
> >>
> >> +#include "cmap.h"
> >>  #include "dirs.h"
> >>  #include "dp-packet.h"
> >>  #include "dpdk.h"
> >> @@ -60,6 +62,7 @@
> >>  #include "sset.h"
> >>  #include "unaligned.h"
> >>  #include "timeval.h"
> >> +#include "uuid.h"
> >>  #include "unixctl.h"
> >>
> >>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; @@ -3633,6 +3636,560
> >@@
> >> unlock:
> >>      return err;
> >>  }
> >>
> >> +/*
> >> + * A mapping from ufid to dpdk rte_flow.
> >> + */
> >> +static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
> >> +
> >> +struct ufid_to_rte_flow_data {
> >> +    struct cmap_node node;
> >> +    ovs_u128 ufid;
> >> +    struct rte_flow *rte_flow;
> >> +};
> >
> >Might be cleaner to declare above along with the other structs/maps
> >specific to netdev-dpdk.c towards the beginning of this file.
> 
> Ok done.
> 
> >
> >> +
> >> +/* Find rte_flow with @ufid */
> >> +static struct rte_flow *
> >> +ufid_to_rte_flow_find(const ovs_u128 *ufid) {
> >> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> >> +    struct ufid_to_rte_flow_data *data;
> >> +
> >> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
> >> +        if (ovs_u128_equals(*ufid, data->ufid)) {
> >> +            return data->rte_flow;
> >> +        }
> >> +    }
> >> +
> >> +    return NULL;
> >> +}
> >> +
> >> +static inline void
> >> +ufid_to_rte_flow_associate(const ovs_u128 *ufid, struct rte_flow
> >> +*rte_flow) {
> >> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> >> +    struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));
> >> +
> >> +    /*
> >> +     * We should not simply overwrite an existing rte flow.
> >> +     * We should have deleted it first before re-adding it.
> >> +     * Thus, if following assert triggers, something is wrong:
> >> +     * the rte_flow is not destroyed.
> >> +     */
> >> +    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);
> >> +
> >> +    data->ufid = *ufid;
> >> +    data->rte_flow = rte_flow;
> >> +
> >> +    cmap_insert(&ufid_to_rte_flow,
> >> +                CONST_CAST(struct cmap_node *, &data->node), hash);
> >> + }
> >> +
> >> +static inline void
> >> +ufid_to_rte_flow_disassociate(const ovs_u128 *ufid) {
> >> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> >> +    struct ufid_to_rte_flow_data *data;
> >> +
> >> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
> >> +        if (ovs_u128_equals(*ufid, data->ufid)) {
> >> +            cmap_remove(&ufid_to_rte_flow,
> >> +                        CONST_CAST(struct cmap_node *, &data->node),
> >> hash);
> >> +            free(data);
> >> +            return;
> >> +        }
> >> +    }
> >> +
> >> +    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte flow\n",
> >> +              UUID_ARGS((struct uuid *)ufid)); }
> >> +
> >> +struct flow_patterns {
> >> +    struct rte_flow_item *items;
> >> +    int cnt;
> >> +    int max;
> >> +};
> >> +
> >> +struct flow_actions {
> >> +    struct rte_flow_action *actions;
> >> +    int cnt;
> >> +    int max;
> >> +};
> >> +
> >> +static void
> >> +add_flow_pattern(struct flow_patterns *patterns, enum
> >> +rte_flow_item_type
> >> type,
> >> +                 const void *spec, const void *mask) {
> >> +    int cnt = patterns->cnt;
> >> +
> >> +    if (cnt == 0) {
> >> +        patterns->max = 8;
> >> +        patterns->items = xcalloc(patterns->max, sizeof(struct
> >> rte_flow_item));
> >> +    } else if (cnt == patterns->max) {
> >> +        patterns->max *= 2;
> >> +        patterns->items = xrealloc(patterns->items, patterns->max *
> >> +                                   sizeof(struct rte_flow_item));
> >> +    }
> >
> >Just a general query about max value above, so if cnt == 0 you set the
> >max to 8. However if cnt is == max you then double the value of max.
> >
> >What is the purpose of max and what is it's limit? Maybe it becomes
> >clearer later in the patch but at the moment it seems 8 is the default
> >max however it can be higher, I just find the behavior a little
> misleading.
> 
> The rte flow pattern item array is allocated in one allocation and then
> grows via xrealloc if needed. It starts with 8 elements and then
> grows by doubling the number of elements. This is done to avoid imposing a
> maximum on the number of items, and also to avoid a separate allocation for
> each individual item element — basically to simplify the code.
> There is no limit other than when xalloc fails and then ovs_abort is
> called.
> If the "max" wording is a bit misleading, we can change it. Maybe
> "current_max" is better?

Ah ok, that makes a bit more sense. 'current_max' would be fine with a small comment explaining that we want to avoid individual allocations.

In the case where the xrealloc fails, I can see why allocating on an individual basis would be poor here.

In the case where no more items can be allocated, can we fall back to the previous number allocated and flag an error?

Ian
> 
> >
> >> +
> >> +    patterns->items[cnt].type = type;
> >> +    patterns->items[cnt].spec = spec;
> >> +    patterns->items[cnt].mask = mask;
> >> +    patterns->items[cnt].last = NULL;
> >> +    patterns->cnt++;
> >> +}
> >> +
> >> +static void
> >> +add_flow_action(struct flow_actions *actions, enum
> >> +rte_flow_action_type
> >> type,
> >> +                const void *conf)
> >> +{
> >> +    int cnt = actions->cnt;
> >> +
> >> +    if (cnt == 0) {
> >> +        actions->max = 8;
> >> +        actions->actions = xcalloc(actions->max,
> >> +                                   sizeof(struct rte_flow_action));
> >> +    } else if (cnt == actions->max) {
> >> +        actions->max *= 2;
> >> +        actions->actions = xrealloc(actions->actions, actions->max *
> >> +                                    sizeof(struct rte_flow_action));
> >> +    }
> >> +
> >> +    actions->actions[cnt].type = type;
> >> +    actions->actions[cnt].conf = conf;
> >> +    actions->cnt++;
> >> +}
> >> +
> >> +static struct rte_flow_action_rss *
> >> +add_flow_rss_action(struct flow_actions *actions, struct netdev
> >> +*netdev) {
> >> +    int i;
> >> +    struct rte_flow_action_rss *rss;
> >> +
> >> +    rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
> >> +    /*
> >> +     * Setting it to NULL will let the driver use the default RSS
> >> +     * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
> >> +     */
> >> +    rss->rss_conf = NULL;
> >> +    rss->num = netdev->n_rxq;
> >> +
> >> +    for (i = 0; i < rss->num; i++) {
> >> +        rss->queue[i] = i;
> >> +    }
> >> +
> >> +    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);
> >> +
> >> +    return rss;
> >> +}
> >> +
> >> +static int
> >> +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
> >> +                                 const struct match *match,
> >> +                                 struct nlattr *nl_actions OVS_UNUSED,
> >> +                                 size_t actions_len OVS_UNUSED,
> >> +                                 const ovs_u128 *ufid,
> >> +                                 struct offload_info *info) {
> >> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
> >> +    const struct rte_flow_attr flow_attr = {
> >> +        .group = 0,
> >> +        .priority = 0,
> >> +        .ingress = 1,
> >> +        .egress = 0
> >> +    };
> >> +    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
> >> +    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
> >> +    struct rte_flow *flow;
> >> +    struct rte_flow_error error;
> >> +    uint8_t *ipv4_next_proto_mask = NULL;
> >> +    int ret = 0;
> >> +
> >> +    /* Eth */
> >> +    struct rte_flow_item_eth eth_spec;
> >> +    struct rte_flow_item_eth eth_mask;
> >> +    memset(&eth_spec, 0, sizeof(eth_spec));
> >> +    memset(&eth_mask, 0, sizeof(eth_mask));
> >> +    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
> >
> >Above line will cause compilation error error: dereferencing pointer to
> >incomplete type 'const struct match'
> 
> OK. Modified after rebasing.
> 
> >
> >> +        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
> >> +        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst,
> >> sizeof(eth_spec.dst));
> >> +        rte_memcpy(&eth_spec.src, &match->flow.dl_src,
> >> sizeof(eth_spec.src));
> >> +        eth_spec.type = match->flow.dl_type;
> >> +
> >> +        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
> >> +                   sizeof(eth_mask.dst));
> >> +        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
> >> +                   sizeof(eth_mask.src));
> >> +        eth_mask.type = match->wc.masks.dl_type;
> >> +
> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
> >> +                         &eth_spec, &eth_mask);
> >> +    } else {
> >> +        /*
> >> +         * If user specifies a flow (like UDP flow) without L2
> patterns,
> >> +         * OVS will at least set the dl_type. Normally, it's enough to
> >> +         * create an eth pattern just with it. Unluckily, some Intel's
> >> +         * NIC (such as XL710) doesn't support that. Below is a
> >> workaround,
> >> +         * which simply matches any L2 pkts.
> >> +         */
> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL,
> >NULL);
> >> +    }
> >> +
> >> +    /* VLAN */
> >> +    struct rte_flow_item_vlan vlan_spec;
> >> +    struct rte_flow_item_vlan vlan_mask;
> >> +    memset(&vlan_spec, 0, sizeof(vlan_spec));
> >> +    memset(&vlan_mask, 0, sizeof(vlan_mask));
> >> +    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
> >> +        vlan_spec.tci  = match->flow.vlans[0].tci;
> >> +        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
> >> +
> >> +        /* match any protocols */
> >> +        vlan_mask.tpid = 0;
> >> +
> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
> >> +                         &vlan_spec, &vlan_mask);
> >> +    }
> >> +
> >> +    /* IP v4 */
> >> +    uint8_t proto = 0;
> >> +    struct rte_flow_item_ipv4 ipv4_spec;
> >> +    struct rte_flow_item_ipv4 ipv4_mask;
> >> +    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
> >> +    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
> >> +    if (match->flow.dl_type == ntohs(ETH_TYPE_IP) &&
> >> +        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
> >> +         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
> >> +         match->wc.masks.nw_proto)) {
> >> +        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
> >> +        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
> >> +        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
> >> +        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
> >> +        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
> >> +
> >> +        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
> >> +        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
> >> +        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
> >> +        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
> >> +        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
> >> +
> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
> >> +                         &ipv4_spec, &ipv4_mask);
> >> +
> >> +        /* Save proto for L4 protocol setup */
> >> +        proto = ipv4_spec.hdr.next_proto_id &
> >> + ipv4_mask.hdr.next_proto_id;
> >> +
> >> +        /* Remember proto mask address for later modification */
> >> +        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
> >> +    }
> >> +
> >> +    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
> >> +        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
> >> +        (match->wc.masks.tp_src ||
> >> +         match->wc.masks.tp_dst ||
> >> +         match->wc.masks.tcp_flags)) {
> >> +        VLOG_DBG("L4 Protocol (%u) not supported", proto);
> >> +        ret = -1;
> >> +        goto out;
> >> +    }
> >> +
> >> +    if ((match->wc.masks.tp_src && match->wc.masks.tp_src != 0xffff)
> ||
> >> +        (match->wc.masks.tp_dst && match->wc.masks.tp_dst != 0xffff))
> {
> >> +        ret = -1;
> >> +        goto out;
> >> +    }
> >> +
> >> +    struct rte_flow_item_udp udp_spec;
> >> +    struct rte_flow_item_udp udp_mask;
> >> +    memset(&udp_spec, 0, sizeof(udp_spec));
> >> +    memset(&udp_mask, 0, sizeof(udp_mask));
> >> +    if (proto == IPPROTO_UDP &&
> >> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> >> +        udp_spec.hdr.src_port = match->flow.tp_src;
> >> +        udp_spec.hdr.dst_port = match->flow.tp_dst;
> >> +
> >> +        udp_mask.hdr.src_port = match->wc.masks.tp_src;
> >> +        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> >> +
> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
> >> +                         &udp_spec, &udp_mask);
> >> +
> >> +        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto
> >> + match
> >> */
> >> +        if (ipv4_next_proto_mask) {
> >> +            *ipv4_next_proto_mask = 0;
> >> +        }
> >> +    }
> >> +
> >> +    struct rte_flow_item_sctp sctp_spec;
> >> +    struct rte_flow_item_sctp sctp_mask;
> >> +    memset(&sctp_spec, 0, sizeof(sctp_spec));
> >> +    memset(&sctp_mask, 0, sizeof(sctp_mask));
> >> +    if (proto == IPPROTO_SCTP &&
> >> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> >> +        sctp_spec.hdr.src_port = match->flow.tp_src;
> >> +        sctp_spec.hdr.dst_port = match->flow.tp_dst;
> >> +
> >> +        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
> >> +        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
> >> +
> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
> >> +                         &sctp_spec, &sctp_mask);
> >> +
> >> +        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto
> >> + match
> >> */
> >> +        if (ipv4_next_proto_mask) {
> >> +            *ipv4_next_proto_mask = 0;
> >> +        }
> >> +    }
> >> +
> >> +    struct rte_flow_item_icmp icmp_spec;
> >> +    struct rte_flow_item_icmp icmp_mask;
> >> +    memset(&icmp_spec, 0, sizeof(icmp_spec));
> >> +    memset(&icmp_mask, 0, sizeof(icmp_mask));
> >> +    if (proto == IPPROTO_ICMP &&
> >> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
> >> +        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
> >> +        icmp_spec.hdr.icmp_code =
> >> + (uint8_t)ntohs(match->flow.tp_dst);
> >> +
> >> +        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match-
> >wc.masks.tp_src);
> >> +        icmp_mask.hdr.icmp_code =
> >> + (uint8_t)ntohs(match->wc.masks.tp_dst);
> >> +
> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
> >> +                         &icmp_spec, &icmp_mask);
> >> +
> >> +        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto
> >> + match
> >> */
> >> +        if (ipv4_next_proto_mask) {
> >> +            *ipv4_next_proto_mask = 0;
> >> +        }
> >> +    }
> >> +
> >> +    struct rte_flow_item_tcp tcp_spec;
> >> +    struct rte_flow_item_tcp tcp_mask;
> >> +    memset(&tcp_spec, 0, sizeof(tcp_spec));
> >> +    memset(&tcp_mask, 0, sizeof(tcp_mask));
> >> +    if (proto == IPPROTO_TCP &&
> >> +        (match->wc.masks.tp_src ||
> >> +         match->wc.masks.tp_dst ||
> >> +         match->wc.masks.tcp_flags)) {
> >> +        tcp_spec.hdr.src_port  = match->flow.tp_src;
> >> +        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
> >> +        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
> >> +        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) &
> >> + 0xff;
> >> +
> >> +        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
> >> +        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
> >> +        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >>
> 8;
> >> +        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) &
> >> + 0xff;
> >> +
> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
> >> +                         &tcp_spec, &tcp_mask);
> >> +
> >> +        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto
> >> + match
> >> */
> >> +        if (ipv4_next_proto_mask) {
> >> +            *ipv4_next_proto_mask = 0;
> >> +        }
> >> +    }
> >> +    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL,
> >NULL);
> >> +
> >> +    struct rte_flow_action_mark mark;
> >> +    mark.id = info->flow_mark;
> >> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);
> >> +
> >> +    struct rte_flow_action_rss *rss;
> >> +    rss = add_flow_rss_action(&actions, netdev);
> >> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
> >> +
> >> +    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
> >> +                           actions.actions, &error);
> >> +    free(rss);
> >> +    if (!flow) {
> >> +        VLOG_ERR("rte flow creat error: %u : message : %s\n",
> >> +                 error.type, error.message);
> >> +        ret = -1;
> >> +        goto out;
> >> +    }
> >> +    ufid_to_rte_flow_associate(ufid, flow);
> >> +    VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
> >> +             flow, UUID_ARGS((struct uuid *)ufid));
> >
> >A general comment, right now for a matching flow this function checks
> >all supported options. E.g. if the protocol is identified as UDP, this
> >will still check for TCP etc.
> >
> >Could it be refactored similar to how miniflow extract parses info?
> >i.e. check for most common protocols first, once a protocol is
> >identified and relevant info extracted then jump to the end of the
> >function instead of checking against remaining protocols?
> 
> Yes, that makes sense.
> 
> >
> >> +
> >> +out:
> >> +    free(patterns.items);
> >> +    free(actions.actions);
> >> +    return ret;
> >> +}
> >> +
> >> +static bool
> >> +is_all_zero(const void *addr, size_t n) {
> >> +    size_t i = 0;
> >> +    const uint8_t *p = (uint8_t *)addr;
> >> +
> >> +    for (i = 0; i < n; i++) {
> >> +        if (p[i] != 0) {
> >> +            return false;
> >> +        }
> >> +    }
> >> +
> >> +    return true;
> >> +}
> >> +
> >> +/*
> >> + * Check if any unsupported flow patterns are specified.
> >> + */
> >> +static int
> >> +netdev_dpdk_validate_flow(const struct match *match) {
> >> +    struct match match_zero_wc;
> >
> >Causes compilation error error: storage size of 'match_zero_wc' isn't
> known.
> >Also gcc complains with unused variable 'match_zero_wc'.
> >
> >> +
> >> +    /* Create a wc-zeroed version of flow */
> >> +    match_init(&match_zero_wc, &match->flow, &match->wc);
> >
> >Compilation fails with implicit declaration of function 'match_init'.
> 
> Same as above compilation error.
> >
> >> +
> >> +    if (!is_all_zero(&match_zero_wc.flow.tunnel,
> >> +                     sizeof(match_zero_wc.flow.tunnel))) {
> >> +        goto err;
> >> +    }
> >> +
> >> +    if (match->wc.masks.metadata ||
> >> +        match->wc.masks.skb_priority ||
> >> +        match->wc.masks.pkt_mark ||
> >> +        match->wc.masks.dp_hash) {
> >> +        goto err;
> >> +    }
> >> +
> >> +    /* recirc id must be zero */
> >> +    if (match_zero_wc.flow.recirc_id) {
> >> +        goto err;
> >> +    }
> >> +
> >> +    if (match->wc.masks.ct_state ||
> >> +        match->wc.masks.ct_nw_proto ||
> >> +        match->wc.masks.ct_zone ||
> >> +        match->wc.masks.ct_mark ||
> >> +        match->wc.masks.ct_label.u64.hi ||
> >> +        match->wc.masks.ct_label.u64.lo) {
> >> +        goto err;
> >> +    }
> >> +
> >> +    if (match->wc.masks.conj_id ||
> >> +        match->wc.masks.actset_output) {
> >> +        goto err;
> >> +    }
> >> +
> >> +    /* unsupported L2 */
> >> +    if (!is_all_zero(&match->wc.masks.mpls_lse,
> >> +                     sizeof(match_zero_wc.flow.mpls_lse))) {
> >> +        goto err;
> >> +    }
> >> +
> >> +    /* unsupported L3 */
> >> +    if (match->wc.masks.ipv6_label ||
> >> +        match->wc.masks.ct_nw_src ||
> >> +        match->wc.masks.ct_nw_dst ||
> >> +        !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct
> >> + in6_addr))
> >> ||
> >> +        !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct
> >> + in6_addr))
> >> ||
> >> +        !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct
> >> in6_addr)) ||
> >> +        !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct
> >> in6_addr)) ||
> >> +        !is_all_zero(&match->wc.masks.nd_target, sizeof(struct
> >> + in6_addr))
> >> ||
> >> +        !is_all_zero(&match->wc.masks.nsh, sizeof(struct ovs_key_nsh))
> ||
> >> +        !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct
> >> + eth_addr))
> >> ||
> >> +        !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct
> >> + eth_addr)))
> >> {
> >> +        goto err;
> >> +    }
> >> +
> >> +    /* If fragmented, then don't HW accelerate - for now */
> >> +    if (match_zero_wc.flow.nw_frag) {
> >> +        goto err;
> >> +    }
> >> +
> >> +    /* unsupported L4 */
> >> +    if (match->wc.masks.igmp_group_ip4 ||
> >> +        match->wc.masks.ct_tp_src ||
> >> +        match->wc.masks.ct_tp_dst) {
> >> +        goto err;
> >> +    }
> >> +
> >> +    return 0;
> >> +
> >> +err:
> >> +    VLOG_ERR("cannot HW accelerate this flow due to unsupported
> >> protocols");
> >> +    return -1;
> >> +}
> >> +
> >> +static int
> >> +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
> >> +                             const ovs_u128 *ufid,
> >> +                             struct rte_flow *rte_flow) {
> >> +    struct rte_flow_error error;
> >> +    int ret;
> >> +
> >> +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
> >> +    if (ret == 0) {
> >> +        ufid_to_rte_flow_disassociate(ufid);
> >> +        VLOG_DBG("removed rte flow %p associated with ufid "
> >> + UUID_FMT
> >> "\n",
> >> +                 rte_flow, UUID_ARGS((struct uuid *)ufid));
> >> +    } else {
> >> +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
> >> +                 error.type, error.message);
> >> +    }
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +static int
> >> +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
> >> +                     struct nlattr *actions, size_t actions_len,
> >> +                     const ovs_u128 *ufid, struct offload_info *info,
> >> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
> >> +    struct rte_flow *rte_flow;
> >> +    int ret;
> >> +
> >> +    /*
> >> +     * If an old rte_flow exists, it means it's a flow modification.
> >> +     * Here destroy the old rte flow first before adding a new one.
> >> +     */
> >> +    rte_flow = ufid_to_rte_flow_find(ufid);
> >> +    if (rte_flow) {
> >> +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
> >> +                                           ufid, rte_flow);
> >> +        if (ret < 0) {
> >> +            return ret;
> >> +        }
> >> +    }
> >> +
> >> +    ret = netdev_dpdk_validate_flow(match);
> >> +    if (ret < 0) {
> >> +        return ret;
> >> +    }
> >> +
> >> +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
> >> +                                            actions_len, ufid,
> >> + info); }
> >> +
> >> +static int
> >> +netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
> >> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
> >> +
> >> +    struct rte_flow *rte_flow = ufid_to_rte_flow_find(ufid);
> >> +
> >> +    if (!rte_flow) {
> >> +        return -1;
> >> +    }
> >> +
> >> +    return netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
> >> +                                        ufid, rte_flow); }
> >> +
> >> +#define DPDK_FLOW_OFFLOAD_API                                 \
> >> +    NULL,                   /* flow_flush */                  \
> >> +    NULL,                   /* flow_dump_create */            \
> >> +    NULL,                   /* flow_dump_destroy */           \
> >> +    NULL,                   /* flow_dump_next */              \
> >> +    netdev_dpdk_flow_put,                                     \
> >> +    NULL,                   /* flow_get */                    \
> >> +    netdev_dpdk_flow_del,                                     \
> >> +    NULL                    /* init_flow_api */
> >> +
> >> +
> >>  #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
> >>                            SET_CONFIG, SET_TX_MULTIQ, SEND,    \
> >>                            GET_CARRIER, GET_STATS,  \ @@ -3707,7
> >> +4264,7 @@ unlock:
> >>      RXQ_RECV,                                                 \
> >>      NULL,                       /* rx_wait */                 \
> >>      NULL,                       /* rxq_drain */               \
> >> -    NO_OFFLOAD_API                                            \
> >> +    DPDK_FLOW_OFFLOAD_API                                     \
> >>  }
> >>
> >>  static const struct netdev_class dpdk_class =
> >> --
> >> 2.7.4
> 
> Disclaimer: This email and any files transmitted with it may contain
> confidential information intended for the addressee(s) only. The
> information is not to be surrendered or copied to unauthorized persons. If
> you have received this communication in error, please notify the sender
> immediately and delete this e-mail from your system.
Finn Christensen March 21, 2018, 2:39 p.m. | #5
>-----Original Message-----
>From: Stokes, Ian <ian.stokes@intel.com>
>Sent: 21. marts 2018 14:38
>To: Finn Christensen <fc@napatech.com>; 'Yuanhan Liu'
><yliu@fridaylinux.org>; dev@openvswitch.org; Shahaf Shuler
><shahafs@mellanox.com>
>Cc: Darrell Ball <dball@vmware.com>; Chandran, Sugesh
><sugesh.chandran@intel.com>; Simon Horman
><simon.horman@netronome.com>
>Subject: RE: [PATCH v7 3/6] netdev-dpdk: implement flow offload with rte
>flow
>
>> -----Original Message-----
>> From: Finn Christensen [mailto:fc@napatech.com]
>> Sent: Friday, March 16, 2018 12:44 PM
>> To: Stokes, Ian <ian.stokes@intel.com>; 'Yuanhan Liu'
>> <yliu@fridaylinux.org>; dev@openvswitch.org; Shahaf Shuler
>> <shahafs@mellanox.com>
>> Cc: Darrell Ball <dball@vmware.com>; Chandran, Sugesh
>> <sugesh.chandran@intel.com>; Simon Horman
><simon.horman@netronome.com>
>> Subject: RE: [PATCH v7 3/6] netdev-dpdk: implement flow offload with
>> rte flow
>>
>> Hi Ian,
>>
>> Thanks for the review.
>> I'll fix the compilation issues and the comments you have below.
>> Please see my answers below.
>>
>> Thanks
>> Finn
>>
>> >-----Original Message-----
>> >From: Stokes, Ian <ian.stokes@intel.com>
>> >Sent: 12. marts 2018 16:00
>> >To: 'Yuanhan Liu' <yliu@fridaylinux.org>; dev@openvswitch.org
>> >Cc: Finn Christensen <fc@napatech.com>; Darrell Ball
>> ><dball@vmware.com>; Chandran, Sugesh <sugesh.chandran@intel.com>;
>> >Simon Horman <simon.horman@netronome.com>
>> >Subject: RE: [PATCH v7 3/6] netdev-dpdk: implement flow offload with
>> >rte flow
>> >
>> >> -----Original Message-----
>> >> From: Yuanhan Liu [mailto:yliu@fridaylinux.org]
>> >> Sent: Monday, January 29, 2018 7:00 AM
>> >> To: dev@openvswitch.org
>> >> Cc: Stokes, Ian <ian.stokes@intel.com>; Finn Christensen
>> >> <fc@napatech.com>; Darrell Ball <dball@vmware.com>; Chandran,
>> >> Sugesh <sugesh.chandran@intel.com>; Simon Horman
>> >> <simon.horman@netronome.com>; Yuanhan Liu <yliu@fridaylinux.org>
>> >> Subject: [PATCH v7 3/6] netdev-dpdk: implement flow offload with
>> >> rte flow
>> >>
>> >> From: Finn Christensen <fc@napatech.com>
>> >>
>> >> The basic yet the major part of this patch is to translate the "match"
>> >> to rte flow patterns. And then, we create a rte flow with MARK +
>> >> RSS actions. Afterwards, all packets match the flow will have the
>> >> mark id in the mbuf.
>> >>
>> >> The reason RSS is needed is, for most NICs, a MARK only action is
>> >> not allowed. It has to be used together with some other actions,
>> >> such as QUEUE, RSS, etc. However, QUEUE action can specify one
>> >> queue only, which may break the rss. Likely, RSS action is
>> >> currently the best we could
>> >now.
>> >> Thus, RSS action is choosen.
>> >>
>> >> For any unsupported flows, such as MPLS, -1 is returned, meaning
>> >> the flow offload is failed and then skipped.
>> >>
>> >> Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
>> >> Signed-off-by: Finn Christensen <fc@napatech.com>
>> >> Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
>> >
>> >Hi Finn,
>> >
>> >Thanks for working on this. There are compilation errors introduced
>> >by this commit, details below along with other comments.
>> >
>> >For completeness here is the link to the OVS Travis build with
>> >associated failures
>> >
>> >https://travis-ci.org/istokes/ovs/builds/352283122
>> >
>> >
>> >Thanks
>> >Ian
>> >
>> >> ---
>> >>
>> >> v7: - set the rss_conf for rss action to NULL, to workaround a mlx5
>> change
>> >>       in DPDK v17.11. Note that it will obey the rss settings
>> >> OVS-DPDK
>> has
>> >>       set in the beginning. Thus, nothing should be effected.
>> >>
>> >> ---
>> >>  lib/netdev-dpdk.c | 559
>> >> +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>> >>  1 file changed, 558 insertions(+), 1 deletion(-)
>> >>
>> >> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
>> >> ac2e38e..4bd0503
>> >> 100644
>> >> --- a/lib/netdev-dpdk.c
>> >> +++ b/lib/netdev-dpdk.c
>> >> @@ -38,7 +38,9 @@
>> >>  #include <rte_pci.h>
>> >>  #include <rte_vhost.h>
>> >>  #include <rte_version.h>
>> >> +#include <rte_flow.h>
>> >>
>> >> +#include "cmap.h"
>> >>  #include "dirs.h"
>> >>  #include "dp-packet.h"
>> >>  #include "dpdk.h"
>> >> @@ -60,6 +62,7 @@
>> >>  #include "sset.h"
>> >>  #include "unaligned.h"
>> >>  #include "timeval.h"
>> >> +#include "uuid.h"
>> >>  #include "unixctl.h"
>> >>
>> >>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; @@ -3633,6
>+3636,560
>> >@@
>> >> unlock:
>> >>      return err;
>> >>  }
>> >>
>> >> +/*
>> >> + * A mapping from ufid to dpdk rte_flow.
>> >> + */
>> >> +static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
>> >> +
>> >> +struct ufid_to_rte_flow_data {
>> >> +    struct cmap_node node;
>> >> +    ovs_u128 ufid;
>> >> +    struct rte_flow *rte_flow;
>> >> +};
>> >
>> >Might be cleaner to declare above along with the other structs/maps
>> >specific to netdev-dpdk.c towards the beginning of this file.
>>
>> Ok done.
>>
>> >
>> >> +
>> >> +/* Find rte_flow with @ufid */
>> >> +static struct rte_flow *
>> >> +ufid_to_rte_flow_find(const ovs_u128 *ufid) {
>> >> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
>> >> +    struct ufid_to_rte_flow_data *data;
>> >> +
>> >> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
>&ufid_to_rte_flow) {
>> >> +        if (ovs_u128_equals(*ufid, data->ufid)) {
>> >> +            return data->rte_flow;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    return NULL;
>> >> +}
>> >> +
>> >> +static inline void
>> >> +ufid_to_rte_flow_associate(const ovs_u128 *ufid, struct rte_flow
>> >> +*rte_flow) {
>> >> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
>> >> +    struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));
>> >> +
>> >> +    /*
>> >> +     * We should not simply overwrite an existing rte flow.
>> >> +     * We should have deleted it first before re-adding it.
>> >> +     * Thus, if following assert triggers, something is wrong:
>> >> +     * the rte_flow is not destroyed.
>> >> +     */
>> >> +    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);
>> >> +
>> >> +    data->ufid = *ufid;
>> >> +    data->rte_flow = rte_flow;
>> >> +
>> >> +    cmap_insert(&ufid_to_rte_flow,
>> >> +                CONST_CAST(struct cmap_node *, &data->node),
>> >> + hash); }
>> >> +
>> >> +static inline void
>> >> +ufid_to_rte_flow_disassociate(const ovs_u128 *ufid) {
>> >> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
>> >> +    struct ufid_to_rte_flow_data *data;
>> >> +
>> >> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
>&ufid_to_rte_flow) {
>> >> +        if (ovs_u128_equals(*ufid, data->ufid)) {
>> >> +            cmap_remove(&ufid_to_rte_flow,
>> >> +                        CONST_CAST(struct cmap_node *,
>> >> + &data->node),
>> >> hash);
>> >> +            free(data);
>> >> +            return;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte
>flow\n",
>> >> +              UUID_ARGS((struct uuid *)ufid)); }
>> >> +
>> >> +struct flow_patterns {
>> >> +    struct rte_flow_item *items;
>> >> +    int cnt;
>> >> +    int max;
>> >> +};
>> >> +
>> >> +struct flow_actions {
>> >> +    struct rte_flow_action *actions;
>> >> +    int cnt;
>> >> +    int max;
>> >> +};
>> >> +
>> >> +static void
>> >> +add_flow_pattern(struct flow_patterns *patterns, enum
>> >> +rte_flow_item_type
>> >> type,
>> >> +                 const void *spec, const void *mask) {
>> >> +    int cnt = patterns->cnt;
>> >> +
>> >> +    if (cnt == 0) {
>> >> +        patterns->max = 8;
>> >> +        patterns->items = xcalloc(patterns->max, sizeof(struct
>> >> rte_flow_item));
>> >> +    } else if (cnt == patterns->max) {
>> >> +        patterns->max *= 2;
>> >> +        patterns->items = xrealloc(patterns->items, patterns->max *
>> >> +                                   sizeof(struct rte_flow_item));
>> >> +    }
>> >
>> >Just a general query about max value above, so if cnt == 0 you set
>> >the max to 8. However if cnt is == max you then double the value of max.
>> >
>> >What is the purpose of max and what is it's limit? Maybe it becomes
>> >clearer later in the patch but at the moment it seems 8 is the
>> >default max however it can be higher, I just find the behavior a
>> >little
>> misleading.
>>
>> The rte flow pattern item part is allocated in one allocation and then
>> grows with xrealloc if needed. It initializes with 8 elements and then
>> grows by doubling the number of elements. This is done to avoid a
>> maximum on the number of items and also not allocate for each single
>> item element individually, basically to simplify the code.
>> There is no limit other than when xalloc fails and then ovs_abort is
>> called.
>> If the "max" wording is a bit misleading, we can change it. Maybe
>> "current_max" is better?
>
>Ah ok, that makes a bit more sense. 'current_max' would be fine with a small
>comment explaining that we want to avoid individual allocations.

OK, I'll do that.

>
>In the case where the xrealoc fails I can see why allocating on an individual
>babsis would be poor here.
>
>In the case where no more items can be allocated can we fall back to the
>previous number allocated and flag an error?

It uses xrealloc, and when that fails, it calls out_of_memory and finally aborts.
These allocations are really small, and the number of items will probably be
limited — usually even below 8. Furthermore, if an allocation that small fails,
I think it is OK to have xrealloc flag an out_of_memory; it keeps the code
simpler.
If you still think it is worth implementing code to fail the flow offload
gracefully on an xrealloc failure, let me know.

>
>Ian
>>
>> >
>> >> +
>> >> +    patterns->items[cnt].type = type;
>> >> +    patterns->items[cnt].spec = spec;
>> >> +    patterns->items[cnt].mask = mask;
>> >> +    patterns->items[cnt].last = NULL;
>> >> +    patterns->cnt++;
>> >> +}
>> >> +
>> >> +static void
>> >> +add_flow_action(struct flow_actions *actions, enum
>> >> +rte_flow_action_type
>> >> type,
>> >> +                const void *conf)
>> >> +{
>> >> +    int cnt = actions->cnt;
>> >> +
>> >> +    if (cnt == 0) {
>> >> +        actions->max = 8;
>> >> +        actions->actions = xcalloc(actions->max,
>> >> +                                   sizeof(struct rte_flow_action));
>> >> +    } else if (cnt == actions->max) {
>> >> +        actions->max *= 2;
>> >> +        actions->actions = xrealloc(actions->actions, actions->max *
>> >> +                                    sizeof(struct rte_flow_action));
>> >> +    }
>> >> +
>> >> +    actions->actions[cnt].type = type;
>> >> +    actions->actions[cnt].conf = conf;
>> >> +    actions->cnt++;
>> >> +}
>> >> +
>> >> +static struct rte_flow_action_rss * add_flow_rss_action(struct
>> >> +flow_actions *actions, struct netdev
>> >> +*netdev) {
>> >> +    int i;
>> >> +    struct rte_flow_action_rss *rss;
>> >> +
>> >> +    rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
>> >> +    /*
>> >> +     * Setting it to NULL will let the driver use the default RSS
>> >> +     * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
>> >> +     */
>> >> +    rss->rss_conf = NULL;
>> >> +    rss->num = netdev->n_rxq;
>> >> +
>> >> +    for (i = 0; i < rss->num; i++) {
>> >> +        rss->queue[i] = i;
>> >> +    }
>> >> +
>> >> +    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);
>> >> +
>> >> +    return rss;
>> >> +}
>> >> +
>> >> +static int
>> >> +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
>> >> +                                 const struct match *match,
>> >> +                                 struct nlattr *nl_actions OVS_UNUSED,
>> >> +                                 size_t actions_len OVS_UNUSED,
>> >> +                                 const ovs_u128 *ufid,
>> >> +                                 struct offload_info *info) {
>> >> +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>> >> +    const struct rte_flow_attr flow_attr = {
>> >> +        .group = 0,
>> >> +        .priority = 0,
>> >> +        .ingress = 1,
>> >> +        .egress = 0
>> >> +    };
>> >> +    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
>> >> +    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
>> >> +    struct rte_flow *flow;
>> >> +    struct rte_flow_error error;
>> >> +    uint8_t *ipv4_next_proto_mask = NULL;
>> >> +    int ret = 0;
>> >> +
>> >> +    /* Eth */
>> >> +    struct rte_flow_item_eth eth_spec;
>> >> +    struct rte_flow_item_eth eth_mask;
>> >> +    memset(&eth_spec, 0, sizeof(eth_spec));
>> >> +    memset(&eth_mask, 0, sizeof(eth_mask));
>> >> +    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
>> >
>> >Above line will cause compilation error error: dereferencing pointer
>> >to incomplete type 'const struct match'
>>
>> OK. Modified after rebasing.
>>
>> >
>> >> +        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
>> >> +        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst,
>> >> sizeof(eth_spec.dst));
>> >> +        rte_memcpy(&eth_spec.src, &match->flow.dl_src,
>> >> sizeof(eth_spec.src));
>> >> +        eth_spec.type = match->flow.dl_type;
>> >> +
>> >> +        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
>> >> +                   sizeof(eth_mask.dst));
>> >> +        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
>> >> +                   sizeof(eth_mask.src));
>> >> +        eth_mask.type = match->wc.masks.dl_type;
>> >> +
>> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
>> >> +                         &eth_spec, &eth_mask);
>> >> +    } else {
>> >> +        /*
>> >> +         * If user specifies a flow (like UDP flow) without L2
>> patterns,
>> >> +         * OVS will at least set the dl_type. Normally, it's enough to
>> >> +         * create an eth pattern just with it. Unluckily, some Intel's
>> >> +         * NIC (such as XL710) doesn't support that. Below is a
>> >> workaround,
>> >> +         * which simply matches any L2 pkts.
>> >> +         */
>> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL,
>> >NULL);
>> >> +    }
>> >> +
>> >> +    /* VLAN */
>> >> +    struct rte_flow_item_vlan vlan_spec;
>> >> +    struct rte_flow_item_vlan vlan_mask;
>> >> +    memset(&vlan_spec, 0, sizeof(vlan_spec));
>> >> +    memset(&vlan_mask, 0, sizeof(vlan_mask));
>> >> +    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
>> >> +        vlan_spec.tci  = match->flow.vlans[0].tci;
>> >> +        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
>> >> +
>> >> +        /* match any protocols */
>> >> +        vlan_mask.tpid = 0;
>> >> +
>> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
>> >> +                         &vlan_spec, &vlan_mask);
>> >> +    }
>> >> +
>> >> +    /* IP v4 */
>> >> +    uint8_t proto = 0;
>> >> +    struct rte_flow_item_ipv4 ipv4_spec;
>> >> +    struct rte_flow_item_ipv4 ipv4_mask;
>> >> +    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
>> >> +    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
>> >> +    if (match->flow.dl_type == ntohs(ETH_TYPE_IP) &&
>> >> +        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
>> >> +         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
>> >> +         match->wc.masks.nw_proto)) {
>> >> +        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
>> >> +        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
>> >> +        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
>> >> +        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
>> >> +        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
>> >> +
>> >> +        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
>> >> +        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
>> >> +        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
>> >> +        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
>> >> +        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
>> >> +
>> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
>> >> +                         &ipv4_spec, &ipv4_mask);
>> >> +
>> >> +        /* Save proto for L4 protocol setup */
>> >> +        proto = ipv4_spec.hdr.next_proto_id &
>> >> + ipv4_mask.hdr.next_proto_id;
>> >> +
>> >> +        /* Remember proto mask address for later modification */
>> >> +        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
>> >> +    }
>> >> +
>> >> +    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
>> >> +        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
>> >> +        (match->wc.masks.tp_src ||
>> >> +         match->wc.masks.tp_dst ||
>> >> +         match->wc.masks.tcp_flags)) {
>> >> +        VLOG_DBG("L4 Protocol (%u) not supported", proto);
>> >> +        ret = -1;
>> >> +        goto out;
>> >> +    }
>> >> +
>> >> +    if ((match->wc.masks.tp_src && match->wc.masks.tp_src !=
>> >> + 0xffff)
>> ||
>> >> +        (match->wc.masks.tp_dst && match->wc.masks.tp_dst !=
>> >> + 0xffff))
>> {
>> >> +        ret = -1;
>> >> +        goto out;
>> >> +    }
>> >> +
>> >> +    struct rte_flow_item_udp udp_spec;
>> >> +    struct rte_flow_item_udp udp_mask;
>> >> +    memset(&udp_spec, 0, sizeof(udp_spec));
>> >> +    memset(&udp_mask, 0, sizeof(udp_mask));
>> >> +    if (proto == IPPROTO_UDP &&
>> >> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>> >> +        udp_spec.hdr.src_port = match->flow.tp_src;
>> >> +        udp_spec.hdr.dst_port = match->flow.tp_dst;
>> >> +
>> >> +        udp_mask.hdr.src_port = match->wc.masks.tp_src;
>> >> +        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
>> >> +
>> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
>> >> +                         &udp_spec, &udp_mask);
>> >> +
>> >> +        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto
>> >> + match
>> >> */
>> >> +        if (ipv4_next_proto_mask) {
>> >> +            *ipv4_next_proto_mask = 0;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    struct rte_flow_item_sctp sctp_spec;
>> >> +    struct rte_flow_item_sctp sctp_mask;
>> >> +    memset(&sctp_spec, 0, sizeof(sctp_spec));
>> >> +    memset(&sctp_mask, 0, sizeof(sctp_mask));
>> >> +    if (proto == IPPROTO_SCTP &&
>> >> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>> >> +        sctp_spec.hdr.src_port = match->flow.tp_src;
>> >> +        sctp_spec.hdr.dst_port = match->flow.tp_dst;
>> >> +
>> >> +        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
>> >> +        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
>> >> +
>> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
>> >> +                         &sctp_spec, &sctp_mask);
>> >> +
>> >> +        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for
>> >> + proto match
>> >> */
>> >> +        if (ipv4_next_proto_mask) {
>> >> +            *ipv4_next_proto_mask = 0;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    struct rte_flow_item_icmp icmp_spec;
>> >> +    struct rte_flow_item_icmp icmp_mask;
>> >> +    memset(&icmp_spec, 0, sizeof(icmp_spec));
>> >> +    memset(&icmp_mask, 0, sizeof(icmp_mask));
>> >> +    if (proto == IPPROTO_ICMP &&
>> >> +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>> >> +        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
>> >> +        icmp_spec.hdr.icmp_code =
>> >> + (uint8_t)ntohs(match->flow.tp_dst);
>> >> +
>> >> +        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match-
>> >wc.masks.tp_src);
>> >> +        icmp_mask.hdr.icmp_code =
>> >> + (uint8_t)ntohs(match->wc.masks.tp_dst);
>> >> +
>> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
>> >> +                         &icmp_spec, &icmp_mask);
>> >> +
>> >> +        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for
>> >> + proto match
>> >> */
>> >> +        if (ipv4_next_proto_mask) {
>> >> +            *ipv4_next_proto_mask = 0;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    struct rte_flow_item_tcp tcp_spec;
>> >> +    struct rte_flow_item_tcp tcp_mask;
>> >> +    memset(&tcp_spec, 0, sizeof(tcp_spec));
>> >> +    memset(&tcp_mask, 0, sizeof(tcp_mask));
>> >> +    if (proto == IPPROTO_TCP &&
>> >> +        (match->wc.masks.tp_src ||
>> >> +         match->wc.masks.tp_dst ||
>> >> +         match->wc.masks.tcp_flags)) {
>> >> +        tcp_spec.hdr.src_port  = match->flow.tp_src;
>> >> +        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
>> >> +        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
>> >> +        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) &
>> >> + 0xff;
>> >> +
>> >> +        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
>> >> +        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
>> >> +        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags)
>> >> + >>
>> 8;
>> >> +        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags)
>> >> + & 0xff;
>> >> +
>> >> +        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
>> >> +                         &tcp_spec, &tcp_mask);
>> >> +
>> >> +        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto
>> >> + match
>> >> */
>> >> +        if (ipv4_next_proto_mask) {
>> >> +            *ipv4_next_proto_mask = 0;
>> >> +        }
>> >> +    }
>> >> +    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL,
>> >NULL);
>> >> +
>> >> +    struct rte_flow_action_mark mark;
>> >> +    mark.id = info->flow_mark;
>> >> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK,
>&mark);
>> >> +
>> >> +    struct rte_flow_action_rss *rss;
>> >> +    rss = add_flow_rss_action(&actions, netdev);
>> >> +    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
>> >> +
>> >> +    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
>> >> +                           actions.actions, &error);
>> >> +    free(rss);
>> >> +    if (!flow) {
>> >> +        VLOG_ERR("rte flow creat error: %u : message : %s\n",
>> >> +                 error.type, error.message);
>> >> +        ret = -1;
>> >> +        goto out;
>> >> +    }
>> >> +    ufid_to_rte_flow_associate(ufid, flow);
>> >> +    VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
>> >> +             flow, UUID_ARGS((struct uuid *)ufid));
>> >
>> >A general comment, right now for a matching flow this function checks
>> >all supported options. E.g. if the protocol is identified as UDP,
>> >this will still check for TCP etc.
>> >
>> >Could it be refactored similar to how miniflow extract parses info?
>> >i.e. check for most common protocols first, once a protocol is
>> >identified and relevant info extracted then jump to the end of the
>> >function instead of checking against remaining protocols?
>>
>> Yes, that makes sense.
>>
>> >
>> >> +
>> >> +out:
>> >> +    free(patterns.items);
>> >> +    free(actions.actions);
>> >> +    return ret;
>> >> +}
>> >> +
>> >> +static bool
>> >> +is_all_zero(const void *addr, size_t n) {
>> >> +    size_t i = 0;
>> >> +    const uint8_t *p = (uint8_t *)addr;
>> >> +
>> >> +    for (i = 0; i < n; i++) {
>> >> +        if (p[i] != 0) {
>> >> +            return false;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    return true;
>> >> +}
>> >> +
>> >> +/*
>> >> + * Check if any unsupported flow patterns are specified.
>> >> + */
>> >> +static int
>> >> +netdev_dpdk_validate_flow(const struct match *match) {
>> >> +    struct match match_zero_wc;
>> >
>> >Causes compilation error error: storage size of 'match_zero_wc' isn't
>> known.
>> >Also gcc complains with unused variable 'match_zero_wc'.
>> >
>> >> +
>> >> +    /* Create a wc-zeroed version of flow */
>> >> +    match_init(&match_zero_wc, &match->flow, &match->wc);
>> >
>> >Compilation fails with implicit declaration of function 'match_init'.
>>
>> Same as above compilation error.
>> >
>> >> +
>> >> +    if (!is_all_zero(&match_zero_wc.flow.tunnel,
>> >> +                     sizeof(match_zero_wc.flow.tunnel))) {
>> >> +        goto err;
>> >> +    }
>> >> +
>> >> +    if (match->wc.masks.metadata ||
>> >> +        match->wc.masks.skb_priority ||
>> >> +        match->wc.masks.pkt_mark ||
>> >> +        match->wc.masks.dp_hash) {
>> >> +        goto err;
>> >> +    }
>> >> +
>> >> +    /* recirc id must be zero */
>> >> +    if (match_zero_wc.flow.recirc_id) {
>> >> +        goto err;
>> >> +    }
>> >> +
>> >> +    if (match->wc.masks.ct_state ||
>> >> +        match->wc.masks.ct_nw_proto ||
>> >> +        match->wc.masks.ct_zone ||
>> >> +        match->wc.masks.ct_mark ||
>> >> +        match->wc.masks.ct_label.u64.hi ||
>> >> +        match->wc.masks.ct_label.u64.lo) {
>> >> +        goto err;
>> >> +    }
>> >> +
>> >> +    if (match->wc.masks.conj_id ||
>> >> +        match->wc.masks.actset_output) {
>> >> +        goto err;
>> >> +    }
>> >> +
>> >> +    /* unsupported L2 */
>> >> +    if (!is_all_zero(&match->wc.masks.mpls_lse,
>> >> +                     sizeof(match_zero_wc.flow.mpls_lse))) {
>> >> +        goto err;
>> >> +    }
>> >> +
>> >> +    /* unsupported L3 */
>> >> +    if (match->wc.masks.ipv6_label ||
>> >> +        match->wc.masks.ct_nw_src ||
>> >> +        match->wc.masks.ct_nw_dst ||
>> >> +        !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct
>> >> + in6_addr))
>> >> ||
>> >> +        !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct
>> >> + in6_addr))
>> >> ||
>> >> +        !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct
>> >> in6_addr)) ||
>> >> +        !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct
>> >> in6_addr)) ||
>> >> +        !is_all_zero(&match->wc.masks.nd_target, sizeof(struct
>> >> + in6_addr))
>> >> ||
>> >> +        !is_all_zero(&match->wc.masks.nsh, sizeof(struct
>> >> + ovs_key_nsh))
>> ||
>> >> +        !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct
>> >> + eth_addr))
>> >> ||
>> >> +        !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct
>> >> + eth_addr)))
>> >> {
>> >> +        goto err;
>> >> +    }
>> >> +
>> >> +    /* If fragmented, then don't HW accelerate - for now */
>> >> +    if (match_zero_wc.flow.nw_frag) {
>> >> +        goto err;
>> >> +    }
>> >> +
>> >> +    /* unsupported L4 */
>> >> +    if (match->wc.masks.igmp_group_ip4 ||
>> >> +        match->wc.masks.ct_tp_src ||
>> >> +        match->wc.masks.ct_tp_dst) {
>> >> +        goto err;
>> >> +    }
>> >> +
>> >> +    return 0;
>> >> +
>> >> +err:
>> >> +    VLOG_ERR("cannot HW accelerate this flow due to unsupported
>> >> protocols");
>> >> +    return -1;
>> >> +}
>> >> +
>> >> +static int
>> >> +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
>> >> +                             const ovs_u128 *ufid,
>> >> +                             struct rte_flow *rte_flow) {
>> >> +    struct rte_flow_error error;
>> >> +    int ret;
>> >> +
>> >> +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
>> >> +    if (ret == 0) {
>> >> +        ufid_to_rte_flow_disassociate(ufid);
>> >> +        VLOG_DBG("removed rte flow %p associated with ufid "
>> >> + UUID_FMT
>> >> "\n",
>> >> +                 rte_flow, UUID_ARGS((struct uuid *)ufid));
>> >> +    } else {
>> >> +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
>> >> +                 error.type, error.message);
>> >> +    }
>> >> +
>> >> +    return ret;
>> >> +}
>> >> +
>> >> +static int
>> >> +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
>> >> +                     struct nlattr *actions, size_t actions_len,
>> >> +                     const ovs_u128 *ufid, struct offload_info *info,
>> >> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
>> >> +    struct rte_flow *rte_flow;
>> >> +    int ret;
>> >> +
>> >> +    /*
>> >> +     * If an old rte_flow exists, it means it's a flow modification.
>> >> +     * Here destroy the old rte flow first before adding a new one.
>> >> +     */
>> >> +    rte_flow = ufid_to_rte_flow_find(ufid);
>> >> +    if (rte_flow) {
>> >> +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
>> >> +                                           ufid, rte_flow);
>> >> +        if (ret < 0) {
>> >> +            return ret;
>> >> +        }
>> >> +    }
>> >> +
>> >> +    ret = netdev_dpdk_validate_flow(match);
>> >> +    if (ret < 0) {
>> >> +        return ret;
>> >> +    }
>> >> +
>> >> +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
>> >> +                                            actions_len, ufid,
>> >> + info); }
>> >> +
>> >> +static int
>> >> +netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
>> >> +                     struct dpif_flow_stats *stats OVS_UNUSED) {
>> >> +
>> >> +    struct rte_flow *rte_flow = ufid_to_rte_flow_find(ufid);
>> >> +
>> >> +    if (!rte_flow) {
>> >> +        return -1;
>> >> +    }
>> >> +
>> >> +    return netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
>> >> +                                        ufid, rte_flow); }
>> >> +
>> >> +#define DPDK_FLOW_OFFLOAD_API                                 \
>> >> +    NULL,                   /* flow_flush */                  \
>> >> +    NULL,                   /* flow_dump_create */            \
>> >> +    NULL,                   /* flow_dump_destroy */           \
>> >> +    NULL,                   /* flow_dump_next */              \
>> >> +    netdev_dpdk_flow_put,                                     \
>> >> +    NULL,                   /* flow_get */                    \
>> >> +    netdev_dpdk_flow_del,                                     \
>> >> +    NULL                    /* init_flow_api */
>> >> +
>> >> +
>> >>  #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
>> >>                            SET_CONFIG, SET_TX_MULTIQ, SEND,    \
>> >>                            GET_CARRIER, GET_STATS,  \ @@ -3707,7
>> >> +4264,7 @@ unlock:
>> >>      RXQ_RECV,                                                 \
>> >>      NULL,                       /* rx_wait */                 \
>> >>      NULL,                       /* rxq_drain */               \
>> >> -    NO_OFFLOAD_API                                            \
>> >> +    DPDK_FLOW_OFFLOAD_API                                     \
>> >>  }
>> >>
>> >>  static const struct netdev_class dpdk_class =
>> >> --
>> >> 2.7.4
>>
>> Disclaimer: This email and any files transmitted with it may contain
>> confidential information intended for the addressee(s) only. The
>> information is not to be surrendered or copied to unauthorized
>> persons. If you have received this communication in error, please
>> notify the sender immediately and delete this e-mail from your system.
Ian Stokes March 22, 2018, 9:47 a.m. | #6
> >-----Original Message-----
> >From: Stokes, Ian <ian.stokes@intel.com>
> >Sent: 21. marts 2018 14:38
> >To: Finn Christensen <fc@napatech.com>; 'Yuanhan Liu'
> ><yliu@fridaylinux.org>; dev@openvswitch.org; Shahaf Shuler
> ><shahafs@mellanox.com>
> >Cc: Darrell Ball <dball@vmware.com>; Chandran, Sugesh
> ><sugesh.chandran@intel.com>; Simon Horman <simon.horman@netronome.com>
> >Subject: RE: [PATCH v7 3/6] netdev-dpdk: implement flow offload with
> >rte flow
> >
> >> -----Original Message-----
> >> From: Finn Christensen [mailto:fc@napatech.com]
> >> Sent: Friday, March 16, 2018 12:44 PM
> >> To: Stokes, Ian <ian.stokes@intel.com>; 'Yuanhan Liu'
> >> <yliu@fridaylinux.org>; dev@openvswitch.org; Shahaf Shuler
> >> <shahafs@mellanox.com>
> >> Cc: Darrell Ball <dball@vmware.com>; Chandran, Sugesh
> >> <sugesh.chandran@intel.com>; Simon Horman
> ><simon.horman@netronome.com>
> >> Subject: RE: [PATCH v7 3/6] netdev-dpdk: implement flow offload with
> >> rte flow
> >>
> >> Hi Ian,
> >>
> >> Thanks for the review.
> >> I'll fix the compilation issues and the comments you have below.
> >> Please see my answers below.
> >>
> >> Thanks
> >> Finn
> >>
> >> >-----Original Message-----
> >> >From: Stokes, Ian <ian.stokes@intel.com>
> >> >Sent: 12. marts 2018 16:00
> >> >To: 'Yuanhan Liu' <yliu@fridaylinux.org>; dev@openvswitch.org
> >> >Cc: Finn Christensen <fc@napatech.com>; Darrell Ball
> >> ><dball@vmware.com>; Chandran, Sugesh <sugesh.chandran@intel.com>;
> >> >Simon Horman <simon.horman@netronome.com>
> >> >Subject: RE: [PATCH v7 3/6] netdev-dpdk: implement flow offload with
> >> >rte flow
> >> >
> >> >> -----Original Message-----
> >> >> From: Yuanhan Liu [mailto:yliu@fridaylinux.org]
> >> >> Sent: Monday, January 29, 2018 7:00 AM
> >> >> To: dev@openvswitch.org
> >> >> Cc: Stokes, Ian <ian.stokes@intel.com>; Finn Christensen
> >> >> <fc@napatech.com>; Darrell Ball <dball@vmware.com>; Chandran,
> >> >> Sugesh <sugesh.chandran@intel.com>; Simon Horman
> >> >> <simon.horman@netronome.com>; Yuanhan Liu <yliu@fridaylinux.org>
> >> >> Subject: [PATCH v7 3/6] netdev-dpdk: implement flow offload with
> >> >> rte flow
> >> >>
> >> >> From: Finn Christensen <fc@napatech.com>
> >> >>
> >> >> The basic yet the major part of this patch is to translate the
> "match"
> >> >> to rte flow patterns. And then, we create a rte flow with MARK +
> >> >> RSS actions. Afterwards, all packets match the flow will have the
> >> >> mark id in the mbuf.
> >> >>
> >> >> The reason RSS is needed is, for most NICs, a MARK only action is
> >> >> not allowed. It has to be used together with some other actions,
> >> >> such as QUEUE, RSS, etc. However, QUEUE action can specify one
> >> >> queue only, which may break the rss. Likely, RSS action is
> >> >> currently the best we could
> >> >now.
> >> >> Thus, RSS action is chosen.
> >> >>
> >> >> For any unsupported flows, such as MPLS, -1 is returned, meaning
> >> >> the flow offload is failed and then skipped.
> >> >>
> >> >> Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
> >> >> Signed-off-by: Finn Christensen <fc@napatech.com>
> >> >> Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
> >> >
> >> >Hi Finn,
> >> >
> >> >Thanks for working on this. There are compilation errors introduced
> >> >by this commit, details below along with other comments.
> >> >
> >> >For completeness here is the link to the OVS Travis build with
> >> >associated failures
> >> >
> >> >https://travis-ci.org/istokes/ovs/builds/352283122
> >> >
> >> >
> >> >Thanks
> >> >Ian
> >> >
> >> >> ---
> >> >>
> >> >> v7: - set the rss_conf for rss action to NULL, to workaround a
> >> >> mlx5
> >> change
> >> >>       in DPDK v17.11. Note that it will obey the rss settings
> >> >> OVS-DPDK
> >> has
> >> >>       set in the beginning. Thus, nothing should be affected.
> >> >>
> >> >> ---
> >> >>  lib/netdev-dpdk.c | 559
> >> >> +++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >> >>  1 file changed, 558 insertions(+), 1 deletion(-)
> >> >>
> >> >> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index
> >> >> ac2e38e..4bd0503
> >> >> 100644
> >> >> --- a/lib/netdev-dpdk.c
> >> >> +++ b/lib/netdev-dpdk.c
> >> >> @@ -38,7 +38,9 @@
> >> >>  #include <rte_pci.h>
> >> >>  #include <rte_vhost.h>
> >> >>  #include <rte_version.h>
> >> >> +#include <rte_flow.h>
> >> >>
> >> >> +#include "cmap.h"
> >> >>  #include "dirs.h"
> >> >>  #include "dp-packet.h"
> >> >>  #include "dpdk.h"
> >> >> @@ -60,6 +62,7 @@
> >> >>  #include "sset.h"
> >> >>  #include "unaligned.h"
> >> >>  #include "timeval.h"
> >> >> +#include "uuid.h"
> >> >>  #include "unixctl.h"
> >> >>
> >> >>  enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM}; @@ -3633,6
> >+3636,560
> >> >@@
> >> >> unlock:
> >> >>      return err;
> >> >>  }
> >> >>
> >> >> +/*
> >> >> + * A mapping from ufid to dpdk rte_flow.
> >> >> + */
> >> >> +static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
> >> >> +
> >> >> +struct ufid_to_rte_flow_data {
> >> >> +    struct cmap_node node;
> >> >> +    ovs_u128 ufid;
> >> >> +    struct rte_flow *rte_flow;
> >> >> +};
> >> >
> >> >Might be cleaner to declare above along with the other structs/maps
> >> >specific to netdev-dpdk.c towards the beginning of this file.
> >>
> >> Ok done.
> >>
> >> >
> >> >> +
> >> >> +/* Find rte_flow with @ufid */
> >> >> +static struct rte_flow *
> >> >> +ufid_to_rte_flow_find(const ovs_u128 *ufid) {
> >> >> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> >> >> +    struct ufid_to_rte_flow_data *data;
> >> >> +
> >> >> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
> >&ufid_to_rte_flow) {
> >> >> +        if (ovs_u128_equals(*ufid, data->ufid)) {
> >> >> +            return data->rte_flow;
> >> >> +        }
> >> >> +    }
> >> >> +
> >> >> +    return NULL;
> >> >> +}
> >> >> +
> >> >> +static inline void
> >> >> +ufid_to_rte_flow_associate(const ovs_u128 *ufid, struct rte_flow
> >> >> +*rte_flow) {
> >> >> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> >> >> +    struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));
> >> >> +
> >> >> +    /*
> >> >> +     * We should not simply overwrite an existing rte flow.
> >> >> +     * We should have deleted it first before re-adding it.
> >> >> +     * Thus, if following assert triggers, something is wrong:
> >> >> +     * the rte_flow is not destroyed.
> >> >> +     */
> >> >> +    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);
> >> >> +
> >> >> +    data->ufid = *ufid;
> >> >> +    data->rte_flow = rte_flow;
> >> >> +
> >> >> +    cmap_insert(&ufid_to_rte_flow,
> >> >> +                CONST_CAST(struct cmap_node *, &data->node),
> >> >> + hash); }
> >> >> +
> >> >> +static inline void
> >> >> +ufid_to_rte_flow_disassociate(const ovs_u128 *ufid) {
> >> >> +    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
> >> >> +    struct ufid_to_rte_flow_data *data;
> >> >> +
> >> >> +    CMAP_FOR_EACH_WITH_HASH (data, node, hash,
> >&ufid_to_rte_flow) {
> >> >> +        if (ovs_u128_equals(*ufid, data->ufid)) {
> >> >> +            cmap_remove(&ufid_to_rte_flow,
> >> >> +                        CONST_CAST(struct cmap_node *,
> >> >> + &data->node),
> >> >> hash);
> >> >> +            free(data);
> >> >> +            return;
> >> >> +        }
> >> >> +    }
> >> >> +
> >> >> +    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte
> >flow\n",
> >> >> +              UUID_ARGS((struct uuid *)ufid)); }
> >> >> +
> >> >> +struct flow_patterns {
> >> >> +    struct rte_flow_item *items;
> >> >> +    int cnt;
> >> >> +    int max;
> >> >> +};
> >> >> +
> >> >> +struct flow_actions {
> >> >> +    struct rte_flow_action *actions;
> >> >> +    int cnt;
> >> >> +    int max;
> >> >> +};
> >> >> +
> >> >> +static void
> >> >> +add_flow_pattern(struct flow_patterns *patterns, enum
> >> >> +rte_flow_item_type
> >> >> type,
> >> >> +                 const void *spec, const void *mask) {
> >> >> +    int cnt = patterns->cnt;
> >> >> +
> >> >> +    if (cnt == 0) {
> >> >> +        patterns->max = 8;
> >> >> +        patterns->items = xcalloc(patterns->max, sizeof(struct
> >> >> rte_flow_item));
> >> >> +    } else if (cnt == patterns->max) {
> >> >> +        patterns->max *= 2;
> >> >> +        patterns->items = xrealloc(patterns->items, patterns->max *
> >> >> +                                   sizeof(struct rte_flow_item));
> >> >> +    }
> >> >
> >> >Just a general query about max value above, so if cnt == 0 you set
> >> >the max to 8. However if cnt is == max you then double the value of
> max.
> >> >
> >> >What is the purpose of max and what is it's limit? Maybe it becomes
> >> >clearer later in the patch but at the moment it seems 8 is the
> >> >default max however it can be higher, I just find the behavior a
> >> >little
> >> misleading.
> >>
> >> The rte flow pattern item part is allocated in one allocation and
> >> then grows with xrealloc if needed. It initializes with 8 elements
> >> and then grows by doubling the number of elements. This is done to
> >> avoid a maximum on the number of items and also not allocate for each
> >> single item element individually, basically to simplify the code.
> >> There is no limit other than when xalloc fails and then ovs_abort is
> >> called.
> >> If the "max" wording is a bit misleading, we can change it. Maybe
> >> "current_max" is better?
> >
> >Ah ok, that makes a bit more sense. 'current_max' would be fine with a
> >small comment explaining that we want to avoid individual allocations.
> 
> OK, I'll do that.
> 
> >
> >In the case where the xrealloc fails I can see why allocating on an
> >individual basis would be poor here.
> >
> >In the case where no more items can be allocated can we fall back to
> >the previous number allocated and flag an error?
> 
> It uses xrealloc. And when it fails, it calls out_of_memory and finally
> abort.
> These allocations are really small, and we will probably have a limited
> number of items, even usually below 8. Furthermore, if it fails that small
> an allocation I think it is OK to have xrealloc flag an out_of_memory. It
> makes the code simpler.
> If you still think it is worth implementing code for failing a flow
> offload caused by an xrealloc failure, let me know.
> 

And once we've exhausted memory there's not much that can be done at that point from a user's perspective. OK, this approach as-is is fine. We can always revisit down the road if it does become an issue.

Ian

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index ac2e38e..4bd0503 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -38,7 +38,9 @@ 
 #include <rte_pci.h>
 #include <rte_vhost.h>
 #include <rte_version.h>
+#include <rte_flow.h>
 
+#include "cmap.h"
 #include "dirs.h"
 #include "dp-packet.h"
 #include "dpdk.h"
@@ -60,6 +62,7 @@ 
 #include "sset.h"
 #include "unaligned.h"
 #include "timeval.h"
+#include "uuid.h"
 #include "unixctl.h"
 
 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
@@ -3633,6 +3636,560 @@  unlock:
     return err;
 }
 
+/*
+ * A mapping from ufid to dpdk rte_flow.
+ */
+static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
+
+struct ufid_to_rte_flow_data {
+    struct cmap_node node;
+    ovs_u128 ufid;
+    struct rte_flow *rte_flow;
+};
+
+/* Find rte_flow with @ufid */
+static struct rte_flow *
+ufid_to_rte_flow_find(const ovs_u128 *ufid)
+{
+    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
+    struct ufid_to_rte_flow_data *data;
+
+    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
+        if (ovs_u128_equals(*ufid, data->ufid)) {
+            return data->rte_flow;
+        }
+    }
+
+    return NULL;
+}
+
+static inline void
+ufid_to_rte_flow_associate(const ovs_u128 *ufid, struct rte_flow *rte_flow)
+{
+    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
+    struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));
+
+    /*
+     * We should not simply overwrite an existing rte flow.
+     * We should have deleted it first before re-adding it.
+     * Thus, if following assert triggers, something is wrong:
+     * the rte_flow is not destroyed.
+     */
+    ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);
+
+    data->ufid = *ufid;
+    data->rte_flow = rte_flow;
+
+    cmap_insert(&ufid_to_rte_flow,
+                CONST_CAST(struct cmap_node *, &data->node), hash);
+}
+
+static inline void
+ufid_to_rte_flow_disassociate(const ovs_u128 *ufid)
+{
+    size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
+    struct ufid_to_rte_flow_data *data;
+
+    CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
+        if (ovs_u128_equals(*ufid, data->ufid)) {
+            cmap_remove(&ufid_to_rte_flow,
+                        CONST_CAST(struct cmap_node *, &data->node), hash);
+            free(data);
+            return;
+        }
+    }
+
+    VLOG_WARN("ufid "UUID_FMT" is not associated with an rte flow\n",
+              UUID_ARGS((struct uuid *)ufid));
+}
+
+struct flow_patterns {
+    struct rte_flow_item *items;
+    int cnt;
+    int max;
+};
+
+struct flow_actions {
+    struct rte_flow_action *actions;
+    int cnt;
+    int max;
+};
+
+static void
+add_flow_pattern(struct flow_patterns *patterns, enum rte_flow_item_type type,
+                 const void *spec, const void *mask)
+{
+    int cnt = patterns->cnt;
+
+    if (cnt == 0) {
+        patterns->max = 8;
+        patterns->items = xcalloc(patterns->max, sizeof(struct rte_flow_item));
+    } else if (cnt == patterns->max) {
+        patterns->max *= 2;
+        patterns->items = xrealloc(patterns->items, patterns->max *
+                                   sizeof(struct rte_flow_item));
+    }
+
+    patterns->items[cnt].type = type;
+    patterns->items[cnt].spec = spec;
+    patterns->items[cnt].mask = mask;
+    patterns->items[cnt].last = NULL;
+    patterns->cnt++;
+}
+
+static void
+add_flow_action(struct flow_actions *actions, enum rte_flow_action_type type,
+                const void *conf)
+{
+    int cnt = actions->cnt;
+
+    if (cnt == 0) {
+        actions->max = 8;
+        actions->actions = xcalloc(actions->max,
+                                   sizeof(struct rte_flow_action));
+    } else if (cnt == actions->max) {
+        actions->max *= 2;
+        actions->actions = xrealloc(actions->actions, actions->max *
+                                    sizeof(struct rte_flow_action));
+    }
+
+    actions->actions[cnt].type = type;
+    actions->actions[cnt].conf = conf;
+    actions->cnt++;
+}
+
+static struct rte_flow_action_rss *
+add_flow_rss_action(struct flow_actions *actions, struct netdev *netdev)
+{
+    int i;
+    struct rte_flow_action_rss *rss;
+
+    rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
+    /*
+     * Setting it to NULL will let the driver use the default RSS
+     * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
+     */
+    rss->rss_conf = NULL;
+    rss->num = netdev->n_rxq;
+
+    for (i = 0; i < rss->num; i++) {
+        rss->queue[i] = i;
+    }
+
+    add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);
+
+    return rss;
+}
+
+static int
+netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
+                                 const struct match *match,
+                                 struct nlattr *nl_actions OVS_UNUSED,
+                                 size_t actions_len OVS_UNUSED,
+                                 const ovs_u128 *ufid,
+                                 struct offload_info *info)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+    const struct rte_flow_attr flow_attr = {
+        .group = 0,
+        .priority = 0,
+        .ingress = 1,
+        .egress = 0
+    };
+    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
+    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
+    struct rte_flow *flow;
+    struct rte_flow_error error;
+    uint8_t *ipv4_next_proto_mask = NULL;
+    int ret = 0;
+
+    /* Eth */
+    struct rte_flow_item_eth eth_spec;
+    struct rte_flow_item_eth eth_mask;
+    memset(&eth_spec, 0, sizeof(eth_spec));
+    memset(&eth_mask, 0, sizeof(eth_mask));
+    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
+        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
+        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst));
+        rte_memcpy(&eth_spec.src, &match->flow.dl_src, sizeof(eth_spec.src));
+        eth_spec.type = match->flow.dl_type;
+
+        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
+                   sizeof(eth_mask.dst));
+        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
+                   sizeof(eth_mask.src));
+        eth_mask.type = match->wc.masks.dl_type;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
+                         &eth_spec, &eth_mask);
+    } else {
+        /*
+         * If user specifies a flow (like UDP flow) without L2 patterns,
+         * OVS will at least set the dl_type. Normally, it's enough to
+         * create an eth pattern just with it. Unluckily, some Intel's
+         * NIC (such as XL710) doesn't support that. Below is a workaround,
+         * which simply matches any L2 pkts.
+         */
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL, NULL);
+    }
+
+    /* VLAN */
+    struct rte_flow_item_vlan vlan_spec;
+    struct rte_flow_item_vlan vlan_mask;
+    memset(&vlan_spec, 0, sizeof(vlan_spec));
+    memset(&vlan_mask, 0, sizeof(vlan_mask));
+    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
+        vlan_spec.tci  = match->flow.vlans[0].tci;
+        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
+
+        /* match any protocols */
+        vlan_mask.tpid = 0;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
+                         &vlan_spec, &vlan_mask);
+    }
+
+    /* IP v4 */
+    uint8_t proto = 0;
+    struct rte_flow_item_ipv4 ipv4_spec;
+    struct rte_flow_item_ipv4 ipv4_mask;
+    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
+    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
+    if (match->flow.dl_type == ntohs(ETH_TYPE_IP) &&
+        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
+         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
+         match->wc.masks.nw_proto)) {
+        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
+        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
+        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
+        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
+        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
+
+        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
+        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
+        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
+        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
+        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
+                         &ipv4_spec, &ipv4_mask);
+
+        /* Save proto for L4 protocol setup */
+        proto = ipv4_spec.hdr.next_proto_id & ipv4_mask.hdr.next_proto_id;
+
+        /* Remember proto mask address for later modification */
+        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
+    }
+
+    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
+        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
+        (match->wc.masks.tp_src ||
+         match->wc.masks.tp_dst ||
+         match->wc.masks.tcp_flags)) {
+        VLOG_DBG("L4 Protocol (%u) not supported", proto);
+        ret = -1;
+        goto out;
+    }
+
+    if ((match->wc.masks.tp_src && match->wc.masks.tp_src != 0xffff) ||
+        (match->wc.masks.tp_dst && match->wc.masks.tp_dst != 0xffff)) {
+        ret = -1;
+        goto out;
+    }
+
+    struct rte_flow_item_udp udp_spec;
+    struct rte_flow_item_udp udp_mask;
+    memset(&udp_spec, 0, sizeof(udp_spec));
+    memset(&udp_mask, 0, sizeof(udp_mask));
+    if (proto == IPPROTO_UDP &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        udp_spec.hdr.src_port = match->flow.tp_src;
+        udp_spec.hdr.dst_port = match->flow.tp_dst;
+
+        udp_mask.hdr.src_port = match->wc.masks.tp_src;
+        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
+                         &udp_spec, &udp_mask);
+
+        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+
+    struct rte_flow_item_sctp sctp_spec;
+    struct rte_flow_item_sctp sctp_mask;
+    memset(&sctp_spec, 0, sizeof(sctp_spec));
+    memset(&sctp_mask, 0, sizeof(sctp_mask));
+    if (proto == IPPROTO_SCTP &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        sctp_spec.hdr.src_port = match->flow.tp_src;
+        sctp_spec.hdr.dst_port = match->flow.tp_dst;
+
+        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
+        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
+                         &sctp_spec, &sctp_mask);
+
+        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+
+    struct rte_flow_item_icmp icmp_spec;
+    struct rte_flow_item_icmp icmp_mask;
+    memset(&icmp_spec, 0, sizeof(icmp_spec));
+    memset(&icmp_mask, 0, sizeof(icmp_mask));
+    if (proto == IPPROTO_ICMP &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
+        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
+
+        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
+        icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst);
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
+                         &icmp_spec, &icmp_mask);
+
+        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+
+    struct rte_flow_item_tcp tcp_spec;
+    struct rte_flow_item_tcp tcp_mask;
+    memset(&tcp_spec, 0, sizeof(tcp_spec));
+    memset(&tcp_mask, 0, sizeof(tcp_mask));
+    if (proto == IPPROTO_TCP &&
+        (match->wc.masks.tp_src ||
+         match->wc.masks.tp_dst ||
+         match->wc.masks.tcp_flags)) {
+        tcp_spec.hdr.src_port  = match->flow.tp_src;
+        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
+        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
+        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
+
+        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
+        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
+        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
+        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
+                         &tcp_spec, &tcp_mask);
+
+        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
+
+    struct rte_flow_action_mark mark;
+    mark.id = info->flow_mark;
+    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);
+
+    struct rte_flow_action_rss *rss;
+    rss = add_flow_rss_action(&actions, netdev);
+    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
+
+    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
+                           actions.actions, &error);
+    free(rss);
+    if (!flow) {
+        VLOG_ERR("rte flow creat error: %u : message : %s\n",
+                 error.type, error.message);
+        ret = -1;
+        goto out;
+    }
+    ufid_to_rte_flow_associate(ufid, flow);
+    VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
+             flow, UUID_ARGS((struct uuid *)ufid));
+
+out:
+    free(patterns.items);
+    free(actions.actions);
+    return ret;
+}
+
+/* Returns true if all N bytes starting at ADDR are zero, false otherwise.
+ *
+ * Used below to test whether a multi-byte match mask field (tunnel, MPLS
+ * LSEs, IPv6 addresses, ...) is fully wildcarded. */
+static bool
+is_all_zero(const void *addr, size_t n)
+{
+    size_t i;
+    const uint8_t *p = (const uint8_t *) addr;
+
+    for (i = 0; i < n; i++) {
+        if (p[i] != 0) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/*
+ * Check if any unsupported flow patterns are specified.
+ *
+ * Returns 0 if MATCH only uses fields that the rte_flow translation
+ * above can express; returns -1 (flow will not be HW-offloaded) if any
+ * unsupported field is matched on: tunnel metadata, recirculation,
+ * conntrack state, MPLS, IPv6, NSH, ARP hardware addresses, IP
+ * fragments, or the other datapath-internal fields listed below.
+ */
+static int
+netdev_dpdk_validate_flow(const struct match *match)
+{
+    struct match match_zero_wc;
+
+    /* Create a wc-zeroed version of flow, so that fields not covered by
+     * the mask read as zero below. */
+    match_init(&match_zero_wc, &match->flow, &match->wc);
+
+    if (!is_all_zero(&match_zero_wc.flow.tunnel,
+                     sizeof(match_zero_wc.flow.tunnel))) {
+        goto err;
+    }
+
+    if (match->wc.masks.metadata ||
+        match->wc.masks.skb_priority ||
+        match->wc.masks.pkt_mark ||
+        match->wc.masks.dp_hash) {
+        goto err;
+    }
+
+    /* recirc id must be zero */
+    if (match_zero_wc.flow.recirc_id) {
+        goto err;
+    }
+
+    if (match->wc.masks.ct_state ||
+        match->wc.masks.ct_nw_proto ||
+        match->wc.masks.ct_zone ||
+        match->wc.masks.ct_mark ||
+        match->wc.masks.ct_label.u64.hi ||
+        match->wc.masks.ct_label.u64.lo) {
+        goto err;
+    }
+
+    if (match->wc.masks.conj_id ||
+        match->wc.masks.actset_output) {
+        goto err;
+    }
+
+    /* unsupported L2 */
+    if (!is_all_zero(&match->wc.masks.mpls_lse,
+                     sizeof(match->wc.masks.mpls_lse))) {
+        goto err;
+    }
+
+    /* unsupported L3 */
+    if (match->wc.masks.ipv6_label ||
+        match->wc.masks.ct_nw_src ||
+        match->wc.masks.ct_nw_dst ||
+        !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.nd_target, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.nsh, sizeof(struct ovs_key_nsh)) ||
+        !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct eth_addr)) ||
+        !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct eth_addr))) {
+        goto err;
+    }
+
+    /* If fragmented, then don't HW accelerate - for now */
+    if (match_zero_wc.flow.nw_frag) {
+        goto err;
+    }
+
+    /* unsupported L4 */
+    if (match->wc.masks.igmp_group_ip4 ||
+        match->wc.masks.ct_tp_src ||
+        match->wc.masks.ct_tp_dst) {
+        goto err;
+    }
+
+    return 0;
+
+err:
+    VLOG_ERR("cannot HW accelerate this flow due to unsupported protocols");
+    return -1;
+}
+
+/* Destroys RTE_FLOW on DEV's port and, on success, drops the
+ * ufid <-> rte_flow association for UFID.
+ *
+ * Returns the rte_flow_destroy() result: 0 on success, non-zero on
+ * failure (in which case the association is left in place and the
+ * driver error is logged). */
+static int
+netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
+                             const ovs_u128 *ufid,
+                             struct rte_flow *rte_flow)
+{
+    struct rte_flow_error error;
+    int ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
+
+    if (ret) {
+        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
+                 error.type, error.message);
+    } else {
+        ufid_to_rte_flow_disassociate(ufid);
+        VLOG_DBG("removed rte flow %p associated with ufid " UUID_FMT "\n",
+                 rte_flow, UUID_ARGS((struct uuid *)ufid));
+    }
+
+    return ret;
+}
+
+/* netdev flow_put callback: installs (or replaces) the hardware offload
+ * for the flow identified by UFID.  Returns 0 on success, negative on
+ * failure. */
+static int
+netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
+                     struct nlattr *actions, size_t actions_len,
+                     const ovs_u128 *ufid, struct offload_info *info,
+                     struct dpif_flow_stats *stats OVS_UNUSED)
+{
+    struct rte_flow *old_flow = ufid_to_rte_flow_find(ufid);
+    int err;
+
+    /* A pre-existing rte_flow for this ufid means this is a flow
+     * modification: tear down the old hardware flow before installing
+     * its replacement. */
+    if (old_flow) {
+        err = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
+                                           ufid, old_flow);
+        if (err < 0) {
+            return err;
+        }
+    }
+
+    err = netdev_dpdk_validate_flow(match);
+    if (err < 0) {
+        return err;
+    }
+
+    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
+                                            actions_len, ufid, info);
+}
+
+/* netdev flow_del callback: removes the hardware offload associated
+ * with UFID, if any.  Returns -1 when no rte_flow is known for UFID,
+ * otherwise the result of destroying it. */
+static int
+netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
+                     struct dpif_flow_stats *stats OVS_UNUSED)
+{
+    struct rte_flow *rte_flow = ufid_to_rte_flow_find(ufid);
+
+    return rte_flow
+           ? netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
+                                          ufid, rte_flow)
+           : -1;
+}
+
+/* Flow offload callback slots for the DPDK netdev classes (expanded
+ * inside NETDEV_DPDK_CLASS below).  Only flow_put and flow_del are
+ * implemented via rte_flow; the remaining callbacks stay NULL. */
+#define DPDK_FLOW_OFFLOAD_API                                 \
+    NULL,                   /* flow_flush */                  \
+    NULL,                   /* flow_dump_create */            \
+    NULL,                   /* flow_dump_destroy */           \
+    NULL,                   /* flow_dump_next */              \
+    netdev_dpdk_flow_put,                                     \
+    NULL,                   /* flow_get */                    \
+    netdev_dpdk_flow_del,                                     \
+    NULL                    /* init_flow_api */
+
+
 #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
                           SET_CONFIG, SET_TX_MULTIQ, SEND,    \
                           GET_CARRIER, GET_STATS,			  \
@@ -3707,7 +4264,7 @@  unlock:
     RXQ_RECV,                                                 \
     NULL,                       /* rx_wait */                 \
     NULL,                       /* rxq_drain */               \
-    NO_OFFLOAD_API                                            \
+    DPDK_FLOW_OFFLOAD_API                                     \
 }
 
 static const struct netdev_class dpdk_class =