
[ovs-dev,4/7] netdev-dpdk: implement flow put with rte flow

Message ID 1503469462-22391-5-git-send-email-yliu@fridaylinux.org
State Superseded

Commit Message

Yuanhan Liu Aug. 23, 2017, 6:24 a.m. UTC
From: Finn Christensen <fc@napatech.com>

The basic, yet major, part of this patch is to translate the "match"
into rte flow patterns. We then create an rte flow with a MARK action.
Afterwards, all packets matching the flow will carry the mark id in the mbuf.

For any unsupported flow, such as MPLS, -1 is returned, meaning the
flow offload failed and the flow is skipped.

Signed-off-by: Finn Christensen <fc@napatech.com>
Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
---
 lib/netdev-dpdk.c | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 384 insertions(+), 1 deletion(-)
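
To illustrate the shape of the offload this patch builds, here is a minimal,
self-contained sketch (not part of the patch) of an rte flow with a MARK
action. It matches any Ethernet packet and tags it with a caller-supplied
mark id; the real netdev_dpdk_add_rte_flow_offload() below fills the
spec/mask structures from the OVS match instead. "port_id" and "flow_mark"
are assumed to be supplied by the caller.

    #include <rte_flow.h>

    /* Minimal sketch: match any Ethernet packet and tag it with a MARK
     * action.  In the patch, flow_mark comes from info->flow_mark. */
    static struct rte_flow *
    example_mark_flow(uint8_t port_id, uint32_t flow_mark,
                      struct rte_flow_error *error)
    {
        const struct rte_flow_attr attr = { .ingress = 1 };

        /* Wildcard L2 item; leaving spec/mask NULL matches everything. */
        const struct rte_flow_item pattern[] = {
            { .type = RTE_FLOW_ITEM_TYPE_ETH },
            { .type = RTE_FLOW_ITEM_TYPE_END },
        };

        /* Tag matching packets with the datapath-chosen mark id. */
        const struct rte_flow_action_mark mark = { .id = flow_mark };
        const struct rte_flow_action actions[] = {
            { .type = RTE_FLOW_ACTION_TYPE_MARK, .conf = &mark },
            { .type = RTE_FLOW_ACTION_TYPE_END },
        };

        return rte_flow_create(port_id, &attr, pattern, actions, error);
    }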

Comments

Darrell Ball Aug. 29, 2017, 8:11 a.m. UTC | #1
On 8/22/17, 11:24 PM, "Yuanhan Liu" <yliu@fridaylinux.org> wrote:
    
        From: Finn Christensen <fc@napatech.com>
        
        The basic yet the major part of this patch is to translate the "match"
        to rte flow patterns. And then, we create a rte flow with a MARK action.
        Afterwards, all pkts matches the flow will have the mark id in the mbuf.
        
        For any unsupported flows, such as MPLS, -1 is returned, meaning the
        flow offload is failed and then skipped.
        
        Signed-off-by: Finn Christensen <fc@napatech.com>
        Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
        ---
         lib/netdev-dpdk.c | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
         1 file changed, 384 insertions(+), 1 deletion(-)
        
        diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
        index 15fef1e..8089da8 100644
        --- a/lib/netdev-dpdk.c
        +++ b/lib/netdev-dpdk.c
        @@ -58,6 +58,7 @@
         #include "smap.h"
         #include "sset.h"
         #include "unaligned.h"
        +#include "uuid.h"
         #include "timeval.h"
         #include "unixctl.h"
         
        @@ -3398,6 +3399,388 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)
         }
         
         
        +#define MAX_RTE_FLOW_ITEMS      100
        +#define MAX_RTE_FLOW_ACTIONS    100

I guess these are temporary.
Do we need to do an rte query during initialization?

        +
        +static int
        +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
        +                                 const struct match *match,
        +                                 struct nlattr *nl_actions OVS_UNUSED,
        +                                 size_t actions_len,
        +                                 const ovs_u128 *ufid,
        +                                 struct offload_info *info)
        +{
        +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
        +    const struct rte_flow_attr flow_attr = {
        +        .group = 0,
        +        .priority = 0,
        +        .ingress = 1,
        +        .egress = 0
        +    };
        +    struct flow_patterns {
        +        struct rte_flow_item items[MAX_RTE_FLOW_ITEMS];
        +        uint32_t cnt;
        +    } patterns;
        +    struct flow_actions {
        +        struct rte_flow_action actions[MAX_RTE_FLOW_ACTIONS];
        +        uint32_t cnt;
        +    } actions;
        +    struct rte_flow *flow;
        +    struct rte_flow_error error;
        +
        +    uint8_t *ipv4_next_proto_mask = NULL;
        +
        +#define ADD_FLOW_PATTERN(type_, spec_, mask_)  do { \
        +    patterns.items[patterns.cnt].type = type_;      \
        +    patterns.items[patterns.cnt].spec = spec_;      \
        +    patterns.items[patterns.cnt].mask = mask_;      \
        +    patterns.items[patterns.cnt].last = NULL;       \
        +    if (patterns.cnt < MAX_RTE_FLOW_ITEMS) {        \
        +        patterns.cnt++;                             \
        +    } else {                                        \
        +        VLOG_ERR("too many rte flow patterns (> %d)", MAX_RTE_FLOW_ITEMS);  \
        +        goto err;                                   \
        +    }                                               \
        +} while (0)

static (inline) function maybe ?



        +
        +#define ADD_FLOW_ACTION(type_, conf_)         do {  \
        +    actions.actions[actions.cnt].type = type_;      \
        +    actions.actions[actions.cnt].conf = conf_;      \
        +    actions.cnt++;                                  \
        +    ovs_assert(actions.cnt < MAX_RTE_FLOW_ACTIONS); \
        +} while (0)


static (inline) function maybe ?

        +
        +    patterns.cnt = 0;
        +    actions.cnt  = 0;
        +
        +    /* Eth */
        +    struct rte_flow_item_eth eth_spec;
        +    struct rte_flow_item_eth eth_mask;
        +    memset(&eth_mask, 0, sizeof(eth_mask));
        +    if (match->wc.masks.dl_src.be16[0] ||
        +        match->wc.masks.dl_src.be16[1] ||
        +        match->wc.masks.dl_src.be16[2] ||
        +        match->wc.masks.dl_dst.be16[0] ||
        +        match->wc.masks.dl_dst.be16[1] ||
        +        match->wc.masks.dl_dst.be16[2]) {
        +        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst));
        +        rte_memcpy(&eth_spec.src, &match->flow.dl_src, sizeof(eth_spec.src));
        +        eth_spec.type = match->flow.dl_type;
        +
        +        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst, sizeof(eth_mask.dst));
        +        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src, sizeof(eth_mask.src));
        +        eth_mask.type = match->wc.masks.dl_type;
        +
        +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, &eth_spec, &eth_mask);
        +    } else {
        +        /*
        +         * If user specifies a flow (like UDP flow) without L2 patterns,
        +         * OVS will at least set the dl_type. Normally, it's enough to
        +         * create an eth pattern just with it. Unluckily, some Intel's
        +         * NIC (such as XL710) doesn't support that. Below is a workaround,
        +         * which simply matches any L2 pkts.
        +         */
        +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, &eth_spec, &eth_mask);
        +    }
        +
        +    /* VLAN */
        +    struct rte_flow_item_vlan vlan_spec;
        +    struct rte_flow_item_vlan vlan_mask;
        +    memset(&vlan_mask, 0, sizeof(vlan_mask));
        +    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
        +        vlan_spec.tci  = match->flow.vlans[0].tci;
        +        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
        +
        +        /* match any protocols */
        +        vlan_mask.tpid = 0;
        +
        +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_VLAN, &vlan_spec, &vlan_mask);
        +    }
        +
        +    /* IP v4 */
        +    uint8_t proto = 0;
        +    struct rte_flow_item_ipv4 ipv4_spec;
        +    struct rte_flow_item_ipv4 ipv4_mask;
        +    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
        +    if ((match->flow.dl_type == ntohs(ETH_TYPE_IP)) &&
        +        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
        +         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
        +         match->wc.masks.nw_proto)) {
        +        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
        +        ipv4_spec.hdr.time_to_live    = match->flow.nw_tos;
        +        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
        +        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
        +        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
        +
        +        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
        +        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_tos;
        +        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
        +        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
        +        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
        +
        +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_IPV4, &ipv4_spec, &ipv4_mask);
        +
        +        /* Save proto for L4 protocol setup */
        +        proto = ipv4_spec.hdr.next_proto_id & ipv4_mask.hdr.next_proto_id;
        +
        +        /* Remember proto mask address for later modification */
        +        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
        +    }
        +
        +    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
        +        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
        +        (match->wc.masks.tp_src ||
        +         match->wc.masks.tp_dst ||
        +         match->wc.masks.tcp_flags)) {
        +        VLOG_INFO("L4 Protocol (%u) not supported", proto);
        +        goto err;
        +    }
        +
        +    struct rte_flow_item_udp udp_spec;
        +    struct rte_flow_item_udp udp_mask;
        +    memset(&udp_mask, 0, sizeof(udp_mask));
        +    if ((proto == IPPROTO_UDP) &&
        +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
        +        udp_spec.hdr.src_port = match->flow.tp_src;
        +        udp_spec.hdr.dst_port = match->flow.tp_dst;
        +
        +        udp_mask.hdr.src_port = match->wc.masks.tp_src;
        +        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
        +
        +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_UDP, &udp_spec, &udp_mask);
        +
        +        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */
        +        if (ipv4_next_proto_mask) {
        +            *ipv4_next_proto_mask = 0;
        +        }
        +    }
        +
        +    struct rte_flow_item_sctp sctp_spec;
        +    struct rte_flow_item_sctp sctp_mask;
        +    memset(&sctp_mask, 0, sizeof(sctp_mask));
        +    if ((proto == IPPROTO_SCTP) &&
        +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
        +        sctp_spec.hdr.src_port = match->flow.tp_src;
        +        sctp_spec.hdr.dst_port = match->flow.tp_dst;
        +
        +        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
        +        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
        +
        +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_SCTP, &sctp_spec, &sctp_mask);
        +
        +        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */
        +        if (ipv4_next_proto_mask) {
        +            *ipv4_next_proto_mask = 0;
        +        }
        +    }
        +
        +    struct rte_flow_item_icmp icmp_spec;
        +    struct rte_flow_item_icmp icmp_mask;
        +    memset(&icmp_mask, 0, sizeof(icmp_mask));
        +    if ((proto == IPPROTO_ICMP) &&
        +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
        +        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
        +        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
        +
        +        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
        +        icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst);
        +
        +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ICMP, &icmp_spec, &icmp_mask);
        +
        +        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */
        +        if (ipv4_next_proto_mask) {
        +            *ipv4_next_proto_mask = 0;
        +        }
        +    }
        +
        +    struct rte_flow_item_tcp tcp_spec;
        +    struct rte_flow_item_tcp tcp_mask;
        +    memset(&tcp_mask, 0, sizeof(tcp_mask));
        +    if ((proto == IPPROTO_TCP) &&
        +        (match->wc.masks.tp_src ||
        +         match->wc.masks.tp_dst ||
        +         match->wc.masks.tcp_flags)) {
        +        tcp_spec.hdr.src_port  = match->flow.tp_src;
        +        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
        +        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
        +        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
        +
        +        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
        +        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
        +        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
        +        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff;
        +
        +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_TCP, &tcp_spec, &tcp_mask);
        +
        +        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */
        +        if (ipv4_next_proto_mask) {
        +            *ipv4_next_proto_mask = 0;
        +        }
        +    }
        +    ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
        +
        +    struct rte_flow_action_mark mark;
        +    if (actions_len) {
        +        mark.id = info->flow_mark;
        +        ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_MARK, &mark);
        +    } else {
        +        ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_DROP, NULL);
        +        VLOG_INFO("no action given; drop pkts in hardware\n");
        +    }
        +    ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_END, NULL);
        +
        +    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
        +                           actions.actions, &error);
        +    if (!flow) {
        +        VLOG_ERR("rte flow creat error: %u : message : %s\n",
        +                 error.type, error.message);
        +        goto err;
        +    }
        +    add_ufid_dpdk_flow_mapping(ufid, flow);
        +    VLOG_INFO("installed flow %p by ufid "UUID_FMT"\n",
        +              flow, UUID_ARGS((struct uuid *)ufid));
        +
        +    return 0;
        +
        +err:
        +    return -1;
        +}
        +
        +/*
        + * Validate for later rte flow offload creation. If any unsupported
        + * flow are specified, return -1.
        + */
        +static int
        +netdev_dpdk_validate_flow(const struct match *match)
        +{
        +    struct match match_zero_wc;
        +
        +    /* Create a wc-zeroed version of flow */
        +    match_init(&match_zero_wc, &match->flow, &match->wc);
        +
        +#define CHECK_NONZERO(addr, size)   do {    \
        +    uint8_t *padr = (uint8_t *)(addr);      \
        +    int i;                                  \
        +    for (i = 0; i < (size); i++) {          \
        +        if (padr[i] != 0) {                 \
        +            goto err;                       \
        +        }                                   \
        +    }                                       \
        +} while (0)
        +
        +#define CHECK_NONZEROSIMPLE(var)    do {    \

CHECK_NONZERO ?

        +    if ((var) != 0) {                       \
        +        goto err;                           \
        +    }                                       \
        +} while (0)
        +
        +    CHECK_NONZERO(&match_zero_wc.flow.tunnel, sizeof(match_zero_wc.flow.tunnel));
        +    CHECK_NONZEROSIMPLE(match->wc.masks.metadata);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.skb_priority);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.pkt_mark);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.dp_hash);
        +
        +    /* recirc id must be zero */
        +    CHECK_NONZEROSIMPLE(match_zero_wc.flow.recirc_id);
        +
        +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_state);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_zone);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_mark);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.hi);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.lo);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_nw_proto);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_src);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_dst);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.conj_id);
        +    CHECK_NONZEROSIMPLE(match->wc.masks.actset_output);
        +
        +    /* unsupported L2 */
        +    CHECK_NONZERO(&match->wc.masks.mpls_lse,
        +                  sizeof(match_zero_wc.flow.mpls_lse) / sizeof(ovs_be32));
        +
        +    /* unsupported L3 */
        +    CHECK_NONZERO(&match->wc.masks.ipv6_src, sizeof(struct in6_addr));
        +    CHECK_NONZERO(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr));
        +    CHECK_NONZEROSIMPLE(match->wc.masks.ipv6_label);
        +    CHECK_NONZERO(&match->wc.masks.nd_target, sizeof(struct in6_addr));
        +    CHECK_NONZERO(&match->wc.masks.arp_sha, sizeof(struct eth_addr));
        +    CHECK_NONZERO(&match->wc.masks.arp_tha, sizeof(struct eth_addr));

CHECK_NONZERO_BYTES ?

        +
        +    /* If fragmented, then don't HW accelerate - for now */
        +    CHECK_NONZEROSIMPLE(match_zero_wc.flow.nw_frag);
        +
        +    /* unsupported L4 */
        +    CHECK_NONZEROSIMPLE(match->wc.masks.igmp_group_ip4);
        +
        +    return 0;
        +
        +err:
        +    VLOG_INFO("Cannot HW accelerate this flow");
        +    return -1;
        +}
        +
        +static int
        +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
        +                             const ovs_u128 *ufid,
        +                             struct rte_flow *rte_flow)
        +{
        +    struct rte_flow_error error;
        +    int ret;
        +
        +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
        +    if (ret == 0) {
        +        del_ufid_dpdk_flow_mapping(ufid);
        +        VLOG_INFO("removed rte flow %p associated with ufid " UUID_FMT "\n",
        +                  rte_flow, UUID_ARGS((struct uuid *)ufid));
        +    } else {
        +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
        +                 error.type, error.message);
        +    }
        +
        +    return ret;
        +}
        +
        +static int
        +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
        +                     struct nlattr *actions, size_t actions_len,
        +                     const ovs_u128 *ufid, struct offload_info *info,
        +                     struct dpif_flow_stats *stats OVS_UNUSED)
        +{
        +    struct rte_flow *rte_flow;
        +    int ret;
        +
        +    /*
        +     * If an old rte_flow exists, it means it's a flow modification.
        +     * Here destroy the old rte flow first before adding a new one.
        +     */
        +    rte_flow = get_rte_flow_by_ufid(ufid);
        +    if (rte_flow) {
        +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
        +                                           ufid, rte_flow);
        +        if (ret < 0)
        +            return ret;
        +    }
        +
        +    ret = netdev_dpdk_validate_flow(match);
        +    if (ret < 0) {
        +        return ret;
        +    }
        +
        +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
        +                                            actions_len, ufid, info);
        +}
        +

Would you mind adding some comments for the NULL elements below?


        +#define DPDK_FLOW_OFFLOAD_API                                 \
        +    NULL,                                                     \
        +    NULL,                                                     \
        +    NULL,                                                     \
        +    NULL,                                                     \
        +    netdev_dpdk_flow_put,                                     \
        +    NULL,                                                     \
        +    NULL,                                                     \
        +    NULL
        +
        +
         #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
                                   SET_CONFIG, SET_TX_MULTIQ, SEND,    \
                                   GET_CARRIER, GET_STATS,             \
        @@ -3470,7 +3853,7 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)
             RXQ_RECV,                                                 \
             NULL,                       /* rx_wait */                 \
             NULL,                       /* rxq_drain */               \
        -    NO_OFFLOAD_API                                            \
        +    DPDK_FLOW_OFFLOAD_API                                     \
         }
         
         static const struct netdev_class dpdk_class =
        -- 
        2.7.4
Yuanhan Liu Aug. 29, 2017, 12:15 p.m. UTC | #2
On Tue, Aug 29, 2017 at 08:11:23AM +0000, Darrell Ball wrote:
> 
>     On 8/22/17, 11:24 PM, "Yuanhan Liu" <yliu@fridaylinux.org> wrote:
>     
>         From: Finn Christensen <fc@napatech.com>
>         
>         The basic yet the major part of this patch is to translate the "match"
>         to rte flow patterns. And then, we create a rte flow with a MARK action.
>         Afterwards, all pkts matches the flow will have the mark id in the mbuf.
>         
>         For any unsupported flows, such as MPLS, -1 is returned, meaning the
>         flow offload is failed and then skipped.
>         
>         Signed-off-by: Finn Christensen <fc@napatech.com>
>         Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
>         ---
>          lib/netdev-dpdk.c | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>          1 file changed, 384 insertions(+), 1 deletion(-)
>         
>         diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
>         index 15fef1e..8089da8 100644
>         --- a/lib/netdev-dpdk.c
>         +++ b/lib/netdev-dpdk.c
>         @@ -58,6 +58,7 @@
>          #include "smap.h"
>          #include "sset.h"
>          #include "unaligned.h"
>         +#include "uuid.h"
>          #include "timeval.h"
>          #include "unixctl.h"
>          
>         @@ -3398,6 +3399,388 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)
>          }
>          
>          
>         +#define MAX_RTE_FLOW_ITEMS      100
>         +#define MAX_RTE_FLOW_ACTIONS    100
> 
> I guess these are temporary

Yes, the hardcoded number is really hacky.

> Do we need to do a rte query during initialization ?

query on what?

>         +
>         +static int
>         +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
>         +                                 const struct match *match,
>         +                                 struct nlattr *nl_actions OVS_UNUSED,
>         +                                 size_t actions_len,
>         +                                 const ovs_u128 *ufid,
>         +                                 struct offload_info *info)
>         +{
>         +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
>         +    const struct rte_flow_attr flow_attr = {
>         +        .group = 0,
>         +        .priority = 0,
>         +        .ingress = 1,
>         +        .egress = 0
>         +    };
>         +    struct flow_patterns {
>         +        struct rte_flow_item items[MAX_RTE_FLOW_ITEMS];
>         +        uint32_t cnt;
>         +    } patterns;
>         +    struct flow_actions {
>         +        struct rte_flow_action actions[MAX_RTE_FLOW_ACTIONS];
>         +        uint32_t cnt;
>         +    } actions;
>         +    struct rte_flow *flow;
>         +    struct rte_flow_error error;
>         +
>         +    uint8_t *ipv4_next_proto_mask = NULL;
>         +
>         +#define ADD_FLOW_PATTERN(type_, spec_, mask_)  do { \
>         +    patterns.items[patterns.cnt].type = type_;      \
>         +    patterns.items[patterns.cnt].spec = spec_;      \
>         +    patterns.items[patterns.cnt].mask = mask_;      \
>         +    patterns.items[patterns.cnt].last = NULL;       \
>         +    if (patterns.cnt < MAX_RTE_FLOW_ITEMS) {        \
>         +        patterns.cnt++;                             \
>         +    } else {                                        \
>         +        VLOG_ERR("too many rte flow patterns (> %d)", MAX_RTE_FLOW_ITEMS);  \
>         +        goto err;                                   \
>         +    }                                               \
>         +} while (0)
> 
> static (inline) function maybe ?

Indeed. I'm not a big fan of macros like this. Let me turn it into a function
next time. I see no reason to make it inline (explicitly) though: it's
not in the data path. Moreover, it will likely be inlined implicitly by
the compiler.
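
For reference, a rough sketch of that macro turned into a helper, assuming
the patch's function-local "struct flow_patterns" is lifted to file scope;
it returns -1 instead of the macro's "goto err" so the caller decides how
to bail out. An analogous helper would replace ADD_FLOW_ACTION.

    struct flow_patterns {
        struct rte_flow_item items[MAX_RTE_FLOW_ITEMS];
        uint32_t cnt;
    };

    /* Sketch only: append one rte flow item, mirroring ADD_FLOW_PATTERN. */
    static int
    add_flow_pattern(struct flow_patterns *patterns,
                     enum rte_flow_item_type type,
                     const void *spec, const void *mask)
    {
        if (patterns->cnt >= MAX_RTE_FLOW_ITEMS) {
            VLOG_ERR("too many rte flow patterns (> %d)", MAX_RTE_FLOW_ITEMS);
            return -1;
        }

        patterns->items[patterns->cnt].type = type;
        patterns->items[patterns->cnt].spec = spec;
        patterns->items[patterns->cnt].mask = mask;
        patterns->items[patterns->cnt].last = NULL;
        patterns->cnt++;

        return 0;
    }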

> 
> 
>         +
>         +#define ADD_FLOW_ACTION(type_, conf_)         do {  \
>         +    actions.actions[actions.cnt].type = type_;      \
>         +    actions.actions[actions.cnt].conf = conf_;      \
>         +    actions.cnt++;                                  \
>         +    ovs_assert(actions.cnt < MAX_RTE_FLOW_ACTIONS); \
>         +} while (0)
> 
> 
> static (inline) function maybe ?

Ditto.

> 
>         +
>         +    patterns.cnt = 0;
>         +    actions.cnt  = 0;
>         +
>         +    /* Eth */
>         +    struct rte_flow_item_eth eth_spec;
>         +    struct rte_flow_item_eth eth_mask;
>         +    memset(&eth_mask, 0, sizeof(eth_mask));
>         +    if (match->wc.masks.dl_src.be16[0] ||
>         +        match->wc.masks.dl_src.be16[1] ||
>         +        match->wc.masks.dl_src.be16[2] ||
>         +        match->wc.masks.dl_dst.be16[0] ||
>         +        match->wc.masks.dl_dst.be16[1] ||
>         +        match->wc.masks.dl_dst.be16[2]) {
>         +        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst));
>         +        rte_memcpy(&eth_spec.src, &match->flow.dl_src, sizeof(eth_spec.src));
>         +        eth_spec.type = match->flow.dl_type;
>         +
>         +        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst, sizeof(eth_mask.dst));
>         +        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src, sizeof(eth_mask.src));
>         +        eth_mask.type = match->wc.masks.dl_type;
>         +
>         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, &eth_spec, &eth_mask);
>         +    } else {
>         +        /*
>         +         * If user specifies a flow (like UDP flow) without L2 patterns,
>         +         * OVS will at least set the dl_type. Normally, it's enough to
>         +         * create an eth pattern just with it. Unluckily, some Intel's
>         +         * NIC (such as XL710) doesn't support that. Below is a workaround,
>         +         * which simply matches any L2 pkts.
>         +         */
>         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, &eth_spec, &eth_mask);
>         +    }
>         +
>         +    /* VLAN */
>         +    struct rte_flow_item_vlan vlan_spec;
>         +    struct rte_flow_item_vlan vlan_mask;
>         +    memset(&vlan_mask, 0, sizeof(vlan_mask));
>         +    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
>         +        vlan_spec.tci  = match->flow.vlans[0].tci;
>         +        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
>         +
>         +        /* match any protocols */
>         +        vlan_mask.tpid = 0;
>         +
>         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_VLAN, &vlan_spec, &vlan_mask);
>         +    }
>         +
>         +    /* IP v4 */
>         +    uint8_t proto = 0;
>         +    struct rte_flow_item_ipv4 ipv4_spec;
>         +    struct rte_flow_item_ipv4 ipv4_mask;
>         +    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
>         +    if ((match->flow.dl_type == ntohs(ETH_TYPE_IP)) &&
>         +        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
>         +         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
>         +         match->wc.masks.nw_proto)) {
>         +        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
>         +        ipv4_spec.hdr.time_to_live    = match->flow.nw_tos;
>         +        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
>         +        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
>         +        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
>         +
>         +        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
>         +        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_tos;
>         +        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
>         +        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
>         +        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
>         +
>         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_IPV4, &ipv4_spec, &ipv4_mask);
>         +
>         +        /* Save proto for L4 protocol setup */
>         +        proto = ipv4_spec.hdr.next_proto_id & ipv4_mask.hdr.next_proto_id;
>         +
>         +        /* Remember proto mask address for later modification */
>         +        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
>         +    }
>         +
>         +    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
>         +        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
>         +        (match->wc.masks.tp_src ||
>         +         match->wc.masks.tp_dst ||
>         +         match->wc.masks.tcp_flags)) {
>         +        VLOG_INFO("L4 Protocol (%u) not supported", proto);
>         +        goto err;
>         +    }
>         +
>         +    struct rte_flow_item_udp udp_spec;
>         +    struct rte_flow_item_udp udp_mask;
>         +    memset(&udp_mask, 0, sizeof(udp_mask));
>         +    if ((proto == IPPROTO_UDP) &&
>         +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>         +        udp_spec.hdr.src_port = match->flow.tp_src;
>         +        udp_spec.hdr.dst_port = match->flow.tp_dst;
>         +
>         +        udp_mask.hdr.src_port = match->wc.masks.tp_src;
>         +        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
>         +
>         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_UDP, &udp_spec, &udp_mask);
>         +
>         +        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */
>         +        if (ipv4_next_proto_mask) {
>         +            *ipv4_next_proto_mask = 0;
>         +        }
>         +    }
>         +
>         +    struct rte_flow_item_sctp sctp_spec;
>         +    struct rte_flow_item_sctp sctp_mask;
>         +    memset(&sctp_mask, 0, sizeof(sctp_mask));
>         +    if ((proto == IPPROTO_SCTP) &&
>         +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>         +        sctp_spec.hdr.src_port = match->flow.tp_src;
>         +        sctp_spec.hdr.dst_port = match->flow.tp_dst;
>         +
>         +        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
>         +        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
>         +
>         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_SCTP, &sctp_spec, &sctp_mask);
>         +
>         +        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */
>         +        if (ipv4_next_proto_mask) {
>         +            *ipv4_next_proto_mask = 0;
>         +        }
>         +    }
>         +
>         +    struct rte_flow_item_icmp icmp_spec;
>         +    struct rte_flow_item_icmp icmp_mask;
>         +    memset(&icmp_mask, 0, sizeof(icmp_mask));
>         +    if ((proto == IPPROTO_ICMP) &&
>         +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
>         +        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
>         +        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
>         +
>         +        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
>         +        icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst);
>         +
>         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ICMP, &icmp_spec, &icmp_mask);
>         +
>         +        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */
>         +        if (ipv4_next_proto_mask) {
>         +            *ipv4_next_proto_mask = 0;
>         +        }
>         +    }
>         +
>         +    struct rte_flow_item_tcp tcp_spec;
>         +    struct rte_flow_item_tcp tcp_mask;
>         +    memset(&tcp_mask, 0, sizeof(tcp_mask));
>         +    if ((proto == IPPROTO_TCP) &&
>         +        (match->wc.masks.tp_src ||
>         +         match->wc.masks.tp_dst ||
>         +         match->wc.masks.tcp_flags)) {
>         +        tcp_spec.hdr.src_port  = match->flow.tp_src;
>         +        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
>         +        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
>         +        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
>         +
>         +        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
>         +        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
>         +        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
>         +        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff;
>         +
>         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_TCP, &tcp_spec, &tcp_mask);
>         +
>         +        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */
>         +        if (ipv4_next_proto_mask) {
>         +            *ipv4_next_proto_mask = 0;
>         +        }
>         +    }
>         +    ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
>         +
>         +    struct rte_flow_action_mark mark;
>         +    if (actions_len) {
>         +        mark.id = info->flow_mark;
>         +        ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_MARK, &mark);
>         +    } else {
>         +        ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_DROP, NULL);
>         +        VLOG_INFO("no action given; drop pkts in hardware\n");
>         +    }
>         +    ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_END, NULL);
>         +
>         +    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
>         +                           actions.actions, &error);
>         +    if (!flow) {
>         +        VLOG_ERR("rte flow creat error: %u : message : %s\n",
>         +                 error.type, error.message);
>         +        goto err;
>         +    }
>         +    add_ufid_dpdk_flow_mapping(ufid, flow);
>         +    VLOG_INFO("installed flow %p by ufid "UUID_FMT"\n",
>         +              flow, UUID_ARGS((struct uuid *)ufid));
>         +
>         +    return 0;
>         +
>         +err:
>         +    return -1;
>         +}
>         +
>         +/*
>         + * Validate for later rte flow offload creation. If any unsupported
>         + * flow are specified, return -1.
>         + */
>         +static int
>         +netdev_dpdk_validate_flow(const struct match *match)
>         +{
>         +    struct match match_zero_wc;
>         +
>         +    /* Create a wc-zeroed version of flow */
>         +    match_init(&match_zero_wc, &match->flow, &match->wc);
>         +
>         +#define CHECK_NONZERO(addr, size)   do {    \
>         +    uint8_t *padr = (uint8_t *)(addr);      \
>         +    int i;                                  \
>         +    for (i = 0; i < (size); i++) {          \
>         +        if (padr[i] != 0) {                 \
>         +            goto err;                       \
>         +        }                                   \
>         +    }                                       \
>         +} while (0)
>         +
>         +#define CHECK_NONZEROSIMPLE(var)    do {    \
> 
> CHECK_NONZERO ?

Yep, it's better (as it's shorter; also, SIMPLE is too vague).

> 
>         +    if ((var) != 0) {                       \
>         +        goto err;                           \
>         +    }                                       \
>         +} while (0)
>         +
>         +    CHECK_NONZERO(&match_zero_wc.flow.tunnel, sizeof(match_zero_wc.flow.tunnel));
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.metadata);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.skb_priority);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.pkt_mark);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.dp_hash);
>         +
>         +    /* recirc id must be zero */
>         +    CHECK_NONZEROSIMPLE(match_zero_wc.flow.recirc_id);
>         +
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_state);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_zone);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_mark);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.hi);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.lo);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_nw_proto);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_src);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_dst);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.conj_id);
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.actset_output);
>         +
>         +    /* unsupported L2 */
>         +    CHECK_NONZERO(&match->wc.masks.mpls_lse,
>         +                  sizeof(match_zero_wc.flow.mpls_lse) / sizeof(ovs_be32));
>         +
>         +    /* unsupported L3 */
>         +    CHECK_NONZERO(&match->wc.masks.ipv6_src, sizeof(struct in6_addr));
>         +    CHECK_NONZERO(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr));
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.ipv6_label);
>         +    CHECK_NONZERO(&match->wc.masks.nd_target, sizeof(struct in6_addr));
>         +    CHECK_NONZERO(&match->wc.masks.arp_sha, sizeof(struct eth_addr));
>         +    CHECK_NONZERO(&match->wc.masks.arp_tha, sizeof(struct eth_addr));
> 
> CHECK_NONZERO_BYTES ?

Again, better. Thanks!
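
For clarity, the renaming being agreed on here could look roughly like this
(bodies unchanged from the patch; only the names follow the suggestion):

    #define CHECK_NONZERO_BYTES(addr, size) do {    \
        uint8_t *padr = (uint8_t *)(addr);          \
        int i;                                      \
        for (i = 0; i < (size); i++) {              \
            if (padr[i] != 0) {                     \
                goto err;                           \
            }                                       \
        }                                           \
    } while (0)

    #define CHECK_NONZERO(var) do {                 \
        if ((var) != 0) {                           \
            goto err;                               \
        }                                           \
    } while (0)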

>         +
>         +    /* If fragmented, then don't HW accelerate - for now */
>         +    CHECK_NONZEROSIMPLE(match_zero_wc.flow.nw_frag);
>         +
>         +    /* unsupported L4 */
>         +    CHECK_NONZEROSIMPLE(match->wc.masks.igmp_group_ip4);
>         +
>         +    return 0;
>         +
>         +err:
>         +    VLOG_INFO("Cannot HW accelerate this flow");
>         +    return -1;
>         +}
>         +
>         +static int
>         +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
>         +                             const ovs_u128 *ufid,
>         +                             struct rte_flow *rte_flow)
>         +{
>         +    struct rte_flow_error error;
>         +    int ret;
>         +
>         +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
>         +    if (ret == 0) {
>         +        del_ufid_dpdk_flow_mapping(ufid);
>         +        VLOG_INFO("removed rte flow %p associated with ufid " UUID_FMT "\n",
>         +                  rte_flow, UUID_ARGS((struct uuid *)ufid));
>         +    } else {
>         +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
>         +                 error.type, error.message);
>         +    }
>         +
>         +    return ret;
>         +}
>         +
>         +static int
>         +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
>         +                     struct nlattr *actions, size_t actions_len,
>         +                     const ovs_u128 *ufid, struct offload_info *info,
>         +                     struct dpif_flow_stats *stats OVS_UNUSED)
>         +{
>         +    struct rte_flow *rte_flow;
>         +    int ret;
>         +
>         +    /*
>         +     * If an old rte_flow exists, it means it's a flow modification.
>         +     * Here destroy the old rte flow first before adding a new one.
>         +     */
>         +    rte_flow = get_rte_flow_by_ufid(ufid);
>         +    if (rte_flow) {
>         +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
>         +                                           ufid, rte_flow);
>         +        if (ret < 0)
>         +            return ret;
>         +    }
>         +
>         +    ret = netdev_dpdk_validate_flow(match);
>         +    if (ret < 0) {
>         +        return ret;
>         +    }
>         +
>         +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
>         +                                            actions_len, ufid, info);
>         +}
>         +
> 
> Would you mind adding some comments for below NULL elements ?

Like what method it's pointing to?

	--yliu
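
A sketch of what the commented macro could look like, assuming the flow
offload API slot order in netdev-provider.h at this point in the series is
flow_flush, flow_dump_create, flow_dump_destroy, flow_dump_next, flow_put,
flow_get, flow_del, init_flow_api (only flow_put is implemented here):

    #define DPDK_FLOW_OFFLOAD_API                                 \
        NULL,                       /* flow_flush */              \
        NULL,                       /* flow_dump_create */        \
        NULL,                       /* flow_dump_destroy */       \
        NULL,                       /* flow_dump_next */          \
        netdev_dpdk_flow_put,                                     \
        NULL,                       /* flow_get */                \
        NULL,                       /* flow_del */                \
        NULL                        /* init_flow_api */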
> 
> 
>         +#define DPDK_FLOW_OFFLOAD_API                                 \
>         +    NULL,                                                     \
>         +    NULL,                                                     \
>         +    NULL,                                                     \
>         +    NULL,                                                     \
>         +    netdev_dpdk_flow_put,                                     \
>         +    NULL,                                                     \
>         +    NULL,                                                     \
>         +    NULL
>         +
>         +
>          #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
>                                    SET_CONFIG, SET_TX_MULTIQ, SEND,    \
>                                    GET_CARRIER, GET_STATS,             \
>         @@ -3470,7 +3853,7 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)
>              RXQ_RECV,                                                 \
>              NULL,                       /* rx_wait */                 \
>              NULL,                       /* rxq_drain */               \
>         -    NO_OFFLOAD_API                                            \
>         +    DPDK_FLOW_OFFLOAD_API                                     \
>          }
>          
>          static const struct netdev_class dpdk_class =
>         -- 
>         2.7.4
>         
>         
>     
>     
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Darrell Ball Aug. 30, 2017, 2:02 a.m. UTC | #3
On 8/29/17, 5:15 AM, "Yuanhan Liu" <yliu@fridaylinux.org> wrote:

    On Tue, Aug 29, 2017 at 08:11:23AM +0000, Darrell Ball wrote:
    > 

    >     On 8/22/17, 11:24 PM, "Yuanhan Liu" <yliu@fridaylinux.org> wrote:

    >     

    >         From: Finn Christensen <fc@napatech.com>

    >         

    >         The basic yet the major part of this patch is to translate the "match"

    >         to rte flow patterns. And then, we create a rte flow with a MARK action.

    >         Afterwards, all pkts matches the flow will have the mark id in the mbuf.

    >         

    >         For any unsupported flows, such as MPLS, -1 is returned, meaning the

    >         flow offload is failed and then skipped.

    >         

    >         Signed-off-by: Finn Christensen <fc@napatech.com>

    >         Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>

    >         ---

    >          lib/netdev-dpdk.c | 385 +++++++++++++++++++++++++++++++++++++++++++++++++++++-

    >          1 file changed, 384 insertions(+), 1 deletion(-)

    >         

    >         diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c

    >         index 15fef1e..8089da8 100644

    >         --- a/lib/netdev-dpdk.c

    >         +++ b/lib/netdev-dpdk.c

    >         @@ -58,6 +58,7 @@

    >          #include "smap.h"

    >          #include "sset.h"

    >          #include "unaligned.h"

    >         +#include "uuid.h"

    >          #include "timeval.h"

    >          #include "unixctl.h"

    >          

    >         @@ -3398,6 +3399,388 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)

    >          }

    >          

    >          

    >         +#define MAX_RTE_FLOW_ITEMS      100

    >         +#define MAX_RTE_FLOW_ACTIONS    100

    > 

    > I guess these are temporary

    
    Yes, the hardcoded number is really hacky.
    
    > Do we need to do a rte query during initialization ?

    
    query on what?

[Darrell]
I mean somehow querying the max hardware resources available at
dev initialization time? I realize this is non-trivial overall.


    
    >         +

    >         +static int

    >         +netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,

    >         +                                 const struct match *match,

    >         +                                 struct nlattr *nl_actions OVS_UNUSED,

    >         +                                 size_t actions_len,

    >         +                                 const ovs_u128 *ufid,

    >         +                                 struct offload_info *info)

    >         +{

    >         +    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);

    >         +    const struct rte_flow_attr flow_attr = {

    >         +        .group = 0,

    >         +        .priority = 0,

    >         +        .ingress = 1,

    >         +        .egress = 0

    >         +    };

    >         +    struct flow_patterns {

    >         +        struct rte_flow_item items[MAX_RTE_FLOW_ITEMS];

    >         +        uint32_t cnt;

    >         +    } patterns;

    >         +    struct flow_actions {

    >         +        struct rte_flow_action actions[MAX_RTE_FLOW_ACTIONS];

    >         +        uint32_t cnt;

    >         +    } actions;

    >         +    struct rte_flow *flow;

    >         +    struct rte_flow_error error;

    >         +

    >         +    uint8_t *ipv4_next_proto_mask = NULL;

    >         +

    >         +#define ADD_FLOW_PATTERN(type_, spec_, mask_)  do { \

    >         +    patterns.items[patterns.cnt].type = type_;      \

    >         +    patterns.items[patterns.cnt].spec = spec_;      \

    >         +    patterns.items[patterns.cnt].mask = mask_;      \

    >         +    patterns.items[patterns.cnt].last = NULL;       \

    >         +    if (patterns.cnt < MAX_RTE_FLOW_ITEMS) {        \

    >         +        patterns.cnt++;                             \

    >         +    } else {                                        \

    >         +        VLOG_ERR("too many rte flow patterns (> %d)", MAX_RTE_FLOW_ITEMS);  \

    >         +        goto err;                                   \

    >         +    }                                               \

    >         +} while (0)

    > 

    > static (inline) function maybe ?

    
    Indeed. I'm not a big fan of macro like this. Let me turn it to function
    next time. I see no reason to make it inline (explicitly) though: it's
    not in data path. Moreover, it's likely it will be inlined implicitly by
    compiler.

[Darrell]
I put ‘(inline)’ in parentheses because we would never specify it explicitly,
since gcc would likely inline anyway. 


    
    > 

    > 

    >         +

    >         +#define ADD_FLOW_ACTION(type_, conf_)         do {  \

    >         +    actions.actions[actions.cnt].type = type_;      \

    >         +    actions.actions[actions.cnt].conf = conf_;      \

    >         +    actions.cnt++;                                  \

    >         +    ovs_assert(actions.cnt < MAX_RTE_FLOW_ACTIONS); \

    >         +} while (0)

    > 

    > 

    > static (inline) function maybe ?

    
    Ditto.
    
    > 

    >         +

    >         +    patterns.cnt = 0;

    >         +    actions.cnt  = 0;

    >         +

    >         +    /* Eth */

    >         +    struct rte_flow_item_eth eth_spec;

    >         +    struct rte_flow_item_eth eth_mask;

    >         +    memset(&eth_mask, 0, sizeof(eth_mask));

    >         +    if (match->wc.masks.dl_src.be16[0] ||

    >         +        match->wc.masks.dl_src.be16[1] ||

    >         +        match->wc.masks.dl_src.be16[2] ||

    >         +        match->wc.masks.dl_dst.be16[0] ||

    >         +        match->wc.masks.dl_dst.be16[1] ||

    >         +        match->wc.masks.dl_dst.be16[2]) {

    >         +        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst));

    >         +        rte_memcpy(&eth_spec.src, &match->flow.dl_src, sizeof(eth_spec.src));

    >         +        eth_spec.type = match->flow.dl_type;

    >         +

    >         +        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst, sizeof(eth_mask.dst));

    >         +        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src, sizeof(eth_mask.src));

    >         +        eth_mask.type = match->wc.masks.dl_type;

    >         +

    >         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, &eth_spec, &eth_mask);

    >         +    } else {

    >         +        /*

    >         +         * If user specifies a flow (like UDP flow) without L2 patterns,

    >         +         * OVS will at least set the dl_type. Normally, it's enough to

    >         +         * create an eth pattern just with it. Unluckily, some Intel's

    >         +         * NIC (such as XL710) doesn't support that. Below is a workaround,

    >         +         * which simply matches any L2 pkts.

    >         +         */

    >         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, &eth_spec, &eth_mask);

    >         +    }

    >         +

    >         +    /* VLAN */

    >         +    struct rte_flow_item_vlan vlan_spec;

    >         +    struct rte_flow_item_vlan vlan_mask;

    >         +    memset(&vlan_mask, 0, sizeof(vlan_mask));

    >         +    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {

    >         +        vlan_spec.tci  = match->flow.vlans[0].tci;

    >         +        vlan_mask.tci  = match->wc.masks.vlans[0].tci;

    >         +

    >         +        /* match any protocols */

    >         +        vlan_mask.tpid = 0;

    >         +

    >         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_VLAN, &vlan_spec, &vlan_mask);

    >         +    }

    >         +

    >         +    /* IP v4 */

    >         +    uint8_t proto = 0;

    >         +    struct rte_flow_item_ipv4 ipv4_spec;

    >         +    struct rte_flow_item_ipv4 ipv4_mask;

    >         +    memset(&ipv4_mask, 0, sizeof(ipv4_mask));

    >         +    if ((match->flow.dl_type == ntohs(ETH_TYPE_IP)) &&

    >         +        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||

    >         +         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||

    >         +         match->wc.masks.nw_proto)) {

    >         +        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;

    >         +        ipv4_spec.hdr.time_to_live    = match->flow.nw_tos;

    >         +        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;

    >         +        ipv4_spec.hdr.src_addr        = match->flow.nw_src;

    >         +        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;

    >         +

    >         +        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;

    >         +        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_tos;

    >         +        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;

    >         +        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;

    >         +        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;

    >         +

    >         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_IPV4, &ipv4_spec, &ipv4_mask);

    >         +

    >         +        /* Save proto for L4 protocol setup */

    >         +        proto = ipv4_spec.hdr.next_proto_id & ipv4_mask.hdr.next_proto_id;

    >         +

    >         +        /* Remember proto mask address for later modification */

    >         +        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;

    >         +    }

    >         +

    >         +    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&

    >         +        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&

    >         +        (match->wc.masks.tp_src ||

    >         +         match->wc.masks.tp_dst ||

    >         +         match->wc.masks.tcp_flags)) {

    >         +        VLOG_INFO("L4 Protocol (%u) not supported", proto);

    >         +        goto err;

    >         +    }

    >         +

    >         +    struct rte_flow_item_udp udp_spec;

    >         +    struct rte_flow_item_udp udp_mask;

    >         +    memset(&udp_mask, 0, sizeof(udp_mask));

    >         +    if ((proto == IPPROTO_UDP) &&

    >         +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {

    >         +        udp_spec.hdr.src_port = match->flow.tp_src;

    >         +        udp_spec.hdr.dst_port = match->flow.tp_dst;

    >         +

    >         +        udp_mask.hdr.src_port = match->wc.masks.tp_src;

    >         +        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;

    >         +

    >         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_UDP, &udp_spec, &udp_mask);

    >         +

    >         +        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */

    >         +        if (ipv4_next_proto_mask) {

    >         +            *ipv4_next_proto_mask = 0;

    >         +        }

    >         +    }

    >         +

    >         +    struct rte_flow_item_sctp sctp_spec;

    >         +    struct rte_flow_item_sctp sctp_mask;

    >         +    memset(&sctp_mask, 0, sizeof(sctp_mask));

    >         +    if ((proto == IPPROTO_SCTP) &&

    >         +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {

    >         +        sctp_spec.hdr.src_port = match->flow.tp_src;

    >         +        sctp_spec.hdr.dst_port = match->flow.tp_dst;

    >         +

    >         +        sctp_mask.hdr.src_port = match->wc.masks.tp_src;

    >         +        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;

    >         +

    >         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_SCTP, &sctp_spec, &sctp_mask);

    >         +

    >         +        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */

    >         +        if (ipv4_next_proto_mask) {

    >         +            *ipv4_next_proto_mask = 0;

    >         +        }

    >         +    }

    >         +

    >         +    struct rte_flow_item_icmp icmp_spec;

    >         +    struct rte_flow_item_icmp icmp_mask;

    >         +    memset(&icmp_mask, 0, sizeof(icmp_mask));

    >         +    if ((proto == IPPROTO_ICMP) &&

    >         +        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {

    >         +        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);

    >         +        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);

    >         +

    >         +        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);

    >         +        icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst);

    >         +

    >         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ICMP, &icmp_spec, &icmp_mask);

    >         +

    >         +        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */

    >         +        if (ipv4_next_proto_mask) {

    >         +            *ipv4_next_proto_mask = 0;

    >         +        }

    >         +    }

    >         +

    >         +    struct rte_flow_item_tcp tcp_spec;

    >         +    struct rte_flow_item_tcp tcp_mask;

    >         +    memset(&tcp_mask, 0, sizeof(tcp_mask));

    >         +    if ((proto == IPPROTO_TCP) &&

    >         +        (match->wc.masks.tp_src ||

    >         +         match->wc.masks.tp_dst ||

    >         +         match->wc.masks.tcp_flags)) {

    >         +        tcp_spec.hdr.src_port  = match->flow.tp_src;

    >         +        tcp_spec.hdr.dst_port  = match->flow.tp_dst;

    >         +        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;

    >         +        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;

    >         +

    >         +        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;

    >         +        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;

    >         +        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;

    >         +        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff;

    >         +

    >         +        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_TCP, &tcp_spec, &tcp_mask);

    >         +

    >         +        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */

    >         +        if (ipv4_next_proto_mask) {

    >         +            *ipv4_next_proto_mask = 0;

    >         +        }

    >         +    }

    >         +    ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_END, NULL, NULL);

    >         +

    >         +    struct rte_flow_action_mark mark;

    >         +    if (actions_len) {

    >         +        mark.id = info->flow_mark;

    >         +        ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_MARK, &mark);

    >         +    } else {

    >         +        ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_DROP, NULL);

    >         +        VLOG_INFO("no action given; drop pkts in hardware\n");

    >         +    }

    >         +    ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_END, NULL);

    >         +

    >         +    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,

    >         +                           actions.actions, &error);

    >         +    if (!flow) {

    >         +        VLOG_ERR("rte flow create error: %u : message : %s\n",

    >         +                 error.type, error.message);

    >         +        goto err;

    >         +    }

    >         +    add_ufid_dpdk_flow_mapping(ufid, flow);

    >         +    VLOG_INFO("installed flow %p by ufid "UUID_FMT"\n",

    >         +              flow, UUID_ARGS((struct uuid *)ufid));

    >         +

    >         +    return 0;

    >         +

    >         +err:

    >         +    return -1;

    >         +}

    >         +

    >         +/*

    >         + * Validate for later rte flow offload creation. If any unsupported

    >         + * flows are specified, return -1.

    >         + */

    >         +static int

    >         +netdev_dpdk_validate_flow(const struct match *match)

    >         +{

    >         +    struct match match_zero_wc;

    >         +

    >         +    /* Create a wc-zeroed version of flow */

    >         +    match_init(&match_zero_wc, &match->flow, &match->wc);

    >         +

    >         +#define CHECK_NONZERO(addr, size)   do {    \

    >         +    uint8_t *padr = (uint8_t *)(addr);      \

    >         +    int i;                                  \

    >         +    for (i = 0; i < (size); i++) {          \

    >         +        if (padr[i] != 0) {                 \

    >         +            goto err;                       \

    >         +        }                                   \

    >         +    }                                       \

    >         +} while (0)

    >         +

    >         +#define CHECK_NONZEROSIMPLE(var)    do {    \

    > 

    > CHECK_NONZERO ?

    
    Yep, it's better (as it's shorter; also, SIMPLE is too vague).
    
    > 

    >         +    if ((var) != 0) {                       \

    >         +        goto err;                           \

    >         +    }                                       \

    >         +} while (0)

    >         +

    >         +    CHECK_NONZERO(&match_zero_wc.flow.tunnel, sizeof(match_zero_wc.flow.tunnel));

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.metadata);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.skb_priority);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.pkt_mark);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.dp_hash);

    >         +

    >         +    /* recirc id must be zero */

    >         +    CHECK_NONZEROSIMPLE(match_zero_wc.flow.recirc_id);

    >         +

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_state);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_zone);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_mark);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.hi);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.lo);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_nw_proto);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_src);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_dst);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.conj_id);

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.actset_output);

    >         +

    >         +    /* unsupported L2 */

    >         +    CHECK_NONZERO(&match->wc.masks.mpls_lse,

    >         +                  sizeof(match_zero_wc.flow.mpls_lse) / sizeof(ovs_be32));

    >         +

    >         +    /* unsupported L3 */

    >         +    CHECK_NONZERO(&match->wc.masks.ipv6_src, sizeof(struct in6_addr));

    >         +    CHECK_NONZERO(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr));

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.ipv6_label);

    >         +    CHECK_NONZERO(&match->wc.masks.nd_target, sizeof(struct in6_addr));

    >         +    CHECK_NONZERO(&match->wc.masks.arp_sha, sizeof(struct eth_addr));

    >         +    CHECK_NONZERO(&match->wc.masks.arp_tha, sizeof(struct eth_addr));

    > 

    > CHECK_NONZERO_BYTES ?

    
    Again, better. Thanks!
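
With the renaming suggested above, the two helpers would read roughly as
follows (a sketch; only the names change, the bodies stay as in the patch):

    /* Check that a range of bytes is all-zero. */
    #define CHECK_NONZERO_BYTES(addr, size) do {    \
        uint8_t *padr = (uint8_t *)(addr);          \
        int i;                                      \
        for (i = 0; i < (size); i++) {              \
            if (padr[i] != 0) {                     \
                goto err;                           \
            }                                       \
        }                                           \
    } while (0)

    /* Check that a single scalar field is zero. */
    #define CHECK_NONZERO(var)      do {            \
        if ((var) != 0) {                           \
            goto err;                               \
        }                                           \
    } while (0)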
    
    >         +

    >         +    /* If fragmented, then don't HW accelerate - for now */

    >         +    CHECK_NONZEROSIMPLE(match_zero_wc.flow.nw_frag);

    >         +

    >         +    /* unsupported L4 */

    >         +    CHECK_NONZEROSIMPLE(match->wc.masks.igmp_group_ip4);

    >         +

    >         +    return 0;

    >         +

    >         +err:

    >         +    VLOG_INFO("Cannot HW accelerate this flow");

    >         +    return -1;

    >         +}

    >         +

    >         +static int

    >         +netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,

    >         +                             const ovs_u128 *ufid,

    >         +                             struct rte_flow *rte_flow)

    >         +{

    >         +    struct rte_flow_error error;

    >         +    int ret;

    >         +

    >         +    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);

    >         +    if (ret == 0) {

    >         +        del_ufid_dpdk_flow_mapping(ufid);

    >         +        VLOG_INFO("removed rte flow %p associated with ufid " UUID_FMT "\n",

    >         +                  rte_flow, UUID_ARGS((struct uuid *)ufid));

    >         +    } else {

    >         +        VLOG_ERR("rte flow destroy error: %u : message : %s\n",

    >         +                 error.type, error.message);

    >         +    }

    >         +

    >         +    return ret;

    >         +}

    >         +

    >         +static int

    >         +netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,

    >         +                     struct nlattr *actions, size_t actions_len,

    >         +                     const ovs_u128 *ufid, struct offload_info *info,

    >         +                     struct dpif_flow_stats *stats OVS_UNUSED)

    >         +{

    >         +    struct rte_flow *rte_flow;

    >         +    int ret;

    >         +

    >         +    /*

    >         +     * If an old rte_flow exists, it means it's a flow modification.

    >         +     * Here destroy the old rte flow first before adding a new one.

    >         +     */

    >         +    rte_flow = get_rte_flow_by_ufid(ufid);

    >         +    if (rte_flow) {

    >         +        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),

    >         +                                           ufid, rte_flow);

    >         +        if (ret < 0)

    >         +            return ret;

    >         +    }

    >         +

    >         +    ret = netdev_dpdk_validate_flow(match);

    >         +    if (ret < 0) {

    >         +        return ret;

    >         +    }

    >         +

    >         +    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,

    >         +                                            actions_len, ufid, info);

    >         +}

    >         +

    > 

    > Would you mind adding some comments for below NULL elements ?

    
    Like which method it's pointing to?

[Darrell]
exactly
    
    	--yliu
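
For illustration, the annotated version might look like this (a sketch only;
the comments assume the hook ordering flow_flush, flow_dump_create,
flow_dump_destroy, flow_dump_next, flow_put, flow_get, flow_del,
init_flow_api behind the existing NO_OFFLOAD_API, so treat the names as an
assumption rather than the final wording):

    #define DPDK_FLOW_OFFLOAD_API                             \
        NULL,                   /* flow_flush */              \
        NULL,                   /* flow_dump_create */        \
        NULL,                   /* flow_dump_destroy */       \
        NULL,                   /* flow_dump_next */          \
        netdev_dpdk_flow_put,   /* flow_put */                \
        NULL,                   /* flow_get */                \
        NULL,                   /* flow_del */                \
        NULL                    /* init_flow_api */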
    > 

    > 

    >         +#define DPDK_FLOW_OFFLOAD_API                                 \

    >         +    NULL,                                                     \

    >         +    NULL,                                                     \

    >         +    NULL,                                                     \

    >         +    NULL,                                                     \

    >         +    netdev_dpdk_flow_put,                                     \

    >         +    NULL,                                                     \

    >         +    NULL,                                                     \

    >         +    NULL

    >         +

    >         +

    >          #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \

    >                                    SET_CONFIG, SET_TX_MULTIQ, SEND,    \

    >                                    GET_CARRIER, GET_STATS,             \

    >         @@ -3470,7 +3853,7 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)

    >              RXQ_RECV,                                                 \

    >              NULL,                       /* rx_wait */                 \

    >              NULL,                       /* rxq_drain */               \

    >         -    NO_OFFLOAD_API                                            \

    >         +    DPDK_FLOW_OFFLOAD_API                                     \

    >          }

    >          

    >          static const struct netdev_class dpdk_class =

    >         -- 

    >         2.7.4

    >         

    >         

    >     

    >     

    > 

    > _______________________________________________

    > dev mailing list

    > dev@openvswitch.org

    > https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwIBAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=dfWCL5vBH3eyoegyoQj-Ue82sTh5Z8aC0rJoUlNTPag&s=F90aGyXQASuSFbqM4N5JSxXju2QooA3333vvKNgYLmw&e=
Yuanhan Liu Aug. 30, 2017, 2:33 a.m. UTC | #4
On Wed, Aug 30, 2017 at 02:02:23AM +0000, Darrell Ball wrote:
> 
>     >         +#define MAX_RTE_FLOW_ITEMS      100
>     >         +#define MAX_RTE_FLOW_ACTIONS    100
>     > 
>     > I guess these are temporary
>     
>     Yes, the hardcoded number is really hacky.
>     
>     > Do we need to do a rte query during initialization ?
>     
>     query on what?
> 
> [Darrell]
> I mean somehow the max hardware resources available at
> dev initialization time ? I realize this is non-trivial overall.

I see your point now. Even so, I don't think it's needed. I'm also not
aware of any such limitations (say, on how many patterns are supported).
I think we just add as many patterns as we can and let the driver figure
out the rest. If the flow creation fails, we skip the hw offload, with an
error message provided.
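
For reference, DPDK also provides rte_flow_validate(), which asks the PMD
whether a rule could be created without actually programming it; a minimal
sketch of that "let the driver decide" approach (the exact port_id type
varies across DPDK releases, and the variables are the ones built up in
the patch):

    struct rte_flow_error error;
    int ret;

    ret = rte_flow_validate(dev->port_id, &flow_attr,
                            patterns.items, actions.actions, &error);
    if (ret) {
        /* The PMD cannot handle this rule; skip the HW offload and
         * leave the flow on the software datapath. */
        VLOG_INFO("flow not offloadable: %s",
                  error.message ? error.message : "(no details)");
    }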
 
>     > static (inline) function maybe ?
>     
>     Indeed. I'm not a big fan of macro like this. Let me turn it to function
>     next time. I see no reason to make it inline (explicitly) though: it's
>     not in data path. Moreover, it's likely it will be inlined implicitly by
>     compiler.
> 
> [Darrell]
> I put ‘(inline)’ in parentheses because we would never specify it explicitly,
> since gcc would likely inline anyways. 

I see. Sorry for misunderstanding.

	--yliu
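
The macro-to-function conversion mentioned above could look roughly like
this (a sketch using the flow_patterns structure and MAX_RTE_FLOW_ITEMS
from the patch; the actions array would get the same treatment):

    static int
    add_flow_pattern(struct flow_patterns *patterns,
                     enum rte_flow_item_type type,
                     const void *spec, const void *mask)
    {
        if (patterns->cnt >= MAX_RTE_FLOW_ITEMS) {
            VLOG_ERR("too many rte flow patterns (> %d)", MAX_RTE_FLOW_ITEMS);
            return -1;
        }

        patterns->items[patterns->cnt].type = type;
        patterns->items[patterns->cnt].spec = spec;
        patterns->items[patterns->cnt].mask = mask;
        patterns->items[patterns->cnt].last = NULL;
        patterns->cnt++;

        return 0;
    }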
Darrell Ball Sept. 1, 2017, 10:38 p.m. UTC | #5
On 8/29/17, 7:33 PM, "Yuanhan Liu" <yliu@fridaylinux.org> wrote:

    On Wed, Aug 30, 2017 at 02:02:23AM +0000, Darrell Ball wrote:
    > 

    >     >         +#define MAX_RTE_FLOW_ITEMS      100

    >     >         +#define MAX_RTE_FLOW_ACTIONS    100

    >     > 

    >     > I guess these are temporary

    >     

    >     Yes, the hardcoded number is really hacky.

    >     

    >     > Do we need to do a rte query during initialization ?

    >     

    >     query on what?

    > 

    > [Darrell]

    > I mean somehow the max hardware resources available at

    > dev initialization time ? I realize this is non-trivial overall.

    
    I see you point then. I don't think it's needed then. I'm also not
    aware of there are such limitations existed (say, how many patterns
    are supported).  I think we just add patterns as many as we can and
    let the driver to figure out the rest. If the flow creation is failed,
    we skip the hw offload, with an error message provided.

[Darrell]
I understand the present intention.
But for future enhancements, maybe it would be good to display the max capability/capacity and
remaining capacity to the user in some way.
This brings back another discussion point: having user specification of HWOL flows
is starting to look more useful, as it helps the queue action issue and HWOL
capacity planning/predictability for high value flows.
     
    >     > static (inline) function maybe ?

    >     

    >     Indeed. I'm not a big fan of macro like this. Let me turn it to function

    >     next time. I see no reason to make it inline (explicitly) though: it's

    >     not in data path. Moreover, it's likely it will be inlined implicitly by

    >     compiler.

    > 

    > [Darrell]

    > I put ‘(inline)’ in parentheses because we would never specify it explicitly,

    > since gcc would likely inline anyways. 

    
    I see. Sorry for misunderstanding.
    
    	--yliu
Yuanhan Liu Sept. 5, 2017, 8:30 a.m. UTC | #6
On Fri, Sep 01, 2017 at 10:38:37PM +0000, Darrell Ball wrote:
> 
> 
> On 8/29/17, 7:33 PM, "Yuanhan Liu" <yliu@fridaylinux.org> wrote:
> 
>     On Wed, Aug 30, 2017 at 02:02:23AM +0000, Darrell Ball wrote:
>     > 
>     >     >         +#define MAX_RTE_FLOW_ITEMS      100
>     >     >         +#define MAX_RTE_FLOW_ACTIONS    100
>     >     > 
>     >     > I guess these are temporary
>     >     
>     >     Yes, the hardcoded number is really hacky.
>     >     
>     >     > Do we need to do a rte query during initialization ?
>     >     
>     >     query on what?
>     > 
>     > [Darrell]
>     > I mean somehow the max hardware resources available at
>     > dev initialization time ? I realize this is non-trivial overall.
>     
>     I see you point then. I don't think it's needed then. I'm also not
>     aware of there are such limitations existed (say, how many patterns
>     are supported).  I think we just add patterns as many as we can and
>     let the driver to figure out the rest. If the flow creation is failed,
>     we skip the hw offload, with an error message provided.
> 
> [Darrell]
> I understand the present intention.
> But for future enhancements, maybe it would be good to display the max capability/capacity and
> remaining capacity to the user in some way.

Agreed, and that's also what I have in mind. It's some work in DPDK, though.

> This brings back another discussion point: having user specification of HWOL flows
> is starting to look more useful,

Are you introducing some new (CLI) interfaces? Could you give a bit more
detail here?

> as it helps the queue action issue and HWOL
> capacity planning/predictability for high value flows.

Yes, I would think so.

	--yliu
>      
>     >     > static (inline) function maybe ?
>     >     
>     >     Indeed. I'm not a big fan of macro like this. Let me turn it to function
>     >     next time. I see no reason to make it inline (explicitly) though: it's
>     >     not in data path. Moreover, it's likely it will be inlined implicitly by
>     >     compiler.
>     > 
>     > [Darrell]
>     > I put ‘(inline)’ in parentheses because we would never specify it explicitly,
>     > since gcc would likely inline anyways. 
>     
>     I see. Sorry for misunderstanding.
>     
>     	--yliu
>     
> 
> 
>
Darrell Ball Sept. 6, 2017, 2:19 a.m. UTC | #7
On 9/5/17, 1:30 AM, "Yuanhan Liu" <yliu@fridaylinux.org> wrote:

    On Fri, Sep 01, 2017 at 10:38:37PM +0000, Darrell Ball wrote:
    > 

    > 

    > On 8/29/17, 7:33 PM, "Yuanhan Liu" <yliu@fridaylinux.org> wrote:

    > 

    >     On Wed, Aug 30, 2017 at 02:02:23AM +0000, Darrell Ball wrote:

    >     > 

    >     >     >         +#define MAX_RTE_FLOW_ITEMS      100

    >     >     >         +#define MAX_RTE_FLOW_ACTIONS    100

    >     >     > 

    >     >     > I guess these are temporary

    >     >     

    >     >     Yes, the hardcoded number is really hacky.

    >     >     

    >     >     > Do we need to do a rte query during initialization ?

    >     >     

    >     >     query on what?

    >     > 

    >     > [Darrell]

    >     > I mean somehow the max hardware resources available at

    >     > dev initialization time ? I realize this is non-trivial overall.

    >     

    >     I see you point then. I don't think it's needed then. I'm also not

    >     aware of there are such limitations existed (say, how many patterns

    >     are supported).  I think we just add patterns as many as we can and

    >     let the driver to figure out the rest. If the flow creation is failed,

    >     we skip the hw offload, with an error message provided.

    > 

    > [Darrell]

    > I understand the present intention.

    > But for future enhancements, maybe it would be good to display the max capability/capacity and

    > remaining capacity to the user in some way.

    
    Agreed, and that's also what in my mind. It's some work in DPDK though.
    
    > This brings back another discussion point: having user specification of HWOL flows

    > is starting to look more useful,

    
    Are you introducing some new (CLI) interfaces? Could you give a bit more
    detailes here?

[Darrell] 
Maybe something wild like this, with a special dp-only action ‘in_queue’ or ‘hwol_in_queue’?
ovs-appctl dpctl/mod-flow "in_port(1),eth(),eth_type(0x800),ipv4(src=1.1.1.1,dst=2.2.2.2)" in_queue(3)

It feels weird to do this as a full OpenFlow extension since it is so HW-centric, but maybe that is
a yet wilder possibility. It would be better to fix the NICs/drivers than to go down this path.

    
    > as it helps the queue action issue and HWOL

    > capacity planning/predictability for high value flows.

    
    Yes, I would think so.
    
    	--yliu
    >      

    >     >     > static (inline) function maybe ?

    >     >     

    >     >     Indeed. I'm not a big fan of macro like this. Let me turn it to function

    >     >     next time. I see no reason to make it inline (explicitly) though: it's

    >     >     not in data path. Moreover, it's likely it will be inlined implicitly by

    >     >     compiler.

    >     > 

    >     > [Darrell]

    >     > I put ‘(inline)’ in parentheses because we would never specify it explicitly,

    >     > since gcc would likely inline anyways. 

    >     

    >     I see. Sorry for misunderstanding.

    >     

    >     	--yliu

    >     

    > 

    > 

    >
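
To make the translation in the patch below concrete: a megaflow such as
"eth_type(0x0800),ipv4(proto=6),tcp(dst=80)" with a mark action would end up
as roughly the following rte_flow rule (a hand-written sketch, not output of
the code; it assumes <rte_flow.h> and <netinet/in.h>, and the IPv4 proto mask
is cleared because the TCP item already implies it):

    struct rte_flow_item_ipv4 ipv4_spec = { .hdr.next_proto_id = IPPROTO_TCP };
    struct rte_flow_item_ipv4 ipv4_mask = { .hdr.next_proto_id = 0 };
    struct rte_flow_item_tcp  tcp_spec  = { .hdr.dst_port = htons(80) };
    struct rte_flow_item_tcp  tcp_mask  = { .hdr.dst_port = htons(0xffff) };
    struct rte_flow_action_mark mark    = { .id = 42 }; /* info->flow_mark in the patch */

    struct rte_flow_item pattern[] = {
        { .type = RTE_FLOW_ITEM_TYPE_ETH },   /* match-any L2 workaround */
        { .type = RTE_FLOW_ITEM_TYPE_IPV4,
          .spec = &ipv4_spec, .mask = &ipv4_mask },
        { .type = RTE_FLOW_ITEM_TYPE_TCP,
          .spec = &tcp_spec,  .mask = &tcp_mask },
        { .type = RTE_FLOW_ITEM_TYPE_END },
    };
    struct rte_flow_action actions[] = {
        { .type = RTE_FLOW_ACTION_TYPE_MARK, .conf = &mark },
        { .type = RTE_FLOW_ACTION_TYPE_END },
    };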
diff mbox

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 15fef1e..8089da8 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -58,6 +58,7 @@ 
 #include "smap.h"
 #include "sset.h"
 #include "unaligned.h"
+#include "uuid.h"
 #include "timeval.h"
 #include "unixctl.h"
 
@@ -3398,6 +3399,388 @@  get_rte_flow_by_ufid(const ovs_u128 *ufid)
 }
 
 
+#define MAX_RTE_FLOW_ITEMS      100
+#define MAX_RTE_FLOW_ACTIONS    100
+
+static int
+netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
+                                 const struct match *match,
+                                 struct nlattr *nl_actions OVS_UNUSED,
+                                 size_t actions_len,
+                                 const ovs_u128 *ufid,
+                                 struct offload_info *info)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+    const struct rte_flow_attr flow_attr = {
+        .group = 0,
+        .priority = 0,
+        .ingress = 1,
+        .egress = 0
+    };
+    struct flow_patterns {
+        struct rte_flow_item items[MAX_RTE_FLOW_ITEMS];
+        uint32_t cnt;
+    } patterns;
+    struct flow_actions {
+        struct rte_flow_action actions[MAX_RTE_FLOW_ACTIONS];
+        uint32_t cnt;
+    } actions;
+    struct rte_flow *flow;
+    struct rte_flow_error error;
+
+    uint8_t *ipv4_next_proto_mask = NULL;
+
+#define ADD_FLOW_PATTERN(type_, spec_, mask_)  do { \
+    patterns.items[patterns.cnt].type = type_;      \
+    patterns.items[patterns.cnt].spec = spec_;      \
+    patterns.items[patterns.cnt].mask = mask_;      \
+    patterns.items[patterns.cnt].last = NULL;       \
+    if (patterns.cnt < MAX_RTE_FLOW_ITEMS) {        \
+        patterns.cnt++;                             \
+    } else {                                        \
+        VLOG_ERR("too many rte flow patterns (> %d)", MAX_RTE_FLOW_ITEMS);  \
+        goto err;                                   \
+    }                                               \
+} while (0)
+
+#define ADD_FLOW_ACTION(type_, conf_)         do {  \
+    actions.actions[actions.cnt].type = type_;      \
+    actions.actions[actions.cnt].conf = conf_;      \
+    actions.cnt++;                                  \
+    ovs_assert(actions.cnt < MAX_RTE_FLOW_ACTIONS); \
+} while (0)
+
+    patterns.cnt = 0;
+    actions.cnt  = 0;
+
+    /* Eth */
+    struct rte_flow_item_eth eth_spec;
+    struct rte_flow_item_eth eth_mask;
+    memset(&eth_mask, 0, sizeof(eth_mask));
+    if (match->wc.masks.dl_src.be16[0] ||
+        match->wc.masks.dl_src.be16[1] ||
+        match->wc.masks.dl_src.be16[2] ||
+        match->wc.masks.dl_dst.be16[0] ||
+        match->wc.masks.dl_dst.be16[1] ||
+        match->wc.masks.dl_dst.be16[2]) {
+        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst));
+        rte_memcpy(&eth_spec.src, &match->flow.dl_src, sizeof(eth_spec.src));
+        eth_spec.type = match->flow.dl_type;
+
+        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst, sizeof(eth_mask.dst));
+        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src, sizeof(eth_mask.src));
+        eth_mask.type = match->wc.masks.dl_type;
+
+        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, &eth_spec, &eth_mask);
+    } else {
+        /*
+         * If user specifies a flow (like UDP flow) without L2 patterns,
+         * OVS will at least set the dl_type. Normally, it's enough to
+         * create an eth pattern just with it. Unfortunately, some Intel
+         * NICs (such as the XL710) don't support that. Below is a workaround,
+         * which simply matches any L2 pkts.
+         */
+        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ETH, &eth_spec, &eth_mask);
+    }
+
+    /* VLAN */
+    struct rte_flow_item_vlan vlan_spec;
+    struct rte_flow_item_vlan vlan_mask;
+    memset(&vlan_mask, 0, sizeof(vlan_mask));
+    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
+        vlan_spec.tci  = match->flow.vlans[0].tci;
+        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
+
+        /* match any protocols */
+        vlan_mask.tpid = 0;
+
+        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_VLAN, &vlan_spec, &vlan_mask);
+    }
+
+    /* IP v4 */
+    uint8_t proto = 0;
+    struct rte_flow_item_ipv4 ipv4_spec;
+    struct rte_flow_item_ipv4 ipv4_mask;
+    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
+    if ((match->flow.dl_type == ntohs(ETH_TYPE_IP)) &&
+        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
+         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
+         match->wc.masks.nw_proto)) {
+        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
+        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
+        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
+        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
+        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
+
+        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
+        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
+        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
+        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
+        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
+
+        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_IPV4, &ipv4_spec, &ipv4_mask);
+
+        /* Save proto for L4 protocol setup */
+        proto = ipv4_spec.hdr.next_proto_id & ipv4_mask.hdr.next_proto_id;
+
+        /* Remember proto mask address for later modification */
+        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
+    }
+
+    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
+        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
+        (match->wc.masks.tp_src ||
+         match->wc.masks.tp_dst ||
+         match->wc.masks.tcp_flags)) {
+        VLOG_INFO("L4 Protocol (%u) not supported", proto);
+        goto err;
+    }
+
+    struct rte_flow_item_udp udp_spec;
+    struct rte_flow_item_udp udp_mask;
+    memset(&udp_mask, 0, sizeof(udp_mask));
+    if ((proto == IPPROTO_UDP) &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        udp_spec.hdr.src_port = match->flow.tp_src;
+        udp_spec.hdr.dst_port = match->flow.tp_dst;
+
+        udp_mask.hdr.src_port = match->wc.masks.tp_src;
+        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
+
+        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_UDP, &udp_spec, &udp_mask);
+
+        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+
+    struct rte_flow_item_sctp sctp_spec;
+    struct rte_flow_item_sctp sctp_mask;
+    memset(&sctp_mask, 0, sizeof(sctp_mask));
+    if ((proto == IPPROTO_SCTP) &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        sctp_spec.hdr.src_port = match->flow.tp_src;
+        sctp_spec.hdr.dst_port = match->flow.tp_dst;
+
+        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
+        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
+
+        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_SCTP, &sctp_spec, &sctp_mask);
+
+        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+
+    struct rte_flow_item_icmp icmp_spec;
+    struct rte_flow_item_icmp icmp_mask;
+    memset(&icmp_mask, 0, sizeof(icmp_mask));
+    if ((proto == IPPROTO_ICMP) &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
+        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
+
+        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
+        icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst);
+
+        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_ICMP, &icmp_spec, &icmp_mask);
+
+        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+
+    struct rte_flow_item_tcp tcp_spec;
+    struct rte_flow_item_tcp tcp_mask;
+    memset(&tcp_mask, 0, sizeof(tcp_mask));
+    if ((proto == IPPROTO_TCP) &&
+        (match->wc.masks.tp_src ||
+         match->wc.masks.tp_dst ||
+         match->wc.masks.tcp_flags)) {
+        tcp_spec.hdr.src_port  = match->flow.tp_src;
+        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
+        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
+        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
+
+        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
+        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
+        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
+        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff;
+
+        ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_TCP, &tcp_spec, &tcp_mask);
+
+        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+    ADD_FLOW_PATTERN(RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
+
+    struct rte_flow_action_mark mark;
+    if (actions_len) {
+        mark.id = info->flow_mark;
+        ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_MARK, &mark);
+    } else {
+        ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_DROP, NULL);
+        VLOG_INFO("no action given; drop pkts in hardware\n");
+    }
+    ADD_FLOW_ACTION(RTE_FLOW_ACTION_TYPE_END, NULL);
+
+    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
+                           actions.actions, &error);
+    if (!flow) {
+        VLOG_ERR("rte flow creat error: %u : message : %s\n",
+                 error.type, error.message);
+        goto err;
+    }
+    add_ufid_dpdk_flow_mapping(ufid, flow);
+    VLOG_INFO("installed flow %p by ufid "UUID_FMT"\n",
+              flow, UUID_ARGS((struct uuid *)ufid));
+
+    return 0;
+
+err:
+    return -1;
+}
+
+/*
+ * Validate for later rte flow offload creation. If any unsupported
+ * flows are specified, return -1.
+ */
+static int
+netdev_dpdk_validate_flow(const struct match *match)
+{
+    struct match match_zero_wc;
+
+    /* Create a wc-zeroed version of flow */
+    match_init(&match_zero_wc, &match->flow, &match->wc);
+
+#define CHECK_NONZERO(addr, size)   do {    \
+    uint8_t *padr = (uint8_t *)(addr);      \
+    int i;                                  \
+    for (i = 0; i < (size); i++) {          \
+        if (padr[i] != 0) {                 \
+            goto err;                       \
+        }                                   \
+    }                                       \
+} while (0)
+
+#define CHECK_NONZEROSIMPLE(var)    do {    \
+    if ((var) != 0) {                       \
+        goto err;                           \
+    }                                       \
+} while (0)
+
+    CHECK_NONZERO(&match_zero_wc.flow.tunnel, sizeof(match_zero_wc.flow.tunnel));
+    CHECK_NONZEROSIMPLE(match->wc.masks.metadata);
+    CHECK_NONZEROSIMPLE(match->wc.masks.skb_priority);
+    CHECK_NONZEROSIMPLE(match->wc.masks.pkt_mark);
+    CHECK_NONZEROSIMPLE(match->wc.masks.dp_hash);
+
+    /* recirc id must be zero */
+    CHECK_NONZEROSIMPLE(match_zero_wc.flow.recirc_id);
+
+    CHECK_NONZEROSIMPLE(match->wc.masks.ct_state);
+    CHECK_NONZEROSIMPLE(match->wc.masks.ct_zone);
+    CHECK_NONZEROSIMPLE(match->wc.masks.ct_mark);
+    CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.hi);
+    CHECK_NONZEROSIMPLE(match->wc.masks.ct_label.u64.lo);
+    CHECK_NONZEROSIMPLE(match->wc.masks.ct_nw_proto);
+    CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_src);
+    CHECK_NONZEROSIMPLE(match->wc.masks.ct_tp_dst);
+    CHECK_NONZEROSIMPLE(match->wc.masks.conj_id);
+    CHECK_NONZEROSIMPLE(match->wc.masks.actset_output);
+
+    /* unsupported L2 */
+    CHECK_NONZERO(&match->wc.masks.mpls_lse,
+                  sizeof(match_zero_wc.flow.mpls_lse) / sizeof(ovs_be32));
+
+    /* unsupported L3 */
+    CHECK_NONZERO(&match->wc.masks.ipv6_src, sizeof(struct in6_addr));
+    CHECK_NONZERO(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr));
+    CHECK_NONZEROSIMPLE(match->wc.masks.ipv6_label);
+    CHECK_NONZERO(&match->wc.masks.nd_target, sizeof(struct in6_addr));
+    CHECK_NONZERO(&match->wc.masks.arp_sha, sizeof(struct eth_addr));
+    CHECK_NONZERO(&match->wc.masks.arp_tha, sizeof(struct eth_addr));
+
+    /* If fragmented, then don't HW accelerate - for now */
+    CHECK_NONZEROSIMPLE(match_zero_wc.flow.nw_frag);
+
+    /* unsupported L4 */
+    CHECK_NONZEROSIMPLE(match->wc.masks.igmp_group_ip4);
+
+    return 0;
+
+err:
+    VLOG_INFO("Cannot HW accelerate this flow");
+    return -1;
+}
+
+static int
+netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
+                             const ovs_u128 *ufid,
+                             struct rte_flow *rte_flow)
+{
+    struct rte_flow_error error;
+    int ret;
+
+    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
+    if (ret == 0) {
+        del_ufid_dpdk_flow_mapping(ufid);
+        VLOG_INFO("removed rte flow %p associated with ufid " UUID_FMT "\n",
+                  rte_flow, UUID_ARGS((struct uuid *)ufid));
+    } else {
+        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
+                 error.type, error.message);
+    }
+
+    return ret;
+}
+
+static int
+netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
+                     struct nlattr *actions, size_t actions_len,
+                     const ovs_u128 *ufid, struct offload_info *info,
+                     struct dpif_flow_stats *stats OVS_UNUSED)
+{
+    struct rte_flow *rte_flow;
+    int ret;
+
+    /*
+     * If an old rte_flow exists, it means it's a flow modification.
+     * Here destroy the old rte flow first before adding a new one.
+     */
+    rte_flow = get_rte_flow_by_ufid(ufid);
+    if (rte_flow) {
+        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
+                                           ufid, rte_flow);
+        if (ret < 0)
+            return ret;
+    }
+
+    ret = netdev_dpdk_validate_flow(match);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
+                                            actions_len, ufid, info);
+}
+
+#define DPDK_FLOW_OFFLOAD_API                                 \
+    NULL,                                                     \
+    NULL,                                                     \
+    NULL,                                                     \
+    NULL,                                                     \
+    netdev_dpdk_flow_put,                                     \
+    NULL,                                                     \
+    NULL,                                                     \
+    NULL
+
+
 #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
                           SET_CONFIG, SET_TX_MULTIQ, SEND,    \
                           GET_CARRIER, GET_STATS,             \
@@ -3470,7 +3853,7 @@  get_rte_flow_by_ufid(const ovs_u128 *ufid)
     RXQ_RECV,                                                 \
     NULL,                       /* rx_wait */                 \
     NULL,                       /* rxq_drain */               \
-    NO_OFFLOAD_API                                            \
+    DPDK_FLOW_OFFLOAD_API                                     \
 }
 
 static const struct netdev_class dpdk_class =