[ovs-dev,v3,4/9] netdev-dpdk: implement flow put with rte flow

Message ID 1506404199-23579-5-git-send-email-yliu@fridaylinux.org
State New
Headers show
Series
  • OVS-DPDK flow offload with rte_flow
Related show

Commit Message

Yuanhan Liu Sept. 26, 2017, 5:36 a.m.
From: Finn Christensen <fc@napatech.com>

The basic yet the major part of this patch is to translate the "match"
to rte flow patterns. And then, we create a rte flow with a MARK action.
Afterwards, all pkts matches the flow will have the mark id in the mbuf.

For any unsupported flows, such as MPLS, -1 is returned, meaning the
flow offload is failed and then skipped.

Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
Signed-off-by: Finn Christensen <fc@napatech.com>
Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
---

v3: - fix duplicate (and wrong) TOS assignment
    - zero the pattern spec as well
    - remove macros
    - add missing unsupported fields

v2: - convert some macros to functions
    - do not hardcode the max number of flow/action
    - fix L2 patterns for Intel nic
    - add comments for not implemented offload methods
---
 lib/netdev-dpdk.c | 441 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 440 insertions(+), 1 deletion(-)

Comments

Chandran, Sugesh Oct. 2, 2017, 4:28 p.m. | #1
Regards
_Sugesh


> -----Original Message-----
> From: Yuanhan Liu [mailto:yliu@fridaylinux.org]
> Sent: Tuesday, September 26, 2017 6:37 AM
> To: dev@openvswitch.org
> Cc: Finn Christensen <fc@napatech.com>; Darrell Ball <dball@vmware.com>;
> Chandran, Sugesh <sugesh.chandran@intel.com>; Simon Horman
> <simon.horman@netronome.com>; Yuanhan Liu <yliu@fridaylinux.org>
> Subject: [PATCH v3 4/9] netdev-dpdk: implement flow put with rte flow
> 
> From: Finn Christensen <fc@napatech.com>
> 
> The basic yet the major part of this patch is to translate the "match"
> to rte flow patterns. And then, we create a rte flow with a MARK action.
> Afterwards, all pkts matches the flow will have the mark id in the mbuf.
> 
> For any unsupported flows, such as MPLS, -1 is returned, meaning the flow
> offload is failed and then skipped.
> 
> Co-authored-by: Yuanhan Liu <yliu@fridaylinux.org>
> Signed-off-by: Finn Christensen <fc@napatech.com>
> Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
> ---
> 
> v3: - fix duplicate (and wrong) TOS assignment
>     - zero the pattern spec as well
>     - remove macros
>     - add missing unsupported fields
> 
> v2: - convert some macros to functions
>     - do not hardcode the max number of flow/action
>     - fix L2 patterns for Intel nic
>     - add comments for not implemented offload methods
> ---
>  lib/netdev-dpdk.c | 441
> +++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 440 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index 1be9131..525536a
> 100644
> --- a/lib/netdev-dpdk.c
> +++ b/lib/netdev-dpdk.c
> @@ -3404,6 +3404,445 @@ get_rte_flow_by_ufid(const ovs_u128 *ufid)  }
> 

[snip] 
> +
> +    /* unsupported L2 */
> +    if (!is_all_zero(&match->wc.masks.mpls_lse,
> +                     sizeof(match_zero_wc.flow.mpls_lse))) {
> +        goto err;
[Sugesh] As we discussed earlier, If we know the capability , we can avoid the whole translation itself.
Also it helps to avoid the flow install when the device is full and cannot take any more flow rules.

> +    }
> +
> +    /* unsupported L3 */
> +    if (match->wc.masks.ipv6_label ||
> --
> 2.7.4

Patch

diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 1be9131..525536a 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -3404,6 +3404,445 @@  get_rte_flow_by_ufid(const ovs_u128 *ufid)
 }
 
 
+struct flow_patterns {
+    struct rte_flow_item *items;
+    int cnt;
+    int max;
+};
+
+struct flow_actions {
+    struct rte_flow_action *actions;
+    int cnt;
+    int max;
+};
+
+static void
+add_flow_pattern(struct flow_patterns *patterns, enum rte_flow_item_type type,
+                 const void *spec, const void *mask)
+{
+    int cnt = patterns->cnt;
+
+    if (cnt == 0) {
+        patterns->max = 8;
+        patterns->items = xcalloc(patterns->max, sizeof(struct rte_flow_item));
+    } else if (cnt == patterns->max) {
+        patterns->max *= 2;
+        patterns->items = xrealloc(patterns->items, patterns->max *
+                                   sizeof(struct rte_flow_item));
+    }
+
+    patterns->items[cnt].type = type;
+    patterns->items[cnt].spec = spec;
+    patterns->items[cnt].mask = mask;
+    patterns->items[cnt].last = NULL;
+    patterns->cnt++;
+}
+
+static void
+add_flow_action(struct flow_actions *actions, enum rte_flow_action_type type,
+                const void *conf)
+{
+    int cnt = actions->cnt;
+
+    if (cnt == 0) {
+        actions->max = 8;
+        actions->actions = xcalloc(actions->max,
+                                   sizeof(struct rte_flow_action));
+    } else if (cnt == actions->max) {
+        actions->max *= 2;
+        actions->actions = xrealloc(actions->actions, actions->max *
+                                    sizeof(struct rte_flow_action));
+    }
+
+    actions->actions[cnt].type = type;
+    actions->actions[cnt].conf = conf;
+    actions->cnt++;
+}
+
+static int
+netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
+                                 const struct match *match,
+                                 struct nlattr *nl_actions OVS_UNUSED,
+                                 size_t actions_len OVS_UNUSED,
+                                 const ovs_u128 *ufid,
+                                 struct offload_info *info)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+    const struct rte_flow_attr flow_attr = {
+        .group = 0,
+        .priority = 0,
+        .ingress = 1,
+        .egress = 0
+    };
+    struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
+    struct flow_actions actions = { .actions = NULL, .cnt = 0 };
+    struct rte_flow *flow;
+    struct rte_flow_error error;
+    uint8_t *ipv4_next_proto_mask = NULL;
+    int ret = 0;
+
+    /* Eth */
+    struct rte_flow_item_eth eth_spec;
+    struct rte_flow_item_eth eth_mask;
+    memset(&eth_spec, 0, sizeof(eth_spec));
+    memset(&eth_mask, 0, sizeof(eth_mask));
+    if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
+        !eth_addr_is_zero(match->wc.masks.dl_dst)) {
+        rte_memcpy(&eth_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst));
+        rte_memcpy(&eth_spec.src, &match->flow.dl_src, sizeof(eth_spec.src));
+        eth_spec.type = match->flow.dl_type;
+
+        rte_memcpy(&eth_mask.dst, &match->wc.masks.dl_dst,
+                   sizeof(eth_mask.dst));
+        rte_memcpy(&eth_mask.src, &match->wc.masks.dl_src,
+                   sizeof(eth_mask.src));
+        eth_mask.type = match->wc.masks.dl_type;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
+                         &eth_spec, &eth_mask);
+    } else {
+        /*
+         * If user specifies a flow (like UDP flow) without L2 patterns,
+         * OVS will at least set the dl_type. Normally, it's enough to
+         * create an eth pattern just with it. Unluckily, some Intel's
+         * NIC (such as XL710) doesn't support that. Below is a workaround,
+         * which simply matches any L2 pkts.
+         */
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL, NULL);
+    }
+
+    /* VLAN */
+    struct rte_flow_item_vlan vlan_spec;
+    struct rte_flow_item_vlan vlan_mask;
+    memset(&vlan_spec, 0, sizeof(vlan_spec));
+    memset(&vlan_mask, 0, sizeof(vlan_mask));
+    if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
+        vlan_spec.tci  = match->flow.vlans[0].tci;
+        vlan_mask.tci  = match->wc.masks.vlans[0].tci;
+
+        /* match any protocols */
+        vlan_mask.tpid = 0;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
+                         &vlan_spec, &vlan_mask);
+    }
+
+    /* IP v4 */
+    uint8_t proto = 0;
+    struct rte_flow_item_ipv4 ipv4_spec;
+    struct rte_flow_item_ipv4 ipv4_mask;
+    memset(&ipv4_spec, 0, sizeof(ipv4_spec));
+    memset(&ipv4_mask, 0, sizeof(ipv4_mask));
+    if (match->flow.dl_type == ntohs(ETH_TYPE_IP) &&
+        (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
+         match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
+         match->wc.masks.nw_proto)) {
+        ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
+        ipv4_spec.hdr.time_to_live    = match->flow.nw_ttl;
+        ipv4_spec.hdr.next_proto_id   = match->flow.nw_proto;
+        ipv4_spec.hdr.src_addr        = match->flow.nw_src;
+        ipv4_spec.hdr.dst_addr        = match->flow.nw_dst;
+
+        ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
+        ipv4_mask.hdr.time_to_live    = match->wc.masks.nw_ttl;
+        ipv4_mask.hdr.next_proto_id   = match->wc.masks.nw_proto;
+        ipv4_mask.hdr.src_addr        = match->wc.masks.nw_src;
+        ipv4_mask.hdr.dst_addr        = match->wc.masks.nw_dst;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
+                         &ipv4_spec, &ipv4_mask);
+
+        /* Save proto for L4 protocol setup */
+        proto = ipv4_spec.hdr.next_proto_id & ipv4_mask.hdr.next_proto_id;
+
+        /* Remember proto mask address for later modification */
+        ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
+    }
+
+    if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP  &&
+        proto != IPPROTO_SCTP && proto != IPPROTO_TCP  &&
+        (match->wc.masks.tp_src ||
+         match->wc.masks.tp_dst ||
+         match->wc.masks.tcp_flags)) {
+        VLOG_INFO("L4 Protocol (%u) not supported", proto);
+        ret = -1;
+        goto out;
+    }
+
+    struct rte_flow_item_udp udp_spec;
+    struct rte_flow_item_udp udp_mask;
+    memset(&udp_spec, 0, sizeof(udp_spec));
+    memset(&udp_mask, 0, sizeof(udp_mask));
+    if (proto == IPPROTO_UDP &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        udp_spec.hdr.src_port = match->flow.tp_src;
+        udp_spec.hdr.dst_port = match->flow.tp_dst;
+
+        udp_mask.hdr.src_port = match->wc.masks.tp_src;
+        udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
+                         &udp_spec, &udp_mask);
+
+        /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+
+    struct rte_flow_item_sctp sctp_spec;
+    struct rte_flow_item_sctp sctp_mask;
+    memset(&sctp_spec, 0, sizeof(sctp_spec));
+    memset(&sctp_mask, 0, sizeof(sctp_mask));
+    if (proto == IPPROTO_SCTP &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        sctp_spec.hdr.src_port = match->flow.tp_src;
+        sctp_spec.hdr.dst_port = match->flow.tp_dst;
+
+        sctp_mask.hdr.src_port = match->wc.masks.tp_src;
+        sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
+                         &sctp_spec, &sctp_mask);
+
+        /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+
+    struct rte_flow_item_icmp icmp_spec;
+    struct rte_flow_item_icmp icmp_mask;
+    memset(&icmp_spec, 0, sizeof(icmp_spec));
+    memset(&icmp_mask, 0, sizeof(icmp_mask));
+    if (proto == IPPROTO_ICMP &&
+        (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+        icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
+        icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
+
+        icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
+        icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst);
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
+                         &icmp_spec, &icmp_mask);
+
+        /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+
+    struct rte_flow_item_tcp tcp_spec;
+    struct rte_flow_item_tcp tcp_mask;
+    memset(&tcp_spec, 0, sizeof(tcp_spec));
+    memset(&tcp_mask, 0, sizeof(tcp_mask));
+    if (proto == IPPROTO_TCP &&
+        (match->wc.masks.tp_src ||
+         match->wc.masks.tp_dst ||
+         match->wc.masks.tcp_flags)) {
+        tcp_spec.hdr.src_port  = match->flow.tp_src;
+        tcp_spec.hdr.dst_port  = match->flow.tp_dst;
+        tcp_spec.hdr.data_off  = ntohs(match->flow.tcp_flags) >> 8;
+        tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
+
+        tcp_mask.hdr.src_port  = match->wc.masks.tp_src;
+        tcp_mask.hdr.dst_port  = match->wc.masks.tp_dst;
+        tcp_mask.hdr.data_off  = ntohs(match->wc.masks.tcp_flags) >> 8;
+        tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff;
+
+        add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
+                         &tcp_spec, &tcp_mask);
+
+        /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */
+        if (ipv4_next_proto_mask) {
+            *ipv4_next_proto_mask = 0;
+        }
+    }
+    add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
+
+    struct rte_flow_action_mark mark;
+    mark.id = info->flow_mark;
+    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);
+    add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
+
+    flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
+                           actions.actions, &error);
+    if (!flow) {
+        VLOG_ERR("rte flow creat error: %u : message : %s\n",
+                 error.type, error.message);
+        ret = -1;
+        goto out;
+    }
+    add_ufid_dpdk_flow_mapping(ufid, flow);
+    VLOG_INFO("installed flow %p by ufid "UUID_FMT"\n",
+              flow, UUID_ARGS((struct uuid *)ufid));
+
+out:
+    free(patterns.items);
+    free(actions.actions);
+    return ret;
+}
+
+static bool
+is_all_zero(const void *addr, size_t n)
+{
+    size_t i = 0;
+    const uint8_t *p = (uint8_t *)addr;
+
+    for (i = 0; i < n; i++) {
+        if (p[i] != 0) {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+/*
+ * Check if any unsupported flow patterns are specified.
+ */
+static int
+netdev_dpdk_validate_flow(const struct match *match)
+{
+    struct match match_zero_wc;
+
+    /* Create a wc-zeroed version of flow */
+    match_init(&match_zero_wc, &match->flow, &match->wc);
+
+    if (!is_all_zero(&match_zero_wc.flow.tunnel,
+                     sizeof(match_zero_wc.flow.tunnel))) {
+        goto err;
+    }
+
+    if (match->wc.masks.metadata ||
+        match->wc.masks.skb_priority ||
+        match->wc.masks.pkt_mark ||
+        match->wc.masks.dp_hash) {
+        goto err;
+    }
+
+    /* recirc id must be zero */
+    if (match_zero_wc.flow.recirc_id) {
+        goto err;
+    }
+
+    if (match->wc.masks.ct_state ||
+        match->wc.masks.ct_nw_proto ||
+        match->wc.masks.ct_zone ||
+        match->wc.masks.ct_mark ||
+        match->wc.masks.ct_label.u64.hi ||
+        match->wc.masks.ct_label.u64.lo) {
+        goto err;
+    }
+
+    if (match->wc.masks.conj_id ||
+        match->wc.masks.actset_output) {
+        goto err;
+    }
+
+    /* unsupported L2 */
+    if (!is_all_zero(&match->wc.masks.mpls_lse,
+                     sizeof(match_zero_wc.flow.mpls_lse))) {
+        goto err;
+    }
+
+    /* unsupported L3 */
+    if (match->wc.masks.ipv6_label ||
+        match->wc.masks.ct_nw_src ||
+        match->wc.masks.ct_nw_dst ||
+        !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.nd_target, sizeof(struct in6_addr)) ||
+        !is_all_zero(&match->wc.masks.nsh, sizeof(struct flow_nsh)) ||
+        !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct eth_addr)) ||
+        !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct eth_addr))) {
+        goto err;
+    }
+
+    /* If fragmented, then don't HW accelerate - for now */
+    if (match_zero_wc.flow.nw_frag) {
+        goto err;
+    }
+
+    /* unsupported L4 */
+    if (match->wc.masks.igmp_group_ip4 ||
+        match->wc.masks.ct_tp_src ||
+        match->wc.masks.ct_tp_dst) {
+        goto err;
+    }
+
+    return 0;
+
+err:
+    VLOG_ERR("cannot HW accelerate this flow due to unsupported protocols");
+    return -1;
+}
+
+static int
+netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
+                             const ovs_u128 *ufid,
+                             struct rte_flow *rte_flow)
+{
+    struct rte_flow_error error;
+    int ret;
+
+    ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
+    if (ret == 0) {
+        del_ufid_dpdk_flow_mapping(ufid);
+        VLOG_INFO("removed rte flow %p associated with ufid " UUID_FMT "\n",
+                  rte_flow, UUID_ARGS((struct uuid *)ufid));
+    } else {
+        VLOG_ERR("rte flow destroy error: %u : message : %s\n",
+                 error.type, error.message);
+    }
+
+    return ret;
+}
+
+static int
+netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
+                     struct nlattr *actions, size_t actions_len,
+                     const ovs_u128 *ufid, struct offload_info *info,
+                     struct dpif_flow_stats *stats OVS_UNUSED)
+{
+    struct rte_flow *rte_flow;
+    int ret;
+
+    /*
+     * If an old rte_flow exists, it means it's a flow modification.
+     * Here destroy the old rte flow first before adding a new one.
+     */
+    rte_flow = get_rte_flow_by_ufid(ufid);
+    if (rte_flow) {
+        ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
+                                           ufid, rte_flow);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    ret = netdev_dpdk_validate_flow(match);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
+                                            actions_len, ufid, info);
+}
+
+#define DPDK_FLOW_OFFLOAD_API                                 \
+    NULL,                   /* flow_flush */                  \
+    NULL,                   /* flow_dump_create */            \
+    NULL,                   /* flow_dump_destroy */           \
+    NULL,                   /* flow_dump_next */              \
+    netdev_dpdk_flow_put,                                     \
+    NULL,                   /* flow_get */                    \
+    NULL,                   /* flow_del */                    \
+    NULL                    /* init_flow_api */
+
+
 #define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT,    \
                           SET_CONFIG, SET_TX_MULTIQ, SEND,    \
                           GET_CARRIER, GET_STATS,             \
@@ -3476,7 +3915,7 @@  get_rte_flow_by_ufid(const ovs_u128 *ufid)
     RXQ_RECV,                                                 \
     NULL,                       /* rx_wait */                 \
     NULL,                       /* rxq_drain */               \
-    NO_OFFLOAD_API                                            \
+    DPDK_FLOW_OFFLOAD_API                                     \
 }
 
 static const struct netdev_class dpdk_class =