@@ -38,7 +38,9 @@
+#include <rte_flow.h>
#include <rte_pci.h>
#include <rte_vhost.h>
#include <rte_version.h>
+#include "cmap.h"
#include "dirs.h"
#include "dp-packet.h"
#include "dpdk.h"
@@ -51,6 +53,7 @@
#include "openvswitch/list.h"
#include "openvswitch/ofp-print.h"
#include "openvswitch/vlog.h"
+#include "openvswitch/match.h"
#include "ovs-numa.h"
#include "ovs-thread.h"
#include "ovs-rcu.h"
@@ -60,6 +63,7 @@
#include "sset.h"
#include "unaligned.h"
#include "timeval.h"
+#include "uuid.h"
#include "unixctl.h"
enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
@@ -171,6 +175,17 @@ static const struct rte_eth_conf port_conf = {
};
/*
+ * A mapping from ufid to dpdk rte_flow.
+ */
+static struct cmap ufid_to_rte_flow = CMAP_INITIALIZER;
+
+struct ufid_to_rte_flow_data {
+ struct cmap_node node;
+ ovs_u128 ufid;
+ struct rte_flow *rte_flow;
+};
+
+/*
* These callbacks allow virtio-net devices to be added to vhost ports when
* configuration has been fully completed.
*/
@@ -3853,6 +3868,552 @@ unlock:
return err;
}
+
+/* Find the rte_flow associated with @ufid. */
+static struct rte_flow *
+ufid_to_rte_flow_find(const ovs_u128 *ufid)
+{
+ size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
+ struct ufid_to_rte_flow_data *data;
+
+ CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
+ if (ovs_u128_equals(*ufid, data->ufid)) {
+ return data->rte_flow;
+ }
+ }
+
+ return NULL;
+}
+
+static inline void
+ufid_to_rte_flow_associate(const ovs_u128 *ufid,
+                           struct rte_flow *rte_flow)
+{
+ size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
+ struct ufid_to_rte_flow_data *data = xzalloc(sizeof(*data));
+
+ /*
+ * We should not simply overwrite an existing rte flow.
+ * We should have deleted it first before re-adding it.
+     * Thus, if the following assert triggers, something is wrong:
+ * the rte_flow is not destroyed.
+ */
+ ovs_assert(ufid_to_rte_flow_find(ufid) == NULL);
+
+ data->ufid = *ufid;
+ data->rte_flow = rte_flow;
+
+ cmap_insert(&ufid_to_rte_flow,
+ CONST_CAST(struct cmap_node *, &data->node), hash);
+}
+
+static inline void
+ufid_to_rte_flow_disassociate(const ovs_u128 *ufid)
+{
+ size_t hash = hash_bytes(ufid, sizeof(*ufid), 0);
+ struct ufid_to_rte_flow_data *data;
+
+ CMAP_FOR_EACH_WITH_HASH (data, node, hash, &ufid_to_rte_flow) {
+ if (ovs_u128_equals(*ufid, data->ufid)) {
+ cmap_remove(&ufid_to_rte_flow,
+ CONST_CAST(struct cmap_node *, &data->node), hash);
+ free(data);
+ return;
+ }
+ }
+
+ VLOG_WARN("ufid "UUID_FMT" is not associated with an rte flow\n",
+ UUID_ARGS((struct uuid *)ufid));
+}
+
+/*
+ * To avoid individual xrealloc calls for each new element, a 'current_max'
+ * is used to keep track of the currently allocated number of elements. It
+ * starts at 8 and doubles on each subsequent xrealloc call.
+ */
+struct flow_patterns {
+ struct rte_flow_item *items;
+ int cnt;
+ int current_max;
+};
+
+struct flow_actions {
+ struct rte_flow_action *actions;
+ int cnt;
+ int current_max;
+};
+
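+/*
+ * Append a pattern item to 'patterns'.  The rte_flow_item only stores the
+ * 'spec' and 'mask' pointers, so the pointed-to data must stay valid until
+ * rte_flow_create() has returned, e.g.:
+ *
+ *     add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
+ *                      &eth_spec, &eth_mask);
+ */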
+static void
+add_flow_pattern(struct flow_patterns *patterns, enum rte_flow_item_type type,
+                 const void *spec, const void *mask)
+{
+ int cnt = patterns->cnt;
+
+ if (cnt == 0) {
+ patterns->current_max = 8;
+        patterns->items = xcalloc(patterns->current_max,
+                                  sizeof(struct rte_flow_item));
+ } else if (cnt == patterns->current_max) {
+ patterns->current_max *= 2;
+ patterns->items = xrealloc(patterns->items, patterns->current_max *
+ sizeof(struct rte_flow_item));
+ }
+
+ patterns->items[cnt].type = type;
+ patterns->items[cnt].spec = spec;
+ patterns->items[cnt].mask = mask;
+ patterns->items[cnt].last = NULL;
+ patterns->cnt++;
+}
+
+static void
+add_flow_action(struct flow_actions *actions, enum rte_flow_action_type type,
+ const void *conf)
+{
+ int cnt = actions->cnt;
+
+ if (cnt == 0) {
+ actions->current_max = 8;
+ actions->actions = xcalloc(actions->current_max,
+ sizeof(struct rte_flow_action));
+ } else if (cnt == actions->current_max) {
+ actions->current_max *= 2;
+ actions->actions = xrealloc(actions->actions, actions->current_max *
+ sizeof(struct rte_flow_action));
+ }
+
+ actions->actions[cnt].type = type;
+ actions->actions[cnt].conf = conf;
+ actions->cnt++;
+}
+
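+/*
+ * Build an RSS action that spreads matched packets across all of the
+ * device's Rx queues.  'queue[]' is a flexible array member, hence the
+ * oversized allocation; the caller must free the returned action once
+ * rte_flow_create() has returned.
+ */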
+static struct rte_flow_action_rss *
+add_flow_rss_action(struct flow_actions *actions, struct netdev *netdev)
+{
+ int i;
+ struct rte_flow_action_rss *rss;
+
+ rss = xmalloc(sizeof(*rss) + sizeof(uint16_t) * netdev->n_rxq);
+ /*
+ * Setting it to NULL will let the driver use the default RSS
+ * configuration we have set: &port_conf.rx_adv_conf.rss_conf.
+ */
+ rss->rss_conf = NULL;
+ rss->num = netdev->n_rxq;
+
+ for (i = 0; i < rss->num; i++) {
+ rss->queue[i] = i;
+ }
+
+ add_flow_action(actions, RTE_FLOW_ACTION_TYPE_RSS, rss);
+
+ return rss;
+}
+
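+/*
+ * Translate an OVS match into an rte_flow pattern list (ETH, optional
+ * VLAN, IPv4, and at most one of TCP/UDP/SCTP/ICMP), attach MARK and RSS
+ * actions, and install the resulting flow on the device.
+ */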
+static int
+netdev_dpdk_add_rte_flow_offload(struct netdev *netdev,
+ const struct match *match,
+ struct nlattr *nl_actions OVS_UNUSED,
+ size_t actions_len OVS_UNUSED,
+ const ovs_u128 *ufid,
+                                 struct offload_info *info)
+{
+ struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+ const struct rte_flow_attr flow_attr = {
+ .group = 0,
+ .priority = 0,
+ .ingress = 1,
+ .egress = 0
+ };
+ struct flow_patterns patterns = { .items = NULL, .cnt = 0 };
+ struct flow_actions actions = { .actions = NULL, .cnt = 0 };
+ struct rte_flow *flow;
+ struct rte_flow_error error;
+ uint8_t *ipv4_next_proto_mask = NULL;
+ int ret = 0;
+
+ /* Eth */
+ struct rte_flow_item_eth eth_spec;
+ struct rte_flow_item_eth eth_mask;
+ memset(ð_spec, 0, sizeof(eth_spec));
+ memset(ð_mask, 0, sizeof(eth_mask));
+ if (!eth_addr_is_zero(match->wc.masks.dl_src) ||
+ !eth_addr_is_zero(match->wc.masks.dl_dst)) {
+ rte_memcpy(ð_spec.dst, &match->flow.dl_dst, sizeof(eth_spec.dst));
+ rte_memcpy(ð_spec.src, &match->flow.dl_src, sizeof(eth_spec.src));
+ eth_spec.type = match->flow.dl_type;
+
+ rte_memcpy(ð_mask.dst, &match->wc.masks.dl_dst,
+ sizeof(eth_mask.dst));
+ rte_memcpy(ð_mask.src, &match->wc.masks.dl_src,
+ sizeof(eth_mask.src));
+ eth_mask.type = match->wc.masks.dl_type;
+
+ add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH,
+ ð_spec, ð_mask);
+ } else {
+ /*
+         * If the user specifies a flow (like a UDP flow) without L2
+         * patterns, OVS will at least set the dl_type. Normally, that
+         * alone is enough to create an eth pattern. Unfortunately, some
+         * Intel NICs (such as the XL710) do not support it, so the
+         * workaround below simply matches any L2 packets.
+ */
+ add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ETH, NULL, NULL);
+ }
+
+ /* VLAN */
+ struct rte_flow_item_vlan vlan_spec;
+ struct rte_flow_item_vlan vlan_mask;
+ memset(&vlan_spec, 0, sizeof(vlan_spec));
+ memset(&vlan_mask, 0, sizeof(vlan_mask));
+ if (match->wc.masks.vlans[0].tci && match->flow.vlans[0].tci) {
+ vlan_spec.tci = match->flow.vlans[0].tci;
+ vlan_mask.tci = match->wc.masks.vlans[0].tci;
+
+        /* Match any protocol. */
+ vlan_mask.tpid = 0;
+
+ add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_VLAN,
+ &vlan_spec, &vlan_mask);
+ }
+
+ /* IP v4 */
+ uint8_t proto = 0;
+ struct rte_flow_item_ipv4 ipv4_spec;
+ struct rte_flow_item_ipv4 ipv4_mask;
+ memset(&ipv4_spec, 0, sizeof(ipv4_spec));
+ memset(&ipv4_mask, 0, sizeof(ipv4_mask));
+    if (match->flow.dl_type == htons(ETH_TYPE_IP) &&
+ (match->wc.masks.nw_src || match->wc.masks.nw_dst ||
+ match->wc.masks.nw_tos || match->wc.masks.nw_ttl ||
+ match->wc.masks.nw_proto)) {
+ ipv4_spec.hdr.type_of_service = match->flow.nw_tos;
+ ipv4_spec.hdr.time_to_live = match->flow.nw_ttl;
+ ipv4_spec.hdr.next_proto_id = match->flow.nw_proto;
+ ipv4_spec.hdr.src_addr = match->flow.nw_src;
+ ipv4_spec.hdr.dst_addr = match->flow.nw_dst;
+
+ ipv4_mask.hdr.type_of_service = match->wc.masks.nw_tos;
+ ipv4_mask.hdr.time_to_live = match->wc.masks.nw_ttl;
+ ipv4_mask.hdr.next_proto_id = match->wc.masks.nw_proto;
+ ipv4_mask.hdr.src_addr = match->wc.masks.nw_src;
+ ipv4_mask.hdr.dst_addr = match->wc.masks.nw_dst;
+
+ add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_IPV4,
+ &ipv4_spec, &ipv4_mask);
+
+ /* Save proto for L4 protocol setup */
+ proto = ipv4_spec.hdr.next_proto_id &
+ ipv4_mask.hdr.next_proto_id;
+
+ /* Remember proto mask address for later modification */
+ ipv4_next_proto_mask = &ipv4_mask.hdr.next_proto_id;
+ }
+
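+    /* An L4 match was requested for a protocol we cannot offload. */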
+ if (proto != IPPROTO_ICMP && proto != IPPROTO_UDP &&
+ proto != IPPROTO_SCTP && proto != IPPROTO_TCP &&
+ (match->wc.masks.tp_src ||
+ match->wc.masks.tp_dst ||
+ match->wc.masks.tcp_flags)) {
+ VLOG_DBG("L4 Protocol (%u) not supported", proto);
+ ret = -1;
+ goto out;
+ }
+
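+    /* Only full L4 port masks are supported for now. */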
+ if ((match->wc.masks.tp_src && match->wc.masks.tp_src != 0xffff) ||
+ (match->wc.masks.tp_dst && match->wc.masks.tp_dst != 0xffff)) {
+ ret = -1;
+ goto out;
+ }
+
+ struct rte_flow_item_tcp tcp_spec;
+ struct rte_flow_item_tcp tcp_mask;
+ memset(&tcp_spec, 0, sizeof(tcp_spec));
+ memset(&tcp_mask, 0, sizeof(tcp_mask));
+ if (proto == IPPROTO_TCP &&
+ (match->wc.masks.tp_src ||
+ match->wc.masks.tp_dst ||
+ match->wc.masks.tcp_flags)) {
+ tcp_spec.hdr.src_port = match->flow.tp_src;
+ tcp_spec.hdr.dst_port = match->flow.tp_dst;
+ tcp_spec.hdr.data_off = ntohs(match->flow.tcp_flags) >> 8;
+ tcp_spec.hdr.tcp_flags = ntohs(match->flow.tcp_flags) & 0xff;
+
+ tcp_mask.hdr.src_port = match->wc.masks.tp_src;
+ tcp_mask.hdr.dst_port = match->wc.masks.tp_dst;
+ tcp_mask.hdr.data_off = ntohs(match->wc.masks.tcp_flags) >> 8;
+ tcp_mask.hdr.tcp_flags = ntohs(match->wc.masks.tcp_flags) & 0xff;
+
+ add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_TCP,
+ &tcp_spec, &tcp_mask);
+
+ /* proto == TCP and ITEM_TYPE_TCP, thus no need for proto match */
+ if (ipv4_next_proto_mask) {
+ *ipv4_next_proto_mask = 0;
+ }
+ goto end_proto_check;
+ }
+
+ struct rte_flow_item_udp udp_spec;
+ struct rte_flow_item_udp udp_mask;
+ memset(&udp_spec, 0, sizeof(udp_spec));
+ memset(&udp_mask, 0, sizeof(udp_mask));
+ if (proto == IPPROTO_UDP &&
+ (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+ udp_spec.hdr.src_port = match->flow.tp_src;
+ udp_spec.hdr.dst_port = match->flow.tp_dst;
+
+ udp_mask.hdr.src_port = match->wc.masks.tp_src;
+ udp_mask.hdr.dst_port = match->wc.masks.tp_dst;
+
+ add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_UDP,
+ &udp_spec, &udp_mask);
+
+ /* proto == UDP and ITEM_TYPE_UDP, thus no need for proto match */
+ if (ipv4_next_proto_mask) {
+ *ipv4_next_proto_mask = 0;
+ }
+ goto end_proto_check;
+ }
+
+ struct rte_flow_item_sctp sctp_spec;
+ struct rte_flow_item_sctp sctp_mask;
+ memset(&sctp_spec, 0, sizeof(sctp_spec));
+ memset(&sctp_mask, 0, sizeof(sctp_mask));
+ if (proto == IPPROTO_SCTP &&
+ (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+ sctp_spec.hdr.src_port = match->flow.tp_src;
+ sctp_spec.hdr.dst_port = match->flow.tp_dst;
+
+ sctp_mask.hdr.src_port = match->wc.masks.tp_src;
+ sctp_mask.hdr.dst_port = match->wc.masks.tp_dst;
+
+ add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_SCTP,
+ &sctp_spec, &sctp_mask);
+
+ /* proto == SCTP and ITEM_TYPE_SCTP, thus no need for proto match */
+ if (ipv4_next_proto_mask) {
+ *ipv4_next_proto_mask = 0;
+ }
+ goto end_proto_check;
+ }
+
+ struct rte_flow_item_icmp icmp_spec;
+ struct rte_flow_item_icmp icmp_mask;
+ memset(&icmp_spec, 0, sizeof(icmp_spec));
+ memset(&icmp_mask, 0, sizeof(icmp_mask));
+ if (proto == IPPROTO_ICMP &&
+ (match->wc.masks.tp_src || match->wc.masks.tp_dst)) {
+ icmp_spec.hdr.icmp_type = (uint8_t)ntohs(match->flow.tp_src);
+ icmp_spec.hdr.icmp_code = (uint8_t)ntohs(match->flow.tp_dst);
+
+ icmp_mask.hdr.icmp_type = (uint8_t)ntohs(match->wc.masks.tp_src);
+ icmp_mask.hdr.icmp_code = (uint8_t)ntohs(match->wc.masks.tp_dst);
+
+ add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_ICMP,
+ &icmp_spec, &icmp_mask);
+
+ /* proto == ICMP and ITEM_TYPE_ICMP, thus no need for proto match */
+ if (ipv4_next_proto_mask) {
+ *ipv4_next_proto_mask = 0;
+ }
+ goto end_proto_check;
+ }
+
+end_proto_check:
+
+ add_flow_pattern(&patterns, RTE_FLOW_ITEM_TYPE_END, NULL, NULL);
+
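+    /*
+     * A MARK action makes the NIC tag matching packets with 'flow_mark',
+     * which the datapath can read back from the received mbuf to identify
+     * the offloaded flow.
+     */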
+ struct rte_flow_action_mark mark;
+ mark.id = info->flow_mark;
+ add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_MARK, &mark);
+
+ struct rte_flow_action_rss *rss;
+ rss = add_flow_rss_action(&actions, netdev);
+ add_flow_action(&actions, RTE_FLOW_ACTION_TYPE_END, NULL);
+
+ flow = rte_flow_create(dev->port_id, &flow_attr, patterns.items,
+ actions.actions, &error);
+ free(rss);
+ if (!flow) {
+ VLOG_ERR("rte flow creat error: %u : message : %s\n",
+ error.type, error.message);
+ ret = -1;
+ goto out;
+ }
+ ufid_to_rte_flow_associate(ufid, flow);
+ VLOG_DBG("installed flow %p by ufid "UUID_FMT"\n",
+ flow, UUID_ARGS((struct uuid *)ufid));
+
+out:
+ free(patterns.items);
+ free(actions.actions);
+ return ret;
+}
+
+static bool
+is_all_zero(const void *addr, size_t n)
+{
+    size_t i = 0;
+    const uint8_t *p = addr;
+
+ for (i = 0; i < n; i++) {
+ if (p[i] != 0) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Check if any unsupported flow patterns are specified.
+ */
+static int
+netdev_dpdk_validate_flow(const struct match *match)
+{
+ struct match match_zero_wc;
+
+    /* Create a wc-zeroed version of the flow. */
+ match_init(&match_zero_wc, &match->flow, &match->wc);
+
+ if (!is_all_zero(&match_zero_wc.flow.tunnel,
+ sizeof(match_zero_wc.flow.tunnel))) {
+ goto err;
+ }
+
+ if (match->wc.masks.metadata ||
+ match->wc.masks.skb_priority ||
+ match->wc.masks.pkt_mark ||
+ match->wc.masks.dp_hash) {
+ goto err;
+ }
+
+ /* recirc id must be zero */
+ if (match_zero_wc.flow.recirc_id) {
+ goto err;
+ }
+
+ if (match->wc.masks.ct_state ||
+ match->wc.masks.ct_nw_proto ||
+ match->wc.masks.ct_zone ||
+ match->wc.masks.ct_mark ||
+ match->wc.masks.ct_label.u64.hi ||
+ match->wc.masks.ct_label.u64.lo) {
+ goto err;
+ }
+
+ if (match->wc.masks.conj_id ||
+ match->wc.masks.actset_output) {
+ goto err;
+ }
+
+ /* unsupported L2 */
+ if (!is_all_zero(&match->wc.masks.mpls_lse,
+ sizeof(match_zero_wc.flow.mpls_lse))) {
+ goto err;
+ }
+
+ /* unsupported L3 */
+ if (match->wc.masks.ipv6_label ||
+ match->wc.masks.ct_nw_src ||
+ match->wc.masks.ct_nw_dst ||
+ !is_all_zero(&match->wc.masks.ipv6_src, sizeof(struct in6_addr)) ||
+ !is_all_zero(&match->wc.masks.ipv6_dst, sizeof(struct in6_addr)) ||
+ !is_all_zero(&match->wc.masks.ct_ipv6_src, sizeof(struct in6_addr)) ||
+ !is_all_zero(&match->wc.masks.ct_ipv6_dst, sizeof(struct in6_addr)) ||
+ !is_all_zero(&match->wc.masks.nd_target, sizeof(struct in6_addr)) ||
+ !is_all_zero(&match->wc.masks.nsh, sizeof(struct ovs_key_nsh)) ||
+ !is_all_zero(&match->wc.masks.arp_sha, sizeof(struct eth_addr)) ||
+ !is_all_zero(&match->wc.masks.arp_tha, sizeof(struct eth_addr))) {
+ goto err;
+ }
+
+ /* If fragmented, then don't HW accelerate - for now */
+ if (match_zero_wc.flow.nw_frag) {
+ goto err;
+ }
+
+ /* unsupported L4 */
+ if (match->wc.masks.igmp_group_ip4 ||
+ match->wc.masks.ct_tp_src ||
+ match->wc.masks.ct_tp_dst) {
+ goto err;
+ }
+
+ return 0;
+
+err:
+ VLOG_ERR("cannot HW accelerate this flow due to unsupported protocols");
+ return -1;
+}
+
+static int
+netdev_dpdk_destroy_rte_flow(struct netdev_dpdk *dev,
+ const ovs_u128 *ufid,
+                             struct rte_flow *rte_flow)
+{
+ struct rte_flow_error error;
+ int ret;
+
+ ret = rte_flow_destroy(dev->port_id, rte_flow, &error);
+ if (ret == 0) {
+ ufid_to_rte_flow_disassociate(ufid);
+ VLOG_DBG("removed rte flow %p associated with ufid " UUID_FMT "\n",
+ rte_flow, UUID_ARGS((struct uuid *)ufid));
+ } else {
+ VLOG_ERR("rte flow destroy error: %u : message : %s\n",
+ error.type, error.message);
+ }
+
+ return ret;
+}
+
+static int
+netdev_dpdk_flow_put(struct netdev *netdev, struct match *match,
+ struct nlattr *actions, size_t actions_len,
+ const ovs_u128 *ufid, struct offload_info *info,
+                     struct dpif_flow_stats *stats OVS_UNUSED)
+{
+ struct rte_flow *rte_flow;
+ int ret;
+
+ /*
+     * If an old rte_flow exists, it means this is a flow modification, so
+     * destroy the old rte flow first, then add the new one.
+ */
+ rte_flow = ufid_to_rte_flow_find(ufid);
+ if (rte_flow) {
+ ret = netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
+ ufid, rte_flow);
+ if (ret < 0) {
+ return ret;
+ }
+ }
+
+ ret = netdev_dpdk_validate_flow(match);
+ if (ret < 0) {
+ return ret;
+ }
+
+    return netdev_dpdk_add_rte_flow_offload(netdev, match, actions,
+                                            actions_len, ufid, info);
+}
+
+static int
+netdev_dpdk_flow_del(struct netdev *netdev, const ovs_u128 *ufid,
+                     struct dpif_flow_stats *stats OVS_UNUSED)
+{
+ struct rte_flow *rte_flow = ufid_to_rte_flow_find(ufid);
+
+ if (!rte_flow) {
+ return -1;
+ }
+
+    return netdev_dpdk_destroy_rte_flow(netdev_dpdk_cast(netdev),
+                                        ufid, rte_flow);
+}
+
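+/*
+ * Fills the flow offload slots of the netdev class in declaration order,
+ * mirroring NO_OFFLOAD_API; only flow_put and flow_del are implemented.
+ */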
+#define DPDK_FLOW_OFFLOAD_API \
+ NULL, /* flow_flush */ \
+ NULL, /* flow_dump_create */ \
+ NULL, /* flow_dump_destroy */ \
+ NULL, /* flow_dump_next */ \
+ netdev_dpdk_flow_put, \
+ NULL, /* flow_get */ \
+ netdev_dpdk_flow_del, \
+ NULL /* init_flow_api */
+
#define NETDEV_DPDK_CLASS(NAME, INIT, CONSTRUCT, DESTRUCT, \
SET_CONFIG, SET_TX_MULTIQ, SEND, \
GET_CARRIER, GET_STATS, \
@@ -3927,7 +4488,7 @@ unlock:
RXQ_RECV, \
NULL, /* rx_wait */ \
NULL, /* rxq_drain */ \
- NO_OFFLOAD_API \
+ DPDK_FLOW_OFFLOAD_API \
}
static const struct netdev_class dpdk_class =