[ovs-dev,v4,2/5] dpif-netdev: retrieve flow directly from the flow mark

Message ID 1511768584-19167-3-git-send-email-yliu@fridaylinux.org
State New
Delegated to: Ian Stokes
Headers show
Series
  • OVS-DPDK flow offload with rte_flow
Related show

Commit Message

Yuanhan Liu Nov. 27, 2017, 7:43 a.m.
So that we could skip some very costly CPU operations, including but
not limiting to miniflow_extract, emc lookup, dpcls lookup, etc. Thus,
performance could be greatly improved.

A PHY-PHY forwarding with 1000 mega flows (udp,tp_src=1000-1999) and
1 million streams (tp_src=1000-1999, tp_dst=2000-2999) show more that
260% performance boost.

Note that though the heavy miniflow_extract is skipped, we still have
to do per packet checking, due to we have to check the tcp_flags.

Co-authored-by: Finn Christensen <fc@napatech.com>
Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
Signed-off-by: Finn Christensen <fc@napatech.com>
---
 lib/dp-packet.h   |  13 +++++
 lib/dpif-netdev.c |  42 ++++++++++++---
 lib/flow.c        | 155 +++++++++++++++++++++++++++++++++++++++++++-----------
 lib/flow.h        |   1 +
 4 files changed, 173 insertions(+), 38 deletions(-)

Patch

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index b4b721c..ea1194c 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -691,6 +691,19 @@  reset_dp_packet_checksum_ol_flags(struct dp_packet *p)
 #define reset_dp_packet_checksum_ol_flags(arg)
 #endif
 
+static inline bool
+dp_packet_has_flow_mark(struct dp_packet *p OVS_UNUSED,
+                        uint32_t *mark OVS_UNUSED)
+{
+#ifdef DPDK_NETDEV
+    if (p->mbuf.ol_flags & PKT_RX_FDIR_ID) {
+        *mark = p->mbuf.hash.fdir.hi;
+        return true;
+    }
+#endif
+    return false;
+}
+
 enum { NETDEV_MAX_BURST = 32 }; /* Maximum number packets in a batch. */
 
 struct dp_packet_batch {
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 8579474..e129fa3 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -2027,6 +2027,21 @@  flow_mark_flush(struct dp_netdev_pmd_thread *pmd)
     }
 }
 
+static struct dp_netdev_flow *
+mark_to_flow_find(const struct dp_netdev_pmd_thread *pmd,
+                  const uint32_t mark)
+{
+    struct dp_netdev_flow *flow;
+
+    CMAP_FOR_EACH_WITH_HASH (flow, mark_node, mark, &flow_mark.mark_to_flow) {
+        if (flow->mark == mark && flow->pmd_id == pmd->core_id) {
+            return flow;
+        }
+    }
+
+    return NULL;
+}
+
 static void
 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
                           struct dp_netdev_flow *flow)
@@ -5116,10 +5131,10 @@  struct packet_batch_per_flow {
 static inline void
 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
                              struct dp_packet *packet,
-                             const struct miniflow *mf)
+                             uint16_t tcp_flags)
 {
     batch->byte_count += dp_packet_size(packet);
-    batch->tcp_flags |= miniflow_get_tcp_flags(mf);
+    batch->tcp_flags |= tcp_flags;
     batch->array.packets[batch->array.count++] = packet;
 }
 
@@ -5154,7 +5169,7 @@  packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
 
 static inline void
 dp_netdev_queue_batches(struct dp_packet *pkt,
-                        struct dp_netdev_flow *flow, const struct miniflow *mf,
+                        struct dp_netdev_flow *flow, uint16_t tcp_flags,
                         struct packet_batch_per_flow *batches,
                         size_t *n_batches)
 {
@@ -5165,7 +5180,7 @@  dp_netdev_queue_batches(struct dp_packet *pkt,
         packet_batch_per_flow_init(batch, flow);
     }
 
-    packet_batch_per_flow_update(batch, pkt, mf);
+    packet_batch_per_flow_update(batch, pkt, tcp_flags);
 }
 
 /* Try to process all ('cnt') the 'packets' using only the exact match cache
@@ -5196,11 +5211,13 @@  emc_processing(struct dp_netdev_pmd_thread *pmd,
     const size_t cnt = dp_packet_batch_size(packets_);
     uint32_t cur_min;
     int i;
+    uint16_t tcp_flags;
 
     atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min);
 
     DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
         struct dp_netdev_flow *flow;
+        uint32_t flow_mark;
 
         if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
             dp_packet_delete(packet);
@@ -5208,6 +5225,16 @@  emc_processing(struct dp_netdev_pmd_thread *pmd,
             continue;
         }
 
+        if (dp_packet_has_flow_mark(packet, &flow_mark)) {
+            flow = mark_to_flow_find(pmd, flow_mark);
+            if (flow) {
+                tcp_flags = parse_tcp_flags(packet);
+                dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
+                                        n_batches);
+                continue;
+            }
+        }
+
         if (i != cnt - 1) {
             struct dp_packet **packets = packets_->packets;
             /* Prefetch next packet data and metadata. */
@@ -5233,7 +5260,8 @@  emc_processing(struct dp_netdev_pmd_thread *pmd,
             flow = NULL;
         }
         if (OVS_LIKELY(flow)) {
-            dp_netdev_queue_batches(packet, flow, &key->mf, batches,
+            tcp_flags = miniflow_get_tcp_flags(&key->mf);
+            dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
                                     n_batches);
         } else {
             /* Exact match cache missed. Group missed packets together at
@@ -5409,7 +5437,9 @@  fast_path_processing(struct dp_netdev_pmd_thread *pmd,
         flow = dp_netdev_flow_cast(rules[i]);
 
         emc_probabilistic_insert(pmd, &keys[i], flow);
-        dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
+        dp_netdev_queue_batches(packet, flow,
+                                miniflow_get_tcp_flags(&keys[i].mf),
+                                batches, n_batches);
     }
 
     dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
diff --git a/lib/flow.c b/lib/flow.c
index 1adc499..8d281a1 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -625,6 +625,70 @@  flow_extract(struct dp_packet *packet, struct flow *flow)
     miniflow_expand(&m.mf, flow);
 }
 
+static inline bool
+ipv4_sanity_check(const struct ip_header *nh, size_t size,
+                  int *ip_lenp, uint16_t *tot_lenp)
+{
+    int ip_len;
+    uint16_t tot_len;
+
+    if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
+        return false;
+    }
+    ip_len = IP_IHL(nh->ip_ihl_ver) * 4;
+
+    if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN || size < ip_len)) {
+        return false;
+    }
+
+    tot_len = ntohs(nh->ip_tot_len);
+    if (OVS_UNLIKELY(tot_len > size || ip_len > tot_len ||
+                size - tot_len > UINT8_MAX)) {
+        return false;
+    }
+
+    *ip_lenp = ip_len;
+    *tot_lenp = tot_len;
+
+    return true;
+}
+
+static inline uint8_t
+ipv4_get_nw_frag(const struct ip_header *nh)
+{
+    uint8_t nw_frag = 0;
+
+    if (OVS_UNLIKELY(IP_IS_FRAGMENT(nh->ip_frag_off))) {
+        nw_frag = FLOW_NW_FRAG_ANY;
+        if (nh->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
+            nw_frag |= FLOW_NW_FRAG_LATER;
+        }
+    }
+
+    return nw_frag;
+}
+
+static inline bool
+ipv6_sanity_check(const struct ovs_16aligned_ip6_hdr *nh, size_t size)
+{
+    uint16_t plen;
+
+    if (OVS_UNLIKELY(size < sizeof *nh)) {
+        return false;
+    }
+
+    plen = ntohs(nh->ip6_plen);
+    if (OVS_UNLIKELY(plen > size)) {
+        return false;
+    }
+    /* Jumbo Payload option not supported yet. */
+    if (OVS_UNLIKELY(size - plen > UINT8_MAX)) {
+        return false;
+    }
+
+    return true;
+}
+
 /* Caller is responsible for initializing 'dst' with enough storage for
  * FLOW_U64S * 8 bytes. */
 void
@@ -749,22 +813,7 @@  miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
         int ip_len;
         uint16_t tot_len;
 
-        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
-            goto out;
-        }
-        ip_len = IP_IHL(nh->ip_ihl_ver) * 4;
-
-        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
-            goto out;
-        }
-        if (OVS_UNLIKELY(size < ip_len)) {
-            goto out;
-        }
-        tot_len = ntohs(nh->ip_tot_len);
-        if (OVS_UNLIKELY(tot_len > size || ip_len > tot_len)) {
-            goto out;
-        }
-        if (OVS_UNLIKELY(size - tot_len > UINT8_MAX)) {
+        if (OVS_UNLIKELY(!ipv4_sanity_check(nh, size, &ip_len, &tot_len))) {
             goto out;
         }
         dp_packet_set_l2_pad_size(packet, size - tot_len);
@@ -787,31 +836,19 @@  miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
         nw_tos = nh->ip_tos;
         nw_ttl = nh->ip_ttl;
         nw_proto = nh->ip_proto;
-        if (OVS_UNLIKELY(IP_IS_FRAGMENT(nh->ip_frag_off))) {
-            nw_frag = FLOW_NW_FRAG_ANY;
-            if (nh->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
-                nw_frag |= FLOW_NW_FRAG_LATER;
-            }
-        }
+        nw_frag = ipv4_get_nw_frag(nh);
         data_pull(&data, &size, ip_len);
     } else if (dl_type == htons(ETH_TYPE_IPV6)) {
-        const struct ovs_16aligned_ip6_hdr *nh;
+        const struct ovs_16aligned_ip6_hdr *nh = data;
         ovs_be32 tc_flow;
         uint16_t plen;
 
-        if (OVS_UNLIKELY(size < sizeof *nh)) {
+        if (OVS_UNLIKELY(!ipv6_sanity_check(nh, size))) {
             goto out;
         }
-        nh = data_pull(&data, &size, sizeof *nh);
+        data_pull(&data, &size, sizeof *nh);
 
         plen = ntohs(nh->ip6_plen);
-        if (OVS_UNLIKELY(plen > size)) {
-            goto out;
-        }
-        /* Jumbo Payload option not supported yet. */
-        if (OVS_UNLIKELY(size - plen > UINT8_MAX)) {
-            goto out;
-        }
         dp_packet_set_l2_pad_size(packet, size - plen);
         size = plen;   /* Never pull padding. */
 
@@ -990,6 +1027,60 @@  parse_dl_type(const struct eth_header *data_, size_t size)
     return parse_ethertype(&data, &size);
 }
 
+uint16_t
+parse_tcp_flags(struct dp_packet *packet)
+{
+    const void *data = dp_packet_data(packet);
+    size_t size = dp_packet_size(packet);
+    ovs_be16 dl_type;
+    uint8_t nw_frag = 0, nw_proto = 0;
+
+    if (packet->packet_type != htonl(PT_ETH)) {
+        return 0;
+    }
+
+    data_pull(&data, &size, ETH_ADDR_LEN * 2);
+    dl_type = parse_ethertype(&data, &size);
+    if (OVS_LIKELY(dl_type == htons(ETH_TYPE_IP))) {
+        const struct ip_header *nh = data;
+        int ip_len;
+        uint16_t tot_len;
+
+        if (OVS_UNLIKELY(!ipv4_sanity_check(nh, size, &ip_len, &tot_len))) {
+            return 0;
+        }
+        nw_proto = nh->ip_proto;
+        nw_frag = ipv4_get_nw_frag(nh);
+
+        size = tot_len;   /* Never pull padding. */
+        data_pull(&data, &size, ip_len);
+    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
+        const struct ovs_16aligned_ip6_hdr *nh = data;
+
+        if (OVS_UNLIKELY(!ipv6_sanity_check(nh, size))) {
+            return 0;
+        }
+        data_pull(&data, &size, sizeof *nh);
+
+        size = ntohs(nh->ip6_plen); /* Never pull padding. */
+        if (!parse_ipv6_ext_hdrs__(&data, &size, &nw_proto, &nw_frag)) {
+            return 0;
+        }
+        nw_proto = nh->ip6_nxt;
+    } else {
+        return 0;
+    }
+
+    if (!(nw_frag & FLOW_NW_FRAG_LATER) && nw_proto == IPPROTO_TCP &&
+        size >= TCP_HEADER_LEN) {
+        const struct tcp_header *tcp = data;
+
+        return TCP_FLAGS_BE32(tcp->tcp_ctl);
+    }
+
+    return 0;
+}
+
 /* For every bit of a field that is wildcarded in 'wildcards', sets the
  * corresponding bit in 'flow' to zero. */
 void
diff --git a/lib/flow.h b/lib/flow.h
index b3128da..6537094 100644
--- a/lib/flow.h
+++ b/lib/flow.h
@@ -130,6 +130,7 @@  bool parse_ipv6_ext_hdrs(const void **datap, size_t *sizep, uint8_t *nw_proto,
                          uint8_t *nw_frag);
 ovs_be16 parse_dl_type(const struct eth_header *data_, size_t size);
 bool parse_nsh(const void **datap, size_t *sizep, struct flow_nsh *key);
+uint16_t parse_tcp_flags(struct dp_packet *packet);
 
 static inline uint64_t
 flow_get_xreg(const struct flow *flow, int idx)