[ovs-dev,v3,2/9] dpif-netdev: retrieve flow directly from the flow mark

Message ID: 1506404199-23579-3-git-send-email-yliu@fridaylinux.org
State: Changes Requested
Series: OVS-DPDK flow offload with rte_flow

Commit Message

Yuanhan Liu Sept. 26, 2017, 5:36 a.m. UTC
Retrieving the flow directly from the flow mark lets us skip the
heavy EMC processing, notably the miniflow_extract function. A
simple PHY-PHY forwarding test (with one core, one queue and one
flow) shows about 70% performance improvement.

The retrieval is done in the datapath, so it has to be lightweight.
A pthread lock is too heavy for that, so RCU is chosen instead.

The major race issue is that the flows array might be resized,
i.e., its address might change. For example, assume the flows
array has 4 entries and is then resized to 8. A thread might then
reference the 6th entry through the old array pointer, reading
past its end.

RCU makes the update of the flows pointer atomic, and it also
provides a memory barrier. Setting the new array size only after
the ovsrcu_set() makes sure the issue above cannot happen.
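
For reference, a resize then follows this pattern (a condensed
sketch of dp_netdev_alloc_flow_mark() from the patch below; the
writer holds flow_mark_map.mutex):

    new_flows = xmalloc(2 * flow_mark_map.max * sizeof *new_flows);
    memcpy(new_flows, flows, flow_mark_map.max * sizeof *new_flows);
    memset(&new_flows[flow_mark_map.max], 0,
           flow_mark_map.max * sizeof *new_flows);
    ovsrcu_set(&flow_mark_map.flows, new_flows); /* atomic, with barrier */
    flow_mark_map.max *= 2;  /* only now can readers see the new max */
    ovsrcu_synchronize();    /* wait out readers of the old array */
    free(flows);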

Note that although the heavy miniflow_extract() is skipped, we
still have to do some per-packet parsing, because we need the
current tcp_flags.
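
On the datapath side, the per-packet work then reduces to the
following (a condensed view of the emc_processing() hunk below):

    uint32_t mark;

    if (dp_packet_has_flow_mark(packet, &mark)) {
        struct dp_netdev_flow *flow = dp_netdev_pmd_find_flow_by_mark(mark);

        if (flow) {
            /* No miniflow_extract(), no EMC lookup; just a light
             * header walk to pick up the current TCP flags. */
            uint16_t tcp_flags = parse_tcp_flags(packet);

            dp_netdev_queue_batches(packet, flow, tcp_flags,
                                    batches, n_batches);
            continue;
        }
    }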

Co-authored-by: Finn Christensen <fc@napatech.com>
Signed-off-by: Yuanhan Liu <yliu@fridaylinux.org>
Signed-off-by: Finn Christensen <fc@napatech.com>
---

v3: - replace CMAP with array, which further improves the
      performance gain from 50% to 70%

    - introduce some common helper functions for parse_tcp_flags

v2: update tcp_flags, which also fixes the build warnings
---
 lib/dp-packet.h   |  13 +++++
 lib/dpif-netdev.c |  91 +++++++++++++++++++++++++-------
 lib/flow.c        | 155 +++++++++++++++++++++++++++++++++++++++++++-----------
 lib/flow.h        |   1 +
 4 files changed, 209 insertions(+), 51 deletions(-)

Patch

diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index 046f3ab..a7a062f 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -691,6 +691,19 @@  reset_dp_packet_checksum_ol_flags(struct dp_packet *p)
 #define reset_dp_packet_checksum_ol_flags(arg)
 #endif
 
+static inline bool
+dp_packet_has_flow_mark(struct dp_packet *p OVS_UNUSED,
+                        uint32_t *mark OVS_UNUSED)
+{
+#ifdef DPDK_NETDEV
+    if (p->mbuf.ol_flags & PKT_RX_FDIR_ID) {
+        *mark = p->mbuf.hash.fdir.hi;
+        return true;
+    }
+#endif
+    return false;
+}
+
 enum { NETDEV_MAX_BURST = 32 }; /* Maximum number packets in a batch. */
 
 struct dp_packet_batch {
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index f417b25..5c33c74 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -1845,7 +1845,7 @@  struct flow_mark_map {
     size_t max;
     struct ovs_mutex mutex;
     struct id_pool *mark_pool;
-    struct dp_netdev_flow **flows;
+    OVSRCU_TYPE(struct dp_netdev_flow **) flows;
 };
 
 static struct flow_mark_map flow_mark_map = {
@@ -1857,16 +1857,20 @@  static bool
 dp_netdev_alloc_flow_mark(uint32_t *mark)
 {
     bool succeed = true;
+    struct dp_netdev_flow **flows;
 
     ovs_mutex_lock(&flow_mark_map.mutex);
 
     /* Haven't initiated yet, do it here */
-    if (!flow_mark_map.flows) {
-        flow_mark_map.flows = xzalloc(sizeof(struct dp_netdev_flow *) *
-                                      flow_mark_map.max);
+    if (!flow_mark_map.mark_pool) {
         flow_mark_map.mark_pool = id_pool_create(0, UINT32_MAX / 2);
+
+        flows = xzalloc(sizeof(struct dp_netdev_flow *) * flow_mark_map.max);
+        ovsrcu_set(&flow_mark_map.flows, flows);
     }
 
+    flows = ovsrcu_get_protected(struct dp_netdev_flow **,
+                                 &flow_mark_map.flows);
     do {
         if (!id_pool_alloc_id(flow_mark_map.mark_pool, mark)) {
             succeed = false;
@@ -1874,16 +1878,28 @@  dp_netdev_alloc_flow_mark(uint32_t *mark)
         }
 
         if (*mark >= flow_mark_map.max) {
-            flow_mark_map.flows = xrealloc(flow_mark_map.flows,
-                                           flow_mark_map.max * 2 *
-                                           sizeof(struct dp_netdev_flow *));
-            memset(&flow_mark_map.flows[flow_mark_map.max], 0,
-                   flow_mark_map.max * sizeof(struct dp_netdev_flow *));
+            struct dp_netdev_flow **new_flows;
+
+            new_flows = xmalloc(2 * flow_mark_map.max *
+                                sizeof(struct dp_netdev_flow *));
+            memcpy(new_flows, flows, sizeof(struct dp_netdev_flow *) *
+                                     flow_mark_map.max);
+            memset(&new_flows[flow_mark_map.max], 0,
+                   sizeof(struct dp_netdev_flow *) * flow_mark_map.max);
+            ovsrcu_set(&flow_mark_map.flows, new_flows);
+            /*
+             * Set the new size after the ovsrcu_set() above, which
+             * makes sure a thread still referencing the old array
+             * will not go beyond the old max.
+             */
             flow_mark_map.max *= 2;
 
+            ovsrcu_synchronize();
+            free(flows);
+            flows = new_flows;
             break;
         }
-    } while (flow_mark_map.flows[*mark]);
+    } while (flows[*mark]);
 
     ovs_mutex_unlock(&flow_mark_map.mutex);
 
@@ -1894,9 +1910,13 @@  static void
 dp_netdev_install_flow_mark_map(const uint32_t mark,
                                 struct dp_netdev_flow *netdev_flow)
 {
+    struct dp_netdev_flow **flows;
+
     ovs_mutex_lock(&flow_mark_map.mutex);
-    if (mark < flow_mark_map.max) {
-        flow_mark_map.flows[mark] = netdev_flow;
+    if (OVS_LIKELY(mark < flow_mark_map.max)) {
+        flows = ovsrcu_get_protected(struct dp_netdev_flow **,
+                                     &flow_mark_map.flows);
+        flows[mark] = netdev_flow;
     }
     ovs_mutex_unlock(&flow_mark_map.mutex);
 }
@@ -1904,14 +1924,32 @@  dp_netdev_install_flow_mark_map(const uint32_t mark,
 static void
 dp_netdev_remove_flow_mark_map(const uint32_t mark)
 {
+    struct dp_netdev_flow **flows;
+
     ovs_mutex_lock(&flow_mark_map.mutex);
     id_pool_free_id(flow_mark_map.mark_pool, mark);
     if (mark < flow_mark_map.max) {
-        flow_mark_map.flows[mark] = NULL;
+        flows = ovsrcu_get_protected(struct dp_netdev_flow **,
+                                     &flow_mark_map.flows);
+        flows[mark] = NULL;
     }
     ovs_mutex_unlock(&flow_mark_map.mutex);
 }
 
+static struct dp_netdev_flow *
+dp_netdev_pmd_find_flow_by_mark(const uint32_t mark)
+{
+    struct dp_netdev_flow **flows;
+
+    if (OVS_LIKELY(mark < flow_mark_map.max)) {
+        flows = ovsrcu_get(struct dp_netdev_flow **, &flow_mark_map.flows);
+        return flows[mark];
+    }
+
+    return NULL;
+}
+
+
 static void
 dp_netdev_pmd_remove_flow(struct dp_netdev_pmd_thread *pmd,
                           struct dp_netdev_flow *flow)
@@ -4938,10 +4976,10 @@  struct packet_batch_per_flow {
 static inline void
 packet_batch_per_flow_update(struct packet_batch_per_flow *batch,
                              struct dp_packet *packet,
-                             const struct miniflow *mf)
+                             uint16_t tcp_flags)
 {
     batch->byte_count += dp_packet_size(packet);
-    batch->tcp_flags |= miniflow_get_tcp_flags(mf);
+    batch->tcp_flags |= tcp_flags;
     batch->array.packets[batch->array.count++] = packet;
 }
 
@@ -4976,7 +5014,7 @@  packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
 
 static inline void
 dp_netdev_queue_batches(struct dp_packet *pkt,
-                        struct dp_netdev_flow *flow, const struct miniflow *mf,
+                        struct dp_netdev_flow *flow, uint16_t tcp_flags,
                         struct packet_batch_per_flow *batches,
                         size_t *n_batches)
 {
@@ -4987,7 +5025,7 @@  dp_netdev_queue_batches(struct dp_packet *pkt,
         packet_batch_per_flow_init(batch, flow);
     }
 
-    packet_batch_per_flow_update(batch, pkt, mf);
+    packet_batch_per_flow_update(batch, pkt, tcp_flags);
 }
 
 /* Try to process all ('cnt') the 'packets' using only the exact match cache
@@ -5015,11 +5053,13 @@  emc_processing(struct dp_netdev_pmd_thread *pmd,
     const size_t size = dp_packet_batch_size(packets_);
     uint32_t cur_min;
     int i;
+    uint16_t tcp_flags;
 
     atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min);
 
     DP_PACKET_BATCH_REFILL_FOR_EACH (i, size, packet, packets_) {
         struct dp_netdev_flow *flow;
+        uint32_t flow_mark;
 
         if (OVS_UNLIKELY(dp_packet_size(packet) < ETH_HEADER_LEN)) {
             dp_packet_delete(packet);
@@ -5027,6 +5067,16 @@  emc_processing(struct dp_netdev_pmd_thread *pmd,
             continue;
         }
 
+        if (dp_packet_has_flow_mark(packet, &flow_mark)) {
+            flow = dp_netdev_pmd_find_flow_by_mark(flow_mark);
+            if (flow) {
+                tcp_flags = parse_tcp_flags(packet);
+                dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
+                                        n_batches);
+                continue;
+            }
+        }
+
         if (i != size - 1) {
             struct dp_packet **packets = packets_->packets;
             /* Prefetch next packet data and metadata. */
@@ -5044,7 +5094,8 @@  emc_processing(struct dp_netdev_pmd_thread *pmd,
         /* If EMC is disabled skip emc_lookup */
         flow = (cur_min == 0) ? NULL: emc_lookup(flow_cache, key);
         if (OVS_LIKELY(flow)) {
-            dp_netdev_queue_batches(packet, flow, &key->mf, batches,
+            tcp_flags = miniflow_get_tcp_flags(&key->mf);
+            dp_netdev_queue_batches(packet, flow, tcp_flags, batches,
                                     n_batches);
         } else {
             /* Exact match cache missed. Group missed packets together at
@@ -5221,7 +5272,9 @@  fast_path_processing(struct dp_netdev_pmd_thread *pmd,
         flow = dp_netdev_flow_cast(rules[i]);
 
         emc_probabilistic_insert(pmd, &keys[i], flow);
-        dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
+        dp_netdev_queue_batches(packet, flow,
+                                miniflow_get_tcp_flags(&keys[i].mf),
+                                batches, n_batches);
     }
 
     dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
diff --git a/lib/flow.c b/lib/flow.c
index b2b10aa..df2b879 100644
--- a/lib/flow.c
+++ b/lib/flow.c
@@ -626,6 +626,70 @@  flow_extract(struct dp_packet *packet, struct flow *flow)
     miniflow_expand(&m.mf, flow);
 }
 
+static inline bool
+ipv4_sanity_check(const struct ip_header *nh, size_t size,
+                  int *ip_lenp, uint16_t *tot_lenp)
+{
+    int ip_len;
+    uint16_t tot_len;
+
+    if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
+        return false;
+    }
+    ip_len = IP_IHL(nh->ip_ihl_ver) * 4;
+
+    if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN || size < ip_len)) {
+        return false;
+    }
+
+    tot_len = ntohs(nh->ip_tot_len);
+    if (OVS_UNLIKELY(tot_len > size || ip_len > tot_len ||
+                     size - tot_len > UINT8_MAX)) {
+        return false;
+    }
+
+    *ip_lenp = ip_len;
+    *tot_lenp = tot_len;
+
+    return true;
+}
+
+static inline uint8_t
+ipv4_get_nw_frag(const struct ip_header *nh)
+{
+    uint8_t nw_frag = 0;
+
+    if (OVS_UNLIKELY(IP_IS_FRAGMENT(nh->ip_frag_off))) {
+        nw_frag = FLOW_NW_FRAG_ANY;
+        if (nh->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
+            nw_frag |= FLOW_NW_FRAG_LATER;
+        }
+    }
+
+    return nw_frag;
+}
+
+static inline bool
+ipv6_sanity_check(const struct ovs_16aligned_ip6_hdr *nh, size_t size)
+{
+    uint16_t plen;
+
+    if (OVS_UNLIKELY(size < sizeof *nh)) {
+        return false;
+    }
+
+    plen = ntohs(nh->ip6_plen);
+    if (OVS_UNLIKELY(plen > size)) {
+        return false;
+    }
+    /* Jumbo Payload option not supported yet. */
+    if (OVS_UNLIKELY(size - plen > UINT8_MAX)) {
+        return false;
+    }
+
+    return true;
+}
+
 /* Caller is responsible for initializing 'dst' with enough storage for
  * FLOW_U64S * 8 bytes. */
 void
@@ -750,22 +814,7 @@  miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
         int ip_len;
         uint16_t tot_len;
 
-        if (OVS_UNLIKELY(size < IP_HEADER_LEN)) {
-            goto out;
-        }
-        ip_len = IP_IHL(nh->ip_ihl_ver) * 4;
-
-        if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) {
-            goto out;
-        }
-        if (OVS_UNLIKELY(size < ip_len)) {
-            goto out;
-        }
-        tot_len = ntohs(nh->ip_tot_len);
-        if (OVS_UNLIKELY(tot_len > size || ip_len > tot_len)) {
-            goto out;
-        }
-        if (OVS_UNLIKELY(size - tot_len > UINT8_MAX)) {
+        if (OVS_UNLIKELY(!ipv4_sanity_check(nh, size, &ip_len, &tot_len))) {
             goto out;
         }
         dp_packet_set_l2_pad_size(packet, size - tot_len);
@@ -788,31 +837,19 @@  miniflow_extract(struct dp_packet *packet, struct miniflow *dst)
         nw_tos = nh->ip_tos;
         nw_ttl = nh->ip_ttl;
         nw_proto = nh->ip_proto;
-        if (OVS_UNLIKELY(IP_IS_FRAGMENT(nh->ip_frag_off))) {
-            nw_frag = FLOW_NW_FRAG_ANY;
-            if (nh->ip_frag_off & htons(IP_FRAG_OFF_MASK)) {
-                nw_frag |= FLOW_NW_FRAG_LATER;
-            }
-        }
+        nw_frag = ipv4_get_nw_frag(nh);
         data_pull(&data, &size, ip_len);
     } else if (dl_type == htons(ETH_TYPE_IPV6)) {
-        const struct ovs_16aligned_ip6_hdr *nh;
+        const struct ovs_16aligned_ip6_hdr *nh = data;
         ovs_be32 tc_flow;
         uint16_t plen;
 
-        if (OVS_UNLIKELY(size < sizeof *nh)) {
+        if (OVS_UNLIKELY(!ipv6_sanity_check(nh, size))) {
             goto out;
         }
-        nh = data_pull(&data, &size, sizeof *nh);
+        data_pull(&data, &size, sizeof *nh);
 
         plen = ntohs(nh->ip6_plen);
-        if (OVS_UNLIKELY(plen > size)) {
-            goto out;
-        }
-        /* Jumbo Payload option not supported yet. */
-        if (OVS_UNLIKELY(size - plen > UINT8_MAX)) {
-            goto out;
-        }
         dp_packet_set_l2_pad_size(packet, size - plen);
         size = plen;   /* Never pull padding. */
 
@@ -991,6 +1028,60 @@  parse_dl_type(const struct eth_header *data_, size_t size)
     return parse_ethertype(&data, &size);
 }
 
+uint16_t
+parse_tcp_flags(struct dp_packet *packet)
+{
+    const void *data = dp_packet_data(packet);
+    size_t size = dp_packet_size(packet);
+    ovs_be16 dl_type;
+    uint8_t nw_frag = 0, nw_proto = 0;
+
+    if (packet->packet_type != htonl(PT_ETH)) {
+        return 0;
+    }
+
+    data_pull(&data, &size, ETH_ADDR_LEN * 2);
+    dl_type = parse_ethertype(&data, &size);
+    if (OVS_LIKELY(dl_type == htons(ETH_TYPE_IP))) {
+        const struct ip_header *nh = data;
+        int ip_len;
+        uint16_t tot_len;
+
+        if (OVS_UNLIKELY(!ipv4_sanity_check(nh, size, &ip_len, &tot_len))) {
+            return 0;
+        }
+        nw_proto = nh->ip_proto;
+        nw_frag = ipv4_get_nw_frag(nh);
+
+        size = tot_len;   /* Never pull padding. */
+        data_pull(&data, &size, ip_len);
+    } else if (dl_type == htons(ETH_TYPE_IPV6)) {
+        const struct ovs_16aligned_ip6_hdr *nh = data;
+
+        if (OVS_UNLIKELY(!ipv6_sanity_check(nh, size))) {
+            return 0;
+        }
+        data_pull(&data, &size, sizeof *nh);
+
+        size = ntohs(nh->ip6_plen); /* Never pull padding. */
+        nw_proto = nh->ip6_nxt;
+        if (!parse_ipv6_ext_hdrs__(&data, &size, &nw_proto, &nw_frag)) {
+            return 0;
+        }
+    } else {
+        return 0;
+    }
+
+    if (!(nw_frag & FLOW_NW_FRAG_LATER) && nw_proto == IPPROTO_TCP &&
+        size >= TCP_HEADER_LEN) {
+        const struct tcp_header *tcp = data;
+
+        return TCP_FLAGS_BE32(tcp->tcp_ctl);
+    }
+
+    return 0;
+}
+
 /* For every bit of a field that is wildcarded in 'wildcards', sets the
  * corresponding bit in 'flow' to zero. */
 void
diff --git a/lib/flow.h b/lib/flow.h
index 6ae5a67..f113ec4 100644
--- a/lib/flow.h
+++ b/lib/flow.h
@@ -130,6 +130,7 @@  bool parse_ipv6_ext_hdrs(const void **datap, size_t *sizep, uint8_t *nw_proto,
                          uint8_t *nw_frag);
 ovs_be16 parse_dl_type(const struct eth_header *data_, size_t size);
 bool parse_nsh(const void **datap, size_t *sizep, struct flow_nsh *key);
+uint16_t parse_tcp_flags(struct dp_packet *packet);
 
 static inline uint64_t
 flow_get_xreg(const struct flow *flow, int idx)