diff mbox series

[ovs-dev,v3,4/4] dpif-netdev: optimized dpcls_rule_matches_key()

Message ID 20181121014000.39339-5-harry.van.haaren@intel.com
State Changes Requested
Delegated to: Ian Stokes
Headers show
Series dpcls subtable miniflow optimizations | expand

Commit Message

Van Haaren, Harry Nov. 21, 2018, 1:40 a.m. UTC
This commit optimizes the dpcls_rule_matches_key() function
by refactoring the code to be shorter and less branchy. The
main code-change (and optimization) is to use popcount and
mask to calculate the packet block index.

This code is split out to the header file, which is marked as
static inline. The code itself is refactored into two functions,
one to iterate the "units" of the miniflow, the other to compare
the actual miniflow "blocks" and compare them with the mask/value.

The verify_unit() function is called twice, and should/will be inlined
by the compiler in optimized builds. As a result the while() loop
inside is hence present in two locations in the caller function.
This gives each while() loop its own entry in the branch-predictor,
helping it to correctly predict each units loop iterations.

Finally, we always compute all blocks, and then return match or
not-matching in a branch-free manner. This allows the pipeline to
execute better, by removing non-predictable branches.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

This patch optimizes rule_verify() signficantly, please benchmark
before and after this patch and see what gains your workload sees.

v3:
- Fix "last minute v2 variable rename fix" which broke the build :/

---
 lib/dpif-netdev.c | 18 ---------------
 lib/dpif-netdev.h | 56 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 54 insertions(+), 20 deletions(-)
diff mbox series

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 8d02cdf62..12bda05d7 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -7207,24 +7207,6 @@  dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
     }
 }
 
-/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
- * in 'mask' the values in 'key' and 'target' are the same. */
-bool
-dpcls_rule_matches_key(const struct dpcls_rule *rule,
-                       const struct netdev_flow_key *target)
-{
-    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
-    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
-    uint64_t value;
-
-    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
-        if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
-            return false;
-        }
-    }
-    return true;
-}
-
 /* For each miniflow in 'keys' performs a classifier lookup writing the result
  * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
  * NULL it is skipped.
diff --git a/lib/dpif-netdev.h b/lib/dpif-netdev.h
index 7195b9791..8b11485d4 100644
--- a/lib/dpif-netdev.h
+++ b/lib/dpif-netdev.h
@@ -100,8 +100,60 @@  struct dpcls_subtable {
 #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP)   \
     MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
 
-bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
-                            const struct netdev_flow_key *target);
+
+/* Iterate a unit of a miniflow, verifing if those block values match */
+static inline int32_t
+dpcls_verify_unit(const uint64_t unit, uint64_t offset, const uint64_t *rle,
+                  const uint64_t *msk, const uint64_t *pkt)
+{
+    int match_fail = 0;
+    int linear_idx = 0;
+
+    uint64_t iter = unit;
+    while (iter) {
+        uint64_t low_bit = iter & (-iter);
+        iter &= ~(low_bit);
+
+        uint64_t low_mask = low_bit - 1;
+        uint64_t bits = (low_mask & unit);
+        uint64_t blk_idx = __builtin_popcountll(bits) + offset;
+
+        /* Take packet, mask bits away, compare against rule.
+         * Results in 1 for matching, so ! to invert to fail */
+        match_fail |= !((pkt[blk_idx] & msk[linear_idx]) == rle[linear_idx]);
+        linear_idx++;
+    }
+
+    return match_fail;
+}
+
+/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
+ * in 'mask' the values in 'key' and 'target' are the same. */
+static inline int
+dpcls_rule_matches_key(const struct dpcls_rule *rule,
+                       const struct netdev_flow_key *target)
+{
+    const uint64_t *rle_blocks = miniflow_get_values(&rule->flow.mf);
+    const uint64_t *msk_blocks = miniflow_get_values(&rule->mask->mf);
+    const uint64_t *pkt_blocks = miniflow_get_values(&target->mf);
+    const uint64_t u0 = rule->mask->mf.map.bits[0];
+    const uint64_t u1 = rule->mask->mf.map.bits[1];
+
+    /* offset for u1 */
+    int u0_pop = __builtin_popcountll(u0);
+
+    int fail = 0;
+    /* call verify_unit for both units. This has multiple advantages:
+     * 1) Each while() loop gets its own branch predictor entry - improves hits
+     * 2) Compiler can re-shuffle instructions as it likes, also between iters
+     * 3) Simpler popcount() approach means less branches in general
+     */
+    fail |= dpcls_verify_unit(u0, 0, rle_blocks, msk_blocks, pkt_blocks);
+    fail |= dpcls_verify_unit(u1, u0_pop, rle_blocks, msk_blocks, pkt_blocks);
+
+    /* return 1 if matches, 0 on fail */
+    return ~(fail);
+}
 
 
 #ifdef  __cplusplus