diff mbox series

[ovs-dev,v4,4/4] dpif-netdev: optimized dpcls_rule_matches_key()

Message ID 20190121132241.76471-5-harry.van.haaren@intel.com
State Changes Requested
Headers show
Series dpcls subtable miniflow optimizations | expand

Commit Message

Van Haaren, Harry Jan. 21, 2019, 1:22 p.m. UTC
This commit optimizes the dpcls_rule_matches_key() function
by refactoring the code to be shorter and less branchy. The
main code-change (and optimization) is to use popcount and
mask to calculate the packet block index.

This code is split out to the header file, which is marked as
static inline. The code itself is refactored into two functions,
one to iterate the "units" of the miniflow, the other to compare
the actual miniflow "blocks" and compare them with the mask/value.

The verify_unit() function is called twice, and should/will be inlined
by the compiler in optimized builds. As a result the while() loop
inside is hence present in two locations in the caller function.
This gives each while() loop its own entry in the branch-predictor,
helping it to correctly predict each units loop iterations.

Finally, we always compute all blocks, and then return match or
not-matching in a branch-free manner. This allows the pipeline to
execute better, by removing non-predictable branches.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

This patch optimizes rule_verify() signficantly, please benchmark
before and after this patch and see what gains your workload sees.

v4:
- Fixed bug in popcount offset calculation, thanks Ilya for reporting
- Comments cleaned up and reworded for clarity

v3:
- Fix "last minute variable rename fix" which broke the build :/
---
 lib/dpif-netdev.c | 18 -------------
 lib/dpif-netdev.h | 68 +++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 66 insertions(+), 20 deletions(-)

Comments

0-day Robot Jan. 21, 2019, 2:09 p.m. UTC | #1
Bleep bloop.  Greetings Harry van Haaren, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Line is 94 characters long (recommended limit is 79)
#131 FILE: lib/dpif-netdev.h:163:
    fail |= dpcls_verify_unit(rle_u0, pkt_u0, &rle_blocks[0], &msk_blocks[0], &pkt_blocks[0]);

WARNING: Line is 95 characters long (recommended limit is 79)
#132 FILE: lib/dpif-netdev.h:164:
    fail |= dpcls_verify_unit(rle_u1, pkt_u1, &rle_blocks[rle_u0_pop], &msk_blocks[rle_u0_pop],

Lines checked: 144, Warnings: 2, Errors: 0


Please check this out.  If you feel there has been an error, please email aconole@bytheb.org

Thanks,
0-day Robot
diff mbox series

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index bcf77db22..bee85a8f3 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -7669,24 +7669,6 @@  dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
     }
 }
 
-/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
- * in 'mask' the values in 'key' and 'target' are the same. */
-bool
-dpcls_rule_matches_key(const struct dpcls_rule *rule,
-                       const struct netdev_flow_key *target)
-{
-    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
-    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
-    uint64_t value;
-
-    NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
-        if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
-            return false;
-        }
-    }
-    return true;
-}
-
 /* For each miniflow in 'keys' performs a classifier lookup writing the result
  * into the corresponding slot in 'rules'.  If a particular entry in 'keys' is
  * NULL it is skipped.
diff --git a/lib/dpif-netdev.h b/lib/dpif-netdev.h
index 7195b9791..13017ce14 100644
--- a/lib/dpif-netdev.h
+++ b/lib/dpif-netdev.h
@@ -100,9 +100,73 @@  struct dpcls_subtable {
 #define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP)   \
     MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
 
-bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
-                            const struct netdev_flow_key *target);
+/* Iterate all bits set in the *rle_unit*, lookup the block of metadata based
+ * on the packet miniflow, and compare it for "matching" the rule, using the
+ * subtable mask in the process. Note that the pointers passed in to this
+ * function are already adjusted for the unit offset. */
+static inline int32_t
+dpcls_verify_unit(const uint64_t rle_unit, const uint64_t pkt_unit,
+                  const uint64_t *rle, const uint64_t *msk,
+                  const uint64_t *pkt)
+{
+    int match_fail = 0;
+    int linear_idx = 0;
+
+    uint64_t iter = rle_unit;
+    while (iter) {
+        uint64_t low_bit = iter & (-iter);
+        iter &= ~(low_bit);
+
+        uint64_t low_mask = low_bit - 1;
+        uint64_t bits = (low_mask & pkt_unit);
+        uint64_t blk_idx = __builtin_popcountll(bits);
+
+        /* Take packet, mask bits away, compare against rule.
+         * Results in 1 for matching, so ! to invert to fail */
+        match_fail |= !((pkt[blk_idx] & msk[linear_idx]) == rle[linear_idx]);
+        linear_idx++;
+    }
+
+    return match_fail;
+}
 
+/* match rule and target (aka packet), to understand if the rule applies to
+ * this packet. The actual miniflow-unit iteration is performed in
+ * the *dpcls_verify_unit* function, this just wraps the two unit calls */
+static inline int
+dpcls_rule_matches_key(const struct dpcls_rule *rule,
+                       const struct netdev_flow_key *target)
+{
+    /* retrieve the "block" pointers for the packet, rule and subtable mask */
+    const uint64_t *rle_blocks = miniflow_get_values(&rule->flow.mf);
+    const uint64_t *msk_blocks = miniflow_get_values(&rule->mask->mf);
+    const uint64_t *pkt_blocks = miniflow_get_values(&target->mf);
+
+    /* fetch the rule bits to iterate */
+    const uint64_t rle_u0 = rule->flow.mf.map.bits[0];
+    const uint64_t rle_u1 = rule->flow.mf.map.bits[1];
+
+    /* fetch the packet bits to navigate the packet's miniflow block indexes */
+    const uint64_t pkt_u0 = target->mf.map.bits[0];
+    const uint64_t pkt_u1 = target->mf.map.bits[1];
+
+    /* calculate where u1 starts by finding total size of u0 */
+    int rle_u0_pop = __builtin_popcountll(rle_u0);
+    int pkt_u0_pop = __builtin_popcountll(pkt_u0);
+
+    int fail = 0;
+    /* call verify_unit for both units. This has multiple advantages:
+     * 1) Each while() loop gets its own branch predictor entry - improves hits
+     * 2) Compiler can re-shuffle instructions as it likes, also between iters
+     * 3) Simpler popcount() approach means less branches in general
+     */
+    fail |= dpcls_verify_unit(rle_u0, pkt_u0, &rle_blocks[0], &msk_blocks[0], &pkt_blocks[0]);
+    fail |= dpcls_verify_unit(rle_u1, pkt_u1, &rle_blocks[rle_u0_pop], &msk_blocks[rle_u0_pop],
+                              &pkt_blocks[pkt_u0_pop]);
+
+    /* return 1 if matches, 0 on fail */
+    return fail == 0;
+}
 
 #ifdef  __cplusplus
 }