@@ -7207,24 +7207,6 @@ dpcls_remove(struct dpcls *cls, struct dpcls_rule *rule)
}
}
-/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
- * in 'mask' the values in 'key' and 'target' are the same. */
-bool
-dpcls_rule_matches_key(const struct dpcls_rule *rule,
- const struct netdev_flow_key *target)
-{
- const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
- const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
- uint64_t value;
-
- NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(value, target, rule->flow.mf.map) {
- if (OVS_UNLIKELY((value & *maskp++) != *keyp++)) {
- return false;
- }
- }
- return true;
-}
-
/* For each miniflow in 'keys' performs a classifier lookup writing the result
* into the corresponding slot in 'rules'. If a particular entry in 'keys' is
* NULL it is skipped.
@@ -100,8 +100,60 @@ struct dpcls_subtable {
#define NETDEV_FLOW_KEY_FOR_EACH_IN_FLOWMAP(VALUE, KEY, FLOWMAP) \
MINIFLOW_FOR_EACH_IN_FLOWMAP(VALUE, &(KEY)->mf, FLOWMAP)
-bool dpcls_rule_matches_key(const struct dpcls_rule *rule,
- const struct netdev_flow_key *target);
+
+/* Iterate a unit of a miniflow, verifing if those block values match */
+static inline int32_t
+dpcls_verify_unit(const uint64_t unit, uint64_t offset, const uint64_t *rle,
+ const uint64_t *msk, const uint64_t *pkt)
+{
+ int match_fail = 0;
+ int linear_idx = 0;
+
+ uint64_t iter = unit;
+ while (iter) {
+ uint64_t low_bit = iter & (-iter);
+ iter &= ~(low_bit);
+
+ uint64_t low_mask = low_bit - 1;
+ uint64_t bits = (low_mask & unit);
+ uint64_t blk_idx = __builtin_popcountll(bits) + offset;
+
+ /* Take packet, mask bits away, compare against rule.
+ * Results in 1 for matching, so ! to invert to fail */
+ match_fail |= !((pkt[blk_idx] & msk[linear_idx]) == rle[linear_idx]);
+ linear_idx++;
+ }
+
+ return match_fail;
+}
+
+/* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
+ * in 'mask' the values in 'key' and 'target' are the same. */
+static inline int
+dpcls_rule_matches_key(const struct dpcls_rule *rule,
+ const struct netdev_flow_key *target)
+{
+ const uint64_t *rle_blocks = miniflow_get_values(&rule->flow.mf);
+ const uint64_t *msk_blocks = miniflow_get_values(&rule->mask->mf);
+ const uint64_t *pkt_blocks = miniflow_get_values(&target->mf);
+ const uint64_t u0 = rule->mask->mf.map.bits[0];
+ const uint64_t u1 = rule->mask->mf.map.bits[1];
+
+ /* offset for u1 */
+ int u0_pop = __builtin_popcountll(u0);
+
+ int fail = 0;
+ /* call verify_unit for both units. This has multiple advantages:
+ * 1) Each while() loop gets its own branch predictor entry - improves hits
+ * 2) Compiler can re-shuffle instructions as it likes, also between iters
+ * 3) Simpler popcount() approach means less branches in general
+ */
+ fail |= dpcls_verify_unit(u0, 0, rle_blocks, msk_blocks, pkt_blocks);
+ fail |= dpcls_verify_unit(u1, u0_pop, rle_blocks, msk_blocks, pkt_blocks);
+
+ /* return 1 if matches, 0 on fail */
+ return ~(fail);
+}
#ifdef __cplusplus
This commit optimizes the dpcls_rule_matches_key() function by refactoring the code to be shorter and less branchy. The main code-change (and optimization) is to use popcount and mask to calculate the packet block index. This code is split out to the header file, which is marked as static inline. The code itself is refactored into two functions, one to iterate the "units" of the miniflow, the other to compare the actual miniflow "blocks" and compare them with the mask/value. The verify_unit() function is called twice, and should/will be inlined by the compiler in optimized builds. As a result the while() loop inside is hence present in two locations in the caller function. This gives each while() loop its own entry in the branch-predictor, helping it to correctly predict each units loop iterations. Finally, we always compute all blocks, and then return match or not-matching in a branch-free manner. This allows the pipeline to execute better, by removing non-predictable branches. Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com> --- This patch optimizes rule_verify() signficantly, please benchmark before and after this patch and see what gains your workload sees. v3: - Fix "last minute v2 variable rename fix" which broke the build :/ --- lib/dpif-netdev.c | 18 --------------- lib/dpif-netdev.h | 56 +++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 54 insertions(+), 20 deletions(-)