[ovs-dev,v13,04/12] dpif-avx512: Add ISA implementation of dpif.

Message ID 20210617161825.94741-5-cian.ferriter@intel.com
State Changes Requested
Series DPIF Framework + Optimizations

Commit Message

Ferriter, Cian June 17, 2021, 4:18 p.m. UTC
From: Harry van Haaren <harry.van.haaren@intel.com>

This commit adds the AVX512 implementation of DPIF functionality,
specifically the dp_netdev_input_outer_avx512 function. This function
only handles outer (no re-circulations), and is optimized to use the
AVX512 ISA for packet batching and other DPIF work.

Sparse is not able to handle the AVX512 intrinsics, causing compile
time failures, so it is disabled for this file.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
Co-authored-by: Cian Ferriter <cian.ferriter@intel.com>
Signed-off-by: Cian Ferriter <cian.ferriter@intel.com>
Co-authored-by: Kumar Amber <kumar.amber@intel.com>
Signed-off-by: Kumar Amber <kumar.amber@intel.com>

---

v13:
- Squash "Add HWOL support" commit into this commit.
- Add NEWS item about this feature here rather than in a later commit.
- Add #define NUM_U64_IN_ZMM_REG 8.
- Add comment describing operation of while loop handling HWOL->EMC->SMC
  lookups in dp_netdev_input_outer_avx512().
- Add EMC and SMC batch insert functions for better handling of EMC and
  SMC in AVX512 DPIF.
- Minor code refactor to address review comments.
---
 NEWS                             |   2 +
 lib/automake.mk                  |   5 +-
 lib/dpif-netdev-avx512.c         | 327 +++++++++++++++++++++++++++++++
 lib/dpif-netdev-private-dfc.h    |  25 +++
 lib/dpif-netdev-private-dpif.h   |  32 +++
 lib/dpif-netdev-private-thread.h |  11 +-
 lib/dpif-netdev-private.h        |  25 +++
 lib/dpif-netdev.c                | 103 ++++++++--
 8 files changed, 514 insertions(+), 16 deletions(-)
 create mode 100644 lib/dpif-netdev-avx512.c
 create mode 100644 lib/dpif-netdev-private-dpif.h

Comments

Flavio Leitner June 20, 2021, 8:08 p.m. UTC | #1
Hi,

I am still reviewing the patch, but I thought it worth discussing a
few items below.

On Thu, Jun 17, 2021 at 05:18:17PM +0100, Cian Ferriter wrote:
> From: Harry van Haaren <harry.van.haaren@intel.com>
> 
> This commit adds the AVX512 implementation of DPIF functionality,
> specifically the dp_netdev_input_outer_avx512 function. This function
> only handles outer (no re-circulations), and is optimized to use the
> AVX512 ISA for packet batching and other DPIF work.
> 
> Sparse is not able to handle the AVX512 intrinsics, causing compile
> time failures, so it is disabled for this file.
> 
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> Co-authored-by: Cian Ferriter <cian.ferriter@intel.com>
> Signed-off-by: Cian Ferriter <cian.ferriter@intel.com>
> Co-authored-by: Kumar Amber <kumar.amber@intel.com>
> Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> 
> ---
> 
> v13:
> - Squash "Add HWOL support" commit into this commit.
> - Add NEWS item about this feature here rather than in a later commit.
> - Add #define NUM_U64_IN_ZMM_REG 8.
> - Add comment describing operation of while loop handling HWOL->EMC->SMC
>   lookups in dp_netdev_input_outer_avx512().
> - Add EMC and SMC batch insert functions for better handling of EMC and
>   SMC in AVX512 DPIF.
> - Minor code refactor to address review comments.
> ---
>  NEWS                             |   2 +
>  lib/automake.mk                  |   5 +-
>  lib/dpif-netdev-avx512.c         | 327 +++++++++++++++++++++++++++++++
>  lib/dpif-netdev-private-dfc.h    |  25 +++
>  lib/dpif-netdev-private-dpif.h   |  32 +++
>  lib/dpif-netdev-private-thread.h |  11 +-
>  lib/dpif-netdev-private.h        |  25 +++
>  lib/dpif-netdev.c                | 103 ++++++++--
>  8 files changed, 514 insertions(+), 16 deletions(-)
>  create mode 100644 lib/dpif-netdev-avx512.c
>  create mode 100644 lib/dpif-netdev-private-dpif.h
> 
> diff --git a/NEWS b/NEWS
> index 96b3a61c8..6a4a7b76d 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -10,6 +10,8 @@ Post-v2.15.0
>       * Auto load balancing of PMDs now partially supports cross-NUMA polling
>         cases, e.g if all PMD threads are running on the same NUMA node.
>       * Refactor lib/dpif-netdev.c to multiple header files.
> +     * Add avx512 implementation of dpif which can process non-recirculated
> +       packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
>     - ovs-ctl:
>       * New option '--no-record-hostname' to disable hostname configuration
>         in ovsdb on startup.
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 3a33cdd5c..660cd07f0 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -33,11 +33,13 @@ lib_libopenvswitchavx512_la_CFLAGS = \
>  	-mavx512f \
>  	-mavx512bw \
>  	-mavx512dq \
> +	-mbmi \
>  	-mbmi2 \
>  	-fPIC \
>  	$(AM_CFLAGS)
>  lib_libopenvswitchavx512_la_SOURCES = \
> -	lib/dpif-netdev-lookup-avx512-gather.c
> +	lib/dpif-netdev-lookup-avx512-gather.c \
> +	lib/dpif-netdev-avx512.c
>  lib_libopenvswitchavx512_la_LDFLAGS = \
>  	-static
>  endif
> @@ -114,6 +116,7 @@ lib_libopenvswitch_la_SOURCES = \
>  	lib/dpif-netdev-private-dfc.c \
>  	lib/dpif-netdev-private-dfc.h \
>  	lib/dpif-netdev-private-dpcls.h \
> +	lib/dpif-netdev-private-dpif.h \
>  	lib/dpif-netdev-private-flow.h \
>  	lib/dpif-netdev-private-hwol.h \
>  	lib/dpif-netdev-private-thread.h \
> diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
> new file mode 100644
> index 000000000..0e55b0be2
> --- /dev/null
> +++ b/lib/dpif-netdev-avx512.c
> @@ -0,0 +1,327 @@
> +/*
> + * Copyright (c) 2021 Intel Corporation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifdef __x86_64__
> +/* Sparse cannot handle the AVX512 instructions. */
> +#if !defined(__CHECKER__)
> +
> +#include <config.h>
> +
> +#include "dpif-netdev.h"
> +#include "dpif-netdev-perf.h"
> +
> +#include "dpif-netdev-private.h"
> +#include "dpif-netdev-private-dpcls.h"
> +#include "dpif-netdev-private-flow.h"
> +#include "dpif-netdev-private-thread.h"
> +#include "dpif-netdev-private-hwol.h"

The -private.h already includes a few of the above, but
not all, so the interface doesn't seem to be well defined.
For example, in -private.h we have dpcls_lookup() while
other dpcls functions are in -private-dpcls.h. In this
case, the following would be enough:

#include "dpif-netdev-private.h"
#include "dpif-netdev-private-hwol.h"

But then I don't know why other headers are included in the
interface but not the -private-hwol.h.


> +
> +#include "dp-packet.h"
> +#include "netdev.h"
> +
> +#include "immintrin.h"
> +
> +/* Each AVX512 register (zmm register in assembly notation) can contain up to
> + * 512 bits, which is equivalent to 8 uint64_t variables. This is the maximum
> + * number of miniflow blocks that can be processed in a single pass of the
> + * AVX512 code at a time.
> + */
> +#define NUM_U64_IN_ZMM_REG (8)
> +
> +/* Structure to contain per-packet metadata that must be attributed to the
> + * dp netdev flow. Tracking this per packet is unfortunate, but it is
> + * awkward to maintain the data in a performant way otherwise. This structure
> + * helps to keep the two variables on a single cache line per packet.
> + */
> +struct pkt_flow_meta {
> +    uint16_t bytes;
> +    uint16_t tcp_flags;
> +};
> +
> +/* Structure of heap allocated memory for DPIF internals. */
> +struct dpif_userdata {
> +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> +        struct netdev_flow_key keys[NETDEV_MAX_BURST];
> +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> +        struct netdev_flow_key *key_ptrs[NETDEV_MAX_BURST];
> +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> +        struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
> +};
> +
> +int32_t
> +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> +                             struct dp_packet_batch *packets,
> +                             odp_port_t in_port)
> +{
> +    /* Allocate DPIF userdata. */
> +    if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) {
> +        pmd->netdev_input_func_userdata =
> +                xmalloc_pagealign(sizeof(struct dpif_userdata));
> +    }
> +
> +    struct dpif_userdata *ud = pmd->netdev_input_func_userdata;
> +    struct netdev_flow_key *keys = ud->keys;
> +    struct netdev_flow_key **key_ptrs = ud->key_ptrs;
> +    struct pkt_flow_meta *pkt_meta = ud->pkt_meta;
> +
> +    /* The AVX512 DPIF implementation handles rules in a way that is optimized
> +     * for reducing data-movement between HWOL/EMC/SMC and DPCLS. This is
> +     * achieved by separating the rule arrays. Bitmasks are kept for each
> +     * packet, indicating if it matched in the HWOL/EMC/SMC array or DPCLS
> +     * array. Later the two arrays are merged by AVX-512 expand instructions.
> +     */
> +
> +    /* Stores the computed output: a rule pointer for each packet. */
> +    /* Used initially for HWOL/EMC/SMC. */
> +    struct dpcls_rule *rules[NETDEV_MAX_BURST];
> +    /* Used for DPCLS. */
> +    struct dpcls_rule *dpcls_rules[NETDEV_MAX_BURST];
> +
> +    uint32_t dpcls_key_idx = 0;
> +
> +    for (uint32_t i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
> +        _mm512_storeu_si512(&rules[i], _mm512_setzero_si512());
> +        _mm512_storeu_si512(&dpcls_rules[i], _mm512_setzero_si512());
> +    }
> +
> +    /* Prefetch each packet's metadata. */
> +    const size_t batch_size = dp_packet_batch_size(packets);
> +    for (int i = 0; i < batch_size; i++) {
> +        struct dp_packet *packet = packets->packets[i];
> +        OVS_PREFETCH(dp_packet_data(packet));
> +        pkt_metadata_prefetch_init(&packet->md);
> +    }
> +
> +    /* Check if EMC or SMC are enabled. */
> +    struct dfc_cache *cache = &pmd->flow_cache;
> +    const uint32_t emc_enabled = pmd->ctx.emc_insert_min != 0;
> +    const uint32_t smc_enabled = pmd->ctx.smc_enable_db;
> +
> +    uint32_t emc_hits = 0;
> +    uint32_t smc_hits = 0;
> +
> +    /* A 1 bit in this mask indicates a hit, so no DPCLS lookup on the pkt. */
> +    uint32_t hwol_emc_smc_hitmask = 0;
> +    uint32_t smc_hitmask = 0;
> +
> +    /* The below while loop is based on the 'iter' variable which has a number
> +     * of bits set representing packets that we want to process
> +     * (HWOL->MFEX->EMC->SMC). As each packet is processed, we clear (set to 0)
> +     * the bit representing that packet using '_blsr_u64()'. The
> +     * '__builtin_ctz()' will give us the correct index into the 'packets',
> +     * 'pkt_meta', 'keys' and 'rules' arrays.
> +     *
> +     * For one iteration of the while loop, here's some pseudocode as an
> +     * example where 'iter' is represented in binary:
> +     *
> +     * while (iter) { // iter = 1100
> +     *     uint32_t i = __builtin_ctz(iter); // i = 2
> +     *     iter = _blsr_u64(iter); // iter = 1000
> +     *     // do all processing (HWOL->MFEX->EMC->SMC)
> +     * }
> +     */
> +    uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1;
> +    uint32_t iter = lookup_pkts_bitmask;
> +    while (iter) {
> +        uint32_t i = __builtin_ctz(iter);
> +        iter = _blsr_u64(iter);
> +
> +        /* Get packet pointer from bitmask and packet md. */
> +        struct dp_packet *packet = packets->packets[i];
> +        pkt_metadata_init(&packet->md, in_port);
> +
> +        struct dp_netdev_flow *f = NULL;
> +
> +        /* Check for partial hardware offload mark. */
> +        uint32_t mark;
> +        if (dp_packet_has_flow_mark(packet, &mark)) {
> +            f = mark_to_flow_find(pmd, mark);
> +            if (f) {
> +                rules[i] = &f->cr;
> +                pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
> +                pkt_meta[i].bytes = dp_packet_size(packet);
> +                hwol_emc_smc_hitmask |= (1 << i);
> +                continue;
> +            }
> +        }
> +
> +        /* Do miniflow extract into keys. */
> +        struct netdev_flow_key *key = &keys[i];
> +        miniflow_extract(packet, &key->mf);
> +
> +        /* Cache TCP and byte values for all packets. */
> +        pkt_meta[i].bytes = dp_packet_size(packet);
> +        pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf);
> +
> +        key->len = netdev_flow_key_size(miniflow_n_values(&key->mf));
> +        key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf);
> +
> +        if (emc_enabled) {
> +            f = emc_lookup(&cache->emc_cache, key);
> +
> +            if (f) {
> +                rules[i] = &f->cr;
> +                emc_hits++;
> +                hwol_emc_smc_hitmask |= (1 << i);
> +                continue;
> +            }
> +        }
> +
> +        if (smc_enabled && !f) {

Do we need !f here? It seems that is the only possible case.


> +            f = smc_lookup_single(pmd, packet, key);
> +            if (f) {
> +                rules[i] = &f->cr;
> +                smc_hits++;
> +                smc_hitmask |= (1 << i);
> +                continue;
> +            }
> +        }
> +
> +        /* The flow pointer was not found in HWOL/EMC/SMC, so add it to the
> +         * dpcls input keys array for batch lookup later.
> +         */
> +        key_ptrs[dpcls_key_idx] = &keys[i];
> +        dpcls_key_idx++;
> +    }
> +
> +    hwol_emc_smc_hitmask |= smc_hitmask;
> +
> +    /* DPCLS handles any packets missed by HWOL/EMC/SMC. It operates on the
> +     * key_ptrs[] for input miniflows to match, storing results in the
> +     * dpcls_rules[] array.
> +     */
> +    if (dpcls_key_idx > 0) {
> +        struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
> +        if (OVS_UNLIKELY(!cls)) {
> +            return -1;
> +        }
> +        bool any_miss =
> +            !dpcls_lookup(cls, (const struct netdev_flow_key **) key_ptrs,
> +                          dpcls_rules, dpcls_key_idx, NULL);
> +        if (OVS_UNLIKELY(any_miss)) {
> +            return -1;
> +        }
> +
> +        /* Merge DPCLS rules and HWOL/EMC/SMC rules. */
> +        uint32_t dpcls_idx = 0;
> +        for (int i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
> +            /* Indexing here is somewhat complicated due to DPCLS output rule
> +             * load index depending on the hitmask of HWOL/EMC/SMC. More
> +             * packets from HWOL/EMC/SMC bitmask means less DPCLS rules are
> +             * used.
> +             */
> +            __m512i v_cache_rules = _mm512_loadu_si512(&rules[i]);
> +            __m512i v_merged_rules =
> +                        _mm512_mask_expandloadu_epi64(v_cache_rules,
> +                                                      ~hwol_emc_smc_hitmask,
> +                                                      &dpcls_rules[dpcls_idx]);
> +            _mm512_storeu_si512(&rules[i], v_merged_rules);
> +
> +            /* Update DPCLS load index and bitmask for HWOL/EMC/SMC hits.
> +             * There are NUM_U64_IN_ZMM_REG output pointers per register;
> +             * subtracting the HWOL/EMC/SMC lanes gives the number of DPCLS
> +             * rules consumed.
> +             */
> +            uint32_t hitmask_FF = (hwol_emc_smc_hitmask & 0xFF);
> +            dpcls_idx += NUM_U64_IN_ZMM_REG - __builtin_popcountll(hitmask_FF);
> +            hwol_emc_smc_hitmask =
> +                (hwol_emc_smc_hitmask >> NUM_U64_IN_ZMM_REG);
> +        }
> +    }
> +
> +    /* At this point we have a 1:1 pkt to rules mapping, so update EMC/SMC
> +     * if required.
> +     */
> +    /* Insert SMC and DPCLS hits into EMC. */
> +    /* Insert DPCLS hits into SMC. */
> +    if (emc_enabled) {
> +        uint32_t emc_insert_mask = smc_hitmask | ~hwol_emc_smc_hitmask;

The hwol_emc_smc_hitmask could contain only the last most significant
8 bits from the original mask if dpcls is used. What am I missing?
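
To make it concrete, here is a small standalone sketch of what I think
happens (my own example, not the patch; it assumes NETDEV_MAX_BURST is 32):

    #include <stdint.h>
    #include <stdio.h>

    #define NUM_U64_IN_ZMM_REG 8
    #define NETDEV_MAX_BURST 32

    int main(void)
    {
        uint32_t hitmask = 0x000000FF; /* Pretend pkts 0..7 hit in EMC. */

        /* The merge loop runs 32/8 = 4 times and ends each pass with the
         * shift below, so the 32-bit mask is shifted away entirely. */
        for (int i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
            hitmask >>= NUM_U64_IN_ZMM_REG;
        }

        /* hitmask is now 0, so ~hitmask selects every packet for the EMC
         * insert, including the ones that already hit in HWOL/EMC. */
        printf("%#x\n", (unsigned) ~hitmask); /* Prints 0xffffffff. */
        return 0;
    }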


> +        emc_insert_mask &= lookup_pkts_bitmask;
> +        emc_probabilistic_insert_batch(pmd, keys, &rules[0], emc_insert_mask);
> +    }
> +    if (smc_enabled) {
> +        uint32_t smc_insert_mask = ~hwol_emc_smc_hitmask;
> +        smc_insert_mask &= lookup_pkts_bitmask;
> +        smc_insert_batch(pmd, keys, &rules[0], smc_insert_mask);
> +    }
> +
> +    /* At this point we don't return error anymore, so commit stats here. */
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_RECV, batch_size);
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, emc_hits);
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, smc_hits);
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
> +                            dpcls_key_idx);
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
> +                            dpcls_key_idx);
> +
> +    /* Initialize the "Action Batch" for each flow handled below. */
> +    struct dp_packet_batch action_batch;
> +    action_batch.trunc = 0;
> +
> +    while (lookup_pkts_bitmask) {
> +        uint32_t rule_pkt_idx = __builtin_ctz(lookup_pkts_bitmask);
> +        uint64_t needle = (uintptr_t) rules[rule_pkt_idx];
> +
> +        /* Compare NUM_U64_IN_ZMM_REG flow pointers to the needle in
> +         * parallel, creating a bitmask.
> +         */
> +        uint32_t batch_bitmask = 0;
> +        for (uint32_t j = 0; j < NETDEV_MAX_BURST; j += NUM_U64_IN_ZMM_REG) {
> +            /* Pre-calculate store addr. */
> +            uint32_t num_pkts_in_batch = __builtin_popcountll(batch_bitmask);
> +            void *store_addr = &action_batch.packets[num_pkts_in_batch];
> +
> +            /* Search for identical flow* in burst, update bitmask. */
> +            __m512i v_needle = _mm512_set1_epi64(needle);
> +            __m512i v_hay = _mm512_loadu_si512(&rules[j]);
> +            __mmask8 k_cmp_bits = _mm512_cmpeq_epi64_mask(v_needle, v_hay);
> +            uint32_t cmp_bits = k_cmp_bits;
> +            batch_bitmask |= cmp_bits << j;
> +
> +            /* Compress and store the batched packets. */
> +            struct dp_packet **packets_ptrs = &packets->packets[j];
> +            __m512i v_pkt_ptrs = _mm512_loadu_si512(packets_ptrs);
> +            _mm512_mask_compressstoreu_epi64(store_addr, cmp_bits, v_pkt_ptrs);
> +        }
> +
> +        /* Strip all packets in this batch from the lookup_pkts_bitmask. */
> +        lookup_pkts_bitmask &= (~batch_bitmask);
> +        action_batch.count = __builtin_popcountll(batch_bitmask);
> +
> +        /* Loop over all packets in this batch, to gather the byte and tcp_flag
> +         * values, and pass them to the execute function. It would be nice to
> +         * optimize this away, however it is not easy to refactor in dpif.
> +         */
> +        uint32_t bytes = 0;
> +        uint16_t tcp_flags = 0;
> +        uint32_t bitmask_iter = batch_bitmask;
> +        for (int i = 0; i < action_batch.count; i++) {
> +            uint32_t idx = __builtin_ctzll(bitmask_iter);
> +            bitmask_iter = _blsr_u64(bitmask_iter);
> +
> +            bytes += pkt_meta[idx].bytes;
> +            tcp_flags |= pkt_meta[idx].tcp_flags;
> +        }
> +
> +        dp_netdev_batch_execute(pmd, &action_batch, rules[rule_pkt_idx],
> +                                bytes, tcp_flags);
> +    }
> +
> +    return 0;
> +}
> +
> +#endif
> +#endif
> diff --git a/lib/dpif-netdev-private-dfc.h b/lib/dpif-netdev-private-dfc.h
> index 6a672d1b3..d5d4da7ea 100644
> --- a/lib/dpif-netdev-private-dfc.h
> +++ b/lib/dpif-netdev-private-dfc.h
> @@ -81,6 +81,14 @@ extern "C" {
>  #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX /                     \
>                                      DEFAULT_EM_FLOW_INSERT_INV_PROB)
>  
> +/* Forward declaration for SMC function prototype that requires access to
> + * 'struct dp_netdev_pmd_thread'. */
> +struct dp_netdev_pmd_thread;
> +
> +/* Forward declaration for EMC and SMC batch insert function prototypes that
> + * require access to 'struct dpcls_rule'. */
> +struct dpcls_rule;
> +
>  struct emc_entry {
>      struct dp_netdev_flow *flow;
>      struct netdev_flow_key key;   /* key.hash used for emc hash value. */
> @@ -168,6 +176,23 @@ emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
>      return NULL;
>  }
>  
> +/* Insert a batch of keys/flows into the EMC and SMC caches. */
> +void
> +emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
> +                               const struct netdev_flow_key *keys,
> +                               struct dpcls_rule **rules,
> +                               uint32_t emc_insert_mask);
> +
> +void
> +smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
> +                               const struct netdev_flow_key *keys,
> +                               struct dpcls_rule **rules,
> +                               uint32_t smc_insert_mask);
> +
> +struct dp_netdev_flow *
> +smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
> +                  struct dp_packet *packet,
> +                  struct netdev_flow_key *key);
>  
>  #ifdef  __cplusplus
>  }
> diff --git a/lib/dpif-netdev-private-dpif.h b/lib/dpif-netdev-private-dpif.h
> new file mode 100644
> index 000000000..2fd7cc400
> --- /dev/null
> +++ b/lib/dpif-netdev-private-dpif.h
> @@ -0,0 +1,32 @@
> +/*
> + * Copyright (c) 2021 Intel Corporation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef DPIF_NETDEV_PRIVATE_DPIF_H
> +#define DPIF_NETDEV_PRIVATE_DPIF_H 1
> +
> +#include "openvswitch/types.h"
> +
> +/* Forward declarations to avoid including files. */
> +struct dp_netdev_pmd_thread;
> +struct dp_packet_batch;
> +
> +/* Available implementations for dpif work. */
> +int32_t
> +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> +                             struct dp_packet_batch *packets,
> +                             odp_port_t in_port);
> +
> +#endif /* DPIF_NETDEV_PRIVATE_DPIF_H */
> diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
> index 0d674ab83..17356d5e2 100644
> --- a/lib/dpif-netdev-private-thread.h
> +++ b/lib/dpif-netdev-private-thread.h
> @@ -45,14 +45,19 @@ struct dp_netdev_pmd_thread_ctx {
>      struct dp_netdev_rxq *last_rxq;
>      /* EMC insertion probability context for the current processing cycle. */
>      uint32_t emc_insert_min;
> +    /* Enable the SMC cache from ovsdb config. */
> +    bool smc_enable_db;
>  };
>  
>  /* Forward declaration for typedef. */
>  struct dp_netdev_pmd_thread;
>  
> -typedef void (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
> -                                     struct dp_packet_batch *packets,
> -                                     odp_port_t port_no);
> +/* Typedef for DPIF functions.
> + * Returns a bitmask of packets to handle, possibly including upcall/misses.
> + */
> +typedef int32_t (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
> +                                        struct dp_packet_batch *packets,
> +                                        odp_port_t port_no);
>  
>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
>   * the performance overhead of interrupt processing.  Therefore netdev can
> diff --git a/lib/dpif-netdev-private.h b/lib/dpif-netdev-private.h
> index d7b6fd7ec..0315b5bf6 100644
> --- a/lib/dpif-netdev-private.h
> +++ b/lib/dpif-netdev-private.h
> @@ -31,4 +31,29 @@
>  #include "dpif-netdev-private-dfc.h"
>  #include "dpif-netdev-private-thread.h"
>  
> +/* Allow other implementations to lookup the DPCLS instances. */
> +struct dpcls *
> +dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
> +                           odp_port_t in_port);
> +
> +/* Allow other implementations to call dpcls_lookup() for subtable search. */
> +bool
> +dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
> +             struct dpcls_rule **rules, const size_t cnt,
> +             int *num_lookups_p);
> +
> +/* Allow other implementations to execute actions on a batch. */
> +void
> +dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
> +                        struct dp_packet_batch *packets,
> +                        struct dpcls_rule *rule,
> +                        uint32_t bytes,
> +                        uint16_t tcp_flags);
> +
> +/* Available implementations for dpif work. */
> +int32_t
> +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> +                             struct dp_packet_batch *packets,
> +                             odp_port_t in_port);
> +
>  #endif /* netdev-private.h */
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index e6486417e..1f15af882 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -183,10 +183,6 @@ static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls);
>  static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
>                           const struct netdev_flow_key *mask);
>  static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
> -static bool dpcls_lookup(struct dpcls *cls,
> -                         const struct netdev_flow_key *keys[],
> -                         struct dpcls_rule **rules, size_t cnt,
> -                         int *num_lookups_p);
>  
>  /* Set of supported meter flags */
>  #define DP_SUPPORTED_METER_FLAGS_MASK \
> @@ -483,7 +479,7 @@ static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
>                                        const struct flow *flow,
>                                        const struct nlattr *actions,
>                                        size_t actions_len);
> -static void dp_netdev_input(struct dp_netdev_pmd_thread *,
> +static int32_t dp_netdev_input(struct dp_netdev_pmd_thread *,
>                              struct dp_packet_batch *, odp_port_t port_no);
>  static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
>                                    struct dp_packet_batch *);
> @@ -555,7 +551,7 @@ dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
>                                 bool purge);
>  static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
>                                        struct tx_port *tx);
> -static inline struct dpcls *
> +inline struct dpcls *
>  dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
>                             odp_port_t in_port);
>  
> @@ -1920,7 +1916,7 @@ void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
>      }
>  }
>  
> -static inline struct dpcls *
> +inline struct dpcls *
>  dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
>                             odp_port_t in_port)
>  {
> @@ -2714,13 +2710,46 @@ smc_insert(struct dp_netdev_pmd_thread *pmd,
>      bucket->flow_idx[i] = index;
>  }
>  
> +inline void
> +emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
> +                               const struct netdev_flow_key *keys,
> +                               struct dpcls_rule **rules,
> +                               uint32_t emc_insert_mask)
> +{
> +    while (emc_insert_mask) {
> +        uint32_t i = __builtin_ctz(emc_insert_mask);
> +        emc_insert_mask &= emc_insert_mask - 1;
> +        /* Get the required parameters for EMC/SMC from the rule. */
> +        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
> +        /* Insert the key into EMC/SMC. */
> +        emc_probabilistic_insert(pmd, &keys[i], flow);
> +    }
> +}
> +
> +inline void
> +smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
> +                 const struct netdev_flow_key *keys,
> +                 struct dpcls_rule **rules,
> +                 uint32_t smc_insert_mask)
> +{
> +    while (smc_insert_mask) {
> +        uint32_t i = __builtin_ctz(smc_insert_mask);
> +        smc_insert_mask &= smc_insert_mask - 1;
> +        /* Get the required parameters for EMC/SMC from the rule. */
> +        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
> +        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
> +        /* Insert the key into EMC/SMC. */
> +        smc_insert(pmd, &keys[i], hash);
> +    }
> +}
> +
>  static struct dp_netdev_flow *
>  dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
>                            const struct netdev_flow_key *key,
>                            int *lookup_num_p)
>  {
>      struct dpcls *cls;
> -    struct dpcls_rule *rule;
> +    struct dpcls_rule *rule = NULL;
>      odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
>                                                       in_port.odp_port));
>      struct dp_netdev_flow *netdev_flow = NULL;
> @@ -4233,7 +4262,10 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>          }
>  
>          /* Process packet batch. */
> -        pmd->netdev_input_func(pmd, &batch, port_no);
> +        int32_t ret = pmd->netdev_input_func(pmd, &batch, port_no);

nit: int is enough.

> +        if (ret) {
> +            dp_netdev_input(pmd, &batch, port_no);
> +        }
>  
>          /* Assign processing cycles to rx queue. */
>          cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
> @@ -5251,6 +5283,8 @@ dpif_netdev_run(struct dpif *dpif)
>                      non_pmd->ctx.emc_insert_min = 0;
>                  }
>  
> +                non_pmd->ctx.smc_enable_db = dp->smc_enable_db;
> +
>                  for (i = 0; i < port->n_rxq; i++) {
>  
>                      if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
> @@ -5522,6 +5556,8 @@ reload:
>                  pmd->ctx.emc_insert_min = 0;
>              }
>  
> +            pmd->ctx.smc_enable_db = pmd->dp->smc_enable_db;
> +
>              process_packets =
>                  dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
>                                             poll_list[i].port_no);
> @@ -6415,6 +6451,24 @@ packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
>                                actions->actions, actions->size);
>  }
>  
> +void
> +dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
> +                        struct dp_packet_batch *packets,
> +                        struct dpcls_rule *rule,
> +                        uint32_t bytes,
> +                        uint16_t tcp_flags)
> +{
> +    /* Gets action* from the rule. */
> +    struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule);
> +    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
> +
> +    dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes,
> +                        tcp_flags, pmd->ctx.now / 1000);
> +    const uint32_t steal = 1;
> +    dp_netdev_execute_actions(pmd, packets, steal, &flow->flow,
> +                              actions->actions, actions->size);
> +}
> +
>  static inline void
>  dp_netdev_queue_batches(struct dp_packet *pkt,
>                          struct dp_netdev_flow *flow, uint16_t tcp_flags,
> @@ -6519,6 +6573,30 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
>  }
>  
> +struct dp_netdev_flow *
> +smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
> +                  struct dp_packet *packet,
> +                  struct netdev_flow_key *key)
> +{
> +    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
> +
> +    if (OVS_LIKELY(flow_node != NULL)) {
> +        struct dp_netdev_flow *flow = NULL;
> +
> +        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
> +            /* Since we dont have per-port megaflow to check the port
> +             * number, we need to verify that the input ports match. */
> +            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
> +                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
> +
> +                return (void *) flow;
> +            }
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
>  /* Try to process all ('cnt') the 'packets' using only the datapath flow cache
>   * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
>   * miniflow is copied into 'keys' and the packet pointer is moved at the
> @@ -6924,12 +7002,13 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
>      }
>  }
>  
> -static void
> +static int32_t
>  dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
>                  struct dp_packet_batch *packets,
>                  odp_port_t port_no)
>  {
>      dp_netdev_input__(pmd, packets, false, port_no);
> +    return 0;
>  }
>  
>  static void
> @@ -8369,7 +8448,7 @@ dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
>  
>  /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
>   * in 'mask' the values in 'key' and 'target' are the same. */
> -bool
> +inline bool ALWAYS_INLINE
>  dpcls_rule_matches_key(const struct dpcls_rule *rule,
>                         const struct netdev_flow_key *target)

Why always_inline? Shouldn't it be in the header then?

Thanks,
fbl


>  {
> @@ -8395,7 +8474,7 @@ dpcls_rule_matches_key(const struct dpcls_rule *rule,
>   * priorities, instead returning any rule which matches the flow.
>   *
>   * Returns true if all miniflows found a corresponding rule. */
> -static bool
> +bool
>  dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
>               struct dpcls_rule **rules, const size_t cnt,
>               int *num_lookups_p)
> -- 
> 2.32.0
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Ferriter, Cian June 21, 2021, 4:13 p.m. UTC | #2
Hi Flavio,

Thanks for the review. My responses are inline.

Cian

> -----Original Message-----
> From: Flavio Leitner <fbl@sysclose.org>
> Sent: Sunday 20 June 2021 21:09
> To: Ferriter, Cian <cian.ferriter@intel.com>
> Cc: ovs-dev@openvswitch.org; Amber, Kumar <kumar.amber@intel.com>; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v13 04/12] dpif-avx512: Add ISA implementation of dpif.
> 
> 
> Hi,
> 
> I am still reviewing the patch, but I thought it worth discussing a
> few items below.
> 
> On Thu, Jun 17, 2021 at 05:18:17PM +0100, Cian Ferriter wrote:
> > From: Harry van Haaren <harry.van.haaren@intel.com>
> >
> > This commit adds the AVX512 implementation of DPIF functionality,
> > specifically the dp_netdev_input_outer_avx512 function. This function
> > only handles outer (no re-circulations), and is optimized to use the
> > AVX512 ISA for packet batching and other DPIF work.
> >
> > Sparse is not able to handle the AVX512 intrinsics, causing compile
> > time failures, so it is disabled for this file.
> >
> > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > Co-authored-by: Cian Ferriter <cian.ferriter@intel.com>
> > Signed-off-by: Cian Ferriter <cian.ferriter@intel.com>
> > Co-authored-by: Kumar Amber <kumar.amber@intel.com>
> > Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> >
> > ---
> >
> > v13:
> > - Squash "Add HWOL support" commit into this commit.
> > - Add NEWS item about this feature here rather than in a later commit.
> > - Add #define NUM_U64_IN_ZMM_REG 8.
> > - Add comment describing operation of while loop handling HWOL->EMC->SMC
> >   lookups in dp_netdev_input_outer_avx512().
> > - Add EMC and SMC batch insert functions for better handling of EMC and
> >   SMC in AVX512 DPIF.
> > - Minor code refactor to address review comments.
> > ---
> >  NEWS                             |   2 +
> >  lib/automake.mk                  |   5 +-
> >  lib/dpif-netdev-avx512.c         | 327 +++++++++++++++++++++++++++++++
> >  lib/dpif-netdev-private-dfc.h    |  25 +++
> >  lib/dpif-netdev-private-dpif.h   |  32 +++
> >  lib/dpif-netdev-private-thread.h |  11 +-
> >  lib/dpif-netdev-private.h        |  25 +++
> >  lib/dpif-netdev.c                | 103 ++++++++--
> >  8 files changed, 514 insertions(+), 16 deletions(-)
> >  create mode 100644 lib/dpif-netdev-avx512.c
> >  create mode 100644 lib/dpif-netdev-private-dpif.h
> >
> > diff --git a/NEWS b/NEWS
> > index 96b3a61c8..6a4a7b76d 100644
> > --- a/NEWS
> > +++ b/NEWS
> > @@ -10,6 +10,8 @@ Post-v2.15.0
> >       * Auto load balancing of PMDs now partially supports cross-NUMA polling
> >         cases, e.g if all PMD threads are running on the same NUMA node.
> >       * Refactor lib/dpif-netdev.c to multiple header files.
> > +     * Add avx512 implementation of dpif which can process non-recirculated
> > +       packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
> >     - ovs-ctl:
> >       * New option '--no-record-hostname' to disable hostname configuration
> >         in ovsdb on startup.
> > diff --git a/lib/automake.mk b/lib/automake.mk
> > index 3a33cdd5c..660cd07f0 100644
> > --- a/lib/automake.mk
> > +++ b/lib/automake.mk
> > @@ -33,11 +33,13 @@ lib_libopenvswitchavx512_la_CFLAGS = \
> >  	-mavx512f \
> >  	-mavx512bw \
> >  	-mavx512dq \
> > +	-mbmi \
> >  	-mbmi2 \
> >  	-fPIC \
> >  	$(AM_CFLAGS)
> >  lib_libopenvswitchavx512_la_SOURCES = \
> > -	lib/dpif-netdev-lookup-avx512-gather.c
> > +	lib/dpif-netdev-lookup-avx512-gather.c \
> > +	lib/dpif-netdev-avx512.c
> >  lib_libopenvswitchavx512_la_LDFLAGS = \
> >  	-static
> >  endif
> > @@ -114,6 +116,7 @@ lib_libopenvswitch_la_SOURCES = \
> >  	lib/dpif-netdev-private-dfc.c \
> >  	lib/dpif-netdev-private-dfc.h \
> >  	lib/dpif-netdev-private-dpcls.h \
> > +	lib/dpif-netdev-private-dpif.h \
> >  	lib/dpif-netdev-private-flow.h \
> >  	lib/dpif-netdev-private-hwol.h \
> >  	lib/dpif-netdev-private-thread.h \
> > diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
> > new file mode 100644
> > index 000000000..0e55b0be2
> > --- /dev/null
> > +++ b/lib/dpif-netdev-avx512.c
> > @@ -0,0 +1,327 @@
> > +/*
> > + * Copyright (c) 2021 Intel Corporation.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +#ifdef __x86_64__
> > +/* Sparse cannot handle the AVX512 instructions. */
> > +#if !defined(__CHECKER__)
> > +
> > +#include <config.h>
> > +
> > +#include "dpif-netdev.h"
> > +#include "dpif-netdev-perf.h"
> > +
> > +#include "dpif-netdev-private.h"
> > +#include "dpif-netdev-private-dpcls.h"
> > +#include "dpif-netdev-private-flow.h"
> > +#include "dpif-netdev-private-thread.h"
> > +#include "dpif-netdev-private-hwol.h"
> 
> The -private.h already includes a few of the above, but
> not all, so the interface doesn't seem to be well defined.
> For example, in -private.h we have dpcls_lookup() while
> other dpcls functions are in -private-dpcls.h. In this
> case, the following would be enough:
> 
> #include "dpif-netdev-private.h"
> #include "dpif-netdev-private-hwol.h"
> 
> But then I don't know why other headers are included in the
> interface but not the -private-hwol.h.
> 
> 

Good point. This can be cleaned up. I've included
lib/dpif-netdev-private-hwol.h in lib/dpif-netdev-private.h and removed
the headers included by lib/dpif-netdev-private.h from
lib/dpif-netdev-avx512.c.

I'll move the prototype for dpcls_lookup() too; it makes more sense in
lib/dpif-netdev-private-dpcls.h.
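
So the top of lib/dpif-netdev-avx512.c should end up looking something
like this (a sketch of the planned v14 change, not final code):

    #include <config.h>

    #include "dpif-netdev.h"
    #include "dpif-netdev-perf.h"

    /* Pulls in -private-dfc.h, -private-dpcls.h, -private-flow.h,
     * -private-thread.h and, after this cleanup, -private-hwol.h. */
    #include "dpif-netdev-private.h"

    #include "dp-packet.h"
    #include "netdev.h"

    #include "immintrin.h"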

> > +
> > +#include "dp-packet.h"
> > +#include "netdev.h"
> > +
> > +#include "immintrin.h"
> > +
> > +/* Each AVX512 register (zmm register in assembly notation) can contain up to
> > + * 512 bits, which is equivalent to 8 uint64_t variables. This is the maximum
> > + * number of miniflow blocks that can be processed in a single pass of the
> > + * AVX512 code at a time.
> > + */
> > +#define NUM_U64_IN_ZMM_REG (8)
> > +
> > +/* Structure to contain per-packet metadata that must be attributed to the
> > + * dp netdev flow. Tracking this per packet is unfortunate, but it is
> > + * awkward to maintain the data in a performant way otherwise. This structure
> > + * helps to keep the two variables on a single cache line per packet.
> > + */
> > +struct pkt_flow_meta {
> > +    uint16_t bytes;
> > +    uint16_t tcp_flags;
> > +};
> > +
> > +/* Structure of heap allocated memory for DPIF internals. */
> > +struct dpif_userdata {
> > +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> > +        struct netdev_flow_key keys[NETDEV_MAX_BURST];
> > +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> > +        struct netdev_flow_key *key_ptrs[NETDEV_MAX_BURST];
> > +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> > +        struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
> > +};
> > +
> > +int32_t
> > +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> > +                             struct dp_packet_batch *packets,
> > +                             odp_port_t in_port)
> > +{
> > +    /* Allocate DPIF userdata. */
> > +    if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) {
> > +        pmd->netdev_input_func_userdata =
> > +                xmalloc_pagealign(sizeof(struct dpif_userdata));
> > +    }
> > +
> > +    struct dpif_userdata *ud = pmd->netdev_input_func_userdata;
> > +    struct netdev_flow_key *keys = ud->keys;
> > +    struct netdev_flow_key **key_ptrs = ud->key_ptrs;
> > +    struct pkt_flow_meta *pkt_meta = ud->pkt_meta;
> > +
> > +    /* The AVX512 DPIF implementation handles rules in a way that is optimized
> > +     * for reducing data-movement between HWOL/EMC/SMC and DPCLS. This is
> > +     * achieved by separating the rule arrays. Bitmasks are kept for each
> > +     * packet, indicating if it matched in the HWOL/EMC/SMC array or DPCLS
> > +     * array. Later the two arrays are merged by AVX-512 expand instructions.
> > +     */
> > +
> > +    /* Stores the computed output: a rule pointer for each packet. */
> > +    /* Used initially for HWOL/EMC/SMC. */
> > +    struct dpcls_rule *rules[NETDEV_MAX_BURST];
> > +    /* Used for DPCLS. */
> > +    struct dpcls_rule *dpcls_rules[NETDEV_MAX_BURST];
> > +
> > +    uint32_t dpcls_key_idx = 0;
> > +
> > +    for (uint32_t i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
> > +        _mm512_storeu_si512(&rules[i], _mm512_setzero_si512());
> > +        _mm512_storeu_si512(&dpcls_rules[i], _mm512_setzero_si512());
> > +    }
> > +
> > +    /* Prefetch each packet's metadata. */
> > +    const size_t batch_size = dp_packet_batch_size(packets);
> > +    for (int i = 0; i < batch_size; i++) {
> > +        struct dp_packet *packet = packets->packets[i];
> > +        OVS_PREFETCH(dp_packet_data(packet));
> > +        pkt_metadata_prefetch_init(&packet->md);
> > +    }
> > +
> > +    /* Check if EMC or SMC are enabled. */
> > +    struct dfc_cache *cache = &pmd->flow_cache;
> > +    const uint32_t emc_enabled = pmd->ctx.emc_insert_min != 0;
> > +    const uint32_t smc_enabled = pmd->ctx.smc_enable_db;
> > +
> > +    uint32_t emc_hits = 0;
> > +    uint32_t smc_hits = 0;
> > +
> > +    /* A 1 bit in this mask indicates a hit, so no DPCLS lookup on the pkt. */
> > +    uint32_t hwol_emc_smc_hitmask = 0;
> > +    uint32_t smc_hitmask = 0;
> > +
> > +    /* The below while loop is based on the 'iter' variable which has a number
> > +     * of bits set representing packets that we want to process
> > +     * (HWOL->MFEX->EMC->SMC). As each packet is processed, we clear (set to 0)
> > +     * the bit representing that packet using '_blsr_u64()'. The
> > +     * '__builtin_ctz()' will give us the correct index into the 'packets',
> > +     * 'pkt_meta', 'keys' and 'rules' arrays.
> > +     *
> > +     * For one iteration of the while loop, here's some pseudocode as an
> > +     * example where 'iter' is represented in binary:
> > +     *
> > +     * while (iter) { // iter = 1100
> > +     *     uint32_t i = __builtin_ctz(iter); // i = 2
> > +     *     iter = _blsr_u64(iter); // iter = 1000
> > +     *     // do all processing (HWOL->MFEX->EMC->SMC)
> > +     * }
> > +     */
> > +    uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1;
> > +    uint32_t iter = lookup_pkts_bitmask;
> > +    while (iter) {
> > +        uint32_t i = __builtin_ctz(iter);
> > +        iter = _blsr_u64(iter);
> > +
> > +        /* Get packet pointer from bitmask and packet md. */
> > +        struct dp_packet *packet = packets->packets[i];
> > +        pkt_metadata_init(&packet->md, in_port);
> > +
> > +        struct dp_netdev_flow *f = NULL;
> > +
> > +        /* Check for partial hardware offload mark. */
> > +        uint32_t mark;
> > +        if (dp_packet_has_flow_mark(packet, &mark)) {
> > +            f = mark_to_flow_find(pmd, mark);
> > +            if (f) {
> > +                rules[i] = &f->cr;
> > +                pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
> > +                pkt_meta[i].bytes = dp_packet_size(packet);
> > +                hwol_emc_smc_hitmask |= (1 << i);
> > +                continue;
> > +            }
> > +        }
> > +
> > +        /* Do miniflow extract into keys. */
> > +        struct netdev_flow_key *key = &keys[i];
> > +        miniflow_extract(packet, &key->mf);
> > +
> > +        /* Cache TCP and byte values for all packets. */
> > +        pkt_meta[i].bytes = dp_packet_size(packet);
> > +        pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf);
> > +
> > +        key->len = netdev_flow_key_size(miniflow_n_values(&key->mf));
> > +        key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf);
> > +
> > +        if (emc_enabled) {
> > +            f = emc_lookup(&cache->emc_cache, key);
> > +
> > +            if (f) {
> > +                rules[i] = &f->cr;
> > +                emc_hits++;
> > +                hwol_emc_smc_hitmask |= (1 << i);
> > +                continue;
> > +            }
> > +        }
> > +
> > +        if (smc_enabled && !f) {
> 
> Do we need !f here? It seems that is the only possible case.
> 
> 

Good catch, we don't need that !f check. I'll remove it.
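
The branch simply becomes (sketch of the v14 change):

    /* 'f' is always NULL here, since every earlier hit path ends in
     * 'continue', so the '!f' check was dead. */
    if (smc_enabled) {
        f = smc_lookup_single(pmd, packet, key);
        if (f) {
            rules[i] = &f->cr;
            smc_hits++;
            smc_hitmask |= (1 << i);
            continue;
        }
    }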

> > +            f = smc_lookup_single(pmd, packet, key);
> > +            if (f) {
> > +                rules[i] = &f->cr;
> > +                smc_hits++;
> > +                smc_hitmask |= (1 << i);
> > +                continue;
> > +            }
> > +        }
> > +
> > +        /* The flow pointer was not found in HWOL/EMC/SMC, so add it to the
> > +         * dpcls input keys array for batch lookup later.
> > +         */
> > +        key_ptrs[dpcls_key_idx] = &keys[i];
> > +        dpcls_key_idx++;
> > +    }
> > +
> > +    hwol_emc_smc_hitmask |= smc_hitmask;
> > +
> > +    /* DPCLS handles any packets missed by HWOL/EMC/SMC. It operates on the
> > +     * key_ptrs[] for input miniflows to match, storing results in the
> > +     * dpcls_rules[] array.
> > +     */
> > +    if (dpcls_key_idx > 0) {
> > +        struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
> > +        if (OVS_UNLIKELY(!cls)) {
> > +            return -1;
> > +        }
> > +        bool any_miss =
> > +            !dpcls_lookup(cls, (const struct netdev_flow_key **) key_ptrs,
> > +                          dpcls_rules, dpcls_key_idx, NULL);
> > +        if (OVS_UNLIKELY(any_miss)) {
> > +            return -1;
> > +        }
> > +
> > +        /* Merge DPCLS rules and HWOL/EMC/SMC rules. */
> > +        uint32_t dpcls_idx = 0;
> > +        for (int i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
> > +            /* Indexing here is somewhat complicated due to DPCLS output rule
> > +             * load index depending on the hitmask of HWOL/EMC/SMC. More
> > +             * packets from HWOL/EMC/SMC bitmask means less DPCLS rules are
> > +             * used.
> > +             */
> > +            __m512i v_cache_rules = _mm512_loadu_si512(&rules[i]);
> > +            __m512i v_merged_rules =
> > +                        _mm512_mask_expandloadu_epi64(v_cache_rules,
> > +                                                      ~hwol_emc_smc_hitmask,
> > +                                                      &dpcls_rules[dpcls_idx]);
> > +            _mm512_storeu_si512(&rules[i], v_merged_rules);
> > +
> > +            /* Update DPCLS load index and bitmask for HWOL/EMC/SMC hits.
> > +             * There are NUM_U64_IN_ZMM_REG output pointers per register;
> > +             * subtracting the HWOL/EMC/SMC lanes gives the number of DPCLS
> > +             * rules consumed.
> > +             */
> > +            uint32_t hitmask_FF = (hwol_emc_smc_hitmask & 0xFF);
> > +            dpcls_idx += NUM_U64_IN_ZMM_REG - __builtin_popcountll(hitmask_FF);
> > +            hwol_emc_smc_hitmask =
> > +                (hwol_emc_smc_hitmask >> NUM_U64_IN_ZMM_REG);
> > +        }
> > +    }
> > +
> > +    /* At this point we have a 1:1 pkt to rules mapping, so update EMC/SMC
> > +     * if required.
> > +     */
> > +    /* Insert SMC and DPCLS hits into EMC. */
> > +    /* Insert DPCLS hits into SMC. */
> > +    if (emc_enabled) {
> > +        uint32_t emc_insert_mask = smc_hitmask | ~hwol_emc_smc_hitmask;
> 
> The hwol_emc_smc_hitmask could contain only the last most significant
> 8 bits from the original mask if dpcls is used. What am I missing?
> 
> 

Good catch, this is an error. hwol_emc_smc_hitmask has been shifted in
the case where DPCLS is used. We want to use hwol_emc_smc_hitmask before
it is modified by the DPCLS lookup section. I'll fix this in the next
version; thanks for pointing it out.
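
Something along these lines should do it: take a snapshot of the miss
mask right after the SMC hits are folded in, before the DPCLS merge loop
shifts hwol_emc_smc_hitmask away (untested sketch, names may change in
v14):

    hwol_emc_smc_hitmask |= smc_hitmask;
    /* Snapshot, taken before the DPCLS merge loop consumes the mask. */
    const uint32_t hwol_emc_smc_missmask = ~hwol_emc_smc_hitmask;

    /* ... DPCLS lookup and rule merging as before ... */

    if (emc_enabled) {
        uint32_t emc_insert_mask = smc_hitmask | hwol_emc_smc_missmask;
        emc_insert_mask &= lookup_pkts_bitmask;
        emc_probabilistic_insert_batch(pmd, keys, &rules[0],
                                       emc_insert_mask);
    }
    if (smc_enabled) {
        uint32_t smc_insert_mask = hwol_emc_smc_missmask;
        smc_insert_mask &= lookup_pkts_bitmask;
        smc_insert_batch(pmd, keys, &rules[0], smc_insert_mask);
    }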

> > [... rest of quoted patch snipped ...]
> >  static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
> >                           const struct netdev_flow_key *mask);
> >  static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
> > -static bool dpcls_lookup(struct dpcls *cls,
> > -                         const struct netdev_flow_key *keys[],
> > -                         struct dpcls_rule **rules, size_t cnt,
> > -                         int *num_lookups_p);
> >
> >  /* Set of supported meter flags */
> >  #define DP_SUPPORTED_METER_FLAGS_MASK \
> > @@ -483,7 +479,7 @@ static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
> >                                        const struct flow *flow,
> >                                        const struct nlattr *actions,
> >                                        size_t actions_len);
> > -static void dp_netdev_input(struct dp_netdev_pmd_thread *,
> > +static int32_t dp_netdev_input(struct dp_netdev_pmd_thread *,
> >                              struct dp_packet_batch *, odp_port_t port_no);
> >  static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
> >                                    struct dp_packet_batch *);
> > @@ -555,7 +551,7 @@ dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
> >                                 bool purge);
> >  static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
> >                                        struct tx_port *tx);
> > -static inline struct dpcls *
> > +inline struct dpcls *
> >  dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
> >                             odp_port_t in_port);
> >
> > @@ -1920,7 +1916,7 @@ void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
> >      }
> >  }
> >
> > -static inline struct dpcls *
> > +inline struct dpcls *
> >  dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
> >                             odp_port_t in_port)
> >  {
> > @@ -2714,13 +2710,46 @@ smc_insert(struct dp_netdev_pmd_thread *pmd,
> >      bucket->flow_idx[i] = index;
> >  }
> >
> > +inline void
> > +emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
> > +                               const struct netdev_flow_key *keys,
> > +                               struct dpcls_rule **rules,
> > +                               uint32_t emc_insert_mask)
> > +{
> > +    while (emc_insert_mask) {
> > +        uint32_t i = __builtin_ctz(emc_insert_mask);
> > +        emc_insert_mask &= emc_insert_mask - 1;
> > +            /* Get the required parameters for the EMC from the rule. */
> > +            struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
> > +            /* Insert the key into the EMC. */
> > +        emc_probabilistic_insert(pmd, &keys[i], flow);
> > +    }
> > +}
> > +
> > +inline void
> > +smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
> > +                 const struct netdev_flow_key *keys,
> > +                 struct dpcls_rule **rules,
> > +                 uint32_t smc_insert_mask)
> > +{
> > +    while (smc_insert_mask) {
> > +        uint32_t i = __builtin_ctz(smc_insert_mask);
> > +        smc_insert_mask &= smc_insert_mask - 1;
> > +            /* Get the required parameters for the SMC from the rule. */
> > +            struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
> > +            uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
> > +            /* Insert the key into the SMC. */
> > +        smc_insert(pmd, &keys[i], hash);
> > +    }
> > +}
> > +
> >  static struct dp_netdev_flow *
> >  dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
> >                            const struct netdev_flow_key *key,
> >                            int *lookup_num_p)
> >  {
> >      struct dpcls *cls;
> > -    struct dpcls_rule *rule;
> > +    struct dpcls_rule *rule = NULL;
> >      odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
> >                                                       in_port.odp_port));
> >      struct dp_netdev_flow *netdev_flow = NULL;
> > @@ -4233,7 +4262,10 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> >          }
> >
> >          /* Process packet batch. */
> > -        pmd->netdev_input_func(pmd, &batch, port_no);
> > +        int32_t ret = pmd->netdev_input_func(pmd, &batch, port_no);
> 
> nit: int is enough.
> 

I'll change to int in the next version.
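
For illustration, a sketch of the call site with that change applied (nothing
else in the hunk changes):

    /* Process packet batch. */
    int ret = pmd->netdev_input_func(pmd, &batch, port_no);
    if (ret) {
        /* Non-zero means the optimized DPIF declined the batch, so fall
         * back to the generic DPIF. */
        dp_netdev_input(pmd, &batch, port_no);
    }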

> > +        if (ret) {
> > +            dp_netdev_input(pmd, &batch, port_no);
> > +        }
> >
> >          /* Assign processing cycles to rx queue. */
> >          cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
> > @@ -5251,6 +5283,8 @@ dpif_netdev_run(struct dpif *dpif)
> >                      non_pmd->ctx.emc_insert_min = 0;
> >                  }
> >
> > +                non_pmd->ctx.smc_enable_db = dp->smc_enable_db;
> > +
> >                  for (i = 0; i < port->n_rxq; i++) {
> >
> >                      if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
> > @@ -5522,6 +5556,8 @@ reload:
> >                  pmd->ctx.emc_insert_min = 0;
> >              }
> >
> > +            pmd->ctx.smc_enable_db = pmd->dp->smc_enable_db;
> > +
> >              process_packets =
> >                  dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
> >                                             poll_list[i].port_no);
> > @@ -6415,6 +6451,24 @@ packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
> >                                actions->actions, actions->size);
> >  }
> >
> > +void
> > +dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
> > +                        struct dp_packet_batch *packets,
> > +                        struct dpcls_rule *rule,
> > +                        uint32_t bytes,
> > +                        uint16_t tcp_flags)
> > +{
> > +    /* Gets action* from the rule. */
> > +    struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule);
> > +    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
> > +
> > +    dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes,
> > +                        tcp_flags, pmd->ctx.now / 1000);
> > +    const uint32_t steal = 1;
> > +    dp_netdev_execute_actions(pmd, packets, steal, &flow->flow,
> > +                              actions->actions, actions->size);
> > +}
> > +
> >  static inline void
> >  dp_netdev_queue_batches(struct dp_packet *pkt,
> >                          struct dp_netdev_flow *flow, uint16_t tcp_flags,
> > @@ -6519,6 +6573,30 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
> >      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
> >  }
> >
> > +struct dp_netdev_flow *
> > +smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
> > +                  struct dp_packet *packet,
> > +                  struct netdev_flow_key *key)
> > +{
> > +    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
> > +
> > +    if (OVS_LIKELY(flow_node != NULL)) {
> > +        struct dp_netdev_flow *flow = NULL;
> > +
> > +        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
> > +            /* Since we don't have per-port megaflow to check the port
> > +             * number, we need to verify that the input ports match. */
> > +            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
> > +                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
> > +
> > +                return (void *) flow;
> > +            }
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> >  /* Try to process all ('cnt') the 'packets' using only the datapath flow cache
> >   * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
> >   * miniflow is copied into 'keys' and the packet pointer is moved at the
> > @@ -6924,12 +7002,13 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
> >      }
> >  }
> >
> > -static void
> > +static int32_t
> >  dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
> >                  struct dp_packet_batch *packets,
> >                  odp_port_t port_no)
> >  {
> >      dp_netdev_input__(pmd, packets, false, port_no);
> > +    return 0;
> >  }
> >
> >  static void
> > @@ -8369,7 +8448,7 @@ dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
> >
> >  /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
> >   * in 'mask' the values in 'key' and 'target' are the same. */
> > -bool
> > +inline bool ALWAYS_INLINE
> >  dpcls_rule_matches_key(const struct dpcls_rule *rule,
> >                         const struct netdev_flow_key *target)
> 
> Why always_inline? Shouldn't it be in the header then?
> 

We were experimenting with inlining different functions and left this in as an oversight. I'll take out the "ALWAYS_INLINE".
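
For clarity, a sketch of the hunk with that experiment reverted (assuming
nothing else in it changes):

    bool
    dpcls_rule_matches_key(const struct dpcls_rule *rule,
                           const struct netdev_flow_key *target)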

> Thanks,
> fbl
> 
> 
> >  {
> > @@ -8395,7 +8474,7 @@ dpcls_rule_matches_key(const struct dpcls_rule *rule,
> >   * priorities, instead returning any rule which matches the flow.
> >   *
> >   * Returns true if all miniflows found a corresponding rule. */
> > -static bool
> > +bool
> >  dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
> >               struct dpcls_rule **rules, const size_t cnt,
> >               int *num_lookups_p)
> > --
> > 2.32.0
> >
> 
> --
> fbl
Flavio Leitner June 21, 2021, 4:39 p.m. UTC | #3
On Mon, Jun 21, 2021 at 04:13:12PM +0000, Ferriter, Cian wrote:
> Hi Flavio,
> 
> Thanks for the review. My responses are inline.
> 
> Cian
> 
> > -----Original Message-----
> > From: Flavio Leitner <fbl@sysclose.org>
> > Sent: Sunday 20 June 2021 21:09
> > To: Ferriter, Cian <cian.ferriter@intel.com>
> > Cc: ovs-dev@openvswitch.org; Amber, Kumar <kumar.amber@intel.com>; i.maximets@ovn.org
> > Subject: Re: [ovs-dev] [v13 04/12] dpif-avx512: Add ISA implementation of dpif.
> > 
> > 
> > Hi,
> > 
> > I am still reviewing the patch, but I thought worth to discuss
> > few items below.
> > 
> > On Thu, Jun 17, 2021 at 05:18:17PM +0100, Cian Ferriter wrote:
> > > From: Harry van Haaren <harry.van.haaren@intel.com>
> > >
> > > This commit adds the AVX512 implementation of DPIF functionality,
> > > specifically the dp_netdev_input_outer_avx512 function. This function
> > > only handles outer (no re-circulations), and is optimized to use the
> > > AVX512 ISA for packet batching and other DPIF work.
> > >
> > > Sparse is not able to handle the AVX512 intrinsics, causing compile
> > > time failures, so it is disabled for this file.
> > >
> > > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > > Co-authored-by: Cian Ferriter <cian.ferriter@intel.com>
> > > Signed-off-by: Cian Ferriter <cian.ferriter@intel.com>
> > > Co-authored-by: Kumar Amber <kumar.amber@intel.com>
> > > Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> > >
> > > ---
> > >
> > > v13:
> > > - Squash "Add HWOL support" commit into this commit.
> > > - Add NEWS item about this feature here rather than in a later commit.
> > > - Add #define NUM_U64_IN_ZMM_REG 8.
> > > - Add comment describing operation of while loop handling HWOL->EMC->SMC
> > >   lookups in dp_netdev_input_outer_avx512().
> > > - Add EMC and SMC batch insert functions for better handling of EMC and
> > >   SMC in AVX512 DPIF.
> > > - Minor code refactor to address review comments.
> > > ---
> > >  NEWS                             |   2 +
> > >  lib/automake.mk                  |   5 +-
> > >  lib/dpif-netdev-avx512.c         | 327 +++++++++++++++++++++++++++++++
> > >  lib/dpif-netdev-private-dfc.h    |  25 +++
> > >  lib/dpif-netdev-private-dpif.h   |  32 +++
> > >  lib/dpif-netdev-private-thread.h |  11 +-
> > >  lib/dpif-netdev-private.h        |  25 +++
> > >  lib/dpif-netdev.c                | 103 ++++++++--
> > >  8 files changed, 514 insertions(+), 16 deletions(-)
> > >  create mode 100644 lib/dpif-netdev-avx512.c
> > >  create mode 100644 lib/dpif-netdev-private-dpif.h
> > >
> > > diff --git a/NEWS b/NEWS
> > > index 96b3a61c8..6a4a7b76d 100644
> > > --- a/NEWS
> > > +++ b/NEWS
> > > @@ -10,6 +10,8 @@ Post-v2.15.0
> > >       * Auto load balancing of PMDs now partially supports cross-NUMA polling
> > >         cases, e.g if all PMD threads are running on the same NUMA node.
> > >       * Refactor lib/dpif-netdev.c to multiple header files.
> > > +     * Add avx512 implementation of dpif which can process non recirculated
> > > +       packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
> > >     - ovs-ctl:
> > >       * New option '--no-record-hostname' to disable hostname configuration
> > >         in ovsdb on startup.
> > > diff --git a/lib/automake.mk b/lib/automake.mk
> > > index 3a33cdd5c..660cd07f0 100644
> > > --- a/lib/automake.mk
> > > +++ b/lib/automake.mk
> > > @@ -33,11 +33,13 @@ lib_libopenvswitchavx512_la_CFLAGS = \
> > >  	-mavx512f \
> > >  	-mavx512bw \
> > >  	-mavx512dq \
> > > +	-mbmi \
> > >  	-mbmi2 \
> > >  	-fPIC \
> > >  	$(AM_CFLAGS)
> > >  lib_libopenvswitchavx512_la_SOURCES = \
> > > -	lib/dpif-netdev-lookup-avx512-gather.c
> > > +	lib/dpif-netdev-lookup-avx512-gather.c \
> > > +	lib/dpif-netdev-avx512.c
> > >  lib_libopenvswitchavx512_la_LDFLAGS = \
> > >  	-static
> > >  endif
> > > @@ -114,6 +116,7 @@ lib_libopenvswitch_la_SOURCES = \
> > >  	lib/dpif-netdev-private-dfc.c \
> > >  	lib/dpif-netdev-private-dfc.h \
> > >  	lib/dpif-netdev-private-dpcls.h \
> > > +	lib/dpif-netdev-private-dpif.h \
> > >  	lib/dpif-netdev-private-flow.h \
> > >  	lib/dpif-netdev-private-hwol.h \
> > >  	lib/dpif-netdev-private-thread.h \
> > > diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
> > > new file mode 100644
> > > index 000000000..0e55b0be2
> > > --- /dev/null
> > > +++ b/lib/dpif-netdev-avx512.c
> > > @@ -0,0 +1,327 @@
> > > +/*
> > > + * Copyright (c) 2021 Intel Corporation.
> > > + *
> > > + * Licensed under the Apache License, Version 2.0 (the "License");
> > > + * you may not use this file except in compliance with the License.
> > > + * You may obtain a copy of the License at:
> > > + *
> > > + *     http://www.apache.org/licenses/LICENSE-2.0
> > > + *
> > > + * Unless required by applicable law or agreed to in writing, software
> > > + * distributed under the License is distributed on an "AS IS" BASIS,
> > > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > > + * See the License for the specific language governing permissions and
> > > + * limitations under the License.
> > > + */
> > > +
> > > +#ifdef __x86_64__
> > > +/* Sparse cannot handle the AVX512 instructions. */
> > > +#if !defined(__CHECKER__)
> > > +
> > > +#include <config.h>
> > > +
> > > +#include "dpif-netdev.h"
> > > +#include "dpif-netdev-perf.h"
> > > +
> > > +#include "dpif-netdev-private.h"
> > > +#include "dpif-netdev-private-dpcls.h"
> > > +#include "dpif-netdev-private-flow.h"
> > > +#include "dpif-netdev-private-thread.h"
> > > +#include "dpif-netdev-private-hwol.h"
> > 
> > The -private.h already includes a few of the above, but
> > not all, so the interface doesn't seem to be well defined.
> > For example, in -private.h we have dpcls_lookup() while
> > other dpcls functions are in -private-dpcls.h. In this
> > case, the following would be enough:
> > 
> > #include "dpif-netdev-private.h"
> > #include "dpif-netdev-private-hwol.h"
> > 
> > But then I don't know why other headers are included in the
> > interface but not the -private-hwol.h.
> > 
> > 
> 
> Good point. This can be cleaned up. I've included lib/dpif-netdev-private-hwol.h in lib/dpif-netdev-private.h and removed the headers included by lib/dpif-netdev-private.h from lib/dpif-netdev-avx512.c.
> 
> I'll move the prototype for dpcls_lookup() too; it makes more sense if it's in lib/dpif-netdev-private-dpcls.h.
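> 
> A rough sketch of what I have in mind (only the includes visible in this
> patch are shown; treat it as illustrative, not the final diff):
> 
>     /* lib/dpif-netdev-private.h */
>     #include "dpif-netdev-private-dfc.h"
>     #include "dpif-netdev-private-thread.h"
>     #include "dpif-netdev-private-hwol.h"
> 
>     /* lib/dpif-netdev-avx512.c */
>     #include "dpif-netdev.h"
>     #include "dpif-netdev-perf.h"
>     #include "dpif-netdev-private.h"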

Before you spend time on it, please consider whether the refactoring
is really required. I think refactoring the code is usually a nice
thing to do when the result is a clean interface, but it seems that
this will conflict with some other patches being reviewed. Then,
instead of you and/or others having to fix patches approaching the
deadline, maybe it would be better to leave optional refactoring to
a follow-up patch.

Another point to consider is that this refactoring affects an
important part of OVS, so it will require careful review and perhaps
additional follow-ups until everyone is happy. If you can reduce that
impact, it would reduce the risk, which helps to get the work accepted.

What do you think?

Thanks,
Van Haaren, Harry June 22, 2021, 11:10 a.m. UTC | #4
> -----Original Message-----
> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Flavio Leitner
> Sent: Monday, June 21, 2021 5:39 PM
> To: Ferriter, Cian <cian.ferriter@intel.com>
> Cc: ovs-dev@openvswitch.org; Amber, Kumar <kumar.amber@intel.com>;
> i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v13 04/12] dpif-avx512: Add ISA implementation of dpif.
> 
> On Mon, Jun 21, 2021 at 04:13:12PM +0000, Ferriter, Cian wrote:
> > Hi Flavio,

Hi Flavio & All,

Responses inline below.

Regards, -Harry


> > Thanks for the review. My responses are inline.
> >
> > Cian
> >
> > > -----Original Message-----
> > > From: Flavio Leitner <fbl@sysclose.org>
> > > Sent: Sunday 20 June 2021 21:09
> > > To: Ferriter, Cian <cian.ferriter@intel.com>
> > > Cc: ovs-dev@openvswitch.org; Amber, Kumar <kumar.amber@intel.com>;
> i.maximets@ovn.org
> > > Subject: Re: [ovs-dev] [v13 04/12] dpif-avx512: Add ISA implementation of dpif.
> > >
> > >
> > > Hi,
> > >
> > > I am still reviewing the patch, but I thought worth to discuss
> > > few items below.
> > >
> > > On Thu, Jun 17, 2021 at 05:18:17PM +0100, Cian Ferriter wrote:
> > > > From: Harry van Haaren <harry.van.haaren@intel.com>
> > > >
> > > > This commit adds the AVX512 implementation of DPIF functionality,
> > > > specifically the dp_netdev_input_outer_avx512 function. This function
> > > > only handles outer (no re-circulations), and is optimized to use the
> > > > AVX512 ISA for packet batching and other DPIF work.
> > > >
> > > > Sparse is not able to handle the AVX512 intrinsics, causing compile
> > > > time failures, so it is disabled for this file.
> > > >
> > > > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > > > Co-authored-by: Cian Ferriter <cian.ferriter@intel.com>
> > > > Signed-off-by: Cian Ferriter <cian.ferriter@intel.com>
> > > > Co-authored-by: Kumar Amber <kumar.amber@intel.com>
> > > > Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> > > >
> > > > ---

<snip patch contents>

> > Good point. This can be cleaned up. I've included lib/dpif-netdev-private-hwol.h in
> lib/dpif-netdev-private.h and removed the headers included by lib/dpif-netdev-private.h
> from lib/dpif-netdev-avx512.c.
> >
> > I'll move the prototype for dpcls_lookup() too; it makes more sense if it's in
> lib/dpif-netdev-private-dpcls.h.
> 
> Before you spend time on it, please consider whether the refactoring
> is really required. I think refactoring the code is usually a nice
> thing to do when the result is a clean interface

Refactoring code can be done for multiple reasons; cleaner interfaces
are indeed a noble goal, as are avoiding code duplication and general
tidying up. This refactoring is not a "nice to have"; it is required.
Let me explain:

In this patchset as a whole, an ISA-optimized DPIF implementation is added.
Before this refactor, all DPIF-related components (EMC, SMC, partial HWOL,
and DPIF structs like flow-stats, dp_netdev_flow, dp_netdev_pmd_thread, etc.)
are defined & used only in a single .c file. There is no modularity, and there
is no possibility to re-use any of those components outside the .c file where
they are declared.

This patchset refactors those components into separate header files, allowing
re-use outside the .c file they were previously limited to. This allows EMC
and SMC to be re-used, and the ISA-optimized DPIF becomes viable due to that
code reuse.

The result of the patches is a much more modular codebase that avoids much
code duplication. The interface is kept as consistent as possible with the
previous implementation. I agree the interface is not as clean as it could
be, but this is the pragmatic approach to improving modularity while avoiding
duplicated code.


> but it seems that will conflict with some other patches being reviewed.

Yes, any code change can cause rebase conflicts. As you know, this is an
unfortunate but unavoidable part of general software development. Various
parties that may have conflicting patches have been CC'd, so they have been
made aware of the potential rebasing.


> Then, instead of you and/or others have to fix patches approaching the deadline
> maybe it would be better to leave optional refactoring to a follow up patch.

As stated above, the refactoring is required to avoid code duplication. Without
the refactoring, EMC and SMC (as well as other components) are not available.
The DPIF cannot compile without the modularity introduced by these patches;
hence this refactoring is not optional, it is required.

Regarding deadlines, improving the modularity of the DPIF code (EMC/SMC) has been
present since the first version of the patchset in October of 2020:
https://patchwork.ozlabs.org/project/openvswitch/patch/20201006145437.35124-3-harry.van.haaren@intel.com/ 


> Another point to consider is that this refactoring is affecting an
> important part of OVS, so it will require careful review and perhaps
> additional follow ups until everyone is happy. If you can reduce that
> impact, it would reduce the risk which helps to get the work accepted.

Yes, I agree that careful review is a good thing, but note that these patches
have been available for review for >6 months now. Can you commit to reviewing
and providing specific feedback by the end of this week (by Friday 25th)?

As you are aware, this work is due for inclusion in OVS 2.16, and the soft-freeze
deadline is estimated at next Thursday 1 July, based on the release timelines
documented here: https://docs.openvswitch.org/en/latest/internals/release-process/


> What do you think?
> 
> Thanks,
> --
> fbl
Flavio Leitner June 22, 2021, 4:22 p.m. UTC | #5
Hi Harry,

All good points. I made a suggestion and left it to the authors to
decide the best course of action. It was a suggestion to accommodate
everyone and to reduce the churn. That's all.

Anyways, my plan is to continue reviewing the patches and, as always,
I appreciate your support.

Thanks,
fbl


Flavio Leitner June 24, 2021, 4:06 a.m. UTC | #6
On Thu, Jun 17, 2021 at 05:18:17PM +0100, Cian Ferriter wrote:
> From: Harry van Haaren <harry.van.haaren@intel.com>
> 
> This commit adds the AVX512 implementation of DPIF functionality,
> specifically the dp_netdev_input_outer_avx512 function. This function
> only handles outer (no re-circulations), and is optimized to use the
> AVX512 ISA for packet batching and other DPIF work.
> 
> Sparse is not able to handle the AVX512 intrinsics, causing compile
> time failures, so it is disabled for this file.
> 
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> Co-authored-by: Cian Ferriter <cian.ferriter@intel.com>
> Signed-off-by: Cian Ferriter <cian.ferriter@intel.com>
> Co-authored-by: Kumar Amber <kumar.amber@intel.com>
> Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> 
> ---
> 
> v13:
> - Squash "Add HWOL support" commit into this commit.
> - Add NEWS item about this feature here rather than in a later commit.
> - Add #define NUM_U64_IN_ZMM_REG 8.
> - Add comment describing operation of while loop handling HWOL->EMC->SMC
>   lookups in dp_netdev_input_outer_avx512().
> - Add EMC and SMC batch insert functions for better handling of EMC and
>   SMC in AVX512 DPIF.
> - Minor code refactor to address review comments.
> ---
>  NEWS                             |   2 +
>  lib/automake.mk                  |   5 +-
>  lib/dpif-netdev-avx512.c         | 327 +++++++++++++++++++++++++++++++
>  lib/dpif-netdev-private-dfc.h    |  25 +++
>  lib/dpif-netdev-private-dpif.h   |  32 +++
>  lib/dpif-netdev-private-thread.h |  11 +-
>  lib/dpif-netdev-private.h        |  25 +++
>  lib/dpif-netdev.c                | 103 ++++++++--
>  8 files changed, 514 insertions(+), 16 deletions(-)
>  create mode 100644 lib/dpif-netdev-avx512.c
>  create mode 100644 lib/dpif-netdev-private-dpif.h
> 
> diff --git a/NEWS b/NEWS
> index 96b3a61c8..6a4a7b76d 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -10,6 +10,8 @@ Post-v2.15.0
>       * Auto load balancing of PMDs now partially supports cross-NUMA polling
>         cases, e.g if all PMD threads are running on the same NUMA node.
>       * Refactor lib/dpif-netdev.c to multiple header files.
> +     * Add avx512 implementation of dpif which can process non recirculated
> +       packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
>     - ovs-ctl:
>       * New option '--no-record-hostname' to disable hostname configuration
>         in ovsdb on startup.
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 3a33cdd5c..660cd07f0 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -33,11 +33,13 @@ lib_libopenvswitchavx512_la_CFLAGS = \
>  	-mavx512f \
>  	-mavx512bw \
>  	-mavx512dq \
> +	-mbmi \
>  	-mbmi2 \
>  	-fPIC \
>  	$(AM_CFLAGS)
>  lib_libopenvswitchavx512_la_SOURCES = \
> -	lib/dpif-netdev-lookup-avx512-gather.c
> +	lib/dpif-netdev-lookup-avx512-gather.c \
> +	lib/dpif-netdev-avx512.c
>  lib_libopenvswitchavx512_la_LDFLAGS = \
>  	-static
>  endif
> @@ -114,6 +116,7 @@ lib_libopenvswitch_la_SOURCES = \
>  	lib/dpif-netdev-private-dfc.c \
>  	lib/dpif-netdev-private-dfc.h \
>  	lib/dpif-netdev-private-dpcls.h \
> +	lib/dpif-netdev-private-dpif.h \
>  	lib/dpif-netdev-private-flow.h \
>  	lib/dpif-netdev-private-hwol.h \
>  	lib/dpif-netdev-private-thread.h \
> diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
> new file mode 100644
> index 000000000..0e55b0be2
> --- /dev/null
> +++ b/lib/dpif-netdev-avx512.c
> @@ -0,0 +1,327 @@
> +/*
> + * Copyright (c) 2021 Intel Corporation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifdef __x86_64__
> +/* Sparse cannot handle the AVX512 instructions. */
> +#if !defined(__CHECKER__)
> +
> +#include <config.h>
> +
> +#include "dpif-netdev.h"
> +#include "dpif-netdev-perf.h"
> +
> +#include "dpif-netdev-private.h"
> +#include "dpif-netdev-private-dpcls.h"
> +#include "dpif-netdev-private-flow.h"
> +#include "dpif-netdev-private-thread.h"
> +#include "dpif-netdev-private-hwol.h"
> +
> +#include "dp-packet.h"
> +#include "netdev.h"
> +
> +#include "immintrin.h"
> +
> +/* Each AVX512 register (zmm register in assembly notation) can contain up to
> + * 512 bits, which is equivalent to 8 uint64_t variables. This is the maximum
> + * number of miniflow blocks that can be processed in a single pass of the
> + * AVX512 code at a time.
> + */
> +#define NUM_U64_IN_ZMM_REG (8)
> +
> +/* Structure to contain per-packet metadata that must be attributed to the
> + * dp netdev flow. It is unfortunate to have to track this per packet, but
> + * it is awkward to maintain the data in a performant way otherwise. This
> + * structure helps to keep two variables on a single cache line per packet.
> + */
> +struct pkt_flow_meta {
> +    uint16_t bytes;
> +    uint16_t tcp_flags;
> +};
> +
> +/* Structure of heap allocated memory for DPIF internals. */
> +struct dpif_userdata {
> +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> +        struct netdev_flow_key keys[NETDEV_MAX_BURST];
> +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> +        struct netdev_flow_key *key_ptrs[NETDEV_MAX_BURST];
> +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> +        struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
> +};
> +
> +int32_t
> +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> +                             struct dp_packet_batch *packets,
> +                             odp_port_t in_port)
> +{
> +    /* Allocate DPIF userdata. */
> +    if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) {
> +        pmd->netdev_input_func_userdata =
> +                xmalloc_pagealign(sizeof(struct dpif_userdata));
> +    }
> +
> +    struct dpif_userdata *ud = pmd->netdev_input_func_userdata;
> +    struct netdev_flow_key *keys = ud->keys;
> +    struct netdev_flow_key **key_ptrs = ud->key_ptrs;
> +    struct pkt_flow_meta *pkt_meta = ud->pkt_meta;
> +
> +    /* The AVX512 DPIF implementation handles rules in a way that is optimized
> +     * for reducing data-movement between HWOL/EMC/SMC and DPCLS. This is
> +     * achieved by separating the rule arrays. Bitmasks are kept for each
> +     * packet, indicating if it matched in the HWOL/EMC/SMC array or DPCLS
> +     * array. Later the two arrays are merged by AVX-512 expand instructions.
> +     */
> +
> +    /* Stores the computed output: a rule pointer for each packet. */
> +    /* Used initially for HWOL/EMC/SMC. */
> +    struct dpcls_rule *rules[NETDEV_MAX_BURST];
> +    /* Used for DPCLS. */
> +    struct dpcls_rule *dpcls_rules[NETDEV_MAX_BURST];
> +
> +    uint32_t dpcls_key_idx = 0;
> +
> +    for (uint32_t i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
> +        _mm512_storeu_si512(&rules[i], _mm512_setzero_si512());
> +        _mm512_storeu_si512(&dpcls_rules[i], _mm512_setzero_si512());
> +    }
> +
> +    /* Prefetch each packet's metadata. */
> +    const size_t batch_size = dp_packet_batch_size(packets);
> +    for (int i = 0; i < batch_size; i++) {
> +        struct dp_packet *packet = packets->packets[i];
> +        OVS_PREFETCH(dp_packet_data(packet));
> +        pkt_metadata_prefetch_init(&packet->md);
> +    }
> +
> +    /* Check if EMC or SMC are enabled. */
> +    struct dfc_cache *cache = &pmd->flow_cache;
> +    const uint32_t emc_enabled = pmd->ctx.emc_insert_min != 0;
> +    const uint32_t smc_enabled = pmd->ctx.smc_enable_db;
> +
> +    uint32_t emc_hits = 0;
> +    uint32_t smc_hits = 0;
> +
> +    /* A 1 bit in this mask indicates a hit, so no DPCLS lookup on the pkt. */
> +    uint32_t hwol_emc_smc_hitmask = 0;
> +    uint32_t smc_hitmask = 0;
> +
> +    /* The below while loop is based on the 'iter' variable which has a number
> +     * of bits set representing packets that we want to process
> +     * (HWOL->MFEX->EMC->SMC). As each packet is processed, we clear (set to 0)
> +     * the bit representing that packet using '_blsr_u64()'. The
> +     * '__builtin_ctz()' will give us the correct index into the 'packets',
> +     * 'pkt_meta', 'keys' and 'rules' arrays.
> +     *
> +     * For one iteration of the while loop, here's some pseudocode as an
> +     * example where 'iter' is represented in binary:
> +     *
> +     * while (iter) { // iter = 1100
> +     *     uint32_t i = __builtin_ctz(iter); // i = 2
> +     *     iter = _blsr_u64(iter); // iter = 1000
> +     *     // do all processing (HWOL->MFEX->EMC->SMC)
> +     * }
> +     */
> +    uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1;
> +    uint32_t iter = lookup_pkts_bitmask;
> +    while (iter) {
> +        uint32_t i = __builtin_ctz(iter);
> +        iter = _blsr_u64(iter);
> +
> +        /* Get packet pointer from bitmask and packet md. */
> +        struct dp_packet *packet = packets->packets[i];
> +        pkt_metadata_init(&packet->md, in_port);
> +
> +        struct dp_netdev_flow *f = NULL;
> +
> +        /* Check for partial hardware offload mark. */
> +        uint32_t mark;
> +        if (dp_packet_has_flow_mark(packet, &mark)) {
> +            f = mark_to_flow_find(pmd, mark);
> +            if (f) {
> +                rules[i] = &f->cr;
> +                pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
> +                pkt_meta[i].bytes = dp_packet_size(packet);
> +                hwol_emc_smc_hitmask |= (1 << i);
> +                continue;
> +            }
> +        }
> +
> +        /* Do miniflow extract into keys. */
> +        struct netdev_flow_key *key = &keys[i];
> +        miniflow_extract(packet, &key->mf);
> +
> +        /* Cache TCP and byte values for all packets. */
> +        pkt_meta[i].bytes = dp_packet_size(packet);
> +        pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf);
> +
> +        key->len = netdev_flow_key_size(miniflow_n_values(&key->mf));
> +        key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf);
> +
> +        if (emc_enabled) {
> +            f = emc_lookup(&cache->emc_cache, key);
> +
> +            if (f) {
> +                rules[i] = &f->cr;
> +                emc_hits++;
> +                hwol_emc_smc_hitmask |= (1 << i);
> +                continue;
> +            }
> +        }
> +
> +        if (smc_enabled && !f) {
> +            f = smc_lookup_single(pmd, packet, key);
> +            if (f) {
> +                rules[i] = &f->cr;
> +                smc_hits++;
> +                smc_hitmask |= (1 << i);
> +                continue;
> +            }
> +        }
> +
> +        /* The flow pointer was not found in HWOL/EMC/SMC, so add it to the
> +         * dpcls input keys array for batch lookup later.
> +         */
> +        key_ptrs[dpcls_key_idx] = &keys[i];
> +        dpcls_key_idx++;
> +    }
> +
> +    hwol_emc_smc_hitmask |= smc_hitmask;
> +
> +    /* DPCLS handles any packets missed by HWOL/EMC/SMC. It operates on the
> +     * key_ptrs[] for input miniflows to match, storing results in the
> +     * dpcls_rules[] array.
> +     */
> +    if (dpcls_key_idx > 0) {
> +        struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
> +        if (OVS_UNLIKELY(!cls)) {
> +            return -1;
> +        }
> +        bool any_miss =
> +            !dpcls_lookup(cls, (const struct netdev_flow_key **) key_ptrs,
> +                          dpcls_rules, dpcls_key_idx, NULL);
> +        if (OVS_UNLIKELY(any_miss)) {
> +            return -1;
> +        }
> +
> +        /* Merge DPCLS rules and HWOL/EMC/SMC rules. */
> +        uint32_t dpcls_idx = 0;
> +        for (int i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
> +            /* Indexing here is somewhat complicated due to DPCLS output rule
> +             * load index depending on the hitmask of HWOL/EMC/SMC. More
> +             * packets in the HWOL/EMC/SMC bitmask means fewer DPCLS rules
> +             * used.
> +             */
> +            __m512i v_cache_rules = _mm512_loadu_si512(&rules[i]);
> +            __m512i v_merged_rules =
> +                        _mm512_mask_expandloadu_epi64(v_cache_rules,
> +                                                      ~hwol_emc_smc_hitmask,
> +                                                      &dpcls_rules[dpcls_idx]);
> +            _mm512_storeu_si512(&rules[i], v_merged_rules);
> +
> +            /* Update DPCLS load index and bitmask for HWOL/EMC/SMC hits.
> +             * There are NUM_U64_IN_ZMM_REG output pointers per register;
> +             * subtracting the HWOL/EMC/SMC lanes gives the number of DPCLS
> +             * rules consumed.
> +             */
> +            uint32_t hitmask_FF = (hwol_emc_smc_hitmask & 0xFF);
> +            dpcls_idx += NUM_U64_IN_ZMM_REG - __builtin_popcountll(hitmask_FF);
> +            hwol_emc_smc_hitmask =
> +                (hwol_emc_smc_hitmask >> NUM_U64_IN_ZMM_REG);
> +        }
> +    }
> +
> +    /* At this point we have a 1:1 pkt to rules mapping, so update EMC/SMC
> +     * if required.
> +     */
> +    /* Insert SMC and DPCLS hits into EMC. */
> +    /* Insert DPCLS hits into SMC. */
> +    if (emc_enabled) {
> +        uint32_t emc_insert_mask = smc_hitmask | ~hwol_emc_smc_hitmask;
> +        emc_insert_mask &= lookup_pkts_bitmask;
> +        emc_probabilistic_insert_batch(pmd, keys, &rules[0], emc_insert_mask);
> +    }
> +    if (smc_enabled) {
> +        uint32_t smc_insert_mask = ~hwol_emc_smc_hitmask;
> +        smc_insert_mask &= lookup_pkts_bitmask;
> +        smc_insert_batch(pmd, keys, &rules[0], smc_insert_mask);
> +    }
> +
> +    /* At this point we don't return error anymore, so commit stats here. */
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_RECV, batch_size);
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, emc_hits);
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, smc_hits);
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
> +                            dpcls_key_idx);
> +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
> +                            dpcls_key_idx);
> +
> +    /* Initialize the "Action Batch" for each flow handled below. */
> +    struct dp_packet_batch action_batch;
> +    action_batch.trunc = 0;
> +
> +    while (lookup_pkts_bitmask) {
> +        uint32_t rule_pkt_idx = __builtin_ctz(lookup_pkts_bitmask);
> +        uint64_t needle = (uintptr_t) rules[rule_pkt_idx];
> +
> +        /* Compare NUM_U64_IN_ZMM_REG flow pointers to the needle in
> +         * parallel, creating a bitmask.
> +         */
> +        uint32_t batch_bitmask = 0;
> +        for (uint32_t j = 0; j < NETDEV_MAX_BURST; j += NUM_U64_IN_ZMM_REG) {
> +            /* Pre-calculate store addr. */
> +            uint32_t num_pkts_in_batch = __builtin_popcountll(batch_bitmask);
> +            void *store_addr = &action_batch.packets[num_pkts_in_batch];
> +
> +            /* Search for identical flow* in burst, update bitmask. */
> +            __m512i v_needle = _mm512_set1_epi64(needle);
> +            __m512i v_hay = _mm512_loadu_si512(&rules[j]);
> +            __mmask8 k_cmp_bits = _mm512_cmpeq_epi64_mask(v_needle, v_hay);
> +            uint32_t cmp_bits = k_cmp_bits;
> +            batch_bitmask |= cmp_bits << j;
> +
> +            /* Compress and store the batched packets. */
> +            struct dp_packet **packets_ptrs = &packets->packets[j];
> +            __m512i v_pkt_ptrs = _mm512_loadu_si512(packets_ptrs);
> +            _mm512_mask_compressstoreu_epi64(store_addr, cmp_bits, v_pkt_ptrs);
> +        }
> +
> +        /* Strip all packets in this batch from the lookup_pkts_bitmask. */
> +        lookup_pkts_bitmask &= (~batch_bitmask);
> +        action_batch.count = __builtin_popcountll(batch_bitmask);
> +
> +        /* Loop over all packets in this batch, to gather the byte and tcp_flag
> +         * values, and pass them to the execute function. It would be nice to
> +         * optimize this away; however, it is not easy to refactor in dpif.
> +         */
> +        uint32_t bytes = 0;
> +        uint16_t tcp_flags = 0;
> +        uint32_t bitmask_iter = batch_bitmask;
> +        for (int i = 0; i < action_batch.count; i++) {
> +            uint32_t idx = __builtin_ctzll(bitmask_iter);
> +            bitmask_iter = _blsr_u64(bitmask_iter);
> +
> +            bytes += pkt_meta[idx].bytes;
> +            tcp_flags |= pkt_meta[idx].tcp_flags;
> +        }
> +
> +        dp_netdev_batch_execute(pmd, &action_batch, rules[rule_pkt_idx],
> +                                bytes, tcp_flags);
> +    }
> +
> +    return 0;
> +}
> +
> +#endif
> +#endif
> diff --git a/lib/dpif-netdev-private-dfc.h b/lib/dpif-netdev-private-dfc.h
> index 6a672d1b3..d5d4da7ea 100644
> --- a/lib/dpif-netdev-private-dfc.h
> +++ b/lib/dpif-netdev-private-dfc.h
> @@ -81,6 +81,14 @@ extern "C" {
>  #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX /                     \
>                                      DEFAULT_EM_FLOW_INSERT_INV_PROB)
>  
> +/* Forward declaration for SMC function prototype that requires access to
> + * 'struct dp_netdev_pmd_thread'. */
> +struct dp_netdev_pmd_thread;
> +
> +/* Forward declaration for EMC and SMC batch insert function prototypes that
> + * require access to 'struct dpcls_rule'. */
> +struct dpcls_rule;
> +
>  struct emc_entry {
>      struct dp_netdev_flow *flow;
>      struct netdev_flow_key key;   /* key.hash used for emc hash value. */
> @@ -168,6 +176,23 @@ emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
>      return NULL;
>  }
>  
> +/* Insert a batch of keys/flows into the EMC and SMC caches. */
> +void
> +emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
> +                               const struct netdev_flow_key *keys,
> +                               struct dpcls_rule **rules,
> +                               uint32_t emc_insert_mask);
> +
> +void
> +smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
> +                 const struct netdev_flow_key *keys,
> +                 struct dpcls_rule **rules,
> +                 uint32_t smc_insert_mask);
> +
> +struct dp_netdev_flow *
> +smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
> +                  struct dp_packet *packet,
> +                  struct netdev_flow_key *key);
>  
>  #ifdef  __cplusplus
>  }
> diff --git a/lib/dpif-netdev-private-dpif.h b/lib/dpif-netdev-private-dpif.h
> new file mode 100644
> index 000000000..2fd7cc400
> --- /dev/null
> +++ b/lib/dpif-netdev-private-dpif.h
> @@ -0,0 +1,32 @@
> +/*
> + * Copyright (c) 2021 Intel Corporation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef DPIF_NETDEV_PRIVATE_DPIF_H
> +#define DPIF_NETDEV_PRIVATE_DPIF_H 1
> +
> +#include "openvswitch/types.h"
> +
> +/* Forward declarations to avoid including files. */
> +struct dp_netdev_pmd_thread;
> +struct dp_packet_batch;
> +
> +/* Available implementations for dpif work. */
> +int32_t
> +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> +                             struct dp_packet_batch *packets,
> +                             odp_port_t in_port);
> +
> +#endif /* netdev-private.h */
> diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
> index 0d674ab83..17356d5e2 100644
> --- a/lib/dpif-netdev-private-thread.h
> +++ b/lib/dpif-netdev-private-thread.h
> @@ -45,14 +45,19 @@ struct dp_netdev_pmd_thread_ctx {
>      struct dp_netdev_rxq *last_rxq;
>      /* EMC insertion probability context for the current processing cycle. */
>      uint32_t emc_insert_min;
> +    /* Enable the SMC cache from ovsdb config. */
> +    bool smc_enable_db;
>  };
>  
>  /* Forward declaration for typedef. */
>  struct dp_netdev_pmd_thread;
>  
> -typedef void (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
> -                                     struct dp_packet_batch *packets,
> -                                     odp_port_t port_no);
> +/* Typedef for DPIF functions.
> + * Returns a bitmask of packets to handle, possibly including upcall/misses.
> + */
> +typedef int32_t (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
> +                                        struct dp_packet_batch *packets,
> +                                        odp_port_t port_no);
>  
>  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
>   * the performance overhead of interrupt processing.  Therefore netdev can
> diff --git a/lib/dpif-netdev-private.h b/lib/dpif-netdev-private.h
> index d7b6fd7ec..0315b5bf6 100644
> --- a/lib/dpif-netdev-private.h
> +++ b/lib/dpif-netdev-private.h
> @@ -31,4 +31,29 @@
>  #include "dpif-netdev-private-dfc.h"
>  #include "dpif-netdev-private-thread.h"
>  
> +/* Allow other implementations to lookup the DPCLS instances. */
> +struct dpcls *
> +dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
> +                           odp_port_t in_port);
> +
> +/* Allow other implementations to call dpcls_lookup() for subtable search. */
> +bool
> +dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
> +             struct dpcls_rule **rules, const size_t cnt,
> +             int *num_lookups_p);
> +
> +/* Allow other implementations to execute actions on a batch. */
> +void
> +dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
> +                        struct dp_packet_batch *packets,
> +                        struct dpcls_rule *rule,
> +                        uint32_t bytes,
> +                        uint16_t tcp_flags);
> +
> +/* Available implementations for dpif work. */
> +int32_t
> +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> +                             struct dp_packet_batch *packets,
> +                             odp_port_t in_port);
> +
>  #endif /* netdev-private.h */
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index e6486417e..1f15af882 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -183,10 +183,6 @@ static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls);
>  static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
>                           const struct netdev_flow_key *mask);
>  static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
> -static bool dpcls_lookup(struct dpcls *cls,
> -                         const struct netdev_flow_key *keys[],
> -                         struct dpcls_rule **rules, size_t cnt,
> -                         int *num_lookups_p);
>  
>  /* Set of supported meter flags */
>  #define DP_SUPPORTED_METER_FLAGS_MASK \
> @@ -483,7 +479,7 @@ static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
>                                        const struct flow *flow,
>                                        const struct nlattr *actions,
>                                        size_t actions_len);
> -static void dp_netdev_input(struct dp_netdev_pmd_thread *,
> +static int32_t dp_netdev_input(struct dp_netdev_pmd_thread *,
>                              struct dp_packet_batch *, odp_port_t port_no);
>  static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
>                                    struct dp_packet_batch *);
> @@ -555,7 +551,7 @@ dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
>                                 bool purge);
>  static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
>                                        struct tx_port *tx);
> -static inline struct dpcls *
> +inline struct dpcls *
>  dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
>                             odp_port_t in_port);
>  
> @@ -1920,7 +1916,7 @@ void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
>      }
>  }
>  
> -static inline struct dpcls *
> +inline struct dpcls *
>  dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
>                             odp_port_t in_port)
>  {
> @@ -2714,13 +2710,46 @@ smc_insert(struct dp_netdev_pmd_thread *pmd,
>      bucket->flow_idx[i] = index;
>  }
>  
> +inline void
> +emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
> +                               const struct netdev_flow_key *keys,
> +                               struct dpcls_rule **rules,
> +                               uint32_t emc_insert_mask)
> +{
> +    while (emc_insert_mask) {
> +        uint32_t i = __builtin_ctz(emc_insert_mask);

I got an error on Windows:

[...]
libtool: compile:  build-aux/cccl -DHAVE_CONFIG_H -I. -I ./include/windows -I ./datapath-windows/include -Ic:/PTHREADS-BUILT//include -O2 -I ./include -I ./include -I ./lib -I ./lib -IC:/OpenSSL-Win64/include -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -Wthread-safety -fno-strict-aliasing -Wswitch-bool -Wlogical-not-parentheses -Wsizeof-array-argument -Wbool-compare -Wshift-negative-value -Wduplicated-cond -Qunused-arguments -Wshadow -Wmultistatement-macros -Wcast-align=strict -Wno-null-pointer-arithmetic -Warray-bounds-pointer-arithmetic -g -DHAVE_AVX512F -c lib/dpif-netdev.c
libtool: compile: mv -f "dpif-netdev-lookup-autovalidator.obj" "lib/dpif-netdev-lookup-autovalidator.obj"
c:\PTHREADS-BUILT\include\_ptw32.h(120): warning C4005: 'HAVE_STRUCT_TIMESPEC': macro redefinition
c:\openvswitch_compile\config.h(207): note: see previous definition of 'HAVE_STRUCT_TIMESPEC'
c:\openvswitch_compile\lib\ovs-rcu.h(215): warning C4311: 'type cast': pointer truncation from 'void *' to 'long'
libtool: compile: mv -f "dpif-netdev-lookup-generic.obj" "lib/dpif-netdev-lookup-generic.obj"
dpif-netdev.c
\
	source='lib/dpif-netdev-private-dfc.c' object='lib/dpif-netdev-private-dfc.lo' libtool=yes \
	DEPDIR=.deps depmode=none /bin/sh ./build-aux/depcomp \
	/bin/sh ./libtool  --tag=CC   --mode=compile build-aux/cccl -DHAVE_CONFIG_H -I.   -I ./include/windows -I ./datapath-windows/include -Ic:/PTHREADS-BUILT//include -O2 -I ./include -I ./include -I ./lib -I ./lib -IC:/OpenSSL-Win64/include   -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -Wthread-safety -fno-strict-aliasing -Wswitch-bool -Wlogical-not-parentheses -Wsizeof-array-argument -Wbool-compare -Wshift-negative-value -Wduplicated-cond -Qunused-arguments -Wshadow -Wmultistatement-macros -Wcast-align=strict -Wno-null-pointer-arithmetic -Warray-bounds-pointer-arithmetic    -g -DHAVE_AVX512F -c -o lib/dpif-netdev-private-dfc.lo lib/dpif-netdev-private-dfc.c
c:\PTHREADS-BUILT\include\_ptw32.h(120): warning C4005: 'HAVE_STRUCT_TIMESPEC': macro redefinition
c:\openvswitch_compile\config.h(207): note: see previous definition of 'HAVE_STRUCT_TIMESPEC'
c:\openvswitch_compile\lib\ovs-rcu.h(215): warning C4311: 'type cast': pointer truncation from 'void *' to 'long'
c:\openvswitch_compile\config.h(207): warning C4005: 'HAVE_STRUCT_TIMESPEC': macro redefinition
c:\PTHREADS-BUILT\include\_ptw32.h(120): note: see previous definition of 'HAVE_STRUCT_TIMESPEC'
lib/dpif-netdev.c(2826): error C4013: '__builtin_ctz' undefined; assuming extern returning int
lib/dpif-netdev.c(2919): warning C4311: 'type cast': pointer truncation from 'const char *const ' to 'long'
\
	source='lib/dpif-netdev-private-dpif.c' object='lib/dpif-netdev-private-dpif.lo' libtool=yes \
	DEPDIR=.deps depmode=none /bin/sh ./build-aux/depcomp \
	/bin/sh ./libtool  --tag=CC   --mode=compile build-aux/cccl -DHAVE_CONFIG_H -I.   -I ./include/windows -I ./datapath-windows/include -Ic:/PTHREADS-BUILT//include -O2 -I ./include -I ./include -I ./lib -I ./lib -IC:/OpenSSL-Win64/include   -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -Wthread-safety -fno-strict-aliasing -Wswitch-bool -Wlogical-not-parentheses -Wsizeof-array-argument -Wbool-compare -Wshift-negative-value -Wduplicated-cond -Qunused-arguments -Wshadow -Wmultistatement-macros -Wcast-align=strict -Wno-null-pointer-arithmetic -Warray-bounds-pointer-arithmetic    -g -DHAVE_AVX512F -c -o lib/dpif-netdev-private-dpif.lo lib/dpif-netdev-private-dpif.c
make[2]: *** [lib/dpif-netdev.lo] Error 1
make[2]: *** Waiting for unfinished jobs....

Thanks,
fbl


> +        emc_insert_mask &= emc_insert_mask - 1;
> +        /* Get the required parameters for EMC/SMC from the rule. */
> +        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
> +        /* Insert the key into EMC/SMC. */
> +        emc_probabilistic_insert(pmd, &keys[i], flow);
> +    }
> +}
> +
> +inline void
> +smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
> +                 const struct netdev_flow_key *keys,
> +                 struct dpcls_rule **rules,
> +                 uint32_t smc_insert_mask)
> +{
> +    while (smc_insert_mask) {
> +        uint32_t i = __builtin_ctz(smc_insert_mask);
> +        smc_insert_mask &= smc_insert_mask - 1;
> +        /* Get the required parameters for EMC/SMC from the rule. */
> +        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
> +        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
> +        /* Insert the key into EMC/SMC. */
> +        smc_insert(pmd, &keys[i], hash);
> +    }
> +}
> +
>  static struct dp_netdev_flow *
>  dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
>                            const struct netdev_flow_key *key,
>                            int *lookup_num_p)
>  {
>      struct dpcls *cls;
> -    struct dpcls_rule *rule;
> +    struct dpcls_rule *rule = NULL;
>      odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
>                                                       in_port.odp_port));
>      struct dp_netdev_flow *netdev_flow = NULL;
> @@ -4233,7 +4262,10 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
>          }
>  
>          /* Process packet batch. */
> -        pmd->netdev_input_func(pmd, &batch, port_no);
> +        int32_t ret = pmd->netdev_input_func(pmd, &batch, port_no);
> +        if (ret) {
> +            dp_netdev_input(pmd, &batch, port_no);
> +        }
>  
>          /* Assign processing cycles to rx queue. */
>          cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
> @@ -5251,6 +5283,8 @@ dpif_netdev_run(struct dpif *dpif)
>                      non_pmd->ctx.emc_insert_min = 0;
>                  }
>  
> +                non_pmd->ctx.smc_enable_db = dp->smc_enable_db;
> +
>                  for (i = 0; i < port->n_rxq; i++) {
>  
>                      if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
> @@ -5522,6 +5556,8 @@ reload:
>                  pmd->ctx.emc_insert_min = 0;
>              }
>  
> +            pmd->ctx.smc_enable_db = pmd->dp->smc_enable_db;
> +
>              process_packets =
>                  dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
>                                             poll_list[i].port_no);
> @@ -6415,6 +6451,24 @@ packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
>                                actions->actions, actions->size);
>  }
>  
> +void
> +dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
> +                        struct dp_packet_batch *packets,
> +                        struct dpcls_rule *rule,
> +                        uint32_t bytes,
> +                        uint16_t tcp_flags)
> +{
> +    /* Gets action* from the rule. */
> +    struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule);
> +    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
> +
> +    dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes,
> +                        tcp_flags, pmd->ctx.now / 1000);
> +    const uint32_t steal = 1;
> +    dp_netdev_execute_actions(pmd, packets, steal, &flow->flow,
> +                              actions->actions, actions->size);
> +}
> +
>  static inline void
>  dp_netdev_queue_batches(struct dp_packet *pkt,
>                          struct dp_netdev_flow *flow, uint16_t tcp_flags,
> @@ -6519,6 +6573,30 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
>      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
>  }
>  
> +struct dp_netdev_flow *
> +smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
> +                  struct dp_packet *packet,
> +                  struct netdev_flow_key *key)
> +{
> +    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
> +
> +    if (OVS_LIKELY(flow_node != NULL)) {
> +        struct dp_netdev_flow *flow = NULL;
> +
> +        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
> +            /* Since we don't have a per-port megaflow to check the port
> +             * number, we need to verify that the input ports match. */
> +            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
> +                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
> +
> +                return (void *) flow;
> +            }
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
>  /* Try to process all ('cnt') the 'packets' using only the datapath flow cache
>   * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
>   * miniflow is copied into 'keys' and the packet pointer is moved at the
> @@ -6924,12 +7002,13 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
>      }
>  }
>  
> -static void
> +static int32_t
>  dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
>                  struct dp_packet_batch *packets,
>                  odp_port_t port_no)
>  {
>      dp_netdev_input__(pmd, packets, false, port_no);
> +    return 0;
>  }
>  
>  static void
> @@ -8369,7 +8448,7 @@ dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
>  
>  /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
>   * in 'mask' the values in 'key' and 'target' are the same. */
> -bool
> +inline bool ALWAYS_INLINE
>  dpcls_rule_matches_key(const struct dpcls_rule *rule,
>                         const struct netdev_flow_key *target)
>  {
> @@ -8395,7 +8474,7 @@ dpcls_rule_matches_key(const struct dpcls_rule *rule,
>   * priorities, instead returning any rule which matches the flow.
>   *
>   * Returns true if all miniflows found a corresponding rule. */
> -static bool
> +bool
>  dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
>               struct dpcls_rule **rules, const size_t cnt,
>               int *num_lookups_p)
> -- 
> 2.32.0
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Ferriter, Cian June 24, 2021, 11:44 a.m. UTC | #7
Hi Flavio,

Thanks for the testing here. My responses are inline.

Cian

> -----Original Message-----
> From: Flavio Leitner <fbl@sysclose.org>
> Sent: Thursday 24 June 2021 05:06
> To: Ferriter, Cian <cian.ferriter@intel.com>
> Cc: ovs-dev@openvswitch.org; Amber, Kumar <kumar.amber@intel.com>; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v13 04/12] dpif-avx512: Add ISA implementation of dpif.
> 
> On Thu, Jun 17, 2021 at 05:18:17PM +0100, Cian Ferriter wrote:
> > From: Harry van Haaren <harry.van.haaren@intel.com>
> >
> > This commit adds the AVX512 implementation of DPIF functionality,
> > specifically the dp_netdev_input_outer_avx512 function. This function
> > only handles outer (no re-circulations), and is optimized to use the
> > AVX512 ISA for packet batching and other DPIF work.
> >
> > Sparse is not able to handle the AVX512 intrinsics, causing compile
> > time failures, so it is disabled for this file.
> >
> > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > Co-authored-by: Cian Ferriter <cian.ferriter@intel.com>
> > Signed-off-by: Cian Ferriter <cian.ferriter@intel.com>
> > Co-authored-by: Kumar Amber <kumar.amber@intel.com>
> > Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> >
> > ---
> >
> > v13:
> > - Squash "Add HWOL support" commit into this commit.
> > - Add NEWS item about this feature here rather than in a later commit.
> > - Add #define NUM_U64_IN_ZMM_REG 8.
> > - Add comment describing operation of while loop handling HWOL->EMC->SMC
> >   lookups in dp_netdev_input_outer_avx512().
> > - Add EMC and SMC batch insert functions for better handling of EMC and
> >   SMC in AVX512 DPIF.
> > - Minor code refactor to address review comments.
> > ---
> >  NEWS                             |   2 +
> >  lib/automake.mk                  |   5 +-
> >  lib/dpif-netdev-avx512.c         | 327 +++++++++++++++++++++++++++++++
> >  lib/dpif-netdev-private-dfc.h    |  25 +++
> >  lib/dpif-netdev-private-dpif.h   |  32 +++
> >  lib/dpif-netdev-private-thread.h |  11 +-
> >  lib/dpif-netdev-private.h        |  25 +++
> >  lib/dpif-netdev.c                | 103 ++++++++--
> >  8 files changed, 514 insertions(+), 16 deletions(-)
> >  create mode 100644 lib/dpif-netdev-avx512.c
> >  create mode 100644 lib/dpif-netdev-private-dpif.h
> >
> > diff --git a/NEWS b/NEWS
> > index 96b3a61c8..6a4a7b76d 100644
> > --- a/NEWS
> > +++ b/NEWS
> > @@ -10,6 +10,8 @@ Post-v2.15.0
> >       * Auto load balancing of PMDs now partially supports cross-NUMA polling
> >         cases, e.g if all PMD threads are running on the same NUMA node.
> >       * Refactor lib/dpif-netdev.c to multiple header files.
> > +     * Add avx512 implementation of dpif which can process non-recirculated
> > +       packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
> >     - ovs-ctl:
> >       * New option '--no-record-hostname' to disable hostname configuration
> >         in ovsdb on startup.
> > diff --git a/lib/automake.mk b/lib/automake.mk
> > index 3a33cdd5c..660cd07f0 100644
> > --- a/lib/automake.mk
> > +++ b/lib/automake.mk
> > @@ -33,11 +33,13 @@ lib_libopenvswitchavx512_la_CFLAGS = \
> >  	-mavx512f \
> >  	-mavx512bw \
> >  	-mavx512dq \
> > +	-mbmi \
> >  	-mbmi2 \
> >  	-fPIC \
> >  	$(AM_CFLAGS)
> >  lib_libopenvswitchavx512_la_SOURCES = \
> > -	lib/dpif-netdev-lookup-avx512-gather.c
> > +	lib/dpif-netdev-lookup-avx512-gather.c \
> > +	lib/dpif-netdev-avx512.c
> >  lib_libopenvswitchavx512_la_LDFLAGS = \
> >  	-static
> >  endif
> > @@ -114,6 +116,7 @@ lib_libopenvswitch_la_SOURCES = \
> >  	lib/dpif-netdev-private-dfc.c \
> >  	lib/dpif-netdev-private-dfc.h \
> >  	lib/dpif-netdev-private-dpcls.h \
> > +	lib/dpif-netdev-private-dpif.h \
> >  	lib/dpif-netdev-private-flow.h \
> >  	lib/dpif-netdev-private-hwol.h \
> >  	lib/dpif-netdev-private-thread.h \
> > diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
> > new file mode 100644
> > index 000000000..0e55b0be2
> > --- /dev/null
> > +++ b/lib/dpif-netdev-avx512.c
> > @@ -0,0 +1,327 @@
> > +/*
> > + * Copyright (c) 2021 Intel Corporation.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +#ifdef __x86_64__
> > +/* Sparse cannot handle the AVX512 instructions. */
> > +#if !defined(__CHECKER__)
> > +
> > +#include <config.h>
> > +
> > +#include "dpif-netdev.h"
> > +#include "dpif-netdev-perf.h"
> > +
> > +#include "dpif-netdev-private.h"
> > +#include "dpif-netdev-private-dpcls.h"
> > +#include "dpif-netdev-private-flow.h"
> > +#include "dpif-netdev-private-thread.h"
> > +#include "dpif-netdev-private-hwol.h"
> > +
> > +#include "dp-packet.h"
> > +#include "netdev.h"
> > +
> > +#include "immintrin.h"
> > +
> > +/* Each AVX512 register (zmm register in assembly notation) can contain up to
> > + * 512 bits, which is equivalent to 8 uint64_t variables. This is the maximum
> > + * number of miniflow blocks that can be processed in a single pass of the
> > + * AVX512 code at a time.
> > + */
> > +#define NUM_U64_IN_ZMM_REG (8)
> > +
> > +/* Structure to contain per-packet metadata that must be attributed to the
> > + * dp netdev flow. This is unfortunate to have to track per packet, however
> > + * it's a bit awkward to maintain them in a performant way. This structure
> > + * helps to keep two variables on a single cache line per packet.
> > + */
> > +struct pkt_flow_meta {
> > +    uint16_t bytes;
> > +    uint16_t tcp_flags;
> > +};
> > +
> > +/* Structure of heap allocated memory for DPIF internals. */
> > +struct dpif_userdata {
> > +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> > +        struct netdev_flow_key keys[NETDEV_MAX_BURST];
> > +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> > +        struct netdev_flow_key *key_ptrs[NETDEV_MAX_BURST];
> > +    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
> > +        struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
> > +};
> > +
> > +int32_t
> > +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> > +                             struct dp_packet_batch *packets,
> > +                             odp_port_t in_port)
> > +{
> > +    /* Allocate DPIF userdata. */
> > +    if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) {
> > +        pmd->netdev_input_func_userdata =
> > +                xmalloc_pagealign(sizeof(struct dpif_userdata));
> > +    }
> > +
> > +    struct dpif_userdata *ud = pmd->netdev_input_func_userdata;
> > +    struct netdev_flow_key *keys = ud->keys;
> > +    struct netdev_flow_key **key_ptrs = ud->key_ptrs;
> > +    struct pkt_flow_meta *pkt_meta = ud->pkt_meta;
> > +
> > +    /* The AVX512 DPIF implementation handles rules in a way that is optimized
> > +     * for reducing data-movement between HWOL/EMC/SMC and DPCLS. This is
> > +     * achieved by separating the rule arrays. Bitmasks are kept for each
> > +     * packet, indicating if it matched in the HWOL/EMC/SMC array or DPCLS
> > +     * array. Later the two arrays are merged by AVX-512 expand instructions.
> > +     */
> > +
> > +    /* Stores the computed output: a rule pointer for each packet. */
> > +    /* Used initially for HWOL/EMC/SMC. */
> > +    struct dpcls_rule *rules[NETDEV_MAX_BURST];
> > +    /* Used for DPCLS. */
> > +    struct dpcls_rule *dpcls_rules[NETDEV_MAX_BURST];
> > +
> > +    uint32_t dpcls_key_idx = 0;
> > +
> > +    for (uint32_t i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
> > +        _mm512_storeu_si512(&rules[i], _mm512_setzero_si512());
> > +        _mm512_storeu_si512(&dpcls_rules[i], _mm512_setzero_si512());
> > +    }
> > +
> > +    /* Prefetch each packet's metadata. */
> > +    const size_t batch_size = dp_packet_batch_size(packets);
> > +    for (int i = 0; i < batch_size; i++) {
> > +        struct dp_packet *packet = packets->packets[i];
> > +        OVS_PREFETCH(dp_packet_data(packet));
> > +        pkt_metadata_prefetch_init(&packet->md);
> > +    }
> > +
> > +    /* Check if EMC or SMC are enabled. */
> > +    struct dfc_cache *cache = &pmd->flow_cache;
> > +    const uint32_t emc_enabled = pmd->ctx.emc_insert_min != 0;
> > +    const uint32_t smc_enabled = pmd->ctx.smc_enable_db;
> > +
> > +    uint32_t emc_hits = 0;
> > +    uint32_t smc_hits = 0;
> > +
> > +    /* A 1 bit in this mask indicates a hit, so no DPCLS lookup on the pkt. */
> > +    uint32_t hwol_emc_smc_hitmask = 0;
> > +    uint32_t smc_hitmask = 0;
> > +
> > +    /* The below while loop is based on the 'iter' variable which has a number
> > +     * of bits set representing packets that we want to process
> > +     * (HWOL->MFEX->EMC->SMC). As each packet is processed, we clear (set to 0)
> > +     * the bit representing that packet using '_blsr_u64()'. The
> > +     * '__builtin_ctz()' will give us the correct index into the 'packets',
> > +     * 'pkt_meta', 'keys' and 'rules' arrays.
> > +     *
> > +     * For one iteration of the while loop, here's some pseudocode as an
> > +     * example where 'iter' is represented in binary:
> > +     *
> > +     * while (iter) { // iter = 1100
> > +     *     uint32_t i = __builtin_ctz(iter); // i = 2
> > +     *     iter = _blsr_u64(iter); // iter = 1000
> > +     *     // do all processing (HWOL->MFEX->EMC->SMC)
> > +     * }
> > +     */
> > +    uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1;
> > +    uint32_t iter = lookup_pkts_bitmask;
> > +    while (iter) {
> > +        uint32_t i = __builtin_ctz(iter);
> > +        iter = _blsr_u64(iter);
> > +
> > +        /* Get packet pointer from bitmask and packet md. */
> > +        struct dp_packet *packet = packets->packets[i];
> > +        pkt_metadata_init(&packet->md, in_port);
> > +
> > +        struct dp_netdev_flow *f = NULL;
> > +
> > +        /* Check for partial hardware offload mark. */
> > +        uint32_t mark;
> > +        if (dp_packet_has_flow_mark(packet, &mark)) {
> > +            f = mark_to_flow_find(pmd, mark);
> > +            if (f) {
> > +                rules[i] = &f->cr;
> > +                pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
> > +                pkt_meta[i].bytes = dp_packet_size(packet);
> > +                hwol_emc_smc_hitmask |= (1 << i);
> > +                continue;
> > +            }
> > +        }
> > +
> > +        /* Do miniflow extract into keys. */
> > +        struct netdev_flow_key *key = &keys[i];
> > +        miniflow_extract(packet, &key->mf);
> > +
> > +        /* Cache TCP and byte values for all packets. */
> > +        pkt_meta[i].bytes = dp_packet_size(packet);
> > +        pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf);
> > +
> > +        key->len = netdev_flow_key_size(miniflow_n_values(&key->mf));
> > +        key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf);
> > +
> > +        if (emc_enabled) {
> > +            f = emc_lookup(&cache->emc_cache, key);
> > +
> > +            if (f) {
> > +                rules[i] = &f->cr;
> > +                emc_hits++;
> > +                hwol_emc_smc_hitmask |= (1 << i);
> > +                continue;
> > +            }
> > +        }
> > +
> > +        if (smc_enabled && !f) {
> > +            f = smc_lookup_single(pmd, packet, key);
> > +            if (f) {
> > +                rules[i] = &f->cr;
> > +                smc_hits++;
> > +                smc_hitmask |= (1 << i);
> > +                continue;
> > +            }
> > +        }
> > +
> > +        /* The flow pointer was not found in HWOL/EMC/SMC, so add it to the
> > +         * dpcls input keys array for batch lookup later.
> > +         */
> > +        key_ptrs[dpcls_key_idx] = &keys[i];
> > +        dpcls_key_idx++;
> > +    }
> > +
> > +    hwol_emc_smc_hitmask |= smc_hitmask;
> > +
> > +    /* DPCLS handles any packets missed by HWOL/EMC/SMC. It operates on the
> > +     * key_ptrs[] for input miniflows to match, storing results in the
> > +     * dpcls_rules[] array.
> > +     */
> > +    if (dpcls_key_idx > 0) {
> > +        struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
> > +        if (OVS_UNLIKELY(!cls)) {
> > +            return -1;
> > +        }
> > +        bool any_miss =
> > +            !dpcls_lookup(cls, (const struct netdev_flow_key **) key_ptrs,
> > +                          dpcls_rules, dpcls_key_idx, NULL);
> > +        if (OVS_UNLIKELY(any_miss)) {
> > +            return -1;
> > +        }
> > +
> > +        /* Merge DPCLS rules and HWOL/EMC/SMC rules. */
> > +        uint32_t dpcls_idx = 0;
> > +        for (int i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
> > +            /* Indexing here is somewhat complicated due to DPCLS output rule
> > +             * load index depending on the hitmask of HWOL/EMC/SMC. More
> > +             * packets from HWOL/EMC/SMC bitmask means less DPCLS rules are
> > +             * used.
> > +             */
> > +            __m512i v_cache_rules = _mm512_loadu_si512(&rules[i]);
> > +            __m512i v_merged_rules =
> > +                        _mm512_mask_expandloadu_epi64(v_cache_rules,
> > +                                                      ~hwol_emc_smc_hitmask,
> > +                                                      &dpcls_rules[dpcls_idx]);
> > +            _mm512_storeu_si512(&rules[i], v_merged_rules);
> > +
> > +            /* Update DPCLS load index and bitmask for HWOL/EMC/SMC hits.
> > +             * There are NUM_U64_IN_ZMM_REG output pointers per register;
> > +             * subtracting the HWOL/EMC/SMC lanes gives the number of DPCLS
> > +             * rules consumed.
> > +             */
> > +            uint32_t hitmask_FF = (hwol_emc_smc_hitmask & 0xFF);
> > +            dpcls_idx += NUM_U64_IN_ZMM_REG - __builtin_popcountll(hitmask_FF);
> > +            hwol_emc_smc_hitmask =
> > +                (hwol_emc_smc_hitmask >> NUM_U64_IN_ZMM_REG);
> > +        }
> > +    }
> > +
> > +    /* At this point we have a 1:1 pkt to rules mapping, so update EMC/SMC
> > +     * if required.
> > +     */
> > +    /* Insert SMC and DPCLS hits into EMC. */
> > +    /* Insert DPCLS hits into SMC. */
> > +    if (emc_enabled) {
> > +        uint32_t emc_insert_mask = smc_hitmask | ~hwol_emc_smc_hitmask;
> > +        emc_insert_mask &= lookup_pkts_bitmask;
> > +        emc_probabilistic_insert_batch(pmd, keys, &rules[0], emc_insert_mask);
> > +    }
> > +    if (smc_enabled) {
> > +        uint32_t smc_insert_mask = ~hwol_emc_smc_hitmask;
> > +        smc_insert_mask &= lookup_pkts_bitmask;
> > +        smc_insert_batch(pmd, keys, &rules[0], smc_insert_mask);
> > +    }
> > +
> > +    /* At this point we don't return error anymore, so commit stats here. */
> > +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_RECV, batch_size);
> > +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, emc_hits);
> > +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, smc_hits);
> > +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
> > +                            dpcls_key_idx);
> > +    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
> > +                            dpcls_key_idx);
> > +
> > +    /* Initialize the "Action Batch" for each flow handled below. */
> > +    struct dp_packet_batch action_batch;
> > +    action_batch.trunc = 0;
> > +
> > +    while (lookup_pkts_bitmask) {
> > +        uint32_t rule_pkt_idx = __builtin_ctz(lookup_pkts_bitmask);
> > +        uint64_t needle = (uintptr_t) rules[rule_pkt_idx];
> > +
> > +        /* Compare NUM_U64_IN_ZMM_REG flow pointers to the needle in
> > +         * parallel, creating a bitmask.
> > +         */
> > +        uint32_t batch_bitmask = 0;
> > +        for (uint32_t j = 0; j < NETDEV_MAX_BURST; j += NUM_U64_IN_ZMM_REG) {
> > +            /* Pre-calculate store addr. */
> > +            uint32_t num_pkts_in_batch = __builtin_popcountll(batch_bitmask);
> > +            void *store_addr = &action_batch.packets[num_pkts_in_batch];
> > +
> > +            /* Search for identical flow* in burst, update bitmask. */
> > +            __m512i v_needle = _mm512_set1_epi64(needle);
> > +            __m512i v_hay = _mm512_loadu_si512(&rules[j]);
> > +            __mmask8 k_cmp_bits = _mm512_cmpeq_epi64_mask(v_needle, v_hay);
> > +            uint32_t cmp_bits = k_cmp_bits;
> > +            batch_bitmask |= cmp_bits << j;
> > +
> > +            /* Compress and store the batched packets. */
> > +            struct dp_packet **packets_ptrs = &packets->packets[j];
> > +            __m512i v_pkt_ptrs = _mm512_loadu_si512(packets_ptrs);
> > +            _mm512_mask_compressstoreu_epi64(store_addr, cmp_bits, v_pkt_ptrs);
> > +        }
> > +
> > +        /* Strip all packets in this batch from the lookup_pkts_bitmask. */
> > +        lookup_pkts_bitmask &= (~batch_bitmask);
> > +        action_batch.count = __builtin_popcountll(batch_bitmask);
> > +
> > +        /* Loop over all packets in this batch, to gather the byte and tcp_flag
> > +         * values, and pass them to the execute function. It would be nice to
> > +         * optimize this away, however it is not easy to refactor in dpif.
> > +         */
> > +        uint32_t bytes = 0;
> > +        uint16_t tcp_flags = 0;
> > +        uint32_t bitmask_iter = batch_bitmask;
> > +        for (int i = 0; i < action_batch.count; i++) {
> > +            uint32_t idx = __builtin_ctzll(bitmask_iter);
> > +            bitmask_iter = _blsr_u64(bitmask_iter);
> > +
> > +            bytes += pkt_meta[idx].bytes;
> > +            tcp_flags |= pkt_meta[idx].tcp_flags;
> > +        }
> > +
> > +        dp_netdev_batch_execute(pmd, &action_batch, rules[rule_pkt_idx],
> > +                                bytes, tcp_flags);
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +#endif
> > +#endif
> > diff --git a/lib/dpif-netdev-private-dfc.h b/lib/dpif-netdev-private-dfc.h
> > index 6a672d1b3..d5d4da7ea 100644
> > --- a/lib/dpif-netdev-private-dfc.h
> > +++ b/lib/dpif-netdev-private-dfc.h
> > @@ -81,6 +81,14 @@ extern "C" {
> >  #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX /                     \
> >                                      DEFAULT_EM_FLOW_INSERT_INV_PROB)
> >
> > +/* Forward declaration for SMC function prototype that requires access to
> > + * 'struct dp_netdev_pmd_thread'. */
> > +struct dp_netdev_pmd_thread;
> > +
> > +/* Forward declaration for EMC and SMC batch insert function prototypes that
> > + * require access to 'struct dpcls_rule'. */
> > +struct dpcls_rule;
> > +
> >  struct emc_entry {
> >      struct dp_netdev_flow *flow;
> >      struct netdev_flow_key key;   /* key.hash used for emc hash value. */
> > @@ -168,6 +176,23 @@ emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
> >      return NULL;
> >  }
> >
> > +/* Insert a batch of keys/flows into the EMC and SMC caches. */
> > +void
> > +emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
> > +                               const struct netdev_flow_key *keys,
> > +                               struct dpcls_rule **rules,
> > +                               uint32_t emc_insert_mask);
> > +
> > +void
> > +smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
> > +                 const struct netdev_flow_key *keys,
> > +                 struct dpcls_rule **rules,
> > +                 uint32_t smc_insert_mask);
> > +
> > +struct dp_netdev_flow *
> > +smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
> > +                  struct dp_packet *packet,
> > +                  struct netdev_flow_key *key);
> >
> >  #ifdef  __cplusplus
> >  }
> > diff --git a/lib/dpif-netdev-private-dpif.h b/lib/dpif-netdev-private-dpif.h
> > new file mode 100644
> > index 000000000..2fd7cc400
> > --- /dev/null
> > +++ b/lib/dpif-netdev-private-dpif.h
> > @@ -0,0 +1,32 @@
> > +/*
> > + * Copyright (c) 2021 Intel Corporation.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing, software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> > + * See the License for the specific language governing permissions and
> > + * limitations under the License.
> > + */
> > +
> > +#ifndef DPIF_NETDEV_PRIVATE_DPIF_H
> > +#define DPIF_NETDEV_PRIVATE_DPIF_H 1
> > +
> > +#include "openvswitch/types.h"
> > +
> > +/* Forward declarations to avoid including files. */
> > +struct dp_netdev_pmd_thread;
> > +struct dp_packet_batch;
> > +
> > +/* Available implementations for dpif work. */
> > +int32_t
> > +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> > +                             struct dp_packet_batch *packets,
> > +                             odp_port_t in_port);
> > +
> > +#endif /* dpif-netdev-private-dpif.h */
> > diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
> > index 0d674ab83..17356d5e2 100644
> > --- a/lib/dpif-netdev-private-thread.h
> > +++ b/lib/dpif-netdev-private-thread.h
> > @@ -45,14 +45,19 @@ struct dp_netdev_pmd_thread_ctx {
> >      struct dp_netdev_rxq *last_rxq;
> >      /* EMC insertion probability context for the current processing cycle. */
> >      uint32_t emc_insert_min;
> > +    /* Enable the SMC cache from ovsdb config. */
> > +    bool smc_enable_db;
> >  };
> >
> >  /* Forward declaration for typedef. */
> >  struct dp_netdev_pmd_thread;
> >
> > -typedef void (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
> > -                                     struct dp_packet_batch *packets,
> > -                                     odp_port_t port_no);
> > +/* Typedef for DPIF functions.
> > + * Returns 0 if the batch was handled, or non-zero if the generic DPIF
> > + * implementation must process the batch (e.g. on upcall/miss).
> > + */
> > +typedef int32_t (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
> > +                                        struct dp_packet_batch *packets,
> > +                                        odp_port_t port_no);
> >
> >  /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
> >   * the performance overhead of interrupt processing.  Therefore netdev can
> > diff --git a/lib/dpif-netdev-private.h b/lib/dpif-netdev-private.h
> > index d7b6fd7ec..0315b5bf6 100644
> > --- a/lib/dpif-netdev-private.h
> > +++ b/lib/dpif-netdev-private.h
> > @@ -31,4 +31,29 @@
> >  #include "dpif-netdev-private-dfc.h"
> >  #include "dpif-netdev-private-thread.h"
> >
> > +/* Allow other implementations to lookup the DPCLS instances. */
> > +struct dpcls *
> > +dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
> > +                           odp_port_t in_port);
> > +
> > +/* Allow other implementations to call dpcls_lookup() for subtable search. */
> > +bool
> > +dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
> > +             struct dpcls_rule **rules, const size_t cnt,
> > +             int *num_lookups_p);
> > +
> > +/* Allow other implementations to execute actions on a batch. */
> > +void
> > +dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
> > +                        struct dp_packet_batch *packets,
> > +                        struct dpcls_rule *rule,
> > +                        uint32_t bytes,
> > +                        uint16_t tcp_flags);
> > +
> > +/* Available implementations for dpif work. */
> > +int32_t
> > +dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
> > +                             struct dp_packet_batch *packets,
> > +                             odp_port_t in_port);
> > +
> >  #endif /* netdev-private.h */
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index e6486417e..1f15af882 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -183,10 +183,6 @@ static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls);
> >  static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
> >                           const struct netdev_flow_key *mask);
> >  static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
> > -static bool dpcls_lookup(struct dpcls *cls,
> > -                         const struct netdev_flow_key *keys[],
> > -                         struct dpcls_rule **rules, size_t cnt,
> > -                         int *num_lookups_p);
> >
> >  /* Set of supported meter flags */
> >  #define DP_SUPPORTED_METER_FLAGS_MASK \
> > @@ -483,7 +479,7 @@ static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
> >                                        const struct flow *flow,
> >                                        const struct nlattr *actions,
> >                                        size_t actions_len);
> > -static void dp_netdev_input(struct dp_netdev_pmd_thread *,
> > +static int32_t dp_netdev_input(struct dp_netdev_pmd_thread *,
> >                              struct dp_packet_batch *, odp_port_t port_no);
> >  static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
> >                                    struct dp_packet_batch *);
> > @@ -555,7 +551,7 @@ dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
> >                                 bool purge);
> >  static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
> >                                        struct tx_port *tx);
> > -static inline struct dpcls *
> > +inline struct dpcls *
> >  dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
> >                             odp_port_t in_port);
> >
> > @@ -1920,7 +1916,7 @@ void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
> >      }
> >  }
> >
> > -static inline struct dpcls *
> > +inline struct dpcls *
> >  dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
> >                             odp_port_t in_port)
> >  {
> > @@ -2714,13 +2710,46 @@ smc_insert(struct dp_netdev_pmd_thread *pmd,
> >      bucket->flow_idx[i] = index;
> >  }
> >
> > +inline void
> > +emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
> > +                               const struct netdev_flow_key *keys,
> > +                               struct dpcls_rule **rules,
> > +                               uint32_t emc_insert_mask)
> > +{
> > +    while (emc_insert_mask) {
> > +        uint32_t i = __builtin_ctz(emc_insert_mask);
> 
> I got an error on Windows:
> 
> [...]
> libtool: compile:  build-aux/cccl -DHAVE_CONFIG_H -I. -I ./include/windows -I ./datapath-
> windows/include -Ic:/PTHREADS-BUILT//include -O2 -I ./include -I ./include -I ./lib -I ./lib -
> IC:/OpenSSL-Win64/include -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat
> -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-
> prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -Wthread-safety -
> fno-strict-aliasing -Wswitch-bool -Wlogical-not-parentheses -Wsizeof-array-argument -Wbool-compare -
> Wshift-negative-value -Wduplicated-cond -Qunused-arguments -Wshadow -Wmultistatement-macros -Wcast-
> align=strict -Wno-null-pointer-arithmetic -Warray-bounds-pointer-arithmetic -g -DHAVE_AVX512F -c
> lib/dpif-netdev.c
> libtool: compile: mv -f "dpif-netdev-lookup-autovalidator.obj" "lib/dpif-netdev-lookup-
> autovalidator.obj"
> c:\PTHREADS-BUILT\include\_ptw32.h(120): warning C4005: 'HAVE_STRUCT_TIMESPEC': macro redefinition
> c:\openvswitch_compile\config.h(207): note: see previous definition of 'HAVE_STRUCT_TIMESPEC'
> c:\openvswitch_compile\lib\ovs-rcu.h(215): warning C4311: 'type cast': pointer truncation from 'void
> *' to 'long'
> libtool: compile: mv -f "dpif-netdev-lookup-generic.obj" "lib/dpif-netdev-lookup-generic.obj"
> dpif-netdev.c
> \
> 	source='lib/dpif-netdev-private-dfc.c' object='lib/dpif-netdev-private-dfc.lo' libtool=yes \
> 	DEPDIR=.deps depmode=none /bin/sh ./build-aux/depcomp \
> 	/bin/sh ./libtool  --tag=CC   --mode=compile build-aux/cccl -DHAVE_CONFIG_H -I.   -I
> ./include/windows -I ./datapath-windows/include -Ic:/PTHREADS-BUILT//include -O2 -I ./include -I
> ./include -I ./lib -I ./lib -IC:/OpenSSL-Win64/include   -Wstrict-prototypes -Wall -Wextra -Wno-sign-
> compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-
> cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-
> initializers -Wthread-safety -fno-strict-aliasing -Wswitch-bool -Wlogical-not-parentheses -Wsizeof-
> array-argument -Wbool-compare -Wshift-negative-value -Wduplicated-cond -Qunused-arguments -Wshadow -
> Wmultistatement-macros -Wcast-align=strict -Wno-null-pointer-arithmetic -Warray-bounds-pointer-
> arithmetic    -g -DHAVE_AVX512F -c -o lib/dpif-netdev-private-dfc.lo lib/dpif-netdev-private-dfc.c
> c:\PTHREADS-BUILT\include\_ptw32.h(120): warning C4005: 'HAVE_STRUCT_TIMESPEC': macro redefinition
> c:\openvswitch_compile\config.h(207): note: see previous definition of 'HAVE_STRUCT_TIMESPEC'
> c:\openvswitch_compile\lib\ovs-rcu.h(215): warning C4311: 'type cast': pointer truncation from 'void
> *' to 'long'
> c:\openvswitch_compile\config.h(207): warning C4005: 'HAVE_STRUCT_TIMESPEC': macro redefinition
> c:\PTHREADS-BUILT\include\_ptw32.h(120): note: see previous definition of 'HAVE_STRUCT_TIMESPEC'
> lib/dpif-netdev.c(2826): error C4013: '__builtin_ctz' undefined; assuming extern returning int
> lib/dpif-netdev.c(2919): warning C4311: 'type cast': pointer truncation from 'const char *const ' to
> 'long'
> \
> 	source='lib/dpif-netdev-private-dpif.c' object='lib/dpif-netdev-private-dpif.lo' libtool=yes \
> 	DEPDIR=.deps depmode=none /bin/sh ./build-aux/depcomp \
> 	/bin/sh ./libtool  --tag=CC   --mode=compile build-aux/cccl -DHAVE_CONFIG_H -I.   -I
> ./include/windows -I ./datapath-windows/include -Ic:/PTHREADS-BUILT//include -O2 -I ./include -I
> ./include -I ./lib -I ./lib -IC:/OpenSSL-Win64/include   -Wstrict-prototypes -Wall -Wextra -Wno-sign-
> compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-
> cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-
> initializers -Wthread-safety -fno-strict-aliasing -Wswitch-bool -Wlogical-not-parentheses -Wsizeof-
> array-argument -Wbool-compare -Wshift-negative-value -Wduplicated-cond -Qunused-arguments -Wshadow -
> Wmultistatement-macros -Wcast-align=strict -Wno-null-pointer-arithmetic -Warray-bounds-pointer-
> arithmetic    -g -DHAVE_AVX512F -c -o lib/dpif-netdev-private-dpif.lo lib/dpif-netdev-private-dpif.c
> make[2]: *** [lib/dpif-netdev.lo] Error 1
> make[2]: *** Waiting for unfinished jobs....
> 

Thanks for testing and finding this. We don't have Windows machines set up to test this.

We need to use OVS's raw_ctz(), which wraps the uses of __builtin_ctz() and __builtin_ctzll(). That should fix the above error. I'll fix this.
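
For reference, here's a minimal sketch of what the portable loop could look like, assuming raw_ctz() from lib/util.h (which maps to the GCC/Clang builtins or to _BitScanForward64() on MSVC). This is illustrative only, not the final patch:

    inline void
    emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
                                   const struct netdev_flow_key *keys,
                                   struct dpcls_rule **rules,
                                   uint32_t emc_insert_mask)
    {
        while (emc_insert_mask) {
            /* raw_ctz() avoids the MSVC "error C4013: '__builtin_ctz'
             * undefined" seen above. */
            uint32_t i = raw_ctz(emc_insert_mask);
            /* Clear the lowest set bit. */
            emc_insert_mask &= emc_insert_mask - 1;
            /* Get the required parameters for the EMC from the rule. */
            struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
            /* Insert the key into the EMC. */
            emc_probabilistic_insert(pmd, &keys[i], flow);
        }
    }

The same substitution would apply in smc_insert_batch() and to the other __builtin_ctz()/__builtin_ctzll() call sites in lib/dpif-netdev-avx512.c.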

> Thanks,
> fbl
> 
> 
> > +        emc_insert_mask &= emc_insert_mask - 1;
> > +        /* Get the required parameters for EMC/SMC from the rule. */
> > +        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
> > +        /* Insert the key into EMC/SMC. */
> > +        emc_probabilistic_insert(pmd, &keys[i], flow);
> > +    }
> > +}
> > +
> > +inline void
> > +smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
> > +                 const struct netdev_flow_key *keys,
> > +                 struct dpcls_rule **rules,
> > +                 uint32_t smc_insert_mask)
> > +{
> > +    while (smc_insert_mask) {
> > +        uint32_t i = __builtin_ctz(smc_insert_mask);
> > +        smc_insert_mask &= smc_insert_mask - 1;
> > +        /* Get the required parameters for EMC/SMC from the rule. */
> > +        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
> > +        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
> > +        /* Insert the key into EMC/SMC. */
> > +        smc_insert(pmd, &keys[i], hash);
> > +    }
> > +}
> > +
> >  static struct dp_netdev_flow *
> >  dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
> >                            const struct netdev_flow_key *key,
> >                            int *lookup_num_p)
> >  {
> >      struct dpcls *cls;
> > -    struct dpcls_rule *rule;
> > +    struct dpcls_rule *rule = NULL;
> >      odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
> >                                                       in_port.odp_port));
> >      struct dp_netdev_flow *netdev_flow = NULL;
> > @@ -4233,7 +4262,10 @@ dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
> >          }
> >
> >          /* Process packet batch. */
> > -        pmd->netdev_input_func(pmd, &batch, port_no);
> > +        int32_t ret = pmd->netdev_input_func(pmd, &batch, port_no);
> > +        if (ret) {
> > +            dp_netdev_input(pmd, &batch, port_no);
> > +        }
> >
> >          /* Assign processing cycles to rx queue. */
> >          cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
> > @@ -5251,6 +5283,8 @@ dpif_netdev_run(struct dpif *dpif)
> >                      non_pmd->ctx.emc_insert_min = 0;
> >                  }
> >
> > +                non_pmd->ctx.smc_enable_db = dp->smc_enable_db;
> > +
> >                  for (i = 0; i < port->n_rxq; i++) {
> >
> >                      if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
> > @@ -5522,6 +5556,8 @@ reload:
> >                  pmd->ctx.emc_insert_min = 0;
> >              }
> >
> > +            pmd->ctx.smc_enable_db = pmd->dp->smc_enable_db;
> > +
> >              process_packets =
> >                  dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
> >                                             poll_list[i].port_no);
> > @@ -6415,6 +6451,24 @@ packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
> >                                actions->actions, actions->size);
> >  }
> >
> > +void
> > +dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
> > +                        struct dp_packet_batch *packets,
> > +                        struct dpcls_rule *rule,
> > +                        uint32_t bytes,
> > +                        uint16_t tcp_flags)
> > +{
> > +    /* Gets action* from the rule. */
> > +    struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule);
> > +    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
> > +
> > +    dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes,
> > +                        tcp_flags, pmd->ctx.now / 1000);
> > +    const uint32_t steal = 1;
> > +    dp_netdev_execute_actions(pmd, packets, steal, &flow->flow,
> > +                              actions->actions, actions->size);
> > +}
> > +
> >  static inline void
> >  dp_netdev_queue_batches(struct dp_packet *pkt,
> >                          struct dp_netdev_flow *flow, uint16_t tcp_flags,
> > @@ -6519,6 +6573,30 @@ smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
> >      pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
> >  }
> >
> > +struct dp_netdev_flow *
> > +smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
> > +                  struct dp_packet *packet,
> > +                  struct netdev_flow_key *key)
> > +{
> > +    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
> > +
> > +    if (OVS_LIKELY(flow_node != NULL)) {
> > +        struct dp_netdev_flow *flow = NULL;
> > +
> > +        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
> > +            /* Since we don't have a per-port megaflow to check the port
> > +             * number, we need to verify that the input ports match. */
> > +            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
> > +                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
> > +
> > +                return (void *) flow;
> > +            }
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> >  /* Try to process all ('cnt') the 'packets' using only the datapath flow cache
> >   * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
> >   * miniflow is copied into 'keys' and the packet pointer is moved at the
> > @@ -6924,12 +7002,13 @@ dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
> >      }
> >  }
> >
> > -static void
> > +static int32_t
> >  dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
> >                  struct dp_packet_batch *packets,
> >                  odp_port_t port_no)
> >  {
> >      dp_netdev_input__(pmd, packets, false, port_no);
> > +    return 0;
> >  }
> >
> >  static void
> > @@ -8369,7 +8448,7 @@ dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
> >
> >  /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
> >   * in 'mask' the values in 'key' and 'target' are the same. */
> > -bool
> > +inline bool ALWAYS_INLINE
> >  dpcls_rule_matches_key(const struct dpcls_rule *rule,
> >                         const struct netdev_flow_key *target)
> >  {
> > @@ -8395,7 +8474,7 @@ dpcls_rule_matches_key(const struct dpcls_rule *rule,
> >   * priorities, instead returning any rule which matches the flow.
> >   *
> >   * Returns true if all miniflows found a corresponding rule. */
> > -static bool
> > +bool
> >  dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
> >               struct dpcls_rule **rules, const size_t cnt,
> >               int *num_lookups_p)
> > --
> > 2.32.0
> >
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> 
> --
> fbl
diff mbox series

Patch

diff --git a/NEWS b/NEWS
index 96b3a61c8..6a4a7b76d 100644
--- a/NEWS
+++ b/NEWS
@@ -10,6 +10,8 @@  Post-v2.15.0
      * Auto load balancing of PMDs now partially supports cross-NUMA polling
        cases, e.g if all PMD threads are running on the same NUMA node.
      * Refactor lib/dpif-netdev.c to multiple header files.
+     * Add avx512 implementation of dpif which can process non-recirculated
+       packets. It supports partial HWOL, EMC, SMC and DPCLS lookups.
    - ovs-ctl:
      * New option '--no-record-hostname' to disable hostname configuration
        in ovsdb on startup.
diff --git a/lib/automake.mk b/lib/automake.mk
index 3a33cdd5c..660cd07f0 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -33,11 +33,13 @@  lib_libopenvswitchavx512_la_CFLAGS = \
 	-mavx512f \
 	-mavx512bw \
 	-mavx512dq \
+	-mbmi \
 	-mbmi2 \
 	-fPIC \
 	$(AM_CFLAGS)
 lib_libopenvswitchavx512_la_SOURCES = \
-	lib/dpif-netdev-lookup-avx512-gather.c
+	lib/dpif-netdev-lookup-avx512-gather.c \
+	lib/dpif-netdev-avx512.c
 lib_libopenvswitchavx512_la_LDFLAGS = \
 	-static
 endif
@@ -114,6 +116,7 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/dpif-netdev-private-dfc.c \
 	lib/dpif-netdev-private-dfc.h \
 	lib/dpif-netdev-private-dpcls.h \
+	lib/dpif-netdev-private-dpif.h \
 	lib/dpif-netdev-private-flow.h \
 	lib/dpif-netdev-private-hwol.h \
 	lib/dpif-netdev-private-thread.h \
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
new file mode 100644
index 000000000..0e55b0be2
--- /dev/null
+++ b/lib/dpif-netdev-avx512.c
@@ -0,0 +1,327 @@ 
+/*
+ * Copyright (c) 2021 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions. */
+#if !defined(__CHECKER__)
+
+#include <config.h>
+
+#include "dpif-netdev.h"
+#include "dpif-netdev-perf.h"
+
+#include "dpif-netdev-private.h"
+#include "dpif-netdev-private-dpcls.h"
+#include "dpif-netdev-private-flow.h"
+#include "dpif-netdev-private-thread.h"
+#include "dpif-netdev-private-hwol.h"
+
+#include "dp-packet.h"
+#include "netdev.h"
+
+#include "immintrin.h"
+
+/* Each AVX512 register (zmm register in assembly notation) can contain up to
+ * 512 bits, which is equivalent to 8 uint64_t variables. This is the maximum
+ * number of miniflow blocks that can be processed in a single pass of the
+ * AVX512 code at a time.
+ */
+#define NUM_U64_IN_ZMM_REG (8)
+
+/* Structure to contain per-packet metadata that must be attributed to the
+ * dp netdev flow. This is unfortunate to have to track per packet, however
+ * it's a bit awkward to maintain them in a performant way. This structure
+ * helps to keep two variables on a single cache line per packet.
+ */
+struct pkt_flow_meta {
+    uint16_t bytes;
+    uint16_t tcp_flags;
+};
+
+/* Structure of heap allocated memory for DPIF internals. */
+struct dpif_userdata {
+    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+        struct netdev_flow_key keys[NETDEV_MAX_BURST];
+    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+        struct netdev_flow_key *key_ptrs[NETDEV_MAX_BURST];
+    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)
+        struct pkt_flow_meta pkt_meta[NETDEV_MAX_BURST];
+};
+
+int32_t
+dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
+                             struct dp_packet_batch *packets,
+                             odp_port_t in_port)
+{
+    /* Allocate DPIF userdata. */
+    if (OVS_UNLIKELY(!pmd->netdev_input_func_userdata)) {
+        pmd->netdev_input_func_userdata =
+                xmalloc_pagealign(sizeof(struct dpif_userdata));
+    }
+
+    struct dpif_userdata *ud = pmd->netdev_input_func_userdata;
+    struct netdev_flow_key *keys = ud->keys;
+    struct netdev_flow_key **key_ptrs = ud->key_ptrs;
+    struct pkt_flow_meta *pkt_meta = ud->pkt_meta;
+
+    /* The AVX512 DPIF implementation handles rules in a way that is optimized
+     * for reducing data-movement between HWOL/EMC/SMC and DPCLS. This is
+     * achieved by separating the rule arrays. Bitmasks are kept for each
+     * packet, indicating if it matched in the HWOL/EMC/SMC array or DPCLS
+     * array. Later the two arrays are merged by AVX-512 expand instructions.
+     */
+
+    /* Stores the computed output: a rule pointer for each packet. */
+    /* Used initially for HWOL/EMC/SMC. */
+    struct dpcls_rule *rules[NETDEV_MAX_BURST];
+    /* Used for DPCLS. */
+    struct dpcls_rule *dpcls_rules[NETDEV_MAX_BURST];
+
+    uint32_t dpcls_key_idx = 0;
+
+    for (uint32_t i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
+        _mm512_storeu_si512(&rules[i], _mm512_setzero_si512());
+        _mm512_storeu_si512(&dpcls_rules[i], _mm512_setzero_si512());
+    }
+
+    /* Prefetch each packet's metadata. */
+    const size_t batch_size = dp_packet_batch_size(packets);
+    for (int i = 0; i < batch_size; i++) {
+        struct dp_packet *packet = packets->packets[i];
+        OVS_PREFETCH(dp_packet_data(packet));
+        pkt_metadata_prefetch_init(&packet->md);
+    }
+
+    /* Check if EMC or SMC are enabled. */
+    struct dfc_cache *cache = &pmd->flow_cache;
+    const uint32_t emc_enabled = pmd->ctx.emc_insert_min != 0;
+    const uint32_t smc_enabled = pmd->ctx.smc_enable_db;
+
+    uint32_t emc_hits = 0;
+    uint32_t smc_hits = 0;
+
+    /* A 1 bit in this mask indicates a hit, so no DPCLS lookup on the pkt. */
+    uint32_t hwol_emc_smc_hitmask = 0;
+    uint32_t smc_hitmask = 0;
+
+    /* The below while loop is based on the 'iter' variable which has a number
+     * of bits set representing packets that we want to process
+     * (HWOL->MFEX->EMC->SMC). As each packet is processed, we clear (set to 0)
+     * the bit representing that packet using '_blsr_u64()'. The
+     * '__builtin_ctz()' will give us the correct index into the 'packets',
+     * 'pkt_meta', 'keys' and 'rules' arrays.
+     *
+     * For one iteration of the while loop, here's some pseudocode as an
+     * example where 'iter' is represented in binary:
+     *
+     * while (iter) { // iter = 1100
+     *     uint32_t i = __builtin_ctz(iter); // i = 2
+     *     iter = _blsr_u64(iter); // iter = 1000
+     *     // do all processing (HWOL->MFEX->EMC->SMC)
+     * }
+     */
+    uint32_t lookup_pkts_bitmask = (1ULL << batch_size) - 1;
+    uint32_t iter = lookup_pkts_bitmask;
+    while (iter) {
+        uint32_t i = __builtin_ctz(iter);
+        iter = _blsr_u64(iter);
+
+        /* Get packet pointer from bitmask and packet md. */
+        struct dp_packet *packet = packets->packets[i];
+        pkt_metadata_init(&packet->md, in_port);
+
+        struct dp_netdev_flow *f = NULL;
+
+        /* Check for partial hardware offload mark. */
+        uint32_t mark;
+        if (dp_packet_has_flow_mark(packet, &mark)) {
+            f = mark_to_flow_find(pmd, mark);
+            if (f) {
+                rules[i] = &f->cr;
+                pkt_meta[i].tcp_flags = parse_tcp_flags(packet);
+                pkt_meta[i].bytes = dp_packet_size(packet);
+                hwol_emc_smc_hitmask |= (1 << i);
+                continue;
+            }
+        }
+
+        /* Do miniflow extract into keys. */
+        struct netdev_flow_key *key = &keys[i];
+        miniflow_extract(packet, &key->mf);
+
+        /* Cache TCP and byte values for all packets. */
+        pkt_meta[i].bytes = dp_packet_size(packet);
+        pkt_meta[i].tcp_flags = miniflow_get_tcp_flags(&key->mf);
+
+        key->len = netdev_flow_key_size(miniflow_n_values(&key->mf));
+        key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf);
+
+        if (emc_enabled) {
+            f = emc_lookup(&cache->emc_cache, key);
+
+            if (f) {
+                rules[i] = &f->cr;
+                emc_hits++;
+                hwol_emc_smc_hitmask |= (1 << i);
+                continue;
+            }
+        }
+
+        if (smc_enabled && !f) {
+            f = smc_lookup_single(pmd, packet, key);
+            if (f) {
+                rules[i] = &f->cr;
+                smc_hits++;
+                smc_hitmask |= (1 << i);
+                continue;
+            }
+        }
+
+        /* The flow pointer was not found in HWOL/EMC/SMC, so add it to the
+         * dpcls input keys array for batch lookup later.
+         */
+        key_ptrs[dpcls_key_idx] = &keys[i];
+        dpcls_key_idx++;
+    }
+
+    hwol_emc_smc_hitmask |= smc_hitmask;
+
+    /* DPCLS handles any packets missed by HWOL/EMC/SMC. It operates on the
+     * key_ptrs[] for input miniflows to match, storing results in the
+     * dpcls_rules[] array.
+     */
+    if (dpcls_key_idx > 0) {
+        struct dpcls *cls = dp_netdev_pmd_lookup_dpcls(pmd, in_port);
+        if (OVS_UNLIKELY(!cls)) {
+            return -1;
+        }
+        bool any_miss =
+            !dpcls_lookup(cls, (const struct netdev_flow_key **) key_ptrs,
+                          dpcls_rules, dpcls_key_idx, NULL);
+        if (OVS_UNLIKELY(any_miss)) {
+            return -1;
+        }
+
+        /* Merge DPCLS rules and HWOL/EMC/SMC rules. */
+        uint32_t dpcls_idx = 0;
+        for (int i = 0; i < NETDEV_MAX_BURST; i += NUM_U64_IN_ZMM_REG) {
+            /* Indexing here is somewhat complicated because the DPCLS
+             * output rule load index depends on the HWOL/EMC/SMC hitmask:
+             * the more packets that hit in HWOL/EMC/SMC, the fewer DPCLS
+             * rules are consumed.
+             */
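+            /* Worked example for one group of NUM_U64_IN_ZMM_REG lanes,
+             * assuming (hwol_emc_smc_hitmask & 0xFF) == 0b00000101: lanes 0
+             * and 2 keep their cached rule pointers, and the masked
+             * expand-load fills the other six lanes with the next six
+             * entries of dpcls_rules[], advancing dpcls_idx by
+             * 8 - popcount(0b00000101) = 6.
+             */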
+            __m512i v_cache_rules = _mm512_loadu_si512(&rules[i]);
+            __m512i v_merged_rules =
+                        _mm512_mask_expandloadu_epi64(v_cache_rules,
+                                                      ~hwol_emc_smc_hitmask,
+                                                      &dpcls_rules[dpcls_idx]);
+            _mm512_storeu_si512(&rules[i], v_merged_rules);
+
+            /* Update the DPCLS load index and the HWOL/EMC/SMC hit bitmask.
+             * There are NUM_U64_IN_ZMM_REG output pointers per register;
+             * subtracting the HWOL/EMC/SMC hit lanes gives the number of
+             * DPCLS rules consumed.
+             */
+            uint32_t hitmask_FF = (hwol_emc_smc_hitmask & 0xFF);
+            dpcls_idx += NUM_U64_IN_ZMM_REG - __builtin_popcountll(hitmask_FF);
+            hwol_emc_smc_hitmask =
+                (hwol_emc_smc_hitmask >> NUM_U64_IN_ZMM_REG);
+        }
+    }
+
+    /* At this point we have a 1:1 packet-to-rule mapping, so update the
+     * caches if required: SMC and DPCLS hits are inserted into the EMC,
+     * and DPCLS hits are inserted into the SMC.
+     */
+    if (emc_enabled) {
+        uint32_t emc_insert_mask = smc_hitmask | ~hwol_emc_smc_hitmask;
+        emc_insert_mask &= lookup_pkts_bitmask;
+        emc_probabilistic_insert_batch(pmd, keys, &rules[0], emc_insert_mask);
+    }
+    if (smc_enabled) {
+        uint32_t smc_insert_mask = ~hwol_emc_smc_hitmask;
+        smc_insert_mask &= lookup_pkts_bitmask;
+        smc_insert_batch(pmd, keys, &rules[0], smc_insert_mask);
+    }
+
+    /* At this point we can no longer return an error, so commit stats. */
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_RECV, batch_size);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT, emc_hits);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, smc_hits);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
+                            dpcls_key_idx);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
+                            dpcls_key_idx);
+
+    /* Initialize the "Action Batch" for each flow handled below. */
+    struct dp_packet_batch action_batch;
+    action_batch.trunc = 0;
+
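+    /* Repeatedly pick the first remaining packet's rule as the "needle",
+     * gather every packet that matched the same rule into one contiguous
+     * batch, and execute that batch's actions.
+     */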
+    while (lookup_pkts_bitmask) {
+        uint32_t rule_pkt_idx = __builtin_ctz(lookup_pkts_bitmask);
+        uint64_t needle = (uintptr_t) rules[rule_pkt_idx];
+
+        /* Compare NUM_U64_IN_ZMM_REG flow pointers to the needle in
+         * parallel, building a bitmask of the matching lanes.
+         */
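+        /* For example, if rules[0..7] were {A, B, A, A, B, ...} and the
+         * needle were A, the first compare would yield cmp_bits 0b00001101,
+         * and those three packet pointers would be compress-stored
+         * contiguously into action_batch. (Illustrative values only.)
+         */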
+        uint32_t batch_bitmask = 0;
+        for (uint32_t j = 0; j < NETDEV_MAX_BURST; j += NUM_U64_IN_ZMM_REG) {
+            /* Pre-calculate store addr. */
+            uint32_t num_pkts_in_batch = __builtin_popcountll(batch_bitmask);
+            void *store_addr = &action_batch.packets[num_pkts_in_batch];
+
+            /* Search for identical flow* in burst, update bitmask. */
+            __m512i v_needle = _mm512_set1_epi64(needle);
+            __m512i v_hay = _mm512_loadu_si512(&rules[j]);
+            __mmask8 k_cmp_bits = _mm512_cmpeq_epi64_mask(v_needle, v_hay);
+            uint32_t cmp_bits = k_cmp_bits;
+            batch_bitmask |= cmp_bits << j;
+
+            /* Compress and store the batched packets. */
+            struct dp_packet **packets_ptrs = &packets->packets[j];
+            __m512i v_pkt_ptrs = _mm512_loadu_si512(packets_ptrs);
+            _mm512_mask_compressstoreu_epi64(store_addr, cmp_bits, v_pkt_ptrs);
+        }
+
+        /* Strip all packets in this batch from the lookup_pkts_bitmask. */
+        lookup_pkts_bitmask &= (~batch_bitmask);
+        action_batch.count = __builtin_popcountll(batch_bitmask);
+
+        /* Loop over all packets in this batch to gather the byte and
+         * tcp_flags values, and pass them to the execute function. It would
+         * be nice to optimize this away, but it is not easy to refactor in
+         * the dpif.
+         */
+        uint32_t bytes = 0;
+        uint16_t tcp_flags = 0;
+        uint32_t bitmask_iter = batch_bitmask;
+        for (int i = 0; i < action_batch.count; i++) {
+            uint32_t idx = __builtin_ctzll(bitmask_iter);
+            bitmask_iter = _blsr_u64(bitmask_iter);
+
+            bytes += pkt_meta[idx].bytes;
+            tcp_flags |= pkt_meta[idx].tcp_flags;
+        }
+
+        dp_netdev_batch_execute(pmd, &action_batch, rules[rule_pkt_idx],
+                                bytes, tcp_flags);
+    }
+
+    return 0;
+}
+
+#endif
+#endif
diff --git a/lib/dpif-netdev-private-dfc.h b/lib/dpif-netdev-private-dfc.h
index 6a672d1b3..d5d4da7ea 100644
--- a/lib/dpif-netdev-private-dfc.h
+++ b/lib/dpif-netdev-private-dfc.h
@@ -81,6 +81,14 @@  extern "C" {
 #define DEFAULT_EM_FLOW_INSERT_MIN (UINT32_MAX /                     \
                                     DEFAULT_EM_FLOW_INSERT_INV_PROB)
 
+/* Forward declaration for SMC function prototype that requires access to
+ * 'struct dp_netdev_pmd_thread'. */
+struct dp_netdev_pmd_thread;
+
+/* Forward declaration for EMC and SMC batch insert function prototypes that
+ * require access to 'struct dpcls_rule'. */
+struct dpcls_rule;
+
 struct emc_entry {
     struct dp_netdev_flow *flow;
     struct netdev_flow_key key;   /* key.hash used for emc hash value. */
@@ -168,6 +176,23 @@  emc_lookup(struct emc_cache *cache, const struct netdev_flow_key *key)
     return NULL;
 }
 
+/* Insert a batch of keys/flows into the EMC and SMC caches. */
+void
+emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
+                               const struct netdev_flow_key *keys,
+                               struct dpcls_rule **rules,
+                               uint32_t emc_insert_mask);
+
+void
+smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
+                 const struct netdev_flow_key *keys,
+                 struct dpcls_rule **rules,
+                 uint32_t smc_insert_mask);
+
+struct dp_netdev_flow *
+smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
+                  struct dp_packet *packet,
+                  struct netdev_flow_key *key);
 
 #ifdef  __cplusplus
 }
diff --git a/lib/dpif-netdev-private-dpif.h b/lib/dpif-netdev-private-dpif.h
new file mode 100644
index 000000000..2fd7cc400
--- /dev/null
+++ b/lib/dpif-netdev-private-dpif.h
@@ -0,0 +1,32 @@ 
+/*
+ * Copyright (c) 2021 Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DPIF_NETDEV_PRIVATE_DPIF_H
+#define DPIF_NETDEV_PRIVATE_DPIF_H 1
+
+#include "openvswitch/types.h"
+
+/* Forward declarations to avoid including files. */
+struct dp_netdev_pmd_thread;
+struct dp_packet_batch;
+
+/* Available implementations for dpif work. */
+int32_t
+dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
+                             struct dp_packet_batch *packets,
+                             odp_port_t in_port);
+
+#endif /* dpif-netdev-private-dpif.h */
diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
index 0d674ab83..17356d5e2 100644
--- a/lib/dpif-netdev-private-thread.h
+++ b/lib/dpif-netdev-private-thread.h
@@ -45,14 +45,19 @@  struct dp_netdev_pmd_thread_ctx {
     struct dp_netdev_rxq *last_rxq;
     /* EMC insertion probability context for the current processing cycle. */
     uint32_t emc_insert_min;
+    /* Whether the SMC cache is enabled, as set in the ovsdb config. */
+    bool smc_enable_db;
 };
 
 /* Forward declaration for typedef. */
 struct dp_netdev_pmd_thread;
 
-typedef void (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
-                                     struct dp_packet_batch *packets,
-                                     odp_port_t port_no);
+/* Typedef for DPIF functions.
+ * Returns 0 if the whole batch was handled, or a non-zero value if the
+ * caller should fall back to the scalar DPIF implementation.
+ */
+typedef int32_t (*dp_netdev_input_func)(struct dp_netdev_pmd_thread *pmd,
+                                        struct dp_packet_batch *packets,
+                                        odp_port_t port_no);
 
 /* PMD: Poll modes drivers.  PMD accesses devices via polling to eliminate
  * the performance overhead of interrupt processing.  Therefore netdev can
diff --git a/lib/dpif-netdev-private.h b/lib/dpif-netdev-private.h
index d7b6fd7ec..0315b5bf6 100644
--- a/lib/dpif-netdev-private.h
+++ b/lib/dpif-netdev-private.h
@@ -31,4 +31,29 @@ 
 #include "dpif-netdev-private-dfc.h"
 #include "dpif-netdev-private-thread.h"
 
+/* Allow other implementations to lookup the DPCLS instances. */
+struct dpcls *
+dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
+                           odp_port_t in_port);
+
+/* Allow other implementations to call dpcls_lookup() for subtable search. */
+bool
+dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
+             struct dpcls_rule **rules, const size_t cnt,
+             int *num_lookups_p);
+
+/* Allow other implementations to execute actions on a batch. */
+void
+dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
+                        struct dp_packet_batch *packets,
+                        struct dpcls_rule *rule,
+                        uint32_t bytes,
+                        uint16_t tcp_flags);
+
+/* Available implementations for dpif work. */
+int32_t
+dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
+                             struct dp_packet_batch *packets,
+                             odp_port_t in_port);
+
 #endif /* netdev-private.h */
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index e6486417e..1f15af882 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -183,10 +183,6 @@  static uint32_t dpcls_subtable_lookup_reprobe(struct dpcls *cls);
 static void dpcls_insert(struct dpcls *, struct dpcls_rule *,
                          const struct netdev_flow_key *mask);
 static void dpcls_remove(struct dpcls *, struct dpcls_rule *);
-static bool dpcls_lookup(struct dpcls *cls,
-                         const struct netdev_flow_key *keys[],
-                         struct dpcls_rule **rules, size_t cnt,
-                         int *num_lookups_p);
 
 /* Set of supported meter flags */
 #define DP_SUPPORTED_METER_FLAGS_MASK \
@@ -483,7 +479,7 @@  static void dp_netdev_execute_actions(struct dp_netdev_pmd_thread *pmd,
                                       const struct flow *flow,
                                       const struct nlattr *actions,
                                       size_t actions_len);
-static void dp_netdev_input(struct dp_netdev_pmd_thread *,
+static int32_t dp_netdev_input(struct dp_netdev_pmd_thread *,
                             struct dp_packet_batch *, odp_port_t port_no);
 static void dp_netdev_recirculate(struct dp_netdev_pmd_thread *,
                                   struct dp_packet_batch *);
@@ -555,7 +551,7 @@  dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
                                bool purge);
 static int dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
                                       struct tx_port *tx);
-static inline struct dpcls *
+inline struct dpcls *
 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
                            odp_port_t in_port);
 
@@ -1920,7 +1916,7 @@  void dp_netdev_flow_unref(struct dp_netdev_flow *flow)
     }
 }
 
-static inline struct dpcls *
+inline struct dpcls *
 dp_netdev_pmd_lookup_dpcls(struct dp_netdev_pmd_thread *pmd,
                            odp_port_t in_port)
 {
@@ -2714,13 +2710,46 @@  smc_insert(struct dp_netdev_pmd_thread *pmd,
     bucket->flow_idx[i] = index;
 }
 
+inline void
+emc_probabilistic_insert_batch(struct dp_netdev_pmd_thread *pmd,
+                               const struct netdev_flow_key *keys,
+                               struct dpcls_rule **rules,
+                               uint32_t emc_insert_mask)
+{
+    while (emc_insert_mask) {
+        uint32_t i = __builtin_ctz(emc_insert_mask);
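+        /* Clear the lowest set bit (equivalent to _blsr_u64()). */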
+        emc_insert_mask &= emc_insert_mask - 1;
+        /* Get the required flow pointer from the rule. */
+        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
+        /* Insert the key into the EMC. */
+        emc_probabilistic_insert(pmd, &keys[i], flow);
+    }
+}
+
+inline void
+smc_insert_batch(struct dp_netdev_pmd_thread *pmd,
+                 const struct netdev_flow_key *keys,
+                 struct dpcls_rule **rules,
+                 uint32_t smc_insert_mask)
+{
+    while (smc_insert_mask) {
+        uint32_t i = __builtin_ctz(smc_insert_mask);
+        smc_insert_mask &= smc_insert_mask - 1;
+        /* Get the required flow pointer and hash from the rule. */
+        struct dp_netdev_flow *flow = dp_netdev_flow_cast(rules[i]);
+        uint32_t hash = dp_netdev_flow_hash(&flow->ufid);
+        /* Insert the key into the SMC. */
+        smc_insert(pmd, &keys[i], hash);
+    }
+}
+
 static struct dp_netdev_flow *
 dp_netdev_pmd_lookup_flow(struct dp_netdev_pmd_thread *pmd,
                           const struct netdev_flow_key *key,
                           int *lookup_num_p)
 {
     struct dpcls *cls;
-    struct dpcls_rule *rule;
+    struct dpcls_rule *rule = NULL;
     odp_port_t in_port = u32_to_odp(MINIFLOW_GET_U32(&key->mf,
                                                      in_port.odp_port));
     struct dp_netdev_flow *netdev_flow = NULL;
@@ -4233,7 +4262,10 @@  dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
         }
 
         /* Process packet batch. */
-        pmd->netdev_input_func(pmd, &batch, port_no);
+        int32_t ret = pmd->netdev_input_func(pmd, &batch, port_no);
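+        /* A non-zero return means the input function could not handle the
+         * batch (e.g. a DPCLS lookup miss), so fall back to the scalar
+         * DPIF. */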
+        if (ret) {
+            dp_netdev_input(pmd, &batch, port_no);
+        }
 
         /* Assign processing cycles to rx queue. */
         cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
@@ -5251,6 +5283,8 @@  dpif_netdev_run(struct dpif *dpif)
                     non_pmd->ctx.emc_insert_min = 0;
                 }
 
+                non_pmd->ctx.smc_enable_db = dp->smc_enable_db;
+
                 for (i = 0; i < port->n_rxq; i++) {
 
                     if (!netdev_rxq_enabled(port->rxqs[i].rx)) {
@@ -5522,6 +5556,8 @@  reload:
                 pmd->ctx.emc_insert_min = 0;
             }
 
+            pmd->ctx.smc_enable_db = pmd->dp->smc_enable_db;
+
             process_packets =
                 dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
                                            poll_list[i].port_no);
@@ -6415,6 +6451,24 @@  packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
                               actions->actions, actions->size);
 }
 
+void
+dp_netdev_batch_execute(struct dp_netdev_pmd_thread *pmd,
+                        struct dp_packet_batch *packets,
+                        struct dpcls_rule *rule,
+                        uint32_t bytes,
+                        uint16_t tcp_flags)
+{
+    /* Get the actions from the flow referenced by the rule. */
+    struct dp_netdev_flow *flow = dp_netdev_flow_cast(rule);
+    struct dp_netdev_actions *actions = dp_netdev_flow_get_actions(flow);
+
+    dp_netdev_flow_used(flow, dp_packet_batch_size(packets), bytes,
+                        tcp_flags, pmd->ctx.now / 1000);
+    const uint32_t steal = 1;
+    dp_netdev_execute_actions(pmd, packets, steal, &flow->flow,
+                              actions->actions, actions->size);
+}
+
 static inline void
 dp_netdev_queue_batches(struct dp_packet *pkt,
                         struct dp_netdev_flow *flow, uint16_t tcp_flags,
@@ -6519,6 +6573,30 @@  smc_lookup_batch(struct dp_netdev_pmd_thread *pmd,
     pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SMC_HIT, n_smc_hit);
 }
 
+struct dp_netdev_flow *
+smc_lookup_single(struct dp_netdev_pmd_thread *pmd,
+                  struct dp_packet *packet,
+                  struct netdev_flow_key *key)
+{
+    const struct cmap_node *flow_node = smc_entry_get(pmd, key->hash);
+
+    if (OVS_LIKELY(flow_node != NULL)) {
+        struct dp_netdev_flow *flow = NULL;
+
+        CMAP_NODE_FOR_EACH (flow, node, flow_node) {
+            /* Since we don't have a per-port megaflow table to check the
+             * port number, we need to verify that the input ports match. */
+            if (OVS_LIKELY(dpcls_rule_matches_key(&flow->cr, key) &&
+                flow->flow.in_port.odp_port == packet->md.in_port.odp_port)) {
+
+                return (void *) flow;
+            }
+        }
+    }
+
+    return NULL;
+}
+
 /* Try to process all ('cnt') the 'packets' using only the datapath flow cache
  * 'pmd->flow_cache'. If a flow is not found for a packet 'packets[i]', the
  * miniflow is copied into 'keys' and the packet pointer is moved at the
@@ -6924,12 +7002,13 @@  dp_netdev_input__(struct dp_netdev_pmd_thread *pmd,
     }
 }
 
-static void
+static int32_t
 dp_netdev_input(struct dp_netdev_pmd_thread *pmd,
                 struct dp_packet_batch *packets,
                 odp_port_t port_no)
 {
     dp_netdev_input__(pmd, packets, false, port_no);
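+    /* The scalar DPIF handles every packet, including upcalls/misses, so
+     * it never asks the caller to fall back. */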
+    return 0;
 }
 
 static void
@@ -8369,7 +8448,7 @@  dpcls_flow_key_gen_masks(const struct netdev_flow_key *tbl,
 
 /* Returns true if 'target' satisfies 'key' in 'mask', that is, if each 1-bit
  * in 'mask' the values in 'key' and 'target' are the same. */
-bool
+inline bool ALWAYS_INLINE
 dpcls_rule_matches_key(const struct dpcls_rule *rule,
                        const struct netdev_flow_key *target)
 {
@@ -8395,7 +8474,7 @@  dpcls_rule_matches_key(const struct dpcls_rule *rule,
  * priorities, instead returning any rule which matches the flow.
  *
  * Returns true if all miniflows found a corresponding rule. */
-static bool
+bool
 dpcls_lookup(struct dpcls *cls, const struct netdev_flow_key *keys[],
              struct dpcls_rule **rules, const size_t cnt,
              int *num_lookups_p)