
[ovs-dev,v12,14/16] dpcls-avx512: Enable avx512 vector popcount instruction.

Message ID: 20210517140434.59555-15-cian.ferriter@intel.com
State: Changes Requested
Series: DPIF Framework + Optimizations

Commit Message

Ferriter, Cian May 17, 2021, 2:04 p.m. UTC
From: Harry van Haaren <harry.van.haaren@intel.com>

This commit enables the AVX512-VPOPCNTDQ Vector Popcount
instruction. This instruction is not available on every CPU
that supports the AVX512-F Foundation ISA, hence it is enabled
only when the additional VPOPCNTDQ ISA check is passed.

The vector popcount instruction is used instead of the AVX512
popcount emulation code present in the avx512 optimized DPCLS today.
It provides higher performance in the SIMD miniflow processing,
which requires the popcount to calculate the miniflow block indexes.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

v8: Add NEWS entry.
---
 NEWS                                   |  3 +
 lib/dpdk.c                             |  1 +
 lib/dpif-netdev-lookup-avx512-gather.c | 84 ++++++++++++++++++++------
 3 files changed, 70 insertions(+), 18 deletions(-)
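
For context on what "emulated vpopcount" means above: without the
VPOPCNTDQ ISA, a per-u64-lane popcount of a 512-bit register has to be
built from other instructions. Below is a minimal standalone sketch of
one common emulation approach (a nibble lookup table plus a per-lane
byte sum, assuming AVX512BW is also available). It illustrates the
idea only and is not necessarily the exact _mm512_popcnt_epi64_manual()
routine in lib/dpif-netdev-lookup-avx512-gather.c; with VPOPCNTDQ, the
same result is a single instruction.

#include <immintrin.h>

/* Emulated per-u64-lane popcount: look up the popcount of each nibble
 * in an in-register table, then sum the eight byte counts of each u64
 * lane.  Compile with -mavx512f -mavx512bw. */
static inline __m512i
popcnt_epi64_emulated(__m512i v)
{
    /* Popcounts of the nibble values 0..15, repeated through the reg. */
    const __m512i lut = _mm512_set4_epi32(0x04030302, 0x03020201,
                                          0x03020201, 0x02010100);
    const __m512i nib = _mm512_set1_epi8(0x0F);

    __m512i lo = _mm512_and_si512(v, nib);
    __m512i hi = _mm512_and_si512(_mm512_srli_epi64(v, 4), nib);
    __m512i cnt = _mm512_add_epi8(_mm512_shuffle_epi8(lut, lo),
                                  _mm512_shuffle_epi8(lut, hi));
    return _mm512_sad_epu8(cnt, _mm512_setzero_si512());
}

/* Native version: a single vpopcntq instruction, available only on
 * VPOPCNTDQ-capable CPUs.  The target attribute enables the ISA for
 * just this one function, as the patch does for its wrapper. */
static inline __m512i
__attribute__((__target__("avx512vpopcntdq")))
popcnt_epi64_native(__m512i v)
{
    return _mm512_popcnt_epi64(v);
}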

Comments

Stokes, Ian June 9, 2021, 3:56 p.m. UTC | #1
> This commit enables the AVX512-VPOPCNTDQ Vector Popcount
> instruction. This instruction is not available on every CPU
> that supports the AVX512-F Foundation ISA, hence it is enabled
> only when the additional VPOPCNTDQ ISA check is passed.
> 
> The vector popcount instruction is used instead of the AVX512
> popcount emulation code present in the avx512 optimized DPCLS today.
> It provides higher performance in the SIMD miniflow processing,
> which requires the popcount to calculate the miniflow block indexes.
> 
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

Thanks for the patch Harry/Cian.

A few comments inline below.
> 
> ---
> 
> v8: Add NEWS entry.
> ---
>  NEWS                                   |  3 +
>  lib/dpdk.c                             |  1 +
>  lib/dpif-netdev-lookup-avx512-gather.c | 84 ++++++++++++++++++++------
>  3 files changed, 70 insertions(+), 18 deletions(-)
> 
> diff --git a/NEWS b/NEWS
> index c71273ddd..d04dac746 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -14,6 +14,9 @@ Post-v2.15.0
>       * Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
>       * Add more specialized DPCLS subtables to cover common rules, enhancing
>         the lookup performance.
> +     * Enable the AVX512 DPCLS implementation to use VPOPCNT instruction if the
> +       CPU supports it. This enhances performance by using the native vpopcount
> +       instructions, instead of the emulated version of vpopcount.
>     - ovs-ctl:
>       * New option '--no-record-hostname' to disable hostname configuration
>         in ovsdb on startup.
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index c883a4b8b..a9494a40f 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -655,6 +655,7 @@ dpdk_get_cpu_has_isa(const char *arch, const char *feature)
>  #if __x86_64__
>      /* CPU flags only defined for the architecture that support it. */
>      CHECK_CPU_FEATURE(feature, "avx512f", RTE_CPUFLAG_AVX512F);
> +    CHECK_CPU_FEATURE(feature, "avx512vpopcntdq", RTE_CPUFLAG_AVX512VPOPCNTDQ);
>      CHECK_CPU_FEATURE(feature, "bmi2", RTE_CPUFLAG_BMI2);
>  #endif
> 
> diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
> index 7adf29914..c338c2fcd 100644
> --- a/lib/dpif-netdev-lookup-avx512-gather.c
> +++ b/lib/dpif-netdev-lookup-avx512-gather.c
> @@ -53,6 +53,15 @@
> 
>  VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
> 
> +
No need for the extra whitespace added above.

> +/* Wrapper function required to enable ISA. */
> +static inline __m512i
> +__attribute__((__target__("avx512vpopcntdq")))
> +_mm512_popcnt_epi64_wrapper(__m512i v_in)
> +{
> +    return _mm512_popcnt_epi64(v_in);
> +}
> +
>  static inline __m512i
>  _mm512_popcnt_epi64_manual(__m512i v_in)
>  {
> @@ -126,7 +135,8 @@ avx512_blocks_gather(__m512i v_u0, /* reg of u64 of all u0 bits */
>                       __mmask64 u1_bcast_msk,      /* mask of u1 lanes */
>                       const uint64_t pkt_mf_u0_pop, /* num bits in u0 of pkt */
>                       __mmask64 zero_mask, /* maskz if pkt not have mf bit */
> -                     __mmask64 u64_lanes_mask) /* total lane count to use */
> +                     __mmask64 u64_lanes_mask, /* total lane count to use */
> +                     const uint32_t use_vpop)  /* use AVX512 vpopcntdq */
>  {
>          /* Suggest to compiler to load tbl blocks ahead of gather(). */
>          __m512i v_tbl_blocks = _mm512_maskz_loadu_epi64(u64_lanes_mask,
> @@ -140,8 +150,15 @@ avx512_blocks_gather(__m512i v_u0, /* reg of u64 of all u0 bits */
>                                                        tbl_mf_masks);
>          __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_tbl_masks);
> 
> -        /* Manual AVX512 popcount for u64 lanes. */
> -        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
> +        /* Calculate AVX512 popcount for u64 lanes using the native instruction
> +         * if available, or using emulation if not available.
> +         */
> +        __m512i v_popcnts;
> +        if (use_vpop) {
> +            v_popcnts = _mm512_popcnt_epi64_wrapper(v_masks);
> +        } else {
> +            v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
> +        }
> 
>          /* Add popcounts and offset for u1 bits. */
>          __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_msk,
> @@ -166,7 +183,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
>                     const struct netdev_flow_key *keys[],
>                     struct dpcls_rule **rules,
>                     const uint32_t bit_count_u0,
> -                   const uint32_t bit_count_u1)
> +                   const uint32_t bit_count_u1,
> +                   const uint32_t use_vpop)
>  {
>      OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[BLOCKS_CACHE_SIZE];
>      uint32_t hashes[NETDEV_MAX_BURST];
> @@ -218,7 +236,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
>                                                  u1_bcast_mask,
>                                                  pkt_mf_u0_pop,
>                                                  zero_mask,
> -                                                bit_count_total_mask);
> +                                                bit_count_total_mask,
> +                                                use_vpop);
>          _mm512_storeu_si512(&block_cache[i * MF_BLOCKS_PER_PACKET], v_blocks);
> 
>          if (bit_count_total > 8) {
> @@ -239,7 +258,8 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
>                                                      u1_bcast_mask_gt8,
>                                                      pkt_mf_u0_pop,
>                                                      zero_mask_gt8,
> -                                                    bit_count_gt8_mask);
> +                                                    bit_count_gt8_mask,
> +                                                    use_vpop);
>              _mm512_storeu_si512(&block_cache[(i * MF_BLOCKS_PER_PACKET) + 8],
>                                  v_blocks_gt8);
>          }
> @@ -288,7 +308,11 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
>      return found_map;
>  }
> 
> -/* Expand out specialized functions with U0 and U1 bit attributes. */
> +/* Expand out specialized functions with U0 and U1 bit attributes. As the
> + * AVX512 vpopcnt instruction is not supported on all AVX512 capable CPUs,
> + * create two functions for each miniflow signature. This allows the runtime
> + * CPU detection in probe() to select the ideal implementation.
> + */

I'm trying to think whether there is a cleaner way of implementing this rather than having two functions, but I'm not sure.

On one hand, the functions use (mostly) the same implementation except for the vpop check.

Was there any thought given to implementing just one function and having a dynamic check within it? Or did that impact the performance too much?

On the other hand, I do like the approach of the single use_vpop variable. It certainly makes it clearer, to me at least, whether the instruction gets used or not, and it gives an easy point to debug if required in the future.

When selecting the vpop implementation, is it flagged to the user at any stage that vpop will be used?

Regards
Ian

>  #define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1)                             \
>      static uint32_t                                                           \
>      dpcls_avx512_gather_mf_##U0##_##U1(struct dpcls_subtable *subtable,       \
> @@ -296,7 +320,20 @@ avx512_lookup_impl(struct dpcls_subtable *subtable,
>                                         const struct netdev_flow_key *keys[],  \
>                                         struct dpcls_rule **rules)             \
>      {                                                                         \
> -        return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1);   \
> +        const uint32_t use_vpop = 0;                                          \
> +        return avx512_lookup_impl(subtable, keys_map, keys, rules,            \
> +                                  U0, U1, use_vpop);                          \
> +    }                                                                         \
> +                                                                              \
> +    static uint32_t __attribute__((__target__("avx512vpopcntdq")))            \
> +    dpcls_avx512_gather_mf_##U0##_##U1##_vpop(struct dpcls_subtable *subtable,\
> +                                       uint32_t keys_map,                     \
> +                                       const struct netdev_flow_key *keys[],  \
> +                                       struct dpcls_rule **rules)             \
> +    {                                                                         \
> +        const uint32_t use_vpop = 1;                                          \
> +        return avx512_lookup_impl(subtable, keys_map, keys, rules,            \
> +                                  U0, U1, use_vpop);                          \
>      }                                                                         \
> 
>  DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4)
> @@ -306,11 +343,18 @@ DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
>  DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
>  DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
> 
> -/* Check if a specialized function is valid for the required subtable. */
> -#define CHECK_LOOKUP_FUNCTION(U0, U1)                                         \
> +/* Check if a specialized function is valid for the required subtable.
> + * The use_vpop variable is used to decide if the VPOPCNT instruction can be
> + * used or not.
> + */
> +#define CHECK_LOOKUP_FUNCTION(U0, U1, use_vpop)                               \
>      ovs_assert((U0 + U1) <= (NUM_U64_IN_ZMM_REG * 2));                        \
>      if (!f && u0_bits == U0 && u1_bits == U1) {                               \
> -        f = dpcls_avx512_gather_mf_##U0##_##U1;                               \
> +        if (use_vpop) {                                                       \
> +            f = dpcls_avx512_gather_mf_##U0##_##U1##_vpop;                    \
> +        } else {                                                              \
> +            f = dpcls_avx512_gather_mf_##U0##_##U1;                           \
> +        }                                                                     \
>      }
> 
>  static uint32_t
> @@ -318,9 +362,11 @@ dpcls_avx512_gather_mf_any(struct dpcls_subtable *subtable, uint32_t keys_map,
>                             const struct netdev_flow_key *keys[],
>                             struct dpcls_rule **rules)
>  {
> +    const uint32_t use_vpop = 0;
>      return avx512_lookup_impl(subtable, keys_map, keys, rules,
>                                subtable->mf_bits_set_unit0,
> -                              subtable->mf_bits_set_unit1);
> +                              subtable->mf_bits_set_unit1,
> +                              use_vpop);
>  }
> 
>  dpcls_subtable_lookup_func
> @@ -334,12 +380,14 @@ dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, uint32_t u1_bits)
>          return NULL;
>      }
> 
> -    CHECK_LOOKUP_FUNCTION(9, 4);
> -    CHECK_LOOKUP_FUNCTION(9, 1);
> -    CHECK_LOOKUP_FUNCTION(5, 3);
> -    CHECK_LOOKUP_FUNCTION(5, 1);
> -    CHECK_LOOKUP_FUNCTION(4, 1);
> -    CHECK_LOOKUP_FUNCTION(4, 0);
> +    int use_vpop = dpdk_get_cpu_has_isa("x86_64", "avx512vpopcntdq");
> +
> +    CHECK_LOOKUP_FUNCTION(9, 4, use_vpop);
> +    CHECK_LOOKUP_FUNCTION(9, 1, use_vpop);
> +    CHECK_LOOKUP_FUNCTION(5, 3, use_vpop);
> +    CHECK_LOOKUP_FUNCTION(5, 1, use_vpop);
> +    CHECK_LOOKUP_FUNCTION(4, 1, use_vpop);
> +    CHECK_LOOKUP_FUNCTION(4, 0, use_vpop);
> 
>      /* Check if the _any looping version of the code can perform this miniflow
>       * lookup. Performance gain may be less pronounced due to non-specialized
> --
> 2.31.1
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
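
For context on the discussion in the replies below: after preprocessing,
each DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1) instantiation in this
patch expands to a pair of functions like the following, shown here for
the (5, 1) signature and reformatted slightly for readability.

static uint32_t
dpcls_avx512_gather_mf_5_1(struct dpcls_subtable *subtable,
                           uint32_t keys_map,
                           const struct netdev_flow_key *keys[],
                           struct dpcls_rule **rules)
{
    const uint32_t use_vpop = 0;
    return avx512_lookup_impl(subtable, keys_map, keys, rules,
                              5, 1, use_vpop);
}

/* Identical body, but compiled with the vpopcntdq ISA enabled. */
static uint32_t __attribute__((__target__("avx512vpopcntdq")))
dpcls_avx512_gather_mf_5_1_vpop(struct dpcls_subtable *subtable,
                                uint32_t keys_map,
                                const struct netdev_flow_key *keys[],
                                struct dpcls_rule **rules)
{
    const uint32_t use_vpop = 1;
    return avx512_lookup_impl(subtable, keys_map, keys, rules,
                              5, 1, use_vpop);
}
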
Van Haaren, Harry June 10, 2021, 7:42 p.m. UTC | #2
> -----Original Message-----
> From: Stokes, Ian <ian.stokes@intel.com>
> Sent: Wednesday, June 9, 2021 4:56 PM
> To: Ferriter, Cian <cian.ferriter@intel.com>; ovs-dev@openvswitch.org; Van
> Haaren, Harry <harry.van.haaren@intel.com>
> Cc: i.maximets@ovn.org
> Subject: RE: [ovs-dev] [v12 14/16] dpcls-avx512: Enable avx512 vector popcount instruction.
> 
> [ ...quoted commit message and patch snipped; see the full diff below... ]
> > @@ -53,6 +53,15 @@
> >
> >  VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
> >
> > +
> No need for the extra whitespace added above.

Ack, can fix.


> [ ...quoted patch snipped down to the comment under discussion... ]
> > -/* Expand out specialized functions with U0 and U1 bit attributes. */
> > +/* Expand out specialized functions with U0 and U1 bit attributes. As the
> > + * AVX512 vpopcnt instruction is not supported on all AVX512 capable CPUs,
> > + * create two functions for each miniflow signature. This allows the runtime
> > + * CPU detection in probe() to select the ideal implementation.
> > + */
> 
> I'm trying to think whether there is a cleaner way of implementing this
> rather than having two functions, but I'm not sure.
> 
> On one hand, the functions use (mostly) the same implementation except for
> the vpop check.
> 
> Was there any thought given to implementing just one function and having a
> dynamic check within it? Or did that impact the performance too much?
> 
> On the other hand, I do like the approach of the single use_vpop variable.
> It certainly makes it clearer, to me at least, whether the instruction gets
> used or not, and it gives an easy point to debug if required in the future.
> 
> When selecting the vpop implementation, is it flagged to the user at any
> stage that vpop will be used?

The big part of the question here is "what will the compiler allow".
So a compiler will *not* insert the vpopcnt instruction into a function
that does not explicitly enable the instruction.

The danger here is that if we *do* enable avx512-vpopcnt for the whole function,
the compiler is *technically* allowed to just use the instruction regardless of the
use_vpopcnt variable, as it could identify that the _manual() version achieves the
same thing as the actual vpopcnt, and hence just always call vpopcnt.

So the only way to have the compiler be happy, and get correctness, is to ensure
that the compiler *does* have vpopcnt for one function, and *does not* have
that ISA available for the other implementation.

There's some trickery going on with inlining functions with different ISAs, to avoid
code-duplication in the generic code. The nice side-effect of this is that indeed the
function is branch-free on how it does its vpop-counting :)

In my opinion this code is the best it can be. Regards, -Harry
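
A minimal standalone sketch of the pattern described above (hypothetical
names; the emulation body is a placeholder): the inner helper takes
use_vpop as a compile-time constant from each thin outer function, so the
branch constant-folds away and both specializations end up branch-free,
while the vpopcntdq ISA is only enabled on the functions that may
execute it.

#include <immintrin.h>
#include <stdint.h>

/* ISA enabled here only: the compiler may emit vpopcntq in this body. */
static inline __m512i
__attribute__((__target__("avx512vpopcntdq")))
popcnt_native(__m512i v)
{
    return _mm512_popcnt_epi64(v);
}

/* No vpopcntdq target here, so the compiler must not emit it. */
static inline __m512i
popcnt_emulated(__m512i v)
{
    return v;  /* Placeholder for an AVX512-F/BW emulation body. */
}

/* 'use_vpop' is a literal constant at both call sites below, so this
 * branch disappears at compile time. */
static inline __m512i
popcnt_inner(__m512i v, const uint32_t use_vpop)
{
    return use_vpop ? popcnt_native(v) : popcnt_emulated(v);
}

/* Two thin specializations; a probe() would pick one at runtime. */
__m512i
lookup_popcnt_emulated(__m512i v)
{
    return popcnt_inner(v, 0);
}

__m512i __attribute__((__target__("avx512vpopcntdq")))
lookup_popcnt_native(__m512i v)
{
    return popcnt_inner(v, 1);
}
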
Stokes, Ian June 16, 2021, 12:38 p.m. UTC | #3
> > -----Original Message-----
> > From: Stokes, Ian <ian.stokes@intel.com>
> > Sent: Wednesday, June 9, 2021 4:56 PM
> > To: Ferriter, Cian <cian.ferriter@intel.com>; ovs-dev@openvswitch.org; Van
> > Haaren, Harry <harry.van.haaren@intel.com>
> > Cc: i.maximets@ovn.org
> > Subject: RE: [ovs-dev] [v12 14/16] dpcls-avx512: Enable avx512 vector popcount instruction.
> > [ ...quoted commit message, patch, and earlier replies snipped... ]
> >
> > I'm trying to think whether there is a cleaner way of implementing this
> > rather than having two functions, but I'm not sure.
> >
> > On one hand, the functions use (mostly) the same implementation except
> > for the vpop check.
> >
> > Was there any thought given to implementing just one function and having
> > a dynamic check within it? Or did that impact the performance too much?
> >
> > On the other hand, I do like the approach of the single use_vpop
> > variable. It certainly makes it clearer, to me at least, whether the
> > instruction gets used or not, and it gives an easy point to debug if
> > required in the future.
> >
> > When selecting the vpop implementation, is it flagged to the user at any
> > stage that vpop will be used?
> 
> The big part of the question here is "what will the compiler allow".
> So a compiler will *not* insert the vpopcnt instruction into a function
> that does not explicitly enable the instruction.
> 
> The danger here is that if we *do* enable avx512-vpopcnt for the whole
> function, the compiler is *technically* allowed to just use the instruction
> regardless of the use_vpopcnt variable, as it could identify that the
> _manual() version achieves the same thing as the actual vpopcnt, and hence
> just always call vpopcnt.
> 
> So the only way to have the compiler be happy, and get correctness, is to ensure
> that the compiler *does* have vpopcnt for one function, and *does not* have
> that ISA available for the other implementation.

Understood, had a feeling there was more to this than met the eye 😊.
> 
> There's some trickery going on with inlining functions with different ISAs,
> to avoid code-duplication in the generic code. The nice side-effect of this
> is that indeed the function is branch-free on how it does its vpop-counting :)
> 
> In my opinion this code is the best it can be. Regards, -Harry

Agreed.

Thanks for the detailed explanation.
Ian
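
Ian's remaining question, whether the vpopcnt selection is flagged to the
user, is not answered directly in the thread. For illustration only, a
hypothetical way to surface it in dpcls_subtable_avx512_gather_probe(),
assuming the module's existing vlog facility (this is not part of the
patch):

    int use_vpop = dpdk_get_cpu_has_isa("x86_64", "avx512vpopcntdq");

    /* Hypothetical: log once which popcount flavour was selected. */
    VLOG_INFO_ONCE("AVX512 DPCLS using %s vector popcount",
                   use_vpop ? "native (vpopcntdq)" : "emulated");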

Patch

diff --git a/NEWS b/NEWS
index c71273ddd..d04dac746 100644
--- a/NEWS
+++ b/NEWS
@@ -14,6 +14,9 @@  Post-v2.15.0
      * Enable AVX512 optimized DPCLS to search subtables with larger miniflows.
      * Add more specialized DPCLS subtables to cover common rules, enhancing
        the lookup performance.
+     * Enable the AVX512 DPCLS implementation to use VPOPCNT instruction if the
+       CPU supports it. This enhances performance by using the native vpopcount
+       instructions, instead of the emulated version of vpopcount.
    - ovs-ctl:
      * New option '--no-record-hostname' to disable hostname configuration
        in ovsdb on startup.
diff --git a/lib/dpdk.c b/lib/dpdk.c
index c883a4b8b..a9494a40f 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -655,6 +655,7 @@  dpdk_get_cpu_has_isa(const char *arch, const char *feature)
 #if __x86_64__
     /* CPU flags only defined for the architecture that support it. */
     CHECK_CPU_FEATURE(feature, "avx512f", RTE_CPUFLAG_AVX512F);
+    CHECK_CPU_FEATURE(feature, "avx512vpopcntdq", RTE_CPUFLAG_AVX512VPOPCNTDQ);
     CHECK_CPU_FEATURE(feature, "bmi2", RTE_CPUFLAG_BMI2);
 #endif
 
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
index 7adf29914..c338c2fcd 100644
--- a/lib/dpif-netdev-lookup-avx512-gather.c
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -53,6 +53,15 @@ 
 
 VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
 
+
+/* Wrapper function required to enable ISA. */
+static inline __m512i
+__attribute__((__target__("avx512vpopcntdq")))
+_mm512_popcnt_epi64_wrapper(__m512i v_in)
+{
+    return _mm512_popcnt_epi64(v_in);
+}
+
 static inline __m512i
 _mm512_popcnt_epi64_manual(__m512i v_in)
 {
@@ -126,7 +135,8 @@  avx512_blocks_gather(__m512i v_u0, /* reg of u64 of all u0 bits */
                      __mmask64 u1_bcast_msk,      /* mask of u1 lanes */
                      const uint64_t pkt_mf_u0_pop, /* num bits in u0 of pkt */
                      __mmask64 zero_mask, /* maskz if pkt not have mf bit */
-                     __mmask64 u64_lanes_mask) /* total lane count to use */
+                     __mmask64 u64_lanes_mask, /* total lane count to use */
+                     const uint32_t use_vpop)  /* use AVX512 vpopcntdq */
 {
         /* Suggest to compiler to load tbl blocks ahead of gather(). */
         __m512i v_tbl_blocks = _mm512_maskz_loadu_epi64(u64_lanes_mask,
@@ -140,8 +150,15 @@  avx512_blocks_gather(__m512i v_u0, /* reg of u64 of all u0 bits */
                                                       tbl_mf_masks);
         __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_tbl_masks);
 
-        /* Manual AVX512 popcount for u64 lanes. */
-        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+        /* Calculate AVX512 popcount for u64 lanes using the native instruction
+         * if available, or using emulation if not available.
+         */
+        __m512i v_popcnts;
+        if (use_vpop) {
+            v_popcnts = _mm512_popcnt_epi64_wrapper(v_masks);
+        } else {
+            v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+        }
 
         /* Add popcounts and offset for u1 bits. */
         __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_msk,
@@ -166,7 +183,8 @@  avx512_lookup_impl(struct dpcls_subtable *subtable,
                    const struct netdev_flow_key *keys[],
                    struct dpcls_rule **rules,
                    const uint32_t bit_count_u0,
-                   const uint32_t bit_count_u1)
+                   const uint32_t bit_count_u1,
+                   const uint32_t use_vpop)
 {
     OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[BLOCKS_CACHE_SIZE];
     uint32_t hashes[NETDEV_MAX_BURST];
@@ -218,7 +236,8 @@  avx512_lookup_impl(struct dpcls_subtable *subtable,
                                                 u1_bcast_mask,
                                                 pkt_mf_u0_pop,
                                                 zero_mask,
-                                                bit_count_total_mask);
+                                                bit_count_total_mask,
+                                                use_vpop);
         _mm512_storeu_si512(&block_cache[i * MF_BLOCKS_PER_PACKET], v_blocks);
 
         if (bit_count_total > 8) {
@@ -239,7 +258,8 @@  avx512_lookup_impl(struct dpcls_subtable *subtable,
                                                     u1_bcast_mask_gt8,
                                                     pkt_mf_u0_pop,
                                                     zero_mask_gt8,
-                                                    bit_count_gt8_mask);
+                                                    bit_count_gt8_mask,
+                                                    use_vpop);
             _mm512_storeu_si512(&block_cache[(i * MF_BLOCKS_PER_PACKET) + 8],
                                 v_blocks_gt8);
         }
@@ -288,7 +308,11 @@  avx512_lookup_impl(struct dpcls_subtable *subtable,
     return found_map;
 }
 
-/* Expand out specialized functions with U0 and U1 bit attributes. */
+/* Expand out specialized functions with U0 and U1 bit attributes. As the
+ * AVX512 vpopcnt instruction is not supported on all AVX512 capable CPUs,
+ * create two functions for each miniflow signature. This allows the runtime
+ * CPU detection in probe() to select the ideal implementation.
+ */
 #define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1)                             \
     static uint32_t                                                           \
     dpcls_avx512_gather_mf_##U0##_##U1(struct dpcls_subtable *subtable,       \
@@ -296,7 +320,20 @@  avx512_lookup_impl(struct dpcls_subtable *subtable,
                                        const struct netdev_flow_key *keys[],  \
                                        struct dpcls_rule **rules)             \
     {                                                                         \
-        return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1);   \
+        const uint32_t use_vpop = 0;                                          \
+        return avx512_lookup_impl(subtable, keys_map, keys, rules,            \
+                                  U0, U1, use_vpop);                          \
+    }                                                                         \
+                                                                              \
+    static uint32_t __attribute__((__target__("avx512vpopcntdq")))            \
+    dpcls_avx512_gather_mf_##U0##_##U1##_vpop(struct dpcls_subtable *subtable,\
+                                       uint32_t keys_map,                     \
+                                       const struct netdev_flow_key *keys[],  \
+                                       struct dpcls_rule **rules)             \
+    {                                                                         \
+        const uint32_t use_vpop = 1;                                          \
+        return avx512_lookup_impl(subtable, keys_map, keys, rules,            \
+                                  U0, U1, use_vpop);                          \
     }                                                                         \
 
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(9, 4)
@@ -306,11 +343,18 @@  DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
 DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
 
-/* Check if a specialized function is valid for the required subtable. */
-#define CHECK_LOOKUP_FUNCTION(U0, U1)                                         \
+/* Check if a specialized function is valid for the required subtable.
+ * The use_vpop variable is used to decide if the VPOPCNT instruction can be
+ * used or not.
+ */
+#define CHECK_LOOKUP_FUNCTION(U0, U1, use_vpop)                               \
     ovs_assert((U0 + U1) <= (NUM_U64_IN_ZMM_REG * 2));                        \
     if (!f && u0_bits == U0 && u1_bits == U1) {                               \
-        f = dpcls_avx512_gather_mf_##U0##_##U1;                               \
+        if (use_vpop) {                                                       \
+            f = dpcls_avx512_gather_mf_##U0##_##U1##_vpop;                    \
+        } else {                                                              \
+            f = dpcls_avx512_gather_mf_##U0##_##U1;                           \
+        }                                                                     \
     }
 
 static uint32_t
@@ -318,9 +362,11 @@  dpcls_avx512_gather_mf_any(struct dpcls_subtable *subtable, uint32_t keys_map,
                            const struct netdev_flow_key *keys[],
                            struct dpcls_rule **rules)
 {
+    const uint32_t use_vpop = 0;
     return avx512_lookup_impl(subtable, keys_map, keys, rules,
                               subtable->mf_bits_set_unit0,
-                              subtable->mf_bits_set_unit1);
+                              subtable->mf_bits_set_unit1,
+                              use_vpop);
 }
 
 dpcls_subtable_lookup_func
@@ -334,12 +380,14 @@  dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, uint32_t u1_bits)
         return NULL;
     }
 
-    CHECK_LOOKUP_FUNCTION(9, 4);
-    CHECK_LOOKUP_FUNCTION(9, 1);
-    CHECK_LOOKUP_FUNCTION(5, 3);
-    CHECK_LOOKUP_FUNCTION(5, 1);
-    CHECK_LOOKUP_FUNCTION(4, 1);
-    CHECK_LOOKUP_FUNCTION(4, 0);
+    int use_vpop = dpdk_get_cpu_has_isa("x86_64", "avx512vpopcntdq");
+
+    CHECK_LOOKUP_FUNCTION(9, 4, use_vpop);
+    CHECK_LOOKUP_FUNCTION(9, 1, use_vpop);
+    CHECK_LOOKUP_FUNCTION(5, 3, use_vpop);
+    CHECK_LOOKUP_FUNCTION(5, 1, use_vpop);
+    CHECK_LOOKUP_FUNCTION(4, 1, use_vpop);
+    CHECK_LOOKUP_FUNCTION(4, 0, use_vpop);
 
     /* Check if the _any looping version of the code can perform this miniflow
      * lookup. Performance gain may be less pronounced due to non-specialized