diff mbox series

[ovs-dev,v4,5/7] dpif-lookup: add avx512 gather implementation.

Message ID 20200618165354.87787-6-harry.van.haaren@intel.com
State Superseded
Headers show
Series DPCLS Subtable ISA Optimization | expand

Commit Message

Harry van Haaren June 18, 2020, 4:53 p.m. UTC
This commit adds an AVX-512 dpcls lookup implementation.
It uses the AVX-512 SIMD ISA to perform multiple miniflow
operations in parallel.

To run this implementation, the "avx512f" and "bmi2" ISAs are
required. These ISA checks are performed at runtime while
probing the subtable implementation. If a CPU does not provide
both "avx512f" and "bmi2", then this code does not execute.

The avx512 code is built as a separate static library, with added
CFLAGS to enable the required ISA features. By building only this
static library with avx512 enabled, it is ensured that the main OVS
core library is *not* using avx512, and that OVS continues to run
as before on CPUs that do not support avx512.

The approach taken in this implementation is to use the
gather instruction to access the packet miniflow, allowing
any miniflow blocks to be loaded into an AVX-512 register.
This maximises the usefulness of the register, and hence this
implementation handles any subtable with up to 8 miniflow bits.

Note that specialization of these avx512 lookup routines
still provides performance value, as the hashing of the
resulting data is performed in scalar code, and compile-time
loop unrolling occurs when specialized to miniflow bits.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>

---

v4:
- Remove TODO comment on prio-set command (was accidentally
  added to this commit in v3)
- Fixup v3 changelog to not include #warning comment (William Tu)
- Remove #define for debugging in lookup.h
- Fix builds on older gcc versions that don't support -mavx512f.
  Solution involves CC_CHECK and #ifdefs in code (OVS Robot, William Tu)

v3:
- Improve function name for _any subtable lookup
- Use "" include not <> for immintrin.h
- Add checks for SSE42 instructions in core OVS for CRC32 based hashing
  If not available, disable the AVX512 lookup implementation, as it uses
  CRC32 for hashing, and the hashing algorithm must match core OVS.
- Rework ovs_asserts() into function selection time check
- Add #define for magic number 8, number of u64 blocks in AVX512 register
- Add #if CHECKER around AVX code, sparse doesn't like checking it
- Simplify avx512 enabled building, fixes builds with --enable-shared
---
 configure.ac                           |   2 +
 lib/automake.mk                        |  17 ++
 lib/dpif-netdev-lookup-avx512-gather.c | 265 +++++++++++++++++++++++++
 lib/dpif-netdev-lookup.c               |  17 ++
 lib/dpif-netdev-lookup.h               |   4 +
 5 files changed, 305 insertions(+)
 create mode 100644 lib/dpif-netdev-lookup-avx512-gather.c

Comments

William Tu June 27, 2020, 6:27 p.m. UTC | #1
On Thu, Jun 18, 2020 at 9:53 AM Harry van Haaren
<harry.van.haaren@intel.com> wrote:
>
> This commit adds an AVX-512 dpcls lookup implementation.
> It uses the AVX-512 SIMD ISA to perform multiple miniflow
> operations in parallel.
>
> To run this implementation, the "avx512f" and "bmi2" ISAs are
> required. These ISA checks are performed at runtime while
> probing the subtable implementation. If a CPU does not provide
> both "avx512f" and "bmi2", then this code does not execute.
>
> The avx512 code is built as a seperate static library, with added
> CFLAGS to enable the required ISA features. By building only this
> static library with avx512 enabled, it is ensured that the main OVS
> core library is *not* using avx512, and that OVS continues to run
> as before on CPUs that do not support avx512.
>
> The approach taken in this implementation is to use the
> gather instruction to access the packet miniflow, allowing
> any miniflow blocks to be loaded into an AVX-512 register.
> This maximises the usefulness of the register, and hence this
> implementation handles any subtable with up to miniflow 8 bits.
>
> Note that specialization of these avx512 lookup routines
> still provides performance value, as the hashing of the
> resulting data is performed in scalar code, and compile-time
> loop unrolling occurs when specialized to miniflow bits.
>
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
>
> ---
>
> v4:
> - Remove TODO comment on prio-set command (was accidentally
>   added to this commit in v3)
> - Fixup v3 changlog to not include #warning comment (William Tu)
> - Remove #define for debugging in lookup.h
> - Fix builds on older gcc versions that don't support -mavx512f.
>   Solution involves CC_CHECK and #ifdefs in code (OVS Robot, William Tu)
>
> v3:
> - Improve function name for _any subtable lookup
> - Use "" include not <> for immintrin.h
> - Add checks for SSE42 instructions in core OVS for CRC32 based hashing
>   If not available, disable AVX512 lookup implementation as it requires
>   uses CRC32 for hashing, and the hashing algorithm must match core OVS.
> - Rework ovs_asserts() into function selection time check
> - Add #define for magic number 8, number of u64 blocks in AVX512 register
> - Add #if CHECKER around AVX code, sparse doesn't like checking it
> - Simplify avx512 enabled building, fixes builds with --enable-shared
> ---
>  configure.ac                           |   2 +
>  lib/automake.mk                        |  17 ++
>  lib/dpif-netdev-lookup-avx512-gather.c | 265 +++++++++++++++++++++++++
>  lib/dpif-netdev-lookup.c               |  17 ++
>  lib/dpif-netdev-lookup.h               |   4 +
>  5 files changed, 305 insertions(+)
>  create mode 100644 lib/dpif-netdev-lookup-avx512-gather.c
>
> diff --git a/configure.ac b/configure.ac
> index 81893e56e..1367c868b 100644
> --- a/configure.ac
> +++ b/configure.ac
> @@ -178,6 +178,8 @@ OVS_ENABLE_OPTION([-Wno-null-pointer-arithmetic])
>  OVS_ENABLE_OPTION([-Warray-bounds-pointer-arithmetic])
>  OVS_CONDITIONAL_CC_OPTION([-Wno-unused], [HAVE_WNO_UNUSED])
>  OVS_CONDITIONAL_CC_OPTION([-Wno-unused-parameter], [HAVE_WNO_UNUSED_PARAMETER])
> +OVS_CONDITIONAL_CC_OPTION([-mavx512f], [HAVE_AVX512F])
> +OVS_CHECK_CC_OPTION([-mavx512f], [CFLAGS="$CFLAGS -DHAVE_AVX512F"])

Do you need both checks?
I thought the first one OVS_CONDITIONAL_CC_OPTION([-mavx512f], [HAVE_AVX512F])
is good enough since at lib/automake.mk, you add the -mavx512f to CFLAGS.

>  OVS_ENABLE_WERROR
>  OVS_ENABLE_SPARSE
>  OVS_CTAGS_IDENTIFIERS
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 1fc1a209e..fab056b8a 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -11,6 +11,7 @@ lib_libopenvswitch_la_LIBADD = $(SSL_LIBS)
>  lib_libopenvswitch_la_LIBADD += $(CAPNG_LDADD)
>  lib_libopenvswitch_la_LIBADD += $(LIBBPF_LDADD)
>
> +
>  if WIN32
>  lib_libopenvswitch_la_LIBADD += ${PTHREAD_LIBS}
>  endif
> @@ -20,6 +21,22 @@ lib_libopenvswitch_la_LDFLAGS = \
>          -Wl,--version-script=$(top_builddir)/lib/libopenvswitch.sym \
>          $(AM_LDFLAGS)
>
> +if HAVE_AVX512F
> +# Build library of avx512 code with CPU ISA CFLAGS enabled. This allows the
> +# compiler to use the ISA features required for the ISA optimized code-paths.
> +lib_LTLIBRARIES += lib/libopenvswitchavx512.la
> +lib_libopenvswitch_la_LIBADD += lib/libopenvswitchavx512.la
> +lib_libopenvswitchavx512_la_CFLAGS = \
> +       -mavx512f \
> +       -mavx512bw \
> +       -mavx512dq \
> +       -mbmi2 \
> +       $(AM_CFLAGS)
> +lib_libopenvswitchavx512_la_SOURCES = \
> +       lib/dpif-netdev-lookup-avx512-gather.c
> +endif
> +
> +# Build core vswitch libraries as before
>  lib_libopenvswitch_la_SOURCES = \
>         lib/aes128.c \
>         lib/aes128.h \
> diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
> new file mode 100644
> index 000000000..754cd0e3c
> --- /dev/null
> +++ b/lib/dpif-netdev-lookup-avx512-gather.c
> @@ -0,0 +1,265 @@
> +/*
> + * Copyright (c) 2020, Intel Corperation.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifdef __x86_64__
> +#if !defined(__CHECKER__)
> +
> +#include <config.h>
> +
> +#include "dpif-netdev.h"
> +#include "dpif-netdev-lookup.h"
> +#include "dpif-netdev-private.h"
> +#include "cmap.h"
> +#include "flow.h"
> +#include "pvector.h"
> +#include "openvswitch/vlog.h"
> +
> +#include "immintrin.h"
> +
> +/* Each AVX512 register (zmm register in assembly notation) can contain up to
> + * 512 bits, which is equivelent to 8 uint64_t variables. This is the maximum

typo: equivalent

> + * number of miniflow blocks that can be processed in a single pass of the
> + * AVX512 code at a time.
> + */
> +#define NUM_U64_IN_ZMM_REG (8)
> +#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
> +
> +
> +VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
> +
> +static inline __m512i
> +_mm512_popcnt_epi64_manual(__m512i v_in)
> +{
> +    static const uint8_t pop_lut[64] = {
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> +    };
> +    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
> +
> +    __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4);
> +    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
> +    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
> +    __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask);
> +
> +    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
> +    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
> +    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
> +
> +    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
> +}

I forgot whether you mentioned this or not.
But why create this manual popcnt?
Isn't there a _mm512_popcnt_* in the library?

The rest looks good to me,
Thanks

William
Harry van Haaren June 30, 2020, 10 a.m. UTC | #2
> -----Original Message-----
> From: William Tu <u9012063@gmail.com>
> Sent: Saturday, June 27, 2020 7:27 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: ovs-dev <ovs-dev@openvswitch.org>; Stokes, Ian <ian.stokes@intel.com>;
> Ilya Maximets <i.maximets@ovn.org>; Federico Iezzi <fiezzi@redhat.com>
> Subject: Re: [PATCH v4 5/7] dpif-lookup: add avx512 gather implementation.
> 
> On Thu, Jun 18, 2020 at 9:53 AM Harry van Haaren
> <harry.van.haaren@intel.com> wrote:
> >
> > This commit adds an AVX-512 dpcls lookup implementation.
> > It uses the AVX-512 SIMD ISA to perform multiple miniflow
> > operations in parallel.
> >
> > To run this implementation, the "avx512f" and "bmi2" ISAs are
> > required. These ISA checks are performed at runtime while
> > probing the subtable implementation. If a CPU does not provide
> > both "avx512f" and "bmi2", then this code does not execute.
> >
> > The avx512 code is built as a seperate static library, with added
> > CFLAGS to enable the required ISA features. By building only this
> > static library with avx512 enabled, it is ensured that the main OVS
> > core library is *not* using avx512, and that OVS continues to run
> > as before on CPUs that do not support avx512.
> >
> > The approach taken in this implementation is to use the
> > gather instruction to access the packet miniflow, allowing
> > any miniflow blocks to be loaded into an AVX-512 register.
> > This maximises the usefulness of the register, and hence this
> > implementation handles any subtable with up to miniflow 8 bits.
> >
> > Note that specialization of these avx512 lookup routines
> > still provides performance value, as the hashing of the
> > resulting data is performed in scalar code, and compile-time
> > loop unrolling occurs when specialized to miniflow bits.
> >
> > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> >
> > ---
> >
> > v4:
> > - Remove TODO comment on prio-set command (was accidentally
> >   added to this commit in v3)
> > - Fixup v3 changlog to not include #warning comment (William Tu)
> > - Remove #define for debugging in lookup.h
> > - Fix builds on older gcc versions that don't support -mavx512f.
> >   Solution involves CC_CHECK and #ifdefs in code (OVS Robot, William Tu)
> >
> > v3:
> > - Improve function name for _any subtable lookup
> > - Use "" include not <> for immintrin.h
> > - Add checks for SSE42 instructions in core OVS for CRC32 based hashing
> >   If not available, disable AVX512 lookup implementation as it requires
> >   uses CRC32 for hashing, and the hashing algorithm must match core OVS.
> > - Rework ovs_asserts() into function selection time check
> > - Add #define for magic number 8, number of u64 blocks in AVX512 register
> > - Add #if CHECKER around AVX code, sparse doesn't like checking it
> > - Simplify avx512 enabled building, fixes builds with --enable-shared
> > ---
> >  configure.ac                           |   2 +
> >  lib/automake.mk                        |  17 ++
> >  lib/dpif-netdev-lookup-avx512-gather.c | 265 +++++++++++++++++++++++++
> >  lib/dpif-netdev-lookup.c               |  17 ++
> >  lib/dpif-netdev-lookup.h               |   4 +
> >  5 files changed, 305 insertions(+)
> >  create mode 100644 lib/dpif-netdev-lookup-avx512-gather.c
> >
> > diff --git a/configure.ac b/configure.ac
> > index 81893e56e..1367c868b 100644
> > --- a/configure.ac
> > +++ b/configure.ac
> > @@ -178,6 +178,8 @@ OVS_ENABLE_OPTION([-Wno-null-pointer-arithmetic])
> >  OVS_ENABLE_OPTION([-Warray-bounds-pointer-arithmetic])
> >  OVS_CONDITIONAL_CC_OPTION([-Wno-unused], [HAVE_WNO_UNUSED])
> >  OVS_CONDITIONAL_CC_OPTION([-Wno-unused-parameter],
> [HAVE_WNO_UNUSED_PARAMETER])
> > +OVS_CONDITIONAL_CC_OPTION([-mavx512f], [HAVE_AVX512F])
> > +OVS_CHECK_CC_OPTION([-mavx512f], [CFLAGS="$CFLAGS -DHAVE_AVX512F"])
> 
> Do you need both checks?
> I thought the first one OVS_CONDITIONAL_CC_OPTION([-mavx512f],
> [HAVE_AVX512F])
> is good enough since at lib/automake.mk, you add the -mavx512f to CFLAGS.

From testing during development, both are required.
CONDITIONAL_CC_OPTION adds a build-system flag, indicating its present, but doesn't
seem to add a C #define for it, that can be used for conditional compilation?

The CHECK_CC_OPTION is used to manually add a #define via command-line -D parameter, it is used to add the avx512_gather probe function in the available lookup function struct.

There may be a more elegant way to achieve both in the same line, my AC-fu is somewhat outdated, suggestions welcome if you know of a better method :)

<snip some patch contents>

> > +#include "immintrin.h"
> > +
> > +/* Each AVX512 register (zmm register in assembly notation) can contain up
> to
> > + * 512 bits, which is equivelent to 8 uint64_t variables. This is the maximum
> 
> typo: equivalent

Will fix.


> > + * number of miniflow blocks that can be processed in a single pass of the
> > + * AVX512 code at a time.
> > + */
> > +#define NUM_U64_IN_ZMM_REG (8)
> > +#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST *
> NUM_U64_IN_ZMM_REG)
> > +
> > +
> > +VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
> > +
> > +static inline __m512i
> > +_mm512_popcnt_epi64_manual(__m512i v_in)
> > +{
> > +    static const uint8_t pop_lut[64] = {
> > +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> > +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> > +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> > +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> > +    };
> > +    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
> > +
> > +    __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4);
> > +    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
> > +    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
> > +    __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask);
> > +
> > +    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
> > +    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
> > +    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
> > +
> > +    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
> > +}
> 
> I forgot whether you mentioned this or not.
> But why create this manual popcnt?
> Isn't there a _mm512_popcnt_* in the library?

To answer your question directly:
The vector popcount instruction requires AVX512VPOPCNTDQ. Skylake does not include
the VPOPCNTDQ AVX512 extension. The "_manual" version enables the DPCLS to execute
on all AVX512 CPUs available today. In future, support for the AVX512 vector popcount can
be added with little effort.

The intrinsic guide for   _mm512_popcnt_epi64()  has more details: 
https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpopcnt&expand=4368

Note that it lists "CPUID Flags: AVX512VPOPCNTDQ", indicating a requirement on that ISA level.
It becomes available in the Ice Lake microarchitecture, more ISA details available here for those interested:
https://software.intel.com/content/www/us/en/develop/download/10th-generation-intel-core-processor-instruction-throughput-and-latency-docs.html


> The rest looks good to me,
> Thanks

Thanks for review.
William Tu July 1, 2020, 4:20 a.m. UTC | #3
On Tue, Jun 30, 2020 at 3:01 AM Van Haaren, Harry
<harry.van.haaren@intel.com> wrote:
>
> > -----Original Message-----
> > From: William Tu <u9012063@gmail.com>
> > Sent: Saturday, June 27, 2020 7:27 PM
> > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > Cc: ovs-dev <ovs-dev@openvswitch.org>; Stokes, Ian <ian.stokes@intel.com>;
> > Ilya Maximets <i.maximets@ovn.org>; Federico Iezzi <fiezzi@redhat.com>
> > Subject: Re: [PATCH v4 5/7] dpif-lookup: add avx512 gather implementation.
> >
> > On Thu, Jun 18, 2020 at 9:53 AM Harry van Haaren
> > <harry.van.haaren@intel.com> wrote:
> > >
> > > This commit adds an AVX-512 dpcls lookup implementation.
> > > It uses the AVX-512 SIMD ISA to perform multiple miniflow
> > > operations in parallel.
> > >
> > > To run this implementation, the "avx512f" and "bmi2" ISAs are
> > > required. These ISA checks are performed at runtime while
> > > probing the subtable implementation. If a CPU does not provide
> > > both "avx512f" and "bmi2", then this code does not execute.
> > >
> > > The avx512 code is built as a seperate static library, with added
> > > CFLAGS to enable the required ISA features. By building only this
> > > static library with avx512 enabled, it is ensured that the main OVS
> > > core library is *not* using avx512, and that OVS continues to run
> > > as before on CPUs that do not support avx512.
> > >
> > > The approach taken in this implementation is to use the
> > > gather instruction to access the packet miniflow, allowing
> > > any miniflow blocks to be loaded into an AVX-512 register.
> > > This maximises the usefulness of the register, and hence this
> > > implementation handles any subtable with up to miniflow 8 bits.
> > >
> > > Note that specialization of these avx512 lookup routines
> > > still provides performance value, as the hashing of the
> > > resulting data is performed in scalar code, and compile-time
> > > loop unrolling occurs when specialized to miniflow bits.
> > >
> > > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > >
> > > ---
> > >
> > > v4:
> > > - Remove TODO comment on prio-set command (was accidentally
> > >   added to this commit in v3)
> > > - Fixup v3 changlog to not include #warning comment (William Tu)
> > > - Remove #define for debugging in lookup.h
> > > - Fix builds on older gcc versions that don't support -mavx512f.
> > >   Solution involves CC_CHECK and #ifdefs in code (OVS Robot, William Tu)
> > >
> > > v3:
> > > - Improve function name for _any subtable lookup
> > > - Use "" include not <> for immintrin.h
> > > - Add checks for SSE42 instructions in core OVS for CRC32 based hashing
> > >   If not available, disable AVX512 lookup implementation as it requires
> > >   uses CRC32 for hashing, and the hashing algorithm must match core OVS.
> > > - Rework ovs_asserts() into function selection time check
> > > - Add #define for magic number 8, number of u64 blocks in AVX512 register
> > > - Add #if CHECKER around AVX code, sparse doesn't like checking it
> > > - Simplify avx512 enabled building, fixes builds with --enable-shared
> > > ---
> > >  configure.ac                           |   2 +
> > >  lib/automake.mk                        |  17 ++
> > >  lib/dpif-netdev-lookup-avx512-gather.c | 265 +++++++++++++++++++++++++
> > >  lib/dpif-netdev-lookup.c               |  17 ++
> > >  lib/dpif-netdev-lookup.h               |   4 +
> > >  5 files changed, 305 insertions(+)
> > >  create mode 100644 lib/dpif-netdev-lookup-avx512-gather.c
> > >
> > > diff --git a/configure.ac b/configure.ac
> > > index 81893e56e..1367c868b 100644
> > > --- a/configure.ac
> > > +++ b/configure.ac
> > > @@ -178,6 +178,8 @@ OVS_ENABLE_OPTION([-Wno-null-pointer-arithmetic])
> > >  OVS_ENABLE_OPTION([-Warray-bounds-pointer-arithmetic])
> > >  OVS_CONDITIONAL_CC_OPTION([-Wno-unused], [HAVE_WNO_UNUSED])
> > >  OVS_CONDITIONAL_CC_OPTION([-Wno-unused-parameter],
> > [HAVE_WNO_UNUSED_PARAMETER])
> > > +OVS_CONDITIONAL_CC_OPTION([-mavx512f], [HAVE_AVX512F])
> > > +OVS_CHECK_CC_OPTION([-mavx512f], [CFLAGS="$CFLAGS -DHAVE_AVX512F"])
> >
> > Do you need both checks?
> > I thought the first one OVS_CONDITIONAL_CC_OPTION([-mavx512f],
> > [HAVE_AVX512F])
> > is good enough since at lib/automake.mk, you add the -mavx512f to CFLAGS.
>
> From testing during development, both are required.
> CONDITIONAL_CC_OPTION adds a build-system flag, indicating its present, but doesn't
> seem to add a C #define for it, that can be used for conditional compilation?
>
> The CHECK_CC_OPTION is used to manually add a #define via command-line -D parameter, it is used to add the avx512_gather probe function in the available lookup function struct.
>
> There may be a more elegant way to achieve both in the same line, my AC-fu is somewhat outdated, suggestions welcome if you know of a better method :)
>
I see, thanks. I don't know any better way.

> <snip some patch contents>
>
> > > +#include "immintrin.h"
> > > +
> > > +/* Each AVX512 register (zmm register in assembly notation) can contain up
> > to
> > > + * 512 bits, which is equivelent to 8 uint64_t variables. This is the maximum
> >
> > typo: equivalent
>
> Will fix.
>
>
> > > + * number of miniflow blocks that can be processed in a single pass of the
> > > + * AVX512 code at a time.
> > > + */
> > > +#define NUM_U64_IN_ZMM_REG (8)
> > > +#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST *
> > NUM_U64_IN_ZMM_REG)
> > > +
> > > +
> > > +VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
> > > +
> > > +static inline __m512i
> > > +_mm512_popcnt_epi64_manual(__m512i v_in)
> > > +{
> > > +    static const uint8_t pop_lut[64] = {
> > > +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> > > +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> > > +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> > > +        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
> > > +    };
> > > +    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
> > > +
> > > +    __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4);
> > > +    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
> > > +    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
> > > +    __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask);
> > > +
> > > +    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
> > > +    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
> > > +    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
> > > +
> > > +    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
> > > +}
> >
> > I forgot whether you mentioned this or not.
> > But why create this manual popcnt?
> > Isn't there a _mm512_popcnt_* in the library?
>
> To answer your question directly:
> The vector popcount instruction requires AVX512VPOPCNTDQ. Skylake does not include
> the VPOPCNTDQ AVX512 extension. The "_manual" version enables the DPCLS to execute
> on all AVX512 CPUs available today. In future, support for the AVX512 vector popcount can
> be added with little effort.
>
> The intrinsic guide for   _mm512_popcnt_epi64()  has more details:
> https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=vpopcnt&expand=4368
>
> Note that it lists "CPUID Flags: AVX512VPOPCNTDQ", indicating a requirement on that ISA level.
> It becomes available in the Ice Lake microarchitecture, more ISA details available here for those interested:
> https://software.intel.com/content/www/us/en/develop/download/10th-generation-intel-core-processor-instruction-throughput-and-latency-docs.html

Thanks
William
diff mbox series

Patch

diff --git a/configure.ac b/configure.ac
index 81893e56e..1367c868b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -178,6 +178,8 @@  OVS_ENABLE_OPTION([-Wno-null-pointer-arithmetic])
 OVS_ENABLE_OPTION([-Warray-bounds-pointer-arithmetic])
 OVS_CONDITIONAL_CC_OPTION([-Wno-unused], [HAVE_WNO_UNUSED])
 OVS_CONDITIONAL_CC_OPTION([-Wno-unused-parameter], [HAVE_WNO_UNUSED_PARAMETER])
+OVS_CONDITIONAL_CC_OPTION([-mavx512f], [HAVE_AVX512F])
+OVS_CHECK_CC_OPTION([-mavx512f], [CFLAGS="$CFLAGS -DHAVE_AVX512F"])
 OVS_ENABLE_WERROR
 OVS_ENABLE_SPARSE
 OVS_CTAGS_IDENTIFIERS
diff --git a/lib/automake.mk b/lib/automake.mk
index 1fc1a209e..fab056b8a 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -11,6 +11,7 @@  lib_libopenvswitch_la_LIBADD = $(SSL_LIBS)
 lib_libopenvswitch_la_LIBADD += $(CAPNG_LDADD)
 lib_libopenvswitch_la_LIBADD += $(LIBBPF_LDADD)
 
+
 if WIN32
 lib_libopenvswitch_la_LIBADD += ${PTHREAD_LIBS}
 endif
@@ -20,6 +21,22 @@  lib_libopenvswitch_la_LDFLAGS = \
         -Wl,--version-script=$(top_builddir)/lib/libopenvswitch.sym \
         $(AM_LDFLAGS)
 
+if HAVE_AVX512F
+# Build library of avx512 code with CPU ISA CFLAGS enabled. This allows the
+# compiler to use the ISA features required for the ISA optimized code-paths.
+lib_LTLIBRARIES += lib/libopenvswitchavx512.la
+lib_libopenvswitch_la_LIBADD += lib/libopenvswitchavx512.la
+lib_libopenvswitchavx512_la_CFLAGS = \
+	-mavx512f \
+	-mavx512bw \
+	-mavx512dq \
+	-mbmi2 \
+	$(AM_CFLAGS)
+lib_libopenvswitchavx512_la_SOURCES = \
+	lib/dpif-netdev-lookup-avx512-gather.c
+endif
+
+# Build core vswitch libraries as before
 lib_libopenvswitch_la_SOURCES = \
 	lib/aes128.c \
 	lib/aes128.h \
diff --git a/lib/dpif-netdev-lookup-avx512-gather.c b/lib/dpif-netdev-lookup-avx512-gather.c
new file mode 100644
index 000000000..754cd0e3c
--- /dev/null
+++ b/lib/dpif-netdev-lookup-avx512-gather.c
@@ -0,0 +1,265 @@ 
+/*
+ * Copyright (c) 2020, Intel Corporation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+#if !defined(__CHECKER__)
+
+#include <config.h>
+
+#include "dpif-netdev.h"
+#include "dpif-netdev-lookup.h"
+#include "dpif-netdev-private.h"
+#include "cmap.h"
+#include "flow.h"
+#include "pvector.h"
+#include "openvswitch/vlog.h"
+
+#include "immintrin.h"
+
+/* Each AVX512 register (zmm register in assembly notation) can contain up to
+ * 512 bits, which is equivalent to 8 uint64_t variables. This is the maximum
+ * number of miniflow blocks that can be processed in a single pass of the
+ * AVX512 code at a time.
+ */
+#define NUM_U64_IN_ZMM_REG (8)
+#define BLOCKS_CACHE_SIZE (NETDEV_MAX_BURST * NUM_U64_IN_ZMM_REG)
+
+
+VLOG_DEFINE_THIS_MODULE(dpif_lookup_avx512_gather);
+
+/* Emulate the AVX512-VPOPCNTDQ "vpopcntq" instruction: a 16-entry nibble
+ * LUT (replicated per 128-bit lane for vpshufb) yields per-nibble
+ * popcounts, summed per byte, then _mm512_sad_epu8() against zero
+ * horizontally adds the 8 bytes of each u64 lane, producing one popcount
+ * per 64-bit lane.
+ *
+ * NOTE(review): _mm512_shuffle_epi8/_mm512_add_epi8/_mm512_sad_epu8 are
+ * AVX512-BW instructions -- the runtime probe should verify "avx512bw"
+ * availability, not only "avx512f".
+ */
+static inline __m512i
+_mm512_popcnt_epi64_manual(__m512i v_in)
+{
+    static const uint8_t pop_lut[64] = {
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+        0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
+    };
+    __m512i v_pop_lut = _mm512_loadu_si512(pop_lut);
+
+    /* Split each byte into low and high nibbles.  NOTE(review): the shift
+     * is by 4 bits (high nibble), so the "_srl8" name is misleading. */
+    __m512i v_in_srl8 = _mm512_srli_epi64(v_in, 4);
+    __m512i v_nibble_mask = _mm512_set1_epi8(0xF);
+    __m512i v_in_lo = _mm512_and_si512(v_in, v_nibble_mask);
+    __m512i v_in_hi = _mm512_and_si512(v_in_srl8, v_nibble_mask);
+
+    /* LUT lookup of each nibble's popcount, summed per byte. */
+    __m512i v_lo_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_lo);
+    __m512i v_hi_pop = _mm512_shuffle_epi8(v_pop_lut, v_in_hi);
+    __m512i v_u8_pop = _mm512_add_epi8(v_lo_pop, v_hi_pop);
+
+    /* SAD against zero sums the 8 byte popcounts of each u64 lane. */
+    return _mm512_sad_epu8(v_u8_pop, _mm512_setzero_si512());
+}
+
+/* Compare the linearized packet blocks in 'block_cache' against 'rule':
+ * AND the blocks with the rule's mask and compare against the rule's key.
+ * Returns 1 if all 'mf_bits_total' blocks match, 0 otherwise.
+ * 'block_cache' must point at a cache-line-aligned 8-u64 region (see
+ * avx512_lookup_impl()); 'mf_bits_total' must be <= 8.
+ */
+static inline uint64_t
+netdev_rule_matches_key(const struct dpcls_rule *rule,
+                        const uint32_t mf_bits_total,
+                        const uint64_t * block_cache)
+{
+    const uint64_t *keyp = miniflow_get_values(&rule->flow.mf);
+    const uint64_t *maskp = miniflow_get_values(&rule->mask->mf);
+    /* k-mask with one bit set per valid miniflow block. */
+    const uint32_t lane_mask = (1 << mf_bits_total) - 1;
+
+    /* Always load a full cache line from blocks_cache. Other loads must be
+     * trimmed to the amount of data required for mf_bits_total blocks.
+     */
+    __m512i v_blocks = _mm512_loadu_si512(&block_cache[0]);
+    __m512i v_mask   = _mm512_maskz_loadu_epi64(lane_mask, &maskp[0]);
+    __m512i v_key    = _mm512_maskz_loadu_epi64(lane_mask, &keyp[0]);
+
+    __m512i v_data = _mm512_and_si512(v_blocks, v_mask);
+    uint32_t res_mask = _mm512_mask_cmpeq_epi64_mask(lane_mask, v_data, v_key);
+
+    /* returns 1 assuming result of SIMD compare is all blocks */
+    return res_mask == lane_mask;
+}
+
+/* Core lookup routine: for each packet set in 'keys_map', gather its
+ * miniflow blocks into a linear 8-u64 region of 'block_cache', hash them,
+ * probe the subtable's cmap in a batch, and verify candidate rules.
+ * Returns a bitmask of packets for which a match was found; matching
+ * rules are stored into 'rules' at the packet's index.
+ *
+ * 'bit_count_u0'/'bit_count_u1' are the set-bit counts of units 0/1 of
+ * the subtable mask's miniflow map.  When called from a specialized
+ * wrapper they are compile-time constants, so this ALWAYS_INLINE function
+ * specializes and unrolls per (u0, u1) pair.
+ */
+static inline uint32_t ALWAYS_INLINE
+avx512_lookup_impl(struct dpcls_subtable *subtable,
+                   uint32_t keys_map,
+                   const struct netdev_flow_key *keys[],
+                   struct dpcls_rule **rules,
+                   const uint32_t bit_count_u0,
+                   const uint32_t bit_count_u1)
+{
+    /* 8 u64 blocks per packet; aligned so each packet's region is one
+     * cache line, matching the full-line loads in
+     * netdev_rule_matches_key(). */
+    OVS_ALIGNED_VAR(CACHE_LINE_SIZE)uint64_t block_cache[BLOCKS_CACHE_SIZE];
+
+    const uint32_t bit_count_total = bit_count_u0 + bit_count_u1;
+    int i;
+    uint32_t hashes[NETDEV_MAX_BURST];
+    const uint32_t n_pkts = __builtin_popcountll(keys_map);
+    ovs_assert(NETDEV_MAX_BURST >= n_pkts);
+
+    const uint64_t tbl_u0 = subtable->mask.mf.map.bits[0];
+    const uint64_t tbl_u1 = subtable->mask.mf.map.bits[1];
+
+    /* Load subtable blocks for masking later */
+    const uint64_t *tbl_blocks = miniflow_get_values(&subtable->mask.mf);
+    const __m512i v_tbl_blocks = _mm512_loadu_si512(&tbl_blocks[0]);
+
+    /* Load pre-created subtable masks for each block in subtable */
+    const __mmask8 bit_count_total_mask = (1 << bit_count_total) - 1;
+    const __m512i v_mf_masks = _mm512_maskz_loadu_epi64(bit_count_total_mask,
+                                                        subtable->mf_masks);
+
+    ULLONG_FOR_EACH_1 (i, keys_map) {
+        const uint64_t pkt_mf_u0_bits = keys[i]->mf.map.bits[0];
+        const uint64_t pkt_mf_u0_pop = __builtin_popcountll(pkt_mf_u0_bits);
+
+        /* Pre-create register with *PER PACKET* u0 offset */
+        const __mmask8 u1_bcast_mask = (UINT8_MAX << bit_count_u0);
+        const __m512i v_idx_u0_offset = _mm512_maskz_set1_epi64(u1_bcast_mask,
+                                                                pkt_mf_u0_pop);
+
+        /* Broadcast u0, u1 bitmasks to 8x u64 lanes */
+        __m512i v_u0 = _mm512_set1_epi64(pkt_mf_u0_bits);
+        __m512i v_pkt_bits = _mm512_mask_set1_epi64(v_u0, u1_bcast_mask,
+                                         keys[i]->mf.map.bits[1]);
+
+        /* Bitmask by pre-created masks */
+        __m512i v_masks = _mm512_and_si512(v_pkt_bits, v_mf_masks);
+
+        /* Manual AVX512 popcount for u64 lanes */
+        __m512i v_popcnts = _mm512_popcnt_epi64_manual(v_masks);
+
+        /* Offset popcounts for u1 with pre-created offset register */
+        __m512i v_indexes = _mm512_add_epi64(v_popcnts, v_idx_u0_offset);
+
+        /* Gather u64 blocks from packet miniflow (scale 8 = sizeof u64) */
+        const __m512i v_zeros = _mm512_setzero_si512();
+        const uint64_t *pkt_data = miniflow_get_values(&keys[i]->mf);
+        __m512i v_all_blocks = _mm512_mask_i64gather_epi64(v_zeros,
+                                   bit_count_total_mask, v_indexes,
+                                   pkt_data, 8);
+
+        /* Zero out bits that pkt doesn't have:
+         * - 2x pext() to extract bits from packet miniflow as needed by TBL
+         * - Shift u1 over by bit_count of u0, OR to create zero bitmask
+         */
+         uint64_t u0_to_zero = _pext_u64(keys[i]->mf.map.bits[0], tbl_u0);
+         uint64_t u1_to_zero = _pext_u64(keys[i]->mf.map.bits[1], tbl_u1);
+         uint64_t zero_mask = (u1_to_zero << bit_count_u0) | u0_to_zero;
+
+        /* Mask blocks using AND with subtable blocks, use k-mask to zero
+         * where lanes as required for this packet.
+         */
+        __m512i v_masked_blocks = _mm512_maskz_and_epi64(zero_mask,
+                                                v_all_blocks, v_tbl_blocks);
+
+        /* Store to blocks cache, full cache line aligned */
+        _mm512_storeu_si512(&block_cache[i * 8], v_masked_blocks);
+    }
+
+    /* Hash the now linearized blocks of packet metadata. */
+    ULLONG_FOR_EACH_1 (i, keys_map) {
+        uint64_t *block_ptr = &block_cache[i * 8];
+        uint32_t hash = hash_add_words64(0, block_ptr, bit_count_total);
+        hashes[i] = hash_finish(hash, bit_count_total * 8);
+    }
+
+    /* Lookup: this returns a bitmask of packets where the hash table had
+     * an entry for the given hash key. Presence of a hash key does not
+     * guarantee matching the key, as there can be hash collisions.
+     */
+    uint32_t found_map;
+    const struct cmap_node *nodes[NETDEV_MAX_BURST];
+    found_map = cmap_find_batch(&subtable->rules, keys_map, hashes, nodes);
+
+    /* Verify that packet actually matched rule. If not found, a hash
+     * collision has taken place, so continue searching with the next node.
+     */
+    ULLONG_FOR_EACH_1 (i, found_map) {
+        struct dpcls_rule *rule;
+
+        CMAP_NODE_FOR_EACH (rule, cmap_node, nodes[i]) {
+            const uint32_t cidx = i * 8;
+            uint32_t match = netdev_rule_matches_key(rule, bit_count_total,
+                                                     &block_cache[cidx]);
+            if (OVS_LIKELY(match)) {
+                rules[i] = rule;
+                subtable->hit_cnt++;
+                goto next;
+            }
+        }
+
+        /* None of the found rules was a match.  Clear the i-th bit to
+         * search for this key in the next subtable. */
+        ULLONG_SET0(found_map, i);
+    next:
+        ;                     /* Keep Sparse happy. */
+    }
+
+    return found_map;
+}
+
+/* Expand out specialized functions with U0 and U1 bit attributes.  Each
+ * wrapper passes U0/U1 as compile-time constants into the ALWAYS_INLINE
+ * avx512_lookup_impl(), producing a version specialized to that subtable
+ * miniflow shape. */
+#define DECLARE_OPTIMIZED_LOOKUP_FUNCTION(U0, U1)                             \
+    static uint32_t                                                           \
+    dpcls_avx512_gather_skx_mf_##U0##_##U1(                                   \
+                                         struct dpcls_subtable *subtable,     \
+                                         uint32_t keys_map,                   \
+                                         const struct netdev_flow_key *keys[],\
+                                         struct dpcls_rule **rules)           \
+    {                                                                         \
+        return avx512_lookup_impl(subtable, keys_map, keys, rules, U0, U1);   \
+    }                                                                         \
+
+/* Specializations for the (unit0, unit1) block counts seen in practice. */
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(5, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 1)
+DECLARE_OPTIMIZED_LOOKUP_FUNCTION(4, 0)
+
+/* Check if a specialized function is valid for the required subtable.
+ * Expects 'f', 'u0_bits' and 'u1_bits' to be in scope at the expansion
+ * site (see dpcls_subtable_avx512_gather_probe()); assigns 'f' only if it
+ * is still NULL and the bit counts match exactly. */
+#define CHECK_LOOKUP_FUNCTION(U0, U1)                                         \
+    ovs_assert((U0 + U1) <= NUM_U64_IN_ZMM_REG);                              \
+    if (!f && u0_bits == U0 && u1_bits == U1) {                               \
+        f = dpcls_avx512_gather_skx_mf_##U0##_##U1;                           \
+    }
+
+/* Generic (non-specialized) lookup: passes the subtable's runtime bit
+ * counts into avx512_lookup_impl(), so it handles any miniflow shape that
+ * fits in one ZMM register, at the cost of no compile-time unrolling. */
+static uint32_t
+dpcls_avx512_gather_mf_any(struct dpcls_subtable *subtable, uint32_t keys_map,
+                           const struct netdev_flow_key *keys[],
+                           struct dpcls_rule **rules)
+{
+    return avx512_lookup_impl(subtable, keys_map, keys, rules,
+                              subtable->mf_bits_set_unit0,
+                              subtable->mf_bits_set_unit1);
+}
+
+/* Probe: return an AVX-512 gather lookup function for a subtable with the
+ * given unit0/unit1 miniflow bit counts, or NULL if the required CPU ISA
+ * features are unavailable or the subtable shape is unsupported.
+ *
+ * The implementation uses AVX512-BW instructions (_mm512_shuffle_epi8()
+ * et al. in the manual popcount, matching the -mavx512bw build flag), so
+ * "avx512bw" must be verified at runtime in addition to "avx512f" and
+ * "bmi2" to avoid SIGILL on AVX512F-only CPUs. */
+dpcls_subtable_lookup_func
+dpcls_subtable_avx512_gather_probe(uint32_t u0_bits, uint32_t u1_bits)
+{
+    dpcls_subtable_lookup_func f = NULL;
+
+    int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+    int avx512bw_available = dpdk_get_cpu_has_isa("x86_64", "avx512bw");
+    int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+    if (!avx512f_available || !avx512bw_available || !bmi2_available) {
+        return NULL;
+    }
+
+    CHECK_LOOKUP_FUNCTION(5, 1);
+    CHECK_LOOKUP_FUNCTION(4, 1);
+    CHECK_LOOKUP_FUNCTION(4, 0);
+
+    /* NOTE(review): '<' excludes subtables with exactly 8 total blocks,
+     * although a ZMM register holds 8 u64s and the specialized-path assert
+     * permits (U0 + U1) == NUM_U64_IN_ZMM_REG -- confirm whether this
+     * should be '<='. */
+    if (!f && (u0_bits + u1_bits) < NUM_U64_IN_ZMM_REG) {
+        f = dpcls_avx512_gather_mf_any;
+        /* VLOG appends its own newline; the message must not contain one. */
+        VLOG_INFO("Using avx512_gather_mf_any for subtable (%d,%d)",
+                  u0_bits, u1_bits);
+    }
+
+    return f;
+}
+
+#endif /* CHECKER */
+#endif /* __x86_64__ */
diff --git a/lib/dpif-netdev-lookup.c b/lib/dpif-netdev-lookup.c
index dfdbc73a1..2c740399b 100644
--- a/lib/dpif-netdev-lookup.c
+++ b/lib/dpif-netdev-lookup.c
@@ -42,6 +42,23 @@  static struct dpcls_subtable_lookup_info_t subtable_lookups[] = {
     { .prio = 1,
       .probe = dpcls_subtable_generic_probe,
       .name = "generic", },
+
+#ifdef __x86_64__
+#if HAVE_AVX512F
+#ifdef __SSE4_2__
+    /* Only available on x86_64 bit builds with SSE 4.2 used for OVS core. */
+    { .prio = 0,
+      .probe = dpcls_subtable_avx512_gather_probe,
+      .name = "avx512_gather", },
+#else
+    /* Disabling AVX512 at compile time, due to core OVS not using SSE42
+     * instruction set. The SSE42 instructions are required to use CRC32
+     * ISA for high performance hashing. Consider ./configure of OVS with
+     * -msse42 (or newer) to enable CRC32 hashing and higher performance.
+     */
+#endif
+#endif
+#endif
 };
 
 int32_t
diff --git a/lib/dpif-netdev-lookup.h b/lib/dpif-netdev-lookup.h
index 61f44b9e8..bd72aa29b 100644
--- a/lib/dpif-netdev-lookup.h
+++ b/lib/dpif-netdev-lookup.h
@@ -42,6 +42,10 @@  dpcls_subtable_autovalidator_probe(uint32_t u0_bit_count,
 dpcls_subtable_lookup_func
 dpcls_subtable_generic_probe(uint32_t u0_bit_count, uint32_t u1_bit_count);
 
+/* Probe function for AVX-512 gather implementation */
+dpcls_subtable_lookup_func
+dpcls_subtable_avx512_gather_probe(uint32_t u0_bit_cnt, uint32_t u1_bit_cnt);
+
 
 /* Subtable registration and iteration helpers */
 struct dpcls_subtable_lookup_info_t {