diff mbox series

[ovs-dev,v3] odp-execute: Add ISA implementation of set_masked IPv6 action

Message ID 20220926132946.1767182-1-emma.finn@intel.com
State Superseded
Headers show
Series [ovs-dev,v3] odp-execute: Add ISA implementation of set_masked IPv6 action | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Emma Finn Sept. 26, 2022, 1:29 p.m. UTC
This commit adds support for the AVX512 implementation of the
ipv6_set_addrs action as well as an AVX512 implementation of
updating the L4 checksums.

Signed-off-by: Emma Finn <emma.finn@intel.com>

---
v3:
  - Added a runtime check for AVX512 vbmi.
v2:
  - Added check for availbility of s6_addr32 field of struct in6_addr.
  - Fixed network headers for freebsd builds.
---
---
 lib/odp-execute-avx512.c  | 176 ++++++++++++++++++++++++++++++++++++++
 lib/odp-execute-private.c |  17 ++++
 lib/odp-execute-private.h |   1 +
 3 files changed, 194 insertions(+)

Comments

Eelco Chaudron Nov. 17, 2022, 9:20 a.m. UTC | #1
On 26 Sep 2022, at 15:29, Emma Finn wrote:

> This commit adds support for the AVX512 implementation of the
> ipv6_set_addrs action as well as an AVX512 implementation of
> updating the L4 checksums.
>
> Signed-off-by: Emma Finn <emma.finn@intel.com>

Hi Emma,

Thanks for further enhancing the implementation of the AVX512 actions. Below are some comments, mostly style related, but with one additional optimization.

Cheers,

Eelco

> ---
> v3:
>   - Added a runtime check for AVX512 vbmi.
> v2:
>   - Added check for availbility of s6_addr32 field of struct in6_addr.
>   - Fixed network headers for freebsd builds.
> ---
> ---
>  lib/odp-execute-avx512.c  | 176 ++++++++++++++++++++++++++++++++++++++
>  lib/odp-execute-private.c |  17 ++++
>  lib/odp-execute-private.h |   1 +
>  3 files changed, 194 insertions(+)
>
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index 6c7713251..f97b3c2f7 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -20,6 +20,9 @@
>
>  #include <config.h>
>  #include <errno.h>
> +#include <sys/types.h>
> +#include <netinet/in.h>
> +#include <netinet/ip6.h>
>
>  #include "csum.h"
>  #include "dp-packet.h"
> @@ -483,6 +486,172 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
>      }
>  }
>
> +#if HAVE_AVX512VBMI
> +static inline uint16_t ALWAYS_INLINE
> +__attribute__((__target__("avx512vbmi")))
> +avx512_ipv6_get_delta(__m512i ip6_header)

I guess the function name was from before you split up this function, as it's not at all what you're doing in this function.
I would suggest changing the name to something like avx512_ipv6_sum_header().

Also, can you go over the register naming and comment text below, as they also make no sense in the current form?

> +{
> +    __m256i v_zeros = _mm256_setzero_si256();
> +    __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04,
> +                                               0xFF, 0xFF, 0xFF, 0xFF);
> +
> +    __m512i v_header = _mm512_permutexvar_epi64(v_shuf_src_dst, ip6_header);
> +    __m256i v_ip6_src_dst =  _mm512_extracti64x4_epi64(v_header, 0);

Remove the extra space after the equal sign.

Please add a new line before the comment.

> +    /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the
> +     * src and dst fields and add padding after each 16-bit value for the
> +     * following carry over addition. */
> +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF,
> +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF,
> +                                          0x0100, 0xFFFF, 0x0302, 0xFFFF,
> +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF);
> +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
> +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
> +                                          0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
> +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF);
> +    __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a);
> +    __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b);
> +
> +    /* Add each part of the old and new headers together. */
> +    __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2);
> +
> +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +
> +    /* Shuffle 32-bit value from 3rd lane into first lane for final
> +     * horizontal add. */
> +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> +                                          0xF, 0xF, 0xF, 0xF);
> +    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
> +
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +    v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
> +
> +    /* Extract delta value. */
> +    return _mm256_extract_epi16(v_delta, 0);
> +}
> +
> +static inline uint16_t ALWAYS_INLINE
> +__attribute__((__target__("avx512vbmi")))
> +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header)
> +{
> +    uint16_t delta;
> +    uint16_t old_delta = avx512_ipv6_get_delta(old_header);
> +    uint16_t new_delta = avx512_ipv6_get_delta(new_header);
> +    old_delta = ~old_delta;
> +    uint32_t csum_delta = old_delta + new_delta;
> +    delta = csum_finish(csum_delta);
> +
> +    return ~delta;

This function looks rather cluttered, what about the following:

{
    uint16_t old_delta = avx512_ipv6_get_delta(old_header);
    uint16_t new_delta = avx512_ipv6_get_delta(new_header);
    uint32_t csum_delta = ~old_delta + new_delta;

    return ~csum_finish(csum_delta);
}

> +}
> +
> +/* This function performs the same operation on each packet in the batch as
> + * the scalar odp_set_ipv6() function. */
> +static void
> +__attribute__((__target__("avx512vbmi")))
> +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
> +                             const struct nlattr *a)
> +{
> +    const struct ovs_key_ipv6 *key, *mask;
> +    struct dp_packet *packet;

Add a new line between definitions and code.

> +    a = nl_attr_get(a);
> +    key = nl_attr_get(a);
> +    mask = odp_get_key_mask(a, struct ovs_key_ipv6);

We have build asserts for the ovs_key_ipv4 key structure to make sure they do not change, we should add the same for v6.

> +
> +    /* Read the content of the key and mask in the respective registers. We
> +     * only load the size of the actual structure, which is only 40 bytes. */
> +    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
> +    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
> +
> +    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
> +     * ip6_hdr structure layout. */
> +    static const uint8_t ip_shuffle_mask[64] = {
> +            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
> +            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> +            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
> +            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
> +            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
> +    };
> +
> +    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
> +
> +    /* This shuffle is required for key and mask to match the layout of the
> +     * ip6_hdr struct. */
> +    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
> +    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
> +
> +        /* Load the 40 bytes of the IPv6 header. */
> +        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh);
> +
> +        /* AND the v_pkt_mask to the packet data (v_packet). */
> +        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet);
> +
> +        /* OR the new addresses (v_key_shuf) with the masked packet addresses
> +         * (v_pkt_masked). */
> +        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked);
> +
> +        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
> +         * be updated. */
> +        bool do_checksum = false;

So as we are trying to optimise code, this only needs to be done once, so we should move this outside the DP_PACKET_BATCH_FOR_EACH() loop.

> +#ifdef s6_addr32
> +        for (int j = 0; j < 4; j++) {
> +            if (mask->ipv6_dst.s6_addr32[j] || mask->ipv6_src.s6_addr32[j]) {
> +                do_checksum = true;
> +            }
> +        }
> +#else
> +        for (int j = 0; j < 16; j++) {
> +             if (mask->ipv6_dst.s6_addr[j] || mask->ipv6_src.s6_addr[j]) {
> +                do_checksum = true;
> +            }
> +        }
> +#endif

Not sure how fast slow/fast the above is compared with doing an AVX512 AND on the v_mask with a new v_address_mask + popcount?

> +        if (do_checksum) {
> +            uint8_t proto = nh->ip6_nxt;
> +            uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
> +                                                                  v_new_hdr);
> +
> +            if (proto == IPPROTO_UDP) {
> +                struct udp_header *uh = dp_packet_l4(packet);

Add a new line here.

> +                if (uh->udp_csum) {
> +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> +                    uint32_t udp_checksum = old_udp_checksum + delta_checksum;

Add a new line here.

> +                    udp_checksum = csum_finish(udp_checksum);
> +
> +                    if (!udp_checksum) {
> +                        udp_checksum = htons(0xffff);
> +                    }
> +
> +                    uh->udp_csum = udp_checksum;
> +                }
> +            } else if (proto == IPPROTO_TCP) {
> +                struct tcp_header *th = dp_packet_l4(packet);
> +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> +                uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;

Add a new line here.

> +                tcp_checksum = csum_finish(tcp_checksum);
> +

Remove the new line.

> +                th->tcp_csum = tcp_checksum;
> +            } else if (proto == IPPROTO_ICMPV6) {
> +                struct icmp6_header *icmp = dp_packet_l4(packet);
> +                uint16_t old_icmp_checksum = ~icmp->icmp6_cksum;

Keep name consistency, so I would call it old_icmp6_checksum.

> +                uint32_t icmp6_checksum = old_icmp_checksum + delta_checksum;

Add a new line here.

> +                icmp6_checksum = csum_finish(icmp6_checksum);
> +

Remove the new line.

> +                icmp->icmp6_cksum = icmp6_checksum;
> +            }
> +        }
> +        /* Write back the modified IPv6 addresses. */
> +         _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
> +    }
> +}
> +#endif

As the #if is way out of sight, I would make this +#endif /* HAVE_AVX512VBMI */

> +
>  static void
>  action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a)
>  {
> @@ -514,6 +683,13 @@ action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
>      impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs;
>      impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs;
>
> +#if HAVE_AVX512VBMI
> +    if (action_avx512vbmi_isa_probe()) {
> +        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
> +                              action_avx512_ipv6_set_addrs;
> +    }
> +#endif
> +
>      return 0;
>  }
>
> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> index f80ae5a23..ff29e116f 100644
> --- a/lib/odp-execute-private.c
> +++ b/lib/odp-execute-private.c
> @@ -60,6 +60,23 @@ action_avx512_isa_probe(void)
>
>  #endif
>
> +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> +    if (!cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
> +        return false;
> +    }
> +    return true;
> +}

just a nit, but I would make this as follows:

bool
action_avx512vbmi_isa_probe(void)
{
    if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
        return true;
    }
    return false;
}

> +#else
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> +    return false;
> +}
> +#endif
> +
>  static struct odp_execute_action_impl action_impls[] = {
>      [ACTION_IMPL_AUTOVALIDATOR] = {
>          .available = false,
> diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
> index 940180c99..643f41c2a 100644
> --- a/lib/odp-execute-private.h
> +++ b/lib/odp-execute-private.h
> @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
>  #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
>
>  bool action_avx512_isa_probe(void);
> +bool action_avx512vbmi_isa_probe(void);
>
>  /* Odp execute init handles setting up the state of the actions functions at
>   * initialization time. It cannot return errors, as it must always succeed in
> -- 
> 2.25.1
Emma Finn Nov. 22, 2022, 3:10 p.m. UTC | #2
> -----Original Message-----
> From: Eelco Chaudron <echaudro@redhat.com>
> Sent: Thursday 17 November 2022 09:21
> To: Finn, Emma <emma.finn@intel.com>
> Cc: dev@openvswitch.org; david.marchand@redhat.com;
> i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of
> set_masked IPv6 action
> 
> On 26 Sep 2022, at 15:29, Emma Finn wrote:
> 
> > This commit adds support for the AVX512 implementation of the
> > ipv6_set_addrs action as well as an AVX512 implementation of updating
> > the L4 checksums.
> >
> > Signed-off-by: Emma Finn <emma.finn@intel.com>
> 
> Hi Emma,
> 
> Thanks for further enhancing the implementation of the AVX512 actions.
> Below are some comments, mostly style related, but with one additional
> optimization.
> 
> Cheers,
> 
> Eelco
> 

Thanks for the review Eelco. Sure, I will clean up and change all the style related comments. 
Some other replies inline below. 

Thanks, 
Emma 

> > ---
> > v3:
> >   - Added a runtime check for AVX512 vbmi.
> > v2:
> >   - Added check for availbility of s6_addr32 field of struct in6_addr.
> >   - Fixed network headers for freebsd builds.
> > ---
> > ---
> >  lib/odp-execute-avx512.c  | 176
> > ++++++++++++++++++++++++++++++++++++++
> >  lib/odp-execute-private.c |  17 ++++
> >  lib/odp-execute-private.h |   1 +
> >  3 files changed, 194 insertions(+)
> >
> > diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index
> > 6c7713251..f97b3c2f7 100644
> > --- a/lib/odp-execute-avx512.c
> > +++ b/lib/odp-execute-avx512.c
> > @@ -20,6 +20,9 @@
> >
> >  #include <config.h>
> >  #include <errno.h>
> > +#include <sys/types.h>
> > +#include <netinet/in.h>
> > +#include <netinet/ip6.h>
> >
> >  #include "csum.h"
> >  #include "dp-packet.h"
> > @@ -483,6 +486,172 @@ action_avx512_ipv4_set_addrs(struct
> dp_packet_batch *batch,
> >      }
> >  }
> >
> > +#if HAVE_AVX512VBMI
> > +static inline uint16_t ALWAYS_INLINE
> > +__attribute__((__target__("avx512vbmi")))
> > +avx512_ipv6_get_delta(__m512i ip6_header)
> 
> I guess the function name was from before you split up this function, as it's
> not at all what you're doing in this function.
> I would suggest changing the name to something like
> avx512_ipv6_sum_header().
> 
> Also, can you go over the register naming and comment text below, as they
> also make no sense in the current form?
> 
> > +{
> > +    __m256i v_zeros = _mm256_setzero_si256();
> > +    __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04,
> > +                                               0xFF, 0xFF, 0xFF,
> > +0xFF);
> > +
> > +    __m512i v_header = _mm512_permutexvar_epi64(v_shuf_src_dst,
> ip6_header);
> > +    __m256i v_ip6_src_dst =  _mm512_extracti64x4_epi64(v_header, 0);
> 
> Remove the extra space after the equal sign.
> 
> Please add a new line before the comment.
> 
> > +    /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle
> the
> > +     * src and dst fields and add padding after each 16-bit value for the
> > +     * following carry over addition. */
> > +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302,
> 0xFFFF,
> > +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF,
> > +                                          0x0100, 0xFFFF, 0x0302, 0xFFFF,
> > +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF);
> > +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A,
> 0xFFFF,
> > +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
> > +                                          0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
> > +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF);
> > +    __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst,
> v_swap16a);
> > +    __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst,
> > + v_swap16b);
> > +
> > +    /* Add each part of the old and new headers together. */
> > +    __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2);
> > +
> > +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> > +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> > +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> > +
> > +    /* Shuffle 32-bit value from 3rd lane into first lane for final
> > +     * horizontal add. */
> > +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> > +                                          0xF, 0xF, 0xF, 0xF);
> > +    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
> > +
> > +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> > +    v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
> > +
> > +    /* Extract delta value. */
> > +    return _mm256_extract_epi16(v_delta, 0); }
> > +
> > +static inline uint16_t ALWAYS_INLINE
> > +__attribute__((__target__("avx512vbmi")))
> > +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i
> new_header) {
> > +    uint16_t delta;
> > +    uint16_t old_delta = avx512_ipv6_get_delta(old_header);
> > +    uint16_t new_delta = avx512_ipv6_get_delta(new_header);
> > +    old_delta = ~old_delta;
> > +    uint32_t csum_delta = old_delta + new_delta;
> > +    delta = csum_finish(csum_delta);
> > +
> > +    return ~delta;
> 
> This function looks rather cluttered, what about the following:
> 
> {
>     uint16_t old_delta = avx512_ipv6_get_delta(old_header);
>     uint16_t new_delta = avx512_ipv6_get_delta(new_header);
>     uint32_t csum_delta = ~old_delta + new_delta;
> 
Sure I will tidy this up. The above line will need to have cast around old_delta though. 
Otherwise it will invert 32bits instead of 16bits of old_delta, which will make the addition wrong. 
Something like this:
	uint32_t csum_delta = (uint16_t)~old_delta + new_delta;	

>     return ~csum_finish(csum_delta);
> }
> 
> > +}
> > +
> > +/* This function performs the same operation on each packet in the
> > +batch as
> > + * the scalar odp_set_ipv6() function. */ static void
> > +__attribute__((__target__("avx512vbmi")))
> > +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
> > +                             const struct nlattr *a) {
> > +    const struct ovs_key_ipv6 *key, *mask;
> > +    struct dp_packet *packet;
> 
> Add a new line between definitions and code.
> 
> > +    a = nl_attr_get(a);
> > +    key = nl_attr_get(a);
> > +    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
> 
> We have build asserts for the ovs_key_ipv4 key structure to make sure they
> do not change, we should add the same for v6.
> 
> > +
> > +    /* Read the content of the key and mask in the respective registers. We
> > +     * only load the size of the actual structure, which is only 40 bytes. */
> > +    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
> > +    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
> > +
> > +    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
> > +     * ip6_hdr structure layout. */
> > +    static const uint8_t ip_shuffle_mask[64] = {
> > +            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
> > +            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> > +            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
> > +            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
> > +            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
> > +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
> > +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> > +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
> > +    };
> > +
> > +    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
> > +
> > +    /* This shuffle is required for key and mask to match the layout of the
> > +     * ip6_hdr struct. */
> > +    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
> > +    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle,
> v_mask);
> > +
> > +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
> > +
> > +        /* Load the 40 bytes of the IPv6 header. */
> > +        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *)
> > + nh);
> > +
> > +        /* AND the v_pkt_mask to the packet data (v_packet). */
> > +        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf,
> > + v_packet);
> > +
> > +        /* OR the new addresses (v_key_shuf) with the masked packet
> addresses
> > +         * (v_pkt_masked). */
> > +        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf,
> > + v_pkt_masked);
> > +
> > +        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
> > +         * be updated. */
> > +        bool do_checksum = false;
> 
> So as we are trying to optimise code, this only needs to be done once, so we
> should move this outside the DP_PACKET_BATCH_FOR_EACH() loop.
> 
The compiler is likely already lifting that code out of the loop as it is loop invariant.
But moving outside the loop will make it clearer,  so yes will do.

> > +#ifdef s6_addr32
> > +        for (int j = 0; j < 4; j++) {
> > +            if (mask->ipv6_dst.s6_addr32[j] || mask->ipv6_src.s6_addr32[j]) {
> > +                do_checksum = true;
> > +            }
> > +        }
> > +#else
> > +        for (int j = 0; j < 16; j++) {
> > +             if (mask->ipv6_dst.s6_addr[j] || mask->ipv6_src.s6_addr[j]) {
> > +                do_checksum = true;
> > +            }
> > +        }
> > +#endif
> 
> Not sure how fast slow/fast the above is compared with doing an AVX512
> AND on the v_mask with a new v_address_mask + popcount?
> 
Ah yes good idea. To keep the same behaviour, I think we want to bitwise-OR the two registers (not bitwise-AND) and then check if any bits are set?
If any bit is set in src/dst mask, we need to do_checksum.
Something like this
	v_dst = Loadu_si128(dst)
	v_src = Loadu_si128(src)
	v_or = _or_si128(v_dst, v_src)

	/* generate all ones register from cmpeq of v_zeros vs itself? */
	 v_zeros = _setzero_si128()	
    	v_all_ones = _cmpeq_epi(v_zeros, v_zeros);
	int do_checksum = _mm_test_all_zeros(v_or, v_all_ones);

Does this approach make sense to you?

> > +        if (do_checksum) {
> > +            uint8_t proto = nh->ip6_nxt;
> > +            uint16_t delta_checksum =
> avx512_ipv6_addr_csum_delta(v_packet,
> > +
> > + v_new_hdr);
> > +
> > +            if (proto == IPPROTO_UDP) {
> > +                struct udp_header *uh = dp_packet_l4(packet);
> 
> Add a new line here.
> 
> > +                if (uh->udp_csum) {
> > +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> > +                    uint32_t udp_checksum = old_udp_checksum +
> > + delta_checksum;
> 
> Add a new line here.
> 
> > +                    udp_checksum = csum_finish(udp_checksum);
> > +
> > +                    if (!udp_checksum) {
> > +                        udp_checksum = htons(0xffff);
> > +                    }
> > +
> > +                    uh->udp_csum = udp_checksum;
> > +                }
> > +            } else if (proto == IPPROTO_TCP) {
> > +                struct tcp_header *th = dp_packet_l4(packet);
> > +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> > +                uint32_t tcp_checksum = old_tcp_checksum +
> > + delta_checksum;
> 
> Add a new line here.
> 
> > +                tcp_checksum = csum_finish(tcp_checksum);
> > +
> 
> Remove the new line.
> 
> > +                th->tcp_csum = tcp_checksum;
> > +            } else if (proto == IPPROTO_ICMPV6) {
> > +                struct icmp6_header *icmp = dp_packet_l4(packet);
> > +                uint16_t old_icmp_checksum = ~icmp->icmp6_cksum;
> 
> Keep name consistency, so I would call it old_icmp6_checksum.
> 
> > +                uint32_t icmp6_checksum = old_icmp_checksum +
> > + delta_checksum;
> 
> Add a new line here.
> 
> > +                icmp6_checksum = csum_finish(icmp6_checksum);
> > +
> 
> Remove the new line.
> 
> > +                icmp->icmp6_cksum = icmp6_checksum;
> > +            }
> > +        }
> > +        /* Write back the modified IPv6 addresses. */
> > +         _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
> > +    }
> > +}
> > +#endif
> 
> As the #if is way out of sight, I would make this +#endif /*
> HAVE_AVX512VBMI */
> 
> > +
> >  static void
> >  action_avx512_set_masked(struct dp_packet_batch *batch, const struct
> > nlattr *a)  { @@ -514,6 +683,13 @@ action_avx512_init(struct
> > odp_execute_action_impl *self OVS_UNUSED)
> >      impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
> action_avx512_eth_set_addrs;
> >      impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] =
> > action_avx512_ipv4_set_addrs;
> >
> > +#if HAVE_AVX512VBMI
> > +    if (action_avx512vbmi_isa_probe()) {
> > +        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
> > +                              action_avx512_ipv6_set_addrs;
> > +    }
> > +#endif
> > +
> >      return 0;
> >  }
> >
> > diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> > index f80ae5a23..ff29e116f 100644
> > --- a/lib/odp-execute-private.c
> > +++ b/lib/odp-execute-private.c
> > @@ -60,6 +60,23 @@ action_avx512_isa_probe(void)
> >
> >  #endif
> >
> > +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI bool
> > +action_avx512vbmi_isa_probe(void)
> > +{
> > +    if (!cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
> > +        return false;
> > +    }
> > +    return true;
> > +}
> 
> just a nit, but I would make this as follows:
> 
> bool
> action_avx512vbmi_isa_probe(void)
> {
>     if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
>         return true;
>     }
>     return false;
> }
> 
> > +#else
> > +bool
> > +action_avx512vbmi_isa_probe(void)
> > +{
> > +    return false;
> > +}
> > +#endif
> > +
> >  static struct odp_execute_action_impl action_impls[] = {
> >      [ACTION_IMPL_AUTOVALIDATOR] = {
> >          .available = false,
> > diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
> > index 940180c99..643f41c2a 100644
> > --- a/lib/odp-execute-private.h
> > +++ b/lib/odp-execute-private.h
> > @@ -78,6 +78,7 @@
> BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
> > #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
> >
> >  bool action_avx512_isa_probe(void);
> > +bool action_avx512vbmi_isa_probe(void);
> >
> >  /* Odp execute init handles setting up the state of the actions functions at
> >   * initialization time. It cannot return errors, as it must always
> > succeed in
> > --
> > 2.25.1
Eelco Chaudron Nov. 23, 2022, 1:55 p.m. UTC | #3
On 22 Nov 2022, at 16:10, Finn, Emma wrote:

>> -----Original Message-----
>> From: Eelco Chaudron <echaudro@redhat.com>
>> Sent: Thursday 17 November 2022 09:21
>> To: Finn, Emma <emma.finn@intel.com>
>> Cc: dev@openvswitch.org; david.marchand@redhat.com;
>> i.maximets@ovn.org
>> Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of
>> set_masked IPv6 action
>>
>> On 26 Sep 2022, at 15:29, Emma Finn wrote:
>>
>>> This commit adds support for the AVX512 implementation of the
>>> ipv6_set_addrs action as well as an AVX512 implementation of updating
>>> the L4 checksums.
>>>
>>> Signed-off-by: Emma Finn <emma.finn@intel.com>
>>
>> Hi Emma,
>>
>> Thanks for further enhancing the implementation of the AVX512 actions.
>> Below are some comments, mostly style related, but with one additional
>> optimization.
>>
>> Cheers,
>>
>> Eelco
>>
>
> Thanks for the review Eelco. Sure, I will clean up and change all the style related comments.
> Some other replies inline below.

See inline comments below…

>>> ---
>>> v3:
>>>   - Added a runtime check for AVX512 vbmi.
>>> v2:
>>>   - Added check for availbility of s6_addr32 field of struct in6_addr.
>>>   - Fixed network headers for freebsd builds.
>>> ---
>>> ---
>>>  lib/odp-execute-avx512.c  | 176
>>> ++++++++++++++++++++++++++++++++++++++
>>>  lib/odp-execute-private.c |  17 ++++
>>>  lib/odp-execute-private.h |   1 +
>>>  3 files changed, 194 insertions(+)
>>>
>>> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index
>>> 6c7713251..f97b3c2f7 100644
>>> --- a/lib/odp-execute-avx512.c
>>> +++ b/lib/odp-execute-avx512.c
>>> @@ -20,6 +20,9 @@
>>>
>>>  #include <config.h>
>>>  #include <errno.h>
>>> +#include <sys/types.h>
>>> +#include <netinet/in.h>
>>> +#include <netinet/ip6.h>
>>>
>>>  #include "csum.h"
>>>  #include "dp-packet.h"
>>> @@ -483,6 +486,172 @@ action_avx512_ipv4_set_addrs(struct
>> dp_packet_batch *batch,
>>>      }
>>>  }
>>>
>>> +#if HAVE_AVX512VBMI
>>> +static inline uint16_t ALWAYS_INLINE
>>> +__attribute__((__target__("avx512vbmi")))
>>> +avx512_ipv6_get_delta(__m512i ip6_header)
>>
>> I guess the function name was from before you split up this function, as it's
>> not at all what you're doing in this function.
>> I would suggest changing the name to something like
>> avx512_ipv6_sum_header().
>>
>> Also, can you go over the register naming and comment text below, as they
>> also make no sense in the current form?
>>
>>> +{
>>> +    __m256i v_zeros = _mm256_setzero_si256();
>>> +    __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04,
>>> +                                               0xFF, 0xFF, 0xFF,
>>> +0xFF);
>>> +
>>> +    __m512i v_header = _mm512_permutexvar_epi64(v_shuf_src_dst,
>> ip6_header);
>>> +    __m256i v_ip6_src_dst =  _mm512_extracti64x4_epi64(v_header, 0);
>>
>> Remove the extra space after the equal sign.
>>
>> Please add a new line before the comment.
>>
>>> +    /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle
>> the
>>> +     * src and dst fields and add padding after each 16-bit value for the
>>> +     * following carry over addition. */
>>> +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302,
>> 0xFFFF,
>>> +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF,
>>> +                                          0x0100, 0xFFFF, 0x0302, 0xFFFF,
>>> +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF);
>>> +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A,
>> 0xFFFF,
>>> +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
>>> +                                          0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
>>> +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF);
>>> +    __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst,
>> v_swap16a);
>>> +    __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst,
>>> + v_swap16b);
>>> +
>>> +    /* Add each part of the old and new headers together. */
>>> +    __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2);
>>> +
>>> +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
>>> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
>>> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
>>> +
>>> +    /* Shuffle 32-bit value from 3rd lane into first lane for final
>>> +     * horizontal add. */
>>> +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
>>> +                                          0xF, 0xF, 0xF, 0xF);
>>> +    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
>>> +
>>> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
>>> +    v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
>>> +
>>> +    /* Extract delta value. */
>>> +    return _mm256_extract_epi16(v_delta, 0); }
>>> +
>>> +static inline uint16_t ALWAYS_INLINE
>>> +__attribute__((__target__("avx512vbmi")))
>>> +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i
>> new_header) {
>>> +    uint16_t delta;
>>> +    uint16_t old_delta = avx512_ipv6_get_delta(old_header);
>>> +    uint16_t new_delta = avx512_ipv6_get_delta(new_header);
>>> +    old_delta = ~old_delta;
>>> +    uint32_t csum_delta = old_delta + new_delta;
>>> +    delta = csum_finish(csum_delta);
>>> +
>>> +    return ~delta;
>>
>> This function looks rather cluttered, what about the following:
>>
>> {
>>     uint16_t old_delta = avx512_ipv6_get_delta(old_header);
>>     uint16_t new_delta = avx512_ipv6_get_delta(new_header);
>>     uint32_t csum_delta = ~old_delta + new_delta;
>>
> Sure I will tidy this up. The above line will need to have cast around old_delta though.
> Otherwise it will invert 32bits instead of 16bits of old_delta, which will make the addition wrong.
> Something like this:
> 	uint32_t csum_delta = (uint16_t)~old_delta + new_delta;	

That should be fine, maybe explicitly mark the part you are interested in, saw this in other code:

uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta;	

>
>>     return ~csum_finish(csum_delta);
>> }
>>
>>> +}
>>> +
>>> +/* This function performs the same operation on each packet in the
>>> +batch as
>>> + * the scalar odp_set_ipv6() function. */ static void
>>> +__attribute__((__target__("avx512vbmi")))
>>> +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
>>> +                             const struct nlattr *a) {
>>> +    const struct ovs_key_ipv6 *key, *mask;
>>> +    struct dp_packet *packet;
>>
>> Add a new line between definitions and code.
>>
>>> +    a = nl_attr_get(a);
>>> +    key = nl_attr_get(a);
>>> +    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
>>
>> We have build asserts for the ovs_key_ipv4 key structure to make sure they
>> do not change, we should add the same for v6.
>>
>>> +
>>> +    /* Read the content of the key and mask in the respective registers. We
>>> +     * only load the size of the actual structure, which is only 40 bytes. */
>>> +    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
>>> +    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
>>> +
>>> +    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
>>> +     * ip6_hdr structure layout. */
>>> +    static const uint8_t ip_shuffle_mask[64] = {
>>> +            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
>>> +            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
>>> +            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
>>> +            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
>>> +            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
>>> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
>>> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
>>> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
>>> +    };
>>> +
>>> +    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
>>> +
>>> +    /* This shuffle is required for key and mask to match the layout of the
>>> +     * ip6_hdr struct. */
>>> +    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
>>> +    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle,
>> v_mask);
>>> +
>>> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
>>> +        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
>>> +
>>> +        /* Load the 40 bytes of the IPv6 header. */
>>> +        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *)
>>> + nh);
>>> +
>>> +        /* AND the v_pkt_mask to the packet data (v_packet). */
>>> +        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf,
>>> + v_packet);
>>> +
>>> +        /* OR the new addresses (v_key_shuf) with the masked packet
>> addresses
>>> +         * (v_pkt_masked). */
>>> +        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf,
>>> + v_pkt_masked);
>>> +
>>> +        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
>>> +         * be updated. */
>>> +        bool do_checksum = false;
>>
>> So as we are trying to optimise code, this only needs to be done once, so we
>> should move this outside the DP_PACKET_BATCH_FOR_EACH() loop.
>>
> The compiler is likely already lifting that code out of the loop as it is loop invariant.
> But moving outside the loop will make it clearer,  so yes will do.

Thanks, it’s just my autistic brain not trusting the optimizer ;)

>>> +#ifdef s6_addr32
>>> +        for (int j = 0; j < 4; j++) {
>>> +            if (mask->ipv6_dst.s6_addr32[j] || mask->ipv6_src.s6_addr32[j]) {
>>> +                do_checksum = true;
>>> +            }
>>> +        }
>>> +#else
>>> +        for (int j = 0; j < 16; j++) {
>>> +             if (mask->ipv6_dst.s6_addr[j] || mask->ipv6_src.s6_addr[j]) {
>>> +                do_checksum = true;
>>> +            }
>>> +        }
>>> +#endif
>>
>> Not sure how fast slow/fast the above is compared with doing an AVX512
>> AND on the v_mask with a new v_address_mask + popcount?
>>
> Ah yes good idea. To keep the same behaviour, I think we want to bitwise-OR the two registers (not bitwise-AND) and then check if any bits are set?
> If any bit is set in src/dst mask, we need to do_checksum.
> Something like this
> 	v_dst = Loadu_si128(dst)
> 	v_src = Loadu_si128(src)
> 	v_or = _or_si128(v_dst, v_src)
>
> 	/* generate all ones register from cmpeq of v_zeros vs itself? */
> 	 v_zeros = _setzero_si128()	
>     	v_all_ones = _cmpeq_epi(v_zeros, v_zeros);
> 	int do_checksum = _mm_test_all_zeros(v_or, v_all_ones);
>
> Does this approach make sense to you?

Yes perfectly, I was not aware of the _mm_test_all_zeros() which saves the popcount ;)

One comment here is that do_checksum should be a bool type, something like

bool do_checksum = !!_mm_test_all_zeros(v_or, v_all_ones);


>>> +        if (do_checksum) {
>>> +            uint8_t proto = nh->ip6_nxt;
>>> +            uint16_t delta_checksum =
>> avx512_ipv6_addr_csum_delta(v_packet,
>>> +
>>> + v_new_hdr);
>>> +
>>> +            if (proto == IPPROTO_UDP) {
>>> +                struct udp_header *uh = dp_packet_l4(packet);
>>
>> Add a new line here.
>>
>>> +                if (uh->udp_csum) {
>>> +                    uint16_t old_udp_checksum = ~uh->udp_csum;
>>> +                    uint32_t udp_checksum = old_udp_checksum +
>>> + delta_checksum;
>>
>> Add a new line here.
>>
>>> +                    udp_checksum = csum_finish(udp_checksum);
>>> +
>>> +                    if (!udp_checksum) {
>>> +                        udp_checksum = htons(0xffff);
>>> +                    }
>>> +
>>> +                    uh->udp_csum = udp_checksum;
>>> +                }
>>> +            } else if (proto == IPPROTO_TCP) {
>>> +                struct tcp_header *th = dp_packet_l4(packet);
>>> +                uint16_t old_tcp_checksum = ~th->tcp_csum;
>>> +                uint32_t tcp_checksum = old_tcp_checksum +
>>> + delta_checksum;
>>
>> Add a new line here.
>>
>>> +                tcp_checksum = csum_finish(tcp_checksum);
>>> +
>>
>> Remove the new line.
>>
>>> +                th->tcp_csum = tcp_checksum;
>>> +            } else if (proto == IPPROTO_ICMPV6) {
>>> +                struct icmp6_header *icmp = dp_packet_l4(packet);
>>> +                uint16_t old_icmp_checksum = ~icmp->icmp6_cksum;
>>
>> Keep name consistency, so I would call it old_icmp6_checksum.
>>
>>> +                uint32_t icmp6_checksum = old_icmp_checksum +
>>> + delta_checksum;
>>
>> Add a new line here.
>>
>>> +                icmp6_checksum = csum_finish(icmp6_checksum);
>>> +
>>
>> Remove the new line.
>>
>>> +                icmp->icmp6_cksum = icmp6_checksum;
>>> +            }
>>> +        }
>>> +        /* Write back the modified IPv6 addresses. */
>>> +         _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
>>> +    }
>>> +}
>>> +#endif
>>
>> As the #if is way out of sight, I would make this +#endif /*
>> HAVE_AVX512VBMI */
>>
>>> +
>>>  static void
>>>  action_avx512_set_masked(struct dp_packet_batch *batch, const struct
>>> nlattr *a)  { @@ -514,6 +683,13 @@ action_avx512_init(struct
>>> odp_execute_action_impl *self OVS_UNUSED)
>>>      impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
>> action_avx512_eth_set_addrs;
>>>      impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] =
>>> action_avx512_ipv4_set_addrs;
>>>
>>> +#if HAVE_AVX512VBMI
>>> +    if (action_avx512vbmi_isa_probe()) {
>>> +        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
>>> +                              action_avx512_ipv6_set_addrs;
>>> +    }
>>> +#endif
>>> +
>>>      return 0;
>>>  }
>>>
>>> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
>>> index f80ae5a23..ff29e116f 100644
>>> --- a/lib/odp-execute-private.c
>>> +++ b/lib/odp-execute-private.c
>>> @@ -60,6 +60,23 @@ action_avx512_isa_probe(void)
>>>
>>>  #endif
>>>
>>> +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI bool
>>> +action_avx512vbmi_isa_probe(void)
>>> +{
>>> +    if (!cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
>>> +        return false;
>>> +    }
>>> +    return true;
>>> +}
>>
>> just a nit, but I would make this as follows:
>>
>> bool
>> action_avx512vbmi_isa_probe(void)
>> {
>>     if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
>>         return true;
>>     }
>>     return false;
>> }
>>
>>> +#else
>>> +bool
>>> +action_avx512vbmi_isa_probe(void)
>>> +{
>>> +    return false;
>>> +}
>>> +#endif
>>> +
>>>  static struct odp_execute_action_impl action_impls[] = {
>>>      [ACTION_IMPL_AUTOVALIDATOR] = {
>>>          .available = false,
>>> diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
>>> index 940180c99..643f41c2a 100644
>>> --- a/lib/odp-execute-private.h
>>> +++ b/lib/odp-execute-private.h
>>> @@ -78,6 +78,7 @@
>> BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
>>> #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
>>>
>>>  bool action_avx512_isa_probe(void);
>>> +bool action_avx512vbmi_isa_probe(void);
>>>
>>>  /* Odp execute init handles setting up the state of the actions functions at
>>>   * initialization time. It cannot return errors, as it must always
>>> succeed in
>>> --
>>> 2.25.1
Van Haaren, Harry Nov. 23, 2022, 2:05 p.m. UTC | #4
> -----Original Message-----
> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron
> Sent: Wednesday, November 23, 2022 1:55 PM
> To: Finn, Emma <emma.finn@intel.com>
> Cc: dev@openvswitch.org; david.marchand@redhat.com; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of set_masked IPv6
> action

<snip>

> > Something like this
> > 	v_dst = Loadu_si128(dst)
> > 	v_src = Loadu_si128(src)
> > 	v_or = _or_si128(v_dst, v_src)
> >
> > 	/* generate all ones register from cmpeq of v_zeros vs itself? */
> > 	 v_zeros = _setzero_si128()
> >     	v_all_ones = _cmpeq_epi(v_zeros, v_zeros);
> > 	int do_checksum = _mm_test_all_zeros(v_or, v_all_ones);
> >
> > Does this approach make sense to you?
> 
> Yes perfectly, I was not aware of the _mm_test_all_zeros() which saves the
> popcount ;)
> 
> One comment here is that do_checksum should be a bool type, something like
> 
> bool do_checksum = !!_mm_test_all_zeros(v_or, v_all_ones);

In the interest of micro-optimization discussions, we'd need to check if the resulting ASM is the same...
Branching on a value is usually a "test" with a register/register, or register/constant, and that sets the "flags" register.

Note that the test_all_zeros() *already* sets the flags register!
https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html?wapkw=intrinsics%20guide#text=mm_test_all_zero&ig_expand=7187

By taking the result, doing the bitwise !! ops , and branching on the result, it might force the compiler into emitting a
bunch of noisy-not-useful instructions.

The test_all_zeros() isn't just a bypass of the popcnt instruction, it also avoids the "test" with a register to set flags register.
By having set the ZF (zero-flag) we can JumpZero (JZ instruction) or JNZ (JumpNotZero) on the result of it, no GPR register usage.

Given this code is x86 specific anyway, I don't see value add from the bool type and !! trick to canonicalize the "any value" to 0 or 1.
If the ASM generated is the same, I'm OK with either approach, just noting the micro-optimization around test/flags-register.

Regards, -Harry
Eelco Chaudron Nov. 23, 2022, 2:14 p.m. UTC | #5
On 23 Nov 2022, at 15:05, Van Haaren, Harry wrote:

>> -----Original Message-----
>> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron
>> Sent: Wednesday, November 23, 2022 1:55 PM
>> To: Finn, Emma <emma.finn@intel.com>
>> Cc: dev@openvswitch.org; david.marchand@redhat.com; i.maximets@ovn.org
>> Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of set_masked IPv6
>> action
>
> <snip>
>
>>> Something like this
>>> 	v_dst = Loadu_si128(dst)
>>> 	v_src = Loadu_si128(src)
>>> 	v_or = _or_si128(v_dst, v_src)
>>>
>>> 	/* generate all ones register from cmpeq of v_zeros vs itself? */
>>> 	 v_zeros = _setzero_si128()
>>>     	v_all_ones = _cmpeq_epi(v_zeros, v_zeros);
>>> 	int do_checksum = _mm_test_all_zeros(v_or, v_all_ones);
>>>
>>> Does this approach make sense to you?
>>
>> Yes perfectly, I was not aware of the _mm_test_all_zeros() which saves the
>> popcount ;)
>>
>> One comment here is that do_checksum should be a bool type, something like
>>
>> bool do_checksum = !!_mm_test_all_zeros(v_or, v_all_ones);
>
> In the interest of micro-optimization discussions, we'd need to check if the resulting ASM is the same...
> Branching on a value is usually a "test" with a register/register, or register/constant, and that sets the "flags" register.
>
> Note that the test_all_zeros() *already* sets the flags register!
> https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html?wapkw=intrinsics%20guide#text=mm_test_all_zero&ig_expand=7187
>
> By taking the result, doing the bitwise !! ops , and branching on the result, it might force the compiler into emitting a
> bunch of noisy-not-useful instructions.
>
> The test_all_zeros() isn't just a bypass of the popcnt instruction, it also avoids the "test" with a register to set flags register.
> By having set the ZF (zero-flag) we can JumpZero (JZ instruction) or JNZ (JumpNotZero) on the result of it, no GPR register usage.
>
> Given this code is x86 specific anyway, I don't see value add from the bool type and !! trick to canonicalize the "any value" to 0 or 1.
> If the ASM generated is the same, I'm OK with either approach, just noting the micro-optimization around test/flags-register.

Lets see the asm, if we do keep int we should add a comment. But as this code will move outside the loop, I assume the flag register will be cleared out before it hits this in the loop.

//Eelco
Emma Finn Nov. 24, 2022, 9:16 a.m. UTC | #6
> -----Original Message-----
> From: Eelco Chaudron <echaudro@redhat.com>
> Sent: Wednesday 23 November 2022 14:14
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org;
> david.marchand@redhat.com; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of
> set_masked IPv6 action
> 
> 
> 
> On 23 Nov 2022, at 15:05, Van Haaren, Harry wrote:
> 
> >> -----Original Message-----
> >> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco
> >> Chaudron
> >> Sent: Wednesday, November 23, 2022 1:55 PM
> >> To: Finn, Emma <emma.finn@intel.com>
> >> Cc: dev@openvswitch.org; david.marchand@redhat.com;
> >> i.maximets@ovn.org
> >> Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of
> >> set_masked IPv6 action
> >
> > <snip>
> >
> >>> Something like this
> >>> 	v_dst = Loadu_si128(dst)
> >>> 	v_src = Loadu_si128(src)
> >>> 	v_or = _or_si128(v_dst, v_src)
> >>>
> >>> 	/* generate all ones register from cmpeq of v_zeros vs itself? */
> >>> 	 v_zeros = _setzero_si128()
> >>>     	v_all_ones = _cmpeq_epi(v_zeros, v_zeros);
> >>> 	int do_checksum = _mm_test_all_zeros(v_or, v_all_ones);
> >>>
> >>> Does this approach make sense to you?
> >>
> >> Yes perfectly, I was not aware of the _mm_test_all_zeros() which
> >> saves the popcount ;)
> >>
> >> One comment here is that do_checksum should be a bool type,
> something
> >> like
> >>
> >> bool do_checksum = !!_mm_test_all_zeros(v_or, v_all_ones);
> >
> > In the interest of micro-optimization discussions, we'd need to check if the
> resulting ASM is the same...
> > Branching on a value is usually a "test" with a register/register, or
> register/constant, and that sets the "flags" register.
> >
> > Note that the test_all_zeros() *already* sets the flags register!
> > https://www.intel.com/content/www/us/en/docs/intrinsics-
> guide/index.ht
> > ml?wapkw=intrinsics%20guide#text=mm_test_all_zero&ig_expand=7187
> >
> > By taking the result, doing the bitwise !! ops , and branching on the
> > result, it might force the compiler into emitting a bunch of noisy-not-useful
> instructions.
> >
> > The test_all_zeros() isn't just a bypass of the popcnt instruction, it also
> avoids the "test" with a register to set flags register.
> > By having set the ZF (zero-flag) we can JumpZero (JZ instruction) or JNZ
> (JumpNotZero) on the result of it, no GPR register usage.
> >
> > Given this code is x86 specific anyway, I don't see value add from the bool
> type and !! trick to canonicalize the "any value" to 0 or 1.
> > If the ASM generated is the same, I'm OK with either approach, just noting
> the micro-optimization around test/flags-register.
> 
> Lets see the asm, if we do keep int we should add a comment. But as this
> code will move outside the loop, I assume the flag register will be cleared out
> before it hits this in the loop.
> 
> //Eelco

Let's change this to a bool type. 
I will send v4 of this patch shortly with all these changes. 

Thanks, 
Emma
diff mbox series

Patch

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 6c7713251..f97b3c2f7 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -20,6 +20,9 @@ 
 
 #include <config.h>
 #include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
 
 #include "csum.h"
 #include "dp-packet.h"
@@ -483,6 +486,172 @@  action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
     }
 }
 
+#if HAVE_AVX512VBMI
+static inline uint16_t ALWAYS_INLINE
+__attribute__((__target__("avx512vbmi")))
+avx512_ipv6_get_delta(__m512i ip6_header)
+{
+    __m256i v_zeros = _mm256_setzero_si256();
+    __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04,
+                                               0xFF, 0xFF, 0xFF, 0xFF);
+
+    __m512i v_header = _mm512_permutexvar_epi64(v_shuf_src_dst, ip6_header);
+    __m256i v_ip6_src_dst =  _mm512_extracti64x4_epi64(v_header, 0);
+    /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the
+     * src and dst fields and add padding after each 16-bit value for the
+     * following carry over addition. */
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF,
+                                          0x0504, 0xFFFF, 0x0706, 0xFFFF,
+                                          0x0100, 0xFFFF, 0x0302, 0xFFFF,
+                                          0x0504, 0xFFFF, 0x0706, 0xFFFF);
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
+                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
+                                          0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
+                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF);
+    __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a);
+    __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b);
+
+    /* Add each part of the old and new headers together. */
+    __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+
+    /* Shuffle 32-bit value from 3rd lane into first lane for final
+     * horizontal add. */
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
+
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+    v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
+
+    /* Extract delta value. */
+    return _mm256_extract_epi16(v_delta, 0);
+}
+
+static inline uint16_t ALWAYS_INLINE
+__attribute__((__target__("avx512vbmi")))
+avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header)
+{
+    uint16_t delta;
+    uint16_t old_delta = avx512_ipv6_get_delta(old_header);
+    uint16_t new_delta = avx512_ipv6_get_delta(new_header);
+    old_delta = ~old_delta;
+    uint32_t csum_delta = old_delta + new_delta;
+    delta = csum_finish(csum_delta);
+
+    return ~delta;
+}
+
+/* This function performs the same operation on each packet in the batch as
+ * the scalar odp_set_ipv6() function. */
+static void
+__attribute__((__target__("avx512vbmi")))
+action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
+                             const struct nlattr *a)
+{
+    const struct ovs_key_ipv6 *key, *mask;
+    struct dp_packet *packet;
+    a = nl_attr_get(a);
+    key = nl_attr_get(a);
+    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
+
+    /* Read the content of the key and mask in the respective registers. We
+     * only load the size of the actual structure, which is only 40 bytes. */
+    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
+    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
+
+    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
+     * ip6_hdr structure layout. */
+    static const uint8_t ip_shuffle_mask[64] = {
+            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
+            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
+    };
+
+    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
+
+    /* This shuffle is required for key and mask to match the layout of the
+     * ip6_hdr struct. */
+    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
+    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
+
+        /* Load the 40 bytes of the IPv6 header. */
+        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh);
+
+        /* AND the v_pkt_mask to the packet data (v_packet). */
+        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet);
+
+        /* OR the new addresses (v_key_shuf) with the masked packet addresses
+         * (v_pkt_masked). */
+        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked);
+
+        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
+         * be updated. */
+        bool do_checksum = false;
+#ifdef s6_addr32
+        for (int j = 0; j < 4; j++) {
+            if (mask->ipv6_dst.s6_addr32[j] || mask->ipv6_src.s6_addr32[j]) {
+                do_checksum = true;
+            }
+        }
+#else
+        for (int j = 0; j < 16; j++) {
+             if (mask->ipv6_dst.s6_addr[j] || mask->ipv6_src.s6_addr[j]) {
+                do_checksum = true;
+            }
+        }
+#endif
+        if (do_checksum) {
+            uint8_t proto = nh->ip6_nxt;
+            uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
+                                                                  v_new_hdr);
+
+            if (proto == IPPROTO_UDP) {
+                struct udp_header *uh = dp_packet_l4(packet);
+                if (uh->udp_csum) {
+                    uint16_t old_udp_checksum = ~uh->udp_csum;
+                    uint32_t udp_checksum = old_udp_checksum + delta_checksum;
+                    udp_checksum = csum_finish(udp_checksum);
+
+                    if (!udp_checksum) {
+                        udp_checksum = htons(0xffff);
+                    }
+
+                    uh->udp_csum = udp_checksum;
+                }
+            } else if (proto == IPPROTO_TCP) {
+                struct tcp_header *th = dp_packet_l4(packet);
+                uint16_t old_tcp_checksum = ~th->tcp_csum;
+                uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
+                tcp_checksum = csum_finish(tcp_checksum);
+
+                th->tcp_csum = tcp_checksum;
+            } else if (proto == IPPROTO_ICMPV6) {
+                struct icmp6_header *icmp = dp_packet_l4(packet);
+                uint16_t old_icmp_checksum = ~icmp->icmp6_cksum;
+                uint32_t icmp6_checksum = old_icmp_checksum + delta_checksum;
+                icmp6_checksum = csum_finish(icmp6_checksum);
+
+                icmp->icmp6_cksum = icmp6_checksum;
+            }
+        }
+        /* Write back the modified IPv6 addresses. */
+         _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
+    }
+}
+#endif
+
 static void
 action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a)
 {
@@ -514,6 +683,13 @@  action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
     impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs;
     impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs;
 
+#if HAVE_AVX512VBMI
+    if (action_avx512vbmi_isa_probe()) {
+        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
+                              action_avx512_ipv6_set_addrs;
+    }
+#endif
+
     return 0;
 }
 
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index f80ae5a23..ff29e116f 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -60,6 +60,23 @@  action_avx512_isa_probe(void)
 
 #endif
 
+#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI
+bool
+action_avx512vbmi_isa_probe(void)
+{
+    if (!cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
+        return false;
+    }
+    return true;
+}
+#else
+bool
+action_avx512vbmi_isa_probe(void)
+{
+    return false;
+}
+#endif
+
 static struct odp_execute_action_impl action_impls[] = {
     [ACTION_IMPL_AUTOVALIDATOR] = {
         .available = false,
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
index 940180c99..643f41c2a 100644
--- a/lib/odp-execute-private.h
+++ b/lib/odp-execute-private.h
@@ -78,6 +78,7 @@  BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
 #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
 
 bool action_avx512_isa_probe(void);
+bool action_avx512vbmi_isa_probe(void);
 
 /* Odp execute init handles setting up the state of the actions functions at
  * initialization time. It cannot return errors, as it must always succeed in