Message ID | 20220926132946.1767182-1-emma.finn@intel.com |
---|---|
State | Superseded |
Headers | show |
Series | [ovs-dev,v3] odp-execute: Add ISA implementation of set_masked IPv6 action | expand |
Context | Check | Description |
---|---|---|
ovsrobot/apply-robot | success | apply and check: success |
ovsrobot/github-robot-_Build_and_Test | success | github build: passed |
ovsrobot/intel-ovs-compilation | success | test: success |
On 26 Sep 2022, at 15:29, Emma Finn wrote: > This commit adds support for the AVX512 implementation of the > ipv6_set_addrs action as well as an AVX512 implementation of > updating the L4 checksums. > > Signed-off-by: Emma Finn <emma.finn@intel.com> Hi Emma, Thanks for further enhancing the implementation of the AVX512 actions. Below are some comments, mostly style related, but with one additional optimization. Cheers, Eelco > --- > v3: > - Added a runtime check for AVX512 vbmi. > v2: > - Added check for availbility of s6_addr32 field of struct in6_addr. > - Fixed network headers for freebsd builds. > --- > --- > lib/odp-execute-avx512.c | 176 ++++++++++++++++++++++++++++++++++++++ > lib/odp-execute-private.c | 17 ++++ > lib/odp-execute-private.h | 1 + > 3 files changed, 194 insertions(+) > > diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c > index 6c7713251..f97b3c2f7 100644 > --- a/lib/odp-execute-avx512.c > +++ b/lib/odp-execute-avx512.c > @@ -20,6 +20,9 @@ > > #include <config.h> > #include <errno.h> > +#include <sys/types.h> > +#include <netinet/in.h> > +#include <netinet/ip6.h> > > #include "csum.h" > #include "dp-packet.h" > @@ -483,6 +486,172 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, > } > } > > +#if HAVE_AVX512VBMI > +static inline uint16_t ALWAYS_INLINE > +__attribute__((__target__("avx512vbmi"))) > +avx512_ipv6_get_delta(__m512i ip6_header) I guess the function name was from before you split up this function, as it's not at all what you're doing in this function. I would suggest changing the name to something like avx512_ipv6_sum_header(). Also, can you go over the register naming and comment text below, as they also make no sense in the current form? > +{ > + __m256i v_zeros = _mm256_setzero_si256(); > + __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04, > + 0xFF, 0xFF, 0xFF, 0xFF); > + > + __m512i v_header = _mm512_permutexvar_epi64(v_shuf_src_dst, ip6_header); > + __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_header, 0); Remove the extra space after the equal sign. Please add a new line before the comment. > + /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the > + * src and dst fields and add padding after each 16-bit value for the > + * following carry over addition. */ > + __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF, > + 0x0504, 0xFFFF, 0x0706, 0xFFFF, > + 0x0100, 0xFFFF, 0x0302, 0xFFFF, > + 0x0504, 0xFFFF, 0x0706, 0xFFFF); > + __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF, > + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF, > + 0x0908, 0xFFFF, 0x0B0A, 0xFFFF, > + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF); > + __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a); > + __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b); > + > + /* Add each part of the old and new headers together. */ > + __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2); > + > + /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */ > + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); > + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); > + > + /* Shuffle 32-bit value from 3rd lane into first lane for final > + * horizontal add. */ > + __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, > + 0xF, 0xF, 0xF, 0xF); > + v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); > + > + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); > + v_delta = _mm256_hadd_epi16(v_delta, v_zeros); > + > + /* Extract delta value. */ > + return _mm256_extract_epi16(v_delta, 0); > +} > + > +static inline uint16_t ALWAYS_INLINE > +__attribute__((__target__("avx512vbmi"))) > +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header) > +{ > + uint16_t delta; > + uint16_t old_delta = avx512_ipv6_get_delta(old_header); > + uint16_t new_delta = avx512_ipv6_get_delta(new_header); > + old_delta = ~old_delta; > + uint32_t csum_delta = old_delta + new_delta; > + delta = csum_finish(csum_delta); > + > + return ~delta; This function looks rather cluttered, what about the following: { uint16_t old_delta = avx512_ipv6_get_delta(old_header); uint16_t new_delta = avx512_ipv6_get_delta(new_header); uint32_t csum_delta = ~old_delta + new_delta; return ~csum_finish(csum_delta); } > +} > + > +/* This function performs the same operation on each packet in the batch as > + * the scalar odp_set_ipv6() function. */ > +static void > +__attribute__((__target__("avx512vbmi"))) > +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch, > + const struct nlattr *a) > +{ > + const struct ovs_key_ipv6 *key, *mask; > + struct dp_packet *packet; Add a new line between definitions and code. > + a = nl_attr_get(a); > + key = nl_attr_get(a); > + mask = odp_get_key_mask(a, struct ovs_key_ipv6); We have build asserts for the ovs_key_ipv4 key structure to make sure they do not change, we should add the same for v6. > + > + /* Read the content of the key and mask in the respective registers. We > + * only load the size of the actual structure, which is only 40 bytes. */ > + __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key); > + __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask); > + > + /* This shuffle mask v_shuffle, is to shuffle key and mask to match the > + * ip6_hdr structure layout. */ > + static const uint8_t ip_shuffle_mask[64] = { > + 0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26, > + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, > + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, > + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, > + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, > + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF, > + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, > + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF > + }; > + > + __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask); > + > + /* This shuffle is required for key and mask to match the layout of the > + * ip6_hdr struct. */ > + __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key); > + __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask); > + > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > + struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet); > + > + /* Load the 40 bytes of the IPv6 header. */ > + __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh); > + > + /* AND the v_pkt_mask to the packet data (v_packet). */ > + __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet); > + > + /* OR the new addresses (v_key_shuf) with the masked packet addresses > + * (v_pkt_masked). */ > + __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked); > + > + /* If ip6_src or ip6_dst has been modified, L4 checksum needs to > + * be updated. */ > + bool do_checksum = false; So as we are trying to optimise code, this only needs to be done once, so we should move this outside the DP_PACKET_BATCH_FOR_EACH() loop. > +#ifdef s6_addr32 > + for (int j = 0; j < 4; j++) { > + if (mask->ipv6_dst.s6_addr32[j] || mask->ipv6_src.s6_addr32[j]) { > + do_checksum = true; > + } > + } > +#else > + for (int j = 0; j < 16; j++) { > + if (mask->ipv6_dst.s6_addr[j] || mask->ipv6_src.s6_addr[j]) { > + do_checksum = true; > + } > + } > +#endif Not sure how fast slow/fast the above is compared with doing an AVX512 AND on the v_mask with a new v_address_mask + popcount? > + if (do_checksum) { > + uint8_t proto = nh->ip6_nxt; > + uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, > + v_new_hdr); > + > + if (proto == IPPROTO_UDP) { > + struct udp_header *uh = dp_packet_l4(packet); Add a new line here. > + if (uh->udp_csum) { > + uint16_t old_udp_checksum = ~uh->udp_csum; > + uint32_t udp_checksum = old_udp_checksum + delta_checksum; Add a new line here. > + udp_checksum = csum_finish(udp_checksum); > + > + if (!udp_checksum) { > + udp_checksum = htons(0xffff); > + } > + > + uh->udp_csum = udp_checksum; > + } > + } else if (proto == IPPROTO_TCP) { > + struct tcp_header *th = dp_packet_l4(packet); > + uint16_t old_tcp_checksum = ~th->tcp_csum; > + uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; Add a new line here. > + tcp_checksum = csum_finish(tcp_checksum); > + Remove the new line. > + th->tcp_csum = tcp_checksum; > + } else if (proto == IPPROTO_ICMPV6) { > + struct icmp6_header *icmp = dp_packet_l4(packet); > + uint16_t old_icmp_checksum = ~icmp->icmp6_cksum; Keep name consistency, so I would call it old_icmp6_checksum. > + uint32_t icmp6_checksum = old_icmp_checksum + delta_checksum; Add a new line here. > + icmp6_checksum = csum_finish(icmp6_checksum); > + Remove the new line. > + icmp->icmp6_cksum = icmp6_checksum; > + } > + } > + /* Write back the modified IPv6 addresses. */ > + _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr); > + } > +} > +#endif As the #if is way out of sight, I would make this +#endif /* HAVE_AVX512VBMI */ > + > static void > action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a) > { > @@ -514,6 +683,13 @@ action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED) > impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs; > impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs; > > +#if HAVE_AVX512VBMI > + if (action_avx512vbmi_isa_probe()) { > + impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] = > + action_avx512_ipv6_set_addrs; > + } > +#endif > + > return 0; > } > > diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c > index f80ae5a23..ff29e116f 100644 > --- a/lib/odp-execute-private.c > +++ b/lib/odp-execute-private.c > @@ -60,6 +60,23 @@ action_avx512_isa_probe(void) > > #endif > > +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI > +bool > +action_avx512vbmi_isa_probe(void) > +{ > + if (!cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) { > + return false; > + } > + return true; > +} just a nit, but I would make this as follows: bool action_avx512vbmi_isa_probe(void) { if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) { return true; } return false; } > +#else > +bool > +action_avx512vbmi_isa_probe(void) > +{ > + return false; > +} > +#endif > + > static struct odp_execute_action_impl action_impls[] = { > [ACTION_IMPL_AUTOVALIDATOR] = { > .available = false, > diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h > index 940180c99..643f41c2a 100644 > --- a/lib/odp-execute-private.h > +++ b/lib/odp-execute-private.h > @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1); > #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1) > > bool action_avx512_isa_probe(void); > +bool action_avx512vbmi_isa_probe(void); > > /* Odp execute init handles setting up the state of the actions functions at > * initialization time. It cannot return errors, as it must always succeed in > -- > 2.25.1
> -----Original Message----- > From: Eelco Chaudron <echaudro@redhat.com> > Sent: Thursday 17 November 2022 09:21 > To: Finn, Emma <emma.finn@intel.com> > Cc: dev@openvswitch.org; david.marchand@redhat.com; > i.maximets@ovn.org > Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of > set_masked IPv6 action > > On 26 Sep 2022, at 15:29, Emma Finn wrote: > > > This commit adds support for the AVX512 implementation of the > > ipv6_set_addrs action as well as an AVX512 implementation of updating > > the L4 checksums. > > > > Signed-off-by: Emma Finn <emma.finn@intel.com> > > Hi Emma, > > Thanks for further enhancing the implementation of the AVX512 actions. > Below are some comments, mostly style related, but with one additional > optimization. > > Cheers, > > Eelco > Thanks for the review Eelco. Sure, I will clean up and change all the style related comments. Some other replies inline below. Thanks, Emma > > --- > > v3: > > - Added a runtime check for AVX512 vbmi. > > v2: > > - Added check for availbility of s6_addr32 field of struct in6_addr. > > - Fixed network headers for freebsd builds. > > --- > > --- > > lib/odp-execute-avx512.c | 176 > > ++++++++++++++++++++++++++++++++++++++ > > lib/odp-execute-private.c | 17 ++++ > > lib/odp-execute-private.h | 1 + > > 3 files changed, 194 insertions(+) > > > > diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index > > 6c7713251..f97b3c2f7 100644 > > --- a/lib/odp-execute-avx512.c > > +++ b/lib/odp-execute-avx512.c > > @@ -20,6 +20,9 @@ > > > > #include <config.h> > > #include <errno.h> > > +#include <sys/types.h> > > +#include <netinet/in.h> > > +#include <netinet/ip6.h> > > > > #include "csum.h" > > #include "dp-packet.h" > > @@ -483,6 +486,172 @@ action_avx512_ipv4_set_addrs(struct > dp_packet_batch *batch, > > } > > } > > > > +#if HAVE_AVX512VBMI > > +static inline uint16_t ALWAYS_INLINE > > +__attribute__((__target__("avx512vbmi"))) > > +avx512_ipv6_get_delta(__m512i ip6_header) > > I guess the function name was from before you split up this function, as it's > not at all what you're doing in this function. > I would suggest changing the name to something like > avx512_ipv6_sum_header(). > > Also, can you go over the register naming and comment text below, as they > also make no sense in the current form? > > > +{ > > + __m256i v_zeros = _mm256_setzero_si256(); > > + __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04, > > + 0xFF, 0xFF, 0xFF, > > +0xFF); > > + > > + __m512i v_header = _mm512_permutexvar_epi64(v_shuf_src_dst, > ip6_header); > > + __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_header, 0); > > Remove the extra space after the equal sign. > > Please add a new line before the comment. > > > + /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle > the > > + * src and dst fields and add padding after each 16-bit value for the > > + * following carry over addition. */ > > + __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, > 0xFFFF, > > + 0x0504, 0xFFFF, 0x0706, 0xFFFF, > > + 0x0100, 0xFFFF, 0x0302, 0xFFFF, > > + 0x0504, 0xFFFF, 0x0706, 0xFFFF); > > + __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, > 0xFFFF, > > + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF, > > + 0x0908, 0xFFFF, 0x0B0A, 0xFFFF, > > + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF); > > + __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, > v_swap16a); > > + __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, > > + v_swap16b); > > + > > + /* Add each part of the old and new headers together. */ > > + __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2); > > + > > + /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */ > > + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); > > + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); > > + > > + /* Shuffle 32-bit value from 3rd lane into first lane for final > > + * horizontal add. */ > > + __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, > > + 0xF, 0xF, 0xF, 0xF); > > + v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); > > + > > + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); > > + v_delta = _mm256_hadd_epi16(v_delta, v_zeros); > > + > > + /* Extract delta value. */ > > + return _mm256_extract_epi16(v_delta, 0); } > > + > > +static inline uint16_t ALWAYS_INLINE > > +__attribute__((__target__("avx512vbmi"))) > > +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i > new_header) { > > + uint16_t delta; > > + uint16_t old_delta = avx512_ipv6_get_delta(old_header); > > + uint16_t new_delta = avx512_ipv6_get_delta(new_header); > > + old_delta = ~old_delta; > > + uint32_t csum_delta = old_delta + new_delta; > > + delta = csum_finish(csum_delta); > > + > > + return ~delta; > > This function looks rather cluttered, what about the following: > > { > uint16_t old_delta = avx512_ipv6_get_delta(old_header); > uint16_t new_delta = avx512_ipv6_get_delta(new_header); > uint32_t csum_delta = ~old_delta + new_delta; > Sure I will tidy this up. The above line will need to have cast around old_delta though. Otherwise it will invert 32bits instead of 16bits of old_delta, which will make the addition wrong. Something like this: uint32_t csum_delta = (uint16_t)~old_delta + new_delta; > return ~csum_finish(csum_delta); > } > > > +} > > + > > +/* This function performs the same operation on each packet in the > > +batch as > > + * the scalar odp_set_ipv6() function. */ static void > > +__attribute__((__target__("avx512vbmi"))) > > +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch, > > + const struct nlattr *a) { > > + const struct ovs_key_ipv6 *key, *mask; > > + struct dp_packet *packet; > > Add a new line between definitions and code. > > > + a = nl_attr_get(a); > > + key = nl_attr_get(a); > > + mask = odp_get_key_mask(a, struct ovs_key_ipv6); > > We have build asserts for the ovs_key_ipv4 key structure to make sure they > do not change, we should add the same for v6. > > > + > > + /* Read the content of the key and mask in the respective registers. We > > + * only load the size of the actual structure, which is only 40 bytes. */ > > + __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key); > > + __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask); > > + > > + /* This shuffle mask v_shuffle, is to shuffle key and mask to match the > > + * ip6_hdr structure layout. */ > > + static const uint8_t ip_shuffle_mask[64] = { > > + 0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26, > > + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, > > + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, > > + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, > > + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, > > + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF, > > + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, > > + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF > > + }; > > + > > + __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask); > > + > > + /* This shuffle is required for key and mask to match the layout of the > > + * ip6_hdr struct. */ > > + __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key); > > + __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, > v_mask); > > + > > + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { > > + struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet); > > + > > + /* Load the 40 bytes of the IPv6 header. */ > > + __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) > > + nh); > > + > > + /* AND the v_pkt_mask to the packet data (v_packet). */ > > + __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, > > + v_packet); > > + > > + /* OR the new addresses (v_key_shuf) with the masked packet > addresses > > + * (v_pkt_masked). */ > > + __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, > > + v_pkt_masked); > > + > > + /* If ip6_src or ip6_dst has been modified, L4 checksum needs to > > + * be updated. */ > > + bool do_checksum = false; > > So as we are trying to optimise code, this only needs to be done once, so we > should move this outside the DP_PACKET_BATCH_FOR_EACH() loop. > The compiler is likely already lifting that code out of the loop as it is loop invariant. But moving outside the loop will make it clearer, so yes will do. > > +#ifdef s6_addr32 > > + for (int j = 0; j < 4; j++) { > > + if (mask->ipv6_dst.s6_addr32[j] || mask->ipv6_src.s6_addr32[j]) { > > + do_checksum = true; > > + } > > + } > > +#else > > + for (int j = 0; j < 16; j++) { > > + if (mask->ipv6_dst.s6_addr[j] || mask->ipv6_src.s6_addr[j]) { > > + do_checksum = true; > > + } > > + } > > +#endif > > Not sure how fast slow/fast the above is compared with doing an AVX512 > AND on the v_mask with a new v_address_mask + popcount? > Ah yes good idea. To keep the same behaviour, I think we want to bitwise-OR the two registers (not bitwise-AND) and then check if any bits are set? If any bit is set in src/dst mask, we need to do_checksum. Something like this v_dst = Loadu_si128(dst) v_src = Loadu_si128(src) v_or = _or_si128(v_dst, v_src) /* generate all ones register from cmpeq of v_zeros vs itself? */ v_zeros = _setzero_si128() v_all_ones = _cmpeq_epi(v_zeros, v_zeros); int do_checksum = _mm_test_all_zeros(v_or, v_all_ones); Does this approach make sense to you? > > + if (do_checksum) { > > + uint8_t proto = nh->ip6_nxt; > > + uint16_t delta_checksum = > avx512_ipv6_addr_csum_delta(v_packet, > > + > > + v_new_hdr); > > + > > + if (proto == IPPROTO_UDP) { > > + struct udp_header *uh = dp_packet_l4(packet); > > Add a new line here. > > > + if (uh->udp_csum) { > > + uint16_t old_udp_checksum = ~uh->udp_csum; > > + uint32_t udp_checksum = old_udp_checksum + > > + delta_checksum; > > Add a new line here. > > > + udp_checksum = csum_finish(udp_checksum); > > + > > + if (!udp_checksum) { > > + udp_checksum = htons(0xffff); > > + } > > + > > + uh->udp_csum = udp_checksum; > > + } > > + } else if (proto == IPPROTO_TCP) { > > + struct tcp_header *th = dp_packet_l4(packet); > > + uint16_t old_tcp_checksum = ~th->tcp_csum; > > + uint32_t tcp_checksum = old_tcp_checksum + > > + delta_checksum; > > Add a new line here. > > > + tcp_checksum = csum_finish(tcp_checksum); > > + > > Remove the new line. > > > + th->tcp_csum = tcp_checksum; > > + } else if (proto == IPPROTO_ICMPV6) { > > + struct icmp6_header *icmp = dp_packet_l4(packet); > > + uint16_t old_icmp_checksum = ~icmp->icmp6_cksum; > > Keep name consistency, so I would call it old_icmp6_checksum. > > > + uint32_t icmp6_checksum = old_icmp_checksum + > > + delta_checksum; > > Add a new line here. > > > + icmp6_checksum = csum_finish(icmp6_checksum); > > + > > Remove the new line. > > > + icmp->icmp6_cksum = icmp6_checksum; > > + } > > + } > > + /* Write back the modified IPv6 addresses. */ > > + _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr); > > + } > > +} > > +#endif > > As the #if is way out of sight, I would make this +#endif /* > HAVE_AVX512VBMI */ > > > + > > static void > > action_avx512_set_masked(struct dp_packet_batch *batch, const struct > > nlattr *a) { @@ -514,6 +683,13 @@ action_avx512_init(struct > > odp_execute_action_impl *self OVS_UNUSED) > > impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = > action_avx512_eth_set_addrs; > > impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = > > action_avx512_ipv4_set_addrs; > > > > +#if HAVE_AVX512VBMI > > + if (action_avx512vbmi_isa_probe()) { > > + impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] = > > + action_avx512_ipv6_set_addrs; > > + } > > +#endif > > + > > return 0; > > } > > > > diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c > > index f80ae5a23..ff29e116f 100644 > > --- a/lib/odp-execute-private.c > > +++ b/lib/odp-execute-private.c > > @@ -60,6 +60,23 @@ action_avx512_isa_probe(void) > > > > #endif > > > > +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI bool > > +action_avx512vbmi_isa_probe(void) > > +{ > > + if (!cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) { > > + return false; > > + } > > + return true; > > +} > > just a nit, but I would make this as follows: > > bool > action_avx512vbmi_isa_probe(void) > { > if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) { > return true; > } > return false; > } > > > +#else > > +bool > > +action_avx512vbmi_isa_probe(void) > > +{ > > + return false; > > +} > > +#endif > > + > > static struct odp_execute_action_impl action_impls[] = { > > [ACTION_IMPL_AUTOVALIDATOR] = { > > .available = false, > > diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h > > index 940180c99..643f41c2a 100644 > > --- a/lib/odp-execute-private.h > > +++ b/lib/odp-execute-private.h > > @@ -78,6 +78,7 @@ > BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1); > > #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1) > > > > bool action_avx512_isa_probe(void); > > +bool action_avx512vbmi_isa_probe(void); > > > > /* Odp execute init handles setting up the state of the actions functions at > > * initialization time. It cannot return errors, as it must always > > succeed in > > -- > > 2.25.1
On 22 Nov 2022, at 16:10, Finn, Emma wrote: >> -----Original Message----- >> From: Eelco Chaudron <echaudro@redhat.com> >> Sent: Thursday 17 November 2022 09:21 >> To: Finn, Emma <emma.finn@intel.com> >> Cc: dev@openvswitch.org; david.marchand@redhat.com; >> i.maximets@ovn.org >> Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of >> set_masked IPv6 action >> >> On 26 Sep 2022, at 15:29, Emma Finn wrote: >> >>> This commit adds support for the AVX512 implementation of the >>> ipv6_set_addrs action as well as an AVX512 implementation of updating >>> the L4 checksums. >>> >>> Signed-off-by: Emma Finn <emma.finn@intel.com> >> >> Hi Emma, >> >> Thanks for further enhancing the implementation of the AVX512 actions. >> Below are some comments, mostly style related, but with one additional >> optimization. >> >> Cheers, >> >> Eelco >> > > Thanks for the review Eelco. Sure, I will clean up and change all the style related comments. > Some other replies inline below. See inline comments below… >>> --- >>> v3: >>> - Added a runtime check for AVX512 vbmi. >>> v2: >>> - Added check for availbility of s6_addr32 field of struct in6_addr. >>> - Fixed network headers for freebsd builds. >>> --- >>> --- >>> lib/odp-execute-avx512.c | 176 >>> ++++++++++++++++++++++++++++++++++++++ >>> lib/odp-execute-private.c | 17 ++++ >>> lib/odp-execute-private.h | 1 + >>> 3 files changed, 194 insertions(+) >>> >>> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index >>> 6c7713251..f97b3c2f7 100644 >>> --- a/lib/odp-execute-avx512.c >>> +++ b/lib/odp-execute-avx512.c >>> @@ -20,6 +20,9 @@ >>> >>> #include <config.h> >>> #include <errno.h> >>> +#include <sys/types.h> >>> +#include <netinet/in.h> >>> +#include <netinet/ip6.h> >>> >>> #include "csum.h" >>> #include "dp-packet.h" >>> @@ -483,6 +486,172 @@ action_avx512_ipv4_set_addrs(struct >> dp_packet_batch *batch, >>> } >>> } >>> >>> +#if HAVE_AVX512VBMI >>> +static inline uint16_t ALWAYS_INLINE >>> +__attribute__((__target__("avx512vbmi"))) >>> +avx512_ipv6_get_delta(__m512i ip6_header) >> >> I guess the function name was from before you split up this function, as it's >> not at all what you're doing in this function. >> I would suggest changing the name to something like >> avx512_ipv6_sum_header(). >> >> Also, can you go over the register naming and comment text below, as they >> also make no sense in the current form? >> >>> +{ >>> + __m256i v_zeros = _mm256_setzero_si256(); >>> + __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04, >>> + 0xFF, 0xFF, 0xFF, >>> +0xFF); >>> + >>> + __m512i v_header = _mm512_permutexvar_epi64(v_shuf_src_dst, >> ip6_header); >>> + __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_header, 0); >> >> Remove the extra space after the equal sign. >> >> Please add a new line before the comment. >> >>> + /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle >> the >>> + * src and dst fields and add padding after each 16-bit value for the >>> + * following carry over addition. */ >>> + __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, >> 0xFFFF, >>> + 0x0504, 0xFFFF, 0x0706, 0xFFFF, >>> + 0x0100, 0xFFFF, 0x0302, 0xFFFF, >>> + 0x0504, 0xFFFF, 0x0706, 0xFFFF); >>> + __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, >> 0xFFFF, >>> + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF, >>> + 0x0908, 0xFFFF, 0x0B0A, 0xFFFF, >>> + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF); >>> + __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, >> v_swap16a); >>> + __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, >>> + v_swap16b); >>> + >>> + /* Add each part of the old and new headers together. */ >>> + __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2); >>> + >>> + /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */ >>> + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); >>> + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); >>> + >>> + /* Shuffle 32-bit value from 3rd lane into first lane for final >>> + * horizontal add. */ >>> + __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, >>> + 0xF, 0xF, 0xF, 0xF); >>> + v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); >>> + >>> + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); >>> + v_delta = _mm256_hadd_epi16(v_delta, v_zeros); >>> + >>> + /* Extract delta value. */ >>> + return _mm256_extract_epi16(v_delta, 0); } >>> + >>> +static inline uint16_t ALWAYS_INLINE >>> +__attribute__((__target__("avx512vbmi"))) >>> +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i >> new_header) { >>> + uint16_t delta; >>> + uint16_t old_delta = avx512_ipv6_get_delta(old_header); >>> + uint16_t new_delta = avx512_ipv6_get_delta(new_header); >>> + old_delta = ~old_delta; >>> + uint32_t csum_delta = old_delta + new_delta; >>> + delta = csum_finish(csum_delta); >>> + >>> + return ~delta; >> >> This function looks rather cluttered, what about the following: >> >> { >> uint16_t old_delta = avx512_ipv6_get_delta(old_header); >> uint16_t new_delta = avx512_ipv6_get_delta(new_header); >> uint32_t csum_delta = ~old_delta + new_delta; >> > Sure I will tidy this up. The above line will need to have cast around old_delta though. > Otherwise it will invert 32bits instead of 16bits of old_delta, which will make the addition wrong. > Something like this: > uint32_t csum_delta = (uint16_t)~old_delta + new_delta; That should be fine, maybe explicitly mark the part you are interested in, saw this in other code: uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta; > >> return ~csum_finish(csum_delta); >> } >> >>> +} >>> + >>> +/* This function performs the same operation on each packet in the >>> +batch as >>> + * the scalar odp_set_ipv6() function. */ static void >>> +__attribute__((__target__("avx512vbmi"))) >>> +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch, >>> + const struct nlattr *a) { >>> + const struct ovs_key_ipv6 *key, *mask; >>> + struct dp_packet *packet; >> >> Add a new line between definitions and code. >> >>> + a = nl_attr_get(a); >>> + key = nl_attr_get(a); >>> + mask = odp_get_key_mask(a, struct ovs_key_ipv6); >> >> We have build asserts for the ovs_key_ipv4 key structure to make sure they >> do not change, we should add the same for v6. >> >>> + >>> + /* Read the content of the key and mask in the respective registers. We >>> + * only load the size of the actual structure, which is only 40 bytes. */ >>> + __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key); >>> + __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask); >>> + >>> + /* This shuffle mask v_shuffle, is to shuffle key and mask to match the >>> + * ip6_hdr structure layout. */ >>> + static const uint8_t ip_shuffle_mask[64] = { >>> + 0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26, >>> + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, >>> + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, >>> + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, >>> + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, >>> + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF, >>> + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, >>> + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF >>> + }; >>> + >>> + __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask); >>> + >>> + /* This shuffle is required for key and mask to match the layout of the >>> + * ip6_hdr struct. */ >>> + __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key); >>> + __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, >> v_mask); >>> + >>> + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { >>> + struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet); >>> + >>> + /* Load the 40 bytes of the IPv6 header. */ >>> + __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) >>> + nh); >>> + >>> + /* AND the v_pkt_mask to the packet data (v_packet). */ >>> + __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, >>> + v_packet); >>> + >>> + /* OR the new addresses (v_key_shuf) with the masked packet >> addresses >>> + * (v_pkt_masked). */ >>> + __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, >>> + v_pkt_masked); >>> + >>> + /* If ip6_src or ip6_dst has been modified, L4 checksum needs to >>> + * be updated. */ >>> + bool do_checksum = false; >> >> So as we are trying to optimise code, this only needs to be done once, so we >> should move this outside the DP_PACKET_BATCH_FOR_EACH() loop. >> > The compiler is likely already lifting that code out of the loop as it is loop invariant. > But moving outside the loop will make it clearer, so yes will do. Thanks, it’s just my autistic brain not trusting the optimizer ;) >>> +#ifdef s6_addr32 >>> + for (int j = 0; j < 4; j++) { >>> + if (mask->ipv6_dst.s6_addr32[j] || mask->ipv6_src.s6_addr32[j]) { >>> + do_checksum = true; >>> + } >>> + } >>> +#else >>> + for (int j = 0; j < 16; j++) { >>> + if (mask->ipv6_dst.s6_addr[j] || mask->ipv6_src.s6_addr[j]) { >>> + do_checksum = true; >>> + } >>> + } >>> +#endif >> >> Not sure how fast slow/fast the above is compared with doing an AVX512 >> AND on the v_mask with a new v_address_mask + popcount? >> > Ah yes good idea. To keep the same behaviour, I think we want to bitwise-OR the two registers (not bitwise-AND) and then check if any bits are set? > If any bit is set in src/dst mask, we need to do_checksum. > Something like this > v_dst = Loadu_si128(dst) > v_src = Loadu_si128(src) > v_or = _or_si128(v_dst, v_src) > > /* generate all ones register from cmpeq of v_zeros vs itself? */ > v_zeros = _setzero_si128() > v_all_ones = _cmpeq_epi(v_zeros, v_zeros); > int do_checksum = _mm_test_all_zeros(v_or, v_all_ones); > > Does this approach make sense to you? Yes perfectly, I was not aware of the _mm_test_all_zeros() which saves the popcount ;) One comment here is that do_checksum should be a bool type, something like bool do_checksum = !!_mm_test_all_zeros(v_or, v_all_ones); >>> + if (do_checksum) { >>> + uint8_t proto = nh->ip6_nxt; >>> + uint16_t delta_checksum = >> avx512_ipv6_addr_csum_delta(v_packet, >>> + >>> + v_new_hdr); >>> + >>> + if (proto == IPPROTO_UDP) { >>> + struct udp_header *uh = dp_packet_l4(packet); >> >> Add a new line here. >> >>> + if (uh->udp_csum) { >>> + uint16_t old_udp_checksum = ~uh->udp_csum; >>> + uint32_t udp_checksum = old_udp_checksum + >>> + delta_checksum; >> >> Add a new line here. >> >>> + udp_checksum = csum_finish(udp_checksum); >>> + >>> + if (!udp_checksum) { >>> + udp_checksum = htons(0xffff); >>> + } >>> + >>> + uh->udp_csum = udp_checksum; >>> + } >>> + } else if (proto == IPPROTO_TCP) { >>> + struct tcp_header *th = dp_packet_l4(packet); >>> + uint16_t old_tcp_checksum = ~th->tcp_csum; >>> + uint32_t tcp_checksum = old_tcp_checksum + >>> + delta_checksum; >> >> Add a new line here. >> >>> + tcp_checksum = csum_finish(tcp_checksum); >>> + >> >> Remove the new line. >> >>> + th->tcp_csum = tcp_checksum; >>> + } else if (proto == IPPROTO_ICMPV6) { >>> + struct icmp6_header *icmp = dp_packet_l4(packet); >>> + uint16_t old_icmp_checksum = ~icmp->icmp6_cksum; >> >> Keep name consistency, so I would call it old_icmp6_checksum. >> >>> + uint32_t icmp6_checksum = old_icmp_checksum + >>> + delta_checksum; >> >> Add a new line here. >> >>> + icmp6_checksum = csum_finish(icmp6_checksum); >>> + >> >> Remove the new line. >> >>> + icmp->icmp6_cksum = icmp6_checksum; >>> + } >>> + } >>> + /* Write back the modified IPv6 addresses. */ >>> + _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr); >>> + } >>> +} >>> +#endif >> >> As the #if is way out of sight, I would make this +#endif /* >> HAVE_AVX512VBMI */ >> >>> + >>> static void >>> action_avx512_set_masked(struct dp_packet_batch *batch, const struct >>> nlattr *a) { @@ -514,6 +683,13 @@ action_avx512_init(struct >>> odp_execute_action_impl *self OVS_UNUSED) >>> impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = >> action_avx512_eth_set_addrs; >>> impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = >>> action_avx512_ipv4_set_addrs; >>> >>> +#if HAVE_AVX512VBMI >>> + if (action_avx512vbmi_isa_probe()) { >>> + impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] = >>> + action_avx512_ipv6_set_addrs; >>> + } >>> +#endif >>> + >>> return 0; >>> } >>> >>> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c >>> index f80ae5a23..ff29e116f 100644 >>> --- a/lib/odp-execute-private.c >>> +++ b/lib/odp-execute-private.c >>> @@ -60,6 +60,23 @@ action_avx512_isa_probe(void) >>> >>> #endif >>> >>> +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI bool >>> +action_avx512vbmi_isa_probe(void) >>> +{ >>> + if (!cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) { >>> + return false; >>> + } >>> + return true; >>> +} >> >> just a nit, but I would make this as follows: >> >> bool >> action_avx512vbmi_isa_probe(void) >> { >> if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) { >> return true; >> } >> return false; >> } >> >>> +#else >>> +bool >>> +action_avx512vbmi_isa_probe(void) >>> +{ >>> + return false; >>> +} >>> +#endif >>> + >>> static struct odp_execute_action_impl action_impls[] = { >>> [ACTION_IMPL_AUTOVALIDATOR] = { >>> .available = false, >>> diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h >>> index 940180c99..643f41c2a 100644 >>> --- a/lib/odp-execute-private.h >>> +++ b/lib/odp-execute-private.h >>> @@ -78,6 +78,7 @@ >> BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1); >>> #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1) >>> >>> bool action_avx512_isa_probe(void); >>> +bool action_avx512vbmi_isa_probe(void); >>> >>> /* Odp execute init handles setting up the state of the actions functions at >>> * initialization time. It cannot return errors, as it must always >>> succeed in >>> -- >>> 2.25.1
> -----Original Message----- > From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron > Sent: Wednesday, November 23, 2022 1:55 PM > To: Finn, Emma <emma.finn@intel.com> > Cc: dev@openvswitch.org; david.marchand@redhat.com; i.maximets@ovn.org > Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of set_masked IPv6 > action <snip> > > Something like this > > v_dst = Loadu_si128(dst) > > v_src = Loadu_si128(src) > > v_or = _or_si128(v_dst, v_src) > > > > /* generate all ones register from cmpeq of v_zeros vs itself? */ > > v_zeros = _setzero_si128() > > v_all_ones = _cmpeq_epi(v_zeros, v_zeros); > > int do_checksum = _mm_test_all_zeros(v_or, v_all_ones); > > > > Does this approach make sense to you? > > Yes perfectly, I was not aware of the _mm_test_all_zeros() which saves the > popcount ;) > > One comment here is that do_checksum should be a bool type, something like > > bool do_checksum = !!_mm_test_all_zeros(v_or, v_all_ones); In the interest of micro-optimization discussions, we'd need to check if the resulting ASM is the same... Branching on a value is usually a "test" with a register/register, or register/constant, and that sets the "flags" register. Note that the test_all_zeros() *already* sets the flags register! https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html?wapkw=intrinsics%20guide#text=mm_test_all_zero&ig_expand=7187 By taking the result, doing the bitwise !! ops , and branching on the result, it might force the compiler into emitting a bunch of noisy-not-useful instructions. The test_all_zeros() isn't just a bypass of the popcnt instruction, it also avoids the "test" with a register to set flags register. By having set the ZF (zero-flag) we can JumpZero (JZ instruction) or JNZ (JumpNotZero) on the result of it, no GPR register usage. Given this code is x86 specific anyway, I don't see value add from the bool type and !! trick to canonicalize the "any value" to 0 or 1. If the ASM generated is the same, I'm OK with either approach, just noting the micro-optimization around test/flags-register. Regards, -Harry
On 23 Nov 2022, at 15:05, Van Haaren, Harry wrote: >> -----Original Message----- >> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron >> Sent: Wednesday, November 23, 2022 1:55 PM >> To: Finn, Emma <emma.finn@intel.com> >> Cc: dev@openvswitch.org; david.marchand@redhat.com; i.maximets@ovn.org >> Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of set_masked IPv6 >> action > > <snip> > >>> Something like this >>> v_dst = Loadu_si128(dst) >>> v_src = Loadu_si128(src) >>> v_or = _or_si128(v_dst, v_src) >>> >>> /* generate all ones register from cmpeq of v_zeros vs itself? */ >>> v_zeros = _setzero_si128() >>> v_all_ones = _cmpeq_epi(v_zeros, v_zeros); >>> int do_checksum = _mm_test_all_zeros(v_or, v_all_ones); >>> >>> Does this approach make sense to you? >> >> Yes perfectly, I was not aware of the _mm_test_all_zeros() which saves the >> popcount ;) >> >> One comment here is that do_checksum should be a bool type, something like >> >> bool do_checksum = !!_mm_test_all_zeros(v_or, v_all_ones); > > In the interest of micro-optimization discussions, we'd need to check if the resulting ASM is the same... > Branching on a value is usually a "test" with a register/register, or register/constant, and that sets the "flags" register. > > Note that the test_all_zeros() *already* sets the flags register! > https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html?wapkw=intrinsics%20guide#text=mm_test_all_zero&ig_expand=7187 > > By taking the result, doing the bitwise !! ops , and branching on the result, it might force the compiler into emitting a > bunch of noisy-not-useful instructions. > > The test_all_zeros() isn't just a bypass of the popcnt instruction, it also avoids the "test" with a register to set flags register. > By having set the ZF (zero-flag) we can JumpZero (JZ instruction) or JNZ (JumpNotZero) on the result of it, no GPR register usage. > > Given this code is x86 specific anyway, I don't see value add from the bool type and !! trick to canonicalize the "any value" to 0 or 1. > If the ASM generated is the same, I'm OK with either approach, just noting the micro-optimization around test/flags-register. Lets see the asm, if we do keep int we should add a comment. But as this code will move outside the loop, I assume the flag register will be cleared out before it hits this in the loop. //Eelco
> -----Original Message----- > From: Eelco Chaudron <echaudro@redhat.com> > Sent: Wednesday 23 November 2022 14:14 > To: Van Haaren, Harry <harry.van.haaren@intel.com> > Cc: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org; > david.marchand@redhat.com; i.maximets@ovn.org > Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of > set_masked IPv6 action > > > > On 23 Nov 2022, at 15:05, Van Haaren, Harry wrote: > > >> -----Original Message----- > >> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco > >> Chaudron > >> Sent: Wednesday, November 23, 2022 1:55 PM > >> To: Finn, Emma <emma.finn@intel.com> > >> Cc: dev@openvswitch.org; david.marchand@redhat.com; > >> i.maximets@ovn.org > >> Subject: Re: [ovs-dev] [v3] odp-execute: Add ISA implementation of > >> set_masked IPv6 action > > > > <snip> > > > >>> Something like this > >>> v_dst = Loadu_si128(dst) > >>> v_src = Loadu_si128(src) > >>> v_or = _or_si128(v_dst, v_src) > >>> > >>> /* generate all ones register from cmpeq of v_zeros vs itself? */ > >>> v_zeros = _setzero_si128() > >>> v_all_ones = _cmpeq_epi(v_zeros, v_zeros); > >>> int do_checksum = _mm_test_all_zeros(v_or, v_all_ones); > >>> > >>> Does this approach make sense to you? > >> > >> Yes perfectly, I was not aware of the _mm_test_all_zeros() which > >> saves the popcount ;) > >> > >> One comment here is that do_checksum should be a bool type, > something > >> like > >> > >> bool do_checksum = !!_mm_test_all_zeros(v_or, v_all_ones); > > > > In the interest of micro-optimization discussions, we'd need to check if the > resulting ASM is the same... > > Branching on a value is usually a "test" with a register/register, or > register/constant, and that sets the "flags" register. > > > > Note that the test_all_zeros() *already* sets the flags register! > > https://www.intel.com/content/www/us/en/docs/intrinsics- > guide/index.ht > > ml?wapkw=intrinsics%20guide#text=mm_test_all_zero&ig_expand=7187 > > > > By taking the result, doing the bitwise !! ops , and branching on the > > result, it might force the compiler into emitting a bunch of noisy-not-useful > instructions. > > > > The test_all_zeros() isn't just a bypass of the popcnt instruction, it also > avoids the "test" with a register to set flags register. > > By having set the ZF (zero-flag) we can JumpZero (JZ instruction) or JNZ > (JumpNotZero) on the result of it, no GPR register usage. > > > > Given this code is x86 specific anyway, I don't see value add from the bool > type and !! trick to canonicalize the "any value" to 0 or 1. > > If the ASM generated is the same, I'm OK with either approach, just noting > the micro-optimization around test/flags-register. > > Lets see the asm, if we do keep int we should add a comment. But as this > code will move outside the loop, I assume the flag register will be cleared out > before it hits this in the loop. > > //Eelco Let's change this to a bool type. I will send v4 of this patch shortly with all these changes. Thanks, Emma
diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index 6c7713251..f97b3c2f7 100644 --- a/lib/odp-execute-avx512.c +++ b/lib/odp-execute-avx512.c @@ -20,6 +20,9 @@ #include <config.h> #include <errno.h> +#include <sys/types.h> +#include <netinet/in.h> +#include <netinet/ip6.h> #include "csum.h" #include "dp-packet.h" @@ -483,6 +486,172 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch, } } +#if HAVE_AVX512VBMI +static inline uint16_t ALWAYS_INLINE +__attribute__((__target__("avx512vbmi"))) +avx512_ipv6_get_delta(__m512i ip6_header) +{ + __m256i v_zeros = _mm256_setzero_si256(); + __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04, + 0xFF, 0xFF, 0xFF, 0xFF); + + __m512i v_header = _mm512_permutexvar_epi64(v_shuf_src_dst, ip6_header); + __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_header, 0); + /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the + * src and dst fields and add padding after each 16-bit value for the + * following carry over addition. */ + __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF, + 0x0504, 0xFFFF, 0x0706, 0xFFFF, + 0x0100, 0xFFFF, 0x0302, 0xFFFF, + 0x0504, 0xFFFF, 0x0706, 0xFFFF); + __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF, + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF, + 0x0908, 0xFFFF, 0x0B0A, 0xFFFF, + 0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF); + __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a); + __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b); + + /* Add each part of the old and new headers together. */ + __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2); + + /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */ + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + + /* Shuffle 32-bit value from 3rd lane into first lane for final + * horizontal add. */ + __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF, + 0xF, 0xF, 0xF, 0xF); + v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta); + + v_delta = _mm256_hadd_epi32(v_delta, v_zeros); + v_delta = _mm256_hadd_epi16(v_delta, v_zeros); + + /* Extract delta value. */ + return _mm256_extract_epi16(v_delta, 0); +} + +static inline uint16_t ALWAYS_INLINE +__attribute__((__target__("avx512vbmi"))) +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header) +{ + uint16_t delta; + uint16_t old_delta = avx512_ipv6_get_delta(old_header); + uint16_t new_delta = avx512_ipv6_get_delta(new_header); + old_delta = ~old_delta; + uint32_t csum_delta = old_delta + new_delta; + delta = csum_finish(csum_delta); + + return ~delta; +} + +/* This function performs the same operation on each packet in the batch as + * the scalar odp_set_ipv6() function. */ +static void +__attribute__((__target__("avx512vbmi"))) +action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch, + const struct nlattr *a) +{ + const struct ovs_key_ipv6 *key, *mask; + struct dp_packet *packet; + a = nl_attr_get(a); + key = nl_attr_get(a); + mask = odp_get_key_mask(a, struct ovs_key_ipv6); + + /* Read the content of the key and mask in the respective registers. We + * only load the size of the actual structure, which is only 40 bytes. */ + __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key); + __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask); + + /* This shuffle mask v_shuffle, is to shuffle key and mask to match the + * ip6_hdr structure layout. */ + static const uint8_t ip_shuffle_mask[64] = { + 0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, + 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF + }; + + __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask); + + /* This shuffle is required for key and mask to match the layout of the + * ip6_hdr struct. */ + __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key); + __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask); + + DP_PACKET_BATCH_FOR_EACH (i, packet, batch) { + struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet); + + /* Load the 40 bytes of the IPv6 header. */ + __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh); + + /* AND the v_pkt_mask to the packet data (v_packet). */ + __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet); + + /* OR the new addresses (v_key_shuf) with the masked packet addresses + * (v_pkt_masked). */ + __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked); + + /* If ip6_src or ip6_dst has been modified, L4 checksum needs to + * be updated. */ + bool do_checksum = false; +#ifdef s6_addr32 + for (int j = 0; j < 4; j++) { + if (mask->ipv6_dst.s6_addr32[j] || mask->ipv6_src.s6_addr32[j]) { + do_checksum = true; + } + } +#else + for (int j = 0; j < 16; j++) { + if (mask->ipv6_dst.s6_addr[j] || mask->ipv6_src.s6_addr[j]) { + do_checksum = true; + } + } +#endif + if (do_checksum) { + uint8_t proto = nh->ip6_nxt; + uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet, + v_new_hdr); + + if (proto == IPPROTO_UDP) { + struct udp_header *uh = dp_packet_l4(packet); + if (uh->udp_csum) { + uint16_t old_udp_checksum = ~uh->udp_csum; + uint32_t udp_checksum = old_udp_checksum + delta_checksum; + udp_checksum = csum_finish(udp_checksum); + + if (!udp_checksum) { + udp_checksum = htons(0xffff); + } + + uh->udp_csum = udp_checksum; + } + } else if (proto == IPPROTO_TCP) { + struct tcp_header *th = dp_packet_l4(packet); + uint16_t old_tcp_checksum = ~th->tcp_csum; + uint32_t tcp_checksum = old_tcp_checksum + delta_checksum; + tcp_checksum = csum_finish(tcp_checksum); + + th->tcp_csum = tcp_checksum; + } else if (proto == IPPROTO_ICMPV6) { + struct icmp6_header *icmp = dp_packet_l4(packet); + uint16_t old_icmp_checksum = ~icmp->icmp6_cksum; + uint32_t icmp6_checksum = old_icmp_checksum + delta_checksum; + icmp6_checksum = csum_finish(icmp6_checksum); + + icmp->icmp6_cksum = icmp6_checksum; + } + } + /* Write back the modified IPv6 addresses. */ + _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr); + } +} +#endif + static void action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a) { @@ -514,6 +683,13 @@ action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED) impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs; impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs; +#if HAVE_AVX512VBMI + if (action_avx512vbmi_isa_probe()) { + impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] = + action_avx512_ipv6_set_addrs; + } +#endif + return 0; } diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c index f80ae5a23..ff29e116f 100644 --- a/lib/odp-execute-private.c +++ b/lib/odp-execute-private.c @@ -60,6 +60,23 @@ action_avx512_isa_probe(void) #endif +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI +bool +action_avx512vbmi_isa_probe(void) +{ + if (!cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) { + return false; + } + return true; +} +#else +bool +action_avx512vbmi_isa_probe(void) +{ + return false; +} +#endif + static struct odp_execute_action_impl action_impls[] = { [ACTION_IMPL_AUTOVALIDATOR] = { .available = false, diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h index 940180c99..643f41c2a 100644 --- a/lib/odp-execute-private.h +++ b/lib/odp-execute-private.h @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1); #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1) bool action_avx512_isa_probe(void); +bool action_avx512vbmi_isa_probe(void); /* Odp execute init handles setting up the state of the actions functions at * initialization time. It cannot return errors, as it must always succeed in
This commit adds support for the AVX512 implementation of the ipv6_set_addrs action as well as an AVX512 implementation of updating the L4 checksums. Signed-off-by: Emma Finn <emma.finn@intel.com> --- v3: - Added a runtime check for AVX512 vbmi. v2: - Added check for availbility of s6_addr32 field of struct in6_addr. - Fixed network headers for freebsd builds. --- --- lib/odp-execute-avx512.c | 176 ++++++++++++++++++++++++++++++++++++++ lib/odp-execute-private.c | 17 ++++ lib/odp-execute-private.h | 1 + 3 files changed, 194 insertions(+)