diff mbox series

[ovs-dev,v6] odp-execute: Add ISA implementation of set_masked IPv6 action

Message ID 20221130155717.2674414-1-emma.finn@intel.com
State Superseded
Headers show
Series [ovs-dev,v6] odp-execute: Add ISA implementation of set_masked IPv6 action | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Emma Finn Nov. 30, 2022, 3:57 p.m. UTC
This commit adds support for the AVX512 implementation of the
ipv6_set_addrs action as well as an AVX512 implementation of
updating the L4 checksums.

Signed-off-by: Emma Finn <emma.finn@intel.com>

---
v6:
 - Added check for ipv6 extension headers.
v5:
  - Fixed load for ip6 src and dst mask for checksum check.
v4:
  - Reworked and moved check for checksum outside loop.
  - Code cleanup based on review from Eelco.
v3:
  - Added a runtime check for AVX512 vbmi.
v2:
  - Added check for availbility of s6_addr32 field of struct in6_addr.
  - Fixed network headers for freebsd builds.
---
---
 lib/odp-execute-avx512.c  | 217 ++++++++++++++++++++++++++++++++++++++
 lib/odp-execute-private.c |  14 +++
 lib/odp-execute-private.h |   1 +
 lib/packets.c             |   2 +-
 lib/packets.h             |   2 +
 5 files changed, 235 insertions(+), 1 deletion(-)

Comments

Ilya Maximets Dec. 2, 2022, 2:21 p.m. UTC | #1
On 11/30/22 16:57, Emma Finn wrote:
> This commit adds support for the AVX512 implementation of the
> ipv6_set_addrs action as well as an AVX512 implementation of
> updating the L4 checksums.
> 
> Signed-off-by: Emma Finn <emma.finn@intel.com>

Hi.  Thanks for the updated version!
Could you also provide some performance numbers in the commit
message?  Performance related patches should typically have some.

Some comments inline.  There is also a bug in ipv4 implementation.

> 
> ---
> v6:
>  - Added check for ipv6 extension headers.
> v5:
>   - Fixed load for ip6 src and dst mask for checksum check.
> v4:
>   - Reworked and moved check for checksum outside loop.
>   - Code cleanup based on review from Eelco.
> v3:
>   - Added a runtime check for AVX512 vbmi.
> v2:
>   - Added check for availbility of s6_addr32 field of struct in6_addr.
>   - Fixed network headers for freebsd builds.
> ---
> ---
>  lib/odp-execute-avx512.c  | 217 ++++++++++++++++++++++++++++++++++++++
>  lib/odp-execute-private.c |  14 +++
>  lib/odp-execute-private.h |   1 +
>  lib/packets.c             |   2 +-
>  lib/packets.h             |   2 +
>  5 files changed, 235 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index 6c7713251..87dae6d05 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -20,6 +20,9 @@
>  
>  #include <config.h>
>  #include <errno.h>
> +#include <sys/types.h>
> +#include <netinet/in.h>
> +#include <netinet/ip6.h>
>  
>  #include "csum.h"
>  #include "dp-packet.h"
> @@ -28,6 +31,7 @@
>  #include "odp-execute-private.h"
>  #include "odp-netlink.h"
>  #include "openvswitch/vlog.h"
> +#include "packets.h"
>  
>  VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);
>  
> @@ -75,6 +79,26 @@ BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
>                    MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
>                    offsetof(struct ovs_key_ipv4, ipv4_ttl));
>  
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_src) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_src) ==
> +                  offsetof(struct ovs_key_ipv6, ipv6_dst));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_dst) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_dst) ==
> +                  offsetof(struct ovs_key_ipv6, ipv6_label));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_label) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_label) ==
> +                  offsetof(struct ovs_key_ipv6, ipv6_proto));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_proto) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_proto) ==
> +                  offsetof(struct ovs_key_ipv6, ipv6_tclass));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_tclass) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_tclass) ==
> +                  offsetof(struct ovs_key_ipv6, ipv6_hlimit));
> +
>  /* Array of callback functions, one for each masked operation. */
>  odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX];
>  
> @@ -483,6 +507,193 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
>      }
>  }
>  
> +#if HAVE_AVX512VBMI
> +static inline uint16_t ALWAYS_INLINE
> +__attribute__((__target__("avx512vbmi")))
> +avx512_ipv6_sum_header(__m512i ip6_header)
> +{
> +    __m256i v_zeros = _mm256_setzero_si256();
> +    __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04,
> +                                               0xFF, 0xFF, 0xFF, 0xFF);
> +
> +    /* Shuffle ip6 src and dst to beginning of register. */
> +    __m512i v_ip6_hdr_shuf = _mm512_permutexvar_epi64(v_shuf_src_dst,
> +                                                      ip6_header);
> +
> +    /* Extract ip6 src and dst into smaller 256-bit wide register. */
> +    __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_ip6_hdr_shuf, 0);
> +
> +    /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the
> +     * src and dst fields and add padding after each 16-bit value for the
> +     * following carry over addition. */
> +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF,
> +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF,
> +                                          0x0100, 0xFFFF, 0x0302, 0xFFFF,
> +                                          0x0504, 0xFFFF, 0x0706, 0xFFFF);
> +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
> +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
> +                                          0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
> +                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF);
> +    __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a);
> +    __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b);
> +
> +    /* Add each part of the old and new headers together. */
> +    __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2);
> +
> +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +
> +    /* Shuffle 32-bit value from 3rd lane into first lane for final
> +     * horizontal add. */
> +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> +                                          0xF, 0xF, 0xF, 0xF);
> +
> +    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
> +    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
> +    v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
> +
> +    /* Extract delta value. */
> +    return _mm256_extract_epi16(v_delta, 0);
> +}
> +
> +static inline uint16_t ALWAYS_INLINE
> +__attribute__((__target__("avx512vbmi")))
> +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header)
> +{
> +    uint16_t old_delta = avx512_ipv6_sum_header(old_header);
> +    uint16_t new_delta = avx512_ipv6_sum_header(new_header);
> +    uint32_t csum_delta = ((uint16_t)~old_delta) + new_delta;

Is the cast necessary here?  The 'old_delta' is uint16_t.  The bit inversion
should not change the type, right?

> +
> +    return  ~csum_finish(csum_delta);

One too many spaces after 'return'.

> +}
> +
> +/* This function performs the same operation on each packet in the batch as
> + * the scalar odp_set_ipv6() function. */
> +static void
> +__attribute__((__target__("avx512vbmi")))
> +action_avx512_set_ipv6(struct dp_packet_batch *batch, const struct nlattr *a)
> +{
> +    const struct ovs_key_ipv6 *key, *mask;
> +    struct dp_packet *packet;
> +
> +    a = nl_attr_get(a);
> +    key = nl_attr_get(a);
> +    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
> +
> +    /* Read the content of the key and mask in the respective registers. We
> +     * only load the size of the actual structure, which is only 40 bytes. */
> +    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
> +    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
> +
> +    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
> +     * ip6_hdr structure layout. */
> +    static const uint8_t ip_shuffle_mask[64] = {
> +            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
> +            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> +            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
> +            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
> +            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF

These are overindented.  Should be moved 4 spaces to the left.

> +    };
> +
> +    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
> +
> +    /* This shuffle is required for key and mask to match the layout of the
> +     * ip6_hdr struct. */
> +    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
> +    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);
> +
> +    /* Set the v_zero register to all zero's. */
> +    const __m128i v_zeros = _mm_setzero_si128();
> +
> +    /* Set the v_all_ones register to all one's. */
> +    const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
> +
> +    /* Load ip6 src and dst masks respectively into 128-bit wide registers. */
> +    __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
> +    __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);
> +
> +    /* Perform a bitwise OR between src and dst registers. */
> +    __m128i v_or = _mm_or_si128(v_src, v_dst);
> +
> +    /* Will return true if any bit has been set in v_or, else it will return
> +     * false. */
> +    bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones);
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
> +
> +        /* Load the 40 bytes of the IPv6 header. */
> +        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh);
> +
> +        /* AND the v_pkt_mask to the packet data (v_packet). */
> +        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet);
> +
> +        /* OR the new addresses (v_key_shuf) with the masked packet addresses
> +         * (v_pkt_masked). */
> +        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked);
> +
> +        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to be
> +         * updated. */
> +        uint8_t proto = 0;
> +        bool rh_present;
> +
> +        rh_present = packet_rh_present(packet, &proto, &do_checksum);

Hmm, the 'do_checksum' is global for all packets in a batch.
packet_rh_present() will overwrite the value.

> +
> +        if (do_checksum) {
> +            uint16_t delta_checksum;
> +            __m512i v_new_hdr_for_cksum = v_new_hdr;

Reverse x-mass tree.

> +
> +            /* In case of routing header being present, checksum should not be
> +             * updated for the destination address. */
> +            if (rh_present) {
> +                v_new_hdr_for_cksum = _mm512_mask_blend_epi64(0x18, v_new_hdr,
> +                                                              v_packet);
> +            }
> +
> +            delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
> +                                                         v_new_hdr_for_cksum);
> +
> +            if (proto == IPPROTO_UDP) {
> +                struct udp_header *uh = dp_packet_l4(packet);
> +
> +                if (uh->udp_csum) {
> +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> +                    uint32_t udp_checksum = old_udp_checksum + delta_checksum;
> +
> +                    udp_checksum = csum_finish(udp_checksum);
> +
> +                    if (!udp_checksum) {
> +                        udp_checksum = htons(0xffff);
> +                    }
> +
> +                    uh->udp_csum = udp_checksum;
> +                }
> +            } else if (proto == IPPROTO_TCP) {
> +                struct tcp_header *th = dp_packet_l4(packet);
> +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> +                uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
> +
> +                tcp_checksum = csum_finish(tcp_checksum);
> +                th->tcp_csum = tcp_checksum;
> +            } else if (proto == IPPROTO_ICMPV6) {
> +                struct icmp6_header *icmp = dp_packet_l4(packet);
> +                uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
> +                uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum;
> +
> +                icmp6_checksum = csum_finish(icmp6_checksum);
> +                icmp->icmp6_cksum = icmp6_checksum;
> +            }
> +        }
> +        /* Write back the modified IPv6 addresses. */
> +        _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);

I think, there supposed to be a pkt_metadata_init_conn(&packet->md) call,
otherwise we may misuse outdated conntrack connection later.

Hmm, action_avx512_ipv4_set_addrs() seems to have the same problem in it.

Comparision of the packet metadata should, probably, be added to the
actions autovalidator.

You can verify that conntrack is broken by running:

  $ make check-system-userspace TESTSUITEFLAGS='-k negative'

But adding the line 'ovs-appctl odp-execute/action-impl-set avx512' to
the 'conntrack - negative test for recirculation optimization' test
beforehand.

The test doesn't fail with just autovalidator, because autovalidator
doesn't compare packet metadata, i.e. the packet->md.conn field,
and packets that were handled by generic scalar implementation are
actually used for later procesing, not ones changed by the avx512
implementation.

> +    }
> +}
> +#endif /* HAVE_AVX512VBMI */
> +
>  static void
>  action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a)
>  {
> @@ -514,6 +725,12 @@ action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
>      impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs;
>      impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs;
>  
> +#if HAVE_AVX512VBMI
> +    if (action_avx512vbmi_isa_probe()) {
> +        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] = action_avx512_set_ipv6;
> +    }
> +#endif
> +
>      return 0;
>  }
>  
> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> index f80ae5a23..c2f7dce6b 100644
> --- a/lib/odp-execute-private.c
> +++ b/lib/odp-execute-private.c
> @@ -60,6 +60,20 @@ action_avx512_isa_probe(void)
>  
>  #endif
>  
> +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> +    return cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI);
> +}
> +#else
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> +    return false;
> +}
> +#endif
> +
>  static struct odp_execute_action_impl action_impls[] = {
>      [ACTION_IMPL_AUTOVALIDATOR] = {
>          .available = false,
> diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
> index 940180c99..643f41c2a 100644
> --- a/lib/odp-execute-private.h
> +++ b/lib/odp-execute-private.h
> @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
>  #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
>  
>  bool action_avx512_isa_probe(void);
> +bool action_avx512vbmi_isa_probe(void);
>  
>  /* Odp execute init handles setting up the state of the actions functions at
>   * initialization time. It cannot return errors, as it must always succeed in
> diff --git a/lib/packets.c b/lib/packets.c
> index 1dcd4a6fc..06f516cb1 100644
> --- a/lib/packets.c
> +++ b/lib/packets.c
> @@ -1152,7 +1152,7 @@ packet_set_ipv4_addr(struct dp_packet *packet,
>   * segements_left > 0.
>   *
>   * This function assumes that L3 and L4 offsets are set in the packet. */
> -static bool
> +bool
>  packet_rh_present(struct dp_packet *packet, uint8_t *nexthdr, bool *first_frag)
>  {
>      const struct ovs_16aligned_ip6_hdr *nh;
> diff --git a/lib/packets.h b/lib/packets.h
> index 5bdf6e4bb..8626aac8d 100644
> --- a/lib/packets.h
> +++ b/lib/packets.h
> @@ -1642,6 +1642,8 @@ void packet_put_ra_prefix_opt(struct dp_packet *,
>                                ovs_be32 preferred_lifetime,
>                                const ovs_be128 router_prefix);
>  uint32_t packet_csum_pseudoheader(const struct ip_header *);
> +bool packet_rh_present(struct dp_packet *packet, uint8_t *nexthdr,
> +                       bool *first_frag);
>  void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6);
>  
>  #define DNS_HEADER_LEN 12
Emma Finn Dec. 5, 2022, 11:53 a.m. UTC | #2
> -----Original Message-----
> From: Ilya Maximets <i.maximets@ovn.org>
> Sent: Friday 2 December 2022 14:22
> To: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org
> Cc: i.maximets@ovn.org; Van Haaren, Harry <harry.van.haaren@intel.com>;
> echaudro@redhat.com; Stokes, Ian <ian.stokes@intel.com>
> Subject: Re: [v6] odp-execute: Add ISA implementation of set_masked IPv6
> action
> 
> On 11/30/22 16:57, Emma Finn wrote:
> > This commit adds support for the AVX512 implementation of the
> > ipv6_set_addrs action as well as an AVX512 implementation of updating
> > the L4 checksums.
> >
> > Signed-off-by: Emma Finn <emma.finn@intel.com>
> 
> Hi.  Thanks for the updated version!
> Could you also provide some performance numbers in the commit message?
> Performance related patches should typically have some.
> 
Yes, I will add some relative performance numbers when I send out the next version.

> Some comments inline.  There is also a bug in ipv4 implementation.
> 
> >
> > ---
> > v6:
> >  - Added check for ipv6 extension headers.
> > v5:
> >   - Fixed load for ip6 src and dst mask for checksum check.
> > v4:
> >   - Reworked and moved check for checksum outside loop.
> >   - Code cleanup based on review from Eelco.
> > v3:
> >   - Added a runtime check for AVX512 vbmi.
> > v2:
> >   - Added check for availbility of s6_addr32 field of struct in6_addr.
> >   - Fixed network headers for freebsd builds.
> > ---
> > ---
<snip>

> > +static inline uint16_t ALWAYS_INLINE
> > +__attribute__((__target__("avx512vbmi")))
> > +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i
> new_header) {
> > +    uint16_t old_delta = avx512_ipv6_sum_header(old_header);
> > +    uint16_t new_delta = avx512_ipv6_sum_header(new_header);
> > +    uint32_t csum_delta = ((uint16_t)~old_delta) + new_delta;
> 
> Is the cast necessary here?  The 'old_delta' is uint16_t.  The bit inversion
> should not change the type, right?
> 
Yes cast is necessary here. 
Bit inversion doesn't change type but the addition with result being saved to a 32-bit does. Without cast, delta is incorrect

> > +
> > +    return  ~csum_finish(csum_delta);
> 
> One too many spaces after 'return'.
> 
> > +}
> > +
> > +/* This function performs the same operation on each packet in the
> > +batch as
> > + * the scalar odp_set_ipv6() function. */ static void
> > +__attribute__((__target__("avx512vbmi")))
> > +action_avx512_set_ipv6(struct dp_packet_batch *batch, const struct
> > +nlattr *a) {
> > +    const struct ovs_key_ipv6 *key, *mask;
> > +    struct dp_packet *packet;
> > +
> > +    a = nl_attr_get(a);
> > +    key = nl_attr_get(a);
> > +    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
> > +
> > +    /* Read the content of the key and mask in the respective registers. We
> > +     * only load the size of the actual structure, which is only 40 bytes. */
> > +    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
> > +    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
> > +
> > +    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
> > +     * ip6_hdr structure layout. */
> > +    static const uint8_t ip_shuffle_mask[64] = {
> > +            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
> > +            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
> > +            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
> > +            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
> > +            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
> > +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
> > +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> > +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
> 
> These are overindented.  Should be moved 4 spaces to the left.
> 
> > +    };
> > +
> > +    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
> > +
> > +    /* This shuffle is required for key and mask to match the layout of the
> > +     * ip6_hdr struct. */
> > +    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
> > +    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle,
> v_mask);
> > +
> > +    /* Set the v_zero register to all zero's. */
> > +    const __m128i v_zeros = _mm_setzero_si128();
> > +
> > +    /* Set the v_all_ones register to all one's. */
> > +    const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
> > +
> > +    /* Load ip6 src and dst masks respectively into 128-bit wide registers. */
> > +    __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
> > +    __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);
> > +
> > +    /* Perform a bitwise OR between src and dst registers. */
> > +    __m128i v_or = _mm_or_si128(v_src, v_dst);
> > +
> > +    /* Will return true if any bit has been set in v_or, else it will return
> > +     * false. */
> > +    bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones);
> > +
> > +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
> > +
> > +        /* Load the 40 bytes of the IPv6 header. */
> > +        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *)
> > + nh);
> > +
> > +        /* AND the v_pkt_mask to the packet data (v_packet). */
> > +        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf,
> > + v_packet);
> > +
> > +        /* OR the new addresses (v_key_shuf) with the masked packet
> addresses
> > +         * (v_pkt_masked). */
> > +        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf,
> > + v_pkt_masked);
> > +
> > +        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to be
> > +         * updated. */
> > +        uint8_t proto = 0;
> > +        bool rh_present;
> > +
> > +        rh_present = packet_rh_present(packet, &proto, &do_checksum);
> 
> Hmm, the 'do_checksum' is global for all packets in a batch.
> packet_rh_present() will overwrite the value.
> 
> > +
> > +        if (do_checksum) {
> > +            uint16_t delta_checksum;
> > +            __m512i v_new_hdr_for_cksum = v_new_hdr;
> 
> Reverse x-mass tree.
> 
> > +
> > +            /* In case of routing header being present, checksum should not be
> > +             * updated for the destination address. */
> > +            if (rh_present) {
> > +                v_new_hdr_for_cksum = _mm512_mask_blend_epi64(0x18,
> v_new_hdr,
> > +                                                              v_packet);
> > +            }
> > +
> > +            delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
> > +
> > + v_new_hdr_for_cksum);
> > +
> > +            if (proto == IPPROTO_UDP) {
> > +                struct udp_header *uh = dp_packet_l4(packet);
> > +
> > +                if (uh->udp_csum) {
> > +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> > +                    uint32_t udp_checksum = old_udp_checksum +
> > + delta_checksum;
> > +
> > +                    udp_checksum = csum_finish(udp_checksum);
> > +
> > +                    if (!udp_checksum) {
> > +                        udp_checksum = htons(0xffff);
> > +                    }
> > +
> > +                    uh->udp_csum = udp_checksum;
> > +                }
> > +            } else if (proto == IPPROTO_TCP) {
> > +                struct tcp_header *th = dp_packet_l4(packet);
> > +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> > +                uint32_t tcp_checksum = old_tcp_checksum +
> > + delta_checksum;
> > +
> > +                tcp_checksum = csum_finish(tcp_checksum);
> > +                th->tcp_csum = tcp_checksum;
> > +            } else if (proto == IPPROTO_ICMPV6) {
> > +                struct icmp6_header *icmp = dp_packet_l4(packet);
> > +                uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
> > +                uint32_t icmp6_checksum = old_icmp6_checksum +
> > + delta_checksum;
> > +
> > +                icmp6_checksum = csum_finish(icmp6_checksum);
> > +                icmp->icmp6_cksum = icmp6_checksum;
> > +            }
> > +        }
> > +        /* Write back the modified IPv6 addresses. */
> > +        _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
> 
> I think, there supposed to be a pkt_metadata_init_conn(&packet->md) call,
> otherwise we may misuse outdated conntrack connection later.
> 
> Hmm, action_avx512_ipv4_set_addrs() seems to have the same problem in
> it.
> 
Sure, I will send a separate patch to address the ipv4 bug. 

> Comparision of the packet metadata should, probably, be added to the
> actions autovalidator.
> 
Good Idea. Would just comparing packet->md.conn cover this issue for now?
Then I can add a comment that if future actions opts modify md, this check will need to be expanded.

> You can verify that conntrack is broken by running:
> 
>   $ make check-system-userspace TESTSUITEFLAGS='-k negative'
> 
> But adding the line 'ovs-appctl odp-execute/action-impl-set avx512' to the
> 'conntrack - negative test for recirculation optimization' test beforehand.
> 
> The test doesn't fail with just autovalidator, because autovalidator doesn't
> compare packet metadata, i.e. the packet->md.conn field, and packets that
> were handled by generic scalar implementation are actually used for later
> procesing, not ones changed by the avx512 implementation.
> 

<SNIP>
Ilya Maximets Dec. 5, 2022, 12:45 p.m. UTC | #3
On 12/5/22 12:53, Finn, Emma wrote:
> 
> 
>> -----Original Message-----
>> From: Ilya Maximets <i.maximets@ovn.org>
>> Sent: Friday 2 December 2022 14:22
>> To: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org
>> Cc: i.maximets@ovn.org; Van Haaren, Harry <harry.van.haaren@intel.com>;
>> echaudro@redhat.com; Stokes, Ian <ian.stokes@intel.com>
>> Subject: Re: [v6] odp-execute: Add ISA implementation of set_masked IPv6
>> action
>>
>> On 11/30/22 16:57, Emma Finn wrote:
>>> This commit adds support for the AVX512 implementation of the
>>> ipv6_set_addrs action as well as an AVX512 implementation of updating
>>> the L4 checksums.
>>>
>>> Signed-off-by: Emma Finn <emma.finn@intel.com>
>>
>> Hi.  Thanks for the updated version!
>> Could you also provide some performance numbers in the commit message?
>> Performance related patches should typically have some.
>>
> Yes, I will add some relative performance numbers when I send out the next version.

Thanks!

> 
>> Some comments inline.  There is also a bug in ipv4 implementation.
>>
>>>
>>> ---
>>> v6:
>>>  - Added check for ipv6 extension headers.
>>> v5:
>>>   - Fixed load for ip6 src and dst mask for checksum check.
>>> v4:
>>>   - Reworked and moved check for checksum outside loop.
>>>   - Code cleanup based on review from Eelco.
>>> v3:
>>>   - Added a runtime check for AVX512 vbmi.
>>> v2:
>>>   - Added check for availbility of s6_addr32 field of struct in6_addr.
>>>   - Fixed network headers for freebsd builds.
>>> ---
>>> ---
> <snip>
> 
>>> +static inline uint16_t ALWAYS_INLINE
>>> +__attribute__((__target__("avx512vbmi")))
>>> +avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i
>> new_header) {
>>> +    uint16_t old_delta = avx512_ipv6_sum_header(old_header);
>>> +    uint16_t new_delta = avx512_ipv6_sum_header(new_header);
>>> +    uint32_t csum_delta = ((uint16_t)~old_delta) + new_delta;
>>
>> Is the cast necessary here?  The 'old_delta' is uint16_t.  The bit inversion
>> should not change the type, right?
>>
> Yes cast is necessary here. 
> Bit inversion doesn't change type but the addition with result being saved
> to a 32-bit does. Without cast, delta is incorrect

Hmm, OK.  Please, add a space between the cast and the inversion then,
as Eelco suggested in his diff for v4.

> 
>>> +
>>> +    return  ~csum_finish(csum_delta);
>>
>> One too many spaces after 'return'.
>>
>>> +}
>>> +
>>> +/* This function performs the same operation on each packet in the
>>> +batch as
>>> + * the scalar odp_set_ipv6() function. */ static void
>>> +__attribute__((__target__("avx512vbmi")))
>>> +action_avx512_set_ipv6(struct dp_packet_batch *batch, const struct
>>> +nlattr *a) {
>>> +    const struct ovs_key_ipv6 *key, *mask;
>>> +    struct dp_packet *packet;
>>> +
>>> +    a = nl_attr_get(a);
>>> +    key = nl_attr_get(a);
>>> +    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
>>> +
>>> +    /* Read the content of the key and mask in the respective registers. We
>>> +     * only load the size of the actual structure, which is only 40 bytes. */
>>> +    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
>>> +    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
>>> +
>>> +    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
>>> +     * ip6_hdr structure layout. */
>>> +    static const uint8_t ip_shuffle_mask[64] = {
>>> +            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
>>> +            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
>>> +            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
>>> +            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
>>> +            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
>>> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
>>> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
>>> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
>>
>> These are overindented.  Should be moved 4 spaces to the left.
>>
>>> +    };
>>> +
>>> +    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
>>> +
>>> +    /* This shuffle is required for key and mask to match the layout of the
>>> +     * ip6_hdr struct. */
>>> +    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
>>> +    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle,
>> v_mask);
>>> +
>>> +    /* Set the v_zero register to all zero's. */
>>> +    const __m128i v_zeros = _mm_setzero_si128();
>>> +
>>> +    /* Set the v_all_ones register to all one's. */
>>> +    const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
>>> +
>>> +    /* Load ip6 src and dst masks respectively into 128-bit wide registers. */
>>> +    __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
>>> +    __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);
>>> +
>>> +    /* Perform a bitwise OR between src and dst registers. */
>>> +    __m128i v_or = _mm_or_si128(v_src, v_dst);
>>> +
>>> +    /* Will return true if any bit has been set in v_or, else it will return
>>> +     * false. */
>>> +    bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones);
>>> +
>>> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
>>> +        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
>>> +
>>> +        /* Load the 40 bytes of the IPv6 header. */
>>> +        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *)
>>> + nh);
>>> +
>>> +        /* AND the v_pkt_mask to the packet data (v_packet). */
>>> +        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf,
>>> + v_packet);
>>> +
>>> +        /* OR the new addresses (v_key_shuf) with the masked packet
>> addresses
>>> +         * (v_pkt_masked). */
>>> +        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf,
>>> + v_pkt_masked);
>>> +
>>> +        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to be
>>> +         * updated. */
>>> +        uint8_t proto = 0;
>>> +        bool rh_present;
>>> +
>>> +        rh_present = packet_rh_present(packet, &proto, &do_checksum);
>>
>> Hmm, the 'do_checksum' is global for all packets in a batch.
>> packet_rh_present() will overwrite the value.
>>
>>> +
>>> +        if (do_checksum) {
>>> +            uint16_t delta_checksum;
>>> +            __m512i v_new_hdr_for_cksum = v_new_hdr;
>>
>> Reverse x-mass tree.
>>
>>> +
>>> +            /* In case of routing header being present, checksum should not be
>>> +             * updated for the destination address. */
>>> +            if (rh_present) {
>>> +                v_new_hdr_for_cksum = _mm512_mask_blend_epi64(0x18,
>> v_new_hdr,
>>> +                                                              v_packet);
>>> +            }
>>> +
>>> +            delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
>>> +
>>> + v_new_hdr_for_cksum);
>>> +
>>> +            if (proto == IPPROTO_UDP) {
>>> +                struct udp_header *uh = dp_packet_l4(packet);
>>> +
>>> +                if (uh->udp_csum) {
>>> +                    uint16_t old_udp_checksum = ~uh->udp_csum;
>>> +                    uint32_t udp_checksum = old_udp_checksum +
>>> + delta_checksum;
>>> +
>>> +                    udp_checksum = csum_finish(udp_checksum);
>>> +
>>> +                    if (!udp_checksum) {
>>> +                        udp_checksum = htons(0xffff);
>>> +                    }
>>> +
>>> +                    uh->udp_csum = udp_checksum;
>>> +                }
>>> +            } else if (proto == IPPROTO_TCP) {
>>> +                struct tcp_header *th = dp_packet_l4(packet);
>>> +                uint16_t old_tcp_checksum = ~th->tcp_csum;
>>> +                uint32_t tcp_checksum = old_tcp_checksum +
>>> + delta_checksum;
>>> +
>>> +                tcp_checksum = csum_finish(tcp_checksum);
>>> +                th->tcp_csum = tcp_checksum;
>>> +            } else if (proto == IPPROTO_ICMPV6) {
>>> +                struct icmp6_header *icmp = dp_packet_l4(packet);
>>> +                uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
>>> +                uint32_t icmp6_checksum = old_icmp6_checksum +
>>> + delta_checksum;
>>> +
>>> +                icmp6_checksum = csum_finish(icmp6_checksum);
>>> +                icmp->icmp6_cksum = icmp6_checksum;
>>> +            }
>>> +        }
>>> +        /* Write back the modified IPv6 addresses. */
>>> +        _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
>>
>> I think, there supposed to be a pkt_metadata_init_conn(&packet->md) call,
>> otherwise we may misuse outdated conntrack connection later.
>>
>> Hmm, action_avx512_ipv4_set_addrs() seems to have the same problem in
>> it.
>>
> Sure, I will send a separate patch to address the ipv4 bug. 
> 
>> Comparision of the packet metadata should, probably, be added to the
>> actions autovalidator.
>>
> Good Idea. Would just comparing packet->md.conn cover this issue for now?
> Then I can add a comment that if future actions opts modify md, this check will need to be expanded.

I think, the point of autovalidator is to catch differences that
we can miss while working on a change.  So, unless it's checking
everything, it doesn't make a lot of sense.

I'd suggest to expand the autovalidator check in the ipv4 bug fix
patch and make it compare the whole metadata.

Since the metadata gets directly copied with memcpy by the
autovalidator, we may get away with just memcmp... ?  Unless we
want some more detailed report.

Make sure to use ds_put_sparse_hex_dump() instead of regular hex
dump while dumping big chunks of metadata though.  It is huge and
mostly contains zeroes, there is no point in printing it out in
full.

> 
>> You can verify that conntrack is broken by running:
>>
>>   $ make check-system-userspace TESTSUITEFLAGS='-k negative'
>>
>> But adding the line 'ovs-appctl odp-execute/action-impl-set avx512' to the
>> 'conntrack - negative test for recirculation optimization' test beforehand.
>>
>> The test doesn't fail with just autovalidator, because autovalidator doesn't
>> compare packet metadata, i.e. the packet->md.conn field, and packets that
>> were handled by generic scalar implementation are actually used for later
>> procesing, not ones changed by the avx512 implementation.
>>
> 
> <SNIP>
diff mbox series

Patch

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 6c7713251..87dae6d05 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -20,6 +20,9 @@ 
 
 #include <config.h>
 #include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
 
 #include "csum.h"
 #include "dp-packet.h"
@@ -28,6 +31,7 @@ 
 #include "odp-execute-private.h"
 #include "odp-netlink.h"
 #include "openvswitch/vlog.h"
+#include "packets.h"
 
 VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);
 
@@ -75,6 +79,26 @@  BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
                   MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
                   offsetof(struct ovs_key_ipv4, ipv4_ttl));
 
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_src) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_src) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_dst));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_dst) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_dst) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_label));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_label) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_label) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_proto));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_proto) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_proto) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_tclass));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_tclass) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_tclass) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_hlimit));
+
 /* Array of callback functions, one for each masked operation. */
 odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX];
 
@@ -483,6 +507,193 @@  action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
     }
 }
 
+#if HAVE_AVX512VBMI
+static inline uint16_t ALWAYS_INLINE
+__attribute__((__target__("avx512vbmi")))
+avx512_ipv6_sum_header(__m512i ip6_header)
+{
+    __m256i v_zeros = _mm256_setzero_si256();
+    __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04,
+                                               0xFF, 0xFF, 0xFF, 0xFF);
+
+    /* Shuffle ip6 src and dst to beginning of register. */
+    __m512i v_ip6_hdr_shuf = _mm512_permutexvar_epi64(v_shuf_src_dst,
+                                                      ip6_header);
+
+    /* Extract ip6 src and dst into smaller 256-bit wide register. */
+    __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_ip6_hdr_shuf, 0);
+
+    /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the
+     * src and dst fields and add padding after each 16-bit value for the
+     * following carry over addition. */
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF,
+                                          0x0504, 0xFFFF, 0x0706, 0xFFFF,
+                                          0x0100, 0xFFFF, 0x0302, 0xFFFF,
+                                          0x0504, 0xFFFF, 0x0706, 0xFFFF);
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
+                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
+                                          0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
+                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF);
+    __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a);
+    __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b);
+
+    /* Add each part of the old and new headers together. */
+    __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+
+    /* Shuffle 32-bit value from 3rd lane into first lane for final
+     * horizontal add. */
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+
+    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+    v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
+
+    /* Extract delta value. */
+    return _mm256_extract_epi16(v_delta, 0);
+}
+
+static inline uint16_t ALWAYS_INLINE
+__attribute__((__target__("avx512vbmi")))
+avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header)
+{
+    uint16_t old_delta = avx512_ipv6_sum_header(old_header);
+    uint16_t new_delta = avx512_ipv6_sum_header(new_header);
+    uint32_t csum_delta = ((uint16_t)~old_delta) + new_delta;
+
+    return  ~csum_finish(csum_delta);
+}
+
+/* This function performs the same operation on each packet in the batch as
+ * the scalar odp_set_ipv6() function. */
+static void
+__attribute__((__target__("avx512vbmi")))
+action_avx512_set_ipv6(struct dp_packet_batch *batch, const struct nlattr *a)
+{
+    const struct ovs_key_ipv6 *key, *mask;
+    struct dp_packet *packet;
+
+    a = nl_attr_get(a);
+    key = nl_attr_get(a);
+    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
+
+    /* Read the content of the key and mask in the respective registers. We
+     * only load the size of the actual structure, which is only 40 bytes. */
+    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
+    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
+
+    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
+     * ip6_hdr structure layout. */
+    static const uint8_t ip_shuffle_mask[64] = {
+            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
+            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
+    };
+
+    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
+
+    /* This shuffle is required for key and mask to match the layout of the
+     * ip6_hdr struct. */
+    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
+    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);
+
+    /* Set the v_zero register to all zero's. */
+    const __m128i v_zeros = _mm_setzero_si128();
+
+    /* Set the v_all_ones register to all one's. */
+    const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
+
+    /* Load ip6 src and dst masks respectively into 128-bit wide registers. */
+    __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
+    __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);
+
+    /* Perform a bitwise OR between src and dst registers. */
+    __m128i v_or = _mm_or_si128(v_src, v_dst);
+
+    /* Will return true if any bit has been set in v_or, else it will return
+     * false. */
+    bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones);
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
+
+        /* Load the 40 bytes of the IPv6 header. */
+        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh);
+
+        /* AND the v_pkt_mask to the packet data (v_packet). */
+        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet);
+
+        /* OR the new addresses (v_key_shuf) with the masked packet addresses
+         * (v_pkt_masked). */
+        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked);
+
+        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to be
+         * updated. */
+        uint8_t proto = 0;
+        bool rh_present;
+
+        rh_present = packet_rh_present(packet, &proto, &do_checksum);
+
+        if (do_checksum) {
+            uint16_t delta_checksum;
+            __m512i v_new_hdr_for_cksum = v_new_hdr;
+
+            /* In case of routing header being present, checksum should not be
+             * updated for the destination address. */
+            if (rh_present) {
+                v_new_hdr_for_cksum = _mm512_mask_blend_epi64(0x18, v_new_hdr,
+                                                              v_packet);
+            }
+
+            delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
+                                                         v_new_hdr_for_cksum);
+
+            if (proto == IPPROTO_UDP) {
+                struct udp_header *uh = dp_packet_l4(packet);
+
+                if (uh->udp_csum) {
+                    uint16_t old_udp_checksum = ~uh->udp_csum;
+                    uint32_t udp_checksum = old_udp_checksum + delta_checksum;
+
+                    udp_checksum = csum_finish(udp_checksum);
+
+                    if (!udp_checksum) {
+                        udp_checksum = htons(0xffff);
+                    }
+
+                    uh->udp_csum = udp_checksum;
+                }
+            } else if (proto == IPPROTO_TCP) {
+                struct tcp_header *th = dp_packet_l4(packet);
+                uint16_t old_tcp_checksum = ~th->tcp_csum;
+                uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
+
+                tcp_checksum = csum_finish(tcp_checksum);
+                th->tcp_csum = tcp_checksum;
+            } else if (proto == IPPROTO_ICMPV6) {
+                struct icmp6_header *icmp = dp_packet_l4(packet);
+                uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
+                uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum;
+
+                icmp6_checksum = csum_finish(icmp6_checksum);
+                icmp->icmp6_cksum = icmp6_checksum;
+            }
+        }
+        /* Write back the modified IPv6 addresses. */
+        _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
+    }
+}
+#endif /* HAVE_AVX512VBMI */
+
 static void
 action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a)
 {
@@ -514,6 +725,12 @@  action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
     impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs;
     impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs;
 
+#if HAVE_AVX512VBMI
+    if (action_avx512vbmi_isa_probe()) {
+        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] = action_avx512_set_ipv6;
+    }
+#endif
+
     return 0;
 }
 
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index f80ae5a23..c2f7dce6b 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -60,6 +60,20 @@  action_avx512_isa_probe(void)
 
 #endif
 
+#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI
+bool
+action_avx512vbmi_isa_probe(void)
+{
+    return cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI);
+}
+#else
+bool
+action_avx512vbmi_isa_probe(void)
+{
+    return false;
+}
+#endif
+
 static struct odp_execute_action_impl action_impls[] = {
     [ACTION_IMPL_AUTOVALIDATOR] = {
         .available = false,
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
index 940180c99..643f41c2a 100644
--- a/lib/odp-execute-private.h
+++ b/lib/odp-execute-private.h
@@ -78,6 +78,7 @@  BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
 #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
 
 bool action_avx512_isa_probe(void);
+bool action_avx512vbmi_isa_probe(void);
 
 /* Odp execute init handles setting up the state of the actions functions at
  * initialization time. It cannot return errors, as it must always succeed in
diff --git a/lib/packets.c b/lib/packets.c
index 1dcd4a6fc..06f516cb1 100644
--- a/lib/packets.c
+++ b/lib/packets.c
@@ -1152,7 +1152,7 @@  packet_set_ipv4_addr(struct dp_packet *packet,
  * segements_left > 0.
  *
  * This function assumes that L3 and L4 offsets are set in the packet. */
-static bool
+bool
 packet_rh_present(struct dp_packet *packet, uint8_t *nexthdr, bool *first_frag)
 {
     const struct ovs_16aligned_ip6_hdr *nh;
diff --git a/lib/packets.h b/lib/packets.h
index 5bdf6e4bb..8626aac8d 100644
--- a/lib/packets.h
+++ b/lib/packets.h
@@ -1642,6 +1642,8 @@  void packet_put_ra_prefix_opt(struct dp_packet *,
                               ovs_be32 preferred_lifetime,
                               const ovs_be128 router_prefix);
 uint32_t packet_csum_pseudoheader(const struct ip_header *);
+bool packet_rh_present(struct dp_packet *packet, uint8_t *nexthdr,
+                       bool *first_frag);
 void IP_ECN_set_ce(struct dp_packet *pkt, bool is_ipv6);
 
 #define DNS_HEADER_LEN 12