diff mbox series

[ovs-dev,v4] odp-execute: Add ISA implementation of set_masked IPv6 action

Message ID 20221124093000.3869344-1-emma.finn@intel.com
State Superseded
Headers show
Series [ovs-dev,v4] odp-execute: Add ISA implementation of set_masked IPv6 action | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Finn, Emma Nov. 24, 2022, 9:30 a.m. UTC
This commit adds support for the AVX512 implementation of the
ipv6_set_addrs action as well as an AVX512 implementation of
updating the L4 checksums.

Signed-off-by: Emma Finn <emma.finn@intel.com>

---
v4:
  - Reworked and moved check for checksum outside loop.
  - Code cleanup based on review from Eelco.
v3:
  - Added a runtime check for AVX512 vbmi.
v2:
  - Added check for availbility of s6_addr32 field of struct in6_addr.
  - Fixed network headers for freebsd builds.
---
---
 lib/odp-execute-avx512.c  | 204 ++++++++++++++++++++++++++++++++++++++
 lib/odp-execute-private.c |  17 ++++
 lib/odp-execute-private.h |   1 +
 3 files changed, 222 insertions(+)

Comments

Eelco Chaudron Nov. 25, 2022, 12:26 p.m. UTC | #1
On 24 Nov 2022, at 10:30, Emma Finn wrote:

> This commit adds support for the AVX512 implementation of the
> ipv6_set_addrs action as well as an AVX512 implementation of
> updating the L4 checksums.
>
> Signed-off-by: Emma Finn <emma.finn@intel.com>

Thanks Emma for the v4, I have one question and a couple of style issues. To speed things up I just provide the diff for the style issues.

I was not able to do any actual testing, as my system did not have the avx512vbmi extension :(

Cheers,

Eelco

> ---

Style issues diff:

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 82ff7e647..f798d6708 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -20,9 +20,9 @@

 #include <config.h>
 #include <errno.h>
-#include <sys/types.h>
 #include <netinet/in.h>
 #include <netinet/ip6.h>
+#include <sys/types.h>

 #include "csum.h"
 #include "dp-packet.h"
@@ -547,8 +547,8 @@ avx512_ipv6_sum_header(__m512i ip6_header)
      * horizontal add. */
     __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
                                           0xF, 0xF, 0xF, 0xF);
-    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);

+    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
     v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
     v_delta = _mm256_hadd_epi16(v_delta, v_zeros);

@@ -562,7 +562,7 @@ avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header)
 {
     uint16_t old_delta = avx512_ipv6_sum_header(old_header);
     uint16_t new_delta = avx512_ipv6_sum_header(new_header);
-    uint32_t csum_delta = (uint16_t)~old_delta + new_delta;
+    uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta;

     return  ~csum_finish(csum_delta);
 }
@@ -606,15 +606,15 @@ action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
     __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
     __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);

-
     /* Set the v_zero register to all zero's. */
     const __m128i v_zeros = _mm_setzero_si128();
+
     /* Set the v_all_ones register to all one's. */
     const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);

-    /* Load ip6 src and dst respectively into 128-bit wide registers. */
+    /* Load ip6 src and dst masks respectively into 128-bit wide registers. */
     __m128i v_src = _mm_loadu_si128((void *) mask);
-    __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask);
+    __m128i v_dst = _mm_maskz_loadu_epi64(0xC, (void *) mask);

     /* Perform a bitwise OR between src and dst registers. */
     __m128i v_or = _mm_or_si128(v_src, v_dst);

> v4:
>   - Reworked and moved check for checksum outside loop.
>   - Code cleanup based on review from Eelco.
> v3:
>   - Added a runtime check for AVX512 vbmi.
> v2:
>   - Added check for availbility of s6_addr32 field of struct in6_addr.
>   - Fixed network headers for freebsd builds.
>

<SNIP>

> +    /* Load ip6 src and dst respectively into 128-bit wide registers. */
> +    __m128i v_src = _mm_loadu_si128((void *) mask);
> +    __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask);

Guess it might be me, but I do not understand how _mm_maskz_loadu_epi64() will load the dst from the mask.
Looking at the intrinsics guide it will only read the first two 64-bit values, but mask points to src?

Should we not just do the following here?

+    __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
+    __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);

> +
> +    /* Perform a bitwise OR between src and dst registers. */
> +    __m128i v_or = _mm_or_si128(v_src, v_dst);
> +
> +    /* Will return true if any bit has been set in v_or, else it will return
> +     * false. */
> +    bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones);
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
> +
> +        /* Load the 40 bytes of the IPv6 header. */
> +        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh);
> +
> +        /* AND the v_pkt_mask to the packet data (v_packet). */
> +        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet);
> +
> +        /* OR the new addresses (v_key_shuf) with the masked packet addresses
> +         * (v_pkt_masked). */
> +        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked);
> +
> +        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
> +         * be updated. */
> +        if (do_checksum) {
> +            uint8_t proto = nh->ip6_nxt;
> +            uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
> +                                                                  v_new_hdr);
> +
> +            if (proto == IPPROTO_UDP) {
> +                struct udp_header *uh = dp_packet_l4(packet);
> +
> +                if (uh->udp_csum) {
> +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> +                    uint32_t udp_checksum = old_udp_checksum + delta_checksum;
> +
> +                    udp_checksum = csum_finish(udp_checksum);
> +
> +                    if (!udp_checksum) {
> +                        udp_checksum = htons(0xffff);
> +                    }
> +
> +                    uh->udp_csum = udp_checksum;
> +                }
> +            } else if (proto == IPPROTO_TCP) {
> +                struct tcp_header *th = dp_packet_l4(packet);
> +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> +                uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
> +
> +                tcp_checksum = csum_finish(tcp_checksum);
> +                th->tcp_csum = tcp_checksum;
> +            } else if (proto == IPPROTO_ICMPV6) {
> +                struct icmp6_header *icmp = dp_packet_l4(packet);
> +                uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
> +                uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum;
> +
> +                icmp6_checksum = csum_finish(icmp6_checksum);
> +                icmp->icmp6_cksum = icmp6_checksum;
> +            }
> +        }
> +        /* Write back the modified IPv6 addresses. */
> +         _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
> +    }
> +}
> +#endif /* HAVE_AVX512VBMI */
> +
>  static void
>  action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a)
>  {
> @@ -514,6 +711,13 @@ action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
>      impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs;
>      impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs;
>
> +#if HAVE_AVX512VBMI
> +    if (action_avx512vbmi_isa_probe()) {
> +        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
> +                              action_avx512_ipv6_set_addrs;
> +    }
> +#endif
> +
>      return 0;
>  }
>
> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> index f80ae5a23..8b86b1e4f 100644
> --- a/lib/odp-execute-private.c
> +++ b/lib/odp-execute-private.c
> @@ -60,6 +60,23 @@ action_avx512_isa_probe(void)
>
>  #endif
>
> +#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> +    if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
> +        return true;
> +    }
> +    return false;
> +}
> +#else
> +bool
> +action_avx512vbmi_isa_probe(void)
> +{
> +    return false;
> +}
> +#endif
> +
>  static struct odp_execute_action_impl action_impls[] = {
>      [ACTION_IMPL_AUTOVALIDATOR] = {
>          .available = false,
> diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
> index 940180c99..643f41c2a 100644
> --- a/lib/odp-execute-private.h
> +++ b/lib/odp-execute-private.h
> @@ -78,6 +78,7 @@ BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
>  #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
>
>  bool action_avx512_isa_probe(void);
> +bool action_avx512vbmi_isa_probe(void);
>
>  /* Odp execute init handles setting up the state of the actions functions at
>   * initialization time. It cannot return errors, as it must always succeed in
> -- 
> 2.25.1
Finn, Emma Nov. 25, 2022, 3:20 p.m. UTC | #2
> -----Original Message-----
> From: Eelco Chaudron <echaudro@redhat.com>
> Sent: Friday 25 November 2022 12:26
> To: Finn, Emma <emma.finn@intel.com>
> Cc: dev@openvswitch.org; Van Haaren, Harry
> <harry.van.haaren@intel.com>
> Subject: Re: [v4] odp-execute: Add ISA implementation of set_masked IPv6
> action
> 
> 
> 
> On 24 Nov 2022, at 10:30, Emma Finn wrote:
> 
> > This commit adds support for the AVX512 implementation of the
> > ipv6_set_addrs action as well as an AVX512 implementation of updating
> > the L4 checksums.
> >
> > Signed-off-by: Emma Finn <emma.finn@intel.com>
> 
> Thanks Emma for the v4, I have one question and a couple of style issues. To
> speed things up I just provide the diff for the style issues.
> 
> I was not able to do any actual testing, as my system did not have the
> avx512vbmi extension :(
> 
> Cheers,
> 
> Eelco
> 
> > ---
> 
> Style issues diff:
> 
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index
> 82ff7e647..f798d6708 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -20,9 +20,9 @@
> 
>  #include <config.h>
>  #include <errno.h>
> -#include <sys/types.h>
>  #include <netinet/in.h>
>  #include <netinet/ip6.h>
> +#include <sys/types.h>
> 
For FreeBSD, network headers must be included in a certain order. 
See this commit for details:
    b2befd5bb2db ("sparse: Add guards to prevent FreeBSD-incompatible #include order.")
So these need to stay in this order. 

>  #include "csum.h"
>  #include "dp-packet.h"
> @@ -547,8 +547,8 @@ avx512_ipv6_sum_header(__m512i ip6_header)
>       * horizontal add. */
>      __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
>                                            0xF, 0xF, 0xF, 0xF);
> -    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
> 
> +    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
>      v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
>      v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
> 
> @@ -562,7 +562,7 @@ avx512_ipv6_addr_csum_delta(__m512i old_header,
> __m512i new_header)  {
>      uint16_t old_delta = avx512_ipv6_sum_header(old_header);
>      uint16_t new_delta = avx512_ipv6_sum_header(new_header);
> -    uint32_t csum_delta = (uint16_t)~old_delta + new_delta;
> +    uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta;
> 
>      return  ~csum_finish(csum_delta);
>  }
> @@ -606,15 +606,15 @@ action_avx512_ipv6_set_addrs(struct
> dp_packet_batch *batch,
>      __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
>      __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);
> 
> -
>      /* Set the v_zero register to all zero's. */
>      const __m128i v_zeros = _mm_setzero_si128();
> +
>      /* Set the v_all_ones register to all one's. */
>      const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
> 
> -    /* Load ip6 src and dst respectively into 128-bit wide registers. */
> +    /* Load ip6 src and dst masks respectively into 128-bit wide
> + registers. */
>      __m128i v_src = _mm_loadu_si128((void *) mask);
> -    __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask);
> +    __m128i v_dst = _mm_maskz_loadu_epi64(0xC, (void *) mask);
> 
>      /* Perform a bitwise OR between src and dst registers. */
>      __m128i v_or = _mm_or_si128(v_src, v_dst);
> 
> > v4:
> >   - Reworked and moved check for checksum outside loop.
> >   - Code cleanup based on review from Eelco.
> > v3:
> >   - Added a runtime check for AVX512 vbmi.
> > v2:
> >   - Added check for availbility of s6_addr32 field of struct in6_addr.
> >   - Fixed network headers for freebsd builds.
> >
> 
> <SNIP>
> 
> > +    /* Load ip6 src and dst respectively into 128-bit wide registers. */
> > +    __m128i v_src = _mm_loadu_si128((void *) mask);
> > +    __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask);
> 
> Guess it might be me, but I do not understand how
> _mm_maskz_loadu_epi64() will load the dst from the mask.
> Looking at the intrinsics guide it will only read the first two 64-bit values, but
> mask points to src?
> 
> Should we not just do the following here?
> 
> +    __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
> +    __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);
> 

Yes, good catch. The _maskz_load_ isn't correctly pointing to the dst values from the mask. 
 
I will make all the changes above (except the include header comment) and send v5 shortly. 

<SNIP>
Eelco Chaudron Nov. 25, 2022, 3:45 p.m. UTC | #3
On 25 Nov 2022, at 16:20, Finn, Emma wrote:

>> -----Original Message-----
>> From: Eelco Chaudron <echaudro@redhat.com>
>> Sent: Friday 25 November 2022 12:26
>> To: Finn, Emma <emma.finn@intel.com>
>> Cc: dev@openvswitch.org; Van Haaren, Harry
>> <harry.van.haaren@intel.com>
>> Subject: Re: [v4] odp-execute: Add ISA implementation of set_masked IPv6
>> action
>>
>>
>>
>> On 24 Nov 2022, at 10:30, Emma Finn wrote:
>>
>>> This commit adds support for the AVX512 implementation of the
>>> ipv6_set_addrs action as well as an AVX512 implementation of updating
>>> the L4 checksums.
>>>
>>> Signed-off-by: Emma Finn <emma.finn@intel.com>
>>
>> Thanks Emma for the v4, I have one question and a couple of style issues. To
>> speed things up I just provide the diff for the style issues.
>>
>> I was not able to do any actual testing, as my system did not have the
>> avx512vbmi extension :(
>>
>> Cheers,
>>
>> Eelco
>>
>>> ---
>>
>> Style issues diff:
>>
>> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index
>> 82ff7e647..f798d6708 100644
>> --- a/lib/odp-execute-avx512.c
>> +++ b/lib/odp-execute-avx512.c
>> @@ -20,9 +20,9 @@
>>
>>  #include <config.h>
>>  #include <errno.h>
>> -#include <sys/types.h>
>>  #include <netinet/in.h>
>>  #include <netinet/ip6.h>
>> +#include <sys/types.h>
>>
> For FreeBSD, network headers must be included in a certain order.
> See this commit for details:
>     b2befd5bb2db ("sparse: Add guards to prevent FreeBSD-incompatible #include order.")
> So these need to stay in this order.

Oops, yes I did not test with FreeBSD (or windows), I need to figure out how to do that in the future ;)

>>  #include "csum.h"
>>  #include "dp-packet.h"
>> @@ -547,8 +547,8 @@ avx512_ipv6_sum_header(__m512i ip6_header)
>>       * horizontal add. */
>>      __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
>>                                            0xF, 0xF, 0xF, 0xF);
>> -    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
>>
>> +    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
>>      v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
>>      v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
>>
>> @@ -562,7 +562,7 @@ avx512_ipv6_addr_csum_delta(__m512i old_header,
>> __m512i new_header)  {
>>      uint16_t old_delta = avx512_ipv6_sum_header(old_header);
>>      uint16_t new_delta = avx512_ipv6_sum_header(new_header);
>> -    uint32_t csum_delta = (uint16_t)~old_delta + new_delta;
>> +    uint32_t csum_delta = ((uint16_t) ~old_delta) + new_delta;
>>
>>      return  ~csum_finish(csum_delta);
>>  }
>> @@ -606,15 +606,15 @@ action_avx512_ipv6_set_addrs(struct
>> dp_packet_batch *batch,
>>      __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
>>      __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);
>>
>> -
>>      /* Set the v_zero register to all zero's. */
>>      const __m128i v_zeros = _mm_setzero_si128();
>> +
>>      /* Set the v_all_ones register to all one's. */
>>      const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
>>
>> -    /* Load ip6 src and dst respectively into 128-bit wide registers. */
>> +    /* Load ip6 src and dst masks respectively into 128-bit wide
>> + registers. */
>>      __m128i v_src = _mm_loadu_si128((void *) mask);
>> -    __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask);
>> +    __m128i v_dst = _mm_maskz_loadu_epi64(0xC, (void *) mask);
>>
>>      /* Perform a bitwise OR between src and dst registers. */
>>      __m128i v_or = _mm_or_si128(v_src, v_dst);
>>
>>> v4:
>>>   - Reworked and moved check for checksum outside loop.
>>>   - Code cleanup based on review from Eelco.
>>> v3:
>>>   - Added a runtime check for AVX512 vbmi.
>>> v2:
>>>   - Added check for availbility of s6_addr32 field of struct in6_addr.
>>>   - Fixed network headers for freebsd builds.
>>>
>>
>> <SNIP>
>>
>>> +    /* Load ip6 src and dst respectively into 128-bit wide registers. */
>>> +    __m128i v_src = _mm_loadu_si128((void *) mask);
>>> +    __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask);
>>
>> Guess it might be me, but I do not understand how
>> _mm_maskz_loadu_epi64() will load the dst from the mask.
>> Looking at the intrinsics guide it will only read the first two 64-bit values, but
>> mask points to src?
>>
>> Should we not just do the following here?
>>
>> +    __m128i v_src = _mm_loadu_si128((void *) &mask->ipv6_src);
>> +    __m128i v_dst = _mm_loadu_si128((void *) &mask->ipv6_dst);
>>
>
> Yes, good catch. The _maskz_load_ isn't correctly pointing to the dst values from the mask.
>
> I will make all the changes above (except the include header comment) and send v5 shortly.

Thanks, take your time and enjoy the weekend!

> <SNIP>
diff mbox series

Patch

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 6c7713251..82ff7e647 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -20,6 +20,9 @@ 
 
 #include <config.h>
 #include <errno.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <netinet/ip6.h>
 
 #include "csum.h"
 #include "dp-packet.h"
@@ -75,6 +78,26 @@  BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
                   MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
                   offsetof(struct ovs_key_ipv4, ipv4_ttl));
 
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_src) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_src) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_dst));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_dst) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_dst) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_label));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_label) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_label) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_proto));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_proto) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_proto) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_tclass));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv6, ipv6_tclass) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv6, ipv6_tclass) ==
+                  offsetof(struct ovs_key_ipv6, ipv6_hlimit));
+
 /* Array of callback functions, one for each masked operation. */
 odp_execute_action_cb impl_set_masked_funcs[__OVS_KEY_ATTR_MAX];
 
@@ -483,6 +506,180 @@  action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
     }
 }
 
+#if HAVE_AVX512VBMI
+static inline uint16_t ALWAYS_INLINE
+__attribute__((__target__("avx512vbmi")))
+avx512_ipv6_sum_header(__m512i ip6_header)
+{
+    __m256i v_zeros = _mm256_setzero_si256();
+    __m512i v_shuf_src_dst = _mm512_setr_epi64(0x01, 0x02, 0x03, 0x04,
+                                               0xFF, 0xFF, 0xFF, 0xFF);
+
+    /* Shuffle ip6 src and dst to beginning of register. */
+    __m512i v_ip6_hdr_shuf = _mm512_permutexvar_epi64(v_shuf_src_dst,
+                                                      ip6_header);
+
+    /* Extract ip6 src and dst into smaller 256-bit wide register. */
+    __m256i v_ip6_src_dst = _mm512_extracti64x4_epi64(v_ip6_hdr_shuf, 0);
+
+    /* These two shuffle masks, v_swap16a and v_swap16b, are to shuffle the
+     * src and dst fields and add padding after each 16-bit value for the
+     * following carry over addition. */
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF,
+                                          0x0504, 0xFFFF, 0x0706, 0xFFFF,
+                                          0x0100, 0xFFFF, 0x0302, 0xFFFF,
+                                          0x0504, 0xFFFF, 0x0706, 0xFFFF);
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
+                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
+                                          0x0908, 0xFFFF, 0x0B0A, 0xFFFF,
+                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF);
+    __m256i v_shuf_old1 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16a);
+    __m256i v_shuf_old2 = _mm256_shuffle_epi8(v_ip6_src_dst, v_swap16b);
+
+    /* Add each part of the old and new headers together. */
+    __m256i v_delta = _mm256_add_epi32(v_shuf_old1, v_shuf_old2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+
+    /* Shuffle 32-bit value from 3rd lane into first lane for final
+     * horizontal add. */
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+    v_delta = _mm256_permutexvar_epi32(v_swap32a, v_delta);
+
+    v_delta = _mm256_hadd_epi32(v_delta, v_zeros);
+    v_delta = _mm256_hadd_epi16(v_delta, v_zeros);
+
+    /* Extract delta value. */
+    return _mm256_extract_epi16(v_delta, 0);
+}
+
+static inline uint16_t ALWAYS_INLINE
+__attribute__((__target__("avx512vbmi")))
+avx512_ipv6_addr_csum_delta(__m512i old_header, __m512i new_header)
+{
+    uint16_t old_delta = avx512_ipv6_sum_header(old_header);
+    uint16_t new_delta = avx512_ipv6_sum_header(new_header);
+    uint32_t csum_delta = (uint16_t)~old_delta + new_delta;
+
+    return  ~csum_finish(csum_delta);
+}
+
+/* This function performs the same operation on each packet in the batch as
+ * the scalar odp_set_ipv6() function. */
+static void
+__attribute__((__target__("avx512vbmi")))
+action_avx512_ipv6_set_addrs(struct dp_packet_batch *batch,
+                             const struct nlattr *a)
+{
+    const struct ovs_key_ipv6 *key, *mask;
+    struct dp_packet *packet;
+
+    a = nl_attr_get(a);
+    key = nl_attr_get(a);
+    mask = odp_get_key_mask(a, struct ovs_key_ipv6);
+
+    /* Read the content of the key and mask in the respective registers. We
+     * only load the size of the actual structure, which is only 40 bytes. */
+    __m512i v_key = _mm512_maskz_loadu_epi64(0x1F, (void *) key);
+    __m512i v_mask = _mm512_maskz_loadu_epi64(0x1F, (void *) mask);
+
+    /* This shuffle mask v_shuffle, is to shuffle key and mask to match the
+     * ip6_hdr structure layout. */
+    static const uint8_t ip_shuffle_mask[64] = {
+            0x20, 0x21, 0x22, 0x23, 0xFF, 0xFF, 0x24, 0x26,
+            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+            0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+            0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF, 0xFF,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0XFF, 0xFF
+    };
+
+    __m512i v_shuffle = _mm512_loadu_si512((void *) ip_shuffle_mask);
+
+    /* This shuffle is required for key and mask to match the layout of the
+     * ip6_hdr struct. */
+    __m512i v_key_shuf = _mm512_permutexvar_epi8(v_shuffle, v_key);
+    __m512i v_mask_shuf = _mm512_permutexvar_epi8(v_shuffle, v_mask);
+
+
+    /* Set the v_zero register to all zero's. */
+    const __m128i v_zeros = _mm_setzero_si128();
+    /* Set the v_all_ones register to all one's. */
+    const __m128i v_all_ones = _mm_cmpeq_epi16(v_zeros, v_zeros);
+
+    /* Load ip6 src and dst respectively into 128-bit wide registers. */
+    __m128i v_src = _mm_loadu_si128((void *) mask);
+    __m128i v_dst = _mm_maskz_loadu_epi64(0xC,(void *) mask);
+
+    /* Perform a bitwise OR between src and dst registers. */
+    __m128i v_or = _mm_or_si128(v_src, v_dst);
+
+    /* Will return true if any bit has been set in v_or, else it will return
+     * false. */
+    bool do_checksum = !_mm_test_all_zeros(v_or, v_all_ones);
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        struct ovs_16aligned_ip6_hdr *nh = dp_packet_l3(packet);
+
+        /* Load the 40 bytes of the IPv6 header. */
+        __m512i v_packet = _mm512_maskz_loadu_epi64(0x1F, (void *) nh);
+
+        /* AND the v_pkt_mask to the packet data (v_packet). */
+        __m512i v_pkt_masked = _mm512_andnot_si512(v_mask_shuf, v_packet);
+
+        /* OR the new addresses (v_key_shuf) with the masked packet addresses
+         * (v_pkt_masked). */
+        __m512i v_new_hdr = _mm512_or_si512(v_key_shuf, v_pkt_masked);
+
+        /* If ip6_src or ip6_dst has been modified, L4 checksum needs to
+         * be updated. */
+        if (do_checksum) {
+            uint8_t proto = nh->ip6_nxt;
+            uint16_t delta_checksum = avx512_ipv6_addr_csum_delta(v_packet,
+                                                                  v_new_hdr);
+
+            if (proto == IPPROTO_UDP) {
+                struct udp_header *uh = dp_packet_l4(packet);
+
+                if (uh->udp_csum) {
+                    uint16_t old_udp_checksum = ~uh->udp_csum;
+                    uint32_t udp_checksum = old_udp_checksum + delta_checksum;
+
+                    udp_checksum = csum_finish(udp_checksum);
+
+                    if (!udp_checksum) {
+                        udp_checksum = htons(0xffff);
+                    }
+
+                    uh->udp_csum = udp_checksum;
+                }
+            } else if (proto == IPPROTO_TCP) {
+                struct tcp_header *th = dp_packet_l4(packet);
+                uint16_t old_tcp_checksum = ~th->tcp_csum;
+                uint32_t tcp_checksum = old_tcp_checksum + delta_checksum;
+
+                tcp_checksum = csum_finish(tcp_checksum);
+                th->tcp_csum = tcp_checksum;
+            } else if (proto == IPPROTO_ICMPV6) {
+                struct icmp6_header *icmp = dp_packet_l4(packet);
+                uint16_t old_icmp6_checksum = ~icmp->icmp6_cksum;
+                uint32_t icmp6_checksum = old_icmp6_checksum + delta_checksum;
+
+                icmp6_checksum = csum_finish(icmp6_checksum);
+                icmp->icmp6_cksum = icmp6_checksum;
+            }
+        }
+        /* Write back the modified IPv6 addresses. */
+         _mm512_mask_storeu_epi64((void *) nh, 0x1F, v_new_hdr);
+    }
+}
+#endif /* HAVE_AVX512VBMI */
+
 static void
 action_avx512_set_masked(struct dp_packet_batch *batch, const struct nlattr *a)
 {
@@ -514,6 +711,13 @@  action_avx512_init(struct odp_execute_action_impl *self OVS_UNUSED)
     impl_set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_avx512_eth_set_addrs;
     impl_set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_avx512_ipv4_set_addrs;
 
+#if HAVE_AVX512VBMI
+    if (action_avx512vbmi_isa_probe()) {
+        impl_set_masked_funcs[OVS_KEY_ATTR_IPV6] =
+                              action_avx512_ipv6_set_addrs;
+    }
+#endif
+
     return 0;
 }
 
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index f80ae5a23..8b86b1e4f 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -60,6 +60,23 @@  action_avx512_isa_probe(void)
 
 #endif
 
+#if ACTION_IMPL_AVX512_CHECK && HAVE_AVX512VBMI
+bool
+action_avx512vbmi_isa_probe(void)
+{
+    if (cpu_has_isa(OVS_CPU_ISA_X86_AVX512VBMI)) {
+        return true;
+    }
+    return false;
+}
+#else
+bool
+action_avx512vbmi_isa_probe(void)
+{
+    return false;
+}
+#endif
+
 static struct odp_execute_action_impl action_impls[] = {
     [ACTION_IMPL_AUTOVALIDATOR] = {
         .available = false,
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
index 940180c99..643f41c2a 100644
--- a/lib/odp-execute-private.h
+++ b/lib/odp-execute-private.h
@@ -78,6 +78,7 @@  BUILD_ASSERT_DECL(ACTION_IMPL_AUTOVALIDATOR == 1);
 #define ACTION_IMPL_BEGIN (ACTION_IMPL_AUTOVALIDATOR + 1)
 
 bool action_avx512_isa_probe(void);
+bool action_avx512vbmi_isa_probe(void);
 
 /* Odp execute init handles setting up the state of the actions functions at
  * initialization time. It cannot return errors, as it must always succeed in