diff mbox series

[ovs-dev,v6,11/11] odp-execute: Add ISA implementation of set_masked IPv4 action

Message ID 20220510142202.1087967-12-emma.finn@intel.com
State Changes Requested
Headers show
Series Actions Infrastructure + Optimizations | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation fail test: fail

Commit Message

Emma Finn May 10, 2022, 2:22 p.m. UTC
This commit adds support for the AVX512 implementation of the
ipv4_set_addrs action as well as an AVX512 implementation of
updating the checksums.

Signed-off-by: Emma Finn <emma.finn@intel.com>
---
 lib/odp-execute-avx512.c  | 194 ++++++++++++++++++++++++++++++++++++++
 lib/odp-execute-private.c |   1 +
 lib/odp-execute.c         |  21 ++++-
 3 files changed, 211 insertions(+), 5 deletions(-)

Comments

Eelco Chaudron June 2, 2022, 2:43 p.m. UTC | #1
On 10 May 2022, at 16:22, Emma Finn wrote:

> This commit adds support for the AVX512 implementation of the
> ipv4_set_addrs action as well as an AVX512 implementation of
> updating the checksums.
>
> Signed-off-by: Emma Finn <emma.finn@intel.com>
> ---
>  lib/odp-execute-avx512.c  | 194 ++++++++++++++++++++++++++++++++++++++
>  lib/odp-execute-private.c |   1 +
>  lib/odp-execute.c         |  21 ++++-
>  3 files changed, 211 insertions(+), 5 deletions(-)
>
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index ede00b750..618fa37a7 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -22,6 +22,7 @@
>  #include <config.h>
>  #include <errno.h>
>
> +#include "csum.h"
>  #include "cpu.h"
>  #include "dp-packet.h"
>  #include "immintrin.h"
> @@ -175,6 +176,197 @@ action_avx512_eth_set_addrs(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
>      }
>  }
>
> +static inline uint16_t ALWAYS_INLINE
> +avx512_l4_update_csum(struct ip_header *old_header, __m256i res)
> +{

Please add comments to the two below checksum functions, and I’ll do a full review in the next revision.

> +    uint16_t tmp_checksum;
> +    __m256i v_zeros = _mm256_setzero_si256();
> +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0x0504, 0xffff, 0x0706, 0xffff,
> +                                          0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);
> +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
> +                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);
> +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> +                                          0xF, 0xF, 0xF, 0xF);
> +
> +    __m256i oh = _mm256_loadu_si256((void *) old_header);
> +    oh = _mm256_mask_blend_epi16(0x3C0, oh, res);
> +    __m256i v_shuf1 = _mm256_shuffle_epi8(oh, v_swap16a);
> +    __m256i v_shuf2 = _mm256_shuffle_epi8(oh, v_swap16b);
> +
> +    /* Add field values. */
> +    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
> +
> +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +
> +    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
> +    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
> +
> +    /* Extract checksum value. */
> +    tmp_checksum = _mm256_extract_epi16(v_sum, 0);
> +
> +    return ~tmp_checksum;
> +}
> +
> +static inline uint16_t ALWAYS_INLINE
> +avx512_ipv4_recalc_csum(__m256i res)

Dont think this is a recalc, but just a new calculation, so maybe just call it
avx512_ipv4_csum()?

> +{
> +    uint32_t new_checksum;
> +    __m256i v_zeros = _mm256_setzero_si256();
> +
> +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0x0504, 0xffff, 0x0706, 0xffff,
> +                                          0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);
> +
> +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
> +                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);
> +
> +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> +                                          0xF, 0xF, 0xF, 0xF);

The above constant data seem to be the same as in avx512_l4_update_csum() so maybe define them as a constant and add some description to them.

> +
> +    __m256i v_shuf1 = _mm256_shuffle_epi8(res, v_swap16a);
> +    __m256i v_shuf2 = _mm256_shuffle_epi8(res, v_swap16b);
> +
> +    /* Add field values. */
> +    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
> +
> +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +
> +    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
> +    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
> +
> +    /* Extract new checksum value. */
> +    new_checksum = _mm256_extract_epi16(v_sum, 0);
> +
> +    return ~new_checksum;

How are IP options handled here?

> +}
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_dst));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_proto));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_tos));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_ttl));

Some comments on why we need these asserts and how they related to the code above/below.

> +
> +static void
> +action_avx512_ipv4_set_addrs(void *dp OVS_UNUSED,
> +                             struct dp_packet_batch *batch,
> +                             const struct nlattr *a,
> +                             bool should_steal OVS_UNUSED)
> +{
> +    a = nl_attr_get(a);
> +    const struct ovs_key_ipv4 *key = nl_attr_get(a);
> +    const struct ovs_key_ipv4 *mask = get_mask(a, struct ovs_key_ipv4);
> +    struct dp_packet *packet;
> +    ovs_be16 old_csum;
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct ip_header *nh = dp_packet_l3(packet);
> +        old_csum = nh->ip_csum;
> +
> +        __m256i v_key = _mm256_loadu_si256((void *) key);
> +        __m256i v_mask = _mm256_loadu_si256((void *) mask);

These two are not overwritten can we load them outside the loop?

> +        __m256i v_packet = _mm256_loadu_si256((void *) nh);
> +
> +        /* Shuffle key and mask to match ip_header struct layout. */
> +        static const uint8_t ip_shuffle_mask[32] = {
> +            0xFF, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> +            6, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
> +            0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};

Explain what the shuffle here?

> +        __m256i v_shuf32 = _mm256_setr_epi32(0x0, 0x2, 0xF, 0xF,
> +                                             0x1, 0xF, 0xF, 0xF);

v_shuf32 makes no sense to me, can we add a comment on what we do? Here and all the code below, as I do not want to figure it out each review ;)

> +
> +        __m256i v_shuffle = _mm256_loadu_si256((void *) ip_shuffle_mask);
> +
> +        __m256i v_key_shuf = _mm256_permutexvar_epi32(v_shuf32, v_key);
> +        v_key_shuf = _mm256_shuffle_epi8(v_key_shuf, v_shuffle);
> +
> +        __m256i v_mask_shuf = _mm256_permutexvar_epi32(v_shuf32, v_mask);
> +        v_mask_shuf = _mm256_shuffle_epi8(v_mask_shuf, v_shuffle);
> +
> +        __m256i v_pkt_masked = _mm256_andnot_si256(v_mask_shuf, v_packet);
> +        __m256i v_res = _mm256_or_si256(v_key_shuf, v_pkt_masked);
> +
> +        /* Update checksum. */
> +        uint16_t checksum = avx512_ipv4_recalc_csum(v_res);

> +
> +        /* Insert new checksum. */
> +        v_res = _mm256_insert_epi16(v_res, checksum, 5);
> +
> +       /* If ip_src or ip_dst has been modified, L4 checksum needs to
> +        * be updated too.
> +        */
> +        int update_mask = _mm256_movemask_epi8(v_mask);
> +        if (update_mask & 0xFF) {
> +
> +            uint16_t tmp_checksum = avx512_l4_update_csum(nh, v_res);
> +            tmp_checksum = ~tmp_checksum;
> +            uint16_t csum;
> +
> +            if (nh->ip_proto == IPPROTO_UDP) {
> +                /* New UDP checksum. */
> +                struct udp_header *uh = dp_packet_l4(packet);

This can be moved under the if statement below.

> +                if (uh->udp_csum) {
> +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> +
> +                    uint32_t udp_checksum = old_csum + tmp_checksum;
> +                    udp_checksum = csum_finish(udp_checksum);
> +                    uint16_t udp_csum = ~udp_checksum;
> +
> +                    uint32_t nw_udp_checksum = udp_csum + old_udp_checksum;
> +
> +                    csum =  csum_finish(nw_udp_checksum);
> +

I think there is this special UDP case that might need handling here:

if (!csum) {
     csum = htons(0xffff);

> +                    /* Insert new udp checksum. */
> +                    v_res = _mm256_insert_epi16(v_res, csum, 13);
> +                }
> +            }

Guess this could be an " } else if if (nh->ip_proto == IPPROTO_TCP) {"

> +            if (nh->ip_proto == IPPROTO_TCP) {
> +                /* New TCP checksum. */
> +                struct tcp_header *th = dp_packet_l4(packet);
> +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> +
> +                uint32_t tcp_checksum = old_csum + tmp_checksum;
> +                tcp_checksum = csum_finish(tcp_checksum);
> +                uint16_t tcp_csum = ~tcp_checksum;
> +
> +                uint32_t nw_tcp_checksum = tcp_csum + old_tcp_checksum;
> +
> +                csum =  csum_finish(nw_tcp_checksum);
> +
> +                th->tcp_csum = csum;
> +            }
> +        }
> +
> +        /* Store new IP header. */
> +        _mm256_storeu_si256((void *) nh, v_res);
> +    }
> +}
> +

Did not review the below, as the architecture needs changing (see the previous patch).

>  static void
>  action_avx512_set_masked(void *dp OVS_UNUSED,
>                           struct dp_packet_batch *batch OVS_UNUSED,
> @@ -233,6 +425,8 @@ action_avx512_init(struct odp_execute_action_impl *self)
>      self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_avx512_set_masked;
>      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
>                              action_avx512_eth_set_addrs;
> +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] =
> +                            action_avx512_ipv4_set_addrs;
>      active_impl = *self;
>
>      return 0;
> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> index 34f13523a..cb77bab31 100644
> --- a/lib/odp-execute-private.c
> +++ b/lib/odp-execute-private.c
> @@ -284,6 +284,7 @@ action_autoval_init(struct odp_execute_action_impl *self)
>      self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_autoval_generic;
>      self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_set_masked_init;
>      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_autoval_generic;
> +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_autoval_generic;
>      active_impl = *self;
>
>      return 0;
> diff --git a/lib/odp-execute.c b/lib/odp-execute.c
> index 5c4dd8e33..cbf528f93 100644
> --- a/lib/odp-execute.c
> +++ b/lib/odp-execute.c
> @@ -585,11 +585,6 @@ odp_execute_masked_set_action(struct dp_packet *packet,
>          break;
>      }
>
> -    case OVS_KEY_ATTR_IPV4:
> -        odp_set_ipv4(packet, nl_attr_get(a),
> -                     get_mask(a, struct ovs_key_ipv4));
> -        break;
> -
>      case OVS_KEY_ATTR_IPV6:
>          odp_set_ipv6(packet, nl_attr_get(a),
>                       get_mask(a, struct ovs_key_ipv6));
> @@ -657,6 +652,7 @@ odp_execute_masked_set_action(struct dp_packet *packet,
>      case OVS_KEY_ATTR_ETHERNET:
>      case OVS_KEY_ATTR_ETHERTYPE:
>      case OVS_KEY_ATTR_IN_PORT:
> +    case OVS_KEY_ATTR_IPV4:
>      case OVS_KEY_ATTR_VLAN:
>      case OVS_KEY_ATTR_ICMP:
>      case OVS_KEY_ATTR_ICMPV6:
> @@ -892,6 +888,20 @@ action_mod_eth(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
>      }
>  }
>
> +static void
> +action_mod_ipv4(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
> +                const struct nlattr *a OVS_UNUSED,
> +                bool should_steal OVS_UNUSED)
> +{
> +    a = nl_attr_get(a);
> +    struct dp_packet *packet;
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        odp_set_ipv4(packet, nl_attr_get(a),
> +                     get_mask(a, struct ovs_key_ipv4));
> +    }
> +}
> +
>  /* Implementation of the scalar actions impl init function. Build up the
>   * array of func ptrs here.
>   */
> @@ -902,6 +912,7 @@ odp_action_scalar_init(struct odp_execute_action_impl *self)
>      self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_push_vlan;
>      self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_set_masked;
>      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_mod_eth;
> +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_mod_ipv4;
>      actions_active_impl = *self;
>
>      return 0;
> -- 
> 2.25.1

This completes my initial review of this series, let me know if something is not clear. I have not done any actual testing on an AVX machine, but I will try to do that on the next revision.

Also, wondering how you tested performance on all of this? Maybe you can some details to the cover letter on how the relative performance numbers were gathered? Also wondering if you tested all of this without including DPDK in your build (it was/is on my TODO but I have not AVX machine yet)?

Cheers,

Eelco
Emma Finn June 2, 2022, 2:59 p.m. UTC | #2
> -----Original Message-----
> From: Eelco Chaudron <echaudro@redhat.com>
> Sent: Thursday 2 June 2022 15:43
> To: Finn, Emma <emma.finn@intel.com>
> Cc: Van Haaren, Harry <harry.van.haaren@intel.com>; Amber, Kumar
> <kumar.amber@intel.com>; Stokes, Ian <ian.stokes@intel.com>;
> dev@openvswitch.org
> Subject: Re: [v6 11/11] odp-execute: Add ISA implementation of
> set_masked IPv4 action
> 
> On 10 May 2022, at 16:22, Emma Finn wrote:
> 
> > This commit adds support for the AVX512 implementation of the
> > ipv4_set_addrs action as well as an AVX512 implementation of updating
> > the checksums.
> >
> > Signed-off-by: Emma Finn <emma.finn@intel.com>
> > ---
> >  lib/odp-execute-avx512.c  | 194
> ++++++++++++++++++++++++++++++++++++++
> >  lib/odp-execute-private.c |   1 +
> >  lib/odp-execute.c         |  21 ++++-
> >  3 files changed, 211 insertions(+), 5 deletions(-)
> >
> > diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index
> > ede00b750..618fa37a7 100644
> > --- a/lib/odp-execute-avx512.c
> > +++ b/lib/odp-execute-avx512.c
> > @@ -22,6 +22,7 @@
> >  #include <config.h>
> >  #include <errno.h>
> >
> > +#include "csum.h"
> >  #include "cpu.h"
> >  #include "dp-packet.h"
> >  #include "immintrin.h"
> > @@ -175,6 +176,197 @@ action_avx512_eth_set_addrs(void *dp
> OVS_UNUSED, struct dp_packet_batch *batch,
> >      }
> >  }
> >
> > +static inline uint16_t ALWAYS_INLINE
> > +avx512_l4_update_csum(struct ip_header *old_header, __m256i res) {
> 
> Please add comments to the two below checksum functions, and I’ll do a
> full review in the next revision.
> 
> > +    uint16_t tmp_checksum;
> > +    __m256i v_zeros = _mm256_setzero_si256();
> > +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302,
> 0xffff,
> > +                                          0x0504, 0xffff, 0x0706, 0xffff,
> > +                                          0x0100, 0xffff, 0x0302, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff, 0xffff);
> > +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff,
> 0xffff,
> > +                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff, 0xffff);
> > +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> > +                                          0xF, 0xF, 0xF, 0xF);
> > +
> > +    __m256i oh = _mm256_loadu_si256((void *) old_header);
> > +    oh = _mm256_mask_blend_epi16(0x3C0, oh, res);
> > +    __m256i v_shuf1 = _mm256_shuffle_epi8(oh, v_swap16a);
> > +    __m256i v_shuf2 = _mm256_shuffle_epi8(oh, v_swap16b);
> > +
> > +    /* Add field values. */
> > +    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
> > +
> > +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +
> > +    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
> > +    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
> > +
> > +    /* Extract checksum value. */
> > +    tmp_checksum = _mm256_extract_epi16(v_sum, 0);
> > +
> > +    return ~tmp_checksum;
> > +}
> > +
> > +static inline uint16_t ALWAYS_INLINE
> > +avx512_ipv4_recalc_csum(__m256i res)
> 
> Dont think this is a recalc, but just a new calculation, so maybe just call it
> avx512_ipv4_csum()?
> 
> > +{
> > +    uint32_t new_checksum;
> > +    __m256i v_zeros = _mm256_setzero_si256();
> > +
> > +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302,
> 0xffff,
> > +                                          0x0504, 0xffff, 0x0706, 0xffff,
> > +                                          0x0100, 0xffff, 0x0302, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff,
> > + 0xffff);
> > +
> > +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff,
> 0xffff,
> > +                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff, 0xffff,
> > +                                          0xffff, 0xffff, 0xffff,
> > + 0xffff);
> > +
> > +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> > +                                          0xF, 0xF, 0xF, 0xF);
> 
> The above constant data seem to be the same as in
> avx512_l4_update_csum() so maybe define them as a constant and add
> some description to them.
> 
> > +
> > +    __m256i v_shuf1 = _mm256_shuffle_epi8(res, v_swap16a);
> > +    __m256i v_shuf2 = _mm256_shuffle_epi8(res, v_swap16b);
> > +
> > +    /* Add field values. */
> > +    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
> > +
> > +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +
> > +    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
> > +    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
> > +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> > +    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
> > +
> > +    /* Extract new checksum value. */
> > +    new_checksum = _mm256_extract_epi16(v_sum, 0);
> > +
> > +    return ~new_checksum;
> 
> How are IP options handled here?
> 
> > +}
> > +
> > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
> > +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
> > +                  offsetof(struct ovs_key_ipv4, ipv4_dst));
> > +
> > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
> > +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
> > +                  offsetof(struct ovs_key_ipv4, ipv4_proto));
> > +
> > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
> > +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
> > +                  offsetof(struct ovs_key_ipv4, ipv4_tos));
> > +
> > +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
> > +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
> > +                  offsetof(struct ovs_key_ipv4, ipv4_ttl));
> 
> Some comments on why we need these asserts and how they related to the
> code above/below.
> 
> > +
> > +static void
> > +action_avx512_ipv4_set_addrs(void *dp OVS_UNUSED,
> > +                             struct dp_packet_batch *batch,
> > +                             const struct nlattr *a,
> > +                             bool should_steal OVS_UNUSED) {
> > +    a = nl_attr_get(a);
> > +    const struct ovs_key_ipv4 *key = nl_attr_get(a);
> > +    const struct ovs_key_ipv4 *mask = get_mask(a, struct ovs_key_ipv4);
> > +    struct dp_packet *packet;
> > +    ovs_be16 old_csum;
> > +
> > +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +        struct ip_header *nh = dp_packet_l3(packet);
> > +        old_csum = nh->ip_csum;
> > +
> > +        __m256i v_key = _mm256_loadu_si256((void *) key);
> > +        __m256i v_mask = _mm256_loadu_si256((void *) mask);
> 
> These two are not overwritten can we load them outside the loop?
> 
> > +        __m256i v_packet = _mm256_loadu_si256((void *) nh);
> > +
> > +        /* Shuffle key and mask to match ip_header struct layout. */
> > +        static const uint8_t ip_shuffle_mask[32] = {
> > +            0xFF, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> > +            6, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
> > +            0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF,
> > +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
> 
> Explain what the shuffle here?
> 
> > +        __m256i v_shuf32 = _mm256_setr_epi32(0x0, 0x2, 0xF, 0xF,
> > +                                             0x1, 0xF, 0xF, 0xF);
> 
> v_shuf32 makes no sense to me, can we add a comment on what we do?
> Here and all the code below, as I do not want to figure it out each review ;)
> 
> > +
> > +        __m256i v_shuffle = _mm256_loadu_si256((void *)
> > + ip_shuffle_mask);
> > +
> > +        __m256i v_key_shuf = _mm256_permutexvar_epi32(v_shuf32,
> v_key);
> > +        v_key_shuf = _mm256_shuffle_epi8(v_key_shuf, v_shuffle);
> > +
> > +        __m256i v_mask_shuf = _mm256_permutexvar_epi32(v_shuf32,
> v_mask);
> > +        v_mask_shuf = _mm256_shuffle_epi8(v_mask_shuf, v_shuffle);
> > +
> > +        __m256i v_pkt_masked = _mm256_andnot_si256(v_mask_shuf,
> v_packet);
> > +        __m256i v_res = _mm256_or_si256(v_key_shuf, v_pkt_masked);
> > +
> > +        /* Update checksum. */
> > +        uint16_t checksum = avx512_ipv4_recalc_csum(v_res);
> 
> > +
> > +        /* Insert new checksum. */
> > +        v_res = _mm256_insert_epi16(v_res, checksum, 5);
> > +
> > +       /* If ip_src or ip_dst has been modified, L4 checksum needs to
> > +        * be updated too.
> > +        */
> > +        int update_mask = _mm256_movemask_epi8(v_mask);
> > +        if (update_mask & 0xFF) {
> > +
> > +            uint16_t tmp_checksum = avx512_l4_update_csum(nh, v_res);
> > +            tmp_checksum = ~tmp_checksum;
> > +            uint16_t csum;
> > +
> > +            if (nh->ip_proto == IPPROTO_UDP) {
> > +                /* New UDP checksum. */
> > +                struct udp_header *uh = dp_packet_l4(packet);
> 
> This can be moved under the if statement below.
> 
> > +                if (uh->udp_csum) {
> > +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> > +
> > +                    uint32_t udp_checksum = old_csum + tmp_checksum;
> > +                    udp_checksum = csum_finish(udp_checksum);
> > +                    uint16_t udp_csum = ~udp_checksum;
> > +
> > +                    uint32_t nw_udp_checksum = udp_csum +
> > + old_udp_checksum;
> > +
> > +                    csum =  csum_finish(nw_udp_checksum);
> > +
> 
> I think there is this special UDP case that might need handling here:
> 
> if (!csum) {
>      csum = htons(0xffff);
> 
> > +                    /* Insert new udp checksum. */
> > +                    v_res = _mm256_insert_epi16(v_res, csum, 13);
> > +                }
> > +            }
> 
> Guess this could be an " } else if if (nh->ip_proto == IPPROTO_TCP) {"
> 
> > +            if (nh->ip_proto == IPPROTO_TCP) {
> > +                /* New TCP checksum. */
> > +                struct tcp_header *th = dp_packet_l4(packet);
> > +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> > +
> > +                uint32_t tcp_checksum = old_csum + tmp_checksum;
> > +                tcp_checksum = csum_finish(tcp_checksum);
> > +                uint16_t tcp_csum = ~tcp_checksum;
> > +
> > +                uint32_t nw_tcp_checksum = tcp_csum +
> > + old_tcp_checksum;
> > +
> > +                csum =  csum_finish(nw_tcp_checksum);
> > +
> > +                th->tcp_csum = csum;
> > +            }
> > +        }
> > +
> > +        /* Store new IP header. */
> > +        _mm256_storeu_si256((void *) nh, v_res);
> > +    }
> > +}
> > +
> 
> Did not review the below, as the architecture needs changing (see the
> previous patch).
> 
> >  static void
> >  action_avx512_set_masked(void *dp OVS_UNUSED,
> >                           struct dp_packet_batch *batch OVS_UNUSED, @@
> > -233,6 +425,8 @@ action_avx512_init(struct odp_execute_action_impl
> *self)
> >      self->funcs[OVS_ACTION_ATTR_SET_MASKED] =
> action_avx512_set_masked;
> >      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
> >                              action_avx512_eth_set_addrs;
> > +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] =
> > +                            action_avx512_ipv4_set_addrs;
> >      active_impl = *self;
> >
> >      return 0;
> > diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> > index 34f13523a..cb77bab31 100644
> > --- a/lib/odp-execute-private.c
> > +++ b/lib/odp-execute-private.c
> > @@ -284,6 +284,7 @@ action_autoval_init(struct
> odp_execute_action_impl *self)
> >      self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] =
> action_autoval_generic;
> >      self->funcs[OVS_ACTION_ATTR_SET_MASKED] =
> action_set_masked_init;
> >      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
> > action_autoval_generic;
> > +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] =
> > + action_autoval_generic;
> >      active_impl = *self;
> >
> >      return 0;
> > diff --git a/lib/odp-execute.c b/lib/odp-execute.c index
> > 5c4dd8e33..cbf528f93 100644
> > --- a/lib/odp-execute.c
> > +++ b/lib/odp-execute.c
> > @@ -585,11 +585,6 @@ odp_execute_masked_set_action(struct
> dp_packet *packet,
> >          break;
> >      }
> >
> > -    case OVS_KEY_ATTR_IPV4:
> > -        odp_set_ipv4(packet, nl_attr_get(a),
> > -                     get_mask(a, struct ovs_key_ipv4));
> > -        break;
> > -
> >      case OVS_KEY_ATTR_IPV6:
> >          odp_set_ipv6(packet, nl_attr_get(a),
> >                       get_mask(a, struct ovs_key_ipv6)); @@ -657,6
> > +652,7 @@ odp_execute_masked_set_action(struct dp_packet *packet,
> >      case OVS_KEY_ATTR_ETHERNET:
> >      case OVS_KEY_ATTR_ETHERTYPE:
> >      case OVS_KEY_ATTR_IN_PORT:
> > +    case OVS_KEY_ATTR_IPV4:
> >      case OVS_KEY_ATTR_VLAN:
> >      case OVS_KEY_ATTR_ICMP:
> >      case OVS_KEY_ATTR_ICMPV6:
> > @@ -892,6 +888,20 @@ action_mod_eth(void *dp OVS_UNUSED, struct
> dp_packet_batch *batch,
> >      }
> >  }
> >
> > +static void
> > +action_mod_ipv4(void *dp OVS_UNUSED, struct dp_packet_batch
> *batch,
> > +                const struct nlattr *a OVS_UNUSED,
> > +                bool should_steal OVS_UNUSED) {
> > +    a = nl_attr_get(a);
> > +    struct dp_packet *packet;
> > +
> > +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> > +        odp_set_ipv4(packet, nl_attr_get(a),
> > +                     get_mask(a, struct ovs_key_ipv4));
> > +    }
> > +}
> > +
> >  /* Implementation of the scalar actions impl init function. Build up the
> >   * array of func ptrs here.
> >   */
> > @@ -902,6 +912,7 @@ odp_action_scalar_init(struct
> odp_execute_action_impl *self)
> >      self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_push_vlan;
> >      self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_set_masked;
> >      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
> action_mod_eth;
> > +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_mod_ipv4;
> >      actions_active_impl = *self;
> >
> >      return 0;
> > --
> > 2.25.1
> 
> This completes my initial review of this series, let me know if something is
> not clear. I have not done any actual testing on an AVX machine, but I will
> try to do that on the next revision.
> 
> Also, wondering how you tested performance on all of this? Maybe you can
> some details to the cover letter on how the relative performance numbers
> were gathered? Also wondering if you tested all of this without including
> DPDK in your build (it was/is on my TODO but I have not AVX machine yet)?
> 
> Cheers,
> 
> Eelco

Hi Eelco, 

Thanks for the review. Will take the comments on board and start re working for the next revision. 

Thanks, 
Emma
Emma Finn June 14, 2022, 11:40 a.m. UTC | #3
> -----Original Message-----
> From: Finn, Emma
> Sent: Thursday 2 June 2022 16:00
> To: Eelco Chaudron <echaudro@redhat.com>
> Cc: Van Haaren, Harry <harry.van.haaren@intel.com>; Amber, Kumar
> <Kumar.Amber@intel.com>; Stokes, Ian <ian.stokes@intel.com>;
> dev@openvswitch.org
> Subject: RE: [v6 11/11] odp-execute: Add ISA implementation of
> set_masked IPv4 action
> 
> 
> 
> > -----Original Message-----
> > From: Eelco Chaudron <echaudro@redhat.com>
> > Sent: Thursday 2 June 2022 15:43
> > To: Finn, Emma <emma.finn@intel.com>
> > Cc: Van Haaren, Harry <harry.van.haaren@intel.com>; Amber, Kumar
> > <kumar.amber@intel.com>; Stokes, Ian <ian.stokes@intel.com>;
> > dev@openvswitch.org
> > Subject: Re: [v6 11/11] odp-execute: Add ISA implementation of
> > set_masked IPv4 action
> >
> > On 10 May 2022, at 16:22, Emma Finn wrote:
> >
> > > This commit adds support for the AVX512 implementation of the
> > > ipv4_set_addrs action as well as an AVX512 implementation of
> > > updating the checksums.
> > >
> > > Signed-off-by: Emma Finn <emma.finn@intel.com>
> > > ---
> > >  lib/odp-execute-avx512.c  | 194
> > ++++++++++++++++++++++++++++++++++++++
> > >  lib/odp-execute-private.c |   1 +
> > >  lib/odp-execute.c         |  21 ++++-
> > >  3 files changed, 211 insertions(+), 5 deletions(-)
> > >

<snip>

> >
> > This completes my initial review of this series, let me know if
> > something is not clear. I have not done any actual testing on an AVX
> > machine, but I will try to do that on the next revision.
> >
> > Also, wondering how you tested performance on all of this? Maybe you
> > can some details to the cover letter on how the relative performance
> > numbers were gathered? Also wondering if you tested all of this
> > without including DPDK in your build (it was/is on my TODO but I have
> not AVX machine yet)?

The relative performance numbers was only tested with OvS/DPDK, I did not test on OvS alone. Each action was tested individually, using a single PMD thread and a single flow like such $ ovs-ofctl add-flow br0 'in_port=1,ip,actions=mod_nw_ttl:254,output=2'.  All other AVX components were disabled (DPCLS,MFEX,etc.) and traffic was sent at 100% line rate.


> >
> > Cheers,
> >
> > Eelco
> 
> Hi Eelco,
> 
> Thanks for the review. Will take the comments on board and start re
> working for the next revision.
> 
> Thanks,
> Emma
diff mbox series

Patch

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index ede00b750..618fa37a7 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -22,6 +22,7 @@ 
 #include <config.h>
 #include <errno.h>
 
+#include "csum.h"
 #include "cpu.h"
 #include "dp-packet.h"
 #include "immintrin.h"
@@ -175,6 +176,197 @@  action_avx512_eth_set_addrs(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
     }
 }
 
+static inline uint16_t ALWAYS_INLINE
+avx512_l4_update_csum(struct ip_header *old_header, __m256i res)
+{
+    uint16_t tmp_checksum;
+    __m256i v_zeros = _mm256_setzero_si256();
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
+                                          0x0504, 0xffff, 0x0706, 0xffff,
+                                          0x0100, 0xffff, 0x0302, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
+                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+
+    __m256i oh = _mm256_loadu_si256((void *) old_header);
+    oh = _mm256_mask_blend_epi16(0x3C0, oh, res);
+    __m256i v_shuf1 = _mm256_shuffle_epi8(oh, v_swap16a);
+    __m256i v_shuf2 = _mm256_shuffle_epi8(oh, v_swap16b);
+
+    /* Add field values. */
+    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+
+    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
+    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
+
+    /* Extract checksum value. */
+    tmp_checksum = _mm256_extract_epi16(v_sum, 0);
+
+    return ~tmp_checksum;
+}
+
+static inline uint16_t ALWAYS_INLINE
+avx512_ipv4_recalc_csum(__m256i res)
+{
+    uint32_t new_checksum;
+    __m256i v_zeros = _mm256_setzero_si256();
+
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
+                                          0x0504, 0xffff, 0x0706, 0xffff,
+                                          0x0100, 0xffff, 0x0302, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
+                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+
+    __m256i v_shuf1 = _mm256_shuffle_epi8(res, v_swap16a);
+    __m256i v_shuf2 = _mm256_shuffle_epi8(res, v_swap16b);
+
+    /* Add field values. */
+    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+
+    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
+    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
+
+    /* Extract new checksum value. */
+    new_checksum = _mm256_extract_epi16(v_sum, 0);
+
+    return ~new_checksum;
+}
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_dst));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_proto));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_tos));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_ttl));
+
+static void
+action_avx512_ipv4_set_addrs(void *dp OVS_UNUSED,
+                             struct dp_packet_batch *batch,
+                             const struct nlattr *a,
+                             bool should_steal OVS_UNUSED)
+{
+    a = nl_attr_get(a);
+    const struct ovs_key_ipv4 *key = nl_attr_get(a);
+    const struct ovs_key_ipv4 *mask = get_mask(a, struct ovs_key_ipv4);
+    struct dp_packet *packet;
+    ovs_be16 old_csum;
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        struct ip_header *nh = dp_packet_l3(packet);
+        old_csum = nh->ip_csum;
+
+        __m256i v_key = _mm256_loadu_si256((void *) key);
+        __m256i v_mask = _mm256_loadu_si256((void *) mask);
+        __m256i v_packet = _mm256_loadu_si256((void *) nh);
+
+        /* Shuffle key and mask to match ip_header struct layout. */
+        static const uint8_t ip_shuffle_mask[32] = {
+            0xFF, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+            6, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
+            0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+        __m256i v_shuf32 = _mm256_setr_epi32(0x0, 0x2, 0xF, 0xF,
+                                             0x1, 0xF, 0xF, 0xF);
+
+        __m256i v_shuffle = _mm256_loadu_si256((void *) ip_shuffle_mask);
+
+        __m256i v_key_shuf = _mm256_permutexvar_epi32(v_shuf32, v_key);
+        v_key_shuf = _mm256_shuffle_epi8(v_key_shuf, v_shuffle);
+
+        __m256i v_mask_shuf = _mm256_permutexvar_epi32(v_shuf32, v_mask);
+        v_mask_shuf = _mm256_shuffle_epi8(v_mask_shuf, v_shuffle);
+
+        __m256i v_pkt_masked = _mm256_andnot_si256(v_mask_shuf, v_packet);
+        __m256i v_res = _mm256_or_si256(v_key_shuf, v_pkt_masked);
+
+        /* Update checksum. */
+        uint16_t checksum = avx512_ipv4_recalc_csum(v_res);
+
+        /* Insert new checksum. */
+        v_res = _mm256_insert_epi16(v_res, checksum, 5);
+
+       /* If ip_src or ip_dst has been modified, L4 checksum needs to
+        * be updated too.
+        */
+        int update_mask = _mm256_movemask_epi8(v_mask);
+        if (update_mask & 0xFF) {
+
+            uint16_t tmp_checksum = avx512_l4_update_csum(nh, v_res);
+            tmp_checksum = ~tmp_checksum;
+            uint16_t csum;
+
+            if (nh->ip_proto == IPPROTO_UDP) {
+                /* New UDP checksum. */
+                struct udp_header *uh = dp_packet_l4(packet);
+                if (uh->udp_csum) {
+                    uint16_t old_udp_checksum = ~uh->udp_csum;
+
+                    uint32_t udp_checksum = old_csum + tmp_checksum;
+                    udp_checksum = csum_finish(udp_checksum);
+                    uint16_t udp_csum = ~udp_checksum;
+
+                    uint32_t nw_udp_checksum = udp_csum + old_udp_checksum;
+
+                    csum =  csum_finish(nw_udp_checksum);
+
+                    /* Insert new udp checksum. */
+                    v_res = _mm256_insert_epi16(v_res, csum, 13);
+                }
+            }
+            if (nh->ip_proto == IPPROTO_TCP) {
+                /* New TCP checksum. */
+                struct tcp_header *th = dp_packet_l4(packet);
+                uint16_t old_tcp_checksum = ~th->tcp_csum;
+
+                uint32_t tcp_checksum = old_csum + tmp_checksum;
+                tcp_checksum = csum_finish(tcp_checksum);
+                uint16_t tcp_csum = ~tcp_checksum;
+
+                uint32_t nw_tcp_checksum = tcp_csum + old_tcp_checksum;
+
+                csum =  csum_finish(nw_tcp_checksum);
+
+                th->tcp_csum = csum;
+            }
+        }
+
+        /* Store new IP header. */
+        _mm256_storeu_si256((void *) nh, v_res);
+    }
+}
+
 static void
 action_avx512_set_masked(void *dp OVS_UNUSED,
                          struct dp_packet_batch *batch OVS_UNUSED,
@@ -233,6 +425,8 @@  action_avx512_init(struct odp_execute_action_impl *self)
     self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_avx512_set_masked;
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
                             action_avx512_eth_set_addrs;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] =
+                            action_avx512_ipv4_set_addrs;
     active_impl = *self;
 
     return 0;
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index 34f13523a..cb77bab31 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -284,6 +284,7 @@  action_autoval_init(struct odp_execute_action_impl *self)
     self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_autoval_generic;
     self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_set_masked_init;
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_autoval_generic;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_autoval_generic;
     active_impl = *self;
 
     return 0;
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index 5c4dd8e33..cbf528f93 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -585,11 +585,6 @@  odp_execute_masked_set_action(struct dp_packet *packet,
         break;
     }
 
-    case OVS_KEY_ATTR_IPV4:
-        odp_set_ipv4(packet, nl_attr_get(a),
-                     get_mask(a, struct ovs_key_ipv4));
-        break;
-
     case OVS_KEY_ATTR_IPV6:
         odp_set_ipv6(packet, nl_attr_get(a),
                      get_mask(a, struct ovs_key_ipv6));
@@ -657,6 +652,7 @@  odp_execute_masked_set_action(struct dp_packet *packet,
     case OVS_KEY_ATTR_ETHERNET:
     case OVS_KEY_ATTR_ETHERTYPE:
     case OVS_KEY_ATTR_IN_PORT:
+    case OVS_KEY_ATTR_IPV4:
     case OVS_KEY_ATTR_VLAN:
     case OVS_KEY_ATTR_ICMP:
     case OVS_KEY_ATTR_ICMPV6:
@@ -892,6 +888,20 @@  action_mod_eth(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
     }
 }
 
+static void
+action_mod_ipv4(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
+                const struct nlattr *a OVS_UNUSED,
+                bool should_steal OVS_UNUSED)
+{
+    a = nl_attr_get(a);
+    struct dp_packet *packet;
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        odp_set_ipv4(packet, nl_attr_get(a),
+                     get_mask(a, struct ovs_key_ipv4));
+    }
+}
+
 /* Implementation of the scalar actions impl init function. Build up the
  * array of func ptrs here.
  */
@@ -902,6 +912,7 @@  odp_action_scalar_init(struct odp_execute_action_impl *self)
     self->funcs[OVS_ACTION_ATTR_PUSH_VLAN] = action_push_vlan;
     self->funcs[OVS_ACTION_ATTR_SET_MASKED] = action_set_masked;
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_mod_eth;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_mod_ipv4;
     actions_active_impl = *self;
 
     return 0;