diff mbox series

[ovs-dev,v7,11/11] odp-execute: Add ISA implementation of set_masked IPv4 action

Message ID 20220614115743.1143341-12-emma.finn@intel.com
State Changes Requested
Headers show
Series [ovs-dev,v7,01/11] ofproto-dpif: Fix incorrect checksums in input packets | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed
ovsrobot/intel-ovs-compilation success test: success

Commit Message

Emma Finn June 14, 2022, 11:57 a.m. UTC
This commit adds support for the AVX512 implementation of the
ipv4_set_addrs action as well as an AVX512 implementation of
updating the checksums.

Signed-off-by: Emma Finn <emma.finn@intel.com>
---
 lib/odp-execute-avx512.c  | 210 ++++++++++++++++++++++++++++++++++++++
 lib/odp-execute-private.c |   1 +
 lib/odp-execute.c         |  19 +++-
 3 files changed, 225 insertions(+), 5 deletions(-)

Comments

Eelco Chaudron June 23, 2022, 3:37 p.m. UTC | #1
On 14 Jun 2022, at 13:57, Emma Finn wrote:

> This commit adds support for the AVX512 implementation of the
> ipv4_set_addrs action as well as an AVX512 implementation of
> updating the checksums.
>
> Signed-off-by: Emma Finn <emma.finn@intel.com>
> ---
>  lib/odp-execute-avx512.c  | 210 ++++++++++++++++++++++++++++++++++++++
>  lib/odp-execute-private.c |   1 +
>  lib/odp-execute.c         |  19 +++-
>  3 files changed, 225 insertions(+), 5 deletions(-)
>
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index ffe25b41d..5cba14b92 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -22,6 +22,7 @@
>  #include <config.h>
>  #include <errno.h>
>
> +#include "csum.h"
>  #include "cpu.h"
>  #include "dp-packet.h"
>  #include "immintrin.h"
> @@ -193,6 +194,213 @@ action_avx512_eth_set_addrs(struct dp_packet_batch *batch,
>      }
>  }
>
> +/* Calculate delta checksum by summing only ip_src and ip_dst fields of
> + * ip_header. Resulting checksum will be used for updating L4 checksum */
> +static inline uint16_t ALWAYS_INLINE
> +avx512_l4_update_csum(struct ip_header *old_header, __m256i res)
> +{
> +    uint16_t tmp_checksum;
> +    __m256i v_zeros = _mm256_setzero_si256();
> +
> +    /* Each field needs to be shuffle into 16- bit granularity and across
> +     * lanes. */
> +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0x0504, 0xffff, 0x0706, 0xffff,
> +                                          0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);
> +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
> +                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);

Can you be consistent here, either all hex in capital or lower case?

> +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> +                                          0xF, 0xF, 0xF, 0xF);
> +
> +    __m256i oh = _mm256_loadu_si256((void *) old_header);
> +    oh = _mm256_mask_blend_epi16(0x3C0, oh, res);
> +    __m256i v_shuf1 = _mm256_shuffle_epi8(oh, v_swap16a);
> +    __m256i v_shuf2 = _mm256_shuffle_epi8(oh, v_swap16b);
> +

Stopped reviewing (commenting in the patch) this function as I calculates the checksum only on the 20 bytes, but there can be more.
Also it should not do a full calculation as the data in the header might be received corrupted (see below).

> +    /* Add field values. */
> +    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
> +
> +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +
> +    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */

hadd?

> +    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
> +
> +    /* Extract checksum value. */
> +    tmp_checksum = _mm256_extract_epi16(v_sum, 0);
> +
> +    return ~tmp_checksum;
> +}
> +

Did not review the function below, as it probably needs changing, see below.

> +/* Calculate checksum by summing entire contents of ip_header leaving out
> + * current checksum field. */
> +static inline uint16_t ALWAYS_INLINE
> +avx512_ipv4_recalc_csum(__m256i res)
> +{
> +    uint32_t new_checksum;
> +    __m256i v_zeros = _mm256_setzero_si256();
> +
> +    /* Each field needs to be shuffle into 16-bit granularity and across
> +     * lanes. */
> +    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0x0504, 0xffff, 0x0706, 0xffff,
> +                                          0x0100, 0xffff, 0x0302, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);
> +
> +    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
> +                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff,
> +                                          0xffff, 0xffff, 0xffff, 0xffff);
> +

Can you be consistent here, either all hex in capital or lower case?

> +    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
> +                                          0xF, 0xF, 0xF, 0xF);
> +
> +    __m256i v_shuf1 = _mm256_shuffle_epi8(res, v_swap16a);
> +    __m256i v_shuf2 = _mm256_shuffle_epi8(res, v_swap16b);
> +
> +    /* Add field values. */
> +    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
> +
> +    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +
> +    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
> +    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
> +    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
> +    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
> +
> +    /* Extract new checksum value. */
> +    new_checksum = _mm256_extract_epi16(v_sum, 0);
> +
> +    return ~new_checksum;
> +}
> +

I changes the order, and comments a bit for the below, see the diff at the end.

> +/* The shuffles used in action_avx512_ipv4_set_addrs() require the ovs_key_ipv4
> + * struct to be in this layout. If struct changes, shuffle mask also needs to
> + * be updated. */
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_dst));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_proto));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_tos));
> +
> +BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
> +                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
> +                  offsetof(struct ovs_key_ipv4, ipv4_ttl));
> +
> +static void
> +action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
> +                             const struct nlattr *a)
> +{
> +    a = nl_attr_get(a);
> +    const struct ovs_key_ipv4 *key = nl_attr_get(a);
> +    const struct ovs_key_ipv4 *mask = get_mask(a, struct ovs_key_ipv4);
> +    struct dp_packet *packet;
> +    ovs_be16 old_csum;
> +
> +    __m256i v_key = _mm256_loadu_si256((void *) key);
> +    __m256i v_mask = _mm256_loadu_si256((void *) mask);

This loads 256 bits of data, but there are only 12 bytes to load. What happens if the memory at the remaining 6 bytes are not mapped in memory? Will we crash!?
Guess the key is fine, as we will read some bytes of the mask data.


> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        struct ip_header *nh = dp_packet_l3(packet);
> +        old_csum = nh->ip_csum;
> +
> +        __m256i v_packet = _mm256_loadu_si256((void *) nh);
> +
> +        /* Shuffle key and mask to match ip_header struct layout. */
> +        static const uint8_t ip_shuffle_mask[32] = {
> +            0xFF, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
> +            6, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
> +            0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF,
> +            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};

This is just my OCD kicking in, but can we align the numbers? 0xFF, 0x05, 0xFF....

> +        __m256i v_shuf32 = _mm256_setr_epi32(0x0, 0x2, 0xF, 0xF,
> +                                             0x1, 0xF, 0xF, 0xF);
> +
> +        __m256i v_shuffle = _mm256_loadu_si256((void *) ip_shuffle_mask);
> +

Not an AVX expert, but should it be more beneficial to move the above three statements out of the loop?

> +        /* Two shuffles are required for key and mask to match the layout of
> +         * the ip_header struct. The _shuffle_epi8 only works within 128-bit
> +         * lanes, so a permute is required to move src and dst into the correct
> +         * lanes. And then a shuffle is used to move the fields into the right
> +         * order.
> +         */
> +        __m256i v_key_shuf = _mm256_permutexvar_epi32(v_shuf32, v_key);
> +        v_key_shuf = _mm256_shuffle_epi8(v_key_shuf, v_shuffle);
> +
> +        __m256i v_mask_shuf = _mm256_permutexvar_epi32(v_shuf32, v_mask);
> +        v_mask_shuf = _mm256_shuffle_epi8(v_mask_shuf, v_shuffle);
> +
> +        __m256i v_pkt_masked = _mm256_andnot_si256(v_mask_shuf, v_packet);
> +        __m256i v_res = _mm256_or_si256(v_key_shuf, v_pkt_masked);
> +
> +        /* Recalculate the ip_csum based on updated values. */
> +        uint16_t checksum = avx512_ipv4_recalc_csum(v_res);

This it wrong, we should NOT recalculate the checksum, but only update it. The philosophy of OVS is to ignore bad checksums (keep them as bad as the are ;), so we should perform a recalculation of the existing checksum, not a complete recalculation.

Basically you should remove patch 1, and there should be no failures.

> +        /* Insert new checksum. */
> +        v_res = _mm256_insert_epi16(v_res, checksum, 5);
> +
> +       /* If ip_src or ip_dst has been modified, L4 checksum needs to
> +        * be updated too. */
> +        int update_mask = _mm256_movemask_epi8(v_mask);

Guess this also only needs to be done once for all iterations, so can be moved out of the loop (see also comment below):

> +        if (update_mask & 0xFF) {

Not sure I understand this? What if the mask for IP is 0.0.0.128? This will not get the bit set?
Could this not be a simple: if (mask->ipv4_src || mask->ipv4_dst) {, or make it a bool outside of the loop, i.e. if (address_update) { (not sure if this helps with compiler optimizations).

> +
> +            uint16_t tmp_checksum = avx512_l4_update_csum(nh, v_res);

Did not review the remaining checksum logic, as doing the update based on the data is not the right approach we should just change the checksum based on the changed IPs.
I.e. this should pass without patch 1 applied. In addition there might also be some corner cases with TSO enabled, as I'm not the expert, I've copied in  Mike Pattrick.

> +            tmp_checksum = ~tmp_checksum;
> +            uint16_t csum;
> +
> +            if (nh->ip_proto == IPPROTO_UDP) {
> +                /* New UDP checksum. */
> +                struct udp_header *uh = dp_packet_l4(packet);
> +                if (!uh->udp_csum) {
> +                    uh->udp_csum = htons(0xffff);

I had the following comment last time:

> I think there is this special UDP case that might need handling here:
> if (!csum) {
>     csum = htons(0xffff);

This was not for this case, as that was handled correctly, i.e., if the original checksum is 0, no checksum needs to be updated.
The previous comment above, was meant for the, now, else block below.

> +                } else {
> +                    uint16_t old_udp_checksum = ~uh->udp_csum;
> +
> +                    uint32_t udp_checksum = old_csum + tmp_checksum;
> +                    udp_checksum = csum_finish(udp_checksum);
> +                    uint16_t udp_csum = ~udp_checksum;
> +
> +                    uint32_t nw_udp_checksum = udp_csum + old_udp_checksum;
> +
> +                    csum =  csum_finish(nw_udp_checksum);

The above comment was for this location...

One extra space after = sign.

> +                    /* Insert new udp checksum. */
> +                    v_res = _mm256_insert_epi16(v_res, csum, 13);
> +                }
> +            } else if (nh->ip_proto == IPPROTO_TCP) {
> +                /* New TCP checksum. */
> +                struct tcp_header *th = dp_packet_l4(packet);
> +                uint16_t old_tcp_checksum = ~th->tcp_csum;
> +
> +                uint32_t tcp_checksum = old_csum + tmp_checksum;
> +                tcp_checksum = csum_finish(tcp_checksum);
> +                uint16_t tcp_csum = ~tcp_checksum;
> +
> +                uint32_t nw_tcp_checksum = tcp_csum + old_tcp_checksum;
> +
> +                csum =  csum_finish(nw_tcp_checksum);

Extra space after = sign.

> +
> +                th->tcp_csum = csum;
> +            }
> +        }
> +        /* Store new IP header. */
> +        _mm256_storeu_si256((void *) nh, v_res);
> +    }
> +}
> +

The stuff below needs changing based on my feedback on patch 10.

>  static void
>  action_avx512_set_masked(struct dp_packet_batch *batch OVS_UNUSED,
>                           const struct nlattr *a)
> @@ -244,6 +452,8 @@ action_avx512_init(struct odp_execute_action_impl *self)
>       * are identified by OVS_KEY_ATTR_*. */
>      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
>                              action_avx512_eth_set_addrs;
> +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] =
> +                            action_avx512_ipv4_set_addrs;
>      avx512_impl = *self;
>
>      return 0;
> diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
> index e2d650779..763c8afc7 100644
> --- a/lib/odp-execute-private.c
> +++ b/lib/odp-execute-private.c
> @@ -259,6 +259,7 @@ action_autoval_init(struct odp_execute_action_impl *self)
>       * requires further processing for action type. Note that 2nd level items
>       * are identified by OVS_KEY_ATTR_*. */
>      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_autoval_generic;
> +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_autoval_generic;
>      autoval_impl = *self;
>
>      return 0;
> diff --git a/lib/odp-execute.c b/lib/odp-execute.c
> index db6e1ec03..8da009ce9 100644
> --- a/lib/odp-execute.c
> +++ b/lib/odp-execute.c
> @@ -585,11 +585,6 @@ odp_execute_masked_set_action(struct dp_packet *packet,
>          break;
>      }
>
> -    case OVS_KEY_ATTR_IPV4:
> -        odp_set_ipv4(packet, nl_attr_get(a),
> -                     get_mask(a, struct ovs_key_ipv4));
> -        break;
> -
>      case OVS_KEY_ATTR_IPV6:
>          odp_set_ipv6(packet, nl_attr_get(a),
>                       get_mask(a, struct ovs_key_ipv6));
> @@ -664,6 +659,7 @@ odp_execute_masked_set_action(struct dp_packet *packet,
>      case __OVS_KEY_ATTR_MAX:
>      /* The following action types are handled by the scalar implementation. */
>      case OVS_KEY_ATTR_ETHERNET:
> +    case OVS_KEY_ATTR_IPV4:
>      default:
>          OVS_NOT_REACHED();
>      }
> @@ -887,6 +883,18 @@ action_mod_eth(struct dp_packet_batch *batch, const struct nlattr *a)
>      }
>  }
>
> +static void
> +action_mod_ipv4(struct dp_packet_batch *batch, const struct nlattr *a)
> +{
> +    a = nl_attr_get(a);
> +    struct dp_packet *packet;
> +
> +    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
> +        odp_set_ipv4(packet, nl_attr_get(a),
> +                     get_mask(a, struct ovs_key_ipv4));
> +    }
> +}
> +
>  /* Implementation of the scalar actions impl init function. Build up the
>   * array of func ptrs here.
>   */
> @@ -903,6 +911,7 @@ odp_action_scalar_init(struct odp_execute_action_impl *self)
>       * requires further processing for action type. Note that 2nd level items
>       * are identified by OVS_KEY_ATTR_*. */
>      self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_mod_eth;
> +    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_mod_ipv4;
>      actions_active_impl = *self;
>
>      return 0;
> -- 
> 2.32.0



This is the diff with the enhanced comments (and some other small changes):

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index 5947cd790..9cd4151f0 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -56,13 +56,29 @@ BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
 BUILD_ASSERT_DECL(sizeof(struct dp_packet) -
                   offsetof(struct dp_packet, l2_pad_size) >= sizeof(__m128i));

-/* The below build assert makes sure the order of eth_src and eth_dst is not
- * changing in the ovs_key_ethernet structure. This should not happen as this
- * is defined under the Linux uapi. */
+/* The below five build assert makes sure the order of the fields needed by
+ * the set masked functions shuffle operations do not change. This should not
+ * happen as these are defined under the Linux uapi. */
 BUILD_ASSERT_DECL(offsetof(struct ovs_key_ethernet, eth_src) +
                   MEMBER_SIZEOF(struct ovs_key_ethernet, eth_src) ==
                   offsetof(struct ovs_key_ethernet, eth_dst));

+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_dst));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_proto));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_tos));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_ttl));
+

 static inline void ALWAYS_INLINE
 avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
@@ -291,21 +307,28 @@ avx512_l4_update_csum(struct ip_header *old_header, __m256i res)
     uint16_t tmp_checksum;
     __m256i v_zeros = _mm256_setzero_si256();

+
+    /* Load original IPv4 header in oh */
+    __m256i oh = _mm256_loadu_si256((void *) old_header);
+
+    /* Combine the old and new header, i.e. adding in the new IP addresses
+     * in the old header (oh). This is done by using the 0x03C 16-bit mask,
+     * picking 16-bit word 7 till 10.  */
+    oh = _mm256_mask_blend_epi16(0x03C0, oh, res);
+
     /* Each field needs to be shuffle into 16- bit granularity and across
      * lanes. */
-    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
-                                          0x0504, 0xffff, 0x0706, 0xffff,
-                                          0x0100, 0xffff, 0x0302, 0xffff,
-                                          0xffff, 0xffff, 0xffff, 0xffff);
-    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
-                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
-                                          0xffff, 0xffff, 0xffff, 0xffff,
-                                          0xffff, 0xffff, 0xffff, 0xffff);
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xFFFF, 0x0302, 0xFFFF,
+                                          0x0504, 0xFFFF, 0x0706, 0xFFFF,
+                                          0x0100, 0xFFFF, 0x0302, 0xFFFF,
+                                          0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xFFFF, 0xFFFF, 0xFFFF,
+                                          0x0D0C, 0xFFFF, 0x0F0E, 0xFFFF,
+                                          0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
+                                          0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF);
     __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
                                           0xF, 0xF, 0xF, 0xF);

-    __m256i oh = _mm256_loadu_si256((void *) old_header);
-    oh = _mm256_mask_blend_epi16(0x3C0, oh, res);
     __m256i v_shuf1 = _mm256_shuffle_epi8(oh, v_swap16a);
     __m256i v_shuf2 = _mm256_shuffle_epi8(oh, v_swap16b);

@@ -371,50 +394,40 @@ avx512_ipv4_recalc_csum(__m256i res)
     return ~new_checksum;
 }

-/* The shuffles used in action_avx512_ipv4_set_addrs() require the ovs_key_ipv4
- * struct to be in this layout. If struct changes, shuffle mask also needs to
- * be updated. */
-BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
-                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
-                  offsetof(struct ovs_key_ipv4, ipv4_dst));
-
-BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
-                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
-                  offsetof(struct ovs_key_ipv4, ipv4_proto));
-
-BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
-                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
-                  offsetof(struct ovs_key_ipv4, ipv4_tos));
-
-BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
-                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
-                  offsetof(struct ovs_key_ipv4, ipv4_ttl));
-
 static void
 action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
                              const struct nlattr *a)
 {
-    a = nl_attr_get(a);
-    const struct ovs_key_ipv4 *key = nl_attr_get(a);
-    const struct ovs_key_ipv4 *mask = odp_get_key_mask(a, struct ovs_key_ipv4);
+    const struct ovs_key_ipv4 *key, *mask;
     struct dp_packet *packet;
-    ovs_be16 old_csum;

+    a = nl_attr_get(a);
+    key = nl_attr_get(a);
+    mask = odp_get_key_mask(a, struct ovs_key_ipv4);
+
+    /* Read the content of the key(src) and mask in the respective registers.
+     * Note that we load more than the size of the actual structure, which
+     * is only 96-bits and not 256-bits. */
     __m256i v_key = _mm256_loadu_si256((void *) key);
     __m256i v_mask = _mm256_loadu_si256((void *) mask);

     DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
         struct ip_header *nh = dp_packet_l3(packet);
-        old_csum = nh->ip_csum;
+        ovs_be16 old_csum = nh->ip_csum;

+        /* Load the first 32 bytes of the IPv4 header. Without options, which
+         * is the most common case it's 20 bytes, but can be up to 60 bytes. */
         __m256i v_packet = _mm256_loadu_si256((void *) nh);

-        /* Shuffle key and mask to match ip_header struct layout. */
+        /* This two shuffle masks, v_shuf32, v_shuffle, are to shuffle key and
+         * mask to match the ip_header structure layout. */
+
         static const uint8_t ip_shuffle_mask[32] = {
-            0xFF, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
-            6, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
-            0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF,
+            0xFF, 0x05, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+            0x06, 0xFF, 0xFF, 0xFF, 0x00, 0x01, 0x02, 0x03,
+            0x00, 0x01, 0x02, 0x03, 0xFF, 0xFF, 0xFF, 0xFF,
             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+
         __m256i v_shuf32 = _mm256_setr_epi32(0x0, 0x2, 0xF, 0xF,
                                              0x1, 0xF, 0xF, 0xF);

@@ -432,7 +445,11 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
         __m256i v_mask_shuf = _mm256_permutexvar_epi32(v_shuf32, v_mask);
         v_mask_shuf = _mm256_shuffle_epi8(v_mask_shuf, v_shuffle);

+        /* AND the v_pkt_mask to the packet data (v_packet). */
         __m256i v_pkt_masked = _mm256_andnot_si256(v_mask_shuf, v_packet);
+
+        /* OR the new addresses (v_key_shuf) with the masked packet addresses
+         * (v_pkt_masked). */
         __m256i v_res = _mm256_or_si256(v_key_shuf, v_pkt_masked);

         /* Recalculate the ip_csum based on updated values. */
@@ -453,9 +470,7 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
             if (nh->ip_proto == IPPROTO_UDP) {
                 /* New UDP checksum. */
                 struct udp_header *uh = dp_packet_l4(packet);
-                if (!uh->udp_csum) {
-                    uh->udp_csum = htons(0xffff);
-                } else {
+                if (uh->udp_csum) {
                     uint16_t old_udp_checksum = ~uh->udp_csum;

                     uint32_t udp_checksum = old_csum + tmp_checksum;
@@ -464,7 +479,10 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,

                     uint32_t nw_udp_checksum = udp_csum + old_udp_checksum;

-                    csum =  csum_finish(nw_udp_checksum);
+                    csum = csum_finish(nw_udp_checksum);
+                    if (!csum) {
+                        csum = htons(0xffff);
+                    }

                     /* Insert new udp checksum. */
                     v_res = _mm256_insert_epi16(v_res, csum, 13);
@@ -485,7 +503,8 @@ action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
                 th->tcp_csum = csum;
             }
         }
-        /* Store new IP header. */
+
+        /* Write back the modified IPv4 addresses. */
         _mm256_storeu_si256((void *) nh, v_res);
     }
 }


This concludes my review. Please finish the discussion (ack/nack) on all open items (comments) before sending out a v8.

For example add a one-liner to each comment, like "will change", "will not change, because...", etc. For example on the previous patchset I made a comment about the checksum only being calculated on the fixed 20-byte size, but it was not changed in v7, or commented on in v6.

I spend 3 days on this patchset, so doing as less revisions as possible will help with my time management :)

Cheers,


Eelco
diff mbox series

Patch

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index ffe25b41d..5cba14b92 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -22,6 +22,7 @@ 
 #include <config.h>
 #include <errno.h>
 
+#include "csum.h"
 #include "cpu.h"
 #include "dp-packet.h"
 #include "immintrin.h"
@@ -193,6 +194,213 @@  action_avx512_eth_set_addrs(struct dp_packet_batch *batch,
     }
 }
 
+/* Calculate delta checksum by summing only ip_src and ip_dst fields of
+ * ip_header. Resulting checksum will be used for updating L4 checksum */
+static inline uint16_t ALWAYS_INLINE
+avx512_l4_update_csum(struct ip_header *old_header, __m256i res)
+{
+    uint16_t tmp_checksum;
+    __m256i v_zeros = _mm256_setzero_si256();
+
+    /* Each field needs to be shuffle into 16- bit granularity and across
+     * lanes. */
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
+                                          0x0504, 0xffff, 0x0706, 0xffff,
+                                          0x0100, 0xffff, 0x0302, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
+                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+
+    __m256i oh = _mm256_loadu_si256((void *) old_header);
+    oh = _mm256_mask_blend_epi16(0x3C0, oh, res);
+    __m256i v_shuf1 = _mm256_shuffle_epi8(oh, v_swap16a);
+    __m256i v_shuf2 = _mm256_shuffle_epi8(oh, v_swap16b);
+
+    /* Add field values. */
+    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+
+    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
+    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
+
+    /* Extract checksum value. */
+    tmp_checksum = _mm256_extract_epi16(v_sum, 0);
+
+    return ~tmp_checksum;
+}
+
+/* Calculate checksum by summing entire contents of ip_header leaving out
+ * current checksum field. */
+static inline uint16_t ALWAYS_INLINE
+avx512_ipv4_recalc_csum(__m256i res)
+{
+    uint32_t new_checksum;
+    __m256i v_zeros = _mm256_setzero_si256();
+
+    /* Each field needs to be shuffle into 16-bit granularity and across
+     * lanes. */
+    __m256i v_swap16a = _mm256_setr_epi16(0x0100, 0xffff, 0x0302, 0xffff,
+                                          0x0504, 0xffff, 0x0706, 0xffff,
+                                          0x0100, 0xffff, 0x0302, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+
+    __m256i v_swap16b = _mm256_setr_epi16(0x0908, 0xffff, 0xffff, 0xffff,
+                                          0x0d0c, 0xffff, 0x0f0e, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff,
+                                          0xffff, 0xffff, 0xffff, 0xffff);
+
+    __m256i v_swap32a = _mm256_setr_epi32(0x0, 0x4, 0xF, 0xF,
+                                          0xF, 0xF, 0xF, 0xF);
+
+    __m256i v_shuf1 = _mm256_shuffle_epi8(res, v_swap16a);
+    __m256i v_shuf2 = _mm256_shuffle_epi8(res, v_swap16b);
+
+    /* Add field values. */
+    __m256i v_sum = _mm256_add_epi32(v_shuf1, v_shuf2);
+
+    /* Perform horizontal add to go from 8x32-bits to 2x32-bits. */
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+
+    /* Shuffle 32-bit value from 3rd lane into first lane for final hadd. */
+    v_sum = _mm256_permutexvar_epi32(v_swap32a, v_sum);
+    v_sum = _mm256_hadd_epi32(v_sum, v_zeros);
+    v_sum = _mm256_hadd_epi16(v_sum, v_zeros);
+
+    /* Extract new checksum value. */
+    new_checksum = _mm256_extract_epi16(v_sum, 0);
+
+    return ~new_checksum;
+}
+
+/* The shuffles used in action_avx512_ipv4_set_addrs() require the ovs_key_ipv4
+ * struct to be in this layout. If struct changes, shuffle mask also needs to
+ * be updated. */
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_src) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_src) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_dst));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_dst) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_dst) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_proto));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_proto) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_proto) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_tos));
+
+BUILD_ASSERT_DECL(offsetof(struct ovs_key_ipv4, ipv4_tos) +
+                  MEMBER_SIZEOF(struct ovs_key_ipv4, ipv4_tos) ==
+                  offsetof(struct ovs_key_ipv4, ipv4_ttl));
+
+static void
+action_avx512_ipv4_set_addrs(struct dp_packet_batch *batch,
+                             const struct nlattr *a)
+{
+    a = nl_attr_get(a);
+    const struct ovs_key_ipv4 *key = nl_attr_get(a);
+    const struct ovs_key_ipv4 *mask = get_mask(a, struct ovs_key_ipv4);
+    struct dp_packet *packet;
+    ovs_be16 old_csum;
+
+    __m256i v_key = _mm256_loadu_si256((void *) key);
+    __m256i v_mask = _mm256_loadu_si256((void *) mask);
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        struct ip_header *nh = dp_packet_l3(packet);
+        old_csum = nh->ip_csum;
+
+        __m256i v_packet = _mm256_loadu_si256((void *) nh);
+
+        /* Shuffle key and mask to match ip_header struct layout. */
+        static const uint8_t ip_shuffle_mask[32] = {
+            0xFF, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+            6, 0xFF, 0xFF, 0xFF, 0, 1, 2, 3,
+            0, 1, 2, 3, 0xFF, 0xFF, 0xFF, 0xFF,
+            0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
+        __m256i v_shuf32 = _mm256_setr_epi32(0x0, 0x2, 0xF, 0xF,
+                                             0x1, 0xF, 0xF, 0xF);
+
+        __m256i v_shuffle = _mm256_loadu_si256((void *) ip_shuffle_mask);
+
+        /* Two shuffles are required for key and mask to match the layout of
+         * the ip_header struct. The _shuffle_epi8 only works within 128-bit
+         * lanes, so a permute is required to move src and dst into the correct
+         * lanes. And then a shuffle is used to move the fields into the right
+         * order.
+         */
+        __m256i v_key_shuf = _mm256_permutexvar_epi32(v_shuf32, v_key);
+        v_key_shuf = _mm256_shuffle_epi8(v_key_shuf, v_shuffle);
+
+        __m256i v_mask_shuf = _mm256_permutexvar_epi32(v_shuf32, v_mask);
+        v_mask_shuf = _mm256_shuffle_epi8(v_mask_shuf, v_shuffle);
+
+        __m256i v_pkt_masked = _mm256_andnot_si256(v_mask_shuf, v_packet);
+        __m256i v_res = _mm256_or_si256(v_key_shuf, v_pkt_masked);
+
+        /* Recalculate the ip_csum based on updated values. */
+        uint16_t checksum = avx512_ipv4_recalc_csum(v_res);
+
+        /* Insert new checksum. */
+        v_res = _mm256_insert_epi16(v_res, checksum, 5);
+
+       /* If ip_src or ip_dst has been modified, L4 checksum needs to
+        * be updated too. */
+        int update_mask = _mm256_movemask_epi8(v_mask);
+        if (update_mask & 0xFF) {
+
+            uint16_t tmp_checksum = avx512_l4_update_csum(nh, v_res);
+            tmp_checksum = ~tmp_checksum;
+            uint16_t csum;
+
+            if (nh->ip_proto == IPPROTO_UDP) {
+                /* New UDP checksum. */
+                struct udp_header *uh = dp_packet_l4(packet);
+                if (!uh->udp_csum) {
+                    uh->udp_csum = htons(0xffff);
+                } else {
+                    uint16_t old_udp_checksum = ~uh->udp_csum;
+
+                    uint32_t udp_checksum = old_csum + tmp_checksum;
+                    udp_checksum = csum_finish(udp_checksum);
+                    uint16_t udp_csum = ~udp_checksum;
+
+                    uint32_t nw_udp_checksum = udp_csum + old_udp_checksum;
+
+                    csum =  csum_finish(nw_udp_checksum);
+
+                    /* Insert new udp checksum. */
+                    v_res = _mm256_insert_epi16(v_res, csum, 13);
+                }
+            } else if (nh->ip_proto == IPPROTO_TCP) {
+                /* New TCP checksum. */
+                struct tcp_header *th = dp_packet_l4(packet);
+                uint16_t old_tcp_checksum = ~th->tcp_csum;
+
+                uint32_t tcp_checksum = old_csum + tmp_checksum;
+                tcp_checksum = csum_finish(tcp_checksum);
+                uint16_t tcp_csum = ~tcp_checksum;
+
+                uint32_t nw_tcp_checksum = tcp_csum + old_tcp_checksum;
+
+                csum =  csum_finish(nw_tcp_checksum);
+
+                th->tcp_csum = csum;
+            }
+        }
+        /* Store new IP header. */
+        _mm256_storeu_si256((void *) nh, v_res);
+    }
+}
+
 static void
 action_avx512_set_masked(struct dp_packet_batch *batch OVS_UNUSED,
                          const struct nlattr *a)
@@ -244,6 +452,8 @@  action_avx512_init(struct odp_execute_action_impl *self)
      * are identified by OVS_KEY_ATTR_*. */
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] =
                             action_avx512_eth_set_addrs;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] =
+                            action_avx512_ipv4_set_addrs;
     avx512_impl = *self;
 
     return 0;
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index e2d650779..763c8afc7 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -259,6 +259,7 @@  action_autoval_init(struct odp_execute_action_impl *self)
      * requires further processing for action type. Note that 2nd level items
      * are identified by OVS_KEY_ATTR_*. */
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_autoval_generic;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_autoval_generic;
     autoval_impl = *self;
 
     return 0;
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index db6e1ec03..8da009ce9 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -585,11 +585,6 @@  odp_execute_masked_set_action(struct dp_packet *packet,
         break;
     }
 
-    case OVS_KEY_ATTR_IPV4:
-        odp_set_ipv4(packet, nl_attr_get(a),
-                     get_mask(a, struct ovs_key_ipv4));
-        break;
-
     case OVS_KEY_ATTR_IPV6:
         odp_set_ipv6(packet, nl_attr_get(a),
                      get_mask(a, struct ovs_key_ipv6));
@@ -664,6 +659,7 @@  odp_execute_masked_set_action(struct dp_packet *packet,
     case __OVS_KEY_ATTR_MAX:
     /* The following action types are handled by the scalar implementation. */
     case OVS_KEY_ATTR_ETHERNET:
+    case OVS_KEY_ATTR_IPV4:
     default:
         OVS_NOT_REACHED();
     }
@@ -887,6 +883,18 @@  action_mod_eth(struct dp_packet_batch *batch, const struct nlattr *a)
     }
 }
 
+static void
+action_mod_ipv4(struct dp_packet_batch *batch, const struct nlattr *a)
+{
+    a = nl_attr_get(a);
+    struct dp_packet *packet;
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        odp_set_ipv4(packet, nl_attr_get(a),
+                     get_mask(a, struct ovs_key_ipv4));
+    }
+}
+
 /* Implementation of the scalar actions impl init function. Build up the
  * array of func ptrs here.
  */
@@ -903,6 +911,7 @@  odp_action_scalar_init(struct odp_execute_action_impl *self)
      * requires further processing for action type. Note that 2nd level items
      * are identified by OVS_KEY_ATTR_*. */
     self->set_masked_funcs[OVS_KEY_ATTR_ETHERNET] = action_mod_eth;
+    self->set_masked_funcs[OVS_KEY_ATTR_IPV4] = action_mod_ipv4;
     actions_active_impl = *self;
 
     return 0;