diff mbox series

[ovs-dev,v5,7/8] odp-execute: Add ISA implementation of pop_vlan action.

Message ID 20220112094244.81402-8-emma.finn@intel.com
State Superseded
Headers show
Series Actions Infrastructure + Optimizations | expand

Commit Message

Emma Finn Jan. 12, 2022, 9:42 a.m. UTC
This commit adds the AVX512 implementation of the pop_vlan action.
The implementation here is auto-validated by the miniflow
extract autovalidator, hence its correctness can be easily
tested and verified.

Signed-off-by: Emma Finn <emma.finn@intel.com>
---
 lib/odp-execute-avx512.c  | 77 ++++++++++++++++++++++++++++++++++++++-
 lib/odp-execute-private.c |  2 +-
 lib/odp-execute-private.h |  2 +-
 3 files changed, 78 insertions(+), 3 deletions(-)

Comments

Stokes, Ian Jan. 12, 2022, 8:33 p.m. UTC | #1
> This commit adds the AVX512 implementation of the pop_vlan action.
> The implementation here is auto-validated by the miniflow
> extract autovalidator, hence its correctness can be easily
> tested and verified.
> 
> Signed-off-by: Emma Finn <emma.finn@intel.com>
 
Hi Emma, some comments below.

> ---
>  lib/odp-execute-avx512.c  | 77 ++++++++++++++++++++++++++++++++++++++-
>  lib/odp-execute-private.c |  2 +-
>  lib/odp-execute-private.h |  2 +-
>  3 files changed, 78 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
> index aa71faa1c..fcf27f070 100644
> --- a/lib/odp-execute-avx512.c
> +++ b/lib/odp-execute-avx512.c
> @@ -14,6 +14,11 @@
>   * limitations under the License.
>   */
> 
> +#ifdef __x86_64__
> +/* Sparse cannot handle the AVX512 instructions. */
> +#if !defined(__CHECKER__)
> +
> +
>  #include <config.h>
>  #include <errno.h>
> 
> @@ -25,6 +30,71 @@
> 
>  #include "immintrin.h"
> 
> +VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);
> +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) +
> +                  MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) ==
> +                  offsetof(struct dp_packet, l3_ofs));
> +
> +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
> +                           MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
> +                           offsetof(struct dp_packet, l4_ofs));
> +
> +static inline void ALWAYS_INLINE
> +avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
> +{
> +    /* update packet size/data pointers */
Minor, Capitalize start of comment, missing period (goes for a few of the comments in the rest of this function also).

> +    dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes);
> +    dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes);
> +
> +    /* Increment u16 packet offset values */
> +    const __m128i v_zeros = _mm_setzero_si128();
> +    const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
> +
> +    /* Only these lanes can be incremented for push-VLAN action. */
> +    const uint8_t k_lanes = 0b1110;
> +    __m128i v_offset = _mm_set1_epi16(VLAN_HEADER_LEN);

Can you walk through the above logic, as this is the pop use cases, are you saying you don't want to use these lanes as they should only be used for the push vlan case?
> +
> +    /* Load packet and compare with UINT16_MAX */
> +    void *adjust_ptr = &b->l2_pad_size;
> +    __m128i v_adjust_src = _mm_loadu_si128(adjust_ptr);
> +    __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes,
> v_adjust_src,
> +                                                    v_u16_max);
> +
> +    /* Add VLAN_HEADER_LEN using compare mask, store results. */

Again I'm confused here, if the operation being added is to pop_vlan why are we adding a vlan header?

If you could give a general run down of the expected operations here and logic it would be appreciated.

Thanks
Ian
Emma Finn Jan. 13, 2022, 9:41 a.m. UTC | #2
> -----Original Message-----
> From: Stokes, Ian <ian.stokes@intel.com>
> Sent: Wednesday 12 January 2022 20:34
> To: Finn, Emma <emma.finn@intel.com>; dev@openvswitch.org; Van Haaren,
> Harry <harry.van.haaren@intel.com>; Amber, Kumar
> <kumar.amber@intel.com>; i.maximets@ovn.org
> Subject: RE: [PATCH v5 7/8] odp-execute: Add ISA implementation of pop_vlan
> action.
> 
> > This commit adds the AVX512 implementation of the pop_vlan action.
> > The implementation here is auto-validated by the miniflow extract
> > autovalidator, hence its correctness can be easily tested and
> > verified.
> >
> > Signed-off-by: Emma Finn <emma.finn@intel.com>
> 
> Hi Emma, some comments below.
> 
Thanks Ian, Comments here are misleading so I will update them in v6 to make things clearer.
> > ---
> >  lib/odp-execute-avx512.c  | 77
> > ++++++++++++++++++++++++++++++++++++++-
> >  lib/odp-execute-private.c |  2 +-
> >  lib/odp-execute-private.h |  2 +-
> >  3 files changed, 78 insertions(+), 3 deletions(-)
> >
> > diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c index
> > aa71faa1c..fcf27f070 100644
> > --- a/lib/odp-execute-avx512.c
> > +++ b/lib/odp-execute-avx512.c
> > @@ -14,6 +14,11 @@
> >   * limitations under the License.
> >   */
> >
> > +#ifdef __x86_64__
> > +/* Sparse cannot handle the AVX512 instructions. */ #if
> > +!defined(__CHECKER__)
> > +
> > +
> >  #include <config.h>
> >  #include <errno.h>
> >
> > @@ -25,6 +30,71 @@
> >
> >  #include "immintrin.h"
> >
> > +VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);
> > +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) +
> > +                  MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) ==
> > +                  offsetof(struct dp_packet, l3_ofs));
> > +
> > +BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
> > +                           MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
> > +                           offsetof(struct dp_packet, l4_ofs));
> > +
> > +static inline void ALWAYS_INLINE
> > +avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
> > +{
> > +    /* update packet size/data pointers */
> Minor, Capitalize start of comment, missing period (goes for a few of the
> comments in the rest of this function also).
> 
> > +    dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes);
> > +    dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes);
> > +
> > +    /* Increment u16 packet offset values */
> > +    const __m128i v_zeros = _mm_setzero_si128();
> > +    const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
> > +
> > +    /* Only these lanes can be incremented for push-VLAN action. */

Only these lanes should be updated for L2 of a packet. 
> > +    const uint8_t k_lanes = 0b1110;

Scalar value of the VLAN_HEADER length is broadcasted across a whole SIMD register
> > +    __m128i v_offset = _mm_set1_epi16(VLAN_HEADER_LEN);
> 
> Can you walk through the above logic, as this is the pop use cases, are you
> saying you don't want to use these lanes as they should only be used for the
> push vlan case?
> > +
> > +    /* Load packet and compare with UINT16_MAX */
> > +    void *adjust_ptr = &b->l2_pad_size;

Load the 4 uint16 values from dp_packet into a SIMD register.
And in this case that would be the 4 offset values in the packet mbuf.
> > +    __m128i v_adjust_src = _mm_loadu_si128(adjust_ptr);

Generate a k mask to use for updating offset values of the packet buffer.
> > +    __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes,
> > v_adjust_src,
> > +                                                    v_u16_max);
> > +
> > +    /* Add VLAN_HEADER_LEN using compare mask, store results. */
> 
Use a k mask subtract (for pop use case) to update the offset values of the packet buffer and store the updated values back

> Again I'm confused here, if the operation being added is to pop_vlan why are we
> adding a vlan header?
> 
> If you could give a general run down of the expected operations here and logic it
> would be appreciated.
> 
> Thanks
> Ian
diff mbox series

Patch

diff --git a/lib/odp-execute-avx512.c b/lib/odp-execute-avx512.c
index aa71faa1c..fcf27f070 100644
--- a/lib/odp-execute-avx512.c
+++ b/lib/odp-execute-avx512.c
@@ -14,6 +14,11 @@ 
  * limitations under the License.
  */
 
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions. */
+#if !defined(__CHECKER__)
+
+
 #include <config.h>
 #include <errno.h>
 
@@ -25,6 +30,71 @@ 
 
 #include "immintrin.h"
 
+VLOG_DEFINE_THIS_MODULE(odp_execute_avx512);
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l2_5_ofs) +
+                  MEMBER_SIZEOF(struct dp_packet, l2_5_ofs) ==
+                  offsetof(struct dp_packet, l3_ofs));
+
+BUILD_ASSERT_DECL(offsetof(struct dp_packet, l3_ofs) +
+                           MEMBER_SIZEOF(struct dp_packet, l3_ofs) ==
+                           offsetof(struct dp_packet, l4_ofs));
+
+static inline void ALWAYS_INLINE
+avx512_dp_packet_resize_l2(struct dp_packet *b, int resize_by_bytes)
+{
+    /* update packet size/data pointers */
+    dp_packet_set_data(b, (char *) dp_packet_data(b) - resize_by_bytes);
+    dp_packet_set_size(b, dp_packet_size(b) + resize_by_bytes);
+
+    /* Increment u16 packet offset values */
+    const __m128i v_zeros = _mm_setzero_si128();
+    const __m128i v_u16_max = _mm_cmpeq_epi16(v_zeros, v_zeros);
+
+    /* Only these lanes can be incremented for push-VLAN action. */
+    const uint8_t k_lanes = 0b1110;
+    __m128i v_offset = _mm_set1_epi16(VLAN_HEADER_LEN);
+
+    /* Load packet and compare with UINT16_MAX */
+    void *adjust_ptr = &b->l2_pad_size;
+    __m128i v_adjust_src = _mm_loadu_si128(adjust_ptr);
+    __mmask8 k_cmp = _mm_mask_cmpneq_epu16_mask(k_lanes, v_adjust_src,
+                                                    v_u16_max);
+
+    /* Add VLAN_HEADER_LEN using compare mask, store results. */
+    __m128i v_adjust_wip = _mm_mask_sub_epi16(v_adjust_src, k_cmp,
+                                              v_adjust_src, v_offset);
+    _mm_storeu_si128(adjust_ptr, v_adjust_wip);
+
+}
+
+static inline void ALWAYS_INLINE
+avx512_eth_pop_vlan(struct dp_packet *packet)
+{
+    struct vlan_eth_header *veh = dp_packet_eth(packet);
+
+    if (veh && dp_packet_size(packet) >= sizeof *veh &&
+        eth_type_vlan(veh->veth_type)) {
+
+        __m128i v_ether = _mm_loadu_si128((void *) veh);
+        __m128i v_realign = _mm_alignr_epi8(v_ether, _mm_setzero_si128(),
+                                            16 - VLAN_HEADER_LEN);
+        _mm_storeu_si128((void *) veh, v_realign);
+        avx512_dp_packet_resize_l2(packet, -VLAN_HEADER_LEN);
+
+    }
+}
+
+static void
+action_avx512_pop_vlan(void *dp OVS_UNUSED, struct dp_packet_batch *batch,
+                       const struct nlattr *a OVS_UNUSED,
+                       bool should_steal OVS_UNUSED)
+{
+    struct dp_packet *packet;
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        avx512_eth_pop_vlan(packet);
+    }
+}
 
 /* Probe functions to check ISA requirements. */
 static int32_t
@@ -62,8 +132,13 @@  action_avx512_probe(void)
 
 
 int32_t
-action_avx512_init(void)
+action_avx512_init(struct odp_execute_action_impl *self)
 {
     avx512_isa_probe(0);
+    self->funcs[OVS_ACTION_ATTR_POP_VLAN] = action_avx512_pop_vlan;
+
     return 0;
 }
+
+#endif
+#endif
diff --git a/lib/odp-execute-private.c b/lib/odp-execute-private.c
index e61136e8b..175a80159 100644
--- a/lib/odp-execute-private.c
+++ b/lib/odp-execute-private.c
@@ -52,7 +52,7 @@  static struct odp_execute_action_impl action_impls[] = {
         .available = 1,
         .name = "avx512",
         .probe = action_avx512_probe,
-        .init_func = NULL,
+        .init_func = action_avx512_init,
     },
     #endif
 };
diff --git a/lib/odp-execute-private.h b/lib/odp-execute-private.h
index 4c09bee63..5ba2868bf 100644
--- a/lib/odp-execute-private.h
+++ b/lib/odp-execute-private.h
@@ -102,7 +102,7 @@  int32_t odp_execute_action_set(const char *name,
 int32_t odp_action_scalar_init(struct odp_execute_action_impl *self);
 
 /* Init function for the optimized with AVX512 actions. */
-int32_t action_avx512_init(void);
+int32_t action_avx512_init(struct odp_execute_action_impl *self);
 
 /* Probe function to check ISA requirements. */
 int32_t action_avx512_probe(void);