diff mbox series

[ovs-dev,v2,v2,4/6] dpif-netdev: add avx512 miniflow extract for traffic ip/udp

Message ID 20210428091931.2090062-5-kumar.amber@intel.com
State Superseded
Headers show
Series MFEX Infrastructure + Optimizations | expand

Commit Message

Kumar Amber April 28, 2021, 9:19 a.m. UTC
This patch introduces avx512 optimized function
pointer for IP/UDP traffic type and supporting
functions in dpif-netdev-extract-avx512.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
Co-authored-by: Kumar Amber <kumar.amber@intel.com>
Signed-off-by: Kumar Amber <kumar.amber@intel.com>
---
 lib/automake.mk                   |   1 +
 lib/dpdk.c                        |   1 +
 lib/dpif-netdev-extract-avx512.c  | 218 ++++++++++++++++++++++++++++++
 lib/dpif-netdev-private-extract.c |   5 +
 lib/dpif-netdev-private-extract.h |  11 ++
 5 files changed, 236 insertions(+)
 create mode 100644 lib/dpif-netdev-extract-avx512.c

Comments

Timothy Redaelli April 29, 2021, 1:52 p.m. UTC | #1
On Wed, 28 Apr 2021 14:49:29 +0530
Kumar Amber <kumar.amber@intel.com> wrote:

> This patch introduces avx512 optimized function
> pointer for IP/UDP traffic type and supporting
> functions in dpif-netdev-extract-avx512.
> 
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> Co-authored-by: Kumar Amber <kumar.amber@intel.com>
> Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> ---
>  lib/automake.mk                   |   1 +
>  lib/dpdk.c                        |   1 +
>  lib/dpif-netdev-extract-avx512.c  | 218 ++++++++++++++++++++++++++++++
>  lib/dpif-netdev-private-extract.c |   5 +
>  lib/dpif-netdev-private-extract.h |  11 ++
>  5 files changed, 236 insertions(+)
>  create mode 100644 lib/dpif-netdev-extract-avx512.c
> 

Hi,
unlucky this patch breaks compilation on non-x86 arches:

libtool: compile:  gcc -DHAVE_CONFIG_H -I. -I.. -I ../include -I ./include -I ../lib -I ./lib -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -fno-strict-aliasing -Wswitch-bool -Wlogical-not-parentheses -Wsizeof-array-argument -Wbool-compare -Wshift-negative-value -Wduplicated-cond -Wshadow -Wmultistatement-macros -Wcast-align=strict -O2 -g -pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fexceptions -fstack-protector-strong -grecord-gcc-switches -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -m64 -march=z13 -mtune=z14 -fasynchronous-unwind-tables -fstack-clash-protection -c ../lib/dpif-netdev-extract-avx512.c  -fPIC -DPIC -o lib/.libs/dpif-netdev-extract-avx512.o
../lib/dpif-netdev-extract-avx512.c:18:10: fatal error: immintrin.h: No such file or directory
 #include <immintrin.h>
          ^~~~~~~~~~~~~
compilation terminated.
make[2]: *** [Makefile:4562: lib/dpif-netdev-extract-avx512.lo] Error 1
make[2]: *** Waiting for unfinished jobs....

You should, probably, keep all the file content inside an #ifdef
__x86_64__, and probably also inside an #if !defined(__CHECKER__), like
dpif-netdev-lookup-avx512-gather.c and dpif-netdev-avx512.c.
Van Haaren, Harry May 13, 2021, 10:01 a.m. UTC | #2
> -----Original Message-----
> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Timothy Redaelli
> Sent: Thursday, April 29, 2021 2:52 PM
> To: Amber, Kumar <kumar.amber@intel.com>
> Cc: dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v2 v2 4/6] dpif-netdev: add avx512 miniflow extract for
> traffic ip/udp
> 
> On Wed, 28 Apr 2021 14:49:29 +0530
> Kumar Amber <kumar.amber@intel.com> wrote:
> 
> > This patch introduces avx512 optimized function
> > pointer for IP/UDP traffic type and supporting
> > functions in dpif-netdev-extract-avx512.
> >
> > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > Co-authored-by: Kumar Amber <kumar.amber@intel.com>
> > Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> > ---
> >  lib/automake.mk                   |   1 +
> >  lib/dpdk.c                        |   1 +
> >  lib/dpif-netdev-extract-avx512.c  | 218 ++++++++++++++++++++++++++++++
> >  lib/dpif-netdev-private-extract.c |   5 +
> >  lib/dpif-netdev-private-extract.h |  11 ++
> >  5 files changed, 236 insertions(+)
> >  create mode 100644 lib/dpif-netdev-extract-avx512.c
> >
> 
> Hi,
> unlucky this patch breaks compilation on non-x86 arches:
> 
> libtool: compile:  gcc -DHAVE_CONFIG_H -I. -I.. -I ../include -I ./include -I ../lib -I ./lib
> -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -
> Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -
> Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -
> Wmissing-field-initializers -fno-strict-aliasing -Wswitch-bool -Wlogical-not-
> parentheses -Wsizeof-array-argument -Wbool-compare -Wshift-negative-value -
> Wduplicated-cond -Wshadow -Wmultistatement-macros -Wcast-align=strict -O2 -g -
> pipe -Wall -Werror=format-security -Wp,-D_FORTIFY_SOURCE=2 -Wp,-
> D_GLIBCXX_ASSERTIONS -fexceptions -fstack-protector-strong -grecord-gcc-
> switches -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1 -
> specs=/usr/lib/rpm/redhat/redhat-annobin-cc1 -m64 -march=z13 -mtune=z14 -
> fasynchronous-unwind-tables -fstack-clash-protection -c ../lib/dpif-netdev-extract-
> avx512.c  -fPIC -DPIC -o lib/.libs/dpif-netdev-extract-avx512.o
> ../lib/dpif-netdev-extract-avx512.c:18:10: fatal error: immintrin.h: No such file or
> directory
>  #include <immintrin.h>
>           ^~~~~~~~~~~~~
> compilation terminated.
> make[2]: *** [Makefile:4562: lib/dpif-netdev-extract-avx512.lo] Error 1
> make[2]: *** Waiting for unfinished jobs....
> 
> You should, probably, keep all the file content inside an #ifdef
> __x86_64__, and probably also inside an #if !defined(__CHECKER__), like
> dpif-netdev-lookup-avx512-gather.c and dpif-netdev-avx512.c.

Hi Timothy,

Apologies for response in delay - just saw your review on the patchwork:
https://patchwork.ozlabs.org/project/openvswitch/patch/20210428091931.2090062-5-kumar.amber@intel.com/

You're absolutely right that the code here wasn't portable to other Archs,
this was a known limitation of the v2, and is fixed in the V3 which we intend
to send to the mailing list in the next days.

Thanks for review & input, -Harry
diff mbox series

Patch

diff --git a/lib/automake.mk b/lib/automake.mk
index b04fd672f..f3412352a 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -113,6 +113,7 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/dpif-netdev-lookup-generic.c \
 	lib/dpif-netdev.c \
 	lib/dpif-netdev.h \
+	lib/dpif-netdev-extract-avx512.c \
 	lib/dpif-netdev-extract-study.c \
 	lib/dpif-netdev-private-dfc.h \
 	lib/dpif-netdev-private-dpcls.h \
diff --git a/lib/dpdk.c b/lib/dpdk.c
index a9494a40f..e0c76abe3 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -657,6 +657,7 @@  dpdk_get_cpu_has_isa(const char *arch, const char *feature)
     CHECK_CPU_FEATURE(feature, "avx512f", RTE_CPUFLAG_AVX512F);
     CHECK_CPU_FEATURE(feature, "avx512vpopcntdq", RTE_CPUFLAG_AVX512VPOPCNTDQ);
     CHECK_CPU_FEATURE(feature, "bmi2", RTE_CPUFLAG_BMI2);
+    CHECK_CPU_FEATURE(feature, "avx512bw", RTE_CPUFLAG_AVX512BW);
 #endif
 
     VLOG_WARN("Unknown CPU arch,feature: %s,%s. Returning not supported.\n",
diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c
new file mode 100644
index 000000000..169775f4b
--- /dev/null
+++ b/lib/dpif-netdev-extract-avx512.c
@@ -0,0 +1,218 @@ 
+/*
+ * Copyright (c) 2021 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <config.h>
+#include <errno.h>
+#include <immintrin.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "dpdk.h"
+#include "dpif-netdev-private-dpcls.h"
+#include "dpif-netdev-private-extract.h"
+#include "flow.h"
+
+/* This file contains optimized implementations of miniflow_extract()
+ * for specific common traffic patterns. The optimizations allow for
+ * quick probing of a specific packet type, and if a match with a specific
+ * type is found, a shuffle like proceedure builds up the required miniflow.
+ *
+ * The functionality here can be easily auto-validated and tested against the
+ * scalar miniflow_extract() function. As such, manual review of the code by
+ * the community (although welcome) is not required. Confidence in the
+ * correctness of the code can be had from the autovalidation.
+ */
+
+/* Generator for EtherType masks and values. */
+#define PATTERN_ETHERTYPE_GEN(type_b0, type_b1) \
+  0, 0, 0, 0, 0, 0, /* Ether MAC DST */                                 \
+  0, 0, 0, 0, 0, 0, /* Ether MAC SRC */                                 \
+  type_b0, type_b1, /* EtherType */
+
+#define PATTERN_ETHERTYPE_MASK PATTERN_ETHERTYPE_GEN(0xFF, 0xFF)
+#define PATTERN_ETHERTYPE_IPV4 PATTERN_ETHERTYPE_GEN(0x08, 0x00)
+
+/* Generator for checking IPv4 ver, ihl, and proto */
+#define PATTERN_IPV4_GEN(VER_IHL, FLAG_OFF_B0, FLAG_OFF_B1, PROTO) \
+  VER_IHL, /* Version and IHL */                                        \
+  0, 0, 0, /* DSCP, ECN, Total Lenght */                                \
+  0, 0, /* Identification */                                            \
+  /* Flags/Fragment offset: don't match MoreFrag (MF) or FragOffset */  \
+  FLAG_OFF_B0, FLAG_OFF_B1,                                             \
+  0, /* TTL */                                                          \
+  PROTO, /* Protocol */                                                 \
+  0, 0, /* Header checksum */                                           \
+  0, 0, 0, 0, /* Src IP */                                              \
+  0, 0, 0, 0, /* Dst IP */
+
+#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFE, 0xFF, 0xFF)
+#define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11)
+
+#define NU 0
+#define PATTERN_IPV4_UDP_SHUFFLE \
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, NU, NU, /* Ether */ \
+  26, 27, 28, 29, 30, 31, 32, 33, NU, NU, NU, NU, 20, 15, 22, 23, /* IPv4 */  \
+  34, 35, 36, 37, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* UDP */
+
+/* Masks for Ether()/IP()/UDP() traffic */
+static const uint8_t eth_ip_udp_mask[64] = {
+    PATTERN_ETHERTYPE_MASK PATTERN_IPV4_MASK
+};
+static const uint8_t eth_ip_udp_values[64] = {
+    PATTERN_ETHERTYPE_IPV4 PATTERN_IPV4_UDP
+};
+static const uint8_t eth_ip_udp_shuf[64] = {
+    PATTERN_IPV4_UDP_SHUFFLE
+};
+
+static inline __m512i
+__attribute__((target("avx512bw")))
+_mm512_maskz_permutex2var_epi8_skx(__mmask64 k_mask,
+                                   __m512i v_data_0,
+                                   __m512i v_shuf_idxs,
+                                   __m512i v_data_1)
+{
+    /* Manipulate shuffle indexes for u16 size. */
+    __mmask64 k_mask_odd_lanes = 0xAAAAAAAAAAAAAAAA;
+    /* clear away ODD lane bytes. Cannot be done above due to no u8 shift */
+    __m512i v_shuf_idx_evn = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                v_shuf_idxs, _mm512_setzero_si512());
+    v_shuf_idx_evn = _mm512_srli_epi16(v_shuf_idx_evn, 1);
+
+    __m512i v_shuf_idx_odd = _mm512_srli_epi16(v_shuf_idxs, 9);
+
+    /* Shuffle each half at 16-bit width */
+    __m512i v_shuf1 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_evn,
+                                                v_data_1);
+    __m512i v_shuf2 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_odd,
+                                                v_data_1);
+
+    /* Find if the shuffle index was odd, via mask and compare */
+    uint16_t index_odd_mask = 0x1;
+    const __m512i v_index_mask_u16 = _mm512_set1_epi16(index_odd_mask);
+
+    /* EVEN lanes, find if u8 index was odd,  result as u16 bitmask */
+    __m512i v_idx_even_masked = _mm512_and_si512(v_shuf_idxs,
+                                                 v_index_mask_u16);
+    __mmask32 evn_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_even_masked,
+                                                        v_index_mask_u16);
+
+    /* ODD lanes, find if u8 index was odd, result as u16 bitmask */
+    __m512i v_shuf_idx_srli8 = _mm512_srli_epi16(v_shuf_idxs, 8);
+    __m512i v_idx_odd_masked = _mm512_and_si512(v_shuf_idx_srli8,
+                                                v_index_mask_u16);
+    __mmask32 odd_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_odd_masked,
+                                                        v_index_mask_u16);
+    odd_rotate_mask = ~odd_rotate_mask;
+
+    /* Rotate and blend results from each index */
+    __m512i v_shuf_res_evn = _mm512_mask_srli_epi16(v_shuf1, evn_rotate_mask,
+                                                    v_shuf1, 8);
+    __m512i v_shuf_res_odd = _mm512_mask_slli_epi16(v_shuf2, odd_rotate_mask,
+                                                    v_shuf2, 8);
+
+    /* If shuffle index was odd, blend shifted version */
+    __m512i v_shuf_result = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                                               v_shuf_res_evn, v_shuf_res_odd);
+
+    __m512i v_zeros = _mm512_setzero_si512();
+    __m512i v_result_kmskd = _mm512_mask_blend_epi8(k_mask, v_zeros,
+                                                    v_shuf_result);
+
+    return v_result_kmskd;
+}
+
+static inline void
+__attribute__((target("avx512bw")))
+avx512_ipv4_udp_store(const uint8_t *pkt, struct miniflow *mf,
+                          uint32_t in_port)
+{
+    int64_t u0b = 0x18a0000000000000;
+    int64_t u1b = 0x0000000000040401;
+    __m128i v_bits = {u0b, u1b};
+
+    /* Store mf Bits */
+    uint64_t *bits = (void *)&mf->map.bits[0];
+    uint64_t *blocks = miniflow_values(mf);
+    _mm_storeu_si128((void *) bits, v_bits);
+
+    /* Load packet and shuffle */
+    __m512i v_pkt0 = _mm512_loadu_si512(&pkt[0]);
+    __m512i v_eth_ip_udp_shuf = _mm512_loadu_si512(eth_ip_udp_shuf);
+
+    /* Shuffle pkt and store blocks */
+    __mmask64 k_shufzero = 0b0000111111110000111111110011111111111111;
+    __m512i v_zeros = _mm512_setzero_si512();
+    __m512i v_blk0 = _mm512_maskz_permutex2var_epi8_skx(k_shufzero,
+                                        v_pkt0, v_eth_ip_udp_shuf, v_zeros);
+
+    _mm512_storeu_si512(&blocks[2], v_blk0);
+
+    uint64_t inp = ((uint64_t) in_port) << 32;
+    blocks[0] = inp;
+}
+
+static inline uint32_t
+__attribute__((target("avx512bw")))
+avx512_ipv4_udp_probe(const uint8_t *pkt, uint32_t len)
+{
+    /* Packet data is masked to known IPv4/UDP parse length. */
+    uint64_t klen = UINT64_MAX;
+    if (len < 64) {
+        klen = (1ULL << len) - 1;
+    }
+
+    __m512i v_pkt0 = _mm512_maskz_loadu_epi8(klen, &pkt[0]);
+    __m512i v_eth_ip_udp_mask = _mm512_loadu_si512(eth_ip_udp_mask);
+    __m512i v_eth_ip_udp_vals = _mm512_loadu_si512(eth_ip_udp_values);
+    __m512i v_pkt0_masked = _mm512_and_si512(v_pkt0, v_eth_ip_udp_mask);
+    __mmask64 k_cmp = _mm512_cmpeq_epi8_mask(v_pkt0_masked, v_eth_ip_udp_vals);
+
+    return (k_cmp == -1);
+}
+
+uint32_t
+__attribute__((target("avx512bw")))
+mfex_avx512_ipv4_udp(struct dp_packet_batch *packets,
+                     struct netdev_flow_key *keys,
+                     uint32_t keys_size OVS_UNUSED, odp_port_t in_port,
+                     void *pmd_handle OVS_UNUSED)
+{
+    uint32_t hitmask = 0;
+    struct dp_packet *packet;
+    DP_PACKET_BATCH_FOR_EACH (i, packet, packets) {
+        const uint32_t size = dp_packet_size(packet);
+        const uint8_t *pkt = dp_packet_data(packet);
+        uint32_t match = avx512_ipv4_udp_probe(pkt, size);
+        if (match) {
+            avx512_ipv4_udp_store(pkt, &keys[i].mf, in_port);
+            hitmask |= 1 << i;
+        }
+    }
+    return hitmask;
+}
+
+int32_t
+mfex_avx512_probe(void)
+{
+    int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+    int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+    int avx512bw_available = dpdk_get_cpu_has_isa("x86_64", "avx512bw");
+    if (!avx512f_available || !avx512bw_available || !bmi2_available) {
+        return -ENOTSUP;
+    }
+
+    return 0;
+}
diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c
index 76c24c2f8..060c1939a 100644
--- a/lib/dpif-netdev-private-extract.c
+++ b/lib/dpif-netdev-private-extract.c
@@ -47,6 +47,11 @@  static struct dpif_miniflow_extract_impl mfex_impls[] = {
         .extract_func = mfex_study_traffic,
         .name = "study",
     },
+    {
+        .probe = mfex_avx512_probe,
+        .extract_func = mfex_avx512_ipv4_udp,
+        .name = "avx512_ip_udp",
+    },
 };
 
 BUILD_ASSERT_DECL(MFEX_IMPLS_MAX_SIZE > ARRAY_SIZE(mfex_impls));
diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h
index 3ada413bb..e7b45c2b1 100644
--- a/lib/dpif-netdev-private-extract.h
+++ b/lib/dpif-netdev-private-extract.h
@@ -118,4 +118,15 @@  mfex_study_traffic(struct dp_packet_batch *packets,
                    uint32_t keys_size, odp_port_t in_port,
                    void *pmd_handle);
 
+/* Probe function to detect CPU ISA for SKX. */
+int32_t
+mfex_avx512_probe(void);
+
+/* Traffic specific AVX512 Eth/Ipv4/Udp traffic type for SKX. */
+uint32_t
+mfex_avx512_ipv4_udp(struct dp_packet_batch *packets,
+                         struct netdev_flow_key *keys,
+                         uint32_t keys_size, odp_port_t in_port,
+                         void *pmd_handle);
+
 #endif /* DPIF_NETDEV_AVX512_EXTRACT */