diff mbox series

[ovs-dev,v3,10/12] dpif-netdev/mfex: Add AVX512 based optimized miniflow extract

Message ID 20210517135708.685517-11-kumar.amber@intel.com
State Superseded
Headers show
Series MFEX Infrastructure + Optimizations | expand

Commit Message

Kumar Amber May 17, 2021, 1:57 p.m. UTC
From: Harry van Haaren <harry.van.haaren@intel.com>

This commit adds AVX512 implementations of miniflow extract.
By using the 64 bytes available in an AVX512 register, it is
possible to convert a packet to a miniflow data-structure in
a small quantity instructions.

The implementation here probes for Ether()/IP()/UDP() traffic,
and builds the appropriate miniflow data-structure for packets
that match the probe.

The implementation here is auto-validated by the miniflow
extract autovalidator, hence its correctness can be easily
tested and verified.

Note that this commit is designed to easily allow addition of new
traffic profiles in a scalable way, without code duplication for
each traffic profile.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
---
 lib/automake.mk                   |   1 +
 lib/dpif-netdev-extract-avx512.c  | 416 ++++++++++++++++++++++++++++++
 lib/dpif-netdev-private-extract.c |  15 ++
 lib/dpif-netdev-private-extract.h |  19 ++
 4 files changed, 451 insertions(+)
 create mode 100644 lib/dpif-netdev-extract-avx512.c
diff mbox series

Patch

diff --git a/lib/automake.mk b/lib/automake.mk
index b04fd672f..f349ffea1 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -39,6 +39,7 @@  lib_libopenvswitchavx512_la_CFLAGS = \
 	$(AM_CFLAGS)
 lib_libopenvswitchavx512_la_SOURCES = \
 	lib/dpif-netdev-lookup-avx512-gather.c \
+	lib/dpif-netdev-extract-avx512.c \
 	lib/dpif-netdev-avx512.c
 lib_libopenvswitchavx512_la_LDFLAGS = \
 	-static
diff --git a/lib/dpif-netdev-extract-avx512.c b/lib/dpif-netdev-extract-avx512.c
new file mode 100644
index 000000000..1145ac8a9
--- /dev/null
+++ b/lib/dpif-netdev-extract-avx512.c
@@ -0,0 +1,416 @@ 
+/*
+ * Copyright (c) 2021 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions. */
+#if !defined(__CHECKER__)
+
+#include <config.h>
+#include <errno.h>
+#include <immintrin.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "flow.h"
+#include "dpdk.h"
+
+#include "dpif-netdev-private-dpcls.h"
+#include "dpif-netdev-private-extract.h"
+#include "dpif-netdev-private-flow.h"
+
+/* AVX512-BW level permutex2var_epi8 emulation. */
+static inline __m512i
+__attribute__((target("avx512bw")))
+_mm512_maskz_permutex2var_epi8_skx(__mmask64 k_mask,
+                                   __m512i v_data_0,
+                                   __m512i v_shuf_idxs,
+                                   __m512i v_data_1)
+{
+    /* Manipulate shuffle indexes for u16 size. */
+    __mmask64 k_mask_odd_lanes = 0xAAAAAAAAAAAAAAAA;
+    /* clear away ODD lane bytes. Cannot be done above due to no u8 shift */
+    __m512i v_shuf_idx_evn = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                v_shuf_idxs, _mm512_setzero_si512());
+    v_shuf_idx_evn = _mm512_srli_epi16(v_shuf_idx_evn, 1);
+
+    __m512i v_shuf_idx_odd = _mm512_srli_epi16(v_shuf_idxs, 9);
+
+    /* Shuffle each half at 16-bit width */
+    __m512i v_shuf1 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_evn,
+                                                v_data_1);
+    __m512i v_shuf2 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_odd,
+                                                v_data_1);
+
+    /* Find if the shuffle index was odd, via mask and compare */
+    uint16_t index_odd_mask = 0x1;
+    const __m512i v_index_mask_u16 = _mm512_set1_epi16(index_odd_mask);
+
+    /* EVEN lanes, find if u8 index was odd,  result as u16 bitmask */
+    __m512i v_idx_even_masked = _mm512_and_si512(v_shuf_idxs,
+                                                 v_index_mask_u16);
+    __mmask32 evn_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_even_masked,
+                                                        v_index_mask_u16);
+
+    /* ODD lanes, find if u8 index was odd, result as u16 bitmask */
+    __m512i v_shuf_idx_srli8 = _mm512_srli_epi16(v_shuf_idxs, 8);
+    __m512i v_idx_odd_masked = _mm512_and_si512(v_shuf_idx_srli8,
+                                                v_index_mask_u16);
+    __mmask32 odd_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_odd_masked,
+                                                        v_index_mask_u16);
+    odd_rotate_mask = ~odd_rotate_mask;
+
+    /* Rotate and blend results from each index */
+    __m512i v_shuf_res_evn = _mm512_mask_srli_epi16(v_shuf1, evn_rotate_mask,
+                                                    v_shuf1, 8);
+    __m512i v_shuf_res_odd = _mm512_mask_slli_epi16(v_shuf2, odd_rotate_mask,
+                                                    v_shuf2, 8);
+
+    /* If shuffle index was odd, blend shifted version */
+    __m512i v_shuf_result = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                                               v_shuf_res_evn, v_shuf_res_odd);
+
+    __m512i v_zeros = _mm512_setzero_si512();
+    __m512i v_result_kmskd = _mm512_mask_blend_epi8(k_mask, v_zeros,
+                                                    v_shuf_result);
+
+    return v_result_kmskd;
+}
+
+/* Wrapper function required to enable ISA. */
+static inline __m512i
+__attribute__((__target__("avx512vbmi")))
+_mm512_maskz_permutexvar_epi8_wrap(__mmask64 kmask, __m512i idx, __m512i a)
+{
+    return _mm512_maskz_permutexvar_epi8(kmask, idx, a);
+}
+
+
+/* This file contains optimized implementations of miniflow_extract()
+ * for specific common traffic patterns. The optimizations allow for
+ * quick probing of a specific packet type, and if a match with a specific
+ * type is found, a shuffle like proceedure builds up the required miniflow.
+ *
+ * The functionality here can be easily auto-validated and tested against the
+ * scalar miniflow_extract() function. As such, manual review of the code by
+ * the community (although welcome) is not required. Confidence in the
+ * correctness of the code can be had from the autovalidation.
+ */
+
+/* Generator for EtherType masks and values. */
+#define PATTERN_ETHERTYPE_GEN(type_b0, type_b1) \
+  0, 0, 0, 0, 0, 0, /* Ether MAC DST */                                 \
+  0, 0, 0, 0, 0, 0, /* Ether MAC SRC */                                 \
+  type_b0, type_b1, /* EtherType */
+
+#define PATTERN_ETHERTYPE_MASK PATTERN_ETHERTYPE_GEN(0xFF, 0xFF)
+#define PATTERN_ETHERTYPE_IPV4 PATTERN_ETHERTYPE_GEN(0x08, 0x00)
+
+/* Generator for checking IPv4 ver, ihl, and proto */
+#define PATTERN_IPV4_GEN(VER_IHL, FLAG_OFF_B0, FLAG_OFF_B1, PROTO) \
+  VER_IHL, /* Version and IHL */                                        \
+  0, 0, 0, /* DSCP, ECN, Total Lenght */                                \
+  0, 0, /* Identification */                                            \
+  /* Flags/Fragment offset: don't match MoreFrag (MF) or FragOffset */  \
+  FLAG_OFF_B0, FLAG_OFF_B1,                                             \
+  0, /* TTL */                                                          \
+  PROTO, /* Protocol */                                                 \
+  0, 0, /* Header checksum */                                           \
+  0, 0, 0, 0, /* Src IP */                                              \
+  0, 0, 0, 0, /* Dst IP */
+
+#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFE, 0xFF, 0xFF)
+#define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11)
+#define PATTERN_IPV4_TCP PATTERN_IPV4_GEN(0x45, 0, 0, 0x06)
+
+#define NU 0
+#define PATTERN_IPV4_UDP_SHUFFLE \
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, NU, NU, /* Ether */ \
+  26, 27, 28, 29, 30, 31, 32, 33, NU, NU, NU, NU, 20, 15, 22, 23, /* IPv4 */  \
+  34, 35, 36, 37, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* UDP */   \
+  NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* Unused. */
+
+
+/* Generation of K-mask bitmask values, to zero out data in result. Note that
+ * these correspond 1:1 to the above "*_SHUFFLE" values, and bit used must be
+ * set in this K-mask, and "NU" values must be zero in the k-mask. Each mask
+ * defined here represents 2 blocks, so 16 bytes, so 4 characters (eg. 0xFFFF).
+ *
+ * Note the ULL suffix allows shifting by 32 or more without integer overflow.
+ */
+#define KMASK_ETHER     0x1FFFULL
+#define KMASK_IPV4      0xF0FFULL
+#define KMASK_UDP       0x000FULL
+
+#define PATTERN_IPV4_UDP_KMASK \
+    (KMASK_ETHER | (KMASK_IPV4 << 16) | (KMASK_UDP << 32))
+
+
+/* This union allows initializing static data as u8, but easily loading it
+ * into AVX512 registers too. The union ensures proper alignment for the zmm.
+ */
+union mfex_data {
+    uint8_t u8_data[64];
+    __m512i zmm;
+};
+
+/* This structure represents a single traffic pattern. The AVX512 code to
+ * enable the specifics for each pattern is largely the same, so it is
+ * specialized to use the common profile data from here.
+ *
+ * Due to the nature of e.g. TCP flag handling, or VLAN CFI bit setting,
+ * some profiles require additional processing. This is handled by having
+ * all implementations call a post-process function, and specializing away
+ * the big switch() that handles all traffic types.
+ *
+ * This approach reduces AVX512 code-duplication for each traffic type.
+ */
+struct mfex_profile {
+    /* Required for probing a packet with the mfex pattern. */
+    union mfex_data probe_mask;
+    union mfex_data probe_data;
+
+    /* Required for reshaping packet into miniflow. */
+    union mfex_data store_shuf;
+    __mmask64 store_kmsk;
+
+    /* Constant data to set in mf.bits and dp_packet data on hit. */
+    uint64_t mf_bits[2];
+    uint16_t dp_pkt_offs[4];
+    uint16_t dp_pkt_min_size;
+};
+
+enum MFEX_PROFILES {
+    PROFILE_ETH_IPV4_UDP,
+    PROFILE_COUNT,
+};
+
+/* Static const instances of profiles. These are compile-time constants,
+ * and are specialized into individual miniflow-extract functions.
+ */
+static const struct mfex_profile mfex_profiles[PROFILE_COUNT] =
+{
+    [PROFILE_ETH_IPV4_UDP] = {
+        .probe_mask.u8_data = { PATTERN_ETHERTYPE_MASK PATTERN_IPV4_MASK },
+        .probe_data.u8_data = { PATTERN_ETHERTYPE_IPV4 PATTERN_IPV4_UDP},
+
+        .store_shuf.u8_data = { PATTERN_IPV4_UDP_SHUFFLE },
+        .store_kmsk = PATTERN_IPV4_UDP_KMASK,
+
+        .mf_bits = { 0x18a0000000000000, 0x0000000000040401},
+        .dp_pkt_offs = {
+            0, UINT16_MAX, 14, 34,
+        },
+        .dp_pkt_min_size = 42,
+    },
+};
+
+
+/* Protocol specific helper functions, for calculating offsets/lenghts. */
+static int32_t
+mfex_ipv4_set_l2_pad_size(struct dp_packet *pkt, struct ip_header *nh,
+                          uint32_t len_from_ipv4)
+{
+        /* Handle dynamic l2_pad_size. */
+        uint16_t tot_len = ntohs(nh->ip_tot_len);
+        if (OVS_UNLIKELY(tot_len > len_from_ipv4 ||
+                (len_from_ipv4 - tot_len) > UINT16_MAX)) {
+            return -1;
+        }
+        dp_packet_set_l2_pad_size(pkt, len_from_ipv4 - tot_len);
+        return 0;
+}
+
+/* Generic loop to process any mfex profile. This code is specialized into
+ * multiple actual MFEX implementation functions. Its marked ALWAYS_INLINE
+ * to ensure the compiler specializes each instance. The code is marked "hot"
+ * to inform the compiler this is a hotspot in the program, encouraging
+ * inlining of callee functions such as the permute calls.
+ */
+static inline uint32_t ALWAYS_INLINE
+__attribute__ ((hot))
+mfex_avx512_process(struct dp_packet_batch *packets,
+                    struct netdev_flow_key *keys,
+                    uint32_t keys_size OVS_UNUSED,
+                    odp_port_t in_port,
+                    void *pmd_handle OVS_UNUSED,
+                    const enum MFEX_PROFILES profile_id,
+                    const uint32_t use_vbmi)
+{
+    uint32_t hitmask = 0;
+    struct dp_packet *packet;
+
+    /* Here the profile to use is chosen by the variable used to specialize
+     * the function. This causes different MFEX traffic to be handled.
+     */
+    const struct mfex_profile *profile = &mfex_profiles[profile_id];
+
+    /* Load profile constant data. */
+    __m512i v_vals = _mm512_loadu_si512(&profile->probe_data);
+    __m512i v_mask = _mm512_loadu_si512(&profile->probe_mask);
+    __m512i v_shuf = _mm512_loadu_si512(&profile->store_shuf);
+
+    __mmask64 k_shuf = profile->store_kmsk;
+    __m128i v_bits = _mm_loadu_si128((void *) &profile->mf_bits);
+    uint16_t dp_pkt_min_size = profile->dp_pkt_min_size;
+
+    __m128i v_zeros = _mm_setzero_si128();
+    __m128i v_blocks01 = _mm_insert_epi32(v_zeros, odp_to_u32(in_port), 1);
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, packets) {
+        /* If the packet is smaller than the probe size, skip it. */
+        const uint32_t size = dp_packet_size(packet);
+        if (size < dp_pkt_min_size) {
+            continue;
+        }
+
+        /* Load packet data and probe with AVX512 mask & compare. */
+        const uint8_t *pkt = dp_packet_data(packet);
+        __m512i v_pkt0 = _mm512_loadu_si512(pkt);
+        __m512i v_pkt0_masked = _mm512_and_si512(v_pkt0, v_mask);
+        __mmask64 k_cmp = _mm512_cmpeq_epi8_mask(v_pkt0_masked, v_vals);
+        if (k_cmp != UINT64_MAX) {
+            continue;
+        }
+
+        /* Copy known dp packet offsets to the dp_packet instance. */
+        memcpy(&packet->l2_pad_size, &profile->dp_pkt_offs,
+               sizeof(uint16_t) * 4);
+
+        /* Store known miniflow bits and first two blocks. */
+        struct miniflow *mf = &keys[i].mf;
+        uint64_t *bits = (void *) &mf->map.bits[0];
+        uint64_t *blocks = miniflow_values(mf);
+        _mm_storeu_si128((void *) bits, v_bits);
+        _mm_storeu_si128((void *) blocks, v_blocks01);
+
+        /* Permute the packet layout into miniflow blocks shape.
+         * As different avx512 ISA levels have different implementations,
+         * this specializes on the "use_vbmi" attribute passed in.
+         */
+        __m512i v512_zeros = _mm512_setzero_si512();
+        __m512i v_blk0 = v512_zeros;
+        if (__builtin_constant_p(use_vbmi) && use_vbmi) {
+            v_blk0 = _mm512_maskz_permutexvar_epi8_wrap(k_shuf, v_shuf,
+                                                        v_pkt0);
+        } else {
+            v_blk0 = _mm512_maskz_permutex2var_epi8_skx(k_shuf, v_pkt0,
+                                                        v_shuf, v512_zeros);
+        }
+        _mm512_storeu_si512(&blocks[2], v_blk0);
+
+
+        /* Perform "post-processing" per profile, handling details not easily
+         * handled in the above generic AVX512 code. Examples include TCP flag
+         * parsing, adding the VLAN CFI bit, and handling IPv4 fragments.
+         */
+        switch (profile_id) {
+        case PROFILE_COUNT:
+            ovs_assert(0); /* avoid compiler warning on missing ENUM */
+            break;
+
+        case PROFILE_ETH_IPV4_UDP: {
+                /* Handle dynamic l2_pad_size. */
+                uint32_t size_from_ipv4 = size - sizeof(struct eth_header);
+                struct ip_header *nh = (void *)&pkt[sizeof(struct eth_header)];
+                if (mfex_ipv4_set_l2_pad_size(packet, nh, size_from_ipv4)) {
+                    continue;
+                }
+
+            } break;
+        default:
+            break;
+        };
+
+        /* This packet has its miniflow created, add to hitmask. */
+        hitmask |= 1 << i;
+    }
+
+    return hitmask;
+}
+
+
+#define DECLARE_MFEX_FUNC(name, profile)                                \
+uint32_t                                                                \
+__attribute__((__target__("avx512f")))                                  \
+__attribute__((__target__("avx512vl")))                                 \
+__attribute__((__target__("avx512vbmi")))                               \
+mfex_avx512_vbmi_##name(struct dp_packet_batch *packets,                \
+                    struct netdev_flow_key *keys, uint32_t keys_size,   \
+                    odp_port_t in_port, void *pmd_handle)               \
+{                                                                       \
+    return mfex_avx512_process(packets, keys, keys_size, in_port,       \
+                               pmd_handle, profile, 1);                 \
+}                                                                       \
+                                                                        \
+uint32_t                                                                \
+__attribute__((__target__("avx512f")))                                  \
+__attribute__((__target__("avx512vl")))                                 \
+mfex_avx512_##name(struct dp_packet_batch *packets,                     \
+                    struct netdev_flow_key *keys, uint32_t keys_size,   \
+                    odp_port_t in_port, void *pmd_handle)               \
+{                                                                       \
+    return mfex_avx512_process(packets, keys, keys_size, in_port,       \
+                               pmd_handle, profile, 0);                 \
+}
+
+/* Each profile gets a single declare here, which specializes the function
+ * as required.
+ */
+DECLARE_MFEX_FUNC(ip_udp,PROFILE_ETH_IPV4_UDP)
+
+
+static int32_t
+avx512_isa_probe(uint32_t needs_vbmi)
+{
+    static const char *isa_required[] = {
+        "avx512f",
+        "avx512bw",
+        "bmi2",
+    };
+
+    int32_t ret = 0;
+    for (uint32_t i = 0; i < ARRAY_SIZE(isa_required); i++) {
+        if (!dpdk_get_cpu_has_isa("x86_64", isa_required[i])) {
+            ret = -ENOTSUP;
+        }
+    }
+
+    if (needs_vbmi) {
+        if (!dpdk_get_cpu_has_isa("x86_64", "avx512vbmi")) {
+            ret = -ENOTSUP;
+        }
+    }
+
+    return ret;
+}
+
+/* Probe functions to check ISA requirements. */
+int32_t
+mfex_avx512_probe(void)
+{
+    const uint32_t needs_vbmi = 0;
+    return avx512_isa_probe(needs_vbmi);
+}
+
+int32_t
+mfex_avx512_vbmi_probe(void)
+{
+    const uint32_t needs_vbmi = 1;
+    return avx512_isa_probe(needs_vbmi);
+}
+
+#endif /* __CHECKER__ */
+#endif /* __x86_64__ */
diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c
index 3a480866d..5646d72cc 100644
--- a/lib/dpif-netdev-private-extract.c
+++ b/lib/dpif-netdev-private-extract.c
@@ -47,8 +47,23 @@  static struct dpif_miniflow_extract_impl mfex_impls[] = {
         .extract_func = mfex_study_traffic,
         .name = "study",
     },
+
+/* Compile in implementations only if the compiler ISA checks pass. */
+#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD && __SSE4_2__)
+    {
+        .probe = mfex_avx512_vbmi_probe,
+        .extract_func = mfex_avx512_vbmi_ip_udp,
+        .name = "avx512_vbmi_ipv4_udp",
+    },
+    {
+        .probe = mfex_avx512_probe,
+        .extract_func = mfex_avx512_ip_udp,
+        .name = "avx512_ipv4_udp",
+    },
+#endif
 };
 
+
 BUILD_ASSERT_DECL(MFEX_IMPLS_MAX_SIZE > ARRAY_SIZE(mfex_impls));
 
 int32_t
diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h
index 0ec74bef9..f32be202a 100644
--- a/lib/dpif-netdev-private-extract.h
+++ b/lib/dpif-netdev-private-extract.h
@@ -136,4 +136,23 @@  dpif_miniflow_extract_set_default(miniflow_extract_func func);
 uint32_t mfex_set_study_pkt_cnt(uint32_t pkt_cmp_count,
                             struct dpif_miniflow_extract_impl *opt);
 
+/* AVX512 MFEX Probe and Implementations functions. */
+#ifdef __x86_64__
+int32_t mfex_avx512_probe(void);
+int32_t mfex_avx512_vbmi_probe(void);
+
+#define DECLARE_AVX512_MFEX_PROTOTYPE(name)                                 \
+    uint32_t                                                                \
+    mfex_avx512_vbmi_##name(struct dp_packet_batch *packets,                \
+                        struct netdev_flow_key *keys, uint32_t keys_size,   \
+                        odp_port_t in_port, void *pmd_handle);              \
+    uint32_t                                                                \
+    mfex_avx512_##name(struct dp_packet_batch *packets,                     \
+                        struct netdev_flow_key *keys, uint32_t keys_size,   \
+                        odp_port_t in_port, void *pmd_handle);
+
+DECLARE_AVX512_MFEX_PROTOTYPE(ip_udp);
+#endif /* __x86_64__ */
+
+
 #endif /* DPIF_NETDEV_AVX512_EXTRACT */