diff mbox series

[ovs-dev,v4,10/11] dpif-netdev: enable ISA optimized miniflow extract

Message ID 20201125184342.2715681-11-harry.van.haaren@intel.com
State Superseded
Headers show
Series DPIF & MFEX Refactor and SIMD optimization | expand

Commit Message

Van Haaren, Harry Nov. 25, 2020, 6:43 p.m. UTC
This commit refactors the way in which the DPIF component can
call the miniflow-extract function. It creates flexibility in
the DPIF component by adding a function pointer at the pmd level.

A new miniflow extract implementation is created which allows the
AVX-512 SIMD instructions to perform the packet matching and building
of the miniflow data-structure.

All AVX-512 capable CPUs will be able to run the miniflow
extract, however CPUs that support the AVX-512 Vector Bit
Manipulation Instructions (VBMI) will benefit more as the
native byte permute instruction gives extra performance.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
---
 lib/automake.mk                   |   5 +
 lib/dpif-netdev-avx512-extract.c  | 528 ++++++++++++++++++++++++++++++
 lib/dpif-netdev-avx512-extract.h  |  40 +++
 lib/dpif-netdev-avx512.c          |  12 +-
 lib/dpif-netdev-private-extract.c |  72 ++++
 lib/dpif-netdev-private-extract.h |  60 ++++
 lib/dpif-netdev-private-flow.h    |   1 +
 lib/dpif-netdev-private-thread.h  |   9 +
 lib/dpif-netdev.c                 | 111 +++++++
 9 files changed, 837 insertions(+), 1 deletion(-)
 create mode 100644 lib/dpif-netdev-avx512-extract.c
 create mode 100644 lib/dpif-netdev-avx512-extract.h
 create mode 100644 lib/dpif-netdev-private-extract.c
 create mode 100644 lib/dpif-netdev-private-extract.h

Comments

0-day Robot Nov. 25, 2020, 7:25 p.m. UTC | #1
Bleep bloop.  Greetings Harry van Haaren, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


checkpatch:
WARNING: Line is 80 characters long (recommended limit is 79)
#430 FILE: lib/dpif-netdev-avx512-extract.c:352:
    /* Check that the runtime CPU has the required ISA avialable. Also check for

WARNING: Line is 80 characters long (recommended limit is 79)
#431 FILE: lib/dpif-netdev-avx512-extract.c:353:
     * AVX-512 Vector Bit Manipulation Instructions (VBMI), which allow a faster

ERROR: Inappropriate bracing around statement
#598 FILE: lib/dpif-netdev-avx512-extract.c:520:
        if (avx512vbmi_available)

Lines checked: 1011, Warnings: 2, Errors: 1


Please check this out.  If you feel there has been an error, please email aconole@redhat.com

Thanks,
0-day Robot
diff mbox series

Patch

diff --git a/lib/automake.mk b/lib/automake.mk
index 719477aa5..1d2d0804b 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -32,6 +32,7 @@  lib_libopenvswitch_la_LIBADD += lib/libopenvswitchavx512.la
 lib_libopenvswitchavx512_la_CFLAGS = \
 	-mavx512f \
 	-mavx512bw \
+	-mavx512vl \
 	-mavx512dq \
 	-mbmi \
 	-mbmi2 \
@@ -39,6 +40,7 @@  lib_libopenvswitchavx512_la_CFLAGS = \
 	$(AM_CFLAGS)
 lib_libopenvswitchavx512_la_SOURCES = \
 	lib/dpif-netdev-lookup-avx512-gather.c \
+	lib/dpif-netdev-avx512-extract.c \
 	lib/dpif-netdev-avx512.c
 lib_libopenvswitchavx512_la_LDFLAGS = \
 	-static
@@ -107,6 +109,7 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/dp-packet.h \
 	lib/dp-packet.c \
 	lib/dpdk.h \
+	lib/dpif-netdev-avx512-extract.h \
 	lib/dpif-netdev-lookup.h \
 	lib/dpif-netdev-lookup.c \
 	lib/dpif-netdev-lookup-autovalidator.c \
@@ -117,6 +120,8 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/dpif-netdev-private-dpcls.h \
 	lib/dpif-netdev-private-dpif.c \
 	lib/dpif-netdev-private-dpif.h \
+	lib/dpif-netdev-private-extract.c \
+	lib/dpif-netdev-private-extract.h \
 	lib/dpif-netdev-private-flow.h \
 	lib/dpif-netdev-private-hwol.h \
 	lib/dpif-netdev-private-thread.h \
diff --git a/lib/dpif-netdev-avx512-extract.c b/lib/dpif-netdev-avx512-extract.c
new file mode 100644
index 000000000..592a82bd4
--- /dev/null
+++ b/lib/dpif-netdev-avx512-extract.c
@@ -0,0 +1,528 @@ 
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifdef __x86_64__
+/* Sparse cannot handle the AVX512 instructions */
+#if !defined(__CHECKER__)
+
+#include <config.h>
+#include <errno.h>
+
+#include "dpif-netdev-avx512-extract.h"
+#include "dpif-netdev-private-thread.h"
+
+#include "immintrin.h"
+
+/* This file contains optimized implementations of miniflow_extract()
+ * for specific common traffic patterns. The optimizations allow for
+ * quick probing of a specific packet type, and if a match with a specific
+ * type is found, a shuffle-like procedure builds up the required miniflow.
+ */
+
+#define MAX_PATTERN_COUNT (8)
+#define MAX_PATTERN_SIZE (128)
+#define MAX_SHUFFLE_COUNT (MAX_PATTERN_SIZE / 64)
+
+/* A structure to represent each matched on packet pattern */
+struct __attribute__((aligned(MAX_PATTERN_SIZE))) packet_pattern {
+    /* A bitmask to apply to the packet before comparing it to the pattern.
+     * This results in only bits that matter to packet layout remaining.
+     */
+    uint8_t mask[MAX_PATTERN_SIZE];
+
+    /* The data values to compare the masked packet against. This is the known
+     * fields of the packet which are required for a specific layout. E.g. an
+     * Ether/IPv4 packet has a 0x0800 ethertype, and the 0x0800 is stored here.
+     */
+    uint8_t data[MAX_PATTERN_SIZE];
+
+};
+
+/* Improvement: create this struct in dp-packet.h, and reuse it here. That
+ * would avoid the requirement of the packed attribute.
+ */
+struct __attribute__((packed)) packet_offsets {
+    uint8_t l2_pad_size;
+    uint16_t l2_5_ofs;
+    uint16_t l3_ofs;
+    uint16_t l4_ofs;
+};
+
+/* Structure to represent the data-movement from pattern to miniflow. */
+struct packet_pattern_shuffle {
+    uint64_t kmasks[MAX_SHUFFLE_COUNT];
+    struct packet_offsets offsets;
+
+    /* The input data to the data-movement shuffle. This shuffle changes the
+     * layout of the packet data into the miniflow blocks shape.
+     */
+    uint8_t shuffle[MAX_PATTERN_SIZE];
+
+    /* Data to be merged into the resulting miniflow blocks. This is required
+     * for e.g. VLAN TCI, which generates a bit in the block even if the packet
+     * didn't originally have it.
+     */
+    uint8_t insert[MAX_PATTERN_SIZE];
+};
+
+/* structure that represents all per-thread pattern data. */
+struct packet_pattern_cache {
+    /* Minimum packet len for this pattern index to be a valid candidate. */
+    uint8_t min_len[MAX_PATTERN_COUNT];
+
+    /* Number of active patterns to match against. */
+    uint8_t active_pattern_count;
+
+    /* The mask and compare data itself. */
+    struct packet_pattern patterns[MAX_PATTERN_COUNT];
+
+    /* Miniflow bits that need to be set for each pattern. */
+    struct miniflow miniflow_bits[MAX_PATTERN_COUNT];
+
+    /* Structure to represent the data-movement from pattern to miniflow. */
+    struct packet_pattern_shuffle shuffles[MAX_PATTERN_COUNT];
+
+};
+
+/* Single copy of control-path owned patterns. The contents of this struct will
+ * be updated when the user runs a miniflow-pattern-add command. The contents
+ * of this struct are only read in the datapath during the "study" phase, and
+ * copied into a thread-local memory for the PMD threads for datapath usage.
+ */
+static struct packet_pattern_cache patterns_control_path;
+
+/* Generator for EtherType masks and values. */
+#define PATTERN_ETHERTYPE_GEN(type_b0, type_b1) \
+  0, 0, 0, 0, 0, 0, /* Ether MAC DST */                                 \
+  0, 0, 0, 0, 0, 0, /* Ether MAC SRC */                                 \
+  type_b0, type_b1, /* EtherType */
+
+#define PATTERN_ETHERTYPE_MASK PATTERN_ETHERTYPE_GEN(0xFF, 0xFF)
+#define PATTERN_ETHERTYPE_IPV4 PATTERN_ETHERTYPE_GEN(0x08, 0x00)
+
+#define PATTERN_VLAN_GEN(tpid0, tpid1, tci0, tci1)                      \
+  tpid0, tpid1, tci0, tci1, /* Whole VLAN header */
+
+#define PATTERN_VLAN_MASK PATTERN_VLAN_GEN(0xFF, 0xFF, (~0x4), 0xFF)
+#define PATTERN_VLAN_DATA PATTERN_VLAN_GEN(0xFF, 0xFF, 0, 0)
+
+/* Generator for checking IPv4 ver, ihl, and proto */
+#define PATTERN_IPV4_GEN(VER_IHL, FLAG_OFF_B0, FLAG_OFF_B1, PROTO) \
+  VER_IHL, /* Version and IHL */                                        \
+  0, 0, 0, /* DSCP, ECN, Total Length */                                 \
+  0, 0, /* Identification */                                            \
+  /* Flags/Fragment offset: don't match MoreFrag (MF) or FragOffset */  \
+  FLAG_OFF_B0, FLAG_OFF_B1,                                             \
+  0, /* TTL */                                                          \
+  PROTO, /* Protocol */                                                 \
+  0, 0, /* Header checksum */                                           \
+  0, 0, 0, 0, /* Src IP */                                              \
+  0, 0, 0, 0, /* Dst IP */
+
+#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFE, 0xFF, 0xFF)
+#define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11)
+
+
+#define ETHER_IPV4_UDP_LEN (42)
+
+#define NU 0
+#define PATTERN_IPV4_UDP_SHUFFLE \
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, NU, NU, /* Ether */ \
+  26, 27, 28, 29, 30, 31, 32, 33, NU, NU, NU, NU, 20, 15, 22, 23, /* IPv4 */  \
+  34, 35, 36, 37, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* UDP */
+
+static int avx512vbmi_available;
+
+/* Enable Icelake AVX-512 VBMI ISA for only this function. That allows the
+ * compiler to emit the instruction here, but not use AVX-512 VBMI outside
+ * of this function.
+ */
+static inline __m512i __attribute__((__target__("avx512vbmi")))
+packet_shuffle_avx512_icx(__mmask64 k_mask, __m512i v_pkt_data_0,
+                          __m512i v_shuf_mask, __m512i v_pkt_data_1)
+{
+    return _mm512_maskz_permutex2var_epi8(k_mask, v_pkt_data_0,
+                                          v_shuf_mask, v_pkt_data_1);
+}
+
+/* This function provides a Skylake and higher fallback for the byte-shuffle
+ * that is required to implement miniflow extract correctly.
+ */
+static inline __m512i
+packet_shuffle_avx512(__mmask64 k_mask, __m512i v_data_0, __m512i v_shuf_idxs,
+                      __m512i v_data_1, uint32_t use_vbmi)
+{
+    if (use_vbmi) {
+        return packet_shuffle_avx512_icx(k_mask, v_data_0,
+                                         v_shuf_idxs, v_data_1);
+    }
+
+    /* Clear away ODD lane bytes, shift down by 1 to get u8 to u16 idxs. */
+    const __mmask64 k_mask_odd_lanes = 0xAAAAAAAAAAAAAAAA;
+    __m512i v_shuf_idx_evn = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                            v_shuf_idxs, _mm512_setzero_si512());
+    v_shuf_idx_evn = _mm512_srli_epi16(v_shuf_idx_evn, 1);
+
+    /* Clear away EVEN lane bytes by shifting out. Shift EVEN lane indexes down
+     * by one bit too to achieve u8 to u16 conversion.
+     */
+    __m512i v_shuf_idx_odd = _mm512_srli_epi16(v_shuf_idxs, 9);
+
+    /* Shuffle each of odd/even at 16-bit width. */
+    __m512i v_shuf1 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_evn,
+                                                v_data_1);
+    __m512i v_shuf2 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_odd,
+                                                v_data_1);
+
+    /* Find if the shuffle index was odd, via mask and compare. */
+    uint16_t index_odd_mask = 0x1;
+    const __m512i v_index_mask_u16 = _mm512_set1_epi16(index_odd_mask);
+
+    /* EVEN lanes, find if u8 index was odd,  result as u16 bitmask. */
+    __m512i v_idx_even_masked = _mm512_and_si512(v_shuf_idxs,
+                                                 v_index_mask_u16);
+    __mmask32 evn_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_even_masked,
+                                                    v_index_mask_u16);
+
+    /* ODD lanes, find if u8 index was odd, result as u16 bitmask. */
+    __m512i v_shuf_idx_srli8 = _mm512_srli_epi16(v_shuf_idxs, 8);
+    __m512i v_idx_odd_masked = _mm512_and_si512(v_shuf_idx_srli8,
+                                                v_index_mask_u16);
+    __mmask32 odd_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_odd_masked,
+                                                    v_index_mask_u16);
+    odd_rotate_mask = ~odd_rotate_mask;
+
+    /* Rotate based on low-bit-set bitmask, and blend results. */
+    __m512i v_shuf_res_evn = _mm512_mask_srli_epi16(v_shuf1,
+                                    evn_rotate_mask, v_shuf1, 8);
+    __m512i v_shuf_res_odd = _mm512_mask_slli_epi16(v_shuf2,
+                                    odd_rotate_mask, v_shuf2, 8);
+
+    /* Blend results of two halves back together. */
+    __m512i v_shuf_result = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                                    v_shuf_res_evn, v_shuf_res_odd);
+
+    /* k-mask the final result as requested. This is not easy to do before
+     * here, as the instructions operate at u16 size, meaning the k-mask would
+     * be interpreted as the wrong size.
+     */
+    __m512i v_zeros = _mm512_setzero_si512();
+    __m512i v_shuf_res_masked = _mm512_mask_blend_epi8(k_mask, v_zeros,
+                                                       v_shuf_result);
+    return v_shuf_res_masked;
+}
+
+
+/* Matches all patterns provided, building the appropriate miniflow for a hit.
+ *
+ * Note that this function is compile-time specialized into two variants, one
+ * for CPUs that support AVX-512 Vector Bit Manipulation Instructions (VBMI),
+ * and another for those that support AVX-512 but not AVX-512 VBMI.
+ */
+static inline __attribute__((always_inline)) uint32_t
+packet_pattern_avx512(struct dp_packet *dp_pkt, struct miniflow *mf,
+                      struct packet_pattern_cache *cache,
+                      const uint32_t num_patterns,
+                      const uint32_t use_vbmi)
+{
+    uint8_t *pkt = dp_packet_data(dp_pkt);
+    const uint32_t pkt_len = dp_packet_size(dp_pkt);
+    uint32_t in_port = odp_to_u32(dp_pkt->md.in_port.odp_port);
+
+    /* Masked load to only load the valid packet data. */
+    uint64_t mask1 = (1ULL << pkt_len) - 1;
+    mask1 |= (pkt_len < 64) - 1;
+    __mmask64 pkt_len_mask_0 = mask1;
+
+    uint64_t mask2 = (1ULL << (pkt_len - 64)) - 1;
+    mask2 |= (pkt_len < 128) - 1;
+    mask2 &= (pkt_len <  64) - 1;
+    __mmask64 pkt_len_mask_1 = mask2;
+
+    __m512i v_pkt_data_0 = _mm512_maskz_loadu_epi8(pkt_len_mask_0, &pkt[0]);
+    __m512i v_pkt_data_1 = _mm512_maskz_loadu_epi8(pkt_len_mask_1, &pkt[64]);
+
+    /* Loop over the patterns provided. Note that this loop can be compile-time
+     * unrolled for specialized versions with set numbers of patterns.
+     */
+    uint32_t hitmask = 0;
+
+    for (uint32_t i = 0; i < num_patterns; i++) {
+        struct packet_pattern *patterns = cache->patterns;
+
+        /* Mask and match the packet data and pattern, results in hit bit. */
+        __m512i v_mask_0 = _mm512_loadu_si512(&patterns[i].mask[0]);
+        __m512i v_data_0 = _mm512_loadu_si512(&patterns[i].data[0]);
+        __m512i v_pkt_masked = _mm512_and_si512(v_pkt_data_0, v_mask_0);
+        __mmask64 cmp_mask = _mm512_cmpeq_epi8_mask(v_pkt_masked, v_data_0);
+
+        uint32_t hit = (cmp_mask == UINT64_MAX);
+        hitmask |= (hit << i);
+    }
+
+    /* Check packet len to ensure the packet data filled the whole pattern. */
+    __mmask16 min_len_mask = (1 << num_patterns) - 1;
+    __m128i v_pattern_min_lens = _mm_maskz_loadu_epi8(min_len_mask,
+                                                      (void *)cache->min_len);
+    __m128i v_pkt_len = _mm_maskz_set1_epi8(min_len_mask, pkt_len);
+    uint32_t pkt_len_valid_mask = _mm_mask_cmpge_epu8_mask(min_len_mask,
+                                                           v_pkt_len,
+                                                           v_pattern_min_lens);
+
+    /* Strip away hit if packet was too short for the pattern */
+    hitmask &= pkt_len_valid_mask;
+
+    /* If a pattern was hit, build the miniflow using the pattern shuffle. */
+    if (OVS_LIKELY(hitmask)) {
+        uint32_t idx = __builtin_ctzll(hitmask);
+
+        /* Copy the pattern miniflow bits to the destination miniflow. */
+        struct miniflow *pattern_mf_bits = &cache->miniflow_bits[idx];
+        __m128i v_pattern_mf_bits = _mm_load_si128((void *)pattern_mf_bits);
+        _mm_storeu_si128((void *)mf, v_pattern_mf_bits);
+
+        /* Load miniflow building metadata */
+        struct packet_pattern_shuffle *shuffle = &cache->shuffles[idx];
+        __mmask64 k_shuf_0 = shuffle->kmasks[0];
+        __m512i v_shuf_mask_0 = _mm512_loadu_si512(&shuffle->shuffle[0]);
+        __m512i v_ins_0 = _mm512_loadu_si512(&shuffle->insert[0]);
+
+        /* Compute bytes 0-63 and merge in pattern-required bits. */
+        __m512i v_mf_blocks_0 = packet_shuffle_avx512(k_shuf_0, v_pkt_data_0,
+                                    v_shuf_mask_0, v_pkt_data_1, use_vbmi);
+        __m512i v_mf_blocks_ins_0 = _mm512_or_si512(v_mf_blocks_0, v_ins_0);
+
+        /* If required, compute bytes 64-127 and merge in pattern bits. */
+        __m512i v_mf_blocks_ins_1 = _mm512_setzero_si512();
+        __mmask64 k_shuf_1 = shuffle->kmasks[1];
+        if (k_shuf_1) {
+            __m512i v_shuf_mask_1 = _mm512_loadu_si512(&shuffle->shuffle[64]);
+            __m512i v_mf_blocks_1 = packet_shuffle_avx512(k_shuf_1,
+                                           v_pkt_data_0, v_shuf_mask_1,
+                                           v_pkt_data_1, use_vbmi);
+            __m512i v_ins_1 = _mm512_loadu_si512(&shuffle->insert[64]);
+            v_mf_blocks_ins_1 = _mm512_or_si512(v_mf_blocks_1, v_ins_1);
+        }
+
+        /* Miniflow Blocks contains first 2 blocks of non-packet-parsed data,
+         * such as the dp hash, in port, ct_mark, and packet_type. On outer
+         * packets, they are always zero except for in_port.
+         */
+        uint64_t *mf_blocks = miniflow_values(mf);
+        __m128i v_blocks_01 = _mm_setzero_si128();
+        v_blocks_01 = _mm_insert_epi32(v_blocks_01, in_port, 1);
+        _mm_storeu_si128((void *)&mf_blocks[0], v_blocks_01);
+
+        /* Store the computed miniflow blocks. */
+        _mm512_storeu_si512(&mf_blocks[2], v_mf_blocks_ins_0);
+        _mm512_storeu_si512(&mf_blocks[2 + 8], v_mf_blocks_ins_1);
+
+        /* Set dp packet offsets from the pattern metadata.  */
+        memcpy(&dp_pkt->l2_pad_size, &shuffle->offsets,
+               sizeof(struct packet_offsets));
+    }
+
+    return hitmask;
+}
+
+/* TODO: This function accepts a string, which represents the pattern and
+ * shuffles required for the users traffic type. Today this function has a
+ * hard-coded pattern for Ether()/IP()/UDP() packets.
+ *
+ * A future revision of this patchset will include the parsing of the input
+ * string to create the patterns, providing runtime flexibility in parsing
+ * packets into miniflows.
+ */
+int32_t
+miniflow_extract_avx512_insert(const char *pattern_string)
+{
+    /* Check that the runtime CPU has the required ISA available. Also
+     * check for AVX-512 Vector Bit Manipulation Instructions (VBMI), which
+     * allow a faster code-path due to a native byte permute instruction.
+     */
+    int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+    int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+    avx512vbmi_available = dpdk_get_cpu_has_isa("x86_64", "avx512vbmi");
+
+    uint32_t min_isa_ok = avx512f_available && bmi2_available;
+    printf("%s : minimum ISA available: %s, AVX-512 VBMI available: %s\n",
+           __func__, min_isa_ok ? "yes" : "no",
+           avx512vbmi_available ? "yes" : "no");
+    if (!min_isa_ok) {
+        return -ENOTSUP;
+    }
+
+    (void)patterns_control_path;
+    (void)pattern_string;
+
+    /* Add hard-coded Ether/IPv4/UDP implementation for demonstration. */
+    patterns_control_path.active_pattern_count = 1;
+
+    /* Ether/IPv4/UDP pattern metadata */
+    patterns_control_path.patterns[0] = (struct packet_pattern) {
+        .mask = { PATTERN_ETHERTYPE_MASK PATTERN_IPV4_MASK },
+        .data = { PATTERN_ETHERTYPE_IPV4 PATTERN_IPV4_UDP },
+    };
+
+    printf("%s: pattern 0 mask:\n", __func__);
+    ovs_hex_dump(stdout, &patterns_control_path.patterns[0].mask,
+                 MAX_PATTERN_SIZE, 0, false);
+    printf("%s: pattern 0 data:\n", __func__);
+    ovs_hex_dump(stdout, &patterns_control_path.patterns[0].data,
+                 MAX_PATTERN_SIZE, 0, false);
+
+    patterns_control_path.miniflow_bits[0] = (struct miniflow) {
+        .map = { .bits = {0x18a0000000000000, 0x0000000000040401}, }
+    };
+    printf("pattern[0] mf bits %08llx %08llx\n",
+        patterns_control_path.miniflow_bits[0].map.bits[0],
+        patterns_control_path.miniflow_bits[0].map.bits[1]);
+
+    patterns_control_path.min_len[0] = ETHER_IPV4_UDP_LEN;
+
+    /* Kmask and Shuffle for Ether/IPv4/UDP. Created by inspecting miniflow
+     * built from packet data, and reproduced using AVX-512 instructions with
+     * k-masks to zero parts of the miniflow as required.
+     */
+    patterns_control_path.shuffles[0] = (struct packet_pattern_shuffle) {
+        .kmasks = { 0b0000111111110000111111110011111111111111, 0 },
+        .offsets = {
+            .l2_pad_size = 0,
+            .l2_5_ofs = UINT16_MAX,
+            .l3_ofs = 14,
+            .l4_ofs = 34,
+        },
+        .shuffle = {PATTERN_IPV4_UDP_SHUFFLE},
+    };
+    printf("pattern[0] kmask[0] %08lx, kmask[1] %08lx, shuffle hexdump:\n",
+           patterns_control_path.shuffles[0].kmasks[0],
+           patterns_control_path.shuffles[0].kmasks[1]);
+    ovs_hex_dump(stdout, &patterns_control_path.shuffles[0], MAX_PATTERN_SIZE,
+                 0, false);
+
+    return 0;
+}
+
+static uint32_t
+miniflow_extract_avx512(struct dp_netdev_pmd_thread *pmd,
+                        struct dp_packet *packet,
+                        struct miniflow *mf)
+{
+    /* TODO: alloc pattern cache per PMD thread. */
+    (void)pmd;
+
+    /* Execute the pattern matching using the PMD pattern cache. */
+    uint32_t num_patterns = 1;
+    uint32_t use_vbmi = 0;
+    uint32_t match_hit = packet_pattern_avx512(packet, mf,
+                                               &patterns_control_path,
+                                               num_patterns,
+                                               use_vbmi);
+    return match_hit;
+}
+
+/* This function will only be used if AVX-512 VBMI instructions are available
+ * on the CPU. As such, we use the __target__ attribute to enable VBMI ISA.
+ */
+static uint32_t __attribute__((__target__("avx512vbmi")))
+miniflow_extract_avx512_vbmi(struct dp_netdev_pmd_thread *pmd,
+                             struct dp_packet *packet,
+                             struct miniflow *mf)
+{
+    /* TODO: alloc pattern cache per PMD thread. */
+    (void)pmd;
+
+    /* Execute the pattern matching using the PMD pattern cache. */
+    uint32_t num_patterns = 1;
+    uint32_t use_vbmi = 1;
+    uint32_t match_hit = packet_pattern_avx512(packet, mf,
+                                               &patterns_control_path,
+                                               num_patterns,
+                                               use_vbmi);
+    return match_hit;
+}
+
+/* The study function runs the patterns from the control-path, and based on
+ * some hit statistics can copy the pattern to the per-PMD pattern cache. Part
+ * of the study() functionality is also to validate that hits on a pattern
+ * result in an identical miniflow as the scalar miniflow_extract() function.
+ * This is validated by calling the scalar version, and comparing output.
+ */
+uint32_t
+miniflow_extract_avx512_study(struct dp_netdev_pmd_thread *pmd,
+                              struct dp_packet *packet,
+                              struct miniflow *dst)
+{
+    static volatile int debug = 0;
+
+    /* Run using the user supplied patterns. */
+    uint32_t match = miniflow_extract_avx512(pmd, packet, dst);
+
+    if (debug || match) {
+        /* Save off AVX512 created dp_packet offsets for verification. */
+        struct packet_offsets vec_offsets;
+        memcpy(&vec_offsets, &packet->l2_pad_size,
+               sizeof(struct packet_offsets));
+
+        /* Check the result vs the scalar miniflow-extract for correctness. */
+        struct netdev_flow_key scalar_mf_key = {0};
+        struct miniflow *scalar_mf = &scalar_mf_key.mf;
+        miniflow_extract(packet, scalar_mf);
+
+        /* Validate miniflow data is identical. */
+        uint32_t mf_bit_count = count_1bits(scalar_mf->map.bits[0]) +
+                                    count_1bits(scalar_mf->map.bits[1]);
+        size_t compare_size = sizeof(uint64_t) * (2 + mf_bit_count);
+        if (memcmp(scalar_mf, dst, compare_size)) {
+            printf("%s: Scalar miniflow output:\n", __func__);
+            ovs_hex_dump(stdout, scalar_mf, compare_size, 0, false);
+            printf("%s: AVX512 miniflow output:\n", __func__);
+            ovs_hex_dump(stdout, dst, compare_size, 0, false);
+            printf("error in miniflow compare, see hexdumps() above\n");
+        }
+
+        /* Validate that dp_packet offsets are identical. */
+        if (memcmp(&vec_offsets, &packet->l2_pad_size,
+                   sizeof(struct packet_offsets))) {
+            printf("VECTOR code DP packet properties: %d, %d, %d, %d\n",
+                   vec_offsets.l2_pad_size, vec_offsets.l2_5_ofs,
+                   vec_offsets.l3_ofs, vec_offsets.l4_ofs);
+            printf("Scalar code DP packet properties: %d, %d, %d, %d\n",
+                   packet->l2_pad_size, packet->l2_5_ofs, packet->l3_ofs,
+                   packet->l4_ofs);
+            ovs_assert(!"error in packet offsets, see printf()s above");
+        }
+
+    }
+
+    /* Check if the study function should study more packets, or if it is
+     * done. When done, we change the per-PMD function pointer to the datapath
+     * implementation without study for better performance.
+     */
+    int64_t study_more = --pmd->miniflow_study_pkts;
+    if (!study_more) {
+        printf("%s : setting func ptr to remove study(), study_pkts = %ld\n",
+               __func__, study_more);
+        pmd->miniflow_extract_opt = miniflow_extract_avx512;
+        if (avx512vbmi_available) {
+            pmd->miniflow_extract_opt = miniflow_extract_avx512_vbmi;
+        }
+    }
+
+    return match;
+}
+
+#endif /* SPARSE */
+#endif /* __x86_64__ */
diff --git a/lib/dpif-netdev-avx512-extract.h b/lib/dpif-netdev-avx512-extract.h
new file mode 100644
index 000000000..39964c31d
--- /dev/null
+++ b/lib/dpif-netdev-avx512-extract.h
@@ -0,0 +1,40 @@ 
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flow.h"
+#include "dpif-netdev-private-thread.h"
+
+/* TODO: This function accepts a string, which represents the pattern and
+ * shuffles required for the users traffic type. Today this function has a
+ * hard-coded pattern for Ether()/IP()/UDP() packets.
+ *
+ * A future revision of this patchset will include the parsing of the input
+ * string to create the patterns, providing runtime flexibility in parsing
+ * packets into miniflows.
+ */
+int32_t
+miniflow_extract_avx512_insert(const char *pattern_string);
+
+/* The study function runs the patterns from the control-path, and based on
+ * some hit statistics can copy the pattern to the per-PMD pattern cache. Part
+ * of the study() functionality is also to validate that hits on a pattern
+ * result in an identical miniflow as the scalar miniflow_extract() function.
+ * This is validated by calling the scalar version, and comparing output.
+ */
+uint32_t
+miniflow_extract_avx512_study(struct dp_netdev_pmd_thread *pmd,
+                              struct dp_packet *packet,
+                              struct miniflow *dst);
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index 07f064a18..08ad48e7d 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -34,6 +34,7 @@ 
 
 #include "immintrin.h"
 
+#include "dpif-netdev-avx512-extract.h"
 
 /* Structure to contain per-packet metadata that must be attributed to the
  * dp netdev flow. This is unfortunate to have to track per packet, however
@@ -116,7 +117,16 @@  dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
         struct dp_packet *packet = packets->packets[i];
         pkt_metadata_init(&packet->md, in_port);
         struct netdev_flow_key *key = &keys[i];
-        miniflow_extract(packet, &key->mf);
+
+        if (pmd->miniflow_extract_opt) {
+            uint32_t matched = pmd->miniflow_extract_opt(pmd, packet,
+                                                         &key->mf);
+            if (!matched) {
+                miniflow_extract(packet, &key->mf);
+            }
+        } else {
+            miniflow_extract(packet, &key->mf);
+        }
 
         /* Cache TCP and byte values for all packets */
         pkt_meta[i].bytes = dp_packet_size(packet);
diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c
new file mode 100644
index 000000000..c97658e41
--- /dev/null
+++ b/lib/dpif-netdev-private-extract.c
@@ -0,0 +1,72 @@ 
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include <errno.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "dpif-netdev-private-extract.h"
+#include "openvswitch/vlog.h"
+#include "util.h"
+
+VLOG_DEFINE_THIS_MODULE(dpif_netdev_extract);
+
+int32_t
+miniflow_extract_avx512_probe(void);
+
+int32_t
+miniflow_extract_avx512_insert(const char *pattern_string);
+
+uint32_t
+miniflow_extract_avx512_study(struct dp_netdev_pmd_thread *pmd,
+                              struct dp_packet *packet,
+                              struct miniflow *dst);
+
+/* Implementations of available extract opts. */
+static struct dpif_miniflow_extract_opt mfex_impl[] = {
+    {
+        .extract_func = NULL,
+        .insert_func = NULL,
+        .name = "disable",
+    },
+
+/* Only enable AVX512 if compile time criteria are met. */
+#if (__x86_64__ && HAVE_AVX512F && HAVE_LD_AVX512_GOOD)
+    {
+        .extract_func = miniflow_extract_avx512_study,
+        .insert_func = miniflow_extract_avx512_insert,
+        .name = "avx512",
+    },
+#endif
+};
+
+
+int32_t
+dpif_miniflow_extract_opt_get(const char *name,
+                              struct dpif_miniflow_extract_opt **opt)
+{
+    ovs_assert(opt);
+
+    uint32_t i;
+    for (i = 0; i < ARRAY_SIZE(mfex_impl); i++) {
+        if (strcmp(name, mfex_impl[i].name) == 0) {
+                *opt = &mfex_impl[i];
+                return 0;
+        }
+    }
+    return -EINVAL;
+}
diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h
new file mode 100644
index 000000000..3c5868ebe
--- /dev/null
+++ b/lib/dpif-netdev-private-extract.h
@@ -0,0 +1,60 @@ 
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DPIF_NETDEV_PRIVATE_EXTRACT
+#define DPIF_NETDEV_PRIVATE_EXTRACT 1
+
+/* Forward declarations */
+struct dp_packet;
+struct miniflow;
+struct dp_netdev_pmd_thread;
+
+/* Function pointer prototype to be implemented in the optimized miniflow
+ * extract code.
+ */
+typedef uint32_t (*miniflow_extract_func)(struct dp_netdev_pmd_thread *pmd,
+                                          struct dp_packet *packet,
+                                          struct miniflow *mf);
+
+/* Function pointer prototype to be implemented by optimized miniflow extract
+ * code, to implement handling a new traffic pattern.
+ * Returns 0 on success.
+ * Returns -ENOTSUP if the CPU does not support the required ISA.
+ */
+typedef int32_t (*template_insert_func)(const char *pattern_string);
+
+/* Structure representing the attributes of an optimized implementation. */
+struct dpif_miniflow_extract_opt {
+    /* Function to call to extract miniflows from a packet. */
+    miniflow_extract_func extract_func;
+
+    /* Function called to insert a new traffic pattern. */
+    template_insert_func insert_func;
+
+    /* Name of the optimized implementation. */
+    const char *name;
+};
+
+/* Returns the opt structure for the requested implementation by name.
+ * Returns zero on success, and opt points to a valid struct, or
+ * returns a negative failure status.
+ * -EINVAL : invalid name requested
+ */
+int32_t
+dpif_miniflow_extract_opt_get(const char *name,
+                              struct dpif_miniflow_extract_opt **opt);
+
+#endif /* DPIF_NETDEV_PRIVATE_EXTRACT */
diff --git a/lib/dpif-netdev-private-flow.h b/lib/dpif-netdev-private-flow.h
index 6b91a5d4e..20d22bad3 100644
--- a/lib/dpif-netdev-private-flow.h
+++ b/lib/dpif-netdev-private-flow.h
@@ -147,6 +147,7 @@  struct dp_netdev_actions {
     struct nlattr actions[];    /* Sequence of OVS_ACTION_ATTR_* attributes. */
 };
 
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
index b465e6ea3..c5013cf4e 100644
--- a/lib/dpif-netdev-private-thread.h
+++ b/lib/dpif-netdev-private-thread.h
@@ -29,6 +29,7 @@ 
 #include "openvswitch/thread.h"
 
 #include "dpif-netdev-private-dpif.h"
+#include "dpif-netdev-private-extract.h"
 
 #ifdef  __cplusplus
 extern "C" {
@@ -108,6 +109,14 @@  struct dp_netdev_pmd_thread {
     /* Function pointer to call for dp_netdev_input() functionality */
     dp_netdev_input_func netdev_input_func;
 
+    /* Function pointer to call for miniflow_extract() functionality */
+    miniflow_extract_func miniflow_extract_opt;
+    /* Number of miniflow packets to study before selecting miniflow
+     * implementation. Depending on variability in traffic, a higher number
+     * allows longer inspection of traffic to ensure all are covered.
+     */
+    uint32_t miniflow_study_pkts;
+
     struct seq *reload_seq;
     uint64_t last_reload_seq;
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 588981ca8..5627277c4 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -46,6 +46,7 @@ 
 #include "dpif-netdev-lookup.h"
 #include "dpif-netdev-perf.h"
 #include "dpif-netdev-private-dfc.h"
+#include "dpif-netdev-private-extract.h"
 #include "dpif-provider.h"
 #include "dummy.h"
 #include "fat-rwlock.h"
@@ -990,6 +991,109 @@  dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc,
     ds_destroy(&reply);
 }
 
+static void
+dpif_miniflow_extract_template_add(struct unixctl_conn *conn, int argc,
+                                   const char *argv[], void *aux OVS_UNUSED)
+{
+    /* Requires one parameter, the implementation name. Optional extras:
+     * the traffic pattern, the packet study count, and the datapath.
+     */
+    const char *mfex_impl_name = argv[1];
+
+    struct dpif_miniflow_extract_opt *mf_opt = NULL;
+    int err = dpif_miniflow_extract_opt_get(mfex_impl_name, &mf_opt);
+    if (err) {
+        struct ds reply = DS_EMPTY_INITIALIZER;
+        ds_put_format(&reply, "Miniflow Extract %s not found.",
+                      mfex_impl_name);
+        const char *reply_str = ds_cstr(&reply);
+        unixctl_command_reply(conn, reply_str);
+        VLOG_INFO("%s", reply_str);
+        ds_destroy(&reply);
+        return;
+    }
+
+    /* The "disable" implementation has no insert function to call. */
+    if (mf_opt->insert_func) {
+        /* Insert the new pattern. There is ongoing work on designing the
+         * interaction between the string here, and the patterns in the
+         * miniflow extract optimized code.
+         */
+        const char *pattern_string = argc >= 3 ? argv[2] : "";
+        int32_t insert_err = mf_opt->insert_func(pattern_string);
+        if (OVS_UNLIKELY(insert_err)) {
+            struct ds reply = DS_EMPTY_INITIALIZER;
+
+            if (insert_err == -ENOTSUP) {
+                ds_put_format(&reply, "Miniflow Extract %s not available. "
+                              "This CPU does not support the required ISA.\n",
+                              mfex_impl_name);
+            } else {
+                ds_put_format(&reply, "Miniflow Extract %s insert failed. "
+                              "Check the pattern data and command arguments.\n",
+                              mfex_impl_name);
+            }
+
+            const char *reply_str = ds_cstr(&reply);
+            unixctl_command_reply(conn, reply_str);
+            VLOG_INFO("%s", reply_str);
+            ds_destroy(&reply);
+            return;
+        }
+    }
+
+    ovs_mutex_lock(&dp_netdev_mutex);
+    struct dp_netdev *dp = NULL;
+
+    /* Optional argument: the number of packets to study before selecting
+     * an implementation. Defaults to 10,000 packets.
+     */
+    uint32_t study_pkts = 10000;
+    if (argc >= 4) {
+        study_pkts = atoi(argv[3]);
+    }
+
+    if (argc == 5) {
+        dp = shash_find_data(&dp_netdevs, argv[4]);
+    } else if (shash_count(&dp_netdevs) == 1) {
+        dp = shash_first(&dp_netdevs)->data;
+    }
+
+    if (!dp) {
+        ovs_mutex_unlock(&dp_netdev_mutex);
+        unixctl_command_reply_error(conn,
+                                    "please specify an existing datapath");
+        return;
+    }
+
+    /* Get PMD threads list */
+    size_t n;
+    struct dp_netdev_pmd_thread **pmd_list;
+    sorted_poll_thread_list(dp, &pmd_list, &n);
+
+    for (size_t i = 0; i < n; i++) {
+        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
+        if (pmd->core_id == NON_PMD_CORE_ID) {
+            continue;
+        }
+
+        /* Set the number of packets to study before selecting an impl. */
+        pmd->miniflow_study_pkts = study_pkts;
+
+        /* Set this PMD thread's miniflow extract function pointer. */
+        pmd->miniflow_extract_opt = mf_opt->extract_func;
+    }
+    ovs_mutex_unlock(&dp_netdev_mutex);
+
+    /* Reply with success to command */
+    struct ds reply = DS_EMPTY_INITIALIZER;
+    ds_put_format(&reply, "miniflow extract opt impl %s.\n", mfex_impl_name);
+    const char *reply_str = ds_cstr(&reply);
+    unixctl_command_reply(conn, reply_str);
+    VLOG_INFO("%s", reply_str);
+    ds_destroy(&reply);
+}
+
 static void
 dpif_netdev_impl_set(struct unixctl_conn *conn, int argc,
                      const char *argv[], void *aux OVS_UNUSED)
@@ -1288,6 +1392,10 @@  dpif_netdev_init(void)
                              "[dpif implementation name] [dp]",
                              1, 2, dpif_netdev_impl_set,
                              NULL);
+    unixctl_command_register("dpif-netdev/miniflow-template-add",
+                             "[impl name] [template] [study pkt count] [dp]",
+                             1, 4, dpif_miniflow_extract_template_add,
+                             NULL);
     return 0;
 }
 
@@ -6127,6 +6235,9 @@  dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     /* Initialize the DPIF function pointer to the default scalar version */
     pmd->netdev_input_func = dp_netdev_impl_get_default();
 
+    /* Initialize the miniflow extract function pointer to NULL (not set) */
+    pmd->miniflow_extract_opt = NULL;
+
     /* init the 'flow_cache' since there is no
      * actual thread created for NON_PMD_CORE_ID. */
     if (core_id == NON_PMD_CORE_ID) {