diff mbox series

[ovs-dev,v3,09/10] dpif-netdev: enable ISA optimized DPIF and miniflow extract

Message ID 20201118161501.1710801-10-harry.van.haaren@intel.com
State Superseded
Headers show
Series DPIF & MFEX Refactor and AVX512 SIMD optimization | expand

Commit Message

Van Haaren, Harry Nov. 18, 2020, 4:15 p.m. UTC
This commit refactors the way in which the DPIF component can
call the miniflow-extract function. It creates flexibility in
the DPIF component by adding a function pointer at the pmd level.

A new miniflow extract implementation is created which allows the
AVX-512 SIMD instructions to perform the packet matching and building
of the miniflow data-structure.

All AVX-512 capable CPUs will be able to run the miniflow
extract, however CPUs that support the AVX-512 Vector Bit
Manipulation Instructions (VBMI) will benefit more as the
native byte permute instruction gives extra performance.

Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
---
 lib/automake.mk                   |   2 +
 lib/dpif-netdev-avx512-extract.h  | 435 ++++++++++++++++++++++++++++++
 lib/dpif-netdev-avx512.c          |  16 +-
 lib/dpif-netdev-private-extract.h |  52 ++++
 lib/dpif-netdev-private-flow.h    |   1 +
 lib/dpif-netdev-private-thread.h  |   9 +
 lib/dpif-netdev.c                 |  93 +++++++
 7 files changed, 605 insertions(+), 3 deletions(-)
 create mode 100644 lib/dpif-netdev-avx512-extract.h
 create mode 100644 lib/dpif-netdev-private-extract.h

Comments

0-day Robot Nov. 18, 2020, 5:24 p.m. UTC | #1
Bleep bloop.  Greetings Harry van Haaren, I am a robot and I have tried out your patch.
Thanks for your contribution.

I encountered some error that I wasn't expecting.  See the details below.


build:
/bin/sh ./libtool  --tag=CXX   --mode=link g++ -std=gnu++11  -g -O2     -o include/openvswitch/libcxxtest.la  include/openvswitch/cxxtest.lo  -lpthread -lrt -lm  -lunbound
libtool: link: rm -fr  include/openvswitch/.libs/libcxxtest.a include/openvswitch/.libs/libcxxtest.la
libtool: link: ar cru include/openvswitch/.libs/libcxxtest.a  include/openvswitch/cxxtest.o
libtool: link: ranlib include/openvswitch/.libs/libcxxtest.a
libtool: link: ( cd "include/openvswitch/.libs" && rm -f "libcxxtest.la" && ln -s "../libcxxtest.la" "libcxxtest.la" )
depbase=`echo utilities/ovs-appctl.o | sed 's|[^/]*$|.deps/&|;s|\.o$||'`;\
gcc -std=gnu99 -DHAVE_CONFIG_H -I.    -I ./include -I ./include -I ./lib -I ./lib    -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -fno-strict-aliasing -Wshadow -Werror -Werror   -g -O2 -DHAVE_LD_AVX512_GOOD -MT utilities/ovs-appctl.o -MD -MP -MF $depbase.Tpo -c -o utilities/ovs-appctl.o utilities/ovs-appctl.c &&\
mv -f $depbase.Tpo $depbase.Po
/bin/sh ./libtool  --tag=CC   --mode=link gcc -std=gnu99 -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -fno-strict-aliasing -Wshadow -Werror -Werror   -g -O2 -DHAVE_LD_AVX512_GOOD     -o utilities/ovs-appctl utilities/ovs-appctl.o lib/libopenvswitch.la -lpthread -lrt -lm  -lunbound
libtool: link: gcc -std=gnu99 -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -fno-strict-aliasing -Wshadow -Werror -Werror -g -O2 -DHAVE_LD_AVX512_GOOD -o utilities/ovs-appctl utilities/ovs-appctl.o  lib/.libs/libopenvswitch.a -lssl -lcrypto -lcap-ng -lpthread -lrt -lm -lunbound
depbase=`echo utilities/ovs-testcontroller.o | sed 's|[^/]*$|.deps/&|;s|\.o$||'`;\
gcc -std=gnu99 -DHAVE_CONFIG_H -I.    -I ./include -I ./include -I ./lib -I ./lib    -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -fno-strict-aliasing -Wshadow -Werror -Werror   -g -O2 -DHAVE_LD_AVX512_GOOD -MT utilities/ovs-testcontroller.o -MD -MP -MF $depbase.Tpo -c -o utilities/ovs-testcontroller.o utilities/ovs-testcontroller.c &&\
mv -f $depbase.Tpo $depbase.Po
/bin/sh ./libtool  --tag=CC   --mode=link gcc -std=gnu99 -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -fno-strict-aliasing -Wshadow -Werror -Werror   -g -O2 -DHAVE_LD_AVX512_GOOD     -o utilities/ovs-testcontroller utilities/ovs-testcontroller.o lib/libopenvswitch.la -lssl -lcrypto   -lpthread -lrt -lm  -lunbound
libtool: link: gcc -std=gnu99 -Wstrict-prototypes -Wall -Wextra -Wno-sign-compare -Wpointer-arith -Wformat -Wformat-security -Wswitch-enum -Wunused-parameter -Wbad-function-cast -Wcast-align -Wstrict-prototypes -Wold-style-definition -Wmissing-prototypes -Wmissing-field-initializers -fno-strict-aliasing -Wshadow -Werror -Werror -g -O2 -DHAVE_LD_AVX512_GOOD -o utilities/ovs-testcontroller utilities/ovs-testcontroller.o  lib/.libs/libopenvswitch.a -lcap-ng -lssl -lcrypto -lpthread -lrt -lm -lunbound
lib/.libs/libopenvswitch.a(dpif-netdev.o): In function `dpif_miniflow_extract_template_add':
/var/lib/jenkins/jobs/0day_robot_upstream_build_from_pw/workspace/lib/dpif-netdev.c:1006: undefined reference to `miniflow_extract_avx512_probe'
/var/lib/jenkins/jobs/0day_robot_upstream_build_from_pw/workspace/lib/dpif-netdev.c:1022: undefined reference to `miniflow_extract_avx512_insert'
/var/lib/jenkins/jobs/0day_robot_upstream_build_from_pw/workspace/lib/dpif-netdev.c:1066: undefined reference to `miniflow_extract_avx512_study'
collect2: error: ld returned 1 exit status
make[2]: *** [utilities/ovs-testcontroller] Error 1
make[2]: Leaving directory `/var/lib/jenkins/jobs/0day_robot_upstream_build_from_pw/workspace'
make[1]: *** [all-recursive] Error 1
make[1]: Leaving directory `/var/lib/jenkins/jobs/0day_robot_upstream_build_from_pw/workspace'
make: *** [all] Error 2


Please check this out.  If you feel there has been an error, please email aconole@redhat.com

Thanks,
0-day Robot
diff mbox series

Patch

diff --git a/lib/automake.mk b/lib/automake.mk
index 2a41f7ab5..e5f75ce35 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -107,6 +107,7 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/dp-packet.h \
 	lib/dp-packet.c \
 	lib/dpdk.h \
+	lib/dpif-netdev-avx512-extract.h \
 	lib/dpif-netdev-lookup.h \
 	lib/dpif-netdev-lookup.c \
 	lib/dpif-netdev-lookup-autovalidator.c \
@@ -117,6 +118,7 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/dpif-netdev-private-dpcls.h \
 	lib/dpif-netdev-private-dpif.c \
 	lib/dpif-netdev-private-dpif.h \
+	lib/dpif-netdev-private-extract.h \
 	lib/dpif-netdev-private-flow.h \
 	lib/dpif-netdev-private-hwol.h \
 	lib/dpif-netdev-private-thread.h \
diff --git a/lib/dpif-netdev-avx512-extract.h b/lib/dpif-netdev-avx512-extract.h
new file mode 100644
index 000000000..c264ac067
--- /dev/null
+++ b/lib/dpif-netdev-avx512-extract.h
@@ -0,0 +1,435 @@ 
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "flow.h"
+#include "dpif-netdev-private-thread.h"
+
+
+/* This file contains optimized implementations of miniflow_extract()
+ * for specific common traffic patterns. The optimizations allow for
+ * quick probing of a specific packet type, and if a match with a specific
+ * type is found, a shuffle-like procedure builds up the required miniflow.
+ */
+
+#define MAX_PATTERN_COUNT (8)
+#define MAX_PATTERN_SIZE (128)
+#define MAX_SHUFFLE_COUNT (MAX_PATTERN_SIZE / 64)
+
+/* One packet pattern: packet bytes ANDed with 'mask' are compared to 'data'. */
+struct __attribute__((aligned(MAX_PATTERN_SIZE))) packet_pattern {
+    uint8_t mask[MAX_PATTERN_SIZE];
+    uint8_t data[MAX_PATTERN_SIZE];
+};
+
+/* Improvement: create this struct in dp-packet.h, and reuse it here. That
+ * would avoid the requirement of the packed attribute.
+ */
+struct __attribute__((packed)) packet_offsets {
+    uint8_t l2_pad_size;
+    uint16_t l2_5_ofs;
+    uint16_t l3_ofs;
+    uint16_t l4_ofs;
+};
+
+/* Structure to represent the data-movement from pattern to miniflow. */
+struct packet_pattern_shuffle {
+    uint64_t kmasks[MAX_SHUFFLE_COUNT];
+    struct packet_offsets offsets;
+    uint8_t shuffle[MAX_PATTERN_SIZE];
+};
+
+/* Structure that holds all data needed to match a set of patterns. */
+struct packet_pattern_cache {
+    /* Minimum packet len for this pattern index to be a valid candidate. */
+    uint8_t min_len[MAX_PATTERN_COUNT];
+
+    /* Number of active patterns to match against. */
+    uint8_t active_pattern_count;
+
+    /* The mask and compare data itself. */
+    struct packet_pattern patterns[MAX_PATTERN_COUNT];
+
+    /* Miniflow bits that need to be set for each pattern. */
+    struct miniflow miniflow_bits[MAX_PATTERN_COUNT];
+
+    /* Structure to represent the data-movement from pattern to miniflow. */
+    struct packet_pattern_shuffle shuffles[MAX_PATTERN_COUNT];
+
+};
+
+/* Single copy of control-path owned patterns. The contents of this struct will
+ * be updated when the user runs a miniflow-pattern-add command. The contents
+ * of this struct are only read in the datapath during the "study" phase, and
+ * copied into a thread-local memory for the PMD threads for datapath usage.
+ */
+static struct packet_pattern_cache patterns_control_path;
+
+/* Generator for EtherType masks and values. */
+#define PATTERN_ETHERTYPE_GEN(type_b0, type_b1) \
+  0, 0, 0, 0, 0, 0, /* Ether MAC DST */                                 \
+  0, 0, 0, 0, 0, 0, /* Ether MAC SRC */                                 \
+  type_b0, type_b1, /* EtherType */
+
+#define PATTERN_ETHERTYPE_MASK PATTERN_ETHERTYPE_GEN(0xFF, 0xFF)
+#define PATTERN_ETHERTYPE_IPV4 PATTERN_ETHERTYPE_GEN(0x08, 0x00)
+
+/* Generator for checking IPv4 ver, ihl, flags/fragment offset and proto. */
+#define PATTERN_IPV4_GEN(VER_IHL, FLAG_OFF_B0, FLAG_OFF_B1, PROTO) \
+  VER_IHL, /* Version and IHL */                                        \
+  0, 0, 0, /* DSCP, ECN, Total Length */                                \
+  0, 0, /* Identification */                                            \
+  /* Flags/Fragment offset: don't match MoreFrag (MF) or FragOffset */  \
+  FLAG_OFF_B0, FLAG_OFF_B1,                                             \
+  0, /* TTL */                                                          \
+  PROTO, /* Protocol */                                                 \
+  0, 0, /* Header checksum */                                           \
+  0, 0, 0, 0, /* Src IP */                                              \
+  0, 0, 0, 0, /* Dst IP */
+/* Both flags/fragment offset bytes are fully masked, so must be all zero. */
+#define PATTERN_IPV4_MASK PATTERN_IPV4_GEN(0xFF, 0xFF, 0xFF, 0xFF)
+#define PATTERN_IPV4_UDP PATTERN_IPV4_GEN(0x45, 0, 0, 0x11)
+
+#define ETHER_IPV4_UDP_LEN (42)
+
+#define NU 0
+#define PATTERN_IPV4_UDP_SHUFFLE \
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, NU, NU, /* Ether */ \
+  26, 27, 28, 29, 30, 31, 32, 33, NU, NU, NU, NU, 20, 15, 22, 23, /* IPv4 */  \
+  34, 35, 36, 37, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, NU, /* UDP */
+
+static int avx512vbmi_available;
+
+/* Native byte permute using the AVX-512 VBMI instruction. The target
+ * attribute enables VBMI code generation for this function only, so the
+ * compiler does not use AVX-512 VBMI outside of it. */
+static inline __m512i __attribute__((__target__("avx512vbmi")))
+packet_shuffle_avx512_icx(__mmask64 k_mask, __m512i v_pkt_data_0,
+                          __m512i v_shuf_mask, __m512i v_pkt_data_1)
+{
+    __m512i v_res = _mm512_maskz_permutex2var_epi8(k_mask, v_pkt_data_0,
+                                                   v_shuf_mask, v_pkt_data_1);
+    return v_res;
+}
+
+/* Byte shuffle across two 64-byte sources: uses the native VBMI permute when
+ * avx512vbmi_available is set, else emulates it with 16-bit permutes, shifts
+ * and blends. Result bytes whose k_mask bit is clear are zeroed. */
+static inline __m512i
+packet_shuffle_avx512(__mmask64 k_mask, __m512i v_data_0, __m512i v_shuf_idxs,
+                      __m512i v_data_1)
+{
+    if (avx512vbmi_available) {
+        return packet_shuffle_avx512_icx(k_mask, v_data_0,
+                                         v_shuf_idxs, v_data_1);
+    }
+
+    /* Clear away ODD lane bytes, shift down by 1 to get u8 to u16 idxs. */
+    const __mmask64 k_mask_odd_lanes = 0xAAAAAAAAAAAAAAAA;
+    __m512i v_shuf_idx_evn = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                            v_shuf_idxs, _mm512_setzero_si512());
+    v_shuf_idx_evn = _mm512_srli_epi16(v_shuf_idx_evn, 1);
+
+    /* Clear away EVEN lane bytes by shifting out. Shift ODD lane indexes down
+     * by one bit too to achieve u8 to u16 conversion.
+     */
+    __m512i v_shuf_idx_odd = _mm512_srli_epi16(v_shuf_idxs, 9);
+
+    /* Shuffle each of odd/even at 16-bit width. */
+    __m512i v_shuf1 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_evn,
+                                                v_data_1);
+    __m512i v_shuf2 = _mm512_permutex2var_epi16(v_data_0, v_shuf_idx_odd,
+                                                v_data_1);
+
+    /* Find if the shuffle index was odd, via mask and compare. */
+    uint16_t index_odd_mask = 0x1;
+    const __m512i v_index_mask_u16 = _mm512_set1_epi16(index_odd_mask);
+
+    /* EVEN lanes, find if u8 index was odd, result as u16 bitmask. */
+    __m512i v_idx_even_masked = _mm512_and_si512(v_shuf_idxs,
+                                                 v_index_mask_u16);
+    __mmask32 evn_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_even_masked,
+                                                    v_index_mask_u16);
+
+    /* ODD lanes, find if u8 index was even, result as u16 bitmask. */
+    __m512i v_shuf_idx_srli8 = _mm512_srli_epi16(v_shuf_idxs, 8);
+    __m512i v_idx_odd_masked = _mm512_and_si512(v_shuf_idx_srli8,
+                                                v_index_mask_u16);
+    __mmask32 odd_rotate_mask = _mm512_cmpeq_epi16_mask(v_idx_odd_masked,
+                                                    v_index_mask_u16);
+    odd_rotate_mask = ~odd_rotate_mask;
+
+    /* Shift masked u16 lanes by 8 bits so the wanted byte is in place. */
+    __m512i v_shuf_res_evn = _mm512_mask_srli_epi16(v_shuf1,
+                                    evn_rotate_mask, v_shuf1, 8);
+    __m512i v_shuf_res_odd = _mm512_mask_slli_epi16(v_shuf2,
+                                    odd_rotate_mask, v_shuf2, 8);
+
+    /* Blend results of two halves back together. */
+    __m512i v_shuf_result = _mm512_mask_blend_epi8(k_mask_odd_lanes,
+                                    v_shuf_res_evn, v_shuf_res_odd);
+
+    /* k-mask the final result as requested. This is not easy to do before
+     * here, as the instructions operate at u16 size, meaning the k-mask would
+     * be interpreted as the wrong size.
+     */
+    __m512i v_zeros = _mm512_setzero_si512();
+    __m512i v_shuf_res_masked = _mm512_mask_blend_epi8(k_mask, v_zeros,
+                                                       v_shuf_result);
+    return v_shuf_res_masked;
+}
+
+
+/* Matches 'dp_pkt' against the first 'num_patterns' patterns in 'cache'. On a
+ * hit, writes the miniflow of the lowest-index matching pattern to 'mf' and
+ * sets the dp_packet offsets. Returns a bitmask of the patterns that matched.
+ */
+static inline __attribute__((always_inline)) uint32_t
+packet_pattern_avx512(struct dp_packet *dp_pkt, struct miniflow *mf,
+                      struct packet_pattern_cache *cache,
+                      const uint32_t num_patterns)
+{
+    uint8_t *pkt = dp_packet_data(dp_pkt);
+    uint32_t pkt_len = dp_packet_size(dp_pkt);
+    uint32_t in_port = odp_to_u32(dp_pkt->md.in_port.odp_port);
+
+    /* Masked load to only load the valid packet data. Shift counts are kept
+     * below 64, as shifting a 64-bit value by 64 or more is undefined.
+     */
+    uint64_t mask1 = pkt_len < 64 ? (1ULL << pkt_len) - 1 : UINT64_MAX;
+    uint64_t mask2 = pkt_len <= 64 ? 0
+                     : pkt_len < 128 ? (1ULL << (pkt_len - 64)) - 1
+                     : UINT64_MAX;
+    __mmask64 pkt_len_mask_0 = mask1;
+    __mmask64 pkt_len_mask_1 = mask2;
+    __m512i v_pkt_data_0 = _mm512_maskz_loadu_epi8(pkt_len_mask_0, &pkt[0]);
+    __m512i v_pkt_data_1 = _mm512_maskz_loadu_epi8(pkt_len_mask_1, &pkt[64]);
+
+    /* Loop over the patterns provided. Note that this loop can be compile-time
+     * unrolled for specialized versions with set numbers of patterns.
+     */
+    uint32_t hitmask = 0;
+    for (uint32_t i = 0; i < num_patterns; i++) {
+        struct packet_pattern *patterns = cache->patterns;
+
+        /* Mask and match the packet data and pattern, results in hit bit. */
+        __m512i v_mask_0 = _mm512_loadu_si512(&patterns[i].mask[0]);
+        __m512i v_data_0 = _mm512_loadu_si512(&patterns[i].data[0]);
+        __m512i v_pkt_masked = _mm512_and_si512(v_pkt_data_0, v_mask_0);
+        __mmask64 cmp_mask = _mm512_cmpeq_epi8_mask(v_pkt_masked, v_data_0);
+        /* A hit also requires the packet to be at least min_len bytes. */
+        uint32_t hit = cmp_mask == UINT64_MAX && pkt_len >= cache->min_len[i];
+        hitmask |= (hit << i);
+    }
+
+    /* If a pattern was hit, build the miniflow using the pattern shuffle. */
+    if (OVS_LIKELY(hitmask)) {
+        uint32_t idx = __builtin_ctz(hitmask);
+
+        /* Copy the pattern miniflow bits to the destination miniflow. */
+        struct miniflow *pattern_mf_bits = &cache->miniflow_bits[idx];
+        __m128i v_pattern_mf_bits = _mm_load_si128((void *)pattern_mf_bits);
+        _mm_storeu_si128((void *)mf, v_pattern_mf_bits);
+
+        /* Compute bytes 0-63 of miniflow. */
+        struct packet_pattern_shuffle *shuffle = &cache->shuffles[idx];
+        __mmask64 k_shuf_0 = shuffle->kmasks[0];
+        __m512i v_shuf_mask_0 = _mm512_loadu_si512(&shuffle->shuffle[0]);
+        __m512i v_mf_blocks_0 = packet_shuffle_avx512(k_shuf_0, v_pkt_data_0,
+                                    v_shuf_mask_0, v_pkt_data_1);
+
+        /* Compute bytes 64-127 of miniflow, using shuffle indexes 64-127. */
+        __mmask64 k_shuf_1 = shuffle->kmasks[1];
+        __m512i v_shuf_mask_1 = _mm512_loadu_si512(&shuffle->shuffle[64]);
+        __m512i v_mf_blocks_1 = packet_shuffle_avx512(k_shuf_1, v_pkt_data_0,
+                                    v_shuf_mask_1, v_pkt_data_1);
+
+        /* Miniflow Blocks contains first 2 blocks of non-packet-parsed data,
+         * such as the dp hash, in port, ct_mark, and packet_type. On outer
+         * packets, they are always zero except for in_port.
+         */
+        uint64_t *mf_blocks = miniflow_values(mf);
+        __m128i v_blocks_01 = _mm_setzero_si128();
+        v_blocks_01 = _mm_insert_epi32(v_blocks_01, in_port, 1);
+        _mm_storeu_si128((void *)&mf_blocks[0], v_blocks_01);
+
+        /* Store the computed miniflow blocks. */
+        _mm512_storeu_si512(&mf_blocks[2], v_mf_blocks_0);
+        _mm512_storeu_si512(&mf_blocks[2 + 8], v_mf_blocks_1);
+
+        /* Set dp packet offsets from the pattern metadata.  */
+        memcpy(&dp_pkt->l2_pad_size, &shuffle->offsets,
+               sizeof(struct packet_offsets));
+    }
+
+    return hitmask;
+}
+
+/* Check that the runtime CPU has the required ISA available, and record in
+ * avx512vbmi_available whether AVX-512 VBMI can be used for the faster native
+ * byte permute. Returns non-zero if the minimum ISA is available.
+ * NOTE(review): the matcher uses byte/word AVX-512 intrinsics; confirm that
+ * probing "avx512f" is sufficient or if AVX512BW must be probed too. */
+int32_t
+miniflow_extract_avx512_probe(void)
+{
+    int avx512f_available = dpdk_get_cpu_has_isa("x86_64", "avx512f");
+    int bmi2_available = dpdk_get_cpu_has_isa("x86_64", "bmi2");
+    avx512vbmi_available = dpdk_get_cpu_has_isa("x86_64", "avx512vbmi");
+    int32_t min_isa_ok = avx512f_available && bmi2_available;
+    printf("%s : minimum ISA available: %s, AVX-512 VBMI available: %s\n",
+           __func__, min_isa_ok ? "yes" : "no",
+           avx512vbmi_available ? "yes" : "no");
+    return min_isa_ok;
+}
+
+/* Installs a pattern into the control-path pattern set. 'pattern_string' is
+ * meant to describe the pattern and shuffles required for the user's traffic
+ * type, but is ignored today: a hard-coded Ether()/IP()/UDP() pattern is
+ * always installed as pattern 0. Always returns 0.
+ * TODO: A future revision of this patchset will include the parsing of the
+ * input string to create the patterns, providing runtime flexibility in
+ * parsing packets into miniflows.
+ */
+int32_t
+miniflow_extract_avx512_insert(const char *pattern_string)
+{
+    (void)pattern_string;
+
+    /* Add hard-coded Ether/IPv4/UDP implementation for demonstration. */
+    patterns_control_path.active_pattern_count = 1;
+    patterns_control_path.min_len[0] = ETHER_IPV4_UDP_LEN;
+
+    /* Ether/IPv4/UDP pattern metadata */
+    patterns_control_path.patterns[0] = (struct packet_pattern) {
+        .mask = { PATTERN_ETHERTYPE_MASK PATTERN_IPV4_MASK },
+        .data = { PATTERN_ETHERTYPE_IPV4 PATTERN_IPV4_UDP },
+    };
+
+    printf("%s: pattern 0 mask:\n", __func__);
+    ovs_hex_dump(stdout, &patterns_control_path.patterns[0].mask,
+                 MAX_PATTERN_SIZE, 0, false);
+    printf("%s: pattern 0 data:\n", __func__);
+    ovs_hex_dump(stdout, &patterns_control_path.patterns[0].data,
+                 MAX_PATTERN_SIZE, 0, false);
+
+    patterns_control_path.miniflow_bits[0] = (struct miniflow) {
+        .map = { .bits = {0x18a0000000000000, 0x0000000000040401}, }
+    };
+    printf("pattern[0] mf bits %08llx %08llx\n",
+        patterns_control_path.miniflow_bits[0].map.bits[0],
+        patterns_control_path.miniflow_bits[0].map.bits[1]);
+
+    /* Kmask and Shuffle for Ether/IPv4/UDP. Created by inspecting miniflow
+     * built from packet data, and reproduced using AVX-512 instructions with
+     * k-masks to zero parts of the miniflow as required.
+     */
+    patterns_control_path.shuffles[0] = (struct packet_pattern_shuffle) {
+        .kmasks = { 0b0000111111110000111111110011111111111111, 0 },
+        .offsets = {
+            .l2_pad_size = 0,
+            .l2_5_ofs = UINT16_MAX,
+            .l3_ofs = 14,
+            .l4_ofs = 34,
+        },
+        .shuffle = {PATTERN_IPV4_UDP_SHUFFLE},
+    };
+    printf("pattern[0] kmask[0] %08llx, kmask[1] %08llx, shuffle hexdump:\n",
+           (unsigned long long) patterns_control_path.shuffles[0].kmasks[0],
+           (unsigned long long) patterns_control_path.shuffles[0].kmasks[1]);
+    ovs_hex_dump(stdout, patterns_control_path.shuffles[0].shuffle,
+                 MAX_PATTERN_SIZE, 0, false);
+
+    return 0;
+}
+
+/* Runs miniflow_extract_avx512() on 'packet' and, on a hit, validates the
+ * result against the scalar miniflow_extract(): both the miniflow written to
+ * 'dst' and the dp_packet offsets are compared, and differences are printed.
+ * Each call counts down pmd->miniflow_study_pkts; when it reaches zero the
+ * per-PMD function pointer is switched to miniflow_extract_avx512(). Returns
+ * the hit bitmask of the AVX-512 implementation. */
+uint32_t
+miniflow_extract_avx512_study(struct dp_netdev_pmd_thread *pmd,
+                              struct dp_packet *packet,
+                              struct miniflow *dst)
+{
+    /* Run using the user supplied patterns. */
+    uint32_t match = miniflow_extract_avx512(pmd, packet, dst);
+    if (match) {
+        /* Save off AVX512 created dp_packet offsets for verification. */
+        struct packet_offsets vec_offsets;
+        memcpy(&vec_offsets, &packet->l2_pad_size,
+               sizeof(struct packet_offsets));
+
+        /* Check the result vs the scalar miniflow-extract for correctness. */
+        struct netdev_flow_key scalar_mf_key = {0};
+        struct miniflow *scalar_mf = &scalar_mf_key.mf;
+        miniflow_extract(packet, scalar_mf);
+
+        /* Validate miniflow data is identical. */
+        uint32_t mf_bit_count = count_1bits(scalar_mf->map.bits[0]) +
+                                    count_1bits(scalar_mf->map.bits[1]);
+        size_t compare_size = sizeof(uint64_t) * (2 + mf_bit_count);
+        if (memcmp(scalar_mf, dst, compare_size)) {
+            printf("%s: Scalar miniflow output:\n", __func__);
+            ovs_hex_dump(stdout, scalar_mf, compare_size, 0, false);
+            printf("%s: AVX512 miniflow output:\n", __func__);
+            ovs_hex_dump(stdout, dst, compare_size, 0, false);
+            printf("error in miniflow compare, see hexdumps() above\n");
+        }
+
+        /* Validate that dp_packet offsets are identical. */
+        if (memcmp(&vec_offsets, &packet->l2_pad_size,
+                   sizeof(struct packet_offsets))) {
+            printf("VECTOR code DP packet properties: %d, %d, %d, %d\n",
+                   vec_offsets.l2_pad_size, vec_offsets.l2_5_ofs,
+                   vec_offsets.l3_ofs, vec_offsets.l4_ofs);
+            printf("Scalar code DP packet properties: %d, %d, %d, %d\n",
+                   packet->l2_pad_size, packet->l2_5_ofs, packet->l3_ofs,
+                   packet->l4_ofs);
+            printf("error in packet offsets, see printf()s above\n");
+        }
+    }
+
+    /* Check if the study function should study more packets, or if it is
+     * done. When done, we change the per-PMD function pointer to the datapath
+     * implementation without study for better performance. A count that is
+     * already zero is treated as done, instead of wrapping around. */
+    if (pmd->miniflow_study_pkts) {
+        pmd->miniflow_study_pkts--;
+    }
+    if (!pmd->miniflow_study_pkts) {
+        printf("%s : setting func ptr to remove study(), study_pkts = %ld\n",
+               __func__, (long) pmd->miniflow_study_pkts);
+        pmd->miniflow_extract_opt = miniflow_extract_avx512;
+    }
+
+    return match;
+}
+
+/* Matches 'packet' against pattern 0 of the control-path pattern set; on a
+ * hit 'mf' is filled in. Returns the hit bitmask, zero on a miss. */
+uint32_t
+miniflow_extract_avx512(struct dp_netdev_pmd_thread *pmd,
+                        struct dp_packet *packet,
+                        struct miniflow *mf)
+{
+    struct packet_pattern_cache *cache = &patterns_control_path;
+
+    /* TODO: alloc pattern cache per PMD thread, 'pmd' is unused until then. */
+    (void)pmd;
+    return packet_pattern_avx512(packet, mf, cache, 1);
+}
diff --git a/lib/dpif-netdev-avx512.c b/lib/dpif-netdev-avx512.c
index 2dee909a3..502b9a3f4 100644
--- a/lib/dpif-netdev-avx512.c
+++ b/lib/dpif-netdev-avx512.c
@@ -34,6 +34,7 @@ 
 
 #include "immintrin.h"
 
+#include "dpif-netdev-avx512-extract.h"
 
 /* Structure to contain per-packet metadata that must be attributed to the
  * dp netdev flow. This is unfortunate to have to track per packet, however
@@ -104,9 +105,18 @@  dp_netdev_input_outer_avx512(struct dp_netdev_pmd_thread *pmd,
         struct dp_packet *packet = packets->packets[i];
         pkt_metadata_init(&packet->md, in_port);
         struct netdev_flow_key *key = &keys[i];
-        miniflow_extract(packet, &key->mf);
-        key->len = count_1bits(key->mf.map.bits[0])
-                   + count_1bits(key->mf.map.bits[1]);
+        if (pmd->miniflow_extract_opt) {
+            uint32_t matched = pmd->miniflow_extract_opt(pmd, packet,
+                                                         &key->mf);
+            if (!matched) {
+                miniflow_extract(packet, &key->mf);
+            }
+        } else {
+            miniflow_extract(packet, &key->mf);
+        }
+
+        key->len = count_1bits(key->mf.map.bits[0]) +
+                       count_1bits(key->mf.map.bits[1]);
         key->hash = dpif_netdev_packet_get_rss_hash_orig_pkt(packet, &key->mf);
 
         if (emc_enabled) {
diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h
new file mode 100644
index 000000000..5a4bef7ed
--- /dev/null
+++ b/lib/dpif-netdev-private-extract.h
@@ -0,0 +1,52 @@ 
+/*
+ * Copyright (c) 2020 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef DPIF_NETDEV_AVX512_EXTRACT
+#define DPIF_NETDEV_AVX512_EXTRACT 1
+
+/* Forward declarations */
+struct dp_packet;
+struct miniflow;
+struct dp_netdev_pmd_thread;
+
+/* Function pointer prototype to be implemented in the optimized miniflow
+ * extract code.
+ */
+typedef uint32_t (*miniflow_extract_func)(struct dp_netdev_pmd_thread *pmd,
+                                          struct dp_packet *packet,
+                                          struct miniflow *mf);
+
+/* Today the avx512 implementation of miniflow extract is exposed to DPIF.
+ * This will be abstracted like is done in DPCLS, with multiple implementations
+ * being available to be selected.
+ */
+
+int32_t
+miniflow_extract_avx512_probe(void);
+
+int32_t
+miniflow_extract_avx512_insert(const char *pattern_string);
+
+uint32_t
+miniflow_extract_avx512_study(struct dp_netdev_pmd_thread *pmd,
+                              struct dp_packet *packet,
+                              struct miniflow *dst);
+
+uint32_t
+miniflow_extract_avx512(struct dp_netdev_pmd_thread *pmd,
+                        struct dp_packet *packet,
+                        struct miniflow *mf);
+
+#endif /* DPIF_NETDEV_AVX512_EXTRACT */
diff --git a/lib/dpif-netdev-private-flow.h b/lib/dpif-netdev-private-flow.h
index 6b91a5d4e..20d22bad3 100644
--- a/lib/dpif-netdev-private-flow.h
+++ b/lib/dpif-netdev-private-flow.h
@@ -147,6 +147,7 @@  struct dp_netdev_actions {
     struct nlattr actions[];    /* Sequence of OVS_ACTION_ATTR_* attributes. */
 };
 
+
 #ifdef  __cplusplus
 }
 #endif
diff --git a/lib/dpif-netdev-private-thread.h b/lib/dpif-netdev-private-thread.h
index d0b3ccb06..34139700e 100644
--- a/lib/dpif-netdev-private-thread.h
+++ b/lib/dpif-netdev-private-thread.h
@@ -29,6 +29,7 @@ 
 #include "openvswitch/thread.h"
 
 #include "dpif-netdev-private-dpif.h"
+#include "dpif-netdev-private-extract.h"
 
 #ifdef  __cplusplus
 extern "C" {
@@ -106,6 +107,14 @@  struct dp_netdev_pmd_thread {
     /* Function pointer to call for dp_netdev_input() functionality */
     dp_netdev_input_func netdev_input_func;
 
+    /* Function pointer to call for miniflow_extract() functionality */
+    miniflow_extract_func miniflow_extract_opt;
+    /* Number of miniflow packets to study before selecting miniflow
+     * implementation. Depending on variability in traffic, a higher number
+     * allows longer inspection of traffic to ensure all are covered.
+     */
+    uint32_t miniflow_study_pkts;
+
     struct seq *reload_seq;
     uint64_t last_reload_seq;
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 211fc533b..88db883a9 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -990,6 +990,92 @@  dpif_netdev_subtable_lookup_set(struct unixctl_conn *conn, int argc,
     ds_destroy(&reply);
 }
 
+static void
+dpif_miniflow_extract_template_add(struct unixctl_conn *conn, int argc,
+                                   const char *argv[], void *aux OVS_UNUSED)
+{
+    /* argv[1] is the template name. Optional argv[2] is the number of packets
+     * to study, and optional argv[3] identifies the datapath instance. */
+    const char *template_name = argv[1];
+
+    /* Today the code here is hard-coded to the specific miniflow extract
+     * implementation. This will be reworked to be generic like the DPCLS,
+     * where different implementations can be selected at runtime. */
+    int32_t isa_available = miniflow_extract_avx512_probe();
+    if (!isa_available) {
+        struct ds reply = DS_EMPTY_INITIALIZER;
+        ds_put_format(&reply, "Miniflow Extract %s not available.\n",
+                      template_name);
+        const char *reply_str = ds_cstr(&reply);
+        unixctl_command_reply(conn, reply_str);
+        VLOG_INFO("%s", reply_str);
+        ds_destroy(&reply);
+        return;
+    }
+
+    /* Insert the new pattern. There is ongoing work on designing the
+     * interaction between the string here, and the patterns in the miniflow
+     * extract optimized code. */
+    int32_t insert_err = miniflow_extract_avx512_insert(template_name);
+    if (insert_err) {
+        VLOG_DBG("error inserting pattern, returned %d", insert_err);
+    }
+
+    /* Optional argument, if passed, study this number of packets. Defaults
+     * to 10k. Counts that are zero, non-numeric or out of range are rejected
+     * instead of being silently converted to an unintended study length. */
+    uint32_t study_pkts = 10000;
+    if (argc >= 3) {
+        char *end;
+        unsigned long long count = strtoull(argv[2], &end, 10);
+        if (end == argv[2] || *end || !count || count > UINT32_MAX) {
+            unixctl_command_reply_error(conn, "invalid study packet count");
+            return;
+        }
+        study_pkts = (uint32_t) count;
+    }
+
+    ovs_mutex_lock(&dp_netdev_mutex);
+    struct dp_netdev *dp = NULL;
+    if (argc == 4) {
+        dp = shash_find_data(&dp_netdevs, argv[3]);
+    } else if (shash_count(&dp_netdevs) == 1) {
+        dp = shash_first(&dp_netdevs)->data;
+    }
+
+    if (!dp) {
+        ovs_mutex_unlock(&dp_netdev_mutex);
+        unixctl_command_reply_error(conn,
+                                    "please specify an existing datapath");
+        return;
+    }
+
+    /* Get PMD threads list; the list itself is released once applied. */
+    size_t n;
+    struct dp_netdev_pmd_thread **pmd_list;
+    sorted_poll_thread_list(dp, &pmd_list, &n);
+
+    for (size_t i = 0; i < n; i++) {
+        struct dp_netdev_pmd_thread *pmd = pmd_list[i];
+        if (pmd->core_id == NON_PMD_CORE_ID) {
+            continue;
+        }
+        /* Set the study packet budget before enabling the study function. */
+        pmd->miniflow_study_pkts = study_pkts;
+        pmd->miniflow_extract_opt = miniflow_extract_avx512_study;
+    }
+    free(pmd_list);
+    ovs_mutex_unlock(&dp_netdev_mutex);
+
+    /* Reply with success to command */
+    struct ds reply = DS_EMPTY_INITIALIZER;
+    ds_put_format(&reply, "miniflow template %s added.\n", template_name);
+    const char *reply_str = ds_cstr(&reply);
+    unixctl_command_reply(conn, reply_str);
+    VLOG_INFO("%s", reply_str);
+    ds_destroy(&reply);
+}
+
 static void
 dpif_netdev_impl_set(struct unixctl_conn *conn, int argc,
                      const char *argv[], void *aux OVS_UNUSED)
@@ -1288,6 +1374,10 @@  dpif_netdev_init(void)
                              "[dpif implementation name] [dp]",
                              1, 2, dpif_netdev_impl_set,
                              NULL);
+    unixctl_command_register("dpif-netdev/miniflow-template-add",
+                             "[template name] [study pkt count] [dp]",
+                             1, 3, dpif_miniflow_extract_template_add,
+                             NULL);
     return 0;
 }
 
@@ -6123,6 +6213,9 @@  dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     /* Initialize the DPIF function pointer to the default scalar version */
     pmd->netdev_input_func = dp_netdev_impl_get_default();
 
+    /* Initialize the miniflow extract function pointer not set */
+    pmd->miniflow_extract_opt = NULL;
+
     /* init the 'flow_cache' since there is no
      * actual thread created for NON_PMD_CORE_ID. */
     if (core_id == NON_PMD_CORE_ID) {