diff mbox series

[ovs-dev,v4,03/12] dpif-netdev: Add study function to select the best mfex function

Message ID 20210617162754.2028048-4-kumar.amber@intel.com
State Superseded
Headers show
Series MFEX Infrastructure + Optimizations | expand

Commit Message

Kumar Amber June 17, 2021, 4:27 p.m. UTC
The study function runs all the available implementations
of miniflow_extract and makes a choice whose hitmask has
maximum hits and sets the mfex to that function.

Study can be run at runtime using the following command:

$ ovs-appctl dpif-netdev/miniflow-parser-set study

Signed-off-by: Kumar Amber <kumar.amber@intel.com>
Co-authored-by: Harry van Haaren <harry.van.haaren@intel.com>
Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
---
 lib/automake.mk                   |   1 +
 lib/dpif-netdev-extract-study.c   | 119 ++++++++++++++++++++++++++++++
 lib/dpif-netdev-private-extract.c |   5 ++
 lib/dpif-netdev-private-extract.h |  14 +++-
 4 files changed, 138 insertions(+), 1 deletion(-)
 create mode 100644 lib/dpif-netdev-extract-study.c

Comments

Stokes, Ian June 24, 2021, 1:20 p.m. UTC | #1
> The study function runs all the available implementations
> of miniflow_extract and makes a choice whose hitmask has
> maximum hits and sets the mfex to that function.

Hi Amber/Harry,

Thanks for the patch, a few comments inline below.

> 
> Study can be run at runtime using the following command:
> 
> $ ovs-appctl dpif-netdev/miniflow-parser-set study
> 
> Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> Co-authored-by: Harry van Haaren <harry.van.haaren@intel.com>
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> ---
>  lib/automake.mk                   |   1 +
>  lib/dpif-netdev-extract-study.c   | 119 ++++++++++++++++++++++++++++++
>  lib/dpif-netdev-private-extract.c |   5 ++
>  lib/dpif-netdev-private-extract.h |  14 +++-
>  4 files changed, 138 insertions(+), 1 deletion(-)
>  create mode 100644 lib/dpif-netdev-extract-study.c
> 
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 6657b9ae5..3080bb04a 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -114,6 +114,7 @@ lib_libopenvswitch_la_SOURCES = \
>  	lib/dpif-netdev.c \
>  	lib/dpif-netdev.h \
>  	lib/dpif-netdev-private-dfc.c \
> +	lib/dpif-netdev-extract-study.c \
Headers should be added alphabetically.

>  	lib/dpif-netdev-private-dfc.h \
>  	lib/dpif-netdev-private-dpcls.h \
>  	lib/dpif-netdev-private-dpif.c \
> diff --git a/lib/dpif-netdev-extract-study.c b/lib/dpif-netdev-extract-study.c
> new file mode 100644
> index 000000000..d063d040c
> --- /dev/null
> +++ b/lib/dpif-netdev-extract-study.c
> @@ -0,0 +1,119 @@
> +/*
> + * Copyright (c) 2021 Intel.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +#include <errno.h>
> +#include <stdint.h>
> +#include <string.h>
> +
> +#include "dpif-netdev-private-extract.h"
> +#include "dpif-netdev-private-thread.h"
> +#include "openvswitch/vlog.h"
> +#include "ovs-thread.h"
> +
> +VLOG_DEFINE_THIS_MODULE(dpif_mfex_extract_study);
> +
> +/* Max size of packets to be compared. */
> +#define MFEX_MAX_COUNT (128)
> +
> +/* This value is the threshold for the amount of packets that
> + * must hit on the optimized miniflow extract before it will be
> + * accepted and used in the datapath after the study phase. */
> +#define MFEX_MIN_HIT_COUNT_FOR_USE (MFEX_MAX_COUNT / 2)
> +
> +/* Struct to hold miniflow study stats. */
> +struct study_stats {
> +    uint32_t pkt_count;
> +    uint32_t impl_hitcount[MFEX_IMPLS_MAX_SIZE];
> +};
> +
> +/* Define per thread data to hold the study stats. */
> +DEFINE_PER_THREAD_MALLOCED_DATA(struct study_stats *, study_stats);
> +
> +/* Allocate per thread PMD pointer space for study_stats. */
> +static inline struct study_stats *
> +get_study_stats(void)

Would maybe suggest a name change here: get_study_stats sounds as if info is being returned, whereas what's actually happening is that the memory for the stats is being provisioned.
> +{
> +    struct study_stats *stats = study_stats_get();
> +    if (OVS_UNLIKELY(!stats)) {
> +       stats = xzalloc(sizeof *stats);
> +       study_stats_set_unsafe(stats);
Can you explain why above is set unsafe? Where does that function originate from?

> +    }
> +    return stats;
> +}
> +
> +uint32_t
> +mfex_study_traffic(struct dp_packet_batch *packets,
> +                   struct netdev_flow_key *keys,
> +                   uint32_t keys_size, odp_port_t in_port,
> +                   void *pmd_handle)
> +{
> +    uint32_t hitmask = 0;
> +    uint32_t mask = 0;
> +    struct dp_netdev_pmd_thread *pmd = pmd_handle;
> +    struct dpif_miniflow_extract_impl *miniflow_funcs;
> +    uint32_t impl_count = dpif_miniflow_extract_info_get(&miniflow_funcs);
> +    struct study_stats *stats = get_study_stats();
> +
> +    /* Run traffic optimized miniflow_extract to collect the hitmask
> +     * to be compared after certain packets have been hit to choose
> +     * the best miniflow_extract version for that traffic. */

For the comment above, I would prefer to keep with the OVS coding style and have the closing comment delimiter vertically aligned.

https://docs.openvswitch.org/en/latest/internals/contributing/coding-style/#comments

/*
 *
 */

> +    for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> +        if (miniflow_funcs[i].available) {
> +            hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size,
> +                                                     in_port, pmd_handle);
> +            stats->impl_hitcount[i] += count_1bits(hitmask);
> +
> +            /* If traffic is not classified than we dont overwrite the keys
Typo above than -> then
> +             * array in minfiflow implementations so its safe to create a
> +             * mask for all those packets whose miniflow have been created. */
> +            mask |= hitmask;
> +        }
> +    }
> +    stats->pkt_count += dp_packet_batch_size(packets);
> +
> +    /* Choose the best implementation after a minimum packets have been
> +     * processed. */
> +    if (stats->pkt_count >= MFEX_MAX_COUNT) {
> +        uint32_t best_func_index = MFEX_IMPL_START_IDX;
> +        uint32_t max_hits = 0;
> +        for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> +            if (stats->impl_hitcount[i] > max_hits) {
> +                max_hits = stats->impl_hitcount[i];
> +                best_func_index = i;
> +            }
> +        }
> +
> +        if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
> +            /* Set the implementation to index with max_hits. */
> +            pmd->miniflow_extract_opt =
> +                        miniflow_funcs[best_func_index].extract_func;
> +            VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
> +                      miniflow_funcs[best_func_index].name, max_hits,
> +                      stats->pkt_count);
> +        } else {
> +            /* Set the implementation to null for default miniflow. */
> +            pmd->miniflow_extract_opt = NULL;
> +            VLOG_INFO("Not enough packets matched (%d/%d), disabling"
> +                      " optimized MFEX.\n", max_hits, stats->pkt_count);
> +        }
> +        /* Reset stats so that study function can be called again
> +         * for next traffic type and optimal function ptr can be
> +         * choosen. */

Typo, choosen -> chosen.

BR
Ian

> +        memset(stats, 0, sizeof(struct study_stats));
> +    }
> +    return mask;
> +}
> diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c
> index 0741c19f9..d86268a1d 100644
> --- a/lib/dpif-netdev-private-extract.c
> +++ b/lib/dpif-netdev-private-extract.c
> @@ -42,6 +42,11 @@ static struct dpif_miniflow_extract_impl mfex_impls[] = {
>          .extract_func = NULL,
>          .name = "disable",
>      },
> +    {
> +        .probe = NULL,
> +        .extract_func = mfex_study_traffic,
> +        .name = "study",
> +    },
>  };
> 
>  BUILD_ASSERT_DECL(MFEX_IMPLS_MAX_SIZE > ARRAY_SIZE(mfex_impls));
> diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h
> index 455a7b590..3ada413bb 100644
> --- a/lib/dpif-netdev-private-extract.h
> +++ b/lib/dpif-netdev-private-extract.h
> @@ -27,7 +27,7 @@
>  /* Skip the autovalidator study and null when iterating all available
>   * miniflow implementations.
>   */
> -#define MFEX_IMPL_START_IDX (1)
> +#define MFEX_IMPL_START_IDX (3)
> 
>  /* Forward declarations. */
>  struct dp_packet;
> @@ -106,4 +106,16 @@ dpif_miniflow_extract_autovalidator(struct
> dp_packet_batch *batch,
>                                      uint32_t keys_size, odp_port_t in_port,
>                                      void *pmd_handle);
> 
> +/* Retrieve the number of packets by studying packets using different miniflow
> + * implementations to choose the best implementation using the maximum
> hitmask
> + * count.
> + * On error, returns a zero for no packets.
> + * On success, returns mask of the packets hit.
> + */
> +uint32_t
> +mfex_study_traffic(struct dp_packet_batch *packets,
> +                   struct netdev_flow_key *keys,
> +                   uint32_t keys_size, odp_port_t in_port,
> +                   void *pmd_handle);
> +
>  #endif /* DPIF_NETDEV_AVX512_EXTRACT */
> --
> 2.25.1
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Van Haaren, Harry June 24, 2021, 2:38 p.m. UTC | #2
> -----Original Message-----
> From: Stokes, Ian <ian.stokes@intel.com>
> Sent: Thursday, June 24, 2021 2:20 PM
> To: Amber, Kumar <kumar.amber@intel.com>; dev@openvswitch.org; Van
> Haaren, Harry <harry.van.haaren@intel.com>
> Cc: Amber, Kumar <kumar.amber@intel.com>; i.maximets@ovn.org
> Subject: RE: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the
> best mfex function
> 
> > The study function runs all the available implementations
> > of miniflow_extract and makes a choice whose hitmask has
> > maximum hits and sets the mfex to that function.
> 
> Hi Amber/Harry,
> 
> Thanks for the patch, a few comments inline below.

Thanks for review. Just addressing the stats get/TLS topic here.
<snip other patch changes>

> > +/* Struct to hold miniflow study stats. */
> > +struct study_stats {
> > +    uint32_t pkt_count;
> > +    uint32_t impl_hitcount[MFEX_IMPLS_MAX_SIZE];
> > +};
> > +
> > +/* Define per thread data to hold the study stats. */
> > +DEFINE_PER_THREAD_MALLOCED_DATA(struct study_stats *, study_stats);
> > +
> > +/* Allocate per thread PMD pointer space for study_stats. */
> > +static inline struct study_stats *
> > +get_study_stats(void)
> 
> Would maybe suggest a name change here, get_study_stats sounds as if info is
> being returned whereas whats actually happening is that the memory for the
> stats are being provisioned.

More context for explaining below...

> > +{
> > +    struct study_stats *stats = study_stats_get();
> > +    if (OVS_UNLIKELY(!stats)) {
> > +       stats = xzalloc(sizeof *stats);
> > +       study_stats_set_unsafe(stats);
> Can you explain why above is set unsafe? Where does that function originate
> from?

Yes, this is how the OVS "per thread data" (also called "Thread Local Storage" or TLS)
is implemented. The "get()" function indeed allocates the memory first time that this
thread actually accesses it, and any time after that it just returns the per-thread allocated
data pointer.

The "unsafe" is essentially the API used to change a TLS variable. It must only be called
by the thread that's using it itself, hence the unsafe() AFAIK.

The same function naming etc is used in DPCLS already, where this was the recommended
method of getting/using TLS data.

dpif-netdev-lookup-generic.c +47   function has "get_blocks_scratch()" which performs
approximately the same functionality as here. 

Hope that clears up that topic, regards, -Harry
Kumar Amber June 24, 2021, 2:39 p.m. UTC | #3
Hi Ian,

Thanks Again, replies are inline.

<Snipped>

> > @@ -114,6 +114,7 @@ lib_libopenvswitch_la_SOURCES = \
> > lib/dpif-netdev.c \  lib/dpif-netdev.h \
> > lib/dpif-netdev-private-dfc.c \
> > +lib/dpif-netdev-extract-study.c \
> Headers should be added alphabetically.
> 

Fixed in v5.
> >  lib/dpif-netdev-private-dfc.h \
> >  lib/dpif-netdev-private-dpcls.h \
> >  lib/dpif-netdev-private-dpif.c \
> > diff --git a/lib/dpif-netdev-extract-study.c
> > b/lib/dpif-netdev-extract-study.c new file mode 100644 index
> > 000000000..d063d040c
> > --- /dev/null
> > +++ b/lib/dpif-netdev-extract-study.c
> > @@ -0,0 +1,119 @@
> > +/*
> > + * Copyright (c) 2021 Intel.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing,
> > +software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> > implied.
> > + * See the License for the specific language governing permissions
> > + and
> > + * limitations under the License.
> > + */
> > +
> > +#include <config.h>
> > +#include <errno.h>
> > +#include <stdint.h>
> > +#include <string.h>
> > +
> > +#include "dpif-netdev-private-extract.h"
> > +#include "dpif-netdev-private-thread.h"
> > +#include "openvswitch/vlog.h"
> > +#include "ovs-thread.h"
> > +
> > +VLOG_DEFINE_THIS_MODULE(dpif_mfex_extract_study);
> > +
> > +/* Max size of packets to be compared. */ #define MFEX_MAX_COUNT
> > +(128)
> > +
> > +/* This value is the threshold for the amount of packets that
> > + * must hit on the optimized miniflow extract before it will be
> > + * accepted and used in the datapath after the study phase. */
> > +#define MFEX_MIN_HIT_COUNT_FOR_USE (MFEX_MAX_COUNT / 2)
> > +
> > +/* Struct to hold miniflow study stats. */ struct study_stats {
> > +    uint32_t pkt_count;
> > +    uint32_t impl_hitcount[MFEX_IMPLS_MAX_SIZE];
> > +};
> > +
> > +/* Define per thread data to hold the study stats. */
> > +DEFINE_PER_THREAD_MALLOCED_DATA(struct study_stats *,
> study_stats);
> > +
> > +/* Allocate per thread PMD pointer space for study_stats. */ static
> > +inline struct study_stats *
> > +get_study_stats(void)
> 
> Would maybe suggest a name change here, get_study_stats sounds as if info
> is being returned whereas whats actually happening is that the memory for
> the stats are being provisioned.

Fixed in v5.  Renamed to get_study_stats_ptr().
> > +{
> > +    struct study_stats *stats = study_stats_get();
> > +    if (OVS_UNLIKELY(!stats)) {
> > +       stats = xzalloc(sizeof *stats);
> > +       study_stats_set_unsafe(stats);
> Can you explain why above is set unsafe? Where does that function
> originate from?
> 
> > +    }
> > +    return stats;
> > +}
> > +
> > +uint32_t
> > +mfex_study_traffic(struct dp_packet_batch *packets,
> > +                   struct netdev_flow_key *keys,
> > +                   uint32_t keys_size, odp_port_t in_port,
> > +                   void *pmd_handle)
> > +{
> > +    uint32_t hitmask = 0;
> > +    uint32_t mask = 0;
> > +    struct dp_netdev_pmd_thread *pmd = pmd_handle;
> > +    struct dpif_miniflow_extract_impl *miniflow_funcs;
> > +    uint32_t impl_count =
> dpif_miniflow_extract_info_get(&miniflow_funcs);
> > +    struct study_stats *stats = get_study_stats();
> > +
> > +    /* Run traffic optimized miniflow_extract to collect the hitmask
> > +     * to be compared after certain packets have been hit to choose
> > +     * the best miniflow_extract version for that traffic. */
> 
> For the comment above would prefer to keep with the OVS coding style and
> close comment vertically aligned.
> 
> https://docs.openvswitch.org/en/latest/internals/contributing/coding-
> style/#comments

Fixed at all the places in v5.
> 
> /*
>  *
>  */
> 
> > +    for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> > +        if (miniflow_funcs[i].available) {
> > +            hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size,
> > +                                                     in_port, pmd_handle);
> > +            stats->impl_hitcount[i] += count_1bits(hitmask);
> > +
> > +            /* If traffic is not classified than we dont overwrite
> > + the keys
> Typo above than -> then

Fixed in v5.
> > +             * array in minfiflow implementations so its safe to create a
> > +             * mask for all those packets whose miniflow have been created.
> */
> > +            mask |= hitmask;
> > +        }
> > +    }
> > +    stats->pkt_count += dp_packet_batch_size(packets);
> > +
> > +    /* Choose the best implementation after a minimum packets have
> been
> > +     * processed. */
> > +    if (stats->pkt_count >= MFEX_MAX_COUNT) {
> > +        uint32_t best_func_index = MFEX_IMPL_START_IDX;
> > +        uint32_t max_hits = 0;
> > +        for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> > +            if (stats->impl_hitcount[i] > max_hits) {
> > +                max_hits = stats->impl_hitcount[i];
> > +                best_func_index = i;
> > +            }
> > +        }
> > +
> > +        if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
> > +            /* Set the implementation to index with max_hits. */
> > +            pmd->miniflow_extract_opt =
> > +                        miniflow_funcs[best_func_index].extract_func;
> > +            VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
> > +                      miniflow_funcs[best_func_index].name, max_hits,
> > +                      stats->pkt_count);
> > +        } else {
> > +            /* Set the implementation to null for default miniflow. */
> > +            pmd->miniflow_extract_opt = NULL;
> > +            VLOG_INFO("Not enough packets matched (%d/%d), disabling"
> > +                      " optimized MFEX.\n", max_hits, stats->pkt_count);
> > +        }
> > +        /* Reset stats so that study function can be called again
> > +         * for next traffic type and optimal function ptr can be
> > +         * choosen. */
> 
> Typo, choosen -> chosen.

Fixed in v5.
> 
> BR
> Ian
> 
> > +        memset(stats, 0, sizeof(struct study_stats));
> > +    }
> > +    return mask;
> > +}
> > diff --git a/lib/dpif-netdev-private-extract.c
> > b/lib/dpif-netdev-private-extract.c
> > index 0741c19f9..d86268a1d 100644
> > --- a/lib/dpif-netdev-private-extract.c
> > +++ b/lib/dpif-netdev-private-extract.c
> > @@ -42,6 +42,11 @@ static struct dpif_miniflow_extract_impl
> mfex_impls[] = {
> >          .extract_func = NULL,
> >          .name = "disable",
> >      },
> > +    {
> > +        .probe = NULL,
> > +        .extract_func = mfex_study_traffic,
> > +        .name = "study",
> > +    },
> >  };
> >
> >  BUILD_ASSERT_DECL(MFEX_IMPLS_MAX_SIZE > ARRAY_SIZE(mfex_impls));
> diff
> > --git a/lib/dpif-netdev-private-extract.h
> > b/lib/dpif-netdev-private-extract.h
> > index 455a7b590..3ada413bb 100644
> > --- a/lib/dpif-netdev-private-extract.h
> > +++ b/lib/dpif-netdev-private-extract.h
> > @@ -27,7 +27,7 @@
> >  /* Skip the autovalidator study and null when iterating all available
> >   * miniflow implementations.
> >   */
> > -#define MFEX_IMPL_START_IDX (1)
> > +#define MFEX_IMPL_START_IDX (3)
> >
> >  /* Forward declarations. */
> >  struct dp_packet;
> > @@ -106,4 +106,16 @@ dpif_miniflow_extract_autovalidator(struct
> > dp_packet_batch *batch,
> >                                      uint32_t keys_size, odp_port_t in_port,
> >                                      void *pmd_handle);
> >
> > +/* Retrieve the number of packets by studying packets using different
> > +miniflow
> > + * implementations to choose the best implementation using the
> > +maximum
> > hitmask
> > + * count.
> > + * On error, returns a zero for no packets.
> > + * On success, returns mask of the packets hit.
> > + */
> > +uint32_t
> > +mfex_study_traffic(struct dp_packet_batch *packets,
> > +                   struct netdev_flow_key *keys,
> > +                   uint32_t keys_size, odp_port_t in_port,
> > +                   void *pmd_handle);
> > +
> >  #endif /* DPIF_NETDEV_AVX512_EXTRACT */
> > --
> > 2.25.1
> >
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Flavio Leitner June 28, 2021, 2:51 a.m. UTC | #4
Hi,

On Thu, Jun 17, 2021 at 09:57:45PM +0530, Kumar Amber wrote:
> The study function runs all the available implementations
> of miniflow_extract and makes a choice whose hitmask has
> maximum hits and sets the mfex to that function.
> 
> Study can be run at runtime using the following command:
> 
> $ ovs-appctl dpif-netdev/miniflow-parser-set study

Nice!


> 
> Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> Co-authored-by: Harry van Haaren <harry.van.haaren@intel.com>
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> ---
>  lib/automake.mk                   |   1 +
>  lib/dpif-netdev-extract-study.c   | 119 ++++++++++++++++++++++++++++++
>  lib/dpif-netdev-private-extract.c |   5 ++
>  lib/dpif-netdev-private-extract.h |  14 +++-
>  4 files changed, 138 insertions(+), 1 deletion(-)
>  create mode 100644 lib/dpif-netdev-extract-study.c
> 
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 6657b9ae5..3080bb04a 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -114,6 +114,7 @@ lib_libopenvswitch_la_SOURCES = \
>  	lib/dpif-netdev.c \
>  	lib/dpif-netdev.h \
>  	lib/dpif-netdev-private-dfc.c \
> +	lib/dpif-netdev-extract-study.c \

Wrong order?

>  	lib/dpif-netdev-private-dfc.h \
>  	lib/dpif-netdev-private-dpcls.h \
>  	lib/dpif-netdev-private-dpif.c \
> diff --git a/lib/dpif-netdev-extract-study.c b/lib/dpif-netdev-extract-study.c
> new file mode 100644
> index 000000000..d063d040c
> --- /dev/null
> +++ b/lib/dpif-netdev-extract-study.c
> @@ -0,0 +1,119 @@
> +/*
> + * Copyright (c) 2021 Intel.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +#include <errno.h>
> +#include <stdint.h>
> +#include <string.h>
> +
> +#include "dpif-netdev-private-extract.h"
> +#include "dpif-netdev-private-thread.h"
> +#include "openvswitch/vlog.h"
> +#include "ovs-thread.h"
> +
> +VLOG_DEFINE_THIS_MODULE(dpif_mfex_extract_study);
> +
> +/* Max size of packets to be compared. */

Size or number?

> +#define MFEX_MAX_COUNT (128)
> +
> +/* This value is the threshold for the amount of packets that
> + * must hit on the optimized miniflow extract before it will be
> + * accepted and used in the datapath after the study phase. */
> +#define MFEX_MIN_HIT_COUNT_FOR_USE (MFEX_MAX_COUNT / 2)
> +
> +/* Struct to hold miniflow study stats. */
> +struct study_stats {
> +    uint32_t pkt_count;
> +    uint32_t impl_hitcount[MFEX_IMPLS_MAX_SIZE];
> +};
> +
> +/* Define per thread data to hold the study stats. */
> +DEFINE_PER_THREAD_MALLOCED_DATA(struct study_stats *, study_stats);
> +
> +/* Allocate per thread PMD pointer space for study_stats. */
> +static inline struct study_stats *
> +get_study_stats(void)

Please define some prefix name for this module, like
for example mfex_study_<something>, to have a convention.


> +{
> +    struct study_stats *stats = study_stats_get();
> +    if (OVS_UNLIKELY(!stats)) {
> +       stats = xzalloc(sizeof *stats);
> +       study_stats_set_unsafe(stats);
> +    }
> +    return stats;
> +}
> +
> +uint32_t
> +mfex_study_traffic(struct dp_packet_batch *packets,
> +                   struct netdev_flow_key *keys,
> +                   uint32_t keys_size, odp_port_t in_port,
> +                   void *pmd_handle)
> +{
> +    uint32_t hitmask = 0;
> +    uint32_t mask = 0;
> +    struct dp_netdev_pmd_thread *pmd = pmd_handle;
> +    struct dpif_miniflow_extract_impl *miniflow_funcs;
> +    uint32_t impl_count = dpif_miniflow_extract_info_get(&miniflow_funcs);
> +    struct study_stats *stats = get_study_stats();
> +
> +    /* Run traffic optimized miniflow_extract to collect the hitmask
> +     * to be compared after certain packets have been hit to choose
> +     * the best miniflow_extract version for that traffic. */
> +    for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> +        if (miniflow_funcs[i].available) {
> +            hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size,
> +                                                     in_port, pmd_handle);
> +            stats->impl_hitcount[i] += count_1bits(hitmask);
> +
> +            /* If traffic is not classified than we dont overwrite the keys
> +             * array in minfiflow implementations so its safe to create a
> +             * mask for all those packets whose miniflow have been created. */
> +            mask |= hitmask;
> +        }
> +    }
> +    stats->pkt_count += dp_packet_batch_size(packets);
> +
> +    /* Choose the best implementation after a minimum packets have been
> +     * processed. */
> +    if (stats->pkt_count >= MFEX_MAX_COUNT) {
> +        uint32_t best_func_index = MFEX_IMPL_START_IDX;
> +        uint32_t max_hits = 0;
> +        for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> +            if (stats->impl_hitcount[i] > max_hits) {
> +                max_hits = stats->impl_hitcount[i];
> +                best_func_index = i;
> +            }
> +        }
> +
> +        if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
> +            /* Set the implementation to index with max_hits. */
> +            pmd->miniflow_extract_opt =
> +                        miniflow_funcs[best_func_index].extract_func;
> +            VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
> +                      miniflow_funcs[best_func_index].name, max_hits,
> +                      stats->pkt_count);

No need to terminate with \n when using VLOG_*

> +        } else {
> +            /* Set the implementation to null for default miniflow. */
> +            pmd->miniflow_extract_opt = NULL;
> +            VLOG_INFO("Not enough packets matched (%d/%d), disabling"
> +                      " optimized MFEX.\n", max_hits, stats->pkt_count);
> +        }
> +        /* Reset stats so that study function can be called again
> +         * for next traffic type and optimal function ptr can be
> +         * choosen. */
> +        memset(stats, 0, sizeof(struct study_stats));
> +    }
> +    return mask;
> +}
> diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c
> index 0741c19f9..d86268a1d 100644
> --- a/lib/dpif-netdev-private-extract.c
> +++ b/lib/dpif-netdev-private-extract.c
> @@ -42,6 +42,11 @@ static struct dpif_miniflow_extract_impl mfex_impls[] = {
>          .extract_func = NULL,
>          .name = "disable",
>      },
> +    {
> +        .probe = NULL,
> +        .extract_func = mfex_study_traffic,
> +        .name = "study",
> +    },
>  };
>  
>  BUILD_ASSERT_DECL(MFEX_IMPLS_MAX_SIZE > ARRAY_SIZE(mfex_impls));

Since there is no dynamic registration of mfex implementations, using
the enum suggested on an earlier patch we could have its last entry
serve as a replacement for MFEX_IMPLS_MAX_SIZE.


> diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h
> index 455a7b590..3ada413bb 100644
> --- a/lib/dpif-netdev-private-extract.h
> +++ b/lib/dpif-netdev-private-extract.h
> @@ -27,7 +27,7 @@
>  /* Skip the autovalidator study and null when iterating all available
>   * miniflow implementations.
>   */
> -#define MFEX_IMPL_START_IDX (1)
> +#define MFEX_IMPL_START_IDX (3)
>  
>  /* Forward declarations. */
>  struct dp_packet;
> @@ -106,4 +106,16 @@ dpif_miniflow_extract_autovalidator(struct dp_packet_batch *batch,
>                                      uint32_t keys_size, odp_port_t in_port,
>                                      void *pmd_handle);
>  
> +/* Retrieve the number of packets by studying packets using different miniflow
> + * implementations to choose the best implementation using the maximum hitmask
> + * count.
> + * On error, returns a zero for no packets.
> + * On success, returns mask of the packets hit.
> + */
> +uint32_t
> +mfex_study_traffic(struct dp_packet_batch *packets,
> +                   struct netdev_flow_key *keys,
> +                   uint32_t keys_size, odp_port_t in_port,
> +                   void *pmd_handle);
> +
>  #endif /* DPIF_NETDEV_AVX512_EXTRACT */
> -- 
> 2.25.1
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Kumar Amber June 29, 2021, 3:46 a.m. UTC | #5
Hi Flavio,

Thanks again and my replies are inline.

> -----Original Message-----
> From: Flavio Leitner <fbl@sysclose.org>
> Sent: Monday, June 28, 2021 8:22 AM
> To: Amber, Kumar <kumar.amber@intel.com>
> Cc: dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select
> the best mfex function
> 
> 
> Hi,
> 
> On Thu, Jun 17, 2021 at 09:57:45PM +0530, Kumar Amber wrote:
> > The study function runs all the available implementations of
> > miniflow_extract and makes a choice whose hitmask has maximum hits
> and
> > sets the mfex to that function.
> >
> > Study can be run at runtime using the following command:
> >
> > $ ovs-appctl dpif-netdev/miniflow-parser-set study
> 
> Nice!
>

😊
 
> 
> >
> > Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> > Co-authored-by: Harry van Haaren <harry.van.haaren@intel.com>
> > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > ---
> >  lib/automake.mk                   |   1 +
> >  lib/dpif-netdev-extract-study.c   | 119 ++++++++++++++++++++++++++++++
> >  lib/dpif-netdev-private-extract.c |   5 ++
> >  lib/dpif-netdev-private-extract.h |  14 +++-
> >  4 files changed, 138 insertions(+), 1 deletion(-)  create mode 100644
> > lib/dpif-netdev-extract-study.c
> >
> > diff --git a/lib/automake.mk b/lib/automake.mk index
> > 6657b9ae5..3080bb04a 100644
> > --- a/lib/automake.mk
> > +++ b/lib/automake.mk
> > @@ -114,6 +114,7 @@ lib_libopenvswitch_la_SOURCES = \
> >  	lib/dpif-netdev.c \
> >  	lib/dpif-netdev.h \
> >  	lib/dpif-netdev-private-dfc.c \
> > +	lib/dpif-netdev-extract-study.c \
> 
> Wrong order?
> 

Fixed in v5.

> >  	lib/dpif-netdev-private-dfc.h \
> >  	lib/dpif-netdev-private-dpcls.h \
> >  	lib/dpif-netdev-private-dpif.c \
> > diff --git a/lib/dpif-netdev-extract-study.c
> > b/lib/dpif-netdev-extract-study.c new file mode 100644 index
> > 000000000..d063d040c
> > --- /dev/null
> > +++ b/lib/dpif-netdev-extract-study.c
> > @@ -0,0 +1,119 @@
> > +/*
> > + * Copyright (c) 2021 Intel.
> > + *
> > + * Licensed under the Apache License, Version 2.0 (the "License");
> > + * you may not use this file except in compliance with the License.
> > + * You may obtain a copy of the License at:
> > + *
> > + *     http://www.apache.org/licenses/LICENSE-2.0
> > + *
> > + * Unless required by applicable law or agreed to in writing,
> > +software
> > + * distributed under the License is distributed on an "AS IS" BASIS,
> > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
> implied.
> > + * See the License for the specific language governing permissions
> > +and
> > + * limitations under the License.
> > + */
> > +
> > +#include <config.h>
> > +#include <errno.h>
> > +#include <stdint.h>
> > +#include <string.h>
> > +
> > +#include "dpif-netdev-private-extract.h"
> > +#include "dpif-netdev-private-thread.h"
> > +#include "openvswitch/vlog.h"
> > +#include "ovs-thread.h"
> > +
> > +VLOG_DEFINE_THIS_MODULE(dpif_mfex_extract_study);
> > +
> > +/* Max size of packets to be compared. */
> 
> Size or number?
> 

Typo fixed.

> > +#define MFEX_MAX_COUNT (128)
> > +
> > +/* This value is the threshold for the amount of packets that
> > + * must hit on the optimized miniflow extract before it will be
> > + * accepted and used in the datapath after the study phase. */
> > +#define MFEX_MIN_HIT_COUNT_FOR_USE (MFEX_MAX_COUNT / 2)
> > +
> > +/* Struct to hold miniflow study stats. */ struct study_stats {
> > +    uint32_t pkt_count;
> > +    uint32_t impl_hitcount[MFEX_IMPLS_MAX_SIZE];
> > +};
> > +
> > +/* Define per thread data to hold the study stats. */
> > +DEFINE_PER_THREAD_MALLOCED_DATA(struct study_stats *,
> study_stats);
> > +
> > +/* Allocate per thread PMD pointer space for study_stats. */ static
> > +inline struct study_stats *
> > +get_study_stats(void)
> 
> Please define some prefix name for this module, like for example
> mfex_study_<something>, to have a convention.
>

Using mfex_study_get_study_stats as name in v5.
 
> 
> > +{
> > +    struct study_stats *stats = study_stats_get();
> > +    if (OVS_UNLIKELY(!stats)) {
> > +       stats = xzalloc(sizeof *stats);
> > +       study_stats_set_unsafe(stats);
> > +    }
> > +    return stats;
> > +}
> > +
> > +uint32_t
> > +mfex_study_traffic(struct dp_packet_batch *packets,
> > +                   struct netdev_flow_key *keys,
> > +                   uint32_t keys_size, odp_port_t in_port,
> > +                   void *pmd_handle)
> > +{
> > +    uint32_t hitmask = 0;
> > +    uint32_t mask = 0;
> > +    struct dp_netdev_pmd_thread *pmd = pmd_handle;
> > +    struct dpif_miniflow_extract_impl *miniflow_funcs;
> > +    uint32_t impl_count =
> dpif_miniflow_extract_info_get(&miniflow_funcs);
> > +    struct study_stats *stats = get_study_stats();
> > +
> > +    /* Run traffic optimized miniflow_extract to collect the hitmask
> > +     * to be compared after certain packets have been hit to choose
> > +     * the best miniflow_extract version for that traffic. */
> > +    for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> > +        if (miniflow_funcs[i].available) {
> > +            hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size,
> > +                                                     in_port, pmd_handle);
> > +            stats->impl_hitcount[i] += count_1bits(hitmask);
> > +
> > +            /* If traffic is not classified than we dont overwrite the keys
> > +             * array in minfiflow implementations so its safe to create a
> > +             * mask for all those packets whose miniflow have been created.
> */
> > +            mask |= hitmask;
> > +        }
> > +    }
> > +    stats->pkt_count += dp_packet_batch_size(packets);
> > +
> > +    /* Choose the best implementation after a minimum packets have
> been
> > +     * processed. */
> > +    if (stats->pkt_count >= MFEX_MAX_COUNT) {
> > +        uint32_t best_func_index = MFEX_IMPL_START_IDX;
> > +        uint32_t max_hits = 0;
> > +        for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> > +            if (stats->impl_hitcount[i] > max_hits) {
> > +                max_hits = stats->impl_hitcount[i];
> > +                best_func_index = i;
> > +            }
> > +        }
> > +
> > +        if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
> > +            /* Set the implementation to index with max_hits. */
> > +            pmd->miniflow_extract_opt =
> > +                        miniflow_funcs[best_func_index].extract_func;
> > +            VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
> > +                      miniflow_funcs[best_func_index].name, max_hits,
> > +                      stats->pkt_count);
> 
> No need to terminate with \n when using VLOG_*
> 

Removed in all Vlogs.
> > +        } else {
> > +            /* Set the implementation to null for default miniflow. */
> > +            pmd->miniflow_extract_opt = NULL;
> > +            VLOG_INFO("Not enough packets matched (%d/%d), disabling"
> > +                      " optimized MFEX.\n", max_hits, stats->pkt_count);
> > +        }
> > +        /* Reset stats so that study function can be called again
> > +         * for next traffic type and optimal function ptr can be
> > +         * choosen. */
> > +        memset(stats, 0, sizeof(struct study_stats));
> > +    }
> > +    return mask;
> > +}
> > diff --git a/lib/dpif-netdev-private-extract.c
> > b/lib/dpif-netdev-private-extract.c
> > index 0741c19f9..d86268a1d 100644
> > --- a/lib/dpif-netdev-private-extract.c
> > +++ b/lib/dpif-netdev-private-extract.c
> > @@ -42,6 +42,11 @@ static struct dpif_miniflow_extract_impl
> mfex_impls[] = {
> >          .extract_func = NULL,
> >          .name = "disable",
> >      },
> > +    {
> > +        .probe = NULL,
> > +        .extract_func = mfex_study_traffic,
> > +        .name = "study",
> > +    },
> >  };
> >
> >  BUILD_ASSERT_DECL(MFEX_IMPLS_MAX_SIZE > ARRAY_SIZE(mfex_impls));
> 
> Since there is no dynamic registration of mfex implementation, then using
> the enum suggested on an earlier patch we would have the last entry as
> replacement for MFEX_IMPLS_MAX_SIZE.
> 
> 
Using MFEX_IMPLS_MAX as end Enum marker.

> > diff --git a/lib/dpif-netdev-private-extract.h
> > b/lib/dpif-netdev-private-extract.h
> > index 455a7b590..3ada413bb 100644
> > --- a/lib/dpif-netdev-private-extract.h
> > +++ b/lib/dpif-netdev-private-extract.h
> > @@ -27,7 +27,7 @@
> >  /* Skip the autovalidator study and null when iterating all available
> >   * miniflow implementations.
> >   */
> > -#define MFEX_IMPL_START_IDX (1)
> > +#define MFEX_IMPL_START_IDX (3)
> >
> >  /* Forward declarations. */
> >  struct dp_packet;
> > @@ -106,4 +106,16 @@ dpif_miniflow_extract_autovalidator(struct
> dp_packet_batch *batch,
> >                                      uint32_t keys_size, odp_port_t in_port,
> >                                      void *pmd_handle);
> >
> > +/* Retrieve the number of packets by studying packets using different
> > +miniflow
> > + * implementations to choose the best implementation using the
> > +maximum hitmask
> > + * count.
> > + * On error, returns a zero for no packets.
> > + * On success, returns mask of the packets hit.
> > + */
> > +uint32_t
> > +mfex_study_traffic(struct dp_packet_batch *packets,
> > +                   struct netdev_flow_key *keys,
> > +                   uint32_t keys_size, odp_port_t in_port,
> > +                   void *pmd_handle);
> > +
> >  #endif /* DPIF_NETDEV_AVX512_EXTRACT */
> > --
> > 2.25.1
> >
> > _______________________________________________
> > dev mailing list
> > dev@openvswitch.org
> > https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> 
> --
> fbl
Eelco Chaudron June 29, 2021, 12:38 p.m. UTC | #6
More comments below. FYI I’m only reviewing right now, no testing.

//Eelco


On 17 Jun 2021, at 18:27, Kumar Amber wrote:

> The study function runs all the available implementations
> of miniflow_extract and makes a choice whose hitmask has
> maximum hits and sets the mfex to that function.
>
> Study can be run at runtime using the following command:
>
> $ ovs-appctl dpif-netdev/miniflow-parser-set study
>
> Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> Co-authored-by: Harry van Haaren <harry.van.haaren@intel.com>
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> ---
>  lib/automake.mk                   |   1 +
>  lib/dpif-netdev-extract-study.c   | 119 ++++++++++++++++++++++++++++++
>  lib/dpif-netdev-private-extract.c |   5 ++
>  lib/dpif-netdev-private-extract.h |  14 +++-
>  4 files changed, 138 insertions(+), 1 deletion(-)
>  create mode 100644 lib/dpif-netdev-extract-study.c
>
> diff --git a/lib/automake.mk b/lib/automake.mk
> index 6657b9ae5..3080bb04a 100644
> --- a/lib/automake.mk
> +++ b/lib/automake.mk
> @@ -114,6 +114,7 @@ lib_libopenvswitch_la_SOURCES = \
>  	lib/dpif-netdev.c \
>  	lib/dpif-netdev.h \
>  	lib/dpif-netdev-private-dfc.c \
> +	lib/dpif-netdev-extract-study.c \
>  	lib/dpif-netdev-private-dfc.h \
>  	lib/dpif-netdev-private-dpcls.h \
>  	lib/dpif-netdev-private-dpif.c \
> diff --git a/lib/dpif-netdev-extract-study.c b/lib/dpif-netdev-extract-study.c
> new file mode 100644
> index 000000000..d063d040c
> --- /dev/null
> +++ b/lib/dpif-netdev-extract-study.c
> @@ -0,0 +1,119 @@
> +/*
> + * Copyright (c) 2021 Intel.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#include <config.h>
> +#include <errno.h>
> +#include <stdint.h>
> +#include <string.h>
> +
> +#include "dpif-netdev-private-extract.h"
> +#include "dpif-netdev-private-thread.h"
> +#include "openvswitch/vlog.h"
> +#include "ovs-thread.h"
> +
> +VLOG_DEFINE_THIS_MODULE(dpif_mfex_extract_study);
> +
> +/* Max size of packets to be compared. */
> +#define MFEX_MAX_COUNT (128)
> +
> +/* This value is the threshold for the amount of packets that
> + * must hit on the optimized miniflow extract before it will be
> + * accepted and used in the datapath after the study phase. */
> +#define MFEX_MIN_HIT_COUNT_FOR_USE (MFEX_MAX_COUNT / 2)
> +
> +/* Struct to hold miniflow study stats. */
> +struct study_stats {
> +    uint32_t pkt_count;
> +    uint32_t impl_hitcount[MFEX_IMPLS_MAX_SIZE];
> +};
> +
> +/* Define per thread data to hold the study stats. */
> +DEFINE_PER_THREAD_MALLOCED_DATA(struct study_stats *, study_stats);
> +
> +/* Allocate per thread PMD pointer space for study_stats. */
> +static inline struct study_stats *
> +get_study_stats(void)
> +{
> +    struct study_stats *stats = study_stats_get();
> +    if (OVS_UNLIKELY(!stats)) {
> +       stats = xzalloc(sizeof *stats);
> +       study_stats_set_unsafe(stats);
> +    }
> +    return stats;
> +}
> +

Just got a mind-meld with the code, and realized that the function might be different per PMD thread due to this auto mode (and autovalidator mode in the previous patch).

This makes the case even stronger that we need a way to see the currently selected mode — and not per datapath, but per PMD per datapath!

Do we also need a way to set this per PMD?

> +uint32_t
> +mfex_study_traffic(struct dp_packet_batch *packets,
> +                   struct netdev_flow_key *keys,
> +                   uint32_t keys_size, odp_port_t in_port,
> +                   void *pmd_handle)
> +{
> +    uint32_t hitmask = 0;
> +    uint32_t mask = 0;
> +    struct dp_netdev_pmd_thread *pmd = pmd_handle;
> +    struct dpif_miniflow_extract_impl *miniflow_funcs;
> +    uint32_t impl_count = dpif_miniflow_extract_info_get(&miniflow_funcs);
> +    struct study_stats *stats = get_study_stats();
> +
> +    /* Run traffic optimized miniflow_extract to collect the hitmask
> +     * to be compared after certain packets have been hit to choose
> +     * the best miniflow_extract version for that traffic. */
> +    for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> +        if (miniflow_funcs[i].available) {
> +            hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size,
> +                                                     in_port, pmd_handle);
> +            stats->impl_hitcount[i] += count_1bits(hitmask);
> +
> +            /* If traffic is not classified than we dont overwrite the keys
> +             * array in minfiflow implementations so its safe to create a
> +             * mask for all those packets whose miniflow have been created. */
> +            mask |= hitmask;
> +        }
> +    }
> +    stats->pkt_count += dp_packet_batch_size(packets);
> +
> +    /* Choose the best implementation after a minimum packets have been
> +     * processed. */
> +    if (stats->pkt_count >= MFEX_MAX_COUNT) {
> +        uint32_t best_func_index = MFEX_IMPL_START_IDX;
> +        uint32_t max_hits = 0;
> +        for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> +            if (stats->impl_hitcount[i] > max_hits) {
> +                max_hits = stats->impl_hitcount[i];
> +                best_func_index = i;
> +            }
> +        }
> +
> +        if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
> +            /* Set the implementation to index with max_hits. */
> +            pmd->miniflow_extract_opt =
> +                        miniflow_funcs[best_func_index].extract_func;
> +            VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
> +                      miniflow_funcs[best_func_index].name, max_hits,
> +                      stats->pkt_count);

We have no idea which PMD the mode is selected for; I guess we might need to add this?

Maybe we should report the numbers/hits for the other methods, as they might be equal, and some might be faster in execution time?

> +        } else {
> +            /* Set the implementation to null for default miniflow. */
> +            pmd->miniflow_extract_opt = NULL;
> +            VLOG_INFO("Not enough packets matched (%d/%d), disabling"
> +                      " optimized MFEX.\n", max_hits, stats->pkt_count);
> +        }
> +        /* Reset stats so that study function can be called again
> +         * for next traffic type and optimal function ptr can be
> +         * choosen. */
> +        memset(stats, 0, sizeof(struct study_stats));
> +    }
> +    return mask;
> +}
> diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c
> index 0741c19f9..d86268a1d 100644
> --- a/lib/dpif-netdev-private-extract.c
> +++ b/lib/dpif-netdev-private-extract.c
> @@ -42,6 +42,11 @@ static struct dpif_miniflow_extract_impl mfex_impls[] = {
>          .extract_func = NULL,
>          .name = "disable",
>      },
> +    {
> +        .probe = NULL,
> +        .extract_func = mfex_study_traffic,
> +        .name = "study",
> +    },
>  };
>
>  BUILD_ASSERT_DECL(MFEX_IMPLS_MAX_SIZE > ARRAY_SIZE(mfex_impls));
> diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h
> index 455a7b590..3ada413bb 100644
> --- a/lib/dpif-netdev-private-extract.h
> +++ b/lib/dpif-netdev-private-extract.h
> @@ -27,7 +27,7 @@
>  /* Skip the autovalidator study and null when iterating all available
>   * miniflow implementations.
>   */
> -#define MFEX_IMPL_START_IDX (1)
> +#define MFEX_IMPL_START_IDX (3)
>
>  /* Forward declarations. */
>  struct dp_packet;
> @@ -106,4 +106,16 @@ dpif_miniflow_extract_autovalidator(struct dp_packet_batch *batch,
>                                      uint32_t keys_size, odp_port_t in_port,
>                                      void *pmd_handle);
>
> +/* Retrieve the number of packets by studying packets using different miniflow
> + * implementations to choose the best implementation using the maximum hitmask
> + * count.
> + * On error, returns a zero for no packets.
> + * On success, returns mask of the packets hit.
> + */
> +uint32_t
> +mfex_study_traffic(struct dp_packet_batch *packets,
> +                   struct netdev_flow_key *keys,
> +                   uint32_t keys_size, odp_port_t in_port,
> +                   void *pmd_handle);
> +
>  #endif /* DPIF_NETDEV_AVX512_EXTRACT */
> -- 
> 2.25.1
>
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Van Haaren, Harry June 29, 2021, 4:32 p.m. UTC | #7
> -----Original Message-----
> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron
> Sent: Tuesday, June 29, 2021 1:38 PM
> To: Amber, Kumar <kumar.amber@intel.com>
> Cc: dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best
> mfex function
> 
> More comments below. FYI I’m only reviewing right now, no testing.

Sure, thanks for reviews.

> On 17 Jun 2021, at 18:27, Kumar Amber wrote:

<snip patch commit and some code>

> > +/* Allocate per thread PMD pointer space for study_stats. */
> > +static inline struct study_stats *
> > +get_study_stats(void)
> > +{
> > +    struct study_stats *stats = study_stats_get();
> > +    if (OVS_UNLIKELY(!stats)) {
> > +       stats = xzalloc(sizeof *stats);
> > +       study_stats_set_unsafe(stats);
> > +    }
> > +    return stats;
> > +}
> > +
> 
> Just got a mind-meld with the code, and realized that the function might be different
> per PMD thread due to this auto mode (and autovalidator mode in the previous
> patch).
> 
> This makes it only stronger that we need a way to see the currently selected mode,
> and not per datapath, but per PMD per datapath!

Study depends on the traffic pattern, so yes you're correct that it depends.
The study command was added after community suggested user-experience
would improve if the user doesn't have to provide an exact miniflow profile name.

Study studies the traffic running on that PMD, compares all MFEX impls, and prints out
hits. It selects the _first_ implementation that surpasses the threshold of packets.

Users are free to use the more specific names of MFEX impls instead of "study"
for fine-grained control over the MFEX impl in use, e.g.

ovs-appctl dpif-netdev/miniflow-parser-set avx512_vbmi_ipv4_udp

> Do we also need a way to set this per PMD?

I don't feel there is real value here, but we could investigate adding an
optional parameter to the command indicating a PMD thread IDX to set?
We have access to "pmd->core_id" in our set() function, so limiting changes
to a specific PMD thread can be done ~ easily... but is it really required?

Perfect is the enemy of good... I'd prefer to focus on getting the existing code changes merged,
and to add additional (optional) parameters in the future if deemed useful in real-world testing.


> > +uint32_t
> > +mfex_study_traffic(struct dp_packet_batch *packets,
> > +                   struct netdev_flow_key *keys,
> > +                   uint32_t keys_size, odp_port_t in_port,
> > +                   void *pmd_handle)
> > +{
> > +    uint32_t hitmask = 0;
> > +    uint32_t mask = 0;
> > +    struct dp_netdev_pmd_thread *pmd = pmd_handle;
> > +    struct dpif_miniflow_extract_impl *miniflow_funcs;
> > +    uint32_t impl_count = dpif_miniflow_extract_info_get(&miniflow_funcs);
> > +    struct study_stats *stats = get_study_stats();
> > +
> > +    /* Run traffic optimized miniflow_extract to collect the hitmask
> > +     * to be compared after certain packets have been hit to choose
> > +     * the best miniflow_extract version for that traffic. */
> > +    for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> > +        if (miniflow_funcs[i].available) {
> > +            hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size,
> > +                                                     in_port, pmd_handle);
> > +            stats->impl_hitcount[i] += count_1bits(hitmask);
> > +
> > +            /* If traffic is not classified than we dont overwrite the keys
> > +             * array in minfiflow implementations so its safe to create a
> > +             * mask for all those packets whose miniflow have been created. */
> > +            mask |= hitmask;
> > +        }
> > +    }
> > +    stats->pkt_count += dp_packet_batch_size(packets);
> > +
> > +    /* Choose the best implementation after a minimum packets have been
> > +     * processed. */
> > +    if (stats->pkt_count >= MFEX_MAX_COUNT) {
> > +        uint32_t best_func_index = MFEX_IMPL_START_IDX;
> > +        uint32_t max_hits = 0;
> > +        for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
> > +            if (stats->impl_hitcount[i] > max_hits) {
> > +                max_hits = stats->impl_hitcount[i];
> > +                best_func_index = i;
> > +            }
> > +        }
> > +
> > +        if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
> > +            /* Set the implementation to index with max_hits. */
> > +            pmd->miniflow_extract_opt =
> > +                        miniflow_funcs[best_func_index].extract_func;
> > +            VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
> > +                      miniflow_funcs[best_func_index].name, max_hits,
> > +                      stats->pkt_count);
> 
> We have no idea which PMD the mode is selected for guess we might need to add
> this?
> 
> Maybe we should report the numbers/hits for the other methods, as they might be
> equal, and some might be faster in execution time?

As above, the implementations are sorted in performance order. Performance
here can be known by micro-benchmarks, and developers of such SIMD optimized
code can be expected to know which impl is fastest.

In our current code, the avx512_vbmi_* impls are always before the avx512_*
impls, as the VBMI instruction set allows a faster runtime.

If really desired, we could dump the whole results of the MFEX table for that
PMD thread, however I would expect the results to be noise, and not signal.

I'm happy to discuss, but a bit fearful of adding all sorts of fancy features
that in reality are not going to be useful.

<snip code changes till end of patch>

Regards, -Harry
Stokes, Ian June 29, 2021, 4:56 p.m. UTC | #8
> > -----Original Message-----
> > From: Stokes, Ian <ian.stokes@intel.com>
> > Sent: Thursday, June 24, 2021 2:20 PM
> > To: Amber, Kumar <kumar.amber@intel.com>; dev@openvswitch.org; Van
> > Haaren, Harry <harry.van.haaren@intel.com>
> > Cc: Amber, Kumar <kumar.amber@intel.com>; i.maximets@ovn.org
> > Subject: RE: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the
> > best mfex function
> >
> > > The study function runs all the available implementations
> > > of miniflow_extract and makes a choice whose hitmask has
> > > maximum hits and sets the mfex to that function.
> >
> > Hi Amber/Harry,
> >
> > Thanks for the patch, a few comments inline below.
> 
> Thanks for review. Just addressing the stats get/TLS topic here.
> <snip other patch changes>
> 
> > > +/* Struct to hold miniflow study stats. */
> > > +struct study_stats {
> > > +    uint32_t pkt_count;
> > > +    uint32_t impl_hitcount[MFEX_IMPLS_MAX_SIZE];
> > > +};
> > > +
> > > +/* Define per thread data to hold the study stats. */
> > > +DEFINE_PER_THREAD_MALLOCED_DATA(struct study_stats *, study_stats);
> > > +
> > > +/* Allocate per thread PMD pointer space for study_stats. */
> > > +static inline struct study_stats *
> > > +get_study_stats(void)
> >
> > Would maybe suggest a name change here, get_study_stats sounds as if info is
> > being returned whereas whats actually happening is that the memory for the
> > stats are being provisioned.
> 
> More context for explaining below...
> 
> > > +{
> > > +    struct study_stats *stats = study_stats_get();
> > > +    if (OVS_UNLIKELY(!stats)) {
> > > +       stats = xzalloc(sizeof *stats);
> > > +       study_stats_set_unsafe(stats);
> > Can you explain why above is set unsafe? Where does that function originate
> > from?
> 
> Yes, this is how the OVS "per thread data" (also called "Thread Local Storage" or
> TLS)
> is implemented. The "get()" function indeed allocates the memory first time that
> this
> thread actually accesses it, and any time after that it just returns the per-thread
> allocated
> data pointer.
> 

Ah, that makes more sense — I have since followed up on the existing code, and indeed it follows the same logic.

> The "unsafe" is essentially the API used to change a TLS variable. It must only be
> called
> by the thread that's using it itself, hence the unsafe() AFAIK.
> 
> The same function naming etc is used in DPCLS already, where this was the
> recommended
> method of getting/using TLS data.
> 
> dpif-netdev-lookup-generic.c +47   function has "get_blocks_scratch()" which
> performs
> approximately the same functionality as here.
> 
> Hope that clears up that topic, regards, -Harry

Thanks for clarifying.

BR
Ian
Flavio Leitner June 29, 2021, 6:11 p.m. UTC | #9
On Tue, Jun 29, 2021 at 04:32:05PM +0000, Van Haaren, Harry wrote:
> > -----Original Message-----
> > From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron
> > Sent: Tuesday, June 29, 2021 1:38 PM
> > To: Amber, Kumar <kumar.amber@intel.com>
> > Cc: dev@openvswitch.org; i.maximets@ovn.org
> > Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best
> > mfex function
> > 
> > More comments below. FYI I’m only reviewing right now, no testing.
> 
> Sure, thanks for reviews.
> 
> > On 17 Jun 2021, at 18:27, Kumar Amber wrote:
> 
> <snip patch commit and some code>
> 
> > > +/* Allocate per thread PMD pointer space for study_stats. */
> > > +static inline struct study_stats *
> > > +get_study_stats(void)
> > > +{
> > > +    struct study_stats *stats = study_stats_get();
> > > +    if (OVS_UNLIKELY(!stats)) {
> > > +       stats = xzalloc(sizeof *stats);
> > > +       study_stats_set_unsafe(stats);
> > > +    }
> > > +    return stats;
> > > +}
> > > +
> > 
> > Just got a mind-meld with the code, and realized that the function might be different
> > per PMD thread due to this auto mode (and autovalidator mode in the previous
> > patch).
> > 
> > This makes it only stronger that we need a way to see the currently selected mode,
> > and not per datapath, but per PMD per datapath!
> 
> Study depends on the traffic pattern, so yes you're correct that it depends.
> The study command was added after community suggested user-experience
> would improve if the user doesn't have to provide an exact miniflow profile name.
> 
> Study studies the traffic running on that PMD, compares all MFEX impls, and prints out
> hits. It selects the _first_ implementation that surpasses the threshold of packets.
> 
> Users are free to use the more specific names of MFEX impls instead of "study"
> for fine-grained control over the MFEX impl in use, e.g.
> 
> ovs-appctl dpif-netdev/miniflow-parser-set avx512_vbmi_ipv4_udp
> 
> > Do we also need a way to set this per PMD?
> 
> I don't feel there is real value here, but we could investigate adding an
> optional parameter to the command indicating a PMD thread IDX to set?
> We have access to "pmd->core_id" in our set() function, so limiting changes
> to a specific PMD thread can be done ~ easily... but is it really required?

I think the concern here (at least from my side) is that users can
set the algorithm globally or per DP, not per PMD. However, the
study can set different algorithms per PMD. For example, say that
'study' selects alg#1 for PMD#1 and alg#2 for PMD#2 in the
lab. Now we want to move to production and make that selection
static — how can we do that?

If we set study, how do we tell from the cmdline the algorithm
chose for each PMD? Another example of the same situation: if
we always start with 'study' and suddenly there is a traffic
processing difference. How one can check what is different in
the settings? The logs don't tell which PMD was affected.
 
> Perfect is the enemy of good... I'd prefer focus on getting existing code changes merged,
> and add additional (optional) parameters in future if deemed useful in real world testing?

True. Perhaps we have different use cases in mind. How do you expect
users to use this feature? Do you think production users will always
start with 'study'?

Thanks,
fbl
Eelco Chaudron June 30, 2021, 9:18 a.m. UTC | #10
On 29 Jun 2021, at 18:32, Van Haaren, Harry wrote:

>> -----Original Message-----
>> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron
>> Sent: Tuesday, June 29, 2021 1:38 PM
>> To: Amber, Kumar <kumar.amber@intel.com>
>> Cc: dev@openvswitch.org; i.maximets@ovn.org
>> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best
>> mfex function
>>
>> More comments below. FYI I’m only reviewing right now, no testing.
>
> Sure, thanks for reviews.
>
>> On 17 Jun 2021, at 18:27, Kumar Amber wrote:
>
> <snip patch commit and some code>
>
>>> +/* Allocate per thread PMD pointer space for study_stats. */
>>> +static inline struct study_stats *
>>> +get_study_stats(void)
>>> +{
>>> +    struct study_stats *stats = study_stats_get();
>>> +    if (OVS_UNLIKELY(!stats)) {
>>> +       stats = xzalloc(sizeof *stats);
>>> +       study_stats_set_unsafe(stats);
>>> +    }
>>> +    return stats;
>>> +}
>>> +
>>
>> Just got a mind-meld with the code, and realized that the function might be different
>> per PMD thread due to this auto mode (and autovalidator mode in the previous
>> patch).
>>
>> This makes it only stronger that we need a way to see the currently selected mode,
>> and not per datapath, but per PMD per datapath!
>
> Study depends on the traffic pattern, so yes you're correct that it depends.
> The study command was added after community suggested user-experience
> would improve if the user doesn't have to provide an exact miniflow profile name.
>
> Study studies the traffic running on that PMD, compares all MFEX impls, and prints out
> hits. It selects the _first_ implementation that surpasses the threshold of packets.
>
> Users are free to use the more specific names of MFEX impls instead of "study"
> for fine-grained control over the MFEX impl in use, e.g.
>
> ovs-appctl dpif-netdev/miniflow-parser-set avx512_vbmi_ipv4_udp
>
>> Do we also need a way to set this per PMD?
>
> I don't feel there is real value here, but we could investigate adding an
> optional parameter to the command indicating a PMD thread IDX to set?
> We have access to "pmd->core_id" in our set() function, so limiting changes
> to a specific PMD thread can be done ~ easily... but is it really required?
>
> Perfect is the enemy of good... I'd prefer focus on getting existing code changes merged,
> and add additional (optional) parameters in future if deemed useful in real world testing?

See Flavio’s reply, as those were the same concerns I thought of.

>>> +uint32_t
>>> +mfex_study_traffic(struct dp_packet_batch *packets,
>>> +                   struct netdev_flow_key *keys,
>>> +                   uint32_t keys_size, odp_port_t in_port,
>>> +                   void *pmd_handle)
>>> +{
>>> +    uint32_t hitmask = 0;
>>> +    uint32_t mask = 0;
>>> +    struct dp_netdev_pmd_thread *pmd = pmd_handle;
>>> +    struct dpif_miniflow_extract_impl *miniflow_funcs;
>>> +    uint32_t impl_count = dpif_miniflow_extract_info_get(&miniflow_funcs);
>>> +    struct study_stats *stats = get_study_stats();
>>> +
>>> +    /* Run traffic optimized miniflow_extract to collect the hitmask
>>> +     * to be compared after certain packets have been hit to choose
>>> +     * the best miniflow_extract version for that traffic. */
>>> +    for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
>>> +        if (miniflow_funcs[i].available) {
>>> +            hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size,
>>> +                                                     in_port, pmd_handle);
>>> +            stats->impl_hitcount[i] += count_1bits(hitmask);
>>> +
>>> +            /* If traffic is not classified than we dont overwrite the keys
>>> +             * array in minfiflow implementations so its safe to create a
>>> +             * mask for all those packets whose miniflow have been created. */
>>> +            mask |= hitmask;
>>> +        }
>>> +    }
>>> +    stats->pkt_count += dp_packet_batch_size(packets);
>>> +
>>> +    /* Choose the best implementation after a minimum packets have been
>>> +     * processed. */
>>> +    if (stats->pkt_count >= MFEX_MAX_COUNT) {
>>> +        uint32_t best_func_index = MFEX_IMPL_START_IDX;
>>> +        uint32_t max_hits = 0;
>>> +        for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
>>> +            if (stats->impl_hitcount[i] > max_hits) {
>>> +                max_hits = stats->impl_hitcount[i];
>>> +                best_func_index = i;
>>> +            }
>>> +        }
>>> +
>>> +        if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
>>> +            /* Set the implementation to index with max_hits. */
>>> +            pmd->miniflow_extract_opt =
>>> +                        miniflow_funcs[best_func_index].extract_func;
>>> +            VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
>>> +                      miniflow_funcs[best_func_index].name, max_hits,
>>> +                      stats->pkt_count);
>>
>> We have no idea which PMD the mode is selected for guess we might need to add
>> this?
>>
>> Maybe we should report the numbers/hits for the other methods, as they might be
>> equal, and some might be faster in execution time?
>
> As above, the implementations are sorted in performance order. Performance
> here can be known by micro-benchmarks, and developers of such SIMD optimized
> code can be expected to know which impl is fastest.

Don’t think we can, as it’s not documented in the code, and someone can just add his own, and has no clue about the existing ones.

> In our current code, the avx512_vbmi_* impls are always before the avx512_*
> impls, as the VBMI instruction set allows a faster runtime.

Guess we need some documentation in the developer's section on how to add processor optimized functions, and how to benchmark them (and maybe some benchmark data for the current implementations).
Also, someone can write a sloppy avx512_vbmi* function that might be slower than an avx512_*, right?

> If really desired, we could dump the whole results of the MFEX table for that
> PMD thread, however I would expect the results to be noise, and not signal.

What about dumping it as debug messages, so in case we would like to see it (either for development, or production case), we can still enable it?


> I'm happy to discuss, but a bit fearful of adding all sorts of fancy features
> that in reality are not going to be useful.
>
> <snip code changes till end of patch>
>
> Regards, -Harry
Van Haaren, Harry June 30, 2021, 9:32 a.m. UTC | #11
> -----Original Message-----
> From: Eelco Chaudron <echaudro@redhat.com>
> Sent: Wednesday, June 30, 2021 10:18 AM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: Amber, Kumar <kumar.amber@intel.com>; dev@openvswitch.org;
> i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best
> mfex function
> 
> 
> 
> On 29 Jun 2021, at 18:32, Van Haaren, Harry wrote:
> 
> >> -----Original Message-----
> >> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron
> >> Sent: Tuesday, June 29, 2021 1:38 PM
> >> To: Amber, Kumar <kumar.amber@intel.com>
> >> Cc: dev@openvswitch.org; i.maximets@ovn.org
> >> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the
> best
> >> mfex function

<snip previous discussion>

> > Perfect is the enemy of good... I'd prefer focus on getting existing code changes
> merged,
> > and add additional (optional) parameters in future if deemed useful in real world
> testing?
> 
> See Flavio’s reply, as those were the same concerns I thought of.

Yes - thanks for combining threads - I'm writing a detailed reply there as we speak here :)
I'll send that reply shortly.

<snip code/patch changes>

> >>> +        if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
> >>> +            /* Set the implementation to index with max_hits. */
> >>> +            pmd->miniflow_extract_opt =
> >>> +                        miniflow_funcs[best_func_index].extract_func;
> >>> +            VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
> >>> +                      miniflow_funcs[best_func_index].name, max_hits,
> >>> +                      stats->pkt_count);
> >>
> >> We have no idea which PMD the mode is selected for guess we might need to add
> >> this?
> >>
> >> Maybe we should report the numbers/hits for the other methods, as they might
> be
> >> equal, and some might be faster in execution time?
> >
> > As above, the implementations are sorted in performance order. Performance
> > here can be known by micro-benchmarks, and developers of such SIMD optimized
> > code can be expected to know which impl is fastest.
> 
> Don’t think we can, as it’s not documented in the code, and some one can just add
> his own, and has no clue about the existing ones.

Yes, in theory somebody could add his own, and get this wrong. There are many many
things that could go wrong when making code changes. We cannot document everything.


> > In our current code, the avx512_vbmi_* impls are always before the avx512_*
> > impls, as the VBMI instruction set allows a faster runtime.
> 
> Guess we need some documentation in the developer's section on how to add
> processor optimized functions, and how to benchmark them (and maybe some
> benchmark data for the current implementations).
> Also, someone can write a sloppy avx512_vbmi* function that might be slower than
> an avx512_*, right?

What are we trying to achieve here? What is the root problem that is being addressed?

Yes, somebody "could" write sloppy (complex, advanced, ISA specific, SIMD) avx512 code,
and have it be slower. Who is realistically going to do that?

I'm fine with documenting a few things if they make sense to document, but
trying to "hand hold" at every level just doesn't work. Adding sections on how
to benchmark code, and how function pointers work and how to add them?
These things are documented in various places across the internet.

If there's really an interest to learn AVX512 SIMD optimization, reach out to the
OVS community, put me on CC, and I'll be willing to help. Adding documentation
ad nauseam is not the solution, as each optimization is likely to have subtle differences.


> > <snip code changes till end of patch>
<snip snip away irrelevant old discussions>
Van Haaren, Harry June 30, 2021, 9:43 a.m. UTC | #12
> -----Original Message-----
> From: Flavio Leitner <fbl@sysclose.org>
> Sent: Tuesday, June 29, 2021 7:11 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: Eelco Chaudron <echaudro@redhat.com>; Amber, Kumar
> <kumar.amber@intel.com>; dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best
> mfex function
> 
> On Tue, Jun 29, 2021 at 04:32:05PM +0000, Van Haaren, Harry wrote:
> > > -----Original Message-----
> > > From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron
> > > Sent: Tuesday, June 29, 2021 1:38 PM
> > > To: Amber, Kumar <kumar.amber@intel.com>
> > > Cc: dev@openvswitch.org; i.maximets@ovn.org
> > > Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the
> best
> > > mfex function
> > >
> > > More comments below. FYI I’m only reviewing right now, no testing.
> >
> > Sure, thanks for reviews.
> >
> > > On 17 Jun 2021, at 18:27, Kumar Amber wrote:
> >
> > <snip patch commit and some code>
> >
> > > > +/* Allocate per thread PMD pointer space for study_stats. */
> > > > +static inline struct study_stats *
> > > > +get_study_stats(void)
> > > > +{
> > > > +    struct study_stats *stats = study_stats_get();
> > > > +    if (OVS_UNLIKELY(!stats)) {
> > > > +       stats = xzalloc(sizeof *stats);
> > > > +       study_stats_set_unsafe(stats);
> > > > +    }
> > > > +    return stats;
> > > > +}
> > > > +
> > >
> > > Just got a mind-meld with the code, and realized that the function might be
> different
> > > per PMD thread due to this auto mode (and autovalidator mode in the previous
> > > patch).
> > >
> > > This makes it only stronger that we need a way to see the currently selected
> mode,
> > > and not per datapath, but per PMD per datapath!
> >
> > Study depends on the traffic pattern, so yes you're correct that it depends.
> > The study command was added after community suggested user-experience
> > would improve if the user doesn't have to provide an exact miniflow profile name.
> >
> > Study studies the traffic running on that PMD, compares all MFEX impls, and prints
> out
> > hits. It selects the _first_ implementation that surpasses the threshold of packets.
> >
> > Users are free to use the more specific names of MFEX impls instead of "study"
> > for fine-grained control over the MFEX impl in use, e.g.
> >
> > ovs-appctl dpif-netdev/miniflow-parser-set avx512_vbmi_ipv4_udp
> >
> > > Do we also need a way to set this per PMD?
> >
> > I don't feel there is real value here, but we could investigate adding an
> > optional parameter to the command indicating a PMD thread IDX to set?
> > We have access to "pmd->core_id" in our set() function, so limiting changes
> > to a specific PMD thread can be done ~ easily... but is it really required?
> 
> I think the concern here (at least from my side) is that users can
> set the algorithm globally or per DP, not per PMD. However, the
> study can set different algorithms per PMD. For example, say that
> 'study' indicates that alg#1 for PMD#1 and alg#2 for PMD#2 in the
> lab. Now we want to move to production and make that selection
> static, how can we do that?

That's a good question. Today the command doesn't give us per-PMD thread
control. Study can indeed result in different PMDs having different MFEX funcs.
 

> If we set study, how do we tell from the cmdline the algorithm
> chosen for each PMD? Another example of the same situation: if
> we always start with 'study' and suddenly there is a traffic
> processing difference. How one can check what is different in
> the settings? The logs don't tell which PMD was affected.

Sure they do; the "pmd-cX" and "pmd-cY" below show what datapath thread selects what function.
Note that the first line is from the OVS command thread, which notes that "study" was selected.
The following two prints are from each datapath thread, noting the resulting function chosen by study.

2021-06-30T09:05:41Z|00134|dpif_netdev|INFO|Miniflow implementation set to study.
2021-06-30T09:05:41Z|00001|dpif_mfex_extract_study(pmd-cX/id:X)|INFO|MFEX study chose impl avx512_vbmi_ipv4_udp: (hits 128/128 pkts)
2021-06-30T09:05:41Z|00001|dpif_mfex_extract_study(pmd-cY/id:Y)|INFO|MFEX study chose impl avx512_vbmi_ipv4_udp: (hits 128/128 pkts)


> > Perfect is the enemy of good... I'd prefer focus on getting existing code changes
> merged,
> > and add additional (optional) parameters in future if deemed useful in real world
> testing?
> 
> True. Perhaps we have different use cases in mind. How do you expect
> users to use this feature? Do you think production users will always
> start with 'study'?

I was expecting OVS users to be aware of what L2-4 traffic they're
running, and to per-instance configure that statically for all datapath
threads, for example by running the command below:

$ ovs-appctl  dpif-netdev/miniflow-parser-set avx512_ipv4_udp

There is an assumption here that all datapath threads handle
the same outer traffic type. If that's not the case, we cannot manually
set different MFEX impls to different pmd threads today, as your lab
to production requirement requests above.

If we add an optional PMD thread id parameter, we can support this:
$ ovs-appctl  dpif-netdev/miniflow-parser-set avx512_ipv4_udp <packet_count_to_study> <pmd thread idx>


> Thanks,
> fbl
Eelco Chaudron June 30, 2021, 9:52 a.m. UTC | #13
On 30 Jun 2021, at 11:32, Van Haaren, Harry wrote:

>> -----Original Message-----
>> From: Eelco Chaudron <echaudro@redhat.com>
>> Sent: Wednesday, June 30, 2021 10:18 AM
>> To: Van Haaren, Harry <harry.van.haaren@intel.com>
>> Cc: Amber, Kumar <kumar.amber@intel.com>; dev@openvswitch.org;
>> i.maximets@ovn.org
>> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best
>> mfex function
>>
>>
>>
>> On 29 Jun 2021, at 18:32, Van Haaren, Harry wrote:
>>
>>>> -----Original Message-----
>>>> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron
>>>> Sent: Tuesday, June 29, 2021 1:38 PM
>>>> To: Amber, Kumar <kumar.amber@intel.com>
>>>> Cc: dev@openvswitch.org; i.maximets@ovn.org
>>>> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the
>> best
>>>> mfex function
>
> <snip previous discussion>
>
>>> Perfect is the enemy of good... I'd prefer focus on getting existing code changes
>> merged,
>>> and add additional (optional) parameters in future if deemed useful in real world
>> testing?
>>
>> See Flavio’s reply, as those were the same concerns I thought of.
>
> Yes - thanks for combining threads - I'm writing a detailed reply there as we speak here :)
> I'll send that reply shortly.
>
> <snip code/patch changes>
>
>>>>> +        if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
>>>>> +            /* Set the implementation to index with max_hits. */
>>>>> +            pmd->miniflow_extract_opt =
>>>>> +                        miniflow_funcs[best_func_index].extract_func;
>>>>> +            VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
>>>>> +                      miniflow_funcs[best_func_index].name, max_hits,
>>>>> +                      stats->pkt_count);
>>>>
>>>> We have no idea which PMD the mode is selected for guess we might need to add
>>>> this?
>>>>
>>>> Maybe we should report the numbers/hits for the other methods, as they might
>> be
>>>> equal, and some might be faster in execution time?
>>>
>>> As above, the implementations are sorted in performance order. Performance
>>> here can be known by micro-benchmarks, and developers of such SIMD optimized
>>> code can be expected to know which impl is fastest.
>>
>> Don’t think we can, as it’s not documented in the code, and someone can just add
>> his own, and has no clue about the existing ones.
>
> Yes, in theory somebody could add his own, and get this wrong. There are many many
> things that could go wrong when making code changes. We cannot document everything.

I meant that the code currently does not document that the implementation table, mfex_impls[], is in order of preference. So I think this should be added.

>>> In our current code, the avx512_vbmi_* impls are always before the avx512_*
>>> impls, as the VBMI instruction set allows a faster runtime.
>>
>> Guess we need some documentation in the developer's section on how to add
>> processor optimized functions, and how to benchmark them (and maybe some
>> benchmark data for the current implementations).
>> Also, someone can write a sloppy avx512_vbmi* function that might be slower than
>> an avx512_*, right?
>
> What are we trying to achieve here? What is the root problem that is being addressed?
>
> Yes, somebody "could" write sloppy (complex, advanced, ISA specific, SIMD) avx512 code,
> and have it be slower. Who is realistically going to do that?
>
> I'm fine with documenting a few things if they make sense to document, but
> trying to "hand hold" at every level just doesn't work. Adding sections on how
> to benchmark code, and how function pointers work and how to add them?
> These things are documented in various places across the internet.
>
> If there's really an interest to learn AVX512 SIMD optimization, reach out to the
> OVS community, put me on CC, and I'll be willing to help. Adding documentation
> ad nauseam is not the solution, as each optimization is likely to have subtle differences.
I think the problem is that except you, and some other small group at Intel might know AVX512, but for most of the OVS community this is moving back to handwritten assembler. So at least some guidelines on what you should do when adding a custom function would help. Like order them in priority, maybe some simple example on how to benchmark the runtime of the mfex function. Don't think this has to be part of this patch, but a follow-up would be nice.
>
>
>>> <snip code changes till end of patch>
> <snip snip away irrelevant old discussions>
Eelco Chaudron June 30, 2021, 10:07 a.m. UTC | #14
On 30 Jun 2021, at 11:43, Van Haaren, Harry wrote:

>> -----Original Message-----
>> From: Flavio Leitner <fbl@sysclose.org>
>> Sent: Tuesday, June 29, 2021 7:11 PM
>> To: Van Haaren, Harry <harry.van.haaren@intel.com>
>> Cc: Eelco Chaudron <echaudro@redhat.com>; Amber, Kumar
>> <kumar.amber@intel.com>; dev@openvswitch.org; i.maximets@ovn.org
>> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best
>> mfex function
>>
>> On Tue, Jun 29, 2021 at 04:32:05PM +0000, Van Haaren, Harry wrote:
>>>> -----Original Message-----
>>>> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron
>>>> Sent: Tuesday, June 29, 2021 1:38 PM
>>>> To: Amber, Kumar <kumar.amber@intel.com>
>>>> Cc: dev@openvswitch.org; i.maximets@ovn.org
>>>> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the
>> best
>>>> mfex function
>>>>
>>>> More comments below. FYI I’m only reviewing right now, no testing.
>>>
>>> Sure, thanks for reviews.
>>>
>>>> On 17 Jun 2021, at 18:27, Kumar Amber wrote:
>>>
>>> <snip patch commit and some code>
>>>
>>>>> +/* Allocate per thread PMD pointer space for study_stats. */
>>>>> +static inline struct study_stats *
>>>>> +get_study_stats(void)
>>>>> +{
>>>>> +    struct study_stats *stats = study_stats_get();
>>>>> +    if (OVS_UNLIKELY(!stats)) {
>>>>> +       stats = xzalloc(sizeof *stats);
>>>>> +       study_stats_set_unsafe(stats);
>>>>> +    }
>>>>> +    return stats;
>>>>> +}
>>>>> +
>>>>
>>>> Just got a mind-meld with the code, and realized that the function might be
>> different
>>>> per PMD thread due to this auto mode (and autovalidator mode in the previous
>>>> patch).
>>>>
>>>> This makes it only stronger that we need a way to see the currently selected
>> mode,
>>>> and not per datapath, but per PMD per datapath!
>>>
>>> Study depends on the traffic pattern, so yes you're correct that it depends.
>>> The study command was added after community suggested user-experience
>>> would improve if the user doesn't have to provide an exact miniflow profile name.
>>>
>>> Study studies the traffic running on that PMD, compares all MFEX impls, and prints
>> out
>>> hits. It selects the _first_ implementation that surpasses the threshold of packets.
>>>
>>> Users are free to use the more specific names of MFEX impls instead of "study"
>>> for fine-grained control over the MFEX impl in use, e.g.
>>>
>>> ovs-appctl dpif-netdev/miniflow-parser-set avx512_vbmi_ipv4_udp
>>>
>>>> Do we also need a way to set this per PMD?
>>>
>>> I don't feel there is real value here, but we could investigate adding an
>>> optional parameter to the command indicating a PMD thread IDX to set?
>>> We have access to "pmd->core_id" in our set() function, so limiting changes
>>> to a specific PMD thread can be done ~ easily... but is it really required?
>>
>> I think the concern here (at least from my side) is that users can
>> set the algorithm globally or per DP, not per PMD. However, the
>> study can set different algorithms per PMD. For example, say that
>> 'study' indicates that alg#1 for PMD#1 and alg#2 for PMD#2 in the
>> lab. Now we want to move to production and make that selection
>> static, how can we do that?
>
> That's a good question. Today the command doesn't give us per-PMD thread
> control. Study can indeed result in different PMDs having different MFEX funcs.
>
>
>> If we set study, how do we tell from the cmdline the algorithm
>> chosen for each PMD? Another example of the same situation: if
>> we always start with 'study' and suddenly there is a traffic
>> processing difference. How one can check what is different in
>> the settings? The logs don't tell which PMD was affected.
>
> Sure they do; the "pmd-cX" and "pmd-cY" below show what datapath thread selects what function.
> Note that the first line is from the OVS command thread, which notes that "study" was selected.
> The following two prints are from each datapath thread, noting the resulting function chosen by study.
>
> 2021-06-30T09:05:41Z|00134|dpif_netdev|INFO|Miniflow implementation set to study.
> 2021-06-30T09:05:41Z|00001|dpif_mfex_extract_study(pmd-cX/id:X)|INFO|MFEX study chose impl avx512_vbmi_ipv4_udp: (hits 128/128 pkts)
> 2021-06-30T09:05:41Z|00001|dpif_mfex_extract_study(pmd-cY/id:Y)|INFO|MFEX study chose impl avx512_vbmi_ipv4_udp: (hits 128/128 pkts)

And with the updated miniflow-parser-get we should be able to see it after the logs have wrapped.

>>> Perfect is the enemy of good... I'd prefer focus on getting existing code changes
>> merged,
>>> and add additional (optional) parameters in future if deemed useful in real world
>> testing?
>>
>> True. Perhaps we have different use cases in mind. How do you expect
>> users to use this feature? Do you think production users will always
>> start with 'study'?
>
> I was expecting OVS users to be aware of what L2-4 traffic they're
> running, and to per-instance configure that statically for all datapath
> threads, for example by running the command below:
>
> $ ovs-appctl  dpif-netdev/miniflow-parser-set avx512_ipv4_udp
>
> There is an assumption here that all datapath threads handle
> the same outer traffic type. If that's not the case, we cannot manually
> set different MFEX impls to different pmd threads today, as your lab
> to production requirement requests above.
>
> If we add an optional PMD thread id parameter, we can support this:
> $ ovs-appctl  dpif-netdev/miniflow-parser-set avx512_ipv4_udp <packet_count_to_study> <pmd thread idx>

I think if we allow study to set it per PMD thread, we should support the pmd thread for manual configuration.
We also might need to re-think the command to make sure packet_count_to_study is only needed for the study command.
So the help text might become something like:

dpif-netdev/miniflow-parser-set {miniflow_implementation_name | study [pkt_cnt]} [dp] [pmd_core]
Van Haaren, Harry June 30, 2021, 11:21 a.m. UTC | #15
> -----Original Message-----
> From: Eelco Chaudron <echaudro@redhat.com>
> Sent: Wednesday, June 30, 2021 10:52 AM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: Amber, Kumar <kumar.amber@intel.com>; dev@openvswitch.org;
> i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best
> mfex function
> 
> 
> 
> On 30 Jun 2021, at 11:32, Van Haaren, Harry wrote:
> 
> >> -----Original Message-----
> >> From: Eelco Chaudron <echaudro@redhat.com>
> >> Sent: Wednesday, June 30, 2021 10:18 AM
> >> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> >> Cc: Amber, Kumar <kumar.amber@intel.com>; dev@openvswitch.org;
> >> i.maximets@ovn.org
> >> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the
> best
> >> mfex function
> >>
> >>
> >>
> >> On 29 Jun 2021, at 18:32, Van Haaren, Harry wrote:
> >>
> >>>> -----Original Message-----
> >>>> From: dev <ovs-dev-bounces@openvswitch.org> On Behalf Of Eelco Chaudron

<snip away outdated context>

> >>>> Maybe we should report the numbers/hits for the other methods, as they might
> >> be
> >>>> equal, and some might be faster in execution time?
> >>>
> >>> As above, the implementations are sorted in performance order. Performance
> >>> here can be known by micro-benchmarks, and developers of such SIMD
> optimized
> >>> code can be expected to know which impl is fastest.
> >>
> >> Don’t think we can, as it’s not documented in the code, and some one can just
> add
> >> his own, and has no clue about the existing ones.
> >
> > Yes, in theory somebody could add his own, and get this wrong. There are many
> many
> > things that could go wrong when making code changes. We cannot document
> everything.
> 
> I meant that the code currently does not document that the implementation table,
> mfex_impls[], is in order of preference. So I think this should be added.

Sure we can document that the impl list is iterated & searched in order, hence
code-doc would help there. Will add this to the code.


> >>> In our current code, the avx512_vbmi_* impls are always before the avx512_*
> >>> impls, as the VBMI instruction set allows a faster runtime.
> >>
> >> Guess we need some documentation in the developer's section on how to add
> >> processor optimized functions, and how to benchmark them (and maybe some
> >> benchmark data for the current implementations).
> >> Also, someone can write a sloppy avx512_vbmi* function that might be slower
> than
> >> an avx512_*, right?
> >
> > What are we trying to achieve here? What is the root problem that is being
> addressed?
> >
> > Yes, somebody "could" write sloppy (complex, advanced, ISA specific, SIMD)
> avx512 code,
> > and have it be slower. Who is realistically going to do that?
> >
> > I'm fine with documenting a few things if they make sense to document, but
> > trying to "hand hold" at every level just doesn't work. Adding sections on how
> > to benchmark code, and how function pointers work and how to add them?
> > These things are documented in various places across the internet.
> >
> > If there's really an interest to learn AVX512 SIMD optimization, reach out to the
> > OVS community, put me on CC, and I'll be willing to help. Adding documentation
> > ad nauseam is not the solution, as each optimization is likely to have subtle
> differences.
>
> I think the problem is that except you, and some other small group at Intel might
> know AVX512, but for most of the OVS community this is moving back to
> handwritten assembler. 

Nitpick but worth mentioning: optimizing with intrinsics is much easier, and much
less mental overhead than actual assembler (e.g. register allocation handled by compiler).
I agree lots of developers don't see this on a daily basis, but its really not that "crazy".
Once over the 1st level of "reading intrinsics", scalar becomes looped scalar becomes vector:

uint64_t x = y & z;

for (int i = 0; i < 8; i++)
   x[i] = y[i] & z[i];

__m512i x = _mm512_and_si512(y, z);

Anyway, this is getting off topic, so I'll stop adding detail here.

> So at least some guidelines on what you should do when
> adding a custom function would help. Like order them in priority, maybe some
> simple example on how to benchmark the runtime of the mfex function. Don't think
> this has to be part of this patch, but a follow-up would be nice.

Honestly I'm still not convinced. Just running the normal OVS benchmarks is enough.
If the cycle-counts/packet-rate reported by OVS are better, you're going faster. These
things are already documented:
https://docs.openvswitch.org/en/latest/topics/dpdk/pmd/

If you're a developer writing SIMD code, I think its fair to assume some level of knowledge
on profiling. If not, the OVS documentation is IMO still _not_ the place to document how
to profile optimized code. There's nothing special about benchmarking these AVX512 MFEX
implementations compared to any other datapath (or otherwise) function.


> >>> <snip code changes till end of patch>
> > <snip snip away irrelevant old discussions>
Van Haaren, Harry June 30, 2021, 1:34 p.m. UTC | #16
> -----Original Message-----
> From: Eelco Chaudron <echaudro@redhat.com>
> Sent: Wednesday, June 30, 2021 11:07 AM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: Flavio Leitner <fbl@sysclose.org>; Amber, Kumar <kumar.amber@intel.com>;
> dev@openvswitch.org; i.maximets@ovn.org
> Subject: Re: [ovs-dev] [v4 03/12] dpif-netdev: Add study function to select the best
> mfex function
> 
> On 30 Jun 2021, at 11:43, Van Haaren, Harry wrote:

<snip>

> > $ ovs-appctl  dpif-netdev/miniflow-parser-set avx512_ipv4_udp
> >
> > There is an assumption here that all datapath threads handle
> > the same outer traffic type. If that's not the case, we cannot manually
> > set different MFEX impls to different pmd threads today, as your lab
> > to production requirement requests above.
> >
> > If we add an optional PMD thread id parameter, we can support this:
> > $ ovs-appctl  dpif-netdev/miniflow-parser-set avx512_ipv4_udp
> <packet_count_to_study> <pmd thread idx>
> 
> I think if we allow study to set it per PMD thread, we should support the pmd thread
> for manual configuration.
> We also might need to re-think the command to make sure packet_count_to_study
> is only needed for the study command.
> So the help text might become something like:
> 
> dpif-netdev/miniflow-parser-set {miniflow_implementation_name | study [pkt_cnt]}
> [dp] [pmd_core]

Amber has designed & implemented a proposal, with documentation on each. Request to
review the next version of the patchset when available, to ensure it meets requirements.
diff mbox series

Patch

diff --git a/lib/automake.mk b/lib/automake.mk
index 6657b9ae5..3080bb04a 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -114,6 +114,7 @@  lib_libopenvswitch_la_SOURCES = \
 	lib/dpif-netdev.c \
 	lib/dpif-netdev.h \
 	lib/dpif-netdev-private-dfc.c \
+	lib/dpif-netdev-extract-study.c \
 	lib/dpif-netdev-private-dfc.h \
 	lib/dpif-netdev-private-dpcls.h \
 	lib/dpif-netdev-private-dpif.c \
diff --git a/lib/dpif-netdev-extract-study.c b/lib/dpif-netdev-extract-study.c
new file mode 100644
index 000000000..d063d040c
--- /dev/null
+++ b/lib/dpif-netdev-extract-study.c
@@ -0,0 +1,119 @@ 
+/*
+ * Copyright (c) 2021 Intel.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+#include <errno.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "dpif-netdev-private-extract.h"
+#include "dpif-netdev-private-thread.h"
+#include "openvswitch/vlog.h"
+#include "ovs-thread.h"
+
+VLOG_DEFINE_THIS_MODULE(dpif_mfex_extract_study);
+
+/* Number of packets to study before selecting the best implementation. */
+#define MFEX_MAX_COUNT (128)
+
+/* This value is the threshold for the number of packets that
+ * must hit on the optimized miniflow extract before it will be
+ * accepted and used in the datapath after the study phase. */
+#define MFEX_MIN_HIT_COUNT_FOR_USE (MFEX_MAX_COUNT / 2)
+
+/* Per-thread stats: packets studied and per-implementation hitmask counts. */
+struct study_stats {
+    uint32_t pkt_count;
+    uint32_t impl_hitcount[MFEX_IMPLS_MAX_SIZE];
+};
+
+/* Per-thread (PMD) storage holding each thread's own study stats. */
+DEFINE_PER_THREAD_MALLOCED_DATA(struct study_stats *, study_stats);
+
+/* Returns this thread's study_stats, lazily allocating zeroed storage. */
+static inline struct study_stats *
+get_study_stats(void)
+{
+    struct study_stats *stats = study_stats_get();
+    if (OVS_UNLIKELY(!stats)) {
+       stats = xzalloc(sizeof *stats);
+       study_stats_set_unsafe(stats);
+    }
+    return stats;
+}
+
+uint32_t
+mfex_study_traffic(struct dp_packet_batch *packets,
+                   struct netdev_flow_key *keys,
+                   uint32_t keys_size, odp_port_t in_port,
+                   void *pmd_handle)
+{
+    uint32_t hitmask = 0;
+    uint32_t mask = 0;
+    struct dp_netdev_pmd_thread *pmd = pmd_handle;
+    struct dpif_miniflow_extract_impl *miniflow_funcs;
+    uint32_t impl_count = dpif_miniflow_extract_info_get(&miniflow_funcs);
+    struct study_stats *stats = get_study_stats();
+
+    /* Run each available optimized miniflow_extract, accumulating its
+     * per-implementation hit counts; once enough packets have been seen
+     * the implementation with the most hits is chosen for this traffic. */
+    for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
+        if (miniflow_funcs[i].available) {
+            hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size,
+                                                     in_port, pmd_handle);
+            stats->impl_hitcount[i] += count_1bits(hitmask);
+
+            /* If traffic is not classified then we don't overwrite the keys
+             * array in miniflow implementations, so it's safe to create a
+             * mask for all those packets whose miniflow have been created. */
+            mask |= hitmask;
+        }
+    }
+    stats->pkt_count += dp_packet_batch_size(packets);
+
+    /* Choose the best implementation after a minimum number of packets
+     * have been processed. */
+    if (stats->pkt_count >= MFEX_MAX_COUNT) {
+        uint32_t best_func_index = MFEX_IMPL_START_IDX;
+        uint32_t max_hits = 0;
+        for (int i = MFEX_IMPL_START_IDX; i < impl_count; i++) {
+            if (stats->impl_hitcount[i] > max_hits) {
+                max_hits = stats->impl_hitcount[i];
+                best_func_index = i;
+            }
+        }
+
+        if (max_hits >= MFEX_MIN_HIT_COUNT_FOR_USE) {
+            /* Set the implementation to index with max_hits. */
+            pmd->miniflow_extract_opt =
+                        miniflow_funcs[best_func_index].extract_func;
+            VLOG_INFO("MFEX study chose impl %s: (hits %d/%d pkts)\n",
+                      miniflow_funcs[best_func_index].name, max_hits,
+                      stats->pkt_count);
+        } else {
+            /* Set the implementation to null for default miniflow. */
+            pmd->miniflow_extract_opt = NULL;
+            VLOG_INFO("Not enough packets matched (%d/%d), disabling"
+                      " optimized MFEX.\n", max_hits, stats->pkt_count);
+        }
+        /* Reset stats so that the study function can be called again
+         * for the next traffic type and the optimal function ptr can
+         * be chosen. */
+        memset(stats, 0, sizeof(struct study_stats));
+    }
+    return mask;
+}
diff --git a/lib/dpif-netdev-private-extract.c b/lib/dpif-netdev-private-extract.c
index 0741c19f9..d86268a1d 100644
--- a/lib/dpif-netdev-private-extract.c
+++ b/lib/dpif-netdev-private-extract.c
@@ -42,6 +42,11 @@  static struct dpif_miniflow_extract_impl mfex_impls[] = {
         .extract_func = NULL,
         .name = "disable",
     },
+    {
+        .probe = NULL,
+        .extract_func = mfex_study_traffic,
+        .name = "study",
+    },
 };
 
 BUILD_ASSERT_DECL(MFEX_IMPLS_MAX_SIZE > ARRAY_SIZE(mfex_impls));
diff --git a/lib/dpif-netdev-private-extract.h b/lib/dpif-netdev-private-extract.h
index 455a7b590..3ada413bb 100644
--- a/lib/dpif-netdev-private-extract.h
+++ b/lib/dpif-netdev-private-extract.h
@@ -27,7 +27,7 @@ 
 /* Skip the autovalidator study and null when iterating all available
  * miniflow implementations.
  */
-#define MFEX_IMPL_START_IDX (1)
+#define MFEX_IMPL_START_IDX (3)
 
 /* Forward declarations. */
 struct dp_packet;
@@ -106,4 +106,16 @@  dpif_miniflow_extract_autovalidator(struct dp_packet_batch *batch,
                                     uint32_t keys_size, odp_port_t in_port,
                                     void *pmd_handle);
 
+/* Studies the batch by running every available optimized miniflow extract
+ * implementation on it, accumulating per-implementation hit counts that
+ * are later used to select the best implementation for this traffic.
+ * Returns zero when no packet was handled by any optimized implementation.
+ * Otherwise, returns the bitmask of packets whose miniflow was extracted.
+ */
+uint32_t
+mfex_study_traffic(struct dp_packet_batch *packets,
+                   struct netdev_flow_key *keys,
+                   uint32_t keys_size, odp_port_t in_port,
+                   void *pmd_handle);
+
 #endif /* DPIF_NETDEV_AVX512_EXTRACT */