diff mbox series

[ovs-dev,v5,8/9] mfex-study: Modify study func to select outer and inner mfex funcs.

Message ID 20220825233058.2697002-9-kumar.amber@intel.com
State Superseded
Headers show
Series DPIF + MFEX Inner AVX512 | expand

Checks

Context Check Description
ovsrobot/apply-robot success apply and check: success
ovsrobot/github-robot-_Build_and_Test success github build: passed

Commit Message

Kumar Amber Aug. 25, 2022, 11:30 p.m. UTC
The Mfex study function is split into outer and inner to allow
for independent selection and studying of packets in outer and inner
flows to different ISA optimized Mfexs.

Signed-off-by: Kumar Amber <kumar.amber@intel.com>
---
 lib/dpif-netdev-extract-study.c | 126 +++++++++++++++++++++-----------
 1 file changed, 83 insertions(+), 43 deletions(-)

Comments

Ferriter, Cian Sept. 29, 2022, 3:31 p.m. UTC | #1
Hi Amber,

My comments on the patch are below inline.

Thanks,
Cian

> -----Original Message-----
> From: Amber, Kumar <kumar.amber@intel.com>
> Sent: Friday 26 August 2022 00:31
> To: ovs-dev@openvswitch.org
> Cc: echaudro@redhat.com; i.maximets@ovn.org; Ferriter, Cian <cian.ferriter@intel.com>; Stokes, Ian
> <ian.stokes@intel.com>; fbl@sysclose.org; Van Haaren, Harry <harry.van.haaren@intel.com>; Amber, Kumar
> <kumar.amber@intel.com>
> Subject: [PATCH v5 8/9] mfex-study: Modify study func to select outer and inner mfex funcs.
> 
> The Mfex study function is split into outer and inner to allow
> for independent selection and studying of packets in outer and inner
> flows to different ISA optimized Mfexs.

For the last line how about:
> flows to different ISA optimized miniflow extract implementations.

> 
> Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> ---
>  lib/dpif-netdev-extract-study.c | 126 +++++++++++++++++++++-----------
>  1 file changed, 83 insertions(+), 43 deletions(-)
> 
> diff --git a/lib/dpif-netdev-extract-study.c b/lib/dpif-netdev-extract-study.c
> index 71354cc4c..03d97c64e 100644
> --- a/lib/dpif-netdev-extract-study.c
> +++ b/lib/dpif-netdev-extract-study.c
> @@ -30,7 +30,9 @@ static atomic_uint32_t mfex_study_pkts_count = MFEX_MAX_PKT_COUNT;
>  /* Struct to hold miniflow study stats. */
>  struct study_stats {
>      uint32_t pkt_count;
> +    uint32_t pkt_inner_count;
>      uint32_t impl_hitcount[MFEX_IMPL_MAX];
> +    uint32_t impl_inner_hitcount[MFEX_IMPL_MAX];
>  };
> 
>  /* Define per thread data to hold the study stats. */
> @@ -67,6 +69,58 @@ mfex_set_study_pkt_cnt(uint32_t pkt_cmp_count, const char *name)
>      return -EINVAL;
>  }
> 
> +

Remove the extra newline here.

> +static inline void
> +mfex_reset_stats(uint32_t *impls_hitcount, uint32_t *pkt_cnt) {
> +    /* Reset stats so that study function can be called again
> +     * for next traffic type and optimal function ptr can be
> +     * chosen.
> +     */

I prefer having this comment above the function rather than inside since it describes what the whole function does.
I've reworded the comment to add 'the's and 'an'. How about this:
Reset stats so that the study function can be called again for the next traffic type and an optimal function pointer can be chosen.

> +    memset(impls_hitcount, 0, sizeof(uint32_t) * MFEX_IMPL_MAX);
> +    *pkt_cnt = 0;
> +}
> +
> +static inline void
> +mfex_study_select_best_impls(struct dpif_miniflow_extract_impl *mfex_funcs,
> +                             uint32_t pkt_cnt, uint32_t *impls_arr,
> +                             atomic_uintptr_t *pmd_func, char *name)
> +{
> +
> +    uint32_t best_func_index = MFEX_IMPL_START_IDX;
> +    uint32_t max_hits = 0;
> +
> +    for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
> +        if (impls_arr[i] > max_hits) {
> +            max_hits = impls_arr[i];
> +            best_func_index = i;
> +            }
> +        }

These 2 braces are indented incorrectly. It should be like this:
> +        }
> +    }


> +
> +    /* If 50% of the packets hit, enable the function. */

How about this reword:
If at least 50% of the packets hit the implementation, enable that implementation.

> +    if (max_hits >= (mfex_study_pkts_count / 2)) {
> +        atomic_store_relaxed(pmd_func,
> +                    (uintptr_t) mfex_funcs[best_func_index].extract_func);
> +        VLOG_INFO("MFEX %s study chose impl %s: (hits %u/%u pkts)",
> +                  name, mfex_funcs[best_func_index].name, max_hits,
> +                  pkt_cnt);

The 'pkt_cnt' doesn't have to be wrapped to the next line.

> +    } else {
> +        /* Set the implementation to null for default miniflow. */
> +        atomic_store_relaxed(pmd_func,
> +                    (uintptr_t) mfex_funcs[MFEX_IMPL_SCALAR].extract_func);
> +        VLOG_INFO("Not enough packets matched (%u/%u), disabling"
> +                  " optimized MFEX.", max_hits, pkt_cnt);
> +    }
> +
> +    /* In debug mode show stats for all the counters. */
> +    if (VLOG_IS_DBG_ENABLED()) {
> +        for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
> +                VLOG_DBG("MFEX study results for implementation %s:"
> +                         " (hits %u/%u pkts)", mfex_funcs[i].name,
> +                         impls_arr[i], pkt_cnt);
> +        }
> +    }
> +}
> +
>  uint32_t
>  mfex_study_traffic(struct dp_packet_batch *packets,
>                     struct netdev_flow_key *keys,
> @@ -76,10 +130,12 @@ mfex_study_traffic(struct dp_packet_batch *packets,
>  {
>      uint32_t hitmask = 0;
>      uint32_t mask = 0;
> +    uint32_t study_cnt_pkts;
>      struct dp_netdev_pmd_thread *pmd = pmd_handle;
>      struct dpif_miniflow_extract_impl *miniflow_funcs;
>      struct study_stats *stats = mfex_study_get_study_stats_ptr();
>      miniflow_funcs = dpif_mfex_impl_info_get();
> +    atomic_read_relaxed(&mfex_study_pkts_count, &study_cnt_pkts);
> 
>      /* Run traffic optimized miniflow_extract to collect the hitmask
>       * to be compared after certain packets have been hit to choose
> @@ -93,7 +149,11 @@ mfex_study_traffic(struct dp_packet_batch *packets,
>          hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size,
>                                                   in_port, pmd_handle,
>                                                   md_is_valid);
> -        stats->impl_hitcount[i] += count_1bits(hitmask);
> +        if (!md_is_valid) {
> +            stats->impl_hitcount[i] += count_1bits(hitmask);
> +        } else {
> +            stats->impl_inner_hitcount[i] += count_1bits(hitmask);
> +        }
> 
>          /* If traffic is not classified then we dont overwrite the keys
>           * array in minfiflow implementations so its safe to create a
> @@ -102,54 +162,34 @@ mfex_study_traffic(struct dp_packet_batch *packets,
>          mask |= hitmask;
>      }
> 
> -    stats->pkt_count += dp_packet_batch_size(packets);
> -
>      /* Choose the best implementation after a minimum packets have been
>       * processed.
>       */

How about updating the comment to:
Choose the best miniflow extract implementation to use for inner and outer packets separately.

> -    uint32_t study_cnt_pkts;
> -    atomic_read_relaxed(&mfex_study_pkts_count, &study_cnt_pkts);
> -
> -    if (stats->pkt_count >= study_cnt_pkts) {
> -        uint32_t best_func_index = MFEX_IMPL_START_IDX;
> -        uint32_t max_hits = 0;
> -        for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
> -            if (stats->impl_hitcount[i] > max_hits) {
> -                max_hits = stats->impl_hitcount[i];
> -                best_func_index = i;
> -            }
> +    if (!md_is_valid) {
> +        stats->pkt_count += dp_packet_batch_size(packets);
> +
> +        if (stats->pkt_count >= study_cnt_pkts) {
> +            char name[] = "outer";
> +            mfex_study_select_best_impls(miniflow_funcs, stats->pkt_count,
> +                             stats->impl_hitcount,
> +                             (void *)&pmd->miniflow_extract_opt,
> +                             name);
> +            mfex_reset_stats(stats->impl_hitcount, &stats->pkt_count);
>          }
> 
> -        /* If 50% of the packets hit, enable the function. */
> -        if (max_hits >= (mfex_study_pkts_count / 2)) {
> -            atomic_store_relaxed(&pmd->miniflow_extract_opt,
> -                                 miniflow_funcs[best_func_index].extract_func);
> -            VLOG_INFO("MFEX study chose impl %s: (hits %u/%u pkts)",
> -                      miniflow_funcs[best_func_index].name, max_hits,
> -                      stats->pkt_count);
> -        } else {
> -            /* Set the implementation to null for default miniflow. */
> -            atomic_store_relaxed(&pmd->miniflow_extract_opt,
> -                    miniflow_funcs[MFEX_IMPL_SCALAR].extract_func);
> -            VLOG_INFO("Not enough packets matched (%u/%u), disabling"
> -                      " optimized MFEX.", max_hits, stats->pkt_count);
> +    } else {
> +        stats->pkt_inner_count += dp_packet_batch_size(packets);
> +
> +        if (stats->pkt_inner_count >= study_cnt_pkts) {
> +            char name[] = "inner";
> +            mfex_study_select_best_impls(miniflow_funcs,
> +                             stats->pkt_inner_count,
> +                             stats->impl_inner_hitcount,
> +                             (void *)&pmd->miniflow_extract_inner_opt,
> +                             name);
> +            mfex_reset_stats(stats->impl_inner_hitcount,
> +                             &stats->pkt_inner_count);
>          }
> -
> -        /* In debug mode show stats for all the counters. */
> -        if (VLOG_IS_DBG_ENABLED()) {
> -
> -            for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
> -                VLOG_DBG("MFEX study results for implementation %s:"
> -                         " (hits %u/%u pkts)", miniflow_funcs[i].name,
> -                         stats->impl_hitcount[i], stats->pkt_count);
> -            }
> -        }
> -
> -        /* Reset stats so that study function can be called again
> -         * for next traffic type and optimal function ptr can be
> -         * chosen.
> -         */
> -        memset(stats, 0, sizeof(struct study_stats));
>      }
>      return mask;
>  }
> --
> 2.25.1

Overall, I like the reuse of code. And I think the patch looks good. The above are all smaller comments. The only major thing is what I mentioned in patch 7/9 about seeing whether we can get the same behaviour without using the 'md_is_valid' bool at all.
Kumar Amber Oct. 3, 2022, 4:28 p.m. UTC | #2
Hi Cian,

Please find the comments inline.

> -----Original Message-----
> From: Ferriter, Cian <cian.ferriter@intel.com>
> Sent: Thursday, September 29, 2022 9:01 PM
> To: Amber, Kumar <kumar.amber@intel.com>; ovs-dev@openvswitch.org
> Cc: echaudro@redhat.com; i.maximets@ovn.org; Stokes, Ian
> <ian.stokes@intel.com>; fbl@sysclose.org; Van Haaren, Harry
> <harry.van.haaren@intel.com>
> Subject: RE: [PATCH v5 8/9] mfex-study: Modify study func to select outer
> and inner mfex funcs.
> 
> Hi Amber,
> 
> My comments on the patch are below inline.
> 
> Thanks,
> Cian
> 
> > -----Original Message-----
> > From: Amber, Kumar <kumar.amber@intel.com>
> > Sent: Friday 26 August 2022 00:31
> > To: ovs-dev@openvswitch.org
> > Cc: echaudro@redhat.com; i.maximets@ovn.org; Ferriter, Cian
> > <cian.ferriter@intel.com>; Stokes, Ian <ian.stokes@intel.com>;
> > fbl@sysclose.org; Van Haaren, Harry <harry.van.haaren@intel.com>;
> > Amber, Kumar <kumar.amber@intel.com>
> > Subject: [PATCH v5 8/9] mfex-study: Modify study func to select outer and
> inner mfex funcs.
> >
> > The Mfex study function is split into outer and inner to allow for
> > independent selection and studying of packets in outer and inner flows
> > to different ISA optimized Mfexs.
> 
> For the last line how about:
> > flows to different ISA optimized miniflow extract implementations.
> 
Fixed.

> >
> > Signed-off-by: Kumar Amber <kumar.amber@intel.com>
> > ---
> >  lib/dpif-netdev-extract-study.c | 126
> > +++++++++++++++++++++-----------
> >  1 file changed, 83 insertions(+), 43 deletions(-)
> >
> > diff --git a/lib/dpif-netdev-extract-study.c
> > b/lib/dpif-netdev-extract-study.c index 71354cc4c..03d97c64e 100644
> > --- a/lib/dpif-netdev-extract-study.c
> > +++ b/lib/dpif-netdev-extract-study.c
> > @@ -30,7 +30,9 @@ static atomic_uint32_t mfex_study_pkts_count =
> > MFEX_MAX_PKT_COUNT;
> >  /* Struct to hold miniflow study stats. */  struct study_stats {
> >      uint32_t pkt_count;
> > +    uint32_t pkt_inner_count;
> >      uint32_t impl_hitcount[MFEX_IMPL_MAX];
> > +    uint32_t impl_inner_hitcount[MFEX_IMPL_MAX];
> >  };
> >
> >  /* Define per thread data to hold the study stats. */ @@ -67,6 +69,58
> > @@ mfex_set_study_pkt_cnt(uint32_t pkt_cmp_count, const char *name)
> >      return -EINVAL;
> >  }
> >
> > +
> 
> Remove the extra newline here.
> 
Fixed.

> > +static inline void
> > +mfex_reset_stats(uint32_t *impls_hitcount, uint32_t *pkt_cnt) {
> > +    /* Reset stats so that study function can be called again
> > +     * for next traffic type and optimal function ptr can be
> > +     * chosen.
> > +     */
> 
> I prefer having this comment above the function rather than inside since it
> describes what the whole function does.
> I've reworded the comment to add 'the's and 'an'. How about this:
> Reset stats so that the study function can be called again for the next traffic
> type and an optimal function pointer can be chosen.
> 

Fixed.

> > +    memset(impls_hitcount, 0, sizeof(uint32_t) * MFEX_IMPL_MAX);
> > +    *pkt_cnt = 0;
> > +}
> > +
> > +static inline void
> > +mfex_study_select_best_impls(struct dpif_miniflow_extract_impl
> *mfex_funcs,
> > +                             uint32_t pkt_cnt, uint32_t *impls_arr,
> > +                             atomic_uintptr_t *pmd_func, char *name)
> > +{
> > +
> > +    uint32_t best_func_index = MFEX_IMPL_START_IDX;
> > +    uint32_t max_hits = 0;
> > +
> > +    for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
> > +        if (impls_arr[i] > max_hits) {
> > +            max_hits = impls_arr[i];
> > +            best_func_index = i;
> > +            }
> > +        }
> 
> These 2 braces are indented incorrectly. It should be like this:

Fixed.

> > +        }
> > +    }
> 
> 
> > +
> > +    /* If 50% of the packets hit, enable the function. */
> 
> How about this reword:
> If at least 50% of the packets hit the implementation, enable that
> implementation.
> 
Fixed.

> > +    if (max_hits >= (mfex_study_pkts_count / 2)) {
> > +        atomic_store_relaxed(pmd_func,
> > +                    (uintptr_t) mfex_funcs[best_func_index].extract_func);
> > +        VLOG_INFO("MFEX %s study chose impl %s: (hits %u/%u pkts)",
> > +                  name, mfex_funcs[best_func_index].name, max_hits,
> > +                  pkt_cnt);
> 
> The 'pkt_cnt' doesn't have to be wrapped to the next line.
> 
Fixed.

> > +    } else {
> > +        /* Set the implementation to null for default miniflow. */
> > +        atomic_store_relaxed(pmd_func,
> > +                    (uintptr_t) mfex_funcs[MFEX_IMPL_SCALAR].extract_func);
> > +        VLOG_INFO("Not enough packets matched (%u/%u), disabling"
> > +                  " optimized MFEX.", max_hits, pkt_cnt);
> > +    }
> > +
> > +    /* In debug mode show stats for all the counters. */
> > +    if (VLOG_IS_DBG_ENABLED()) {
> > +        for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
> > +                VLOG_DBG("MFEX study results for implementation %s:"
> > +                         " (hits %u/%u pkts)", mfex_funcs[i].name,
> > +                         impls_arr[i], pkt_cnt);
> > +        }
> > +    }
> > +}
> > +
> >  uint32_t
> >  mfex_study_traffic(struct dp_packet_batch *packets,
> >                     struct netdev_flow_key *keys, @@ -76,10 +130,12 @@
> > mfex_study_traffic(struct dp_packet_batch *packets,  {
> >      uint32_t hitmask = 0;
> >      uint32_t mask = 0;
> > +    uint32_t study_cnt_pkts;
> >      struct dp_netdev_pmd_thread *pmd = pmd_handle;
> >      struct dpif_miniflow_extract_impl *miniflow_funcs;
> >      struct study_stats *stats = mfex_study_get_study_stats_ptr();
> >      miniflow_funcs = dpif_mfex_impl_info_get();
> > +    atomic_read_relaxed(&mfex_study_pkts_count, &study_cnt_pkts);
> >
> >      /* Run traffic optimized miniflow_extract to collect the hitmask
> >       * to be compared after certain packets have been hit to choose
> > @@ -93,7 +149,11 @@ mfex_study_traffic(struct dp_packet_batch
> *packets,
> >          hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size,
> >                                                   in_port, pmd_handle,
> >                                                   md_is_valid);
> > -        stats->impl_hitcount[i] += count_1bits(hitmask);
> > +        if (!md_is_valid) {
> > +            stats->impl_hitcount[i] += count_1bits(hitmask);
> > +        } else {
> > +            stats->impl_inner_hitcount[i] += count_1bits(hitmask);
> > +        }
> >
> >          /* If traffic is not classified then we dont overwrite the keys
> >           * array in minfiflow implementations so its safe to create a
> > @@ -102,54 +162,34 @@ mfex_study_traffic(struct dp_packet_batch
> *packets,
> >          mask |= hitmask;
> >      }
> >
> > -    stats->pkt_count += dp_packet_batch_size(packets);
> > -
> >      /* Choose the best implementation after a minimum packets have been
> >       * processed.
> >       */
> 
> How about updating the comment to:
> Choose the best miniflow extract implementation to use for inner and outer
> packets separately.
> 
Fixed.

> > -    uint32_t study_cnt_pkts;
> > -    atomic_read_relaxed(&mfex_study_pkts_count, &study_cnt_pkts);
> > -
> > -    if (stats->pkt_count >= study_cnt_pkts) {
> > -        uint32_t best_func_index = MFEX_IMPL_START_IDX;
> > -        uint32_t max_hits = 0;
> > -        for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
> > -            if (stats->impl_hitcount[i] > max_hits) {
> > -                max_hits = stats->impl_hitcount[i];
> > -                best_func_index = i;
> > -            }
> > +    if (!md_is_valid) {
> > +        stats->pkt_count += dp_packet_batch_size(packets);
> > +
> > +        if (stats->pkt_count >= study_cnt_pkts) {
> > +            char name[] = "outer";
> > +            mfex_study_select_best_impls(miniflow_funcs, stats->pkt_count,
> > +                             stats->impl_hitcount,
> > +                             (void *)&pmd->miniflow_extract_opt,
> > +                             name);
> > +            mfex_reset_stats(stats->impl_hitcount,
> > + &stats->pkt_count);
> >          }
> >
> > -        /* If 50% of the packets hit, enable the function. */
> > -        if (max_hits >= (mfex_study_pkts_count / 2)) {
> > -            atomic_store_relaxed(&pmd->miniflow_extract_opt,
> > -                                 miniflow_funcs[best_func_index].extract_func);
> > -            VLOG_INFO("MFEX study chose impl %s: (hits %u/%u pkts)",
> > -                      miniflow_funcs[best_func_index].name, max_hits,
> > -                      stats->pkt_count);
> > -        } else {
> > -            /* Set the implementation to null for default miniflow. */
> > -            atomic_store_relaxed(&pmd->miniflow_extract_opt,
> > -                    miniflow_funcs[MFEX_IMPL_SCALAR].extract_func);
> > -            VLOG_INFO("Not enough packets matched (%u/%u), disabling"
> > -                      " optimized MFEX.", max_hits, stats->pkt_count);
> > +    } else {
> > +        stats->pkt_inner_count += dp_packet_batch_size(packets);
> > +
> > +        if (stats->pkt_inner_count >= study_cnt_pkts) {
> > +            char name[] = "inner";
> > +            mfex_study_select_best_impls(miniflow_funcs,
> > +                             stats->pkt_inner_count,
> > +                             stats->impl_inner_hitcount,
> > +                             (void *)&pmd->miniflow_extract_inner_opt,
> > +                             name);
> > +            mfex_reset_stats(stats->impl_inner_hitcount,
> > +                             &stats->pkt_inner_count);
> >          }
> > -
> > -        /* In debug mode show stats for all the counters. */
> > -        if (VLOG_IS_DBG_ENABLED()) {
> > -
> > -            for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
> > -                VLOG_DBG("MFEX study results for implementation %s:"
> > -                         " (hits %u/%u pkts)", miniflow_funcs[i].name,
> > -                         stats->impl_hitcount[i], stats->pkt_count);
> > -            }
> > -        }
> > -
> > -        /* Reset stats so that study function can be called again
> > -         * for next traffic type and optimal function ptr can be
> > -         * chosen.
> > -         */
> > -        memset(stats, 0, sizeof(struct study_stats));
> >      }
> >      return mask;
> >  }
> > --
> > 2.25.1
> 
> Overall, I like the reuse of code. And I think the patch looks good. The above
> are all smaller comments. The only major thing is what I mentioned in patch
> 7/9 about seeing whether we can get the same behaviour without using the
> 'md_is_valid' bool at all.

Thanks, Md_is_valid is a better solution.

Regards
Amber
diff mbox series

Patch

diff --git a/lib/dpif-netdev-extract-study.c b/lib/dpif-netdev-extract-study.c
index 71354cc4c..03d97c64e 100644
--- a/lib/dpif-netdev-extract-study.c
+++ b/lib/dpif-netdev-extract-study.c
@@ -30,7 +30,9 @@  static atomic_uint32_t mfex_study_pkts_count = MFEX_MAX_PKT_COUNT;
 /* Struct to hold miniflow study stats. */
 struct study_stats {
     uint32_t pkt_count;
+    uint32_t pkt_inner_count;
     uint32_t impl_hitcount[MFEX_IMPL_MAX];
+    uint32_t impl_inner_hitcount[MFEX_IMPL_MAX];
 };
 
 /* Define per thread data to hold the study stats. */
@@ -67,6 +69,58 @@  mfex_set_study_pkt_cnt(uint32_t pkt_cmp_count, const char *name)
     return -EINVAL;
 }
 
+
+static inline void
+mfex_reset_stats(uint32_t *impls_hitcount, uint32_t *pkt_cnt) {
+    /* Reset stats so that study function can be called again
+     * for next traffic type and optimal function ptr can be
+     * chosen.
+     */
+    memset(impls_hitcount, 0, sizeof(uint32_t) * MFEX_IMPL_MAX);
+    *pkt_cnt = 0;
+}
+
+static inline void
+mfex_study_select_best_impls(struct dpif_miniflow_extract_impl *mfex_funcs,
+                             uint32_t pkt_cnt, uint32_t *impls_arr,
+                             atomic_uintptr_t *pmd_func, char *name)
+{
+
+    uint32_t best_func_index = MFEX_IMPL_START_IDX;
+    uint32_t max_hits = 0;
+
+    for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
+        if (impls_arr[i] > max_hits) {
+            max_hits = impls_arr[i];
+            best_func_index = i;
+            }
+        }
+
+    /* If 50% of the packets hit, enable the function. */
+    if (max_hits >= (mfex_study_pkts_count / 2)) {
+        atomic_store_relaxed(pmd_func,
+                    (uintptr_t) mfex_funcs[best_func_index].extract_func);
+        VLOG_INFO("MFEX %s study chose impl %s: (hits %u/%u pkts)",
+                  name, mfex_funcs[best_func_index].name, max_hits,
+                  pkt_cnt);
+    } else {
+        /* Set the implementation to null for default miniflow. */
+        atomic_store_relaxed(pmd_func,
+                    (uintptr_t) mfex_funcs[MFEX_IMPL_SCALAR].extract_func);
+        VLOG_INFO("Not enough packets matched (%u/%u), disabling"
+                  " optimized MFEX.", max_hits, pkt_cnt);
+    }
+
+    /* In debug mode show stats for all the counters. */
+    if (VLOG_IS_DBG_ENABLED()) {
+        for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
+                VLOG_DBG("MFEX study results for implementation %s:"
+                         " (hits %u/%u pkts)", mfex_funcs[i].name,
+                         impls_arr[i], pkt_cnt);
+        }
+    }
+}
+
 uint32_t
 mfex_study_traffic(struct dp_packet_batch *packets,
                    struct netdev_flow_key *keys,
@@ -76,10 +130,12 @@  mfex_study_traffic(struct dp_packet_batch *packets,
 {
     uint32_t hitmask = 0;
     uint32_t mask = 0;
+    uint32_t study_cnt_pkts;
     struct dp_netdev_pmd_thread *pmd = pmd_handle;
     struct dpif_miniflow_extract_impl *miniflow_funcs;
     struct study_stats *stats = mfex_study_get_study_stats_ptr();
     miniflow_funcs = dpif_mfex_impl_info_get();
+    atomic_read_relaxed(&mfex_study_pkts_count, &study_cnt_pkts);
 
     /* Run traffic optimized miniflow_extract to collect the hitmask
      * to be compared after certain packets have been hit to choose
@@ -93,7 +149,11 @@  mfex_study_traffic(struct dp_packet_batch *packets,
         hitmask = miniflow_funcs[i].extract_func(packets, keys, keys_size,
                                                  in_port, pmd_handle,
                                                  md_is_valid);
-        stats->impl_hitcount[i] += count_1bits(hitmask);
+        if (!md_is_valid) {
+            stats->impl_hitcount[i] += count_1bits(hitmask);
+        } else {
+            stats->impl_inner_hitcount[i] += count_1bits(hitmask);
+        }
 
         /* If traffic is not classified then we dont overwrite the keys
          * array in minfiflow implementations so its safe to create a
@@ -102,54 +162,34 @@  mfex_study_traffic(struct dp_packet_batch *packets,
         mask |= hitmask;
     }
 
-    stats->pkt_count += dp_packet_batch_size(packets);
-
     /* Choose the best implementation after a minimum packets have been
      * processed.
      */
-    uint32_t study_cnt_pkts;
-    atomic_read_relaxed(&mfex_study_pkts_count, &study_cnt_pkts);
-
-    if (stats->pkt_count >= study_cnt_pkts) {
-        uint32_t best_func_index = MFEX_IMPL_START_IDX;
-        uint32_t max_hits = 0;
-        for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
-            if (stats->impl_hitcount[i] > max_hits) {
-                max_hits = stats->impl_hitcount[i];
-                best_func_index = i;
-            }
+    if (!md_is_valid) {
+        stats->pkt_count += dp_packet_batch_size(packets);
+
+        if (stats->pkt_count >= study_cnt_pkts) {
+            char name[] = "outer";
+            mfex_study_select_best_impls(miniflow_funcs, stats->pkt_count,
+                             stats->impl_hitcount,
+                             (void *)&pmd->miniflow_extract_opt,
+                             name);
+            mfex_reset_stats(stats->impl_hitcount, &stats->pkt_count);
         }
 
-        /* If 50% of the packets hit, enable the function. */
-        if (max_hits >= (mfex_study_pkts_count / 2)) {
-            atomic_store_relaxed(&pmd->miniflow_extract_opt,
-                                 miniflow_funcs[best_func_index].extract_func);
-            VLOG_INFO("MFEX study chose impl %s: (hits %u/%u pkts)",
-                      miniflow_funcs[best_func_index].name, max_hits,
-                      stats->pkt_count);
-        } else {
-            /* Set the implementation to null for default miniflow. */
-            atomic_store_relaxed(&pmd->miniflow_extract_opt,
-                    miniflow_funcs[MFEX_IMPL_SCALAR].extract_func);
-            VLOG_INFO("Not enough packets matched (%u/%u), disabling"
-                      " optimized MFEX.", max_hits, stats->pkt_count);
+    } else {
+        stats->pkt_inner_count += dp_packet_batch_size(packets);
+
+        if (stats->pkt_inner_count >= study_cnt_pkts) {
+            char name[] = "inner";
+            mfex_study_select_best_impls(miniflow_funcs,
+                             stats->pkt_inner_count,
+                             stats->impl_inner_hitcount,
+                             (void *)&pmd->miniflow_extract_inner_opt,
+                             name);
+            mfex_reset_stats(stats->impl_inner_hitcount,
+                             &stats->pkt_inner_count);
         }
-
-        /* In debug mode show stats for all the counters. */
-        if (VLOG_IS_DBG_ENABLED()) {
-
-            for (int i = MFEX_IMPL_START_IDX; i < MFEX_IMPL_MAX; i++) {
-                VLOG_DBG("MFEX study results for implementation %s:"
-                         " (hits %u/%u pkts)", miniflow_funcs[i].name,
-                         stats->impl_hitcount[i], stats->pkt_count);
-            }
-        }
-
-        /* Reset stats so that study function can be called again
-         * for next traffic type and optimal function ptr can be
-         * chosen.
-         */
-        memset(stats, 0, sizeof(struct study_stats));
     }
     return mask;
 }