[ovs-dev,2/5] dpif-netdev: Make PMD auto load balance use common rxq scheduling.

Message ID 20210604211856.915563-3-ktraynor@redhat.com
State Superseded, archived
Series Rxq scheduling updates.

Commit Message

Kevin Traynor June 4, 2021, 9:18 p.m. UTC
PMD auto load balance had its own separate implementation of the
rxq scheduling that it used for dry runs. This was done because
previously the rxq scheduling was not reusable for a dry run.

Apart from the code duplication (which is reason enough on its own
to replace it), this meant that any further rxq scheduling changes
or new assignment types would also have to be duplicated in the
auto load balance code.

This patch replaces the PMD auto load balance rxq scheduling code
so that it reuses the common rxq scheduling code.

The behaviour does not change from a user perspective, except the logs
are updated to be more consistent.

As the dry run will compare the pmd load variances for current and
estimated assignments, new functions are added to populate the current
assignments and calculate variance on the rxq scheduling data structs.

Now that the new rxq scheduling data structures are being used in
PMD auto load balance, the older rr_* data structs and associated
functions can be removed.

Signed-off-by: Kevin Traynor <ktraynor@redhat.com>
---
 lib/dpif-netdev.c | 511 +++++++++++++++-------------------------------
 1 file changed, 164 insertions(+), 347 deletions(-)
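
For context, a minimal standalone sketch (not part of the patch) of the
variance-based improvement check described above. The variance() stand-in
below only approximates the existing helper in dpif-netdev.c, and the
per-PMD busy percentages are invented for illustration:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for the variance() helper: mean of the squared deviations of
 * the per-PMD busy percentages. */
static uint64_t
variance(uint64_t a[], int n)
{
    uint64_t sum = 0, sq_diff = 0, mean;

    if (!n) {
        return 0;
    }
    for (int i = 0; i < n; i++) {
        sum += a[i];
    }
    mean = sum / n;
    for (int i = 0; i < n; i++) {
        uint64_t d = a[i] > mean ? a[i] - mean : mean - a[i];
        sq_diff += d * d;
    }
    return sq_diff / n;
}

int
main(void)
{
    /* Invented per-PMD busy percentages for the current assignments and
     * for the estimated (dry run) assignments. */
    uint64_t current_load[] = { 90, 10 };
    uint64_t estimate_load[] = { 55, 45 };
    uint64_t current = variance(current_load, 2);
    uint64_t estimate = variance(estimate_load, 2);
    uint64_t improvement = 0;

    if (estimate < current) {
        improvement = ((current - estimate) * 100) / current;
    }
    printf("Current variance %"PRIu64", Estimated variance %"PRIu64", "
           "improvement %"PRIu64"%%\n", current, estimate, improvement);
    /* The dry run reports success only if 'improvement' reaches the
     * configured threshold (rebalance_improve_thresh in the patch). */
    return 0;
}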

Comments

Pai G, Sunil June 23, 2021, 11:21 a.m. UTC | #1
Hey Kevin,

Patch looks good to me.
Builds fine, and all test cases here http://patchwork.ozlabs.org/project/openvswitch/patch/20210316154532.127858-1-ktraynor@redhat.com/ pass as well.

Some minor nits inline:

<snipped>

> +static bool
> +pmd_rebalance_dry_run(struct dp_netdev *dp)
> +    OVS_REQUIRES(dp->port_mutex)
> +{
> +    struct sched_numa_list *numa_list_cur;
> +    struct sched_numa_list *numa_list_est;
> +    bool thresh_met = false;
> +    uint64_t current, estimate;

current and estimate aren't specific; maybe current_var and estimate_var?

> +    uint64_t improvement = 0;
> +
> +    VLOG_DBG("PMD auto load balance performing dry run.");
> +
> +    /* Populate current assignments. */
> +    numa_list_cur = xzalloc(sizeof *numa_list_cur);
> +    sched_numa_list_populate(numa_list_cur, dp);
> +    sched_numa_list_assignments(numa_list_cur, dp);
> +
> +    /* Populate estimated assignments. */
> +    numa_list_est = xzalloc(sizeof *numa_list_est);
> +    sched_numa_list_populate(numa_list_est, dp);
> +    sched_numa_list_schedule(numa_list_est, dp,
> +                             dp->pmd_rxq_assign_cyc, VLL_DBG);
> +
> +    /* Check if cross-numa polling, there is only one numa with PMDs. */
> +    if (!sched_numa_list_cross_numa_polling(numa_list_est) ||
> +            sched_numa_list_count(numa_list_est) == 1) {
> +
> +        /* Calculate variances. */
> +        current = sched_numa_list_variance(numa_list_cur);
> +        estimate = sched_numa_list_variance(numa_list_est);
> +
> +        if (estimate < current) {
> +             improvement = ((current - estimate) * 100) / current;
> +        }
> +        VLOG_DBG("Current variance %"PRIu64" Estimated variance %"PRIu64"",
> +                 current, estimate);

Space alignment issue: an extra space is needed at the start of the second line to align it with the first one?
Also, a comma or full stop after Current variance %"PRIu64"?

> +        VLOG_DBG("Variance improvement %"PRIu64"%%", improvement);
> +
> +        if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
> +            thresh_met = true;
> +            VLOG_DBG("PMD load variance improvement threshold %u%% "
> +                      "is met", dp->pmd_alb.rebalance_improve_thresh);

Space alignment issue. An extra space is added here before "is met".


> +        } else {
> +            VLOG_DBG("PMD load variance improvement threshold %u%% is not met",
> +                      dp->pmd_alb.rebalance_improve_thresh);
> +        }
> +    } else {
> +        VLOG_DBG("PMD auto load balance detected cross-numa polling with "
> +                 "multiple numa nodes. Unable to accurately estimate.");
> +    }
> +
> +    sched_numa_list_free_entries(numa_list_cur);
> +    sched_numa_list_free_entries(numa_list_est);
> +
> +    free(numa_list_cur);
> +    free(numa_list_est);
> +
> +    return thresh_met;
> +}
> +
>  static void
>  reload_affected_pmds(struct dp_netdev *dp)
> @@ -5897,215 +5925,4 @@ variance(uint64_t a[], int n)
>  }

<snipped>
David Marchand June 24, 2021, 2:30 p.m. UTC | #2
On Fri, Jun 4, 2021 at 11:19 PM Kevin Traynor <ktraynor@redhat.com> wrote:
>
> PMD auto load balance had its own separate implementation of the
> rxq scheduling that it used for dry runs. This was done because
> previously the rxq scheduling was not reusable for a dry run.
>
> Apart from the code duplication (which is reason enough on its own
> to replace it), this meant that any further rxq scheduling changes
> or new assignment types would also have to be duplicated in the
> auto load balance code.
>
> This patch replaces the PMD auto load balance rxq scheduling code
> so that it reuses the common rxq scheduling code.
>
> The behaviour does not change from a user perspective, except the logs
> are updated to be more consistent.
>
> As the dry run will compare the pmd load variances for current and
> estimated assignments, new functions are added to populate the current
> assignments and calculate variance on the rxq scheduling data structs.
>
> Now that the new rxq scheduling data structures are being used in
> PMD auto load balance, the older rr_* data structs and associated
> functions can be removed.
>
> Signed-off-by: Kevin Traynor <ktraynor@redhat.com>
> ---
>  lib/dpif-netdev.c | 511 +++++++++++++++-------------------------------
>  1 file changed, 164 insertions(+), 347 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 57d23e112..eaa4e9733 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -4872,138 +4872,4 @@ port_reconfigure(struct dp_netdev_port *port)
>  }
>
> -struct rr_numa_list {
> -    struct hmap numas;  /* Contains 'struct rr_numa' */
> -};
> -
> -struct rr_numa {
> -    struct hmap_node node;
> -
> -    int numa_id;
> -
> -    /* Non isolated pmds on numa node 'numa_id' */
> -    struct dp_netdev_pmd_thread **pmds;
> -    int n_pmds;
> -
> -    int cur_index;
> -    bool idx_inc;
> -};
> -
> -static size_t
> -rr_numa_list_count(struct rr_numa_list *rr)
> -{
> -    return hmap_count(&rr->numas);
> -}
> -
> -static struct rr_numa *
> -rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
> -{
> -    struct rr_numa *numa;
> -
> -    HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
> -        if (numa->numa_id == numa_id) {
> -            return numa;
> -        }
> -    }
> -
> -    return NULL;
> -}
> -
> -/* Returns the next node in numa list following 'numa' in round-robin fashion.
> - * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
> - * Returns NULL if 'rr' numa list is empty. */
> -static struct rr_numa *
> -rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
> -{
> -    struct hmap_node *node = NULL;
> -
> -    if (numa) {
> -        node = hmap_next(&rr->numas, &numa->node);
> -    }
> -    if (!node) {
> -        node = hmap_first(&rr->numas);
> -    }
> -
> -    return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
> -}
> -
> -static void
> -rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
> -{
> -    struct dp_netdev_pmd_thread *pmd;
> -    struct rr_numa *numa;
> -
> -    hmap_init(&rr->numas);
> -
> -    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> -        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
> -            continue;
> -        }
> -
> -        numa = rr_numa_list_lookup(rr, pmd->numa_id);
> -        if (!numa) {
> -            numa = xzalloc(sizeof *numa);
> -            numa->numa_id = pmd->numa_id;
> -            hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
> -        }
> -        numa->n_pmds++;
> -        numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
> -        numa->pmds[numa->n_pmds - 1] = pmd;
> -        /* At least one pmd so initialise curr_idx and idx_inc. */
> -        numa->cur_index = 0;
> -        numa->idx_inc = true;
> -    }
> -}
> -
> -/*
> - * Returns the next pmd from the numa node.
> - *
> - * If 'updown' is 'true' it will alternate between selecting the next pmd in
> - * either an up or down walk, switching between up/down when the first or last
> - * core is reached. e.g. 1,2,3,3,2,1,1,2...
> - *
> - * If 'updown' is 'false' it will select the next pmd wrapping around when last
> - * core reached. e.g. 1,2,3,1,2,3,1,2...
> - */
> -static struct dp_netdev_pmd_thread *
> -rr_numa_get_pmd(struct rr_numa *numa, bool updown)
> -{
> -    int numa_idx = numa->cur_index;
> -
> -    if (numa->idx_inc == true) {
> -        /* Incrementing through list of pmds. */
> -        if (numa->cur_index == numa->n_pmds-1) {
> -            /* Reached the last pmd. */
> -            if (updown) {
> -                numa->idx_inc = false;
> -            } else {
> -                numa->cur_index = 0;
> -            }
> -        } else {
> -            numa->cur_index++;
> -        }
> -    } else {
> -        /* Decrementing through list of pmds. */
> -        if (numa->cur_index == 0) {
> -            /* Reached the first pmd. */
> -            numa->idx_inc = true;
> -        } else {
> -            numa->cur_index--;
> -        }
> -    }
> -    return numa->pmds[numa_idx];
> -}
> -
> -static void
> -rr_numa_list_destroy(struct rr_numa_list *rr)
> -{
> -    struct rr_numa *numa;
> -
> -    HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
> -        free(numa->pmds);
> -        free(numa);
> -    }
> -    hmap_destroy(&rr->numas);
> -}
> -
>  struct sched_numa_list {
>      struct hmap numas;  /* Contains 'struct sched_numa'. */
> @@ -5033,4 +4899,6 @@ struct sched_numa {
>  };
>
> +static uint64_t variance(uint64_t a[], int n);

Nit: this fwd declaration can be moved right before
sched_numa_list_variance() which is the only function that uses it.
Or variance() itself could be moved.


> +
>  static size_t
>  sched_numa_list_count(struct sched_numa_list *numa_list)
> @@ -5181,4 +5049,36 @@ sched_add_rxq_to_sched_pmd(struct sched_pmd *sched_pmd,
>  }
>
> +static void
> +sched_numa_list_assignments(struct sched_numa_list *numa_list,
> +                                     struct dp_netdev *dp)

Indent of second line.
+
Don't we need an OVS_REQUIRES(dp->port_mutex) annotation?



> +{
> +    struct dp_netdev_port *port;
> +
> +    /* For each port. */
> +    HMAP_FOR_EACH (port, node, &dp->ports) {
> +        if (!netdev_is_pmd(port->netdev)) {
> +            continue;
> +        }

[snip]
Kevin Traynor July 1, 2021, 11:16 p.m. UTC | #3
On 24/06/2021 15:30, David Marchand wrote:
> On Fri, Jun 4, 2021 at 11:19 PM Kevin Traynor <ktraynor@redhat.com> wrote:
>>
>> PMD auto load balance had its own separate implementation of the
>> rxq scheduling that it used for dry runs. This was done because
>> previously the rxq scheduling was not reusable for a dry run.
>>
>> Apart from the code duplication (which is reason enough on its own
>> to replace it), this meant that any further rxq scheduling changes
>> or new assignment types would also have to be duplicated in the
>> auto load balance code.
>>
>> This patch replaces the PMD auto load balance rxq scheduling code
>> so that it reuses the common rxq scheduling code.
>>
>> The behaviour does not change from a user perspective, except the logs
>> are updated to be more consistent.
>>
>> As the dry run will compare the pmd load variances for current and
>> estimated assignments, new functions are added to populate the current
>> assignments and calculate variance on the rxq scheduling data structs.
>>
>> Now that the new rxq scheduling data structures are being used in
>> PMD auto load balance, the older rr_* data structs and associated
>> functions can be removed.
>>
>> Signed-off-by: Kevin Traynor <ktraynor@redhat.com>
>> ---
>>  lib/dpif-netdev.c | 511 +++++++++++++++-------------------------------
>>  1 file changed, 164 insertions(+), 347 deletions(-)
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 57d23e112..eaa4e9733 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -4872,138 +4872,4 @@ port_reconfigure(struct dp_netdev_port *port)
>>  }
>>
>> -struct rr_numa_list {
>> -    struct hmap numas;  /* Contains 'struct rr_numa' */
>> -};
>> -
>> -struct rr_numa {
>> -    struct hmap_node node;
>> -
>> -    int numa_id;
>> -
>> -    /* Non isolated pmds on numa node 'numa_id' */
>> -    struct dp_netdev_pmd_thread **pmds;
>> -    int n_pmds;
>> -
>> -    int cur_index;
>> -    bool idx_inc;
>> -};
>> -
>> -static size_t
>> -rr_numa_list_count(struct rr_numa_list *rr)
>> -{
>> -    return hmap_count(&rr->numas);
>> -}
>> -
>> -static struct rr_numa *
>> -rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
>> -{
>> -    struct rr_numa *numa;
>> -
>> -    HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
>> -        if (numa->numa_id == numa_id) {
>> -            return numa;
>> -        }
>> -    }
>> -
>> -    return NULL;
>> -}
>> -
>> -/* Returns the next node in numa list following 'numa' in round-robin fashion.
>> - * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
>> - * Returns NULL if 'rr' numa list is empty. */
>> -static struct rr_numa *
>> -rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
>> -{
>> -    struct hmap_node *node = NULL;
>> -
>> -    if (numa) {
>> -        node = hmap_next(&rr->numas, &numa->node);
>> -    }
>> -    if (!node) {
>> -        node = hmap_first(&rr->numas);
>> -    }
>> -
>> -    return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
>> -}
>> -
>> -static void
>> -rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
>> -{
>> -    struct dp_netdev_pmd_thread *pmd;
>> -    struct rr_numa *numa;
>> -
>> -    hmap_init(&rr->numas);
>> -
>> -    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>> -        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
>> -            continue;
>> -        }
>> -
>> -        numa = rr_numa_list_lookup(rr, pmd->numa_id);
>> -        if (!numa) {
>> -            numa = xzalloc(sizeof *numa);
>> -            numa->numa_id = pmd->numa_id;
>> -            hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
>> -        }
>> -        numa->n_pmds++;
>> -        numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
>> -        numa->pmds[numa->n_pmds - 1] = pmd;
>> -        /* At least one pmd so initialise curr_idx and idx_inc. */
>> -        numa->cur_index = 0;
>> -        numa->idx_inc = true;
>> -    }
>> -}
>> -
>> -/*
>> - * Returns the next pmd from the numa node.
>> - *
>> - * If 'updown' is 'true' it will alternate between selecting the next pmd in
>> - * either an up or down walk, switching between up/down when the first or last
>> - * core is reached. e.g. 1,2,3,3,2,1,1,2...
>> - *
>> - * If 'updown' is 'false' it will select the next pmd wrapping around when last
>> - * core reached. e.g. 1,2,3,1,2,3,1,2...
>> - */
>> -static struct dp_netdev_pmd_thread *
>> -rr_numa_get_pmd(struct rr_numa *numa, bool updown)
>> -{
>> -    int numa_idx = numa->cur_index;
>> -
>> -    if (numa->idx_inc == true) {
>> -        /* Incrementing through list of pmds. */
>> -        if (numa->cur_index == numa->n_pmds-1) {
>> -            /* Reached the last pmd. */
>> -            if (updown) {
>> -                numa->idx_inc = false;
>> -            } else {
>> -                numa->cur_index = 0;
>> -            }
>> -        } else {
>> -            numa->cur_index++;
>> -        }
>> -    } else {
>> -        /* Decrementing through list of pmds. */
>> -        if (numa->cur_index == 0) {
>> -            /* Reached the first pmd. */
>> -            numa->idx_inc = true;
>> -        } else {
>> -            numa->cur_index--;
>> -        }
>> -    }
>> -    return numa->pmds[numa_idx];
>> -}
>> -
>> -static void
>> -rr_numa_list_destroy(struct rr_numa_list *rr)
>> -{
>> -    struct rr_numa *numa;
>> -
>> -    HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
>> -        free(numa->pmds);
>> -        free(numa);
>> -    }
>> -    hmap_destroy(&rr->numas);
>> -}
>> -
>>  struct sched_numa_list {
>>      struct hmap numas;  /* Contains 'struct sched_numa'. */
>> @@ -5033,4 +4899,6 @@ struct sched_numa {
>>  };
>>
>> +static uint64_t variance(uint64_t a[], int n);
> 
> Nit: this fwd declaration can be moved right before
> sched_numa_list_variance() which is the only function that uses it.
> Or variance() itself could be moved.
> 

I wanted to avoid moving the variance() fn so as not to cause unnecessary
code churn, so I will move the declaration instead.

> 
>> +
>>  static size_t
>>  sched_numa_list_count(struct sched_numa_list *numa_list)
>> @@ -5181,4 +5049,36 @@ sched_add_rxq_to_sched_pmd(struct sched_pmd *sched_pmd,
>>  }
>>
>> +static void
>> +sched_numa_list_assignments(struct sched_numa_list *numa_list,
>> +                                     struct dp_netdev *dp)
> 
> Indent of second line.

fixed

> +
> Don't we need an OVS_REQUIRES(dp->port_mutex) annotation?
> 

It will have the lock, but I need to check again about the annotation.
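
For reference, if the annotation is wanted (and with the second-line indent
fixed), the function would presumably end up looking something like this
sketch:

static void
sched_numa_list_assignments(struct sched_numa_list *numa_list,
                            struct dp_netdev *dp)
    OVS_REQUIRES(dp->port_mutex)
{
    /* ... body unchanged from the patch ... */
}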

> 
> 
>> +{
>> +    struct dp_netdev_port *port;
>> +
>> +    /* For each port. */
>> +    HMAP_FOR_EACH (port, node, &dp->ports) {
>> +        if (!netdev_is_pmd(port->netdev)) {
>> +            continue;
>> +        }
> 
> [snip]
> 
>
Kevin Traynor July 1, 2021, 11:17 p.m. UTC | #4
On 23/06/2021 12:21, Pai G, Sunil wrote:
> Hey Kevin,
> 
> Patch looks good to me.
> Builds fine, and all test cases here http://patchwork.ozlabs.org/project/openvswitch/patch/20210316154532.127858-1-ktraynor@redhat.com/ pass as well.
> 
> Some minor nits inline:
> 
> <snipped>
> 
>> +static bool
>> +pmd_rebalance_dry_run(struct dp_netdev *dp)
>> +    OVS_REQUIRES(dp->port_mutex)
>> +{
>> +    struct sched_numa_list *numa_list_cur;
>> +    struct sched_numa_list *numa_list_est;
>> +    bool thresh_met = false;
>> +    uint64_t current, estimate;
> 
> current and estimate aren't specific; maybe current_var and estimate_var?
> 

changed to *_var.

>> +    uint64_t improvement = 0;
>> +
>> +    VLOG_DBG("PMD auto load balance performing dry run.");
>> +
>> +    /* Populate current assignments. */
>> +    numa_list_cur = xzalloc(sizeof *numa_list_cur);
>> +    sched_numa_list_populate(numa_list_cur, dp);
>> +    sched_numa_list_assignments(numa_list_cur, dp);
>> +
>> +    /* Populate estimated assignments. */
>> +    numa_list_est = xzalloc(sizeof *numa_list_est);
>> +    sched_numa_list_populate(numa_list_est, dp);
>> +    sched_numa_list_schedule(numa_list_est, dp,
>> +                             dp->pmd_rxq_assign_cyc, VLL_DBG);
>> +
>> +    /* Check if cross-numa polling, there is only one numa with PMDs. */
>> +    if (!sched_numa_list_cross_numa_polling(numa_list_est) ||
>> +            sched_numa_list_count(numa_list_est) == 1) {
>> +
>> +        /* Calculate variances. */
>> +        current = sched_numa_list_variance(numa_list_cur);
>> +        estimate = sched_numa_list_variance(numa_list_est);
>> +
>> +        if (estimate < current) {
>> +             improvement = ((current - estimate) * 100) / current;
>> +        }
>> +        VLOG_DBG("Current variance %"PRIu64" Estimated variance %"PRIu64"",
>> +                 current, estimate);
> 
> Space alignment issue: an extra space is needed at the start of the second line to align it with the first one?

yes, fixed.

> Also, a comma or full stop after Current variance %"PRIu64"?
> 

added

>> +        VLOG_DBG("Variance improvement %"PRIu64"%%", improvement);
>> +
>> +        if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
>> +            thresh_met = true;
>> +            VLOG_DBG("PMD load variance improvement threshold %u%% "
>> +                      "is met", dp->pmd_alb.rebalance_improve_thresh);
> 
> Space alignment issue. An extra space is added here before "is met".
> 

fixed

> 
>> +        } else {
>> +            VLOG_DBG("PMD load variance improvement threshold %u%% is not met",
>> +                      dp->pmd_alb.rebalance_improve_thresh);
>> +        }
>> +    } else {
>> +        VLOG_DBG("PMD auto load balance detected cross-numa polling with "
>> +                 "multiple numa nodes. Unable to accurately estimate.");
>> +    }
>> +
>> +    sched_numa_list_free_entries(numa_list_cur);
>> +    sched_numa_list_free_entries(numa_list_est);
>> +
>> +    free(numa_list_cur);
>> +    free(numa_list_est);
>> +
>> +    return thresh_met;
>> +}
>> +
>>  static void
>>  reload_affected_pmds(struct dp_netdev *dp)
>> @@ -5897,215 +5925,4 @@ variance(uint64_t a[], int n)
>>  }
> 
> <snipped>
>

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 57d23e112..eaa4e9733 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -4872,138 +4872,4 @@  port_reconfigure(struct dp_netdev_port *port)
 }
 
-struct rr_numa_list {
-    struct hmap numas;  /* Contains 'struct rr_numa' */
-};
-
-struct rr_numa {
-    struct hmap_node node;
-
-    int numa_id;
-
-    /* Non isolated pmds on numa node 'numa_id' */
-    struct dp_netdev_pmd_thread **pmds;
-    int n_pmds;
-
-    int cur_index;
-    bool idx_inc;
-};
-
-static size_t
-rr_numa_list_count(struct rr_numa_list *rr)
-{
-    return hmap_count(&rr->numas);
-}
-
-static struct rr_numa *
-rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
-{
-    struct rr_numa *numa;
-
-    HMAP_FOR_EACH_WITH_HASH (numa, node, hash_int(numa_id, 0), &rr->numas) {
-        if (numa->numa_id == numa_id) {
-            return numa;
-        }
-    }
-
-    return NULL;
-}
-
-/* Returns the next node in numa list following 'numa' in round-robin fashion.
- * Returns first node if 'numa' is a null pointer or the last node in 'rr'.
- * Returns NULL if 'rr' numa list is empty. */
-static struct rr_numa *
-rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
-{
-    struct hmap_node *node = NULL;
-
-    if (numa) {
-        node = hmap_next(&rr->numas, &numa->node);
-    }
-    if (!node) {
-        node = hmap_first(&rr->numas);
-    }
-
-    return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
-}
-
-static void
-rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
-{
-    struct dp_netdev_pmd_thread *pmd;
-    struct rr_numa *numa;
-
-    hmap_init(&rr->numas);
-
-    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
-            continue;
-        }
-
-        numa = rr_numa_list_lookup(rr, pmd->numa_id);
-        if (!numa) {
-            numa = xzalloc(sizeof *numa);
-            numa->numa_id = pmd->numa_id;
-            hmap_insert(&rr->numas, &numa->node, hash_int(pmd->numa_id, 0));
-        }
-        numa->n_pmds++;
-        numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
-        numa->pmds[numa->n_pmds - 1] = pmd;
-        /* At least one pmd so initialise curr_idx and idx_inc. */
-        numa->cur_index = 0;
-        numa->idx_inc = true;
-    }
-}
-
-/*
- * Returns the next pmd from the numa node.
- *
- * If 'updown' is 'true' it will alternate between selecting the next pmd in
- * either an up or down walk, switching between up/down when the first or last
- * core is reached. e.g. 1,2,3,3,2,1,1,2...
- *
- * If 'updown' is 'false' it will select the next pmd wrapping around when last
- * core reached. e.g. 1,2,3,1,2,3,1,2...
- */
-static struct dp_netdev_pmd_thread *
-rr_numa_get_pmd(struct rr_numa *numa, bool updown)
-{
-    int numa_idx = numa->cur_index;
-
-    if (numa->idx_inc == true) {
-        /* Incrementing through list of pmds. */
-        if (numa->cur_index == numa->n_pmds-1) {
-            /* Reached the last pmd. */
-            if (updown) {
-                numa->idx_inc = false;
-            } else {
-                numa->cur_index = 0;
-            }
-        } else {
-            numa->cur_index++;
-        }
-    } else {
-        /* Decrementing through list of pmds. */
-        if (numa->cur_index == 0) {
-            /* Reached the first pmd. */
-            numa->idx_inc = true;
-        } else {
-            numa->cur_index--;
-        }
-    }
-    return numa->pmds[numa_idx];
-}
-
-static void
-rr_numa_list_destroy(struct rr_numa_list *rr)
-{
-    struct rr_numa *numa;
-
-    HMAP_FOR_EACH_POP (numa, node, &rr->numas) {
-        free(numa->pmds);
-        free(numa);
-    }
-    hmap_destroy(&rr->numas);
-}
-
 struct sched_numa_list {
     struct hmap numas;  /* Contains 'struct sched_numa'. */
@@ -5033,4 +4899,6 @@  struct sched_numa {
 };
 
+static uint64_t variance(uint64_t a[], int n);
+
 static size_t
 sched_numa_list_count(struct sched_numa_list *numa_list)
@@ -5181,4 +5049,36 @@  sched_add_rxq_to_sched_pmd(struct sched_pmd *sched_pmd,
 }
 
+static void
+sched_numa_list_assignments(struct sched_numa_list *numa_list,
+                                     struct dp_netdev *dp)
+{
+    struct dp_netdev_port *port;
+
+    /* For each port. */
+    HMAP_FOR_EACH (port, node, &dp->ports) {
+        if (!netdev_is_pmd(port->netdev)) {
+            continue;
+        }
+        /* For each rxq on the port. */
+        for (unsigned qid = 0; qid < port->n_rxq; qid++) {
+            struct dp_netdev_rxq *rxq = &port->rxqs[qid];
+            struct sched_pmd *sched_pmd;
+            uint64_t proc_cycles = 0;
+
+            for (int i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+                proc_cycles  += dp_netdev_rxq_get_intrvl_cycles(rxq, i);
+            }
+
+            sched_pmd = find_sched_pmd_by_pmd(numa_list, rxq->pmd);
+            if (sched_pmd) {
+                if (rxq->core_id != OVS_CORE_UNSPEC) {
+                    sched_pmd->isolated = true;
+                }
+                sched_add_rxq_to_sched_pmd(sched_pmd, rxq, proc_cycles);
+            }
+        }
+    }
+}
+
 static void
 sched_numa_list_put_in_place(struct sched_numa_list *numa_list)
@@ -5204,4 +5104,31 @@  sched_numa_list_put_in_place(struct sched_numa_list *numa_list)
 }
 
+static bool
+sched_numa_list_cross_numa_polling(struct sched_numa_list *numa_list)
+{
+    struct sched_numa *numa;
+
+    /* For each numa */
+    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
+        /* For each pmd */
+        for (int i = 0; i < numa->n_pmds; i++) {
+            struct sched_pmd *sched_pmd;
+
+            sched_pmd = &numa->pmds[i];
+            /* For each rxq. */
+            for (unsigned k = 0; k < sched_pmd->n_rxq; k++) {
+                struct dp_netdev_rxq *rxq = sched_pmd->rxqs[k];
+
+                if (!sched_pmd->isolated &&
+                    rxq->pmd->numa_id !=
+                        netdev_get_numa_id(rxq->port->netdev)) {
+                    return true;
+                }
+            }
+        }
+    }
+    return false;
+}
+
 static unsigned
 sched_get_numa_pmd_noniso(struct sched_numa *numa)
@@ -5516,4 +5443,105 @@  rxq_scheduling(struct dp_netdev *dp) OVS_REQUIRES(dp->port_mutex)
 }
 
+static uint64_t
+sched_numa_list_variance(struct sched_numa_list *numa_list)
+{
+    struct sched_numa *numa;
+    uint64_t *percent_busy = NULL;
+    unsigned total_pmds = 0;
+    int n_proc = 0;
+    uint64_t var;
+
+    HMAP_FOR_EACH (numa, node, &numa_list->numas) {
+        total_pmds += numa->n_pmds;
+        percent_busy = xrealloc(percent_busy,
+                                total_pmds * sizeof *percent_busy);
+
+        for (unsigned i = 0; i < numa->n_pmds; i++) {
+            struct sched_pmd *sched_pmd;
+            uint64_t total_cycles = 0;
+
+            sched_pmd = &numa->pmds[i];
+            /* Exclude isolated PMDs from variance calculations. */
+            if (sched_pmd->isolated == true) {
+                continue;
+            }
+            /* Get the total pmd cycles for an interval. */
+            atomic_read_relaxed(&sched_pmd->pmd->intrvl_cycles, &total_cycles);
+
+            if (total_cycles) {
+                /* Estimate the cycles to cover all intervals. */
+                total_cycles *= PMD_RXQ_INTERVAL_MAX;
+                percent_busy[n_proc++] = (sched_pmd->pmd_proc_cycles * 100)
+                                             / total_cycles;
+            } else {
+                percent_busy[n_proc++] = 0;
+            }
+        }
+    }
+    var = variance(percent_busy, n_proc);
+    free(percent_busy);
+    return var;
+}
+
+static bool
+pmd_rebalance_dry_run(struct dp_netdev *dp)
+    OVS_REQUIRES(dp->port_mutex)
+{
+    struct sched_numa_list *numa_list_cur;
+    struct sched_numa_list *numa_list_est;
+    bool thresh_met = false;
+    uint64_t current, estimate;
+    uint64_t improvement = 0;
+
+    VLOG_DBG("PMD auto load balance performing dry run.");
+
+    /* Populate current assignments. */
+    numa_list_cur = xzalloc(sizeof *numa_list_cur);
+    sched_numa_list_populate(numa_list_cur, dp);
+    sched_numa_list_assignments(numa_list_cur, dp);
+
+    /* Populate estimated assignments. */
+    numa_list_est = xzalloc(sizeof *numa_list_est);
+    sched_numa_list_populate(numa_list_est, dp);
+    sched_numa_list_schedule(numa_list_est, dp,
+                             dp->pmd_rxq_assign_cyc, VLL_DBG);
+
+    /* Check if cross-numa polling, there is only one numa with PMDs. */
+    if (!sched_numa_list_cross_numa_polling(numa_list_est) ||
+            sched_numa_list_count(numa_list_est) == 1) {
+
+        /* Calculate variances. */
+        current = sched_numa_list_variance(numa_list_cur);
+        estimate = sched_numa_list_variance(numa_list_est);
+
+        if (estimate < current) {
+             improvement = ((current - estimate) * 100) / current;
+        }
+        VLOG_DBG("Current variance %"PRIu64" Estimated variance %"PRIu64"",
+                 current, estimate);
+        VLOG_DBG("Variance improvement %"PRIu64"%%", improvement);
+
+        if (improvement >= dp->pmd_alb.rebalance_improve_thresh) {
+            thresh_met = true;
+            VLOG_DBG("PMD load variance improvement threshold %u%% "
+                      "is met", dp->pmd_alb.rebalance_improve_thresh);
+        } else {
+            VLOG_DBG("PMD load variance improvement threshold %u%% is not met",
+                      dp->pmd_alb.rebalance_improve_thresh);
+        }
+    } else {
+        VLOG_DBG("PMD auto load balance detected cross-numa polling with "
+                 "multiple numa nodes. Unable to accurately estimate.");
+    }
+
+    sched_numa_list_free_entries(numa_list_cur);
+    sched_numa_list_free_entries(numa_list_est);
+
+    free(numa_list_cur);
+    free(numa_list_est);
+
+    return thresh_met;
+}
+
 static void
 reload_affected_pmds(struct dp_netdev *dp)
@@ -5897,215 +5925,4 @@  variance(uint64_t a[], int n)
 }
 
-
-/* Returns the variance in the PMDs usage as part of dry run of rxqs
- * assignment to PMDs. */
-static bool
-get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
-                     uint32_t num_pmds, uint64_t *predicted_variance)
-    OVS_REQUIRES(dp->port_mutex)
-{
-    struct dp_netdev_port *port;
-    struct dp_netdev_pmd_thread *pmd;
-    struct dp_netdev_rxq **rxqs = NULL;
-    struct rr_numa *numa = NULL;
-    struct rr_numa_list rr;
-    int n_rxqs = 0;
-    bool ret = false;
-    uint64_t *pmd_usage;
-
-    if (!predicted_variance) {
-        return ret;
-    }
-
-    pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
-
-    HMAP_FOR_EACH (port, node, &dp->ports) {
-        if (!netdev_is_pmd(port->netdev)) {
-            continue;
-        }
-
-        for (int qid = 0; qid < port->n_rxq; qid++) {
-            struct dp_netdev_rxq *q = &port->rxqs[qid];
-            uint64_t cycle_hist = 0;
-
-            if (q->pmd->isolated) {
-                continue;
-            }
-
-            if (n_rxqs == 0) {
-                rxqs = xmalloc(sizeof *rxqs);
-            } else {
-                rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
-            }
-
-            /* Sum the queue intervals and store the cycle history. */
-            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
-                cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
-            }
-            dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
-                                         cycle_hist);
-            /* Store the queue. */
-            rxqs[n_rxqs++] = q;
-        }
-    }
-    if (n_rxqs > 1) {
-        /* Sort the queues in order of the processing cycles
-         * they consumed during their last pmd interval. */
-        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
-    }
-    rr_numa_list_populate(dp, &rr);
-
-    for (int i = 0; i < n_rxqs; i++) {
-        int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
-        numa = rr_numa_list_lookup(&rr, numa_id);
-        /* If there is no available pmd on the local numa but there is only one
-         * numa for cross-numa polling, we can estimate the dry run. */
-        if (!numa && rr_numa_list_count(&rr) == 1) {
-            numa = rr_numa_list_next(&rr, NULL);
-        }
-        if (!numa) {
-            VLOG_DBG("PMD auto lb dry run: "
-                     "There's no available (non-isolated) PMD thread on NUMA "
-                     "node %d for port '%s' and there are PMD threads on more "
-                     "than one NUMA node available for cross-NUMA polling. "
-                     "Aborting.", numa_id, netdev_rxq_get_name(rxqs[i]->rx));
-            goto cleanup;
-        }
-
-        pmd = rr_numa_get_pmd(numa, true);
-        VLOG_DBG("PMD auto lb dry run. Predicted: Core %d on numa node %d "
-                  "to be assigned port \'%s\' rx queue %d "
-                  "(measured processing cycles %"PRIu64").",
-                  pmd->core_id, numa_id,
-                  netdev_rxq_get_name(rxqs[i]->rx),
-                  netdev_rxq_get_queue_id(rxqs[i]->rx),
-                  dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
-
-        for (int id = 0; id < num_pmds; id++) {
-            if (pmd->core_id == core_list[id]) {
-                /* Add the processing cycles of rxq to pmd polling it. */
-                pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
-                                        RXQ_CYCLES_PROC_HIST);
-            }
-        }
-    }
-
-    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-        uint64_t total_cycles = 0;
-
-        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
-            continue;
-        }
-
-        /* Get the total pmd cycles for an interval. */
-        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
-        /* Estimate the cycles to cover all intervals. */
-        total_cycles *= PMD_RXQ_INTERVAL_MAX;
-        for (int id = 0; id < num_pmds; id++) {
-            if (pmd->core_id == core_list[id]) {
-                if (pmd_usage[id]) {
-                    pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
-                }
-                VLOG_DBG("PMD auto lb dry run. Predicted: Core %d, "
-                         "usage %"PRIu64"", pmd->core_id, pmd_usage[id]);
-            }
-        }
-    }
-    *predicted_variance = variance(pmd_usage, num_pmds);
-    ret = true;
-
-cleanup:
-    rr_numa_list_destroy(&rr);
-    free(rxqs);
-    free(pmd_usage);
-    return ret;
-}
-
-/* Does the dry run of Rxq assignment to PMDs and returns true if it gives
- * better distribution of load on PMDs. */
-static bool
-pmd_rebalance_dry_run(struct dp_netdev *dp)
-    OVS_REQUIRES(dp->port_mutex)
-{
-    struct dp_netdev_pmd_thread *pmd;
-    uint64_t *curr_pmd_usage;
-
-    uint64_t curr_variance;
-    uint64_t new_variance;
-    uint64_t improvement = 0;
-    uint32_t num_pmds;
-    uint32_t *pmd_corelist;
-    struct rxq_poll *poll;
-    bool ret;
-
-    num_pmds = cmap_count(&dp->poll_threads);
-
-    if (num_pmds > 1) {
-        curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
-        pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
-    } else {
-        return false;
-    }
-
-    num_pmds = 0;
-    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-        uint64_t total_cycles = 0;
-        uint64_t total_proc = 0;
-
-        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
-            continue;
-        }
-
-        /* Get the total pmd cycles for an interval. */
-        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
-        /* Estimate the cycles to cover all intervals. */
-        total_cycles *= PMD_RXQ_INTERVAL_MAX;
-
-        ovs_mutex_lock(&pmd->port_mutex);
-        HMAP_FOR_EACH (poll, node, &pmd->poll_list) {
-            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
-                total_proc += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
-            }
-        }
-        ovs_mutex_unlock(&pmd->port_mutex);
-
-        if (total_proc) {
-            curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
-        }
-
-        VLOG_DBG("PMD auto lb dry run. Current: Core %d, usage %"PRIu64"",
-                  pmd->core_id, curr_pmd_usage[num_pmds]);
-
-        if (atomic_count_get(&pmd->pmd_overloaded)) {
-            atomic_count_set(&pmd->pmd_overloaded, 0);
-        }
-
-        pmd_corelist[num_pmds] = pmd->core_id;
-        num_pmds++;
-    }
-
-    curr_variance = variance(curr_pmd_usage, num_pmds);
-    ret = get_dry_run_variance(dp, pmd_corelist, num_pmds, &new_variance);
-
-    if (ret) {
-        VLOG_DBG("PMD auto lb dry run. Current PMD variance: %"PRIu64","
-                  " Predicted PMD variance: %"PRIu64"",
-                  curr_variance, new_variance);
-
-        if (new_variance < curr_variance) {
-            improvement =
-                ((curr_variance - new_variance) * 100) / curr_variance;
-        }
-        if (improvement < dp->pmd_alb.rebalance_improve_thresh) {
-            ret = false;
-        }
-    }
-
-    free(curr_pmd_usage);
-    free(pmd_corelist);
-    return ret;
-}
-
-
 /* Return true if needs to revalidate datapath flows. */
 static bool
@@ -6183,6 +6000,6 @@  dpif_netdev_run(struct dpif *dpif)
                 !ports_require_restart(dp) &&
                 pmd_rebalance_dry_run(dp)) {
-                VLOG_INFO("PMD auto lb dry run."
-                          " requesting datapath reconfigure.");
+                VLOG_INFO("PMD auto load balance dry run. "
+                          "Requesting datapath reconfigure.");
                 dp_netdev_request_reconfigure(dp);
             }