
[ovs-dev,v2] Adding support for PMD auto load balancing

Message ID 1546547894-16821-1-git-send-email-nitin.katiyar@ericsson.com
State Changes Requested
Delegated to: Ian Stokes
Series [ovs-dev,v2] Adding support for PMD auto load balancing

Commit Message

Nitin Katiyar Jan. 3, 2019, 12:36 p.m. UTC
Port rx queues that have not been statically assigned to PMDs are currently
assigned based on periodically sampled load measurements.
The assignment is performed at specific instances: port addition, port
deletion, upon a reassignment request via the CLI, etc.

Changes in traffic patterns over time can cause uneven load among the
PMDs, resulting in lower overall throughput.

This patch adds support for automatic load balancing of PMDs based on the
measured load of their RX queues. Each PMD measures the processing load for
each of its associated queues every 10 seconds. If the aggregated PMD load
reaches 95% for 6 consecutive intervals, the PMD considers itself overloaded.

If any PMD is overloaded, a dry run of the PMD assignment algorithm is
performed by the OVS main thread. The dry run does NOT change the existing
queue-to-PMD assignments.

If the resulting mapping from the dry run indicates an improved load
distribution, the actual reassignment is performed.
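
For example, the dry run compares the variance of the per-PMD load before
and after the proposed reassignment. With the default acceptance threshold
of 25%, a current variance of 400 against a dry-run variance of 240 gives an
improvement of ((400 - 240) * 100) / 400 = 40%, so the reassignment is
applied; anything below 25% leaves the current assignment untouched. (The
variance values here are illustrative only.)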

Automatic rebalancing is disabled by default and has to be enabled via a
configuration option. The interval (in minutes) between two consecutive
rebalancing rounds can also be configured via the CLI; the default is
1 minute.

The following example commands can be used to set the auto-lb parameters:
ovs-vsctl set open_vswitch . other_config:pmd-auto-lb="true"
ovs-vsctl set open_vswitch . other_config:pmd-auto-lb-rebalance-intvl="5"

Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
Co-authored-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com>
Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
Signed-off-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com>
Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com>
---
 lib/dpif-netdev.c    | 403 +++++++++++++++++++++++++++++++++++++++++++++++++--
 vswitchd/vswitch.xml |  30 ++++
 2 files changed, 424 insertions(+), 9 deletions(-)

Comments

Aaron Conole Jan. 3, 2019, 3:08 p.m. UTC | #1
Nitin Katiyar <nitin.katiyar@ericsson.com> writes:

> Port rx queues that have not been statically assigned to PMDs are currently
> assigned based on periodically sampled load measurements.
> The assignment is performed at specific instances – port addition, port
> deletion, upon reassignment request via CLI etc.
>
> Due to change in traffic pattern over time it can cause uneven load among
> the PMDs and thus resulting in lower overall throughout.
>
> This patch enables the support of auto load balancing of PMDs based on
> measured load of RX queues. Each PMD measures the processing load for each
> of its associated queues every 10 seconds. If the aggregated PMD load reaches
> 95% for 6 consecutive intervals then PMD considers itself to be overloaded.
>
> If any PMD is overloaded, a dry-run of the PMD assignment algorithm is
> performed by OVS main thread. The dry-run does NOT change the existing
> queue to PMD assignments.
>
> If the resultant mapping of dry-run indicates an improved distribution
> of the load then the actual reassignment will be performed.
>
> The automatic rebalancing will be disabled by default and has to be
> enabled via configuration option. The interval (in minutes) between
> two consecutive rebalancing can also be configured via CLI, default
> is 1 min.
>
> Following example commands can be used to set the auto-lb params:
> ovs-vsctl set open_vswitch . other_config:pmd-auto-lb="true"
> ovs-vsctl set open_vswitch . other_config:pmd-auto-lb-rebalance-intvl="5"
>
> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
> Co-authored-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com>
> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
> Signed-off-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com>
> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com>
> ---
>  lib/dpif-netdev.c    | 403 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  vswitchd/vswitch.xml |  30 ++++
>  2 files changed, 424 insertions(+), 9 deletions(-)
>
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 1564db9..8db106f 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -80,6 +80,12 @@
>  
>  VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>  
> +/* Auto Load Balancing Defaults */
> +#define ACCEPT_IMPROVE_DEFAULT       (25)
> +#define PMD_LOAD_THRE_DEFAULT        (95)
> +#define PMD_REBALANCE_POLL_INTERVAL  1 /* 1 Min */
> +#define MIN_TO_MSEC                  60000
> +
>  #define FLOW_DUMP_MAX_BATCH 50
>  /* Use per thread recirc_depth to prevent recirculation loop. */
>  #define MAX_RECIRC_DEPTH 6
> @@ -288,6 +294,13 @@ struct dp_meter {
>      struct dp_meter_band bands[];
>  };
>  
> +struct pmd_auto_lb {
> +    bool auto_lb_conf;        /* enable-disable auto load balancing */
> +    bool is_enabled;          /* auto_lb current status */
> +    uint64_t rebalance_intvl;
> +    uint64_t rebalance_poll_timer;
> +};
> +
>  /* Datapath based on the network device interface from netdev.h.
>   *
>   *
> @@ -368,6 +381,7 @@ struct dp_netdev {
>      uint64_t last_tnl_conf_seq;
>  
>      struct conntrack conntrack;
> +    struct pmd_auto_lb pmd_alb;
>  };
>  
>  static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
> @@ -682,6 +696,7 @@ struct dp_netdev_pmd_thread {
>      struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>      /* List of rx queues to poll. */
>      struct hmap poll_list OVS_GUARDED;
> +
>      /* Map of 'tx_port's used for transmission.  Written by the main thread,
>       * read by the pmd thread. */
>      struct hmap tx_ports OVS_GUARDED;
> @@ -702,6 +717,11 @@ struct dp_netdev_pmd_thread {
>      /* Keep track of detailed PMD performance statistics. */
>      struct pmd_perf_stats perf_stats;
>  
> +    /* Some stats from previous iteration used by automatic pmd
> +       load balance logic. */
> +    uint64_t prev_stats[PMD_N_STATS];
> +    atomic_count pmd_overloaded;
> +
>      /* Set to true if the pmd thread needs to be reloaded. */
>      bool need_reload;
>  };
> @@ -792,9 +812,11 @@ dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
>                           enum rxq_cycles_counter_type type);
>  static void
>  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
> -                           unsigned long long cycles);
> +                                unsigned long long cycles,
> +                                unsigned idx);
>  static uint64_t
> -dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
> +dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx,
> +                                unsigned idx);
>  static void
>  dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
>                                 bool purge);
> @@ -3734,6 +3756,51 @@ dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
>      }
>  }
>  
> +/* Enable/Disable PMD auto load balancing */
> +static void
> +set_pmd_auto_lb(struct dp_netdev *dp)
> +{
> +    unsigned int cnt = 0;
> +    struct dp_netdev_pmd_thread *pmd;
> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> +
> +    bool enable = false;
> +    bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
> +
> +    /* Ensure that there is at least 2 non-isolated PMDs and
> +     * one of the PMD is polling more than one rxq
> +     */
> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> +        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
> +            continue;
> +        }
> +
> +        cnt++;
> +        if (hmap_count(&pmd->poll_list) > 1) {
> +            if (enable && (cnt > 1)) {
> +                break;
> +            } else {
> +                enable = true;
> +            }
> +        }
> +    }
> +
> +    /* Enable auto LB if it is configured and cycle based assignment is true */
> +    enable = enable && pmd_rxq_assign_cyc && pmd_alb->auto_lb_conf;
> +
> +    if (pmd_alb->is_enabled != enable) {
> +        pmd_alb->is_enabled = enable;
> +        if (pmd_alb->is_enabled) {
> +            VLOG_INFO("PMD auto lb is enabled, rebalance intvl:%lu(msec)\n",
> +                       pmd_alb->rebalance_intvl);

For this, please change to something like:

+            VLOG_INFO("PMD auto lb is enabled, rebalance intvl:%"
+                      PRIu64 "(msec)\n",

Otherwise, you'll trigger an error as:

  error: format ‘%lu’ expects argument of type ‘long unsigned int’, but
  argument 4 has type ‘uint64_t’ [-Werror=format=]

Detected at:

https://travis-ci.org/ovsrobot/ovs/jobs/474841007

> +        } else {
> +            pmd_alb->rebalance_poll_timer = 0;
> +            VLOG_INFO("PMD auto lb is disabled\n");
> +        }
> +    }
> +
> +}
> +
>  /* Applies datapath configuration from the database. Some of the changes are
>   * actually applied in dpif_netdev_run(). */
>  static int
> @@ -3748,6 +3815,7 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
>                          DEFAULT_EM_FLOW_INSERT_INV_PROB);
>      uint32_t insert_min, cur_min;
>      uint32_t tx_flush_interval, cur_tx_flush_interval;
> +    uint64_t rebalance_intvl;
>  
>      tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
>                                       DEFAULT_TX_FLUSH_INTERVAL);
> @@ -3819,6 +3887,23 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
>                    pmd_rxq_assign);
>          dp_netdev_request_reconfigure(dp);
>      }
> +
> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> +    pmd_alb->auto_lb_conf = smap_get_bool(other_config, "pmd-auto-lb",
> +                              false);
> +
> +    rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebalance-intvl",
> +                              PMD_REBALANCE_POLL_INTERVAL);
> +
> +    /* Input is in min, convert it to msec */
> +    rebalance_intvl =
> +        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
> +
> +    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
> +        pmd_alb->rebalance_intvl = rebalance_intvl;
> +    }
> +
> +    set_pmd_auto_lb(dp);
>      return 0;
>  }
>  
> @@ -3974,9 +4059,9 @@ dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
>  
>  static void
>  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
> -                                unsigned long long cycles)
> +                                unsigned long long cycles,
> +                                unsigned idx)
>  {
> -    unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
>      atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
>  }
>  
> @@ -4762,6 +4847,9 @@ reconfigure_datapath(struct dp_netdev *dp)
>  
>      /* Reload affected pmd threads. */
>      reload_affected_pmds(dp);
> +
> +    /* Check if PMD Auto LB is to be enabled */
> +    set_pmd_auto_lb(dp);
>  }
>  
>  /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
> @@ -4780,6 +4868,228 @@ ports_require_restart(const struct dp_netdev *dp)
>      return false;
>  }
>  
> +/* Function for calculating variance */
> +static uint64_t
> +variance(uint64_t a[], int n)
> +{
> +    /* Compute mean (average of elements) */
> +    uint64_t sum = 0;
> +    uint64_t mean = 0;
> +    uint64_t sqDiff = 0;
> +
> +    if (!n) {
> +        return 0;
> +    }
> +
> +    for (int i = 0; i < n; i++) {
> +        sum += a[i];
> +    }
> +
> +    if (sum) {
> +        mean = sum / n;
> +
> +        /* Compute sum squared differences with mean. */
> +        for (int i = 0; i < n; i++) {
> +            sqDiff += (a[i] - mean)*(a[i] - mean);
> +        }
> +    }
> +    return (sqDiff ? (sqDiff / n) : 0);
> +}
> +
> +static uint64_t
> +get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list, uint32_t num)
> +{
> +    struct dp_netdev_port *port;
> +    struct dp_netdev_pmd_thread *pmd;
> +    struct dp_netdev_rxq ** rxqs = NULL;
> +    struct rr_numa *numa = NULL;
> +    struct rr_numa_list rr;
> +    int n_rxqs = 0;
> +    uint64_t ret = 0;
> +    uint64_t *pmd_usage;
> +
> +    pmd_usage = xcalloc(num, sizeof(uint64_t));
> +
> +    HMAP_FOR_EACH (port, node, &dp->ports) {
> +        if (!netdev_is_pmd(port->netdev)) {
> +            continue;
> +        }
> +
> +        for (int qid = 0; qid < port->n_rxq; qid++) {
> +            struct dp_netdev_rxq *q = &port->rxqs[qid];
> +            uint64_t cycle_hist = 0;
> +
> +            if (q->pmd->isolated) {
> +                continue;
> +            }
> +
> +            if (n_rxqs == 0) {
> +                rxqs = xmalloc(sizeof *rxqs);
> +            } else {
> +                rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
> +            }
> +
> +            /* Sum the queue intervals and store the cycle history. */
> +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> +                cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
> +            }
> +            /* Do we need to add intrvl_cycles in history??
> +             * but then we should clear interval cycles also */
> +            dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
> +                                         cycle_hist);
> +            /* Store the queue. */
> +            rxqs[n_rxqs++] = q;
> +        }
> +    }
> +    if (n_rxqs > 1) {
> +        /* Sort the queues in order of the processing cycles
> +         * they consumed during their last pmd interval. */
> +        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
> +    }
> +    rr_numa_list_populate(dp, &rr);
> +
> +    for (int i = 0; i < n_rxqs; i++) {
> +        int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
> +        numa = rr_numa_list_lookup(&rr, numa_id);
> +        if (!numa) {
> +            /* Don't consider queues across NUMA  ???*/
> +            continue;
> +        }
> +
> +        pmd = rr_numa_get_pmd(numa, true);
> +        VLOG_DBG("PMD AUTO_LB:Core %d on numa node %d assigned port \'%s\' "
> +                  "rx queue %d "
> +                  "(measured processing cycles %"PRIu64").",
> +                  pmd->core_id, numa_id,
> +                  netdev_rxq_get_name(rxqs[i]->rx),
> +                  netdev_rxq_get_queue_id(rxqs[i]->rx),
> +                  dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
> +
> +        for (int id = 0; id < num; id++) {
> +            if (pmd->core_id == core_list[id]) {
> +                /* Add the processing cycles of rxq to pmd polling it */
> +                uint64_t proc_cycles = 0;
> +                for (unsigned idx = 0; idx < PMD_RXQ_INTERVAL_MAX; idx++) {
> +                    proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxqs[i],
> +                                                                   idx);
> +                }
> +                pmd_usage[id] += proc_cycles;
> +            }
> +        }
> +    }
> +
> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> +        uint64_t total_cycles = 0;
> +
> +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
> +            continue;
> +        }
> +
> +        /* Get the total pmd cycles for an interval. */
> +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
> +        /* Estimate the cycles to cover all intervals. */
> +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
> +        for (int id = 0; id < num; id++) {
> +            if (pmd->core_id == core_list[id]) {
> +                if (pmd_usage[id]) {
> +                    pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
> +                }
> +                VLOG_DBG("Core_id:%d, usage:%"PRIu64"\n",
> +                          pmd->core_id, pmd_usage[id]);
> +            }
> +        }
> +    }
> +    ret = variance(pmd_usage, num);
> +
> +    rr_numa_list_destroy(&rr);
> +    free(rxqs);
> +    free(pmd_usage);
> +    return ret;
> +}
> +
> +static bool
> +pmd_rebalance_dry_run(struct dp_netdev *dp)
> +{
> +    struct dp_netdev_pmd_thread *pmd;
> +    uint64_t *curr_pmd_usage;
> +
> +    uint64_t curr_variance;
> +    uint64_t new_variance;
> +    uint64_t improvement = 0;
> +    uint32_t num_pmds;
> +    uint32_t *pmd_corelist;
> +    struct rxq_poll *poll, *poll_next;
> +
> +    num_pmds = cmap_count(&dp->poll_threads);
> +
> +    if (num_pmds > 1) {
> +        curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
> +        pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
> +    } else {
> +        return false;
> +    }
> +
> +    num_pmds = 0;
> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> +        uint64_t total_cycles = 0;
> +        uint64_t total_proc = 0;
> +
> +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
> +            continue;
> +        }
> +
> +        /* Get the total pmd cycles for an interval. */
> +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
> +        /* Estimate the cycles to cover all intervals. */
> +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
> +
> +        HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
> +            uint64_t proc_cycles = 0;
> +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> +                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
> +            }
> +            total_proc += proc_cycles;
> +        }
> +        if (total_proc) {
> +            curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
> +        }
> +
> +        VLOG_DBG("PMD_AUTO_LB_MON curr_pmd_usage(%d):%"PRIu64"",
> +                  pmd->core_id, curr_pmd_usage[num_pmds]);
> +
> +        if (atomic_count_get(&pmd->pmd_overloaded)) {
> +            atomic_count_set(&pmd->pmd_overloaded, 0);
> +        }
> +
> +        pmd_corelist[num_pmds] = pmd->core_id;
> +        num_pmds++;
> +    }
> +
> +    curr_variance = variance(curr_pmd_usage, num_pmds);
> +
> +    new_variance = get_dry_run_variance(dp, pmd_corelist, num_pmds);
> +    VLOG_DBG("PMD_AUTO_LB_MON new variance: %"PRIu64","
> +              " curr_variance: %"PRIu64"",
> +              new_variance, curr_variance);
> +
> +    if (new_variance && (new_variance < curr_variance)) {
> +        improvement =
> +            ((curr_variance - new_variance) * 100) / curr_variance;
> +
> +        VLOG_DBG("PMD_AUTO_LB_MON improvement %"PRIu64"", improvement);
> +    }
> +
> +    free(curr_pmd_usage);
> +    free(pmd_corelist);
> +
> +    if (improvement >= ACCEPT_IMPROVE_DEFAULT) {
> +        return true;
> +    }
> +
> +    return false;
> +}
> +
> +
>  /* Return true if needs to revalidate datapath flows. */
>  static bool
>  dpif_netdev_run(struct dpif *dpif)
> @@ -4789,6 +5099,9 @@ dpif_netdev_run(struct dpif *dpif)
>      struct dp_netdev_pmd_thread *non_pmd;
>      uint64_t new_tnl_seq;
>      bool need_to_flush = true;
> +    bool pmd_rebalance = false;
> +    long long int now = time_msec();
> +    struct dp_netdev_pmd_thread *pmd;
>  
>      ovs_mutex_lock(&dp->port_mutex);
>      non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
> @@ -4821,6 +5134,38 @@ dpif_netdev_run(struct dpif *dpif)
>          dp_netdev_pmd_unref(non_pmd);
>      }
>  
> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> +    if (pmd_alb->is_enabled) {
> +        if (!pmd_alb->rebalance_poll_timer) {
> +            pmd_alb->rebalance_poll_timer = now;
> +        } else if ((pmd_alb->rebalance_poll_timer +
> +                   pmd_alb->rebalance_intvl) < now) {
> +            pmd_alb->rebalance_poll_timer = now;
> +            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> +                if (atomic_count_get(&pmd->pmd_overloaded) >=
> +                                    PMD_RXQ_INTERVAL_MAX) {
> +                    pmd_rebalance = true;
> +                    break;
> +                }
> +            }
> +            VLOG_DBG("PMD_AUTO_LB_MON periodic check:pmd rebalance:%d",
> +                      pmd_rebalance);
> +
> +            if (pmd_rebalance &&
> +                !dp_netdev_is_reconf_required(dp) &&
> +                !ports_require_restart(dp) &&
> +                pmd_rebalance_dry_run(dp)) {
> +
> +                ovs_mutex_unlock(&dp->port_mutex);
> +                ovs_mutex_lock(&dp_netdev_mutex);
> +                VLOG_DBG("PMD_AUTO_LB_MON Invoking PMD RECONFIGURE");
> +                dp_netdev_request_reconfigure(dp);
> +                ovs_mutex_unlock(&dp_netdev_mutex);
> +                ovs_mutex_lock(&dp->port_mutex);
> +            }
> +        }
> +    }
> +
>      if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
>          reconfigure_datapath(dp);
>      }
> @@ -4979,6 +5324,8 @@ pmd_thread_main(void *f_)
>  reload:
>      pmd_alloc_static_tx_qid(pmd);
>  
> +    atomic_count_init(&pmd->pmd_overloaded, 0);
> +
>      /* List port/core affinity */
>      for (i = 0; i < poll_cnt; i++) {
>         VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
> @@ -4986,6 +5333,10 @@ reload:
>                  netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
>         /* Reset the rxq current cycles counter. */
>         dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
> +
> +       for (unsigned j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
> +            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0, j);
> +       }
>      }
>  
>      if (!poll_cnt) {
> @@ -7188,17 +7539,51 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
>                             struct polled_queue *poll_list, int poll_cnt)
>  {
>      struct dpcls *cls;
> +    uint64_t tot_idle = 0, tot_proc = 0;
> +    unsigned int idx;
> +    unsigned int pmd_load = 0;
>  
>      if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
>          uint64_t curr_tsc;
> +        struct pmd_auto_lb * pmd_alb = &pmd->dp->pmd_alb;
> +        if (pmd_alb->is_enabled && !pmd->isolated) {
> +            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
> +                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
> +            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
> +                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
> +
> +            if (tot_proc) {
> +                pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
> +            }
> +
> +            if (pmd_load >= PMD_LOAD_THRE_DEFAULT) {
> +                atomic_count_inc(&pmd->pmd_overloaded);
> +
> +                VLOG_DBG("PMD_AUTO_LB_MON PMD OVERLOAD DETECT iter %d",
> +                          atomic_count_get(&pmd->pmd_overloaded));
> +            } else {
> +                atomic_count_set(&pmd->pmd_overloaded, 0);
> +            }
> +        }
> +
> +        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
> +                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
> +        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
> +                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
> +
>          /* Get the cycles that were used to process each queue and store. */
>          for (unsigned i = 0; i < poll_cnt; i++) {
> -            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
> -                                                        RXQ_CYCLES_PROC_CURR);
> -            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
> -            dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
> -                                     0);
> +            uint64_t rxq_cyc_curr;
> +            struct dp_netdev_rxq *rxq;
> +
> +            rxq = poll_list[i].rxq;
> +            idx = rxq->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
> +
> +            rxq_cyc_curr = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_CURR);
> +            dp_netdev_rxq_set_intrvl_cycles(rxq, rxq_cyc_curr, idx);
> +            dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_CURR, 0);
>          }
> +
>          curr_tsc = cycles_counter_update(&pmd->perf_stats);
>          if (pmd->intrvl_tsc_prev) {
>              /* There is a prev timestamp, store a new intrvl cycle count. */
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index 2160910..ff3589c 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -574,6 +574,36 @@
>              be set to 'skip_sw'.
>          </p>
>        </column>
> +      <column name="other_config" key="pmd-auto-lb"
> +              type='{"type": "boolean"}'>
> +        <p>
> +         Configures PMD Auto Load Balancing that allows automatic assignment of
> +         RX queues to PMDs if any of PMDs is overloaded (i.e. processing cycles
> +         > 95%).
> +        </p>
> +        <p>
> +         It uses current scheme of cycle based assignment of RX queues that
> +         are not statically pinned to PMDs.
> +        </p>
> +        <p>
> +          Set this value to <code>true</code> to enable this option.
> +        </p>
> +        <p>
> +          The default value is <code>false</code>.
> +        </p>
> +        <p>
> +         This only comes in effect if cycle based assignment is enabled and
> +         there are more than one non-isolated PMDs present and atleast one of
> +         it polls more than one queue.
> +        </p>
> +      </column>
> +      <column name="other_config" key="pmd-auto-lb-rebalance-intvl"
> +              type='{"type": "integer", "minInteger": 1}'>
> +        <p>
> +         The minimum time (in minutes) 2 consecutive PMD Auto Load Balancing
> +         iterations.
> +        </p>
> +      </column>
>      </group>
>      <group title="Status">
>        <column name="next_cfg">
Kevin Traynor Jan. 3, 2019, 8:18 p.m. UTC | #2
On 01/03/2019 12:36 PM, Nitin Katiyar wrote:
> Port rx queues that have not been statically assigned to PMDs are currently
> assigned based on periodically sampled load measurements.
> The assignment is performed at specific instances – port addition, port
> deletion, upon reassignment request via CLI etc.
> 
> Due to change in traffic pattern over time it can cause uneven load among
> the PMDs and thus resulting in lower overall throughout.
> 
> This patch enables the support of auto load balancing of PMDs based on
> measured load of RX queues. Each PMD measures the processing load for each
> of its associated queues every 10 seconds. If the aggregated PMD load reaches
> 95% for 6 consecutive intervals then PMD considers itself to be overloaded.
> 
> If any PMD is overloaded, a dry-run of the PMD assignment algorithm is
> performed by OVS main thread. The dry-run does NOT change the existing
> queue to PMD assignments.
> 
> If the resultant mapping of dry-run indicates an improved distribution
> of the load then the actual reassignment will be performed.
> 
> The automatic rebalancing will be disabled by default and has to be
> enabled via configuration option. The interval (in minutes) between
> two consecutive rebalancing can also be configured via CLI, default
> is 1 min.
> 
> Following example commands can be used to set the auto-lb params:
> ovs-vsctl set open_vswitch . other_config:pmd-auto-lb="true"
> ovs-vsctl set open_vswitch . other_config:pmd-auto-lb-rebalance-intvl="5"
> 

Hi Nitin, thanks for v2. Not full review yet but sending some comments
below.

Maybe you can put some of the above into a new section below this
http://docs.openvswitch.org/en/latest/topics/dpdk/pmd/#port-rx-queue-assigment-to-pmd-threads

I also think this feature should be experimental until it has been road
tested a bit more.

> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
> Co-authored-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com>
> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
> Signed-off-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com>
> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com>
> ---
>  lib/dpif-netdev.c    | 403 +++++++++++++++++++++++++++++++++++++++++++++++++--
>  vswitchd/vswitch.xml |  30 ++++
>  2 files changed, 424 insertions(+), 9 deletions(-)
> 

There seems to be windows style line endings in the patch? or something
else has gone wrong for me.

> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 1564db9..8db106f 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -80,6 +80,12 @@
>  
>  VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>  
> +/* Auto Load Balancing Defaults */
> +#define ACCEPT_IMPROVE_DEFAULT       (25)
> +#define PMD_LOAD_THRE_DEFAULT        (95)

Probably you should remove the brackets above to be consistent with the
others below and in the rest of the file.

> +#define PMD_REBALANCE_POLL_INTERVAL  1 /* 1 Min */
> +#define MIN_TO_MSEC                  60000
> +
>  #define FLOW_DUMP_MAX_BATCH 50
>  /* Use per thread recirc_depth to prevent recirculation loop. */
>  #define MAX_RECIRC_DEPTH 6
> @@ -288,6 +294,13 @@ struct dp_meter {
>      struct dp_meter_band bands[];
>  };
>  
> +struct pmd_auto_lb {
> +    bool auto_lb_conf;        /* enable-disable auto load balancing */

I'm not sure what '_conf' is short for? maybe it should be something
like 'auto_lb_requested'

> +    bool is_enabled;          /* auto_lb current status */

Comments should be of style /* Sentence case. */
http://docs.openvswitch.org/en/latest/internals/contributing/coding-style/#comments


> +    uint64_t rebalance_intvl;
> +    uint64_t rebalance_poll_timer;
> +};
> +
>  /* Datapath based on the network device interface from netdev.h.
>   *
>   *
> @@ -368,6 +381,7 @@ struct dp_netdev {
>      uint64_t last_tnl_conf_seq;
>  
>      struct conntrack conntrack;
> +    struct pmd_auto_lb pmd_alb;
>  };
>  
>  static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
> @@ -682,6 +696,7 @@ struct dp_netdev_pmd_thread {
>      struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>      /* List of rx queues to poll. */
>      struct hmap poll_list OVS_GUARDED;
> +

Unrelated newline should be removed

>      /* Map of 'tx_port's used for transmission.  Written by the main thread,
>       * read by the pmd thread. */
>      struct hmap tx_ports OVS_GUARDED;
> @@ -702,6 +717,11 @@ struct dp_netdev_pmd_thread {
>      /* Keep track of detailed PMD performance statistics. */
>      struct pmd_perf_stats perf_stats;
>  
> +    /* Some stats from previous iteration used by automatic pmd
> +       load balance logic. */

Nit, but see coding stds. and other multi-line comments wrt style

> +    uint64_t prev_stats[PMD_N_STATS];
> +    atomic_count pmd_overloaded;
> +
>      /* Set to true if the pmd thread needs to be reloaded. */
>      bool need_reload;
>  };
> @@ -792,9 +812,11 @@ dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
>                           enum rxq_cycles_counter_type type);
>  static void
>  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
> -                           unsigned long long cycles);
> +                                unsigned long long cycles,
> +                                unsigned idx);
>  static uint64_t
> -dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
> +dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx,
> +                                unsigned idx);

no need to change dp_netdev_rxq_get_intrvl_cycles()

>  static void
>  dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
>                                 bool purge);
> @@ -3734,6 +3756,51 @@ dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
>      }
>  }
>  
> +/* Enable/Disable PMD auto load balancing */
> +static void
> +set_pmd_auto_lb(struct dp_netdev *dp)
> +{
> +    unsigned int cnt = 0;
> +    struct dp_netdev_pmd_thread *pmd;
> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> +
> +    bool enable = false;
> +    bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
> +
> +    /* Ensure that there is at least 2 non-isolated PMDs and
> +     * one of the PMD is polling more than one rxq
> +     */
> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> +        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
> +            continue;
> +        }
> +
> +        cnt++;
> +        if (hmap_count(&pmd->poll_list) > 1) {
> +            if (enable && (cnt > 1)) {
> +                break;
> +            } else {
> +                enable = true;
> +            }
> +        }
> +    }
> +

Won't this give the wrong result if there is one pmd with multiple
rxq's? How about something in the loop like,

        if (hmap_count(&pmd->poll_list) > 1) {
            multirxq = true;
        }
        if (cnt && multirxq) {
            enable = true;
            break;
        }
        cnt++;

> +    /* Enable auto LB if it is configured and cycle based assignment is true */
> +    enable = enable && pmd_rxq_assign_cyc && pmd_alb->auto_lb_conf;
> +
> +    if (pmd_alb->is_enabled != enable) {
> +        pmd_alb->is_enabled = enable;
> +        if (pmd_alb->is_enabled) {
> +            VLOG_INFO("PMD auto lb is enabled, rebalance intvl:%lu(msec)\n",
> +                       pmd_alb->rebalance_intvl);
> +        } else {
> +            pmd_alb->rebalance_poll_timer = 0;
> +            VLOG_INFO("PMD auto lb is disabled\n");
> +        }
> +    }
> +
> +}
> +
>  /* Applies datapath configuration from the database. Some of the changes are
>   * actually applied in dpif_netdev_run(). */
>  static int
> @@ -3748,6 +3815,7 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
>                          DEFAULT_EM_FLOW_INSERT_INV_PROB);
>      uint32_t insert_min, cur_min;
>      uint32_t tx_flush_interval, cur_tx_flush_interval;
> +    uint64_t rebalance_intvl;
>  
>      tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
>                                       DEFAULT_TX_FLUSH_INTERVAL);
> @@ -3819,6 +3887,23 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
>                    pmd_rxq_assign);
>          dp_netdev_request_reconfigure(dp);
>      }
> +
> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> +    pmd_alb->auto_lb_conf = smap_get_bool(other_config, "pmd-auto-lb",
> +                              false);
> +
> +    rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebalance-intvl",
> +                              PMD_REBALANCE_POLL_INTERVAL);
> +
> +    /* Input is in min, convert it to msec */
> +    rebalance_intvl =
> +        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
> +

This is creating a default when the user sets 0 - that needs to be
documented.

With current values, this could overflow rebalance_intvl. The user value
is in minutes, so suggest to limit user input to some reasonable value
like 1 week in minutes, and then the min to msec can be safe. See
tx-flush-interval as an example where the range is limited.
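
For example, a rough sketch of the kind of range limit I mean (the one-week
cap below is only illustrative, not a suggested final value):

    rebalance_intvl = smap_get_int(other_config,
                                   "pmd-auto-lb-rebalance-intvl",
                                   PMD_REBALANCE_POLL_INTERVAL);
    /* Cap the user value (minutes) so the conversion to msec below
     * cannot overflow; one week in minutes is used as an example. */
    if (rebalance_intvl > 60 * 24 * 7) {
        rebalance_intvl = 60 * 24 * 7;
    }
    rebalance_intvl = rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC
                                      : MIN_TO_MSEC;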

> +    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
> +        pmd_alb->rebalance_intvl = rebalance_intvl;
> +    }
> +
> +    set_pmd_auto_lb(dp);
>      return 0;
>  }
>  
> @@ -3974,9 +4059,9 @@ dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
>  
>  static void
>  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
> -                                unsigned long long cycles)
> +                                unsigned long long cycles,
> +                                unsigned idx)
>  {
> -    unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
>      atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
>  }
>  
> @@ -4762,6 +4847,9 @@ reconfigure_datapath(struct dp_netdev *dp)
>  
>      /* Reload affected pmd threads. */
>      reload_affected_pmds(dp);
> +
> +    /* Check if PMD Auto LB is to be enabled */
> +    set_pmd_auto_lb(dp);
>  }
>  
>  /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
> @@ -4780,6 +4868,228 @@ ports_require_restart(const struct dp_netdev *dp)
>      return false;
>  }
>  
> +/* Function for calculating variance */
> +static uint64_t
> +variance(uint64_t a[], int n)
> +{
> +    /* Compute mean (average of elements) */
> +    uint64_t sum = 0;
> +    uint64_t mean = 0;
> +    uint64_t sqDiff = 0;
> +
> +    if (!n) {
> +        return 0;
> +    }
> +
> +    for (int i = 0; i < n; i++) {
> +        sum += a[i];
> +    }
> +
> +    if (sum) {
> +        mean = sum / n;
> +
> +        /* Compute sum squared differences with mean. */
> +        for (int i = 0; i < n; i++) {
> +            sqDiff += (a[i] - mean)*(a[i] - mean);
> +        }
> +    }
> +    return (sqDiff ? (sqDiff / n) : 0);
> +}
> +
> +static uint64_t
> +get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list, uint32_t num)

I think this function would require the port_mutex.

> +{
> +    struct dp_netdev_port *port;
> +    struct dp_netdev_pmd_thread *pmd;
> +    struct dp_netdev_rxq ** rxqs = NULL;
> +    struct rr_numa *numa = NULL;
> +    struct rr_numa_list rr;
> +    int n_rxqs = 0;
> +    uint64_t ret = 0;
> +    uint64_t *pmd_usage;
> +
> +    pmd_usage = xcalloc(num, sizeof(uint64_t));
> +
> +    HMAP_FOR_EACH (port, node, &dp->ports) {
> +        if (!netdev_is_pmd(port->netdev)) {
> +            continue;
> +        }
> +
> +        for (int qid = 0; qid < port->n_rxq; qid++) {
> +            struct dp_netdev_rxq *q = &port->rxqs[qid];
> +            uint64_t cycle_hist = 0;
> +
> +            if (q->pmd->isolated) {
> +                continue;
> +            }
> +
> +            if (n_rxqs == 0) {
> +                rxqs = xmalloc(sizeof *rxqs);
> +            } else {
> +                rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
> +            }
> +
> +            /* Sum the queue intervals and store the cycle history. */
> +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> +                cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
> +            }
> +            /* Do we need to add intrvl_cycles in history??

If you want to use compare_rxq_cycles() then you have to put them in
history, but it's only used for that and re-written every time, so I
don't think it is harmful.

> +             * but then we should clear interval cycles also */

I don't think you should be clearing interval cycles in a dry run,
otherwise they will be reset if the real rebalance occurs.

> +            dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
> +                                         cycle_hist);
> +            /* Store the queue. */
> +            rxqs[n_rxqs++] = q;
> +        }
> +    }
> +    if (n_rxqs > 1) {
> +        /* Sort the queues in order of the processing cycles
> +         * they consumed during their last pmd interval. */
> +        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
> +    }
> +    rr_numa_list_populate(dp, &rr);
> +
> +    for (int i = 0; i < n_rxqs; i++) {
> +        int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
> +        numa = rr_numa_list_lookup(&rr, numa_id);
> +        if (!numa) {
> +            /* Don't consider queues across NUMA  ???*/

I think you should abort the whole dry run process if this is happening

> +            continue;
> +        }
> +
> +        pmd = rr_numa_get_pmd(numa, true);
> +        VLOG_DBG("PMD AUTO_LB:Core %d on numa node %d assigned port \'%s\' "
> +                  "rx queue %d "
> +                  "(measured processing cycles %"PRIu64").",
> +                  pmd->core_id, numa_id,
> +                  netdev_rxq_get_name(rxqs[i]->rx),
> +                  netdev_rxq_get_queue_id(rxqs[i]->rx),
> +                  dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
> +
> +        for (int id = 0; id < num; id++) {
> +            if (pmd->core_id == core_list[id]) {
> +                /* Add the processing cycles of rxq to pmd polling it */
> +                uint64_t proc_cycles = 0;
> +                for (unsigned idx = 0; idx < PMD_RXQ_INTERVAL_MAX; idx++) {
> +                    proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxqs[i],
> +                                                                   idx);
> +                }
> +                pmd_usage[id] += proc_cycles;
> +            }
> +        }
> +    }
> +
> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> +        uint64_t total_cycles = 0;
> +
> +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
> +            continue;
> +        }
> +
> +        /* Get the total pmd cycles for an interval. */
> +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
> +        /* Estimate the cycles to cover all intervals. */
> +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
> +        for (int id = 0; id < num; id++) {
> +            if (pmd->core_id == core_list[id]) {
> +                if (pmd_usage[id]) {
> +                    pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
> +                }
> +                VLOG_DBG("Core_id:%d, usage:%"PRIu64"\n",
> +                          pmd->core_id, pmd_usage[id]);
> +            }
> +        }
> +    }
> +    ret = variance(pmd_usage, num);
> +
> +    rr_numa_list_destroy(&rr);
> +    free(rxqs);
> +    free(pmd_usage);
> +    return ret;
> +}
> +
> +static bool
> +pmd_rebalance_dry_run(struct dp_netdev *dp)
> +{
> +    struct dp_netdev_pmd_thread *pmd;
> +    uint64_t *curr_pmd_usage;
> +
> +    uint64_t curr_variance;
> +    uint64_t new_variance;
> +    uint64_t improvement = 0;
> +    uint32_t num_pmds;
> +    uint32_t *pmd_corelist;
> +    struct rxq_poll *poll, *poll_next;
> +
> +    num_pmds = cmap_count(&dp->poll_threads);
> +
> +    if (num_pmds > 1) {
> +        curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
> +        pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
> +    } else {
> +        return false;
> +    }
> +
> +    num_pmds = 0;
> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> +        uint64_t total_cycles = 0;
> +        uint64_t total_proc = 0;
> +
> +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
> +            continue;
> +        }
> +
> +        /* Get the total pmd cycles for an interval. */
> +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
> +        /* Estimate the cycles to cover all intervals. */
> +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
> +
> +        HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
> +            uint64_t proc_cycles = 0;
> +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> +                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
> +            }
> +            total_proc += proc_cycles;
> +        }
> +        if (total_proc) {
> +            curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
> +        }
> +
> +        VLOG_DBG("PMD_AUTO_LB_MON curr_pmd_usage(%d):%"PRIu64"",
> +                  pmd->core_id, curr_pmd_usage[num_pmds]);
> +
> +        if (atomic_count_get(&pmd->pmd_overloaded)) {
> +            atomic_count_set(&pmd->pmd_overloaded, 0);
> +        }
> +
> +        pmd_corelist[num_pmds] = pmd->core_id;
> +        num_pmds++;
> +    }
> +
> +    curr_variance = variance(curr_pmd_usage, num_pmds);
> +
> +    new_variance = get_dry_run_variance(dp, pmd_corelist, num_pmds);
> +    VLOG_DBG("PMD_AUTO_LB_MON new variance: %"PRIu64","
> +              " curr_variance: %"PRIu64"",
> +              new_variance, curr_variance);
> +
> +    if (new_variance && (new_variance < curr_variance)) {
> +        improvement =
> +            ((curr_variance - new_variance) * 100) / curr_variance;
> +
> +        VLOG_DBG("PMD_AUTO_LB_MON improvement %"PRIu64"", improvement);
> +    }
> +
> +    free(curr_pmd_usage);
> +    free(pmd_corelist);
> +
> +    if (improvement >= ACCEPT_IMPROVE_DEFAULT) {
> +        return true;
> +    }
> +
> +    return false;
> +}
> +
> +
>  /* Return true if needs to revalidate datapath flows. */
>  static bool
>  dpif_netdev_run(struct dpif *dpif)
> @@ -4789,6 +5099,9 @@ dpif_netdev_run(struct dpif *dpif)
>      struct dp_netdev_pmd_thread *non_pmd;
>      uint64_t new_tnl_seq;
>      bool need_to_flush = true;
> +    bool pmd_rebalance = false;
> +    long long int now = time_msec();
> +    struct dp_netdev_pmd_thread *pmd;
>  
>      ovs_mutex_lock(&dp->port_mutex);
>      non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
> @@ -4821,6 +5134,38 @@ dpif_netdev_run(struct dpif *dpif)
>          dp_netdev_pmd_unref(non_pmd);
>      }
>  
> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> +    if (pmd_alb->is_enabled) {
> +        if (!pmd_alb->rebalance_poll_timer) {
> +            pmd_alb->rebalance_poll_timer = now;
> +        } else if ((pmd_alb->rebalance_poll_timer +
> +                   pmd_alb->rebalance_intvl) < now) {
> +            pmd_alb->rebalance_poll_timer = now;
> +            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> +                if (atomic_count_get(&pmd->pmd_overloaded) >=
> +                                    PMD_RXQ_INTERVAL_MAX) {
> +                    pmd_rebalance = true;
> +                    break;
> +                }
> +            }
> +            VLOG_DBG("PMD_AUTO_LB_MON periodic check:pmd rebalance:%d",
> +                      pmd_rebalance);
> +
> +            if (pmd_rebalance &&
> +                !dp_netdev_is_reconf_required(dp) &&
> +                !ports_require_restart(dp) &&
> +                pmd_rebalance_dry_run(dp)) {

Don't you need the dp_netdev_mutex for the call to pmd_rebalance_dry_run,
or at least for some parts of it?

> +
> +                ovs_mutex_unlock(&dp->port_mutex);

It seems odd to be unlocking this and then taking it again, is there a
reason?

> +                ovs_mutex_lock(&dp_netdev_mutex);
> +                VLOG_DBG("PMD_AUTO_LB_MON Invoking PMD RECONFIGURE");
> +                dp_netdev_request_reconfigure(dp);
> +                ovs_mutex_unlock(&dp_netdev_mutex);
> +                ovs_mutex_lock(&dp->port_mutex);
> +            }
> +        }
> +    }
> +
>      if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
>          reconfigure_datapath(dp);
>      }
> @@ -4979,6 +5324,8 @@ pmd_thread_main(void *f_)
>  reload:
>      pmd_alloc_static_tx_qid(pmd);
>  
> +    atomic_count_init(&pmd->pmd_overloaded, 0);
> +
>      /* List port/core affinity */
>      for (i = 0; i < poll_cnt; i++) {
>         VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
> @@ -4986,6 +5333,10 @@ reload:
>                  netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
>         /* Reset the rxq current cycles counter. */
>         dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
> +
> +       for (unsigned j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
> +            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0, j);
> +       }

Is it needed for this patch? Won't all the values have been refreshed by
the time the next check is performed anyway?

>      }
>  
>      if (!poll_cnt) {
> @@ -7188,17 +7539,51 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
>                             struct polled_queue *poll_list, int poll_cnt)
>  {
>      struct dpcls *cls;
> +    uint64_t tot_idle = 0, tot_proc = 0;
> +    unsigned int idx;
> +    unsigned int pmd_load = 0;
>  
>      if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
>          uint64_t curr_tsc;
> +        struct pmd_auto_lb * pmd_alb = &pmd->dp->pmd_alb;
> +        if (pmd_alb->is_enabled && !pmd->isolated) {
> +            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
> +                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
> +            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
> +                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
> +
> +            if (tot_proc) {
> +                pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
> +            }
> +
> +            if (pmd_load >= PMD_LOAD_THRE_DEFAULT) {
> +                atomic_count_inc(&pmd->pmd_overloaded);
> +
> +                VLOG_DBG("PMD_AUTO_LB_MON PMD OVERLOAD DETECT iter %d",
> +                          atomic_count_get(&pmd->pmd_overloaded));

Better to remove this log

> +            } else {
> +                atomic_count_set(&pmd->pmd_overloaded, 0);
> +            }
> +        }
> +
> +        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
> +                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
> +        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
> +                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
> +

These have been used earlier - are they initialized somewhere for the
first use? You could just skip the above if() on the first call, so they
get initialized, or else init them at the start of pmd_thread_main().
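
E.g. a rough sketch of that init, just reusing the update lines the patch
already has further down in this function, placed near the start of
pmd_thread_main() (or after the reload: label):

    pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
                    pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
    pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
                    pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];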

>          /* Get the cycles that were used to process each queue and store. */
>          for (unsigned i = 0; i < poll_cnt; i++) {
> -            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
> -                                                        RXQ_CYCLES_PROC_CURR);
> -            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
> -            dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
> -                                     0);
> +            uint64_t rxq_cyc_curr;
> +            struct dp_netdev_rxq *rxq;
> +
> +            rxq = poll_list[i].rxq;
> +            idx = rxq->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
> +
> +            rxq_cyc_curr = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_CURR);
> +            dp_netdev_rxq_set_intrvl_cycles(rxq, rxq_cyc_curr, idx);
> +            dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_CURR, 0);
>          }
> +
>          curr_tsc = cycles_counter_update(&pmd->perf_stats);
>          if (pmd->intrvl_tsc_prev) {
>              /* There is a prev timestamp, store a new intrvl cycle count. */
> diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> index 2160910..ff3589c 100644
> --- a/vswitchd/vswitch.xml
> +++ b/vswitchd/vswitch.xml
> @@ -574,6 +574,36 @@
>              be set to 'skip_sw'.
>          </p>
>        </column>
> +      <column name="other_config" key="pmd-auto-lb"
> +              type='{"type": "boolean"}'>
> +        <p>
> +         Configures PMD Auto Load Balancing that allows automatic assignment of
> +         RX queues to PMDs if any of PMDs is overloaded (i.e. processing cycles
> +         > 95%).
> +        </p>
> +        <p>
> +         It uses current scheme of cycle based assignment of RX queues that
> +         are not statically pinned to PMDs.
> +        </p>
> +        <p>
> +          Set this value to <code>true</code> to enable this option.
> +        </p>
> +        <p>
> +          The default value is <code>false</code>.
> +        </p>
> +        <p>
> +         This only comes in effect if cycle based assignment is enabled and
> +         there are more than one non-isolated PMDs present and atleast one of
> +         it polls more than one queue.
> +        </p>
> +      </column>
> +      <column name="other_config" key="pmd-auto-lb-rebalance-intvl"
> +              type='{"type": "integer", "minInteger": 1}'>
> +        <p>
> +         The minimum time (in minutes) 2 consecutive PMD Auto Load Balancing
> +         iterations.
> +        </p>
> +      </column>
>      </group>
>      <group title="Status">
>        <column name="next_cfg">
>
Nitin Katiyar Jan. 4, 2019, 12:39 p.m. UTC | #3
> -----Original Message-----
> From: Kevin Traynor [mailto:ktraynor@redhat.com]
> Sent: Friday, January 04, 2019 1:48 AM
> To: Nitin Katiyar <nitin.katiyar@ericsson.com>; ovs-dev@openvswitch.org
> Cc: Rohith Basavaraja <rohith.basavaraja@gmail.com>; Venkatesan Pradeep
> <venkatesan.pradeep@ericsson.com>
> Subject: Re: [PATCH v2] Adding support for PMD auto load balancing
> 
> On 01/03/2019 12:36 PM, Nitin Katiyar wrote:
> > Port rx queues that have not been statically assigned to PMDs are
> > currently assigned based on periodically sampled load measurements.
> > The assignment is performed at specific instances – port addition,
> > port deletion, upon reassignment request via CLI etc.
> >
> > Due to change in traffic pattern over time it can cause uneven load
> > among the PMDs and thus resulting in lower overall throughout.
> >
> > This patch enables the support of auto load balancing of PMDs based on
> > measured load of RX queues. Each PMD measures the processing load for
> > each of its associated queues every 10 seconds. If the aggregated PMD
> > load reaches 95% for 6 consecutive intervals then PMD considers itself to
> be overloaded.
> >
> > If any PMD is overloaded, a dry-run of the PMD assignment algorithm is
> > performed by OVS main thread. The dry-run does NOT change the existing
> > queue to PMD assignments.
> >
> > If the resultant mapping of dry-run indicates an improved distribution
> > of the load then the actual reassignment will be performed.
> >
> > The automatic rebalancing will be disabled by default and has to be
> > enabled via configuration option. The interval (in minutes) between
> > two consecutive rebalancing can also be configured via CLI, default is
> > 1 min.
> >
> > Following example commands can be used to set the auto-lb params:
> > ovs-vsctl set open_vswitch . other_config:pmd-auto-lb="true"
> > ovs-vsctl set open_vswitch . other_config:pmd-auto-lb-rebalance-intvl="5"
> >
> 
> Hi Nitin, thanks for v2. Not full review yet but sending some comments below.
> 
> Maybe you can put some of the above into a new section below this
> http://docs.openvswitch.org/en/latest/topics/dpdk/pmd/#port-rx-queue-
> assigment-to-pmd-threads
Sure, I will update that too.
> 
> I also think this feature should be experimental until it has been road tested a
> bit more.
> 
> > Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
> > Co-authored-by: Venkatesan Pradeep
> <venkatesan.pradeep@ericsson.com>
> > Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
> > Signed-off-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com>
> > Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com>
> > ---
> >  lib/dpif-netdev.c    | 403
> +++++++++++++++++++++++++++++++++++++++++++++++++--
> >  vswitchd/vswitch.xml |  30 ++++
> >  2 files changed, 424 insertions(+), 9 deletions(-)
> >
> 
> There seems to be windows style line endings in the patch? or something else
> has gone wrong for me.
> 
> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
> > 1564db9..8db106f 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -80,6 +80,12 @@
> >
> >  VLOG_DEFINE_THIS_MODULE(dpif_netdev);
> >
> > +/* Auto Load Balancing Defaults */
> > +#define ACCEPT_IMPROVE_DEFAULT       (25)
> > +#define PMD_LOAD_THRE_DEFAULT        (95)
> 
> Probably you should remove the brackets above to be consistent with the
> others below and in the rest of the file.
> 
> > +#define PMD_REBALANCE_POLL_INTERVAL  1 /* 1 Min */
> > +#define MIN_TO_MSEC                  60000
> > +
> >  #define FLOW_DUMP_MAX_BATCH 50
> >  /* Use per thread recirc_depth to prevent recirculation loop. */
> > #define MAX_RECIRC_DEPTH 6 @@ -288,6 +294,13 @@ struct dp_meter {
> >      struct dp_meter_band bands[];
> >  };
> >
> > +struct pmd_auto_lb {
> > +    bool auto_lb_conf;        /* enable-disable auto load balancing */
> 
> I'm not sure what '_conf' is short for? maybe it should be something like
> 'auto_lb_requested'
Sure
> 
> > +    bool is_enabled;          /* auto_lb current status */
> 
> Comments should be of style /* Sentence case. */
> http://docs.openvswitch.org/en/latest/internals/contributing/coding-
> style/#comments
> 
Thanks for providing the link. I will update it in the next version.
> 
> > +    uint64_t rebalance_intvl;
> > +    uint64_t rebalance_poll_timer;
> > +};
> > +
> >  /* Datapath based on the network device interface from netdev.h.
> >   *
> >   *
> > @@ -368,6 +381,7 @@ struct dp_netdev {
> >      uint64_t last_tnl_conf_seq;
> >
> >      struct conntrack conntrack;
> > +    struct pmd_auto_lb pmd_alb;
> >  };
> >
> >  static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
> > @@ -682,6 +696,7 @@ struct dp_netdev_pmd_thread {
> >      struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
> >      /* List of rx queues to poll. */
> >      struct hmap poll_list OVS_GUARDED;
> > +
> 
> Unrelated newline should be removed
> 
> >      /* Map of 'tx_port's used for transmission.  Written by the main thread,
> >       * read by the pmd thread. */
> >      struct hmap tx_ports OVS_GUARDED; @@ -702,6 +717,11 @@ struct
> > dp_netdev_pmd_thread {
> >      /* Keep track of detailed PMD performance statistics. */
> >      struct pmd_perf_stats perf_stats;
> >
> > +    /* Some stats from previous iteration used by automatic pmd
> > +       load balance logic. */
> 
> Nit, but see coding stds. and other multi-line comments wrt style
> 
> > +    uint64_t prev_stats[PMD_N_STATS];> +    atomic_count
> pmd_overloaded;
> > +
> >      /* Set to true if the pmd thread needs to be reloaded. */
> >      bool need_reload;
> >  };
> > @@ -792,9 +812,11 @@ dp_netdev_rxq_get_cycles(struct dp_netdev_rxq
> *rx,
> >                           enum rxq_cycles_counter_type type);  static
> > void  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
> > -                           unsigned long long cycles);
> > +                                unsigned long long cycles,
> > +                                unsigned idx);
> >  static uint64_t
> > -dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned
> > idx);
> > +dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx,
> > +                                unsigned idx);
> 
> no need to change dp_netdev_rxq_get_intrvl_cycles()
> 
> >  static void
> >  dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread
> *pmd,
> >                                 bool purge); @@ -3734,6 +3756,51 @@
> > dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
> >      }
> >  }
> >
> > +/* Enable/Disable PMD auto load balancing */ static void
> > +set_pmd_auto_lb(struct dp_netdev *dp) {
> > +    unsigned int cnt = 0;
> > +    struct dp_netdev_pmd_thread *pmd;
> > +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> > +
> > +    bool enable = false;
> > +    bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
> > +
> > +    /* Ensure that there is at least 2 non-isolated PMDs and
> > +     * one of the PMD is polling more than one rxq
> > +     */
> > +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> > +        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
> > +            continue;
> > +        }
> > +
> > +        cnt++;
> > +        if (hmap_count(&pmd->poll_list) > 1) {
> > +            if (enable && (cnt > 1)) {
> > +                break;
> > +            } else {
> > +                enable = true;
> > +            }
> > +        }
> > +    }
> > +
> 
> Won't this give the wrong result if there is one pmd with multiple rxq's? How
> about something in the loop like,
Yes, you are right. Thanks for catching this.
> 
>         if (hmap_count(&pmd->poll_list) > 1) {
>             multirxq = true;
>         }
>         if (cnt && multirxq) {
>             enable = true;
>             break;
>         }
>         cnt++;
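Just to confirm I got the suggestion right, the check would then look roughly
like this (rough sketch only, not the final code):

    bool multirxq = false;
    bool enable = false;
    unsigned int cnt = 0;

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
            continue;
        }
        if (hmap_count(&pmd->poll_list) > 1) {
            multirxq = true;
        }
        if (cnt && multirxq) {
            /* At least two non-isolated PMDs and one polling > 1 rxq. */
            enable = true;
            break;
        }
        cnt++;
    }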
> 
> > +    /* Enable auto LB if it is configured and cycle based assignment is true */
> > +    enable = enable && pmd_rxq_assign_cyc && pmd_alb->auto_lb_conf;
> > +
> > +    if (pmd_alb->is_enabled != enable) {
> > +        pmd_alb->is_enabled = enable;
> > +        if (pmd_alb->is_enabled) {
> > +            VLOG_INFO("PMD auto lb is enabled, rebalance intvl:%lu(msec)\n",
> > +                       pmd_alb->rebalance_intvl);
> > +        } else {
> > +            pmd_alb->rebalance_poll_timer = 0;
> > +            VLOG_INFO("PMD auto lb is disabled\n");
> > +        }
> > +    }
> > +
> > +}
> > +
> >  /* Applies datapath configuration from the database. Some of the changes
> are
> >   * actually applied in dpif_netdev_run(). */  static int @@ -3748,6
> > +3815,7 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap
> *other_config)
> >                          DEFAULT_EM_FLOW_INSERT_INV_PROB);
> >      uint32_t insert_min, cur_min;
> >      uint32_t tx_flush_interval, cur_tx_flush_interval;
> > +    uint64_t rebalance_intvl;
> >
> >      tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
> >                                       DEFAULT_TX_FLUSH_INTERVAL); @@
> > -3819,6 +3887,23 @@ dpif_netdev_set_config(struct dpif *dpif, const
> struct smap *other_config)
> >                    pmd_rxq_assign);
> >          dp_netdev_request_reconfigure(dp);
> >      }
> > +
> > +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> > +    pmd_alb->auto_lb_conf = smap_get_bool(other_config, "pmd-auto-lb",
> > +                              false);
> > +
> > +    rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebalance-
> intvl",
> > +                              PMD_REBALANCE_POLL_INTERVAL);
> > +
> > +    /* Input is in min, convert it to msec */
> > +    rebalance_intvl =
> > +        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC :
> > + MIN_TO_MSEC;
> > +
> 
> This is creating a default when the user sets 0 - that needs to be documented.
> 
> With current values, this could overflow rebalance_intvl. The user value is in
> minutes, so suggest to limit user input to some reasonable value like 1 week in
> minutes, and then the min to msec can be safe. See tx-flush-interval as an
> example where the range is limited.
Thanks, I will update the documentation.
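For the range limit I am thinking of something along these lines (sketch only;
MAX_PMD_REBALANCE_INTVL is a placeholder name and the exact cap is open for
discussion):

    /* Next to the other auto-lb defines; placeholder cap of one week,
     * expressed in minutes. */
    #define MAX_PMD_REBALANCE_INTVL 10080

    int intvl = smap_get_int(other_config, "pmd-auto-lb-rebalance-intvl",
                             PMD_REBALANCE_POLL_INTERVAL);
    /* Fall back to the default for out-of-range values, so the
     * minute-to-msec conversion below cannot overflow. */
    if (intvl < 1 || intvl > MAX_PMD_REBALANCE_INTVL) {
        intvl = PMD_REBALANCE_POLL_INTERVAL;
    }
    /* Input is in minutes, convert it to msec. */
    pmd_alb->rebalance_intvl = (uint64_t) intvl * MIN_TO_MSEC;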
> 
> > +    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
> > +        pmd_alb->rebalance_intvl = rebalance_intvl;
> > +    }
> > +
> > +    set_pmd_auto_lb(dp);
> >      return 0;
> >  }
> >
> > @@ -3974,9 +4059,9 @@ dp_netdev_rxq_get_cycles(struct
> dp_netdev_rxq
> > *rx,
> >
> >  static void
> >  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
> > -                                unsigned long long cycles)
> > +                                unsigned long long cycles,
> > +                                unsigned idx)
> >  {
> > -    unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
> >      atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);  }
> >
> > @@ -4762,6 +4847,9 @@ reconfigure_datapath(struct dp_netdev *dp)
> >
> >      /* Reload affected pmd threads. */
> >      reload_affected_pmds(dp);
> > +
> > +    /* Check if PMD Auto LB is to be enabled */
> > +    set_pmd_auto_lb(dp);
> >  }
> >
> >  /* Returns true if one of the netdevs in 'dp' requires a
> > reconfiguration */ @@ -4780,6 +4868,228 @@
> ports_require_restart(const struct dp_netdev *dp)
> >      return false;
> >  }
> >
> > +/* Function for calculating variance */ static uint64_t
> > +variance(uint64_t a[], int n) {
> > +    /* Compute mean (average of elements) */
> > +    uint64_t sum = 0;
> > +    uint64_t mean = 0;
> > +    uint64_t sqDiff = 0;
> > +
> > +    if (!n) {
> > +        return 0;
> > +    }
> > +
> > +    for (int i = 0; i < n; i++) {
> > +        sum += a[i];
> > +    }
> > +
> > +    if (sum) {
> > +        mean = sum / n;
> > +
> > +        /* Compute sum squared differences with mean. */
> > +        for (int i = 0; i < n; i++) {
> > +            sqDiff += (a[i] - mean)*(a[i] - mean);
> > +        }
> > +    }
> > +    return (sqDiff ? (sqDiff / n) : 0); }
> > +
> > +static uint64_t
> > +get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
> > +uint32_t num)
> 
> I think this function would require the port_mutex.
I will check and add it in the next version.
> 
> > +{
> > +    struct dp_netdev_port *port;
> > +    struct dp_netdev_pmd_thread *pmd;
> > +    struct dp_netdev_rxq ** rxqs = NULL;
> > +    struct rr_numa *numa = NULL;
> > +    struct rr_numa_list rr;
> > +    int n_rxqs = 0;
> > +    uint64_t ret = 0;
> > +    uint64_t *pmd_usage;
> > +
> > +    pmd_usage = xcalloc(num, sizeof(uint64_t));
> > +
> > +    HMAP_FOR_EACH (port, node, &dp->ports) {
> > +        if (!netdev_is_pmd(port->netdev)) {
> > +            continue;
> > +        }
> > +
> > +        for (int qid = 0; qid < port->n_rxq; qid++) {
> > +            struct dp_netdev_rxq *q = &port->rxqs[qid];
> > +            uint64_t cycle_hist = 0;
> > +
> > +            if (q->pmd->isolated) {
> > +                continue;
> > +            }
> > +
> > +            if (n_rxqs == 0) {
> > +                rxqs = xmalloc(sizeof *rxqs);
> > +            } else {
> > +                rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
> > +            }
> > +
> > +            /* Sum the queue intervals and store the cycle history. */
> > +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> > +                cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
> > +            }
> > +            /* Do we need to add intrvl_cycles in history??
> 
> If you want to use compare_rxq_cycles() then you have to put them in history,
> but it's only used for that and re-written everytime, so I don't think it is
> harmful.
Yeah, that is the objective of adding it. 
> 
> > +             * but then we should clear interval cycles also */
> 
> I don't think you should be clearing interval cycles in a dry run, otherwise they
> will be reset if the real rebalance occurs.
Thanks for clarifying it. I will remove the comment.
> 
> > +            dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
> > +                                         cycle_hist);
> > +            /* Store the queue. */
> > +            rxqs[n_rxqs++] = q;
> > +        }
> > +    }
> > +    if (n_rxqs > 1) {
> > +        /* Sort the queues in order of the processing cycles
> > +         * they consumed during their last pmd interval. */
> > +        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
> > +    }
> > +    rr_numa_list_populate(dp, &rr);
> > +
> > +    for (int i = 0; i < n_rxqs; i++) {
> > +        int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
> > +        numa = rr_numa_list_lookup(&rr, numa_id);
> > +        if (!numa) {
> > +            /* Don't consider queues across NUMA  ???*/
> 
> I think you should abort the whole dry run process if this is happening
Okay.
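For aborting the dry run I have something like this in mind (rough sketch; the
exact cleanup may look different in the next version):

    for (int i = 0; i < n_rxqs; i++) {
        int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
        numa = rr_numa_list_lookup(&rr, numa_id);
        if (!numa) {
            /* No non-isolated pmd on this rxq's NUMA node, so the dry run
             * cannot produce a meaningful mapping.  Abort it. */
            goto cleanup;
        }
        /* ... rest of the loop body unchanged ... */
    }

    /* ... */

cleanup:
    rr_numa_list_destroy(&rr);
    free(rxqs);
    free(pmd_usage);
    /* In the abort path ret is still 0, which the caller already treats
     * as "no improvement". */
    return ret;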
> 
> > +            continue;
> > +        }
> > +
> > +        pmd = rr_numa_get_pmd(numa, true);
> > +        VLOG_DBG("PMD AUTO_LB:Core %d on numa node %d assigned port
> \'%s\' "
> > +                  "rx queue %d "
> > +                  "(measured processing cycles %"PRIu64").",
> > +                  pmd->core_id, numa_id,
> > +                  netdev_rxq_get_name(rxqs[i]->rx),
> > +                  netdev_rxq_get_queue_id(rxqs[i]->rx),
> > +                  dp_netdev_rxq_get_cycles(rxqs[i],
> > + RXQ_CYCLES_PROC_HIST));
> > +
> > +        for (int id = 0; id < num; id++) {
> > +            if (pmd->core_id == core_list[id]) {
> > +                /* Add the processing cycles of rxq to pmd polling it */
> > +                uint64_t proc_cycles = 0;
> > +                for (unsigned idx = 0; idx < PMD_RXQ_INTERVAL_MAX; idx++) {
> > +                    proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxqs[i],
> > +                                                                   idx);
> > +                }
> > +                pmd_usage[id] += proc_cycles;
> > +            }
> > +        }
> > +    }
> > +
> > +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> > +        uint64_t total_cycles = 0;
> > +
> > +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
> > +            continue;
> > +        }
> > +
> > +        /* Get the total pmd cycles for an interval. */
> > +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
> > +        /* Estimate the cycles to cover all intervals. */
> > +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
> > +        for (int id = 0; id < num; id++) {
> > +            if (pmd->core_id == core_list[id]) {
> > +                if (pmd_usage[id]) {
> > +                    pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
> > +                }
> > +                VLOG_DBG("Core_id:%d, usage:%"PRIu64"\n",
> > +                          pmd->core_id, pmd_usage[id]);
> > +            }
> > +        }
> > +    }
> > +    ret = variance(pmd_usage, num);
> > +
> > +    rr_numa_list_destroy(&rr);
> > +    free(rxqs);
> > +    free(pmd_usage);
> > +    return ret;
> > +}
> > +
> > +static bool
> > +pmd_rebalance_dry_run(struct dp_netdev *dp) {
> > +    struct dp_netdev_pmd_thread *pmd;
> > +    uint64_t *curr_pmd_usage;
> > +
> > +    uint64_t curr_variance;
> > +    uint64_t new_variance;
> > +    uint64_t improvement = 0;
> > +    uint32_t num_pmds;
> > +    uint32_t *pmd_corelist;
> > +    struct rxq_poll *poll, *poll_next;
> > +
> > +    num_pmds = cmap_count(&dp->poll_threads);
> > +
> > +    if (num_pmds > 1) {
> > +        curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
> > +        pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
> > +    } else {
> > +        return false;
> > +    }
> > +
> > +    num_pmds = 0;
> > +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> > +        uint64_t total_cycles = 0;
> > +        uint64_t total_proc = 0;
> > +
> > +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
> > +            continue;
> > +        }
> > +
> > +        /* Get the total pmd cycles for an interval. */
> > +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
> > +        /* Estimate the cycles to cover all intervals. */
> > +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
> > +
> > +        HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
> > +            uint64_t proc_cycles = 0;
> > +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> > +                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
> > +            }
> > +            total_proc += proc_cycles;
> > +        }
> > +        if (total_proc) {
> > +            curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
> > +        }
> > +
> > +        VLOG_DBG("PMD_AUTO_LB_MON curr_pmd_usage(%d):%"PRIu64"",
> > +                  pmd->core_id, curr_pmd_usage[num_pmds]);
> > +
> > +        if (atomic_count_get(&pmd->pmd_overloaded)) {
> > +            atomic_count_set(&pmd->pmd_overloaded, 0);
> > +        }
> > +
> > +        pmd_corelist[num_pmds] = pmd->core_id;
> > +        num_pmds++;
> > +    }
> > +
> > +    curr_variance = variance(curr_pmd_usage, num_pmds);
> > +
> > +    new_variance = get_dry_run_variance(dp, pmd_corelist, num_pmds);
> > +    VLOG_DBG("PMD_AUTO_LB_MON new variance: %"PRIu64","
> > +              " curr_variance: %"PRIu64"",
> > +              new_variance, curr_variance);
> > +
> > +    if (new_variance && (new_variance < curr_variance)) {
> > +        improvement =
> > +            ((curr_variance - new_variance) * 100) / curr_variance;
> > +
> > +        VLOG_DBG("PMD_AUTO_LB_MON improvement %"PRIu64"",
> improvement);
> > +    }
> > +
> > +    free(curr_pmd_usage);
> > +    free(pmd_corelist);
> > +
> > +    if (improvement >= ACCEPT_IMPROVE_DEFAULT) {
> > +        return true;
> > +    }
> > +
> > +    return false;
> > +}
> > +
> > +
> >  /* Return true if needs to revalidate datapath flows. */  static bool
> > dpif_netdev_run(struct dpif *dpif) @@ -4789,6 +5099,9 @@
> > dpif_netdev_run(struct dpif *dpif)
> >      struct dp_netdev_pmd_thread *non_pmd;
> >      uint64_t new_tnl_seq;
> >      bool need_to_flush = true;
> > +    bool pmd_rebalance = false;
> > +    long long int now = time_msec();
> > +    struct dp_netdev_pmd_thread *pmd;
> >
> >      ovs_mutex_lock(&dp->port_mutex);
> >      non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID); @@ -4821,6
> > +5134,38 @@ dpif_netdev_run(struct dpif *dpif)
> >          dp_netdev_pmd_unref(non_pmd);
> >      }
> >
> > +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> > +    if (pmd_alb->is_enabled) {
> > +        if (!pmd_alb->rebalance_poll_timer) {
> > +            pmd_alb->rebalance_poll_timer = now;
> > +        } else if ((pmd_alb->rebalance_poll_timer +
> > +                   pmd_alb->rebalance_intvl) < now) {
> > +            pmd_alb->rebalance_poll_timer = now;
> > +            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> > +                if (atomic_count_get(&pmd->pmd_overloaded) >=
> > +                                    PMD_RXQ_INTERVAL_MAX) {
> > +                    pmd_rebalance = true;
> > +                    break;
> > +                }
> > +            }
> > +            VLOG_DBG("PMD_AUTO_LB_MON periodic check:pmd
> rebalance:%d",
> > +                      pmd_rebalance);
> > +
> > +            if (pmd_rebalance &&
> > +                !dp_netdev_is_reconf_required(dp) &&
> > +                !ports_require_restart(dp) &&
> > +                pmd_rebalance_dry_run(dp)) {
> 
> Don't you need the dp_netdev_mutex for call to pmd_rebalance_dry_run, or
> at least for some parts of it?
I will check it.
> 
> > +
> > +                ovs_mutex_unlock(&dp->port_mutex);
> 
> It seems odd to be unlocking this and then taking it again, is there a reason?
I need to check again whether we can hold both locks at the same time.
> 
> > +                ovs_mutex_lock(&dp_netdev_mutex);
> > +                VLOG_DBG("PMD_AUTO_LB_MON Invoking PMD RECONFIGURE");
> > +                dp_netdev_request_reconfigure(dp);
> > +                ovs_mutex_unlock(&dp_netdev_mutex);
> > +                ovs_mutex_lock(&dp->port_mutex);
> > +            }
> > +        }
> > +    }
> > +
> >      if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
> >          reconfigure_datapath(dp);
> >      }
> > @@ -4979,6 +5324,8 @@ pmd_thread_main(void *f_)
> >  reload:
> >      pmd_alloc_static_tx_qid(pmd);
> >
> > +    atomic_count_init(&pmd->pmd_overloaded, 0);
> > +
> >      /* List port/core affinity */
> >      for (i = 0; i < poll_cnt; i++) {
> >         VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
> > @@ -4986,6 +5333,10 @@ reload:
> >                  netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
> >         /* Reset the rxq current cycles counter. */
> >         dp_netdev_rxq_set_cycles(poll_list[i].rxq,
> > RXQ_CYCLES_PROC_CURR, 0);
> > +
> > +       for (unsigned j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
> > +            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0, j);
> > +       }
> 
> is it needed for this patch? won't all the values have been refreshed by the
> time the next check is performed anyway
I thought it was safe to reset it. If the PMD is reset in the middle of a cycle, it may have stale information when the dry run is executed.
> 
> >      }
> >
> >      if (!poll_cnt) {
> > @@ -7188,17 +7539,51 @@ dp_netdev_pmd_try_optimize(struct
> dp_netdev_pmd_thread *pmd,
> >                             struct polled_queue *poll_list, int
> > poll_cnt)  {
> >      struct dpcls *cls;
> > +    uint64_t tot_idle = 0, tot_proc = 0;
> > +    unsigned int idx;
> > +    unsigned int pmd_load = 0;
> >
> >      if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
> >          uint64_t curr_tsc;
> > +        struct pmd_auto_lb * pmd_alb = &pmd->dp->pmd_alb;
> > +        if (pmd_alb->is_enabled && !pmd->isolated) {
> > +            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
> > +                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
> > +            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
> > +                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
> > +
> > +            if (tot_proc) {
> > +                pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
> > +            }
> > +
> > +            if (pmd_load >= PMD_LOAD_THRE_DEFAULT) {
> > +                atomic_count_inc(&pmd->pmd_overloaded);
> > +
> > +                VLOG_DBG("PMD_AUTO_LB_MON PMD OVERLOAD DETECT iter
> %d",
> > +                          atomic_count_get(&pmd->pmd_overloaded));
> 
> Better to remove this log
Sure
> 
> > +            } else {
> > +                atomic_count_set(&pmd->pmd_overloaded, 0);
> > +            }
> > +        }
> > +
> > +        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
> > +                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
> > +        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
> > +
> > + pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
> > +
> 
> These have been used earlier - are they initialized somewhere for the first
> use? you could just skip the above if() on the first call, so they get initialized,
> or else init them at the start of pmd_thread_main()
It would have been initialized to 0 when the pmd structure was created. We are not resetting it to 0 whenever the PMD is reloaded.
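If they do turn out to need resetting on reload, I guess a small change at the
reload label would be enough, e.g. (sketch only):

    reload:
        pmd_alloc_static_tx_qid(pmd);

        atomic_count_init(&pmd->pmd_overloaded, 0);
        /* Snapshot the current counters so the first interval after a
         * reload does not see a stale delta. */
        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
            pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
            pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];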
> 
> >          /* Get the cycles that were used to process each queue and store. */
> >          for (unsigned i = 0; i < poll_cnt; i++) {
> > -            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
> > -                                                        RXQ_CYCLES_PROC_CURR);
> > -            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
> > -            dp_netdev_rxq_set_cycles(poll_list[i].rxq,
> RXQ_CYCLES_PROC_CURR,
> > -                                     0);
> > +            uint64_t rxq_cyc_curr;
> > +            struct dp_netdev_rxq *rxq;
> > +
> > +            rxq = poll_list[i].rxq;
> > +            idx = rxq->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
> > +
> > +            rxq_cyc_curr = dp_netdev_rxq_get_cycles(rxq,
> RXQ_CYCLES_PROC_CURR);
> > +            dp_netdev_rxq_set_intrvl_cycles(rxq, rxq_cyc_curr, idx);
> > +            dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_CURR, 0);
> >          }
> > +
> >          curr_tsc = cycles_counter_update(&pmd->perf_stats);
> >          if (pmd->intrvl_tsc_prev) {
> >              /* There is a prev timestamp, store a new intrvl cycle
> > count. */ diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> > index 2160910..ff3589c 100644
> > --- a/vswitchd/vswitch.xml
> > +++ b/vswitchd/vswitch.xml
> > @@ -574,6 +574,36 @@
> >              be set to 'skip_sw'.
> >          </p>
> >        </column>
> > +      <column name="other_config" key="pmd-auto-lb"
> > +              type='{"type": "boolean"}'>
> > +        <p>
> > +         Configures PMD Auto Load Balancing that allows automatic
> assignment of
> > +         RX queues to PMDs if any of PMDs is overloaded (i.e. processing
> cycles
> > +         > 95%).
> > +        </p>
> > +        <p>
> > +         It uses current scheme of cycle based assignment of RX queues that
> > +         are not statically pinned to PMDs.
> > +        </p>
> > +        <p>
> > +          Set this value to <code>true</code> to enable this option.
> > +        </p>
> > +        <p>
> > +          The default value is <code>false</code>.
> > +        </p>
> > +        <p>
> > +         This only comes in effect if cycle based assignment is enabled and
> > +         there are more than one non-isolated PMDs present and atleast one
> of
> > +         it polls more than one queue.
> > +        </p>
> > +      </column>
> > +      <column name="other_config" key="pmd-auto-lb-rebalance-intvl"
> > +              type='{"type": "integer", "minInteger": 1}'>
> > +        <p>
> > +         The minimum time (in minutes) 2 consecutive PMD Auto Load
> Balancing
> > +         iterations.
> > +        </p>
> > +      </column>
> >      </group>
> >      <group title="Status">
> >        <column name="next_cfg">
> >
Kevin Traynor Jan. 4, 2019, 2:56 p.m. UTC | #4
On 01/04/2019 12:39 PM, Nitin Katiyar wrote:
> 
> 
>> -----Original Message-----
>> From: Kevin Traynor [mailto:ktraynor@redhat.com]
>> Sent: Friday, January 04, 2019 1:48 AM
>> To: Nitin Katiyar <nitin.katiyar@ericsson.com>; ovs-dev@openvswitch.org
>> Cc: Rohith Basavaraja <rohith.basavaraja@gmail.com>; Venkatesan Pradeep
>> <venkatesan.pradeep@ericsson.com>
>> Subject: Re: [PATCH v2] Adding support for PMD auto load balancing
>>
>> On 01/03/2019 12:36 PM, Nitin Katiyar wrote:
>>> Port rx queues that have not been statically assigned to PMDs are
>>> currently assigned based on periodically sampled load measurements.
>>> The assignment is performed at specific instances – port addition,
>>> port deletion, upon reassignment request via CLI etc.
>>>
>>> Due to change in traffic pattern over time it can cause uneven load
>>> among the PMDs and thus resulting in lower overall throughout.
>>>
>>> This patch enables the support of auto load balancing of PMDs based on
>>> measured load of RX queues. Each PMD measures the processing load for
>>> each of its associated queues every 10 seconds. If the aggregated PMD
>>> load reaches 95% for 6 consecutive intervals then PMD considers itself to
>> be overloaded.
>>>
>>> If any PMD is overloaded, a dry-run of the PMD assignment algorithm is
>>> performed by OVS main thread. The dry-run does NOT change the existing
>>> queue to PMD assignments.
>>>
>>> If the resultant mapping of dry-run indicates an improved distribution
>>> of the load then the actual reassignment will be performed.
>>>
>>> The automatic rebalancing will be disabled by default and has to be
>>> enabled via configuration option. The interval (in minutes) between
>>> two consecutive rebalancing can also be configured via CLI, default is
>>> 1 min.
>>>
>>> Following example commands can be used to set the auto-lb params:
>>> ovs-vsctl set open_vswitch . other_config:pmd-auto-lb="true"
>>> ovs-vsctl set open_vswitch . other_config:pmd-auto-lb-rebalance-intvl="5"
>>>
>>
>> Hi Nitin, thanks for v2. Not full review yet but sending some comments below.
>>
>> Maybe you can put some of the above into a new section below this
>> http://docs.openvswitch.org/en/latest/topics/dpdk/pmd/#port-rx-queue-
>> assigment-to-pmd-threads
> Sure, I will update that too.
>>
>> I also think this feature should be experimental until it has been road tested a
>> bit more.
>>
>>> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
>>> Co-authored-by: Venkatesan Pradeep
>> <venkatesan.pradeep@ericsson.com>
>>> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
>>> Signed-off-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com>
>>> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com>
>>> ---
>>>  lib/dpif-netdev.c    | 403
>> +++++++++++++++++++++++++++++++++++++++++++++++++--
>>>  vswitchd/vswitch.xml |  30 ++++
>>>  2 files changed, 424 insertions(+), 9 deletions(-)
>>>
>>
>> There seems to be windows style line endings in the patch? or something else
>> has gone wrong for me.
>>
>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
>>> 1564db9..8db106f 100644
>>> --- a/lib/dpif-netdev.c
>>> +++ b/lib/dpif-netdev.c
>>> @@ -80,6 +80,12 @@
>>>
>>>  VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>>>
>>> +/* Auto Load Balancing Defaults */
>>> +#define ACCEPT_IMPROVE_DEFAULT       (25)
>>> +#define PMD_LOAD_THRE_DEFAULT        (95)
>>
>> Probably you should remove the brackets above to be consistent with the
>> others below and in the rest of the file.
>>
>>> +#define PMD_REBALANCE_POLL_INTERVAL  1 /* 1 Min */
>>> +#define MIN_TO_MSEC                  60000
>>> +
>>>  #define FLOW_DUMP_MAX_BATCH 50
>>>  /* Use per thread recirc_depth to prevent recirculation loop. */
>>> #define MAX_RECIRC_DEPTH 6 @@ -288,6 +294,13 @@ struct dp_meter {
>>>      struct dp_meter_band bands[];
>>>  };
>>>
>>> +struct pmd_auto_lb {
>>> +    bool auto_lb_conf;        /* enable-disable auto load balancing */
>>
>> I'm not sure what '_conf' is short for? maybe it should be something like
>> 'auto_lb_requested'
> Sure
>>
>>> +    bool is_enabled;          /* auto_lb current status */
>>
>> Comments should be of style /* Sentence case. */
>> http://docs.openvswitch.org/en/latest/internals/contributing/coding-
>> style/#comments
>>
> Thanks for providing the link. I will update it in the next version.
>>
>>> +    uint64_t rebalance_intvl;
>>> +    uint64_t rebalance_poll_timer;
>>> +};
>>> +
>>>  /* Datapath based on the network device interface from netdev.h.
>>>   *
>>>   *
>>> @@ -368,6 +381,7 @@ struct dp_netdev {
>>>      uint64_t last_tnl_conf_seq;
>>>
>>>      struct conntrack conntrack;
>>> +    struct pmd_auto_lb pmd_alb;
>>>  };
>>>
>>>  static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
>>> @@ -682,6 +696,7 @@ struct dp_netdev_pmd_thread {
>>>      struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>>>      /* List of rx queues to poll. */
>>>      struct hmap poll_list OVS_GUARDED;
>>> +
>>
>> Unrelated newline should be removed
>>
>>>      /* Map of 'tx_port's used for transmission.  Written by the main thread,
>>>       * read by the pmd thread. */
>>>      struct hmap tx_ports OVS_GUARDED; @@ -702,6 +717,11 @@ struct
>>> dp_netdev_pmd_thread {
>>>      /* Keep track of detailed PMD performance statistics. */
>>>      struct pmd_perf_stats perf_stats;
>>>
>>> +    /* Some stats from previous iteration used by automatic pmd
>>> +       load balance logic. */
>>
>> Nit, but see coding stds. and other multi-line comments wrt style
>>
>>> +    uint64_t prev_stats[PMD_N_STATS];> +    atomic_count
>> pmd_overloaded;
>>> +
>>>      /* Set to true if the pmd thread needs to be reloaded. */
>>>      bool need_reload;
>>>  };
>>> @@ -792,9 +812,11 @@ dp_netdev_rxq_get_cycles(struct dp_netdev_rxq
>> *rx,
>>>                           enum rxq_cycles_counter_type type);  static
>>> void  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
>>> -                           unsigned long long cycles);
>>> +                                unsigned long long cycles,
>>> +                                unsigned idx);
>>>  static uint64_t
>>> -dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned
>>> idx);
>>> +dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx,
>>> +                                unsigned idx);
>>
>> no need to change dp_netdev_rxq_get_intrvl_cycles()
>>
>>>  static void
>>>  dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread
>> *pmd,
>>>                                 bool purge); @@ -3734,6 +3756,51 @@
>>> dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
>>>      }
>>>  }
>>>
>>> +/* Enable/Disable PMD auto load balancing */ static void
>>> +set_pmd_auto_lb(struct dp_netdev *dp) {
>>> +    unsigned int cnt = 0;
>>> +    struct dp_netdev_pmd_thread *pmd;
>>> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
>>> +
>>> +    bool enable = false;
>>> +    bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
>>> +
>>> +    /* Ensure that there is at least 2 non-isolated PMDs and
>>> +     * one of the PMD is polling more than one rxq
>>> +     */
>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>> +        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
>>> +            continue;
>>> +        }
>>> +
>>> +        cnt++;
>>> +        if (hmap_count(&pmd->poll_list) > 1) {
>>> +            if (enable && (cnt > 1)) {
>>> +                break;
>>> +            } else {
>>> +                enable = true;
>>> +            }
>>> +        }
>>> +    }
>>> +
>>
>> Won't this give the wrong result if there is one pmd with multiple rxq's? How
>> about something in the loop like,
> Yes, you are right. Thanks for catching this.
>>
>>         if (hmap_count(&pmd->poll_list) > 1) {
>>             multirxq = true;
>>         }
>>         if (cnt && multirxq) {
>>             enable = true;
>>             break;
>>         }
>>         cnt++;
>>
>>> +    /* Enable auto LB if it is configured and cycle based assignment is true */
>>> +    enable = enable && pmd_rxq_assign_cyc && pmd_alb->auto_lb_conf;
>>> +
>>> +    if (pmd_alb->is_enabled != enable) {
>>> +        pmd_alb->is_enabled = enable;
>>> +        if (pmd_alb->is_enabled) {
>>> +            VLOG_INFO("PMD auto lb is enabled, rebalance intvl:%lu(msec)\n",
>>> +                       pmd_alb->rebalance_intvl);
>>> +        } else {
>>> +            pmd_alb->rebalance_poll_timer = 0;
>>> +            VLOG_INFO("PMD auto lb is disabled\n");
>>> +        }
>>> +    }
>>> +
>>> +}
>>> +
>>>  /* Applies datapath configuration from the database. Some of the changes
>> are
>>>   * actually applied in dpif_netdev_run(). */  static int @@ -3748,6
>>> +3815,7 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap
>> *other_config)
>>>                          DEFAULT_EM_FLOW_INSERT_INV_PROB);
>>>      uint32_t insert_min, cur_min;
>>>      uint32_t tx_flush_interval, cur_tx_flush_interval;
>>> +    uint64_t rebalance_intvl;
>>>
>>>      tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
>>>                                       DEFAULT_TX_FLUSH_INTERVAL); @@
>>> -3819,6 +3887,23 @@ dpif_netdev_set_config(struct dpif *dpif, const
>> struct smap *other_config)
>>>                    pmd_rxq_assign);
>>>          dp_netdev_request_reconfigure(dp);
>>>      }
>>> +
>>> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
>>> +    pmd_alb->auto_lb_conf = smap_get_bool(other_config, "pmd-auto-lb",
>>> +                              false);
>>> +
>>> +    rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebalance-
>> intvl",
>>> +                              PMD_REBALANCE_POLL_INTERVAL);
>>> +
>>> +    /* Input is in min, convert it to msec */
>>> +    rebalance_intvl =
>>> +        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC :
>>> + MIN_TO_MSEC;
>>> +
>>
>> This is creating a default when the user sets 0 - that needs to be documented.
>>
>> With current values, this could overflow rebalance_intvl. The user value is in
>> minutes, so suggest to limit user input to some reasonable value like 1 week in
>> minutes, and then the min to msec can be safe. See tx-flush-interval as an
>> example where the range is limited.
> Thanks, I will update the documentation.
>>
>>> +    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
>>> +        pmd_alb->rebalance_intvl = rebalance_intvl;
>>> +    }
>>> +
>>> +    set_pmd_auto_lb(dp);
>>>      return 0;
>>>  }
>>>
>>> @@ -3974,9 +4059,9 @@ dp_netdev_rxq_get_cycles(struct
>> dp_netdev_rxq
>>> *rx,
>>>
>>>  static void
>>>  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
>>> -                                unsigned long long cycles)
>>> +                                unsigned long long cycles,
>>> +                                unsigned idx)
>>>  {
>>> -    unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
>>>      atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);  }
>>>
>>> @@ -4762,6 +4847,9 @@ reconfigure_datapath(struct dp_netdev *dp)
>>>
>>>      /* Reload affected pmd threads. */
>>>      reload_affected_pmds(dp);
>>> +
>>> +    /* Check if PMD Auto LB is to be enabled */
>>> +    set_pmd_auto_lb(dp);
>>>  }
>>>
>>>  /* Returns true if one of the netdevs in 'dp' requires a
>>> reconfiguration */ @@ -4780,6 +4868,228 @@
>> ports_require_restart(const struct dp_netdev *dp)
>>>      return false;
>>>  }
>>>
>>> +/* Function for calculating variance */ static uint64_t
>>> +variance(uint64_t a[], int n) {
>>> +    /* Compute mean (average of elements) */
>>> +    uint64_t sum = 0;
>>> +    uint64_t mean = 0;
>>> +    uint64_t sqDiff = 0;
>>> +
>>> +    if (!n) {
>>> +        return 0;
>>> +    }
>>> +
>>> +    for (int i = 0; i < n; i++) {
>>> +        sum += a[i];
>>> +    }
>>> +
>>> +    if (sum) {
>>> +        mean = sum / n;
>>> +
>>> +        /* Compute sum squared differences with mean. */
>>> +        for (int i = 0; i < n; i++) {
>>> +            sqDiff += (a[i] - mean)*(a[i] - mean);
>>> +        }
>>> +    }
>>> +    return (sqDiff ? (sqDiff / n) : 0); }
>>> +
>>> +static uint64_t
>>> +get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
>>> +uint32_t num)
>>
>> I think this function would require the port_mutex.
> I will check and add it in the next version.
>>
>>> +{
>>> +    struct dp_netdev_port *port;
>>> +    struct dp_netdev_pmd_thread *pmd;
>>> +    struct dp_netdev_rxq ** rxqs = NULL;
>>> +    struct rr_numa *numa = NULL;
>>> +    struct rr_numa_list rr;
>>> +    int n_rxqs = 0;
>>> +    uint64_t ret = 0;
>>> +    uint64_t *pmd_usage;
>>> +
>>> +    pmd_usage = xcalloc(num, sizeof(uint64_t));
>>> +
>>> +    HMAP_FOR_EACH (port, node, &dp->ports) {
>>> +        if (!netdev_is_pmd(port->netdev)) {
>>> +            continue;
>>> +        }
>>> +
>>> +        for (int qid = 0; qid < port->n_rxq; qid++) {
>>> +            struct dp_netdev_rxq *q = &port->rxqs[qid];
>>> +            uint64_t cycle_hist = 0;
>>> +
>>> +            if (q->pmd->isolated) {
>>> +                continue;
>>> +            }
>>> +
>>> +            if (n_rxqs == 0) {
>>> +                rxqs = xmalloc(sizeof *rxqs);
>>> +            } else {
>>> +                rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
>>> +            }
>>> +
>>> +            /* Sum the queue intervals and store the cycle history. */
>>> +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
>>> +                cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
>>> +            }
>>> +            /* Do we need to add intrvl_cycles in history??
>>
>> If you want to use compare_rxq_cycles() then you have to put them in history,
>> but it's only used for that and re-written everytime, so I don't think it is
>> harmful.
> Yeah, that is the objective of adding it. 
>>
>>> +             * but then we should clear interval cycles also */
>>
>> I don't think you should be clearing interval cycles in a dry run, otherwise they
>> will be reset if the real rebalance occurs.
> Thanks for clarifying it. I will remove the comment.
>>
>>> +            dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
>>> +                                         cycle_hist);
>>> +            /* Store the queue. */
>>> +            rxqs[n_rxqs++] = q;
>>> +        }
>>> +    }
>>> +    if (n_rxqs > 1) {
>>> +        /* Sort the queues in order of the processing cycles
>>> +         * they consumed during their last pmd interval. */
>>> +        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
>>> +    }
>>> +    rr_numa_list_populate(dp, &rr);
>>> +
>>> +    for (int i = 0; i < n_rxqs; i++) {
>>> +        int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
>>> +        numa = rr_numa_list_lookup(&rr, numa_id);
>>> +        if (!numa) {
>>> +            /* Don't consider queues across NUMA  ???*/
>>
>> I think you should abort the whole dry run process if this is happening
> Okay.
>>
>>> +            continue;
>>> +        }
>>> +
>>> +        pmd = rr_numa_get_pmd(numa, true);
>>> +        VLOG_DBG("PMD AUTO_LB:Core %d on numa node %d assigned port
>> \'%s\' "
>>> +                  "rx queue %d "
>>> +                  "(measured processing cycles %"PRIu64").",
>>> +                  pmd->core_id, numa_id,
>>> +                  netdev_rxq_get_name(rxqs[i]->rx),
>>> +                  netdev_rxq_get_queue_id(rxqs[i]->rx),
>>> +                  dp_netdev_rxq_get_cycles(rxqs[i],
>>> + RXQ_CYCLES_PROC_HIST));
>>> +
>>> +        for (int id = 0; id < num; id++) {
>>> +            if (pmd->core_id == core_list[id]) {
>>> +                /* Add the processing cycles of rxq to pmd polling it */
>>> +                uint64_t proc_cycles = 0;
>>> +                for (unsigned idx = 0; idx < PMD_RXQ_INTERVAL_MAX; idx++) {
>>> +                    proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxqs[i],
>>> +                                                                   idx);
>>> +                }
>>> +                pmd_usage[id] += proc_cycles;
>>> +            }
>>> +        }
>>> +    }
>>> +
>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>> +        uint64_t total_cycles = 0;
>>> +
>>> +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
>>> +            continue;
>>> +        }
>>> +
>>> +        /* Get the total pmd cycles for an interval. */
>>> +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
>>> +        /* Estimate the cycles to cover all intervals. */
>>> +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
>>> +        for (int id = 0; id < num; id++) {
>>> +            if (pmd->core_id == core_list[id]) {
>>> +                if (pmd_usage[id]) {
>>> +                    pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
>>> +                }
>>> +                VLOG_DBG("Core_id:%d, usage:%"PRIu64"\n",
>>> +                          pmd->core_id, pmd_usage[id]);
>>> +            }
>>> +        }
>>> +    }
>>> +    ret = variance(pmd_usage, num);
>>> +
>>> +    rr_numa_list_destroy(&rr);
>>> +    free(rxqs);
>>> +    free(pmd_usage);
>>> +    return ret;
>>> +}
>>> +
>>> +static bool
>>> +pmd_rebalance_dry_run(struct dp_netdev *dp) {
>>> +    struct dp_netdev_pmd_thread *pmd;
>>> +    uint64_t *curr_pmd_usage;
>>> +
>>> +    uint64_t curr_variance;
>>> +    uint64_t new_variance;
>>> +    uint64_t improvement = 0;
>>> +    uint32_t num_pmds;
>>> +    uint32_t *pmd_corelist;
>>> +    struct rxq_poll *poll, *poll_next;
>>> +
>>> +    num_pmds = cmap_count(&dp->poll_threads);
>>> +
>>> +    if (num_pmds > 1) {
>>> +        curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
>>> +        pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
>>> +    } else {
>>> +        return false;
>>> +    }
>>> +
>>> +    num_pmds = 0;
>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>> +        uint64_t total_cycles = 0;
>>> +        uint64_t total_proc = 0;
>>> +
>>> +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
>>> +            continue;
>>> +        }
>>> +
>>> +        /* Get the total pmd cycles for an interval. */
>>> +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
>>> +        /* Estimate the cycles to cover all intervals. */
>>> +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
>>> +
>>> +        HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
>>> +            uint64_t proc_cycles = 0;
>>> +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
>>> +                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
>>> +            }
>>> +            total_proc += proc_cycles;
>>> +        }
>>> +        if (total_proc) {
>>> +            curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
>>> +        }
>>> +
>>> +        VLOG_DBG("PMD_AUTO_LB_MON curr_pmd_usage(%d):%"PRIu64"",
>>> +                  pmd->core_id, curr_pmd_usage[num_pmds]);
>>> +
>>> +        if (atomic_count_get(&pmd->pmd_overloaded)) {
>>> +            atomic_count_set(&pmd->pmd_overloaded, 0);
>>> +        }
>>> +
>>> +        pmd_corelist[num_pmds] = pmd->core_id;
>>> +        num_pmds++;
>>> +    }
>>> +
>>> +    curr_variance = variance(curr_pmd_usage, num_pmds);
>>> +
>>> +    new_variance = get_dry_run_variance(dp, pmd_corelist, num_pmds);
>>> +    VLOG_DBG("PMD_AUTO_LB_MON new variance: %"PRIu64","
>>> +              " curr_variance: %"PRIu64"",
>>> +              new_variance, curr_variance);
>>> +
>>> +    if (new_variance && (new_variance < curr_variance)) {
>>> +        improvement =
>>> +            ((curr_variance - new_variance) * 100) / curr_variance;
>>> +
>>> +        VLOG_DBG("PMD_AUTO_LB_MON improvement %"PRIu64"",
>> improvement);
>>> +    }
>>> +
>>> +    free(curr_pmd_usage);
>>> +    free(pmd_corelist);
>>> +
>>> +    if (improvement >= ACCEPT_IMPROVE_DEFAULT) {
>>> +        return true;
>>> +    }
>>> +
>>> +    return false;
>>> +}
>>> +
>>> +
>>>  /* Return true if needs to revalidate datapath flows. */  static bool
>>> dpif_netdev_run(struct dpif *dpif) @@ -4789,6 +5099,9 @@
>>> dpif_netdev_run(struct dpif *dpif)
>>>      struct dp_netdev_pmd_thread *non_pmd;
>>>      uint64_t new_tnl_seq;
>>>      bool need_to_flush = true;
>>> +    bool pmd_rebalance = false;
>>> +    long long int now = time_msec();
>>> +    struct dp_netdev_pmd_thread *pmd;
>>>
>>>      ovs_mutex_lock(&dp->port_mutex);
>>>      non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID); @@ -4821,6
>>> +5134,38 @@ dpif_netdev_run(struct dpif *dpif)
>>>          dp_netdev_pmd_unref(non_pmd);
>>>      }
>>>
>>> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
>>> +    if (pmd_alb->is_enabled) {
>>> +        if (!pmd_alb->rebalance_poll_timer) {
>>> +            pmd_alb->rebalance_poll_timer = now;
>>> +        } else if ((pmd_alb->rebalance_poll_timer +
>>> +                   pmd_alb->rebalance_intvl) < now) {
>>> +            pmd_alb->rebalance_poll_timer = now;
>>> +            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>> +                if (atomic_count_get(&pmd->pmd_overloaded) >=
>>> +                                    PMD_RXQ_INTERVAL_MAX) {
>>> +                    pmd_rebalance = true;
>>> +                    break;
>>> +                }
>>> +            }
>>> +            VLOG_DBG("PMD_AUTO_LB_MON periodic check:pmd
>> rebalance:%d",
>>> +                      pmd_rebalance);
>>> +
>>> +            if (pmd_rebalance &&
>>> +                !dp_netdev_is_reconf_required(dp) &&
>>> +                !ports_require_restart(dp) &&
>>> +                pmd_rebalance_dry_run(dp)) {
>>
>> Don't you need the dp_netdev_mutex for call to pmd_rebalance_dry_run, or
>> at least for some parts of it?
> I will check it.

Actually, I think it's not needed.

>>
>>> +
>>> +                ovs_mutex_unlock(&dp->port_mutex);
>>
>> It seems odd to be unlocking this and then taking it again, is there a reason?
> I need to check again whether we can hold both locks at the same time.
>>
>>> +                ovs_mutex_lock(&dp_netdev_mutex);

I don't think you need this lock, or to unlock the port_mutex.
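i.e. roughly (sketch only):

    if (pmd_rebalance &&
        !dp_netdev_is_reconf_required(dp) &&
        !ports_require_restart(dp) &&
        pmd_rebalance_dry_run(dp)) {
        /* port_mutex is already held here; just request the reconfigure
         * and let the existing check below perform it. */
        VLOG_DBG("PMD_AUTO_LB_MON Invoking PMD RECONFIGURE");
        dp_netdev_request_reconfigure(dp);
    }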

>>> +                VLOG_DBG("PMD_AUTO_LB_MON Invoking PMD RECONFIGURE");
>>> +                dp_netdev_request_reconfigure(dp);
>>> +                ovs_mutex_unlock(&dp_netdev_mutex);
>>> +                ovs_mutex_lock(&dp->port_mutex);
>>> +            }
>>> +        }
>>> +    }
>>> +
>>>      if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
>>>          reconfigure_datapath(dp);
>>>      }
>>> @@ -4979,6 +5324,8 @@ pmd_thread_main(void *f_)
>>>  reload:
>>>      pmd_alloc_static_tx_qid(pmd);
>>>
>>> +    atomic_count_init(&pmd->pmd_overloaded, 0);
>>> +
>>>      /* List port/core affinity */
>>>      for (i = 0; i < poll_cnt; i++) {
>>>         VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
>>> @@ -4986,6 +5333,10 @@ reload:
>>>                  netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
>>>         /* Reset the rxq current cycles counter. */
>>>         dp_netdev_rxq_set_cycles(poll_list[i].rxq,
>>> RXQ_CYCLES_PROC_CURR, 0);
>>> +
>>> +       for (unsigned j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
>>> +            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0, j);
>>> +       }
>>
>> is it needed for this patch? won't all the values have been refreshed by the
>> time the next check is performed anyway
> I thought it was safe to reset it. If the PMD is reset in the middle of a cycle, it may have stale information when the dry run is executed.

It shouldn't be reset, as that can clear the info for some rxqs just before
the rxq-to-pmd assignment when ports are reconfigured. You can see this in the
rxq-pmd assignment logs when adding/removing rxqs.

>>
>>>      }
>>>
>>>      if (!poll_cnt) {
>>> @@ -7188,17 +7539,51 @@ dp_netdev_pmd_try_optimize(struct
>> dp_netdev_pmd_thread *pmd,
>>>                             struct polled_queue *poll_list, int
>>> poll_cnt)  {
>>>      struct dpcls *cls;
>>> +    uint64_t tot_idle = 0, tot_proc = 0;
>>> +    unsigned int idx;
>>> +    unsigned int pmd_load = 0;
>>>
>>>      if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
>>>          uint64_t curr_tsc;
>>> +        struct pmd_auto_lb * pmd_alb = &pmd->dp->pmd_alb;
>>> +        if (pmd_alb->is_enabled && !pmd->isolated) {
>>> +            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
>>> +                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
>>> +            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
>>> +                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
>>> +
>>> +            if (tot_proc) {
>>> +                pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
>>> +            }
>>> +
>>> +            if (pmd_load >= PMD_LOAD_THRE_DEFAULT) {
>>> +                atomic_count_inc(&pmd->pmd_overloaded);
>>> +
>>> +                VLOG_DBG("PMD_AUTO_LB_MON PMD OVERLOAD DETECT iter
>> %d",
>>> +                          atomic_count_get(&pmd->pmd_overloaded));
>>
>> Better to remove this log
> Sure
>>
>>> +            } else {
>>> +                atomic_count_set(&pmd->pmd_overloaded, 0);
>>> +            }
>>> +        }
>>> +
>>> +        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
>>> +                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
>>> +        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
>>> +
>>> + pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
>>> +
>>
>> These have been used earlier - are they initialized somewhere for the first
>> use? you could just skip the above if() on the first call, so they get initialized,
>> or else init them at the start of pmd_thread_main()
> It would have been initialized to 0 when the pmd structure was created. We are not resetting it to 0 whenever the PMD is reloaded.

But isn't the counters.n structure reset every time the pmd reloads? If so,
these would need to be reset as well. Maybe I read the pmd_stats code wrong.

>>
>>>          /* Get the cycles that were used to process each queue and store. */
>>>          for (unsigned i = 0; i < poll_cnt; i++) {
>>> -            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
>>> -                                                        RXQ_CYCLES_PROC_CURR);
>>> -            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
>>> -            dp_netdev_rxq_set_cycles(poll_list[i].rxq,
>> RXQ_CYCLES_PROC_CURR,
>>> -                                     0);
>>> +            uint64_t rxq_cyc_curr;
>>> +            struct dp_netdev_rxq *rxq;
>>> +
>>> +            rxq = poll_list[i].rxq;
>>> +            idx = rxq->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
>>> +
>>> +            rxq_cyc_curr = dp_netdev_rxq_get_cycles(rxq,
>> RXQ_CYCLES_PROC_CURR);
>>> +            dp_netdev_rxq_set_intrvl_cycles(rxq, rxq_cyc_curr, idx);
>>> +            dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_CURR, 0);
>>>          }
>>> +
>>>          curr_tsc = cycles_counter_update(&pmd->perf_stats);
>>>          if (pmd->intrvl_tsc_prev) {
>>>              /* There is a prev timestamp, store a new intrvl cycle
>>> count. */ diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
>>> index 2160910..ff3589c 100644
>>> --- a/vswitchd/vswitch.xml
>>> +++ b/vswitchd/vswitch.xml
>>> @@ -574,6 +574,36 @@
>>>              be set to 'skip_sw'.
>>>          </p>
>>>        </column>
>>> +      <column name="other_config" key="pmd-auto-lb"
>>> +              type='{"type": "boolean"}'>
>>> +        <p>
>>> +         Configures PMD Auto Load Balancing that allows automatic
>> assignment of
>>> +         RX queues to PMDs if any of PMDs is overloaded (i.e. processing
>> cycles
>>> +         > 95%).
>>> +        </p>
>>> +        <p>
>>> +         It uses current scheme of cycle based assignment of RX queues that
>>> +         are not statically pinned to PMDs.
>>> +        </p>
>>> +        <p>
>>> +          Set this value to <code>true</code> to enable this option.
>>> +        </p>
>>> +        <p>
>>> +          The default value is <code>false</code>.
>>> +        </p>
>>> +        <p>
>>> +         This only comes in effect if cycle based assignment is enabled and
>>> +         there are more than one non-isolated PMDs present and atleast one
>> of
>>> +         it polls more than one queue.
>>> +        </p>
>>> +      </column>
>>> +      <column name="other_config" key="pmd-auto-lb-rebalance-intvl"
>>> +              type='{"type": "integer", "minInteger": 1}'>
>>> +        <p>
>>> +         The minimum time (in minutes) 2 consecutive PMD Auto Load
>> Balancing
>>> +         iterations.
>>> +        </p>
>>> +      </column>
>>>      </group>
>>>      <group title="Status">
>>>        <column name="next_cfg">
>>>
>
Kevin Traynor Jan. 4, 2019, 4:56 p.m. UTC | #5
On 01/04/2019 02:56 PM, Kevin Traynor wrote:
> On 01/04/2019 12:39 PM, Nitin Katiyar wrote:
>>
>>
>>> -----Original Message-----
>>> From: Kevin Traynor [mailto:ktraynor@redhat.com]
>>> Sent: Friday, January 04, 2019 1:48 AM
>>> To: Nitin Katiyar <nitin.katiyar@ericsson.com>; ovs-dev@openvswitch.org
>>> Cc: Rohith Basavaraja <rohith.basavaraja@gmail.com>; Venkatesan Pradeep
>>> <venkatesan.pradeep@ericsson.com>
>>> Subject: Re: [PATCH v2] Adding support for PMD auto load balancing
>>>
>>> On 01/03/2019 12:36 PM, Nitin Katiyar wrote:
>>>> Port rx queues that have not been statically assigned to PMDs are
>>>> currently assigned based on periodically sampled load measurements.
>>>> The assignment is performed at specific instances – port addition,
>>>> port deletion, upon reassignment request via CLI etc.
>>>>
>>>> Due to change in traffic pattern over time it can cause uneven load
>>>> among the PMDs and thus resulting in lower overall throughout.
>>>>
>>>> This patch enables the support of auto load balancing of PMDs based on
>>>> measured load of RX queues. Each PMD measures the processing load for
>>>> each of its associated queues every 10 seconds. If the aggregated PMD
>>>> load reaches 95% for 6 consecutive intervals then PMD considers itself to
>>> be overloaded.
>>>>
>>>> If any PMD is overloaded, a dry-run of the PMD assignment algorithm is
>>>> performed by OVS main thread. The dry-run does NOT change the existing
>>>> queue to PMD assignments.
>>>>
>>>> If the resultant mapping of dry-run indicates an improved distribution
>>>> of the load then the actual reassignment will be performed.
>>>>
>>>> The automatic rebalancing will be disabled by default and has to be
>>>> enabled via configuration option. The interval (in minutes) between
>>>> two consecutive rebalancing can also be configured via CLI, default is
>>>> 1 min.
>>>>
>>>> Following example commands can be used to set the auto-lb params:
>>>> ovs-vsctl set open_vswitch . other_config:pmd-auto-lb="true"
>>>> ovs-vsctl set open_vswitch . other_config:pmd-auto-lb-rebalance-intvl="5"
>>>>
>>>
>>> Hi Nitin, thanks for v2. Not full review yet but sending some comments below.
>>>

Additional minor comment below, thanks.

>>> Maybe you can put some of the above into a new section below this
>>> http://docs.openvswitch.org/en/latest/topics/dpdk/pmd/#port-rx-queue-
>>> assigment-to-pmd-threads
>> Sure, I will update that too.
>>>
>>> I also think this feature should be experimental until it has been road tested a
>>> bit more.
>>>
>>>> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
>>>> Co-authored-by: Venkatesan Pradeep
>>> <venkatesan.pradeep@ericsson.com>
>>>> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
>>>> Signed-off-by: Venkatesan Pradeep <venkatesan.pradeep@ericsson.com>
>>>> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com>
>>>> ---
>>>>  lib/dpif-netdev.c    | 403
>>> +++++++++++++++++++++++++++++++++++++++++++++++++--
>>>>  vswitchd/vswitch.xml |  30 ++++
>>>>  2 files changed, 424 insertions(+), 9 deletions(-)
>>>>
>>>
>>> There seems to be windows style line endings in the patch? or something else
>>> has gone wrong for me.
>>>
>>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
>>>> 1564db9..8db106f 100644
>>>> --- a/lib/dpif-netdev.c
>>>> +++ b/lib/dpif-netdev.c
>>>> @@ -80,6 +80,12 @@
>>>>
>>>>  VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>>>>
>>>> +/* Auto Load Balancing Defaults */
>>>> +#define ACCEPT_IMPROVE_DEFAULT       (25)
>>>> +#define PMD_LOAD_THRE_DEFAULT        (95)
>>>
>>> Probably you should remove the brackets above to be consistent with the
>>> others below and in the rest of the file.
>>>
>>>> +#define PMD_REBALANCE_POLL_INTERVAL  1 /* 1 Min */
>>>> +#define MIN_TO_MSEC                  60000
>>>> +
>>>>  #define FLOW_DUMP_MAX_BATCH 50
>>>>  /* Use per thread recirc_depth to prevent recirculation loop. */
>>>> #define MAX_RECIRC_DEPTH 6 @@ -288,6 +294,13 @@ struct dp_meter {
>>>>      struct dp_meter_band bands[];
>>>>  };
>>>>
>>>> +struct pmd_auto_lb {
>>>> +    bool auto_lb_conf;        /* enable-disable auto load balancing */
>>>
>>> I'm not sure what '_conf' is short for? maybe it should be something like
>>> 'auto_lb_requested'
>> Sure
>>>
>>>> +    bool is_enabled;          /* auto_lb current status */
>>>
>>> Comments should be of style /* Sentence case. */
>>> http://docs.openvswitch.org/en/latest/internals/contributing/coding-
>>> style/#comments
>>>
>> Thanks for providing the link. I will update it in the next version.
>>>
>>>> +    uint64_t rebalance_intvl;
>>>> +    uint64_t rebalance_poll_timer;
>>>> +};
>>>> +
>>>>  /* Datapath based on the network device interface from netdev.h.
>>>>   *
>>>>   *
>>>> @@ -368,6 +381,7 @@ struct dp_netdev {
>>>>      uint64_t last_tnl_conf_seq;
>>>>
>>>>      struct conntrack conntrack;
>>>> +    struct pmd_auto_lb pmd_alb;
>>>>  };
>>>>
>>>>  static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
>>>> @@ -682,6 +696,7 @@ struct dp_netdev_pmd_thread {
>>>>      struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>>>>      /* List of rx queues to poll. */
>>>>      struct hmap poll_list OVS_GUARDED;
>>>> +
>>>
>>> Unrelated newline should be removed
>>>
>>>>      /* Map of 'tx_port's used for transmission.  Written by the main thread,
>>>>       * read by the pmd thread. */
>>>>      struct hmap tx_ports OVS_GUARDED; @@ -702,6 +717,11 @@ struct
>>>> dp_netdev_pmd_thread {
>>>>      /* Keep track of detailed PMD performance statistics. */
>>>>      struct pmd_perf_stats perf_stats;
>>>>
>>>> +    /* Some stats from previous iteration used by automatic pmd
>>>> +       load balance logic. */
>>>
>>> Nit, but see coding stds. and other multi-line comments wrt style
>>>
>>>> +    uint64_t prev_stats[PMD_N_STATS];
>>>> +    atomic_count pmd_overloaded;
>>>> +
>>>>      /* Set to true if the pmd thread needs to be reloaded. */
>>>>      bool need_reload;
>>>>  };
>>>> @@ -792,9 +812,11 @@ dp_netdev_rxq_get_cycles(struct dp_netdev_rxq
>>> *rx,
>>>>                           enum rxq_cycles_counter_type type);  static
>>>> void  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
>>>> -                           unsigned long long cycles);
>>>> +                                unsigned long long cycles,
>>>> +                                unsigned idx);
>>>>  static uint64_t
>>>> -dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned
>>>> idx);
>>>> +dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx,
>>>> +                                unsigned idx);
>>>
>>> no need to change dp_netdev_rxq_get_intrvl_cycles()
>>>
>>>>  static void
>>>>  dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread
>>> *pmd,
>>>>                                 bool purge); @@ -3734,6 +3756,51 @@
>>>> dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
>>>>      }
>>>>  }
>>>>
>>>> +/* Enable/Disable PMD auto load balancing */ static void
>>>> +set_pmd_auto_lb(struct dp_netdev *dp) {
>>>> +    unsigned int cnt = 0;
>>>> +    struct dp_netdev_pmd_thread *pmd;
>>>> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
>>>> +
>>>> +    bool enable = false;
>>>> +    bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
>>>> +
>>>> +    /* Ensure that there is at least 2 non-isolated PMDs and
>>>> +     * one of the PMD is polling more than one rxq
>>>> +     */
>>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>>> +        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        cnt++;
>>>> +        if (hmap_count(&pmd->poll_list) > 1) {
>>>> +            if (enable && (cnt > 1)) {
>>>> +                break;
>>>> +            } else {
>>>> +                enable = true;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +
>>>
>>> Won't this give the wrong result if there is one pmd with multiple rxq's? How
>>> about something in the loop like,
>> Yes, you are right. Thanks for catching this.
>>>
>>>         if (hmap_count(&pmd->poll_list) > 1) {
>>>             multirxq = true;
>>>         }
>>>         if (cnt && multirxq) {
>>>             enable = true;
>>>             break;
>>>         }
>>>         cnt++;
>>>
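To spell it out, the whole check could look something like this (rough,
untested sketch, reusing the existing 'cnt'/'enable' names from the patch):

    bool multirxq = false;

    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
            continue;
        }
        if (hmap_count(&pmd->poll_list) > 1) {
            multirxq = true;
        }
        if (cnt && multirxq) {
            /* At least two non-isolated PMDs and at least one of them
             * polls more than one rxq - enough to consider enabling
             * the auto load balancer. */
            enable = true;
            break;
        }
        cnt++;
    }
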
>>>> +    /* Enable auto LB if it is configured and cycle based assignment is true */
>>>> +    enable = enable && pmd_rxq_assign_cyc && pmd_alb->auto_lb_conf;
>>>> +
>>>> +    if (pmd_alb->is_enabled != enable) {
>>>> +        pmd_alb->is_enabled = enable;
>>>> +        if (pmd_alb->is_enabled) {
>>>> +            VLOG_INFO("PMD auto lb is enabled, rebalance intvl:%lu(msec)\n",
>>>> +                       pmd_alb->rebalance_intvl);
>>>> +        } else {
>>>> +            pmd_alb->rebalance_poll_timer = 0;
>>>> +            VLOG_INFO("PMD auto lb is disabled\n");
>>>> +        }
>>>> +    }
>>>> +
>>>> +}
>>>> +
>>>>  /* Applies datapath configuration from the database. Some of the changes
>>> are
>>>>   * actually applied in dpif_netdev_run(). */  static int @@ -3748,6
>>>> +3815,7 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap
>>> *other_config)
>>>>                          DEFAULT_EM_FLOW_INSERT_INV_PROB);
>>>>      uint32_t insert_min, cur_min;
>>>>      uint32_t tx_flush_interval, cur_tx_flush_interval;
>>>> +    uint64_t rebalance_intvl;
>>>>
>>>>      tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
>>>>                                       DEFAULT_TX_FLUSH_INTERVAL); @@
>>>> -3819,6 +3887,23 @@ dpif_netdev_set_config(struct dpif *dpif, const
>>> struct smap *other_config)
>>>>                    pmd_rxq_assign);
>>>>          dp_netdev_request_reconfigure(dp);
>>>>      }
>>>> +
>>>> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
>>>> +    pmd_alb->auto_lb_conf = smap_get_bool(other_config, "pmd-auto-lb",
>>>> +                              false);
>>>> +
>>>> +    rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebalance-
>>> intvl",
>>>> +                              PMD_REBALANCE_POLL_INTERVAL);
>>>> +
>>>> +    /* Input is in min, convert it to msec */
>>>> +    rebalance_intvl =
>>>> +        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC :
>>>> + MIN_TO_MSEC;
>>>> +
>>>
>>> This is creating a default when the user sets 0 - that needs to be documented.
>>>
>>> With current values, this could overflow rebalance_intvl. The user value is in
>>> minutes, so suggest to limit user input to some reasonable value like 1 week in
>>> minutes, and then the min to msec can be safe. See tx-flush-interval as an
>>> example where the range is limited.
>> Thanks, I will update the documentation.
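For illustration, something along these lines (untested sketch; the
one-week cap is just an example value) would keep the minutes-to-msec
conversion safe:

    rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebalance-intvl",
                                   PMD_REBALANCE_POLL_INTERVAL);

    /* Input is in minutes; clamp to [1, 10080] (one week) before the
     * conversion so that the multiplication cannot overflow. */
    if (rebalance_intvl < 1 || rebalance_intvl > 10080) {
        rebalance_intvl = PMD_REBALANCE_POLL_INTERVAL;
    }
    rebalance_intvl *= MIN_TO_MSEC;
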
>>>
>>>> +    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
>>>> +        pmd_alb->rebalance_intvl = rebalance_intvl;
>>>> +    }
>>>> +
>>>> +    set_pmd_auto_lb(dp);
>>>>      return 0;
>>>>  }
>>>>
>>>> @@ -3974,9 +4059,9 @@ dp_netdev_rxq_get_cycles(struct
>>> dp_netdev_rxq
>>>> *rx,
>>>>
>>>>  static void
>>>>  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
>>>> -                                unsigned long long cycles)
>>>> +                                unsigned long long cycles,
>>>> +                                unsigned idx)
>>>>  {
>>>> -    unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
>>>>      atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);  }
>>>>
>>>> @@ -4762,6 +4847,9 @@ reconfigure_datapath(struct dp_netdev *dp)
>>>>
>>>>      /* Reload affected pmd threads. */
>>>>      reload_affected_pmds(dp);
>>>> +
>>>> +    /* Check if PMD Auto LB is to be enabled */
>>>> +    set_pmd_auto_lb(dp);
>>>>  }
>>>>
>>>>  /* Returns true if one of the netdevs in 'dp' requires a
>>>> reconfiguration */ @@ -4780,6 +4868,228 @@
>>> ports_require_restart(const struct dp_netdev *dp)
>>>>      return false;
>>>>  }
>>>>
>>>> +/* Function for calculating variance */ static uint64_t
>>>> +variance(uint64_t a[], int n) {
>>>> +    /* Compute mean (average of elements) */
>>>> +    uint64_t sum = 0;
>>>> +    uint64_t mean = 0;
>>>> +    uint64_t sqDiff = 0;
>>>> +
>>>> +    if (!n) {
>>>> +        return 0;
>>>> +    }
>>>> +
>>>> +    for (int i = 0; i < n; i++) {
>>>> +        sum += a[i];
>>>> +    }
>>>> +
>>>> +    if (sum) {
>>>> +        mean = sum / n;
>>>> +
>>>> +        /* Compute sum squared differences with mean. */
>>>> +        for (int i = 0; i < n; i++) {
>>>> +            sqDiff += (a[i] - mean)*(a[i] - mean);
>>>> +        }
>>>> +    }
>>>> +    return (sqDiff ? (sqDiff / n) : 0); }
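As a quick sanity check on the arithmetic here: for two non-isolated PMDs
at 90% and 30% usage, mean = (90 + 30) / 2 = 60 and
variance = ((90 - 60)^2 + (30 - 60)^2) / 2 = 900.
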
>>>> +
>>>> +static uint64_t
>>>> +get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
>>>> +uint32_t num)
>>>
>>> I think this function would require the port_mutex.
>> I will check and add in next version.
>>>
>>>> +{
>>>> +    struct dp_netdev_port *port;
>>>> +    struct dp_netdev_pmd_thread *pmd;
>>>> +    struct dp_netdev_rxq ** rxqs = NULL;
>>>> +    struct rr_numa *numa = NULL;
>>>> +    struct rr_numa_list rr;
>>>> +    int n_rxqs = 0;
>>>> +    uint64_t ret = 0;
>>>> +    uint64_t *pmd_usage;
>>>> +
>>>> +    pmd_usage = xcalloc(num, sizeof(uint64_t));
>>>> +
>>>> +    HMAP_FOR_EACH (port, node, &dp->ports) {
>>>> +        if (!netdev_is_pmd(port->netdev)) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        for (int qid = 0; qid < port->n_rxq; qid++) {
>>>> +            struct dp_netdev_rxq *q = &port->rxqs[qid];
>>>> +            uint64_t cycle_hist = 0;
>>>> +
>>>> +            if (q->pmd->isolated) {
>>>> +                continue;
>>>> +            }
>>>> +
>>>> +            if (n_rxqs == 0) {
>>>> +                rxqs = xmalloc(sizeof *rxqs);
>>>> +            } else {
>>>> +                rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
>>>> +            }
>>>> +
>>>> +            /* Sum the queue intervals and store the cycle history. */
>>>> +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
>>>> +                cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
>>>> +            }
>>>> +            /* Do we need to add intrvl_cycles in history??
>>>
>>> If you want to use compare_rxq_cycles() then you have to put them in history,
>>> but it's only used for that and re-written everytime, so I don't think it is
>>> harmful.
>> Yeah, that is the objective of adding it. 
>>>
>>>> +             * but then we should clear interval cycles also */
>>>
>>> I don't think you should be clearing interval cycles in a dry run, otherwise they
>>> will be reset if the real rebalance occurs.
>> Thanks for clarifying it. I will remove the comment.
>>>
>>>> +            dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
>>>> +                                         cycle_hist);
>>>> +            /* Store the queue. */
>>>> +            rxqs[n_rxqs++] = q;
>>>> +        }
>>>> +    }
>>>> +    if (n_rxqs > 1) {
>>>> +        /* Sort the queues in order of the processing cycles
>>>> +         * they consumed during their last pmd interval. */
>>>> +        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
>>>> +    }
>>>> +    rr_numa_list_populate(dp, &rr);
>>>> +
>>>> +    for (int i = 0; i < n_rxqs; i++) {
>>>> +        int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
>>>> +        numa = rr_numa_list_lookup(&rr, numa_id);
>>>> +        if (!numa) {
>>>> +            /* Don't consider queues across NUMA  ???*/
>>>
>>> I think you should abort the whole dry run process if this is happening
>> Okay.
>>>
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        pmd = rr_numa_get_pmd(numa, true);
>>>> +        VLOG_DBG("PMD AUTO_LB:Core %d on numa node %d assigned port
>>> \'%s\' "
>>>> +                  "rx queue %d "
>>>> +                  "(measured processing cycles %"PRIu64").",
>>>> +                  pmd->core_id, numa_id,
>>>> +                  netdev_rxq_get_name(rxqs[i]->rx),
>>>> +                  netdev_rxq_get_queue_id(rxqs[i]->rx),
>>>> +                  dp_netdev_rxq_get_cycles(rxqs[i],
>>>> + RXQ_CYCLES_PROC_HIST));
>>>> +
>>>> +        for (int id = 0; id < num; id++) {
>>>> +            if (pmd->core_id == core_list[id]) {
>>>> +                /* Add the processing cycles of rxq to pmd polling it */
>>>> +                uint64_t proc_cycles = 0;

>>>> +                for (unsigned idx = 0; idx < PMD_RXQ_INTERVAL_MAX; idx++) {
>>>> +                    proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxqs[i],
>>>> +                                                                   idx);
>>>> +                }

This is ok, but if you can be sure the summation was already done and the
result stored in RXQ_CYCLES_PROC_HIST earlier in the function, you could
just use dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST).
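i.e. the per-core accumulation below could then be reduced to something
like (sketch):

    for (int id = 0; id < num; id++) {
        if (pmd->core_id == core_list[id]) {
            /* PROC_HIST was filled in above when summing the intervals. */
            pmd_usage[id] += dp_netdev_rxq_get_cycles(rxqs[i],
                                                      RXQ_CYCLES_PROC_HIST);
        }
    }
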

>>>> +                pmd_usage[id] += proc_cycles;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +
>>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>>> +        uint64_t total_cycles = 0;
>>>> +
>>>> +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        /* Get the total pmd cycles for an interval. */
>>>> +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
>>>> +        /* Estimate the cycles to cover all intervals. */
>>>> +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
>>>> +        for (int id = 0; id < num; id++) {
>>>> +            if (pmd->core_id == core_list[id]) {
>>>> +                if (pmd_usage[id]) {
>>>> +                    pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
>>>> +                }
>>>> +                VLOG_DBG("Core_id:%d, usage:%"PRIu64"\n",
>>>> +                          pmd->core_id, pmd_usage[id]);
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +    ret = variance(pmd_usage, num);
>>>> +
>>>> +    rr_numa_list_destroy(&rr);
>>>> +    free(rxqs);
>>>> +    free(pmd_usage);
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static bool
>>>> +pmd_rebalance_dry_run(struct dp_netdev *dp) {
>>>> +    struct dp_netdev_pmd_thread *pmd;
>>>> +    uint64_t *curr_pmd_usage;
>>>> +
>>>> +    uint64_t curr_variance;
>>>> +    uint64_t new_variance;
>>>> +    uint64_t improvement = 0;
>>>> +    uint32_t num_pmds;
>>>> +    uint32_t *pmd_corelist;
>>>> +    struct rxq_poll *poll, *poll_next;
>>>> +
>>>> +    num_pmds = cmap_count(&dp->poll_threads);
>>>> +
>>>> +    if (num_pmds > 1) {
>>>> +        curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
>>>> +        pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
>>>> +    } else {
>>>> +        return false;
>>>> +    }
>>>> +
>>>> +    num_pmds = 0;
>>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>>> +        uint64_t total_cycles = 0;
>>>> +        uint64_t total_proc = 0;
>>>> +
>>>> +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
>>>> +            continue;
>>>> +        }
>>>> +
>>>> +        /* Get the total pmd cycles for an interval. */
>>>> +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
>>>> +        /* Estimate the cycles to cover all intervals. */
>>>> +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
>>>> +
>>>> +        HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
>>>> +            uint64_t proc_cycles = 0;
>>>> +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
>>>> +                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
>>>> +            }
>>>> +            total_proc += proc_cycles;
>>>> +        }
>>>> +        if (total_proc) {
>>>> +            curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
>>>> +        }
>>>> +
>>>> +        VLOG_DBG("PMD_AUTO_LB_MON curr_pmd_usage(%d):%"PRIu64"",
>>>> +                  pmd->core_id, curr_pmd_usage[num_pmds]);
>>>> +
>>>> +        if (atomic_count_get(&pmd->pmd_overloaded)) {
>>>> +            atomic_count_set(&pmd->pmd_overloaded, 0);
>>>> +        }
>>>> +
>>>> +        pmd_corelist[num_pmds] = pmd->core_id;
>>>> +        num_pmds++;
>>>> +    }
>>>> +
>>>> +    curr_variance = variance(curr_pmd_usage, num_pmds);
>>>> +
>>>> +    new_variance = get_dry_run_variance(dp, pmd_corelist, num_pmds);
>>>> +    VLOG_DBG("PMD_AUTO_LB_MON new variance: %"PRIu64","
>>>> +              " curr_variance: %"PRIu64"",
>>>> +              new_variance, curr_variance);
>>>> +
>>>> +    if (new_variance && (new_variance < curr_variance)) {
>>>> +        improvement =
>>>> +            ((curr_variance - new_variance) * 100) / curr_variance;
>>>> +
>>>> +        VLOG_DBG("PMD_AUTO_LB_MON improvement %"PRIu64"",
>>> improvement);
>>>> +    }
>>>> +
>>>> +    free(curr_pmd_usage);
>>>> +    free(pmd_corelist);
>>>> +
>>>> +    if (improvement >= ACCEPT_IMPROVE_DEFAULT) {
>>>> +        return true;
>>>> +    }
>>>> +
>>>> +    return false;
>>>> +}
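Putting numbers on the threshold, and reusing the variance example from
above: if the current assignment gives a variance of 900 and the dry run
gives 400, improvement = (900 - 400) * 100 / 900 = 55, which clears
ACCEPT_IMPROVE_DEFAULT (25) and triggers the reassignment; a dry-run
variance of 700 only gives (900 - 700) * 100 / 900 = 22 and is skipped.
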
>>>> +
>>>> +
>>>>  /* Return true if needs to revalidate datapath flows. */  static bool
>>>> dpif_netdev_run(struct dpif *dpif) @@ -4789,6 +5099,9 @@
>>>> dpif_netdev_run(struct dpif *dpif)
>>>>      struct dp_netdev_pmd_thread *non_pmd;
>>>>      uint64_t new_tnl_seq;
>>>>      bool need_to_flush = true;
>>>> +    bool pmd_rebalance = false;
>>>> +    long long int now = time_msec();
>>>> +    struct dp_netdev_pmd_thread *pmd;
>>>>
>>>>      ovs_mutex_lock(&dp->port_mutex);
>>>>      non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID); @@ -4821,6
>>>> +5134,38 @@ dpif_netdev_run(struct dpif *dpif)
>>>>          dp_netdev_pmd_unref(non_pmd);
>>>>      }
>>>>
>>>> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
>>>> +    if (pmd_alb->is_enabled) {
>>>> +        if (!pmd_alb->rebalance_poll_timer) {
>>>> +            pmd_alb->rebalance_poll_timer = now;
>>>> +        } else if ((pmd_alb->rebalance_poll_timer +
>>>> +                   pmd_alb->rebalance_intvl) < now) {
>>>> +            pmd_alb->rebalance_poll_timer = now;
>>>> +            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>>> +                if (atomic_count_get(&pmd->pmd_overloaded) >=
>>>> +                                    PMD_RXQ_INTERVAL_MAX) {
>>>> +                    pmd_rebalance = true;
>>>> +                    break;
>>>> +                }
>>>> +            }
>>>> +            VLOG_DBG("PMD_AUTO_LB_MON periodic check:pmd
>>> rebalance:%d",
>>>> +                      pmd_rebalance);
>>>> +
>>>> +            if (pmd_rebalance &&
>>>> +                !dp_netdev_is_reconf_required(dp) &&
>>>> +                !ports_require_restart(dp) &&
>>>> +                pmd_rebalance_dry_run(dp)) {
>>>
>>> Don't you need the dp_netdev_mutex for call to pmd_rebalance_dry_run, or
>>> at least for some parts of it?
>> I will check it.
> 
> Actually, I think it's not needed
> 
>>>
>>>> +
>>>> +                ovs_mutex_unlock(&dp->port_mutex);
>>>
>>> It seems odd to be unlocking this and then taking it again, is there a reason?
>> Need to check it again if we can have both locks at the same time.
>>>
>>>> +                ovs_mutex_lock(&dp_netdev_mutex);
> 
> I don't think you need this lock or to unlock the port_mutex
> 
>>>> +                VLOG_DBG("PMD_AUTO_LB_MON Invoking PMD RECONFIGURE");
>>>> +                dp_netdev_request_reconfigure(dp);
>>>> +                ovs_mutex_unlock(&dp_netdev_mutex);
>>>> +                ovs_mutex_lock(&dp->port_mutex);
>>>> +            }
>>>> +        }
>>>> +    }
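(For example, with the default pmd-auto-lb-rebalance-intvl of 1,
rebalance_intvl is 60000 msec, so even though dpif_netdev_run() is called
continuously, the overload check and possible dry run happen at most once
per minute.)
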
>>>> +
>>>>      if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
>>>>          reconfigure_datapath(dp);
>>>>      }
>>>> @@ -4979,6 +5324,8 @@ pmd_thread_main(void *f_)
>>>>  reload:
>>>>      pmd_alloc_static_tx_qid(pmd);
>>>>
>>>> +    atomic_count_init(&pmd->pmd_overloaded, 0);
>>>> +
>>>>      /* List port/core affinity */
>>>>      for (i = 0; i < poll_cnt; i++) {
>>>>         VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
>>>> @@ -4986,6 +5333,10 @@ reload:
>>>>                  netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
>>>>         /* Reset the rxq current cycles counter. */
>>>>         dp_netdev_rxq_set_cycles(poll_list[i].rxq,
>>>> RXQ_CYCLES_PROC_CURR, 0);
>>>> +
>>>> +       for (unsigned j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
>>>> +            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0, j);
>>>> +       }
>>>
>>> is it needed for this patch? won't all the values have been refreshed by the
>>> time the next check is performed anyway
>> I thought it is safe to reset it. If PMD is reset in middle of cycle then it may have stale information when dry_run is executed.
> 
> It shouldn't be reset as it can clear info for some rxqs before rxq-pmd
> assignment when ports are reconfigured. You can see this in the rxq-pmd
> assignment logs, when adding/removing rxqs.
> 
>>>
>>>>      }
>>>>
>>>>      if (!poll_cnt) {
>>>> @@ -7188,17 +7539,51 @@ dp_netdev_pmd_try_optimize(struct
>>> dp_netdev_pmd_thread *pmd,
>>>>                             struct polled_queue *poll_list, int
>>>> poll_cnt)  {
>>>>      struct dpcls *cls;
>>>> +    uint64_t tot_idle = 0, tot_proc = 0;
>>>> +    unsigned int idx;
>>>> +    unsigned int pmd_load = 0;
>>>>
>>>>      if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
>>>>          uint64_t curr_tsc;
>>>> +        struct pmd_auto_lb * pmd_alb = &pmd->dp->pmd_alb;
>>>> +        if (pmd_alb->is_enabled && !pmd->isolated) {
>>>> +            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
>>>> +                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
>>>> +            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
>>>> +                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
>>>> +
>>>> +            if (tot_proc) {
>>>> +                pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
>>>> +            }
>>>> +
>>>> +            if (pmd_load >= PMD_LOAD_THRE_DEFAULT) {
>>>> +                atomic_count_inc(&pmd->pmd_overloaded);
>>>> +
>>>> +                VLOG_DBG("PMD_AUTO_LB_MON PMD OVERLOAD DETECT iter
>>> %d",
>>>> +                          atomic_count_get(&pmd->pmd_overloaded));
>>>
>>> Better to remove this log
>> Sure
>>>
>>>> +            } else {
>>>> +                atomic_count_set(&pmd->pmd_overloaded, 0);
>>>> +            }
>>>> +        }
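To make the threshold concrete: if in a measurement interval tot_proc is
9.6e9 cycles and tot_idle is 0.4e9 cycles, pmd_load = 9.6e9 * 100 / 10e9
= 96, which is >= PMD_LOAD_THRE_DEFAULT (95), so pmd_overloaded is
incremented; a single interval below the threshold resets the counter to
zero, and dpif_netdev_run() only treats the PMD as overloaded once the
counter has reached PMD_RXQ_INTERVAL_MAX.
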
>>>> +
>>>> +        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
>>>> +                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
>>>> +        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
>>>> +
>>>> + pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
>>>> +
>>>
>>> These have been used earlier - are they initialized somewhere for the first
>>> use? you could just skip the above if() on the first call, so they get initialized,
>>> or else init them at the start of pmd_thread_main()
>> It would have been initialized to 0 when the pmd structure is created. We are not resetting it to 0 whenever the PMD is reloaded.
> 
> but isn't the counters.n structure reset everytime the pmd reloads? so
> it would mean that this should be reset also. Maybe I read the pmd_stats
> code wrong.
> 
>>>
>>>>          /* Get the cycles that were used to process each queue and store. */
>>>>          for (unsigned i = 0; i < poll_cnt; i++) {
>>>> -            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
>>>> -                                                        RXQ_CYCLES_PROC_CURR);
>>>> -            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
>>>> -            dp_netdev_rxq_set_cycles(poll_list[i].rxq,
>>> RXQ_CYCLES_PROC_CURR,
>>>> -                                     0);
>>>> +            uint64_t rxq_cyc_curr;
>>>> +            struct dp_netdev_rxq *rxq;
>>>> +
>>>> +            rxq = poll_list[i].rxq;
>>>> +            idx = rxq->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
>>>> +
>>>> +            rxq_cyc_curr = dp_netdev_rxq_get_cycles(rxq,
>>> RXQ_CYCLES_PROC_CURR);
>>>> +            dp_netdev_rxq_set_intrvl_cycles(rxq, rxq_cyc_curr, idx);
>>>> +            dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_CURR, 0);
>>>>          }
>>>> +
>>>>          curr_tsc = cycles_counter_update(&pmd->perf_stats);
>>>>          if (pmd->intrvl_tsc_prev) {
>>>>              /* There is a prev timestamp, store a new intrvl cycle
>>>> count. */ diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
>>>> index 2160910..ff3589c 100644
>>>> --- a/vswitchd/vswitch.xml
>>>> +++ b/vswitchd/vswitch.xml
>>>> @@ -574,6 +574,36 @@
>>>>              be set to 'skip_sw'.
>>>>          </p>
>>>>        </column>
>>>> +      <column name="other_config" key="pmd-auto-lb"
>>>> +              type='{"type": "boolean"}'>
>>>> +        <p>
>>>> +         Configures PMD Auto Load Balancing that allows automatic
>>> assignment of
>>>> +         RX queues to PMDs if any of PMDs is overloaded (i.e. processing
>>> cycles
>>>> +         > 95%).
>>>> +        </p>
>>>> +        <p>
>>>> +         It uses current scheme of cycle based assignment of RX queues that
>>>> +         are not statically pinned to PMDs.
>>>> +        </p>
>>>> +        <p>
>>>> +          Set this value to <code>true</code> to enable this option.
>>>> +        </p>
>>>> +        <p>
>>>> +          The default value is <code>false</code>.
>>>> +        </p>
>>>> +        <p>
>>>> +         This only comes in effect if cycle based assignment is enabled and
>>>> +         there are more than one non-isolated PMDs present and atleast one
>>> of
>>>> +         it polls more than one queue.
>>>> +        </p>
>>>> +      </column>
>>>> +      <column name="other_config" key="pmd-auto-lb-rebalance-intvl"
>>>> +              type='{"type": "integer", "minInteger": 1}'>
>>>> +        <p>
>>>> +         The minimum time (in minutes) 2 consecutive PMD Auto Load
>>> Balancing
>>>> +         iterations.
>>>> +        </p>
>>>> +      </column>
>>>>      </group>
>>>>      <group title="Status">
>>>>        <column name="next_cfg">
>>>>
>>
>
Nitin Katiyar Jan. 7, 2019, 4:46 a.m. UTC | #6
> -----Original Message-----
> From: Kevin Traynor [mailto:ktraynor@redhat.com]
> Sent: Friday, January 04, 2019 10:26 PM
> To: Nitin Katiyar <nitin.katiyar@ericsson.com>; ovs-dev@openvswitch.org
> Cc: Rohith Basavaraja <rohith.basavaraja@gmail.com>; Venkatesan Pradeep
> <venkatesan.pradeep@ericsson.com>
> Subject: Re: [PATCH v2] Adding support for PMD auto load balancing
> 
> On 01/04/2019 02:56 PM, Kevin Traynor wrote:
> > On 01/04/2019 12:39 PM, Nitin Katiyar wrote:
> >>
> >>
> >>> -----Original Message-----
> >>> From: Kevin Traynor [mailto:ktraynor@redhat.com]
> >>> Sent: Friday, January 04, 2019 1:48 AM
> >>> To: Nitin Katiyar <nitin.katiyar@ericsson.com>;
> >>> ovs-dev@openvswitch.org
> >>> Cc: Rohith Basavaraja <rohith.basavaraja@gmail.com>; Venkatesan
> >>> Pradeep <venkatesan.pradeep@ericsson.com>
> >>> Subject: Re: [PATCH v2] Adding support for PMD auto load balancing
> >>>
> >>> On 01/03/2019 12:36 PM, Nitin Katiyar wrote:
> >>>> Port rx queues that have not been statically assigned to PMDs are
> >>>> currently assigned based on periodically sampled load measurements.
> >>>> The assignment is performed at specific instances – port addition,
> >>>> port deletion, upon reassignment request via CLI etc.
> >>>>
> >>>> Due to change in traffic pattern over time it can cause uneven load
> >>>> among the PMDs and thus resulting in lower overall throughout.
> >>>>
> >>>> This patch enables the support of auto load balancing of PMDs based
> >>>> on measured load of RX queues. Each PMD measures the processing
> >>>> load for each of its associated queues every 10 seconds. If the
> >>>> aggregated PMD load reaches 95% for 6 consecutive intervals then
> >>>> PMD considers itself to
> >>> be overloaded.
> >>>>
> >>>> If any PMD is overloaded, a dry-run of the PMD assignment algorithm
> >>>> is performed by OVS main thread. The dry-run does NOT change the
> >>>> existing queue to PMD assignments.
> >>>>
> >>>> If the resultant mapping of dry-run indicates an improved
> >>>> distribution of the load then the actual reassignment will be performed.
> >>>>
> >>>> The automatic rebalancing will be disabled by default and has to be
> >>>> enabled via configuration option. The interval (in minutes) between
> >>>> two consecutive rebalancing can also be configured via CLI, default
> >>>> is
> >>>> 1 min.
> >>>>
> >>>> Following example commands can be used to set the auto-lb params:
> >>>> ovs-vsctl set open_vswitch . other_config:pmd-auto-lb="true"
> >>>> ovs-vsctl set open_vswitch . other_config:pmd-auto-lb-rebalance-
> intvl="5"
> >>>>
> >>>
> >>> Hi Nitin, thanks for v2. Not full review yet but sending some comments
> below.
> >>>
> 
> Additional minor comment below, thanks.
Thanks again, Kevin.
> 
> >>> Maybe you can put some of the above into a new section below this
> >>> http://docs.openvswitch.org/en/latest/topics/dpdk/pmd/#port-rx-queue
> >>> -
> >>> assigment-to-pmd-threads
> >> Sure, I will update that too.
> >>>
> >>> I also think this feature should be experimental until it has been
> >>> road tested a bit more.
> >>>
> >>>> Co-authored-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
> >>>> Co-authored-by: Venkatesan Pradeep
> >>> <venkatesan.pradeep@ericsson.com>
> >>>> Signed-off-by: Rohith Basavaraja <rohith.basavaraja@gmail.com>
> >>>> Signed-off-by: Venkatesan Pradeep
> <venkatesan.pradeep@ericsson.com>
> >>>> Signed-off-by: Nitin Katiyar <nitin.katiyar@ericsson.com>
> >>>> ---
> >>>>  lib/dpif-netdev.c    | 403
> >>> +++++++++++++++++++++++++++++++++++++++++++++++++--
> >>>>  vswitchd/vswitch.xml |  30 ++++
> >>>>  2 files changed, 424 insertions(+), 9 deletions(-)
> >>>>
> >>>
> >>> There seems to be windows style line endings in the patch? or
> >>> something else has gone wrong for me.
> >>>
> >>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
> >>>> 1564db9..8db106f 100644
> >>>> --- a/lib/dpif-netdev.c
> >>>> +++ b/lib/dpif-netdev.c
> >>>> @@ -80,6 +80,12 @@
> >>>>
> >>>>  VLOG_DEFINE_THIS_MODULE(dpif_netdev);
> >>>>
> >>>> +/* Auto Load Balancing Defaults */
> >>>> +#define ACCEPT_IMPROVE_DEFAULT       (25)
> >>>> +#define PMD_LOAD_THRE_DEFAULT        (95)
> >>>
> >>> Probably you should remove the brackets above to be consistent with
> >>> the others below and in the rest of the file.
> >>>
> >>>> +#define PMD_REBALANCE_POLL_INTERVAL  1 /* 1 Min */
> >>>> +#define MIN_TO_MSEC                  60000
> >>>> +
> >>>>  #define FLOW_DUMP_MAX_BATCH 50
> >>>>  /* Use per thread recirc_depth to prevent recirculation loop. */
> >>>> #define MAX_RECIRC_DEPTH 6 @@ -288,6 +294,13 @@ struct
> dp_meter {
> >>>>      struct dp_meter_band bands[];
> >>>>  };
> >>>>
> >>>> +struct pmd_auto_lb {
> >>>> +    bool auto_lb_conf;        /* enable-disable auto load balancing */
> >>>
> >>> I'm not sure what '_conf' is short for? maybe it should be something
> >>> like 'auto_lb_requested'
> >> Sure
> >>>
> >>>> +    bool is_enabled;          /* auto_lb current status */
> >>>
> >>> Comments should be of style /* Sentence case. */
> >>> http://docs.openvswitch.org/en/latest/internals/contributing/coding-
> >>> style/#comments
> >>>
> >> Thanks for providing the link. I will update in next version
> >>>
> >>>> +    uint64_t rebalance_intvl;
> >>>> +    uint64_t rebalance_poll_timer; };
> >>>> +
> >>>>  /* Datapath based on the network device interface from netdev.h.
> >>>>   *
> >>>>   *
> >>>> @@ -368,6 +381,7 @@ struct dp_netdev {
> >>>>      uint64_t last_tnl_conf_seq;
> >>>>
> >>>>      struct conntrack conntrack;
> >>>> +    struct pmd_auto_lb pmd_alb;
> >>>>  };
> >>>>
> >>>>  static void meter_lock(const struct dp_netdev *dp, uint32_t
> >>>> meter_id) @@ -682,6 +696,7 @@ struct dp_netdev_pmd_thread {
> >>>>      struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and
> 'tx_ports'. */
> >>>>      /* List of rx queues to poll. */
> >>>>      struct hmap poll_list OVS_GUARDED;
> >>>> +
> >>>
> >>> Unrelated newline should be removed
> >>>
> >>>>      /* Map of 'tx_port's used for transmission.  Written by the main
> thread,
> >>>>       * read by the pmd thread. */
> >>>>      struct hmap tx_ports OVS_GUARDED; @@ -702,6 +717,11 @@ struct
> >>>> dp_netdev_pmd_thread {
> >>>>      /* Keep track of detailed PMD performance statistics. */
> >>>>      struct pmd_perf_stats perf_stats;
> >>>>
> >>>> +    /* Some stats from previous iteration used by automatic pmd
> >>>> +       load balance logic. */
> >>>
> >>> Nit, but see coding stds. and other multi-line comments wrt style
> >>>
> >>>> +    uint64_t prev_stats[PMD_N_STATS];
> >>>> +    atomic_count pmd_overloaded;
> >>>> +
> >>>>      /* Set to true if the pmd thread needs to be reloaded. */
> >>>>      bool need_reload;
> >>>>  };
> >>>> @@ -792,9 +812,11 @@ dp_netdev_rxq_get_cycles(struct
> dp_netdev_rxq
> >>> *rx,
> >>>>                           enum rxq_cycles_counter_type type);
> >>>> static void  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
> >>>> -                           unsigned long long cycles);
> >>>> +                                unsigned long long cycles,
> >>>> +                                unsigned idx);
> >>>>  static uint64_t
> >>>> -dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned
> >>>> idx);
> >>>> +dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx,
> >>>> +                                unsigned idx);
> >>>
> >>> no need to change dp_netdev_rxq_get_intrvl_cycles()
> >>>
> >>>>  static void
> >>>>  dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread
> >>> *pmd,
> >>>>                                 bool purge); @@ -3734,6 +3756,51 @@
> >>>> dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t
> n_ops,
> >>>>      }
> >>>>  }
> >>>>
> >>>> +/* Enable/Disable PMD auto load balancing */ static void
> >>>> +set_pmd_auto_lb(struct dp_netdev *dp) {
> >>>> +    unsigned int cnt = 0;
> >>>> +    struct dp_netdev_pmd_thread *pmd;
> >>>> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> >>>> +
> >>>> +    bool enable = false;
> >>>> +    bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
> >>>> +
> >>>> +    /* Ensure that there is at least 2 non-isolated PMDs and
> >>>> +     * one of the PMD is polling more than one rxq
> >>>> +     */
> >>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> >>>> +        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
> >>>> +            continue;
> >>>> +        }
> >>>> +
> >>>> +        cnt++;
> >>>> +        if (hmap_count(&pmd->poll_list) > 1) {
> >>>> +            if (enable && (cnt > 1)) {
> >>>> +                break;
> >>>> +            } else {
> >>>> +                enable = true;
> >>>> +            }
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>
> >>> Won't this give the wrong result if there is one pmd with multiple
> >>> rxq's? How about something in the loop like,
> >> Yes, you are right. Thanks for catching this.
> >>>
> >>>         if (hmap_count(&pmd->poll_list) > 1) {
> >>>             multirxq = true;
> >>>         }
> >>>         if (cnt && multirxq) {
> >>>             enable = true;
> >>>             break;
> >>>         }
> >>>         cnt++;
> >>>
> >>>> +    /* Enable auto LB if it is configured and cycle based assignment is
> true */
> >>>> +    enable = enable && pmd_rxq_assign_cyc &&
> >>>> + pmd_alb->auto_lb_conf;
> >>>> +
> >>>> +    if (pmd_alb->is_enabled != enable) {
> >>>> +        pmd_alb->is_enabled = enable;
> >>>> +        if (pmd_alb->is_enabled) {
> >>>> +            VLOG_INFO("PMD auto lb is enabled, rebalance
> intvl:%lu(msec)\n",
> >>>> +                       pmd_alb->rebalance_intvl);
> >>>> +        } else {
> >>>> +            pmd_alb->rebalance_poll_timer = 0;
> >>>> +            VLOG_INFO("PMD auto lb is disabled\n");
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>> +}
> >>>> +
> >>>>  /* Applies datapath configuration from the database. Some of the
> >>>> changes
> >>> are
> >>>>   * actually applied in dpif_netdev_run(). */  static int @@
> >>>> -3748,6
> >>>> +3815,7 @@ dpif_netdev_set_config(struct dpif *dpif, const struct
> >>>> +smap
> >>> *other_config)
> >>>>                          DEFAULT_EM_FLOW_INSERT_INV_PROB);
> >>>>      uint32_t insert_min, cur_min;
> >>>>      uint32_t tx_flush_interval, cur_tx_flush_interval;
> >>>> +    uint64_t rebalance_intvl;
> >>>>
> >>>>      tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
> >>>>                                       DEFAULT_TX_FLUSH_INTERVAL);
> >>>> @@
> >>>> -3819,6 +3887,23 @@ dpif_netdev_set_config(struct dpif *dpif, const
> >>> struct smap *other_config)
> >>>>                    pmd_rxq_assign);
> >>>>          dp_netdev_request_reconfigure(dp);
> >>>>      }
> >>>> +
> >>>> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> >>>> +    pmd_alb->auto_lb_conf = smap_get_bool(other_config, "pmd-auto-
> lb",
> >>>> +                              false);
> >>>> +
> >>>> +    rebalance_intvl = smap_get_int(other_config,
> >>>> + "pmd-auto-lb-rebalance-
> >>> intvl",
> >>>> +                              PMD_REBALANCE_POLL_INTERVAL);
> >>>> +
> >>>> +    /* Input is in min, convert it to msec */
> >>>> +    rebalance_intvl =
> >>>> +        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC :
> >>>> + MIN_TO_MSEC;
> >>>> +
> >>>
> >>> This is creating a default when the user sets 0 - that needs to be
> documented.
> >>>
> >>> With current values, this could overflow rebalance_intvl. The user
> >>> value is in minutes, so suggest to limit user input to some
> >>> reasonable value like 1 week in minutes, and then the min to msec
> >>> can be safe. See tx-flush-interval as an example where the range is
> limited.
> >> Thanks, I will update the documentation.
> >>>
> >>>> +    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
> >>>> +        pmd_alb->rebalance_intvl = rebalance_intvl;
> >>>> +    }
> >>>> +
> >>>> +    set_pmd_auto_lb(dp);
> >>>>      return 0;
> >>>>  }
> >>>>
> >>>> @@ -3974,9 +4059,9 @@ dp_netdev_rxq_get_cycles(struct
> >>> dp_netdev_rxq
> >>>> *rx,
> >>>>
> >>>>  static void
> >>>>  dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
> >>>> -                                unsigned long long cycles)
> >>>> +                                unsigned long long cycles,
> >>>> +                                unsigned idx)
> >>>>  {
> >>>> -    unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
> >>>>      atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);  }
> >>>>
> >>>> @@ -4762,6 +4847,9 @@ reconfigure_datapath(struct dp_netdev *dp)
> >>>>
> >>>>      /* Reload affected pmd threads. */
> >>>>      reload_affected_pmds(dp);
> >>>> +
> >>>> +    /* Check if PMD Auto LB is to be enabled */
> >>>> +    set_pmd_auto_lb(dp);
> >>>>  }
> >>>>
> >>>>  /* Returns true if one of the netdevs in 'dp' requires a
> >>>> reconfiguration */ @@ -4780,6 +4868,228 @@
> >>> ports_require_restart(const struct dp_netdev *dp)
> >>>>      return false;
> >>>>  }
> >>>>
> >>>> +/* Function for calculating variance */ static uint64_t
> >>>> +variance(uint64_t a[], int n) {
> >>>> +    /* Compute mean (average of elements) */
> >>>> +    uint64_t sum = 0;
> >>>> +    uint64_t mean = 0;
> >>>> +    uint64_t sqDiff = 0;
> >>>> +
> >>>> +    if (!n) {
> >>>> +        return 0;
> >>>> +    }
> >>>> +
> >>>> +    for (int i = 0; i < n; i++) {
> >>>> +        sum += a[i];
> >>>> +    }
> >>>> +
> >>>> +    if (sum) {
> >>>> +        mean = sum / n;
> >>>> +
> >>>> +        /* Compute sum squared differences with mean. */
> >>>> +        for (int i = 0; i < n; i++) {
> >>>> +            sqDiff += (a[i] - mean)*(a[i] - mean);
> >>>> +        }
> >>>> +    }
> >>>> +    return (sqDiff ? (sqDiff / n) : 0); }
> >>>> +
> >>>> +static uint64_t
> >>>> +get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list,
> >>>> +uint32_t num)
> >>>
> >>> I think this function would require the port_mutex.
> >> I will check and add in next version.
> >>>
> >>>> +{
> >>>> +    struct dp_netdev_port *port;
> >>>> +    struct dp_netdev_pmd_thread *pmd;
> >>>> +    struct dp_netdev_rxq ** rxqs = NULL;
> >>>> +    struct rr_numa *numa = NULL;
> >>>> +    struct rr_numa_list rr;
> >>>> +    int n_rxqs = 0;
> >>>> +    uint64_t ret = 0;
> >>>> +    uint64_t *pmd_usage;
> >>>> +
> >>>> +    pmd_usage = xcalloc(num, sizeof(uint64_t));
> >>>> +
> >>>> +    HMAP_FOR_EACH (port, node, &dp->ports) {
> >>>> +        if (!netdev_is_pmd(port->netdev)) {
> >>>> +            continue;
> >>>> +        }
> >>>> +
> >>>> +        for (int qid = 0; qid < port->n_rxq; qid++) {
> >>>> +            struct dp_netdev_rxq *q = &port->rxqs[qid];
> >>>> +            uint64_t cycle_hist = 0;
> >>>> +
> >>>> +            if (q->pmd->isolated) {
> >>>> +                continue;
> >>>> +            }
> >>>> +
> >>>> +            if (n_rxqs == 0) {
> >>>> +                rxqs = xmalloc(sizeof *rxqs);
> >>>> +            } else {
> >>>> +                rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
> >>>> +            }
> >>>> +
> >>>> +            /* Sum the queue intervals and store the cycle history. */
> >>>> +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> >>>> +                cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
> >>>> +            }
> >>>> +            /* Do we need to add intrvl_cycles in history??
> >>>
> >>> If you want to use compare_rxq_cycles() then you have to put them in
> >>> history, but it's only used for that and re-written everytime, so I
> >>> don't think it is harmful.
> >> Yeah, that is the objective of adding it.
> >>>
> >>>> +             * but then we should clear interval cycles also */
> >>>
> >>> I don't think you should be clearing interval cycles in a dry run,
> >>> otherwise they will be reset if the real rebalance occurs.
> >> Thanks for clarifying it. I will remove the comment.
> >>>
> >>>> +            dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
> >>>> +                                         cycle_hist);
> >>>> +            /* Store the queue. */
> >>>> +            rxqs[n_rxqs++] = q;
> >>>> +        }
> >>>> +    }
> >>>> +    if (n_rxqs > 1) {
> >>>> +        /* Sort the queues in order of the processing cycles
> >>>> +         * they consumed during their last pmd interval. */
> >>>> +        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
> >>>> +    }
> >>>> +    rr_numa_list_populate(dp, &rr);
> >>>> +
> >>>> +    for (int i = 0; i < n_rxqs; i++) {
> >>>> +        int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
> >>>> +        numa = rr_numa_list_lookup(&rr, numa_id);
> >>>> +        if (!numa) {
> >>>> +            /* Don't consider queues across NUMA  ???*/
> >>>
> >>> I think you should abort the whole dry run process if this is
> >>> happening
> >> Okay.
> >>>
> >>>> +            continue;
> >>>> +        }
> >>>> +
> >>>> +        pmd = rr_numa_get_pmd(numa, true);
> >>>> +        VLOG_DBG("PMD AUTO_LB:Core %d on numa node %d assigned
> >>>> + port
> >>> \'%s\' "
> >>>> +                  "rx queue %d "
> >>>> +                  "(measured processing cycles %"PRIu64").",
> >>>> +                  pmd->core_id, numa_id,
> >>>> +                  netdev_rxq_get_name(rxqs[i]->rx),
> >>>> +                  netdev_rxq_get_queue_id(rxqs[i]->rx),
> >>>> +                  dp_netdev_rxq_get_cycles(rxqs[i],
> >>>> + RXQ_CYCLES_PROC_HIST));
> >>>> +
> >>>> +        for (int id = 0; id < num; id++) {
> >>>> +            if (pmd->core_id == core_list[id]) {
> >>>> +                /* Add the processing cycles of rxq to pmd polling it */
> >>>> +                uint64_t proc_cycles = 0;
> 
> >>>> +                for (unsigned idx = 0; idx < PMD_RXQ_INTERVAL_MAX; idx++)
> {
> >>>> +                    proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxqs[i],
> >>>> +                                                                   idx);
> >>>> +                }
> 
> This is ok, but if you can be sure it was already done and result stored in
> PROC_HIST earlier in the function, you could just use
> dp_netdev_rxq_get_cycles(rxqs[i],RXQ_CYCLES_PROC_HIST)
> 
Yeah, I realized it after looking more into the existing code. I will modify it in the next version. Thanks.
> >>>> +                pmd_usage[id] += proc_cycles;
> >>>> +            }
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> >>>> +        uint64_t total_cycles = 0;
> >>>> +
> >>>> +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
> >>>> +            continue;
> >>>> +        }
> >>>> +
> >>>> +        /* Get the total pmd cycles for an interval. */
> >>>> +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
> >>>> +        /* Estimate the cycles to cover all intervals. */
> >>>> +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
> >>>> +        for (int id = 0; id < num; id++) {
> >>>> +            if (pmd->core_id == core_list[id]) {
> >>>> +                if (pmd_usage[id]) {
> >>>> +                    pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
> >>>> +                }
> >>>> +                VLOG_DBG("Core_id:%d, usage:%"PRIu64"\n",
> >>>> +                          pmd->core_id, pmd_usage[id]);
> >>>> +            }
> >>>> +        }
> >>>> +    }
> >>>> +    ret = variance(pmd_usage, num);
> >>>> +
> >>>> +    rr_numa_list_destroy(&rr);
> >>>> +    free(rxqs);
> >>>> +    free(pmd_usage);
> >>>> +    return ret;
> >>>> +}
> >>>> +
> >>>> +static bool
> >>>> +pmd_rebalance_dry_run(struct dp_netdev *dp) {
> >>>> +    struct dp_netdev_pmd_thread *pmd;
> >>>> +    uint64_t *curr_pmd_usage;
> >>>> +
> >>>> +    uint64_t curr_variance;
> >>>> +    uint64_t new_variance;
> >>>> +    uint64_t improvement = 0;
> >>>> +    uint32_t num_pmds;
> >>>> +    uint32_t *pmd_corelist;
> >>>> +    struct rxq_poll *poll, *poll_next;
> >>>> +
> >>>> +    num_pmds = cmap_count(&dp->poll_threads);
> >>>> +
> >>>> +    if (num_pmds > 1) {
> >>>> +        curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
> >>>> +        pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
> >>>> +    } else {
> >>>> +        return false;
> >>>> +    }
> >>>> +
> >>>> +    num_pmds = 0;
> >>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> >>>> +        uint64_t total_cycles = 0;
> >>>> +        uint64_t total_proc = 0;
> >>>> +
> >>>> +        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
> >>>> +            continue;
> >>>> +        }
> >>>> +
> >>>> +        /* Get the total pmd cycles for an interval. */
> >>>> +        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
> >>>> +        /* Estimate the cycles to cover all intervals. */
> >>>> +        total_cycles *= PMD_RXQ_INTERVAL_MAX;
> >>>> +
> >>>> +        HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
> >>>> +            uint64_t proc_cycles = 0;
> >>>> +            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
> >>>> +                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(poll->rxq,
> i);
> >>>> +            }
> >>>> +            total_proc += proc_cycles;
> >>>> +        }
> >>>> +        if (total_proc) {
> >>>> +            curr_pmd_usage[num_pmds] = (total_proc * 100) /
> total_cycles;
> >>>> +        }
> >>>> +
> >>>> +        VLOG_DBG("PMD_AUTO_LB_MON
> curr_pmd_usage(%d):%"PRIu64"",
> >>>> +                  pmd->core_id, curr_pmd_usage[num_pmds]);
> >>>> +
> >>>> +        if (atomic_count_get(&pmd->pmd_overloaded)) {
> >>>> +            atomic_count_set(&pmd->pmd_overloaded, 0);
> >>>> +        }
> >>>> +
> >>>> +        pmd_corelist[num_pmds] = pmd->core_id;
> >>>> +        num_pmds++;
> >>>> +    }
> >>>> +
> >>>> +    curr_variance = variance(curr_pmd_usage, num_pmds);
> >>>> +
> >>>> +    new_variance = get_dry_run_variance(dp, pmd_corelist,
> num_pmds);
> >>>> +    VLOG_DBG("PMD_AUTO_LB_MON new variance: %"PRIu64","
> >>>> +              " curr_variance: %"PRIu64"",
> >>>> +              new_variance, curr_variance);
> >>>> +
> >>>> +    if (new_variance && (new_variance < curr_variance)) {
> >>>> +        improvement =
> >>>> +            ((curr_variance - new_variance) * 100) /
> >>>> + curr_variance;
> >>>> +
> >>>> +        VLOG_DBG("PMD_AUTO_LB_MON improvement %"PRIu64"",
> >>> improvement);
> >>>> +    }
> >>>> +
> >>>> +    free(curr_pmd_usage);
> >>>> +    free(pmd_corelist);
> >>>> +
> >>>> +    if (improvement >= ACCEPT_IMPROVE_DEFAULT) {
> >>>> +        return true;
> >>>> +    }
> >>>> +
> >>>> +    return false;
> >>>> +}
> >>>> +
> >>>> +
> >>>>  /* Return true if needs to revalidate datapath flows. */  static
> >>>> bool dpif_netdev_run(struct dpif *dpif) @@ -4789,6 +5099,9 @@
> >>>> dpif_netdev_run(struct dpif *dpif)
> >>>>      struct dp_netdev_pmd_thread *non_pmd;
> >>>>      uint64_t new_tnl_seq;
> >>>>      bool need_to_flush = true;
> >>>> +    bool pmd_rebalance = false;
> >>>> +    long long int now = time_msec();
> >>>> +    struct dp_netdev_pmd_thread *pmd;
> >>>>
> >>>>      ovs_mutex_lock(&dp->port_mutex);
> >>>>      non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID); @@ -
> 4821,6
> >>>> +5134,38 @@ dpif_netdev_run(struct dpif *dpif)
> >>>>          dp_netdev_pmd_unref(non_pmd);
> >>>>      }
> >>>>
> >>>> +    struct pmd_auto_lb * pmd_alb = &dp->pmd_alb;
> >>>> +    if (pmd_alb->is_enabled) {
> >>>> +        if (!pmd_alb->rebalance_poll_timer) {
> >>>> +            pmd_alb->rebalance_poll_timer = now;
> >>>> +        } else if ((pmd_alb->rebalance_poll_timer +
> >>>> +                   pmd_alb->rebalance_intvl) < now) {
> >>>> +            pmd_alb->rebalance_poll_timer = now;
> >>>> +            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> >>>> +                if (atomic_count_get(&pmd->pmd_overloaded) >=
> >>>> +                                    PMD_RXQ_INTERVAL_MAX) {
> >>>> +                    pmd_rebalance = true;
> >>>> +                    break;
> >>>> +                }
> >>>> +            }
> >>>> +            VLOG_DBG("PMD_AUTO_LB_MON periodic check:pmd
> >>> rebalance:%d",
> >>>> +                      pmd_rebalance);
> >>>> +
> >>>> +            if (pmd_rebalance &&
> >>>> +                !dp_netdev_is_reconf_required(dp) &&
> >>>> +                !ports_require_restart(dp) &&
> >>>> +                pmd_rebalance_dry_run(dp)) {
> >>>
> >>> Don't you need the dp_netdev_mutex for call to
> >>> pmd_rebalance_dry_run, or at least for some parts of it?
> >> I will check it.
> >
> > Actually, I think it's not needed
> >
> >>>
> >>>> +
> >>>> +                ovs_mutex_unlock(&dp->port_mutex);
> >>>
> >>> It seems odd to be unlocking this and then taking it again, is there a
> reason?
> >> Need to check it again if we can have both locks at the same time.
> >>>
> >>>> +                ovs_mutex_lock(&dp_netdev_mutex);
> >
> > I don't think you need this lock or to unlock the port_mutex
> >
> >>>> +                VLOG_DBG("PMD_AUTO_LB_MON Invoking PMD
> RECONFIGURE");
> >>>> +                dp_netdev_request_reconfigure(dp);
> >>>> +                ovs_mutex_unlock(&dp_netdev_mutex);
> >>>> +                ovs_mutex_lock(&dp->port_mutex);
> >>>> +            }
> >>>> +        }
> >>>> +    }
> >>>> +
> >>>>      if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp))
> {
> >>>>          reconfigure_datapath(dp);
> >>>>      }
> >>>> @@ -4979,6 +5324,8 @@ pmd_thread_main(void *f_)
> >>>>  reload:
> >>>>      pmd_alloc_static_tx_qid(pmd);
> >>>>
> >>>> +    atomic_count_init(&pmd->pmd_overloaded, 0);
> >>>> +
> >>>>      /* List port/core affinity */
> >>>>      for (i = 0; i < poll_cnt; i++) {
> >>>>         VLOG_DBG("Core %d processing port \'%s\' with queue-id
> >>>> %d\n", @@ -4986,6 +5333,10 @@ reload:
> >>>>                  netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
> >>>>         /* Reset the rxq current cycles counter. */
> >>>>         dp_netdev_rxq_set_cycles(poll_list[i].rxq,
> >>>> RXQ_CYCLES_PROC_CURR, 0);
> >>>> +
> >>>> +       for (unsigned j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
> >>>> +            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0, j);
> >>>> +       }
> >>>
> >>> is it needed for this patch? won't all the values have been
> >>> refreshed by the time the next check is performed anyway
> >> I thought it is safe to reset it. If PMD is reset in middle of cycle then it may
> have stale information when dry_run is executed.
> >
> > It shouldn't be reset as it can clear info for some rxqs before
> > rxq-pmd assignment when ports are reconfigured. You can see this in
> > the rxq-pmd assignment logs, when adding/removing rxqs.
> >
> >>>
> >>>>      }
> >>>>
> >>>>      if (!poll_cnt) {
> >>>> @@ -7188,17 +7539,51 @@ dp_netdev_pmd_try_optimize(struct
> >>> dp_netdev_pmd_thread *pmd,
> >>>>                             struct polled_queue *poll_list, int
> >>>> poll_cnt)  {
> >>>>      struct dpcls *cls;
> >>>> +    uint64_t tot_idle = 0, tot_proc = 0;
> >>>> +    unsigned int idx;
> >>>> +    unsigned int pmd_load = 0;
> >>>>
> >>>>      if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
> >>>>          uint64_t curr_tsc;
> >>>> +        struct pmd_auto_lb * pmd_alb = &pmd->dp->pmd_alb;
> >>>> +        if (pmd_alb->is_enabled && !pmd->isolated) {
> >>>> +            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE]
> -
> >>>> +                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
> >>>> +            tot_proc = pmd-
> >perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
> >>>> +                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
> >>>> +
> >>>> +            if (tot_proc) {
> >>>> +                pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
> >>>> +            }
> >>>> +
> >>>> +            if (pmd_load >= PMD_LOAD_THRE_DEFAULT) {
> >>>> +                atomic_count_inc(&pmd->pmd_overloaded);
> >>>> +
> >>>> +                VLOG_DBG("PMD_AUTO_LB_MON PMD OVERLOAD DETECT
> iter
> >>> %d",
> >>>> +                          atomic_count_get(&pmd->pmd_overloaded));
> >>>
> >>> Better to remove this log
> >> Sure
> >>>
> >>>> +            } else {
> >>>> +                atomic_count_set(&pmd->pmd_overloaded, 0);
> >>>> +            }
> >>>> +        }
> >>>> +
> >>>> +        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
> >>>> +                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
> >>>> +        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
> >>>> +
> >>>> + pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
> >>>> +
> >>>
> >>> These have been used earlier - are they initialized somewhere for
> >>> the first use? you could just skip the above if() on the first call,
> >>> so they get initialized, or else init them at the start of
> >>> pmd_thread_main()
> >> It would have been initialized to 0 when the pmd structure is created. We are
> >> not resetting it to 0 whenever the PMD is reloaded.
> >
> > but isn't the counters.n structure reset everytime the pmd reloads? so
> > it would mean that this should be reset also. Maybe I read the
> > pmd_stats code wrong.
> >
> >>>
> >>>>          /* Get the cycles that were used to process each queue and store.
> */
> >>>>          for (unsigned i = 0; i < poll_cnt; i++) {
> >>>> -            uint64_t rxq_cyc_curr =
> dp_netdev_rxq_get_cycles(poll_list[i].rxq,
> >>>> -                                                        RXQ_CYCLES_PROC_CURR);
> >>>> -            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
> >>>> -            dp_netdev_rxq_set_cycles(poll_list[i].rxq,
> >>> RXQ_CYCLES_PROC_CURR,
> >>>> -                                     0);
> >>>> +            uint64_t rxq_cyc_curr;
> >>>> +            struct dp_netdev_rxq *rxq;
> >>>> +
> >>>> +            rxq = poll_list[i].rxq;
> >>>> +            idx = rxq->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
> >>>> +
> >>>> +            rxq_cyc_curr = dp_netdev_rxq_get_cycles(rxq,
> >>> RXQ_CYCLES_PROC_CURR);
> >>>> +            dp_netdev_rxq_set_intrvl_cycles(rxq, rxq_cyc_curr, idx);
> >>>> +            dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_CURR,
> >>>> + 0);
> >>>>          }
> >>>> +
> >>>>          curr_tsc = cycles_counter_update(&pmd->perf_stats);
> >>>>          if (pmd->intrvl_tsc_prev) {
> >>>>              /* There is a prev timestamp, store a new intrvl cycle
> >>>> count. */ diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
> >>>> index 2160910..ff3589c 100644
> >>>> --- a/vswitchd/vswitch.xml
> >>>> +++ b/vswitchd/vswitch.xml
> >>>> @@ -574,6 +574,36 @@
> >>>>              be set to 'skip_sw'.
> >>>>          </p>
> >>>>        </column>
> >>>> +      <column name="other_config" key="pmd-auto-lb"
> >>>> +              type='{"type": "boolean"}'>
> >>>> +        <p>
> >>>> +         Configures PMD Auto Load Balancing that allows automatic
> >>> assignment of
> >>>> +         RX queues to PMDs if any of PMDs is overloaded (i.e.
> >>>> + processing
> >>> cycles
> >>>> +         > 95%).
> >>>> +        </p>
> >>>> +        <p>
> >>>> +         It uses current scheme of cycle based assignment of RX queues
> that
> >>>> +         are not statically pinned to PMDs.
> >>>> +        </p>
> >>>> +        <p>
> >>>> +          Set this value to <code>true</code> to enable this option.
> >>>> +        </p>
> >>>> +        <p>
> >>>> +          The default value is <code>false</code>.
> >>>> +        </p>
> >>>> +        <p>
> >>>> +         This only comes in effect if cycle based assignment is enabled and
> >>>> +         there are more than one non-isolated PMDs present and
> >>>> + atleast one
> >>> of
> >>>> +         it polls more than one queue.
> >>>> +        </p>
> >>>> +      </column>
> >>>> +      <column name="other_config" key="pmd-auto-lb-rebalance-intvl"
> >>>> +              type='{"type": "integer", "minInteger": 1}'>
> >>>> +        <p>
> >>>> +         The minimum time (in minutes) 2 consecutive PMD Auto Load
> >>> Balancing
> >>>> +         iterations.
> >>>> +        </p>
> >>>> +      </column>
> >>>>      </group>
> >>>>      <group title="Status">
> >>>>        <column name="next_cfg">
> >>>>
> >>
> >
diff mbox series

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 1564db9..8db106f 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -80,6 +80,12 @@ 
 
 VLOG_DEFINE_THIS_MODULE(dpif_netdev);
 
+/* Auto Load Balancing Defaults */
+#define ACCEPT_IMPROVE_DEFAULT       (25)
+#define PMD_LOAD_THRE_DEFAULT        (95)
+#define PMD_REBALANCE_POLL_INTERVAL  1 /* 1 Min */
+#define MIN_TO_MSEC                  60000
+
 #define FLOW_DUMP_MAX_BATCH 50
 /* Use per thread recirc_depth to prevent recirculation loop. */
 #define MAX_RECIRC_DEPTH 6
@@ -288,6 +294,13 @@  struct dp_meter {
     struct dp_meter_band bands[];
 };
 
+struct pmd_auto_lb {
+    bool auto_lb_conf;        /* Auto load balancing requested via config. */
+    bool is_enabled;          /* Current status of auto load balancing. */
+    uint64_t rebalance_intvl;      /* Min time (msec) between two rebalances. */
+    uint64_t rebalance_poll_timer; /* Time (msec) of the last rebalance check. */
+};
+
 /* Datapath based on the network device interface from netdev.h.
  *
  *
@@ -368,6 +381,7 @@  struct dp_netdev {
     uint64_t last_tnl_conf_seq;
 
     struct conntrack conntrack;
+    struct pmd_auto_lb pmd_alb;
 };
 
 static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
@@ -682,6 +696,7 @@  struct dp_netdev_pmd_thread {
     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
     /* List of rx queues to poll. */
     struct hmap poll_list OVS_GUARDED;
+
     /* Map of 'tx_port's used for transmission.  Written by the main thread,
      * read by the pmd thread. */
     struct hmap tx_ports OVS_GUARDED;
@@ -702,6 +717,11 @@  struct dp_netdev_pmd_thread {
     /* Keep track of detailed PMD performance statistics. */
     struct pmd_perf_stats perf_stats;
 
+    /* Some stats from the previous iteration, used by the automatic PMD
+     * load balancing logic. */
+    uint64_t prev_stats[PMD_N_STATS];
+    atomic_count pmd_overloaded;
+
     /* Set to true if the pmd thread needs to be reloaded. */
     bool need_reload;
 };
@@ -792,9 +812,11 @@  dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
                          enum rxq_cycles_counter_type type);
 static void
 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
-                           unsigned long long cycles);
+                                unsigned long long cycles,
+                                unsigned idx);
 static uint64_t
-dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx);
+dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx,
+                                unsigned idx);
 static void
 dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
                                bool purge);
@@ -3734,6 +3756,51 @@  dpif_netdev_operate(struct dpif *dpif, struct dpif_op **ops, size_t n_ops,
     }
 }
 
+/* Enable or disable PMD auto load balancing. */
+static void
+set_pmd_auto_lb(struct dp_netdev *dp)
+{
+    unsigned int cnt = 0;
+    struct dp_netdev_pmd_thread *pmd;
+    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
+
+    bool enable = false;
+    bool pmd_rxq_assign_cyc = dp->pmd_rxq_assign_cyc;
+
+    /* Ensure that there are at least 2 non-isolated PMDs and
+     * at least one of them is polling more than one rxq.
+     */
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        if (pmd->core_id == NON_PMD_CORE_ID || pmd->isolated) {
+            continue;
+        }
+
+        cnt++;
+        if (hmap_count(&pmd->poll_list) > 1) {
+            if (enable && (cnt > 1)) {
+                break;
+            } else {
+                enable = true;
+            }
+        }
+    }
+
+    /* Enable auto LB if it is requested and cycle-based assignment is used. */
+    enable = enable && cnt > 1 && pmd_rxq_assign_cyc && pmd_alb->auto_lb_conf;
+
+    if (pmd_alb->is_enabled != enable) {
+        pmd_alb->is_enabled = enable;
+        if (pmd_alb->is_enabled) {
+            VLOG_INFO("PMD auto lb is enabled, rebalance intvl:%lu(msec)\n",
+                       pmd_alb->rebalance_intvl);
+        } else {
+            pmd_alb->rebalance_poll_timer = 0;
+            VLOG_INFO("PMD auto lb is disabled\n");
+        }
+    }
+
+}
+
 /* Applies datapath configuration from the database. Some of the changes are
  * actually applied in dpif_netdev_run(). */
 static int
@@ -3748,6 +3815,7 @@  dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
                         DEFAULT_EM_FLOW_INSERT_INV_PROB);
     uint32_t insert_min, cur_min;
     uint32_t tx_flush_interval, cur_tx_flush_interval;
+    uint64_t rebalance_intvl;
 
     tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
                                      DEFAULT_TX_FLUSH_INTERVAL);
@@ -3819,6 +3887,23 @@  dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
                   pmd_rxq_assign);
         dp_netdev_request_reconfigure(dp);
     }
+
+    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
+    pmd_alb->auto_lb_conf = smap_get_bool(other_config, "pmd-auto-lb",
+                              false);
+
+    rebalance_intvl = smap_get_int(other_config, "pmd-auto-lb-rebalance-intvl",
+                              PMD_REBALANCE_POLL_INTERVAL);
+
+    /* Input is in minutes, convert it to msec. */
+    rebalance_intvl =
+        rebalance_intvl ? rebalance_intvl * MIN_TO_MSEC : MIN_TO_MSEC;
+
+    if (pmd_alb->rebalance_intvl != rebalance_intvl) {
+        pmd_alb->rebalance_intvl = rebalance_intvl;
+    }
+
+    set_pmd_auto_lb(dp);
     return 0;
 }
 
@@ -3974,9 +4059,9 @@  dp_netdev_rxq_get_cycles(struct dp_netdev_rxq *rx,
 
 static void
 dp_netdev_rxq_set_intrvl_cycles(struct dp_netdev_rxq *rx,
-                                unsigned long long cycles)
+                                unsigned long long cycles,
+                                unsigned idx)
 {
-    unsigned int idx = rx->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
     atomic_store_relaxed(&rx->cycles_intrvl[idx], cycles);
 }
 
@@ -4762,6 +4847,9 @@  reconfigure_datapath(struct dp_netdev *dp)
 
     /* Reload affected pmd threads. */
     reload_affected_pmds(dp);
+
+    /* Check if PMD Auto LB is to be enabled */
+    set_pmd_auto_lb(dp);
 }
 
 /* Returns true if one of the netdevs in 'dp' requires a reconfiguration */
@@ -4780,6 +4868,228 @@  ports_require_restart(const struct dp_netdev *dp)
     return false;
 }
 
+/* Returns the variance of the 'n' values in the array 'a'. */
+static uint64_t
+variance(uint64_t a[], int n)
+{
+    uint64_t sum = 0;
+    uint64_t mean = 0;
+    uint64_t sq_diff = 0;
+
+    if (!n) {
+        return 0;
+    }
+
+    for (int i = 0; i < n; i++) {
+        sum += a[i];
+    }
+
+    if (sum) {
+        /* Compute the mean (average of the elements). */
+        mean = sum / n;
+
+        /* Compute the sum of squared differences from the mean. */
+        for (int i = 0; i < n; i++) {
+            sq_diff += (a[i] - mean) * (a[i] - mean);
+        }
+    }
+    return sq_diff ? (sq_diff / n) : 0;
+}
+
+static uint64_t
+get_dry_run_variance(struct dp_netdev *dp, uint32_t *core_list, uint32_t num)
+{
+    struct dp_netdev_port *port;
+    struct dp_netdev_pmd_thread *pmd;
+    struct dp_netdev_rxq **rxqs = NULL;
+    struct rr_numa *numa = NULL;
+    struct rr_numa_list rr;
+    int n_rxqs = 0;
+    uint64_t ret = 0;
+    uint64_t *pmd_usage;
+
+    pmd_usage = xcalloc(num, sizeof(uint64_t));
+
+    HMAP_FOR_EACH (port, node, &dp->ports) {
+        if (!netdev_is_pmd(port->netdev)) {
+            continue;
+        }
+
+        for (int qid = 0; qid < port->n_rxq; qid++) {
+            struct dp_netdev_rxq *q = &port->rxqs[qid];
+            uint64_t cycle_hist = 0;
+
+            if (q->pmd->isolated) {
+                continue;
+            }
+
+            if (n_rxqs == 0) {
+                rxqs = xmalloc(sizeof *rxqs);
+            } else {
+                rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
+            }
+
+            /* Sum the queue intervals and store the cycle history. */
+            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+                cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
+            }
+            /* XXX: Should the interval cycles also be added to the history?
+             * If so, they would need to be cleared as well. */
+            dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST,
+                                         cycle_hist);
+            /* Store the queue. */
+            rxqs[n_rxqs++] = q;
+        }
+    }
+    if (n_rxqs > 1) {
+        /* Sort the queues in order of the processing cycles
+         * they consumed during their last pmd interval. */
+        qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
+    }
+    rr_numa_list_populate(dp, &rr);
+
+    for (int i = 0; i < n_rxqs; i++) {
+        int numa_id = netdev_get_numa_id(rxqs[i]->port->netdev);
+        numa = rr_numa_list_lookup(&rr, numa_id);
+        if (!numa) {
+            /* Don't consider queues that would need cross-NUMA polling. */
+            continue;
+        }
+
+        pmd = rr_numa_get_pmd(numa, true);
+        VLOG_DBG("PMD AUTO_LB:Core %d on numa node %d assigned port \'%s\' "
+                  "rx queue %d "
+                  "(measured processing cycles %"PRIu64").",
+                  pmd->core_id, numa_id,
+                  netdev_rxq_get_name(rxqs[i]->rx),
+                  netdev_rxq_get_queue_id(rxqs[i]->rx),
+                  dp_netdev_rxq_get_cycles(rxqs[i], RXQ_CYCLES_PROC_HIST));
+
+        for (int id = 0; id < num; id++) {
+            if (pmd->core_id == core_list[id]) {
+                /* Add the processing cycles of rxq to pmd polling it */
+                uint64_t proc_cycles = 0;
+                for (unsigned idx = 0; idx < PMD_RXQ_INTERVAL_MAX; idx++) {
+                    proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxqs[i],
+                                                                   idx);
+                }
+                pmd_usage[id] += proc_cycles;
+            }
+        }
+    }
+
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        uint64_t total_cycles = 0;
+
+        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
+            continue;
+        }
+
+        /* Get the total pmd cycles for an interval. */
+        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
+        /* Estimate the cycles to cover all intervals. */
+        total_cycles *= PMD_RXQ_INTERVAL_MAX;
+        for (int id = 0; id < num; id++) {
+            if (pmd->core_id == core_list[id]) {
+                if (pmd_usage[id]) {
+                    pmd_usage[id] = (pmd_usage[id] * 100) / total_cycles;
+                }
+                VLOG_DBG("Core_id:%d, usage:%"PRIu64"\n",
+                          pmd->core_id, pmd_usage[id]);
+            }
+        }
+    }
+    ret = variance(pmd_usage, num);
+
+    rr_numa_list_destroy(&rr);
+    free(rxqs);
+    free(pmd_usage);
+    return ret;
+}
+
+static bool
+pmd_rebalance_dry_run(struct dp_netdev *dp)
+{
+    struct dp_netdev_pmd_thread *pmd;
+    uint64_t *curr_pmd_usage;
+
+    uint64_t curr_variance;
+    uint64_t new_variance;
+    uint64_t improvement = 0;
+    uint32_t num_pmds;
+    uint32_t *pmd_corelist;
+    struct rxq_poll *poll, *poll_next;
+
+    num_pmds = cmap_count(&dp->poll_threads);
+
+    if (num_pmds > 1) {
+        curr_pmd_usage = xcalloc(num_pmds, sizeof(uint64_t));
+        pmd_corelist = xcalloc(num_pmds, sizeof(uint32_t));
+    } else {
+        return false;
+    }
+
+    num_pmds = 0;
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        uint64_t total_cycles = 0;
+        uint64_t total_proc = 0;
+
+        if ((pmd->core_id == NON_PMD_CORE_ID) || pmd->isolated) {
+            continue;
+        }
+
+        /* Get the total pmd cycles for an interval. */
+        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
+        /* Estimate the cycles to cover all intervals. */
+        total_cycles *= PMD_RXQ_INTERVAL_MAX;
+
+        HMAP_FOR_EACH_SAFE (poll, poll_next, node, &pmd->poll_list) {
+            uint64_t proc_cycles = 0;
+            for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(poll->rxq, i);
+            }
+            total_proc += proc_cycles;
+        }
+        if (total_proc) {
+            curr_pmd_usage[num_pmds] = (total_proc * 100) / total_cycles;
+        }
+
+        VLOG_DBG("PMD_AUTO_LB_MON curr_pmd_usage(%d):%"PRIu64"",
+                  pmd->core_id, curr_pmd_usage[num_pmds]);
+
+        if (atomic_count_get(&pmd->pmd_overloaded)) {
+            atomic_count_set(&pmd->pmd_overloaded, 0);
+        }
+
+        pmd_corelist[num_pmds] = pmd->core_id;
+        num_pmds++;
+    }
+
+    curr_variance = variance(curr_pmd_usage, num_pmds);
+
+    new_variance = get_dry_run_variance(dp, pmd_corelist, num_pmds);
+    VLOG_DBG("PMD_AUTO_LB_MON new variance: %"PRIu64","
+              " curr_variance: %"PRIu64"",
+              new_variance, curr_variance);
+
+    if (new_variance && (new_variance < curr_variance)) {
+        improvement =
+            ((curr_variance - new_variance) * 100) / curr_variance;
+
+        VLOG_DBG("PMD_AUTO_LB_MON improvement %"PRIu64"", improvement);
+    }
+
+    free(curr_pmd_usage);
+    free(pmd_corelist);
+
+    if (improvement >= ACCEPT_IMPROVE_DEFAULT) {
+        return true;
+    }
+
+    return false;
+}
+
+
 /* Return true if needs to revalidate datapath flows. */
 static bool
 dpif_netdev_run(struct dpif *dpif)
@@ -4789,6 +5099,9 @@  dpif_netdev_run(struct dpif *dpif)
     struct dp_netdev_pmd_thread *non_pmd;
     uint64_t new_tnl_seq;
     bool need_to_flush = true;
+    bool pmd_rebalance = false;
+    long long int now = time_msec();
+    struct dp_netdev_pmd_thread *pmd;
 
     ovs_mutex_lock(&dp->port_mutex);
     non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
@@ -4821,6 +5134,38 @@  dpif_netdev_run(struct dpif *dpif)
         dp_netdev_pmd_unref(non_pmd);
     }
 
+    struct pmd_auto_lb *pmd_alb = &dp->pmd_alb;
+    if (pmd_alb->is_enabled) {
+        if (!pmd_alb->rebalance_poll_timer) {
+            pmd_alb->rebalance_poll_timer = now;
+        } else if ((pmd_alb->rebalance_poll_timer +
+                   pmd_alb->rebalance_intvl) < now) {
+            pmd_alb->rebalance_poll_timer = now;
+            CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+                if (atomic_count_get(&pmd->pmd_overloaded) >=
+                                    PMD_RXQ_INTERVAL_MAX) {
+                    pmd_rebalance = true;
+                    break;
+                }
+            }
+            VLOG_DBG("PMD_AUTO_LB_MON periodic check:pmd rebalance:%d",
+                      pmd_rebalance);
+
+            if (pmd_rebalance &&
+                !dp_netdev_is_reconf_required(dp) &&
+                !ports_require_restart(dp) &&
+                pmd_rebalance_dry_run(dp)) {
+
+                ovs_mutex_unlock(&dp->port_mutex);
+                ovs_mutex_lock(&dp_netdev_mutex);
+                VLOG_DBG("PMD_AUTO_LB_MON Invoking PMD RECONFIGURE");
+                dp_netdev_request_reconfigure(dp);
+                ovs_mutex_unlock(&dp_netdev_mutex);
+                ovs_mutex_lock(&dp->port_mutex);
+            }
+        }
+    }
+
     if (dp_netdev_is_reconf_required(dp) || ports_require_restart(dp)) {
         reconfigure_datapath(dp);
     }
@@ -4979,6 +5324,8 @@  pmd_thread_main(void *f_)
 reload:
     pmd_alloc_static_tx_qid(pmd);
 
+    atomic_count_init(&pmd->pmd_overloaded, 0);
+
     /* List port/core affinity */
     for (i = 0; i < poll_cnt; i++) {
        VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
@@ -4986,6 +5333,10 @@  reload:
                 netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
        /* Reset the rxq current cycles counter. */
        dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
+
+       for (unsigned j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
+            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, 0, j);
+       }
     }
 
     if (!poll_cnt) {
@@ -7188,17 +7539,51 @@  dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
                            struct polled_queue *poll_list, int poll_cnt)
 {
     struct dpcls *cls;
+    uint64_t tot_idle = 0, tot_proc = 0;
+    unsigned int idx;
+    unsigned int pmd_load = 0;
 
     if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
         uint64_t curr_tsc;
+        struct pmd_auto_lb *pmd_alb = &pmd->dp->pmd_alb;
+        if (pmd_alb->is_enabled && !pmd->isolated) {
+            tot_idle = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE] -
+                       pmd->prev_stats[PMD_CYCLES_ITER_IDLE];
+            tot_proc = pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY] -
+                       pmd->prev_stats[PMD_CYCLES_ITER_BUSY];
+
+            if (tot_proc) {
+                pmd_load = ((tot_proc * 100) / (tot_idle + tot_proc));
+            }
+
+            if (pmd_load >= PMD_LOAD_THRE_DEFAULT) {
+                atomic_count_inc(&pmd->pmd_overloaded);
+
+                VLOG_DBG("PMD_AUTO_LB_MON PMD OVERLOAD DETECT iter %d",
+                          atomic_count_get(&pmd->pmd_overloaded));
+            } else {
+                atomic_count_set(&pmd->pmd_overloaded, 0);
+            }
+        }
+
+        pmd->prev_stats[PMD_CYCLES_ITER_IDLE] =
+                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_IDLE];
+        pmd->prev_stats[PMD_CYCLES_ITER_BUSY] =
+                        pmd->perf_stats.counters.n[PMD_CYCLES_ITER_BUSY];
+
         /* Get the cycles that were used to process each queue and store. */
         for (unsigned i = 0; i < poll_cnt; i++) {
-            uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
-                                                        RXQ_CYCLES_PROC_CURR);
-            dp_netdev_rxq_set_intrvl_cycles(poll_list[i].rxq, rxq_cyc_curr);
-            dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
-                                     0);
+            uint64_t rxq_cyc_curr;
+            struct dp_netdev_rxq *rxq;
+
+            rxq = poll_list[i].rxq;
+            idx = rxq->intrvl_idx++ % PMD_RXQ_INTERVAL_MAX;
+
+            rxq_cyc_curr = dp_netdev_rxq_get_cycles(rxq, RXQ_CYCLES_PROC_CURR);
+            dp_netdev_rxq_set_intrvl_cycles(rxq, rxq_cyc_curr, idx);
+            dp_netdev_rxq_set_cycles(rxq, RXQ_CYCLES_PROC_CURR, 0);
         }
+
         curr_tsc = cycles_counter_update(&pmd->perf_stats);
         if (pmd->intrvl_tsc_prev) {
             /* There is a prev timestamp, store a new intrvl cycle count. */
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index 2160910..ff3589c 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -574,6 +574,36 @@ 
             be set to 'skip_sw'.
         </p>
       </column>
+      <column name="other_config" key="pmd-auto-lb"
+              type='{"type": "boolean"}'>
+        <p>
+          Configures PMD Auto Load Balancing, which allows automatic
+          reassignment of RX queues to PMDs if any PMD is overloaded
+          (i.e. its processing cycles exceed 95%).
+        </p>
+        <p>
+          It uses the current scheme of cycle-based assignment for RX queues
+          that are not statically pinned to PMDs.
+        </p>
+        <p>
+          Set this value to <code>true</code> to enable this option.
+        </p>
+        <p>
+          The default value is <code>false</code>.
+        </p>
+        <p>
+          This only takes effect if cycle-based assignment is enabled, more
+          than one non-isolated PMD is present, and at least one of them
+          polls more than one queue.
+        </p>
+      </column>
+      <column name="other_config" key="pmd-auto-lb-rebalance-intvl"
+              type='{"type": "integer", "minInteger": 1}'>
+        <p>
+          The minimum time (in minutes) between two consecutive PMD Auto Load
+          Balancing iterations.
+        </p>
+      </column>
     </group>
     <group title="Status">
       <column name="next_cfg">
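For illustration only, and not part of the patch: below is a minimal, self-contained sketch of the acceptance math used by the dry run. It reuses the same variance arithmetic as variance() above and the ACCEPT_IMPROVE_DEFAULT (25%) threshold checked in pmd_rebalance_dry_run(); the per-PMD load percentages in main() are made-up example values.

/* Illustrative sketch only -- not part of the patch.  It mirrors the
 * dry-run acceptance check: compute the variance of the per-PMD load
 * (in percent) for the current assignment and for the dry-run result,
 * and accept the reassignment only if the variance improves by at
 * least ACCEPT_IMPROVE_DEFAULT percent. */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define ACCEPT_IMPROVE_DEFAULT 25

/* Same arithmetic as variance() in the patch: population variance of
 * the 'n' load values in 'a'. */
static uint64_t
variance(const uint64_t a[], int n)
{
    uint64_t sum = 0;
    uint64_t mean;
    uint64_t sq_diff = 0;

    if (!n) {
        return 0;
    }
    for (int i = 0; i < n; i++) {
        sum += a[i];
    }
    mean = sum / n;
    for (int i = 0; i < n; i++) {
        sq_diff += (a[i] - mean) * (a[i] - mean);
    }
    return sq_diff / n;
}

int
main(void)
{
    /* Hypothetical per-PMD load in percent: current assignment vs. the
     * assignment produced by the dry run. */
    uint64_t curr_usage[] = { 96, 10 };
    uint64_t new_usage[] = { 55, 51 };
    int n = 2;

    uint64_t curr_var = variance(curr_usage, n);
    uint64_t new_var = variance(new_usage, n);
    uint64_t improvement = 0;

    /* Same acceptance check as pmd_rebalance_dry_run(). */
    if (new_var && new_var < curr_var) {
        improvement = (curr_var - new_var) * 100 / curr_var;
    }
    printf("curr variance %"PRIu64", new variance %"PRIu64", "
           "improvement %"PRIu64"%% -> %s\n",
           curr_var, new_var, improvement,
           improvement >= ACCEPT_IMPROVE_DEFAULT
           ? "trigger reassignment" : "keep current assignment");
    return 0;
}

Built with any C compiler (e.g. cc -o lb_sketch lb_sketch.c), this prints a current variance of 1849, a dry-run variance of 4 and a 99% improvement, so the reassignment would be accepted.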