diff mbox

[ovs-dev,v2,1/3] dpif-netdev: Incremental addition/deletion of PMD threads.

Message ID 1496153546-20532-2-git-send-email-i.maximets@samsung.com
State Superseded
Headers show

Commit Message

Ilya Maximets May 30, 2017, 2:12 p.m. UTC
Currently, change of 'pmd-cpu-mask' is very heavy operation.
It requires destroying of all the PMD threads and creating
them back. After that, all the threads will sleep until
ports' redistribution finished.

This patch adds ability to not stop the datapath while
adjusting number/placement of PMD threads. All not affected
threads will forward traffic without any additional latencies.

id-pool created for static tx queue ids to keep them sequential
in a flexible way. non-PMD thread will always have
static_tx_qid = 0 as it was before.

Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
---
 lib/dpif-netdev.c | 143 +++++++++++++++++++++++++++++++++++++++---------------
 tests/pmd.at      |   2 +-
 2 files changed, 105 insertions(+), 40 deletions(-)

Comments

Mark Kavanagh June 1, 2017, 2:30 p.m. UTC | #1
>From: ovs-dev-bounces@openvswitch.org [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
>Ilya Maximets
>Sent: Tuesday, May 30, 2017 3:12 PM
>To: dev@openvswitch.org; Darrell Ball <dball@vmware.com>
>Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Ilya Maximets <i.maximets@samsung.com>
>Subject: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental addition/deletion of PMD threads.
>
>Currently, change of 'pmd-cpu-mask' is very heavy operation.
>It requires destroying of all the PMD threads and creating	
>them back. After that, all the threads will sleep until
>ports' redistribution finished.
>
>This patch adds ability to not stop the datapath while
>adjusting number/placement of PMD threads. All not affected
>threads will forward traffic without any additional latencies.
>
>id-pool created for static tx queue ids to keep them sequential
>in a flexible way. non-PMD thread will always have
>static_tx_qid = 0 as it was before.
>
>Signed-off-by: Ilya Maximets <i.maximets@samsung.com>

Hi Ilya,

Patch LGTM - just one comment inline.

I conducted some light testing and observed the following:
    - temporary dip in forwarding rate (~50%) when modifying pmd-cpu-mask (in previous implementation, forwarding stopped completely)
        - occasional drop to zero in forwarding rate when modifying pmd-cpu-mask (e.g. change from '2' to '12') 
    - temporary drop of ~1mpps when adding/deleting port (similar to existing implementation) 

I've also conducted the following sanity checks on the patch, and found no issues:
 - checkpatch.py
 - compilation with clang
 - compilation with gcc
 - sparse
 - make check (no additional tests fail after application of this patch)

Tested-by: Mark Kavanagh <mark.b.kavanagh@intel.com>

Thanks,
Mark

>---
> lib/dpif-netdev.c | 143 +++++++++++++++++++++++++++++++++++++++---------------
> tests/pmd.at      |   2 +-
> 2 files changed, 105 insertions(+), 40 deletions(-)
>
>diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>index b50164b..596d133 100644
>--- a/lib/dpif-netdev.c
>+++ b/lib/dpif-netdev.c
>@@ -48,6 +48,7 @@
> #include "fat-rwlock.h"
> #include "flow.h"
> #include "hmapx.h"
>+#include "id-pool.h"
> #include "latch.h"
> #include "netdev.h"
> #include "netdev-vport.h"
>@@ -277,6 +278,9 @@ struct dp_netdev {
>
>     /* Stores all 'struct dp_netdev_pmd_thread's. */
>     struct cmap poll_threads;
>+    /* id pool for per thread static_tx_qid. */
>+    struct id_pool *tx_qid_pool;
>+    struct ovs_mutex tx_qid_pool_mutex;
>
>     /* Protects the access of the 'struct dp_netdev_pmd_thread'
>      * instance for non-pmd thread. */
>@@ -562,7 +566,7 @@ struct dp_netdev_pmd_thread {
>     /* Queue id used by this pmd thread to send packets on all netdevs if
>      * XPS disabled for this netdev. All static_tx_qid's are unique and less
>      * than 'cmap_count(dp->poll_threads)'. */
>-    const int static_tx_qid;
>+    uint32_t static_tx_qid;
>
>     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>     /* List of rx queues to poll. */
>@@ -642,6 +646,8 @@ static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev
>*dp,
>                                                       unsigned core_id);
> static struct dp_netdev_pmd_thread *
> dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
>+static void dp_netdev_del_pmd(struct dp_netdev *dp,
>+                              struct dp_netdev_pmd_thread *pmd);
> static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
> static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
> static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
>@@ -1181,10 +1187,17 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
>     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
>
>     cmap_init(&dp->poll_threads);
>+
>+    ovs_mutex_init(&dp->tx_qid_pool_mutex);
>+    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
>+    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
>+
>     ovs_mutex_init_recursive(&dp->non_pmd_mutex);
>     ovsthread_key_create(&dp->per_pmd_key, NULL);
>
>     ovs_mutex_lock(&dp->port_mutex);
>+    /* non-PMD will be created before all other threads and will
>+     * allocate static_tx_qid = 0. */
>     dp_netdev_set_nonpmd(dp);
>
>     error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
>@@ -1279,6 +1292,9 @@ dp_netdev_free(struct dp_netdev *dp)
>     dp_netdev_destroy_all_pmds(dp, true);
>     cmap_destroy(&dp->poll_threads);
>
>+    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
>+    id_pool_destroy(dp->tx_qid_pool);
>+
>     ovs_mutex_destroy(&dp->non_pmd_mutex);
>     ovsthread_key_delete(dp->per_pmd_key);
>
>@@ -3302,12 +3318,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp-
>>port_mutex)
> }
>
> static void
>+reload_affected_pmds(struct dp_netdev *dp)
>+{
>+    struct dp_netdev_pmd_thread *pmd;
>+
>+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>+        if (pmd->need_reload) {
>+            dp_netdev_reload_pmd__(pmd);
>+            pmd->need_reload = false;
>+        }
>+    }
>+}
>+
>+static void
> reconfigure_pmd_threads(struct dp_netdev *dp)
>     OVS_REQUIRES(dp->port_mutex)
> {
>     struct dp_netdev_pmd_thread *pmd;
>     struct ovs_numa_dump *pmd_cores;
>+    struct ovs_numa_info_core *core;
>+    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
>+    struct hmapx_node *node;
>     bool changed = false;
>+    bool need_to_adjust_static_tx_qids = false;
>
>     /* The pmd threads should be started only if there's a pmd port in the
>      * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
>@@ -3320,40 +3353,61 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
>         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
>     }
>
>-    /* Check for changed configuration */
>-    if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - 1) {
>-        changed = true;
>-    } else {
>-        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>-            if (pmd->core_id != NON_PMD_CORE_ID
>-                && !ovs_numa_dump_contains_core(pmd_cores,
>-                                                pmd->numa_id,
>-                                                pmd->core_id)) {
>-                changed = true;
>-                break;
>-            }
>+    /* We need to adjust 'static_tx_qid's only if we're reducing number of
>+     * PMD threads. Otherwise, new threads will allocate all the freed ids. */

Can you elaborate on why the static_tx_qids need to be adjusted - it's still not clear to me from the comment.

>+    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
>+        need_to_adjust_static_tx_qids = true;
>+    }
>+
>+    /* Check for unwanted pmd threads */
>+    CMAP_FOR_EACH(pmd, node, &dp->poll_threads) {
>+        if (pmd->core_id == NON_PMD_CORE_ID) {
>+            continue;
>+        }
>+        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
>+                                                    pmd->core_id)) {
>+            hmapx_add(&to_delete, pmd);
>+        } else if (need_to_adjust_static_tx_qids) {
>+            pmd->need_reload = true;
>         }
>     }
>
>-    /* Destroy the old and recreate the new pmd threads.  We don't perform an
>-     * incremental update because we would have to adjust 'static_tx_qid'. */
>-    if (changed) {
>-        struct ovs_numa_info_core *core;
>-        struct ovs_numa_info_numa *numa;
>+    HMAPX_FOR_EACH (node, &to_delete) {
>+        pmd = (struct dp_netdev_pmd_thread *) node->data;
>+        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
>+                  pmd->numa_id, pmd->core_id);
>+        dp_netdev_del_pmd(dp, pmd);
>+    }
>+    changed = !hmapx_is_empty(&to_delete);
>+    hmapx_destroy(&to_delete);
>
>-        /* Do not destroy the non pmd thread. */
>-        dp_netdev_destroy_all_pmds(dp, false);
>-        FOR_EACH_CORE_ON_DUMP (core, pmd_cores) {
>-            struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
>+    if (need_to_adjust_static_tx_qids) {
>+        /* 'static_tx_qid's are not sequential now.
>+         * Reload remaining threads to fix this. */
>+        reload_affected_pmds(dp);
>+    }
>
>+    /* Check for required new pmd threads */
>+    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
>+        pmd = dp_netdev_get_pmd(dp, core->core_id);
>+        if (!pmd) {
>+            pmd = xzalloc(sizeof *pmd);
>             dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
>-
>             pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
>+            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
>+                      pmd->numa_id, pmd->core_id);
>+            changed = true;
>+        } else {
>+            dp_netdev_pmd_unref(pmd);
>         }
>+    }
>+
>+    if (changed) {
>+        struct ovs_numa_info_numa *numa;
>
>         /* Log the number of pmd threads per numa node. */
>         FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
>-            VLOG_INFO("Created %"PRIuSIZE" pmd threads on numa node %d",
>+            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
>                       numa->n_cores, numa->numa_id);
>         }
>     }
>@@ -3362,19 +3416,6 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
> }
>
> static void
>-reload_affected_pmds(struct dp_netdev *dp)
>-{
>-    struct dp_netdev_pmd_thread *pmd;
>-
>-    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>-        if (pmd->need_reload) {
>-            dp_netdev_reload_pmd__(pmd);
>-            pmd->need_reload = false;
>-        }
>-    }
>-}
>-
>-static void
> pmd_remove_stale_ports(struct dp_netdev *dp,
>                        struct dp_netdev_pmd_thread *pmd)
>     OVS_EXCLUDED(pmd->port_mutex)
>@@ -3673,6 +3714,28 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
>     }
> }
>
>+static void
>+pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
>+{
>+    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
>+    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
>+        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
>+                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
>+    }
>+    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
>+
>+    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
>+             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
>+}
>+
>+static void
>+pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
>+{
>+    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
>+    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
>+    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
>+}
>+
> static int
> pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
>                           struct polled_queue **ppoll_list)
>@@ -3718,6 +3781,7 @@ pmd_thread_main(void *f_)
>     dpdk_set_lcore_id(pmd->core_id);
>     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
> reload:
>+    pmd_alloc_static_tx_qid(pmd);
>     emc_cache_init(&pmd->flow_cache);
>
>     /* List port/core affinity */
>@@ -3766,6 +3830,7 @@ reload:
>     dp_netdev_pmd_reload_done(pmd);
>
>     emc_cache_uninit(&pmd->flow_cache);
>+    pmd_free_static_tx_qid(pmd);
>
>     if (!exiting) {
>         goto reload;
>@@ -4176,8 +4241,6 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct
>dp_netdev *dp,
>     pmd->numa_id = numa_id;
>     pmd->need_reload = false;
>
>-    *CONST_CAST(int *, &pmd->static_tx_qid) = cmap_count(&dp->poll_threads);
>-
>     ovs_refcount_init(&pmd->ref_cnt);
>     latch_init(&pmd->exit_latch);
>     pmd->reload_seq = seq_create();
>@@ -4198,6 +4261,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct
>dp_netdev *dp,
>      * actual thread created for NON_PMD_CORE_ID. */
>     if (core_id == NON_PMD_CORE_ID) {
>         emc_cache_init(&pmd->flow_cache);
>+        pmd_alloc_static_tx_qid(pmd);
>     }
>     cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
>                 hash_int(core_id, 0));
>@@ -4240,6 +4304,7 @@ dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread
>*pmd)
>         ovs_mutex_lock(&dp->non_pmd_mutex);
>         emc_cache_uninit(&pmd->flow_cache);
>         pmd_free_cached_ports(pmd);
>+        pmd_free_static_tx_qid(pmd);
>         ovs_mutex_unlock(&dp->non_pmd_mutex);
>     } else {
>         latch_set(&pmd->exit_latch);
>diff --git a/tests/pmd.at b/tests/pmd.at
>index 05755b3..39a3645 100644
>--- a/tests/pmd.at
>+++ b/tests/pmd.at
>@@ -38,7 +38,7 @@ dnl
> dnl Whaits for creation of 'n_threads' or at least 1 thread if $1 not
> dnl passed. Checking starts from line number 'line' in ovs-vswithd.log .
> m4_define([CHECK_PMD_THREADS_CREATED], [
>-    PATTERN="Created [[0-9]]* pmd threads on numa node $2"
>+    PATTERN="There are [[0-9]]* pmd threads on numa node $2"
>     line_st=$3
>     if [[ -z "$line_st" ]]
>     then
>--
>2.7.4
>
>_______________________________________________
>dev mailing list
>dev@openvswitch.org
>https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Ilya Maximets June 5, 2017, 8:17 a.m. UTC | #2
Thanks for testing, Mark.

Comments inline.

Best regards, Ilya Maximets.

On 01.06.2017 17:30, Kavanagh, Mark B wrote:
>> From: ovs-dev-bounces@openvswitch.org [mailto:ovs-dev-bounces@openvswitch.org] On Behalf Of
>> Ilya Maximets
>> Sent: Tuesday, May 30, 2017 3:12 PM
>> To: dev@openvswitch.org; Darrell Ball <dball@vmware.com>
>> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Ilya Maximets <i.maximets@samsung.com>
>> Subject: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental addition/deletion of PMD threads.
>>
>> Currently, change of 'pmd-cpu-mask' is very heavy operation.
>> It requires destroying of all the PMD threads and creating	
>> them back. After that, all the threads will sleep until
>> ports' redistribution finished.
>>
>> This patch adds ability to not stop the datapath while
>> adjusting number/placement of PMD threads. All not affected
>> threads will forward traffic without any additional latencies.
>>
>> id-pool created for static tx queue ids to keep them sequential
>> in a flexible way. non-PMD thread will always have
>> static_tx_qid = 0 as it was before.
>>
>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> 
> Hi Ilya,
> 
> Patch LGTM - just once comment inline.
> 
> I conducted some light testing and observed the following:
>     - temporary dip in forwarding rate (~50%) when modifying pmd-cpu-mask (in previous implementation, forwarding stopped completely)
>         - occasional drop to zero in forwarding rate when modifying pmd-cpu-mask (e.g. change from '2' to '12')

I guess, drop to zero in this case could be fixed by second
patch of the series. (If it caused by port reconfiguration)

>     - temporary drop of ~1mpps when adding/deleting port (similar to existing implementation) 
> 
> I've also conducted the following sanity checks on the patch, and found no issues:
>  - checkpatch.py
>  - compilation with clang
>  - compilation with gcc
>  - sparse
>  - make check (no additional tests fail after application of this patch)
> 
> Tested-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> 
> Thanks,
> Mark
> 
>> ---
>> lib/dpif-netdev.c | 143 +++++++++++++++++++++++++++++++++++++++---------------
>> tests/pmd.at      |   2 +-
>> 2 files changed, 105 insertions(+), 40 deletions(-)
>>
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index b50164b..596d133 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -48,6 +48,7 @@
>> #include "fat-rwlock.h"
>> #include "flow.h"
>> #include "hmapx.h"
>> +#include "id-pool.h"
>> #include "latch.h"
>> #include "netdev.h"
>> #include "netdev-vport.h"
>> @@ -277,6 +278,9 @@ struct dp_netdev {
>>
>>     /* Stores all 'struct dp_netdev_pmd_thread's. */
>>     struct cmap poll_threads;
>> +    /* id pool for per thread static_tx_qid. */
>> +    struct id_pool *tx_qid_pool;
>> +    struct ovs_mutex tx_qid_pool_mutex;
>>
>>     /* Protects the access of the 'struct dp_netdev_pmd_thread'
>>      * instance for non-pmd thread. */
>> @@ -562,7 +566,7 @@ struct dp_netdev_pmd_thread {
>>     /* Queue id used by this pmd thread to send packets on all netdevs if
>>      * XPS disabled for this netdev. All static_tx_qid's are unique and less
>>      * than 'cmap_count(dp->poll_threads)'. */
>> -    const int static_tx_qid;
>> +    uint32_t static_tx_qid;
>>
>>     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>>     /* List of rx queues to poll. */
>> @@ -642,6 +646,8 @@ static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev
>> *dp,
>>                                                       unsigned core_id);
>> static struct dp_netdev_pmd_thread *
>> dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
>> +static void dp_netdev_del_pmd(struct dp_netdev *dp,
>> +                              struct dp_netdev_pmd_thread *pmd);
>> static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
>> static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
>> static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
>> @@ -1181,10 +1187,17 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
>>     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
>>
>>     cmap_init(&dp->poll_threads);
>> +
>> +    ovs_mutex_init(&dp->tx_qid_pool_mutex);
>> +    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
>> +    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
>> +
>>     ovs_mutex_init_recursive(&dp->non_pmd_mutex);
>>     ovsthread_key_create(&dp->per_pmd_key, NULL);
>>
>>     ovs_mutex_lock(&dp->port_mutex);
>> +    /* non-PMD will be created before all other threads and will
>> +     * allocate static_tx_qid = 0. */
>>     dp_netdev_set_nonpmd(dp);
>>
>>     error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
>> @@ -1279,6 +1292,9 @@ dp_netdev_free(struct dp_netdev *dp)
>>     dp_netdev_destroy_all_pmds(dp, true);
>>     cmap_destroy(&dp->poll_threads);
>>
>> +    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
>> +    id_pool_destroy(dp->tx_qid_pool);
>> +
>>     ovs_mutex_destroy(&dp->non_pmd_mutex);
>>     ovsthread_key_delete(dp->per_pmd_key);
>>
>> @@ -3302,12 +3318,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp-
>>> port_mutex)
>> }
>>
>> static void
>> +reload_affected_pmds(struct dp_netdev *dp)
>> +{
>> +    struct dp_netdev_pmd_thread *pmd;
>> +
>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>> +        if (pmd->need_reload) {
>> +            dp_netdev_reload_pmd__(pmd);
>> +            pmd->need_reload = false;
>> +        }
>> +    }
>> +}
>> +
>> +static void
>> reconfigure_pmd_threads(struct dp_netdev *dp)
>>     OVS_REQUIRES(dp->port_mutex)
>> {
>>     struct dp_netdev_pmd_thread *pmd;
>>     struct ovs_numa_dump *pmd_cores;
>> +    struct ovs_numa_info_core *core;
>> +    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
>> +    struct hmapx_node *node;
>>     bool changed = false;
>> +    bool need_to_adjust_static_tx_qids = false;
>>
>>     /* The pmd threads should be started only if there's a pmd port in the
>>      * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
>> @@ -3320,40 +3353,61 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
>>         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
>>     }
>>
>> -    /* Check for changed configuration */
>> -    if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - 1) {
>> -        changed = true;
>> -    } else {
>> -        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>> -            if (pmd->core_id != NON_PMD_CORE_ID
>> -                && !ovs_numa_dump_contains_core(pmd_cores,
>> -                                                pmd->numa_id,
>> -                                                pmd->core_id)) {
>> -                changed = true;
>> -                break;
>> -            }
>> +    /* We need to adjust 'static_tx_qid's only if we're reducing number of
>> +     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
> 
> Can you elaborate on why the static_tx_qids need to be adjusted - it's still not clear to me from the comment.

I think, currently, the main case for keeping tx queue ids sequential
is vhost-user ports with a large number of queues, some of which are
disabled by the driver in the guest.

If a vhost-user device has enough queues to avoid dynamic
distribution (XPS) and some of its queues are disabled inside the
guest, we will face almost the same issue as described in
347ba9bb9b7a ("dpif-netdev: Unique and sequential tx_qids.")

For example:

	We may have 4 threads and vhost-user port with 4 queues.
	Lets assume that static tx qids distributed as follows:
		thread #0 --> queue #0
		thread #1 --> queue #1
		thread #2 --> queue #3
		thread #3 --> queue #2
	Now guest disables queue #3 (ethtool -L eth0 combined 3)
	And we're manually changing pmd-cpu-mask to remove thread #3.
	(You need to be sure that the threads weren't reloaded here.)
	At this point, if the tx queue ids weren't adjusted, we will have:
		thread #0 --> queue #0 --> queue #0
		thread #1 --> queue #1 --> queue #1
		thread #2 --> queue #3 --> queue #0
	Where the last column is the result of remapping inside
	__netdev_dpdk_vhost_send().
	So, in this case, there will be 3 queues available, but
	only 2 used by 3 threads. --> unused queue #2 and locking
	on access to queue #0.

>> +    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
>> +        need_to_adjust_static_tx_qids = true;
>> +    }
>> +
>> +    /* Check for unwanted pmd threads */
>> +    CMAP_FOR_EACH(pmd, node, &dp->poll_threads) {
>> +        if (pmd->core_id == NON_PMD_CORE_ID) {
>> +            continue;
>> +        }
>> +        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
>> +                                                    pmd->core_id)) {
>> +            hmapx_add(&to_delete, pmd);
>> +        } else if (need_to_adjust_static_tx_qids) {
>> +            pmd->need_reload = true;
>>         }
>>     }
>>
>> -    /* Destroy the old and recreate the new pmd threads.  We don't perform an
>> -     * incremental update because we would have to adjust 'static_tx_qid'. */
>> -    if (changed) {
>> -        struct ovs_numa_info_core *core;
>> -        struct ovs_numa_info_numa *numa;
>> +    HMAPX_FOR_EACH (node, &to_delete) {
>> +        pmd = (struct dp_netdev_pmd_thread *) node->data;
>> +        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
>> +                  pmd->numa_id, pmd->core_id);
>> +        dp_netdev_del_pmd(dp, pmd);
>> +    }
>> +    changed = !hmapx_is_empty(&to_delete);
>> +    hmapx_destroy(&to_delete);
>>
>> -        /* Do not destroy the non pmd thread. */
>> -        dp_netdev_destroy_all_pmds(dp, false);
>> -        FOR_EACH_CORE_ON_DUMP (core, pmd_cores) {
>> -            struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
>> +    if (need_to_adjust_static_tx_qids) {
>> +        /* 'static_tx_qid's are not sequential now.
>> +         * Reload remaining threads to fix this. */
>> +        reload_affected_pmds(dp);
>> +    }
>>
>> +    /* Check for required new pmd threads */
>> +    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
>> +        pmd = dp_netdev_get_pmd(dp, core->core_id);
>> +        if (!pmd) {
>> +            pmd = xzalloc(sizeof *pmd);
>>             dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
>> -
>>             pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
>> +            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
>> +                      pmd->numa_id, pmd->core_id);
>> +            changed = true;
>> +        } else {
>> +            dp_netdev_pmd_unref(pmd);
>>         }
>> +    }
>> +
>> +    if (changed) {
>> +        struct ovs_numa_info_numa *numa;
>>
>>         /* Log the number of pmd threads per numa node. */
>>         FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
>> -            VLOG_INFO("Created %"PRIuSIZE" pmd threads on numa node %d",
>> +            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
>>                       numa->n_cores, numa->numa_id);
>>         }
>>     }
>> @@ -3362,19 +3416,6 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
>> }
>>
>> static void
>> -reload_affected_pmds(struct dp_netdev *dp)
>> -{
>> -    struct dp_netdev_pmd_thread *pmd;
>> -
>> -    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>> -        if (pmd->need_reload) {
>> -            dp_netdev_reload_pmd__(pmd);
>> -            pmd->need_reload = false;
>> -        }
>> -    }
>> -}
>> -
>> -static void
>> pmd_remove_stale_ports(struct dp_netdev *dp,
>>                        struct dp_netdev_pmd_thread *pmd)
>>     OVS_EXCLUDED(pmd->port_mutex)
>> @@ -3673,6 +3714,28 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
>>     }
>> }
>>
>> +static void
>> +pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
>> +{
>> +    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
>> +    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
>> +        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
>> +                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
>> +    }
>> +    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
>> +
>> +    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
>> +             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
>> +}
>> +
>> +static void
>> +pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
>> +{
>> +    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
>> +    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
>> +    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
>> +}
>> +
>> static int
>> pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
>>                           struct polled_queue **ppoll_list)
>> @@ -3718,6 +3781,7 @@ pmd_thread_main(void *f_)
>>     dpdk_set_lcore_id(pmd->core_id);
>>     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>> reload:
>> +    pmd_alloc_static_tx_qid(pmd);
>>     emc_cache_init(&pmd->flow_cache);
>>
>>     /* List port/core affinity */
>> @@ -3766,6 +3830,7 @@ reload:
>>     dp_netdev_pmd_reload_done(pmd);
>>
>>     emc_cache_uninit(&pmd->flow_cache);
>> +    pmd_free_static_tx_qid(pmd);
>>
>>     if (!exiting) {
>>         goto reload;
>> @@ -4176,8 +4241,6 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct
>> dp_netdev *dp,
>>     pmd->numa_id = numa_id;
>>     pmd->need_reload = false;
>>
>> -    *CONST_CAST(int *, &pmd->static_tx_qid) = cmap_count(&dp->poll_threads);
>> -
>>     ovs_refcount_init(&pmd->ref_cnt);
>>     latch_init(&pmd->exit_latch);
>>     pmd->reload_seq = seq_create();
>> @@ -4198,6 +4261,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct
>> dp_netdev *dp,
>>      * actual thread created for NON_PMD_CORE_ID. */
>>     if (core_id == NON_PMD_CORE_ID) {
>>         emc_cache_init(&pmd->flow_cache);
>> +        pmd_alloc_static_tx_qid(pmd);
>>     }
>>     cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
>>                 hash_int(core_id, 0));
>> @@ -4240,6 +4304,7 @@ dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread
>> *pmd)
>>         ovs_mutex_lock(&dp->non_pmd_mutex);
>>         emc_cache_uninit(&pmd->flow_cache);
>>         pmd_free_cached_ports(pmd);
>> +        pmd_free_static_tx_qid(pmd);
>>         ovs_mutex_unlock(&dp->non_pmd_mutex);
>>     } else {
>>         latch_set(&pmd->exit_latch);
>> diff --git a/tests/pmd.at b/tests/pmd.at
>> index 05755b3..39a3645 100644
>> --- a/tests/pmd.at
>> +++ b/tests/pmd.at
>> @@ -38,7 +38,7 @@ dnl
>> dnl Whaits for creation of 'n_threads' or at least 1 thread if $1 not
>> dnl passed. Checking starts from line number 'line' in ovs-vswithd.log .
>> m4_define([CHECK_PMD_THREADS_CREATED], [
>> -    PATTERN="Created [[0-9]]* pmd threads on numa node $2"
>> +    PATTERN="There are [[0-9]]* pmd threads on numa node $2"
>>     line_st=$3
>>     if [[ -z "$line_st" ]]
>>     then
>> --
>> 2.7.4
>>
>> _______________________________________________
>> dev mailing list
>> dev@openvswitch.org
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> 
> 
>
Mark Kavanagh June 6, 2017, 10:02 a.m. UTC | #3
>From: Ilya Maximets [mailto:i.maximets@samsung.com]
>Sent: Monday, June 5, 2017 9:18 AM
>To: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; dev@openvswitch.org; Darrell Ball
><dball@vmware.com>
>Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Daniele Di Proietto <diproiettod@ovn.org>; Ben
>Pfaff <blp@ovn.org>; Pravin Shelar <pshelar@ovn.org>; Loftus, Ciara <ciara.loftus@intel.com>;
>Stokes, Ian <ian.stokes@intel.com>; Kevin Traynor <ktraynor@redhat.com>
>Subject: Re: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental addition/deletion of PMD
>threads.
>
>Thanks for testing, Mark.
>
>Comments inline.
>
>Best regards, Ilya Maximets.
>
>On 01.06.2017 17:30, Kavanagh, Mark B wrote:
>>> From: ovs-dev-bounces@openvswitch.org [mailto:ovs-dev-bounces@openvswitch.org] On Behalf
>Of
>>> Ilya Maximets
>>> Sent: Tuesday, May 30, 2017 3:12 PM
>>> To: dev@openvswitch.org; Darrell Ball <dball@vmware.com>
>>> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Ilya Maximets <i.maximets@samsung.com>
>>> Subject: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental addition/deletion of PMD
>threads.
>>>
>>> Currently, change of 'pmd-cpu-mask' is very heavy operation.
>>> It requires destroying of all the PMD threads and creating
>>> them back. After that, all the threads will sleep until
>>> ports' redistribution finished.
>>>
>>> This patch adds ability to not stop the datapath while
>>> adjusting number/placement of PMD threads. All not affected
>>> threads will forward traffic without any additional latencies.
>>>
>>> id-pool created for static tx queue ids to keep them sequential
>>> in a flexible way. non-PMD thread will always have
>>> static_tx_qid = 0 as it was before.
>>>
>>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
>>
>> Hi Ilya,
>>
>> Patch LGTM - just once comment inline.
>>
>> I conducted some light testing and observed the following:
>>     - temporary dip in forwarding rate (~50%) when modifying pmd-cpu-mask (in previous
>implementation, forwarding stopped completely)
>>         - occasional drop to zero in forwarding rate when modifying pmd-cpu-mask (e.g.
>change from '2' to '12')
>
>I guess, drop to zero in this case could be fixed by second
>patch of the series. (If it caused by port reconfiguration)

Yes, that's true - my colleague, Ian Stokes, will review and test that patch shortly.

>
>>     - temporary drop of ~1mpps when adding/deleting port (similar to existing
>implementation)
>>
>> I've also conducted the following sanity checks on the patch, and found no issues:
>>  - checkpatch.py
>>  - compilation with clang
>>  - compilation with gcc
>>  - sparse
>>  - make check (no additional tests fail after application of this patch)
>>
>> Tested-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
>>
>> Thanks,
>> Mark
>>
>>> ---
>>> lib/dpif-netdev.c | 143 +++++++++++++++++++++++++++++++++++++++---------------
>>> tests/pmd.at      |   2 +-
>>> 2 files changed, 105 insertions(+), 40 deletions(-)
>>>
>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>>> index b50164b..596d133 100644
>>> --- a/lib/dpif-netdev.c
>>> +++ b/lib/dpif-netdev.c
>>> @@ -48,6 +48,7 @@
>>> #include "fat-rwlock.h"
>>> #include "flow.h"
>>> #include "hmapx.h"
>>> +#include "id-pool.h"
>>> #include "latch.h"
>>> #include "netdev.h"
>>> #include "netdev-vport.h"
>>> @@ -277,6 +278,9 @@ struct dp_netdev {
>>>
>>>     /* Stores all 'struct dp_netdev_pmd_thread's. */
>>>     struct cmap poll_threads;
>>> +    /* id pool for per thread static_tx_qid. */
>>> +    struct id_pool *tx_qid_pool;
>>> +    struct ovs_mutex tx_qid_pool_mutex;
>>>
>>>     /* Protects the access of the 'struct dp_netdev_pmd_thread'
>>>      * instance for non-pmd thread. */
>>> @@ -562,7 +566,7 @@ struct dp_netdev_pmd_thread {
>>>     /* Queue id used by this pmd thread to send packets on all netdevs if
>>>      * XPS disabled for this netdev. All static_tx_qid's are unique and less
>>>      * than 'cmap_count(dp->poll_threads)'. */
>>> -    const int static_tx_qid;
>>> +    uint32_t static_tx_qid;
>>>
>>>     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>>>     /* List of rx queues to poll. */
>>> @@ -642,6 +646,8 @@ static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev
>>> *dp,
>>>                                                       unsigned core_id);
>>> static struct dp_netdev_pmd_thread *
>>> dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
>>> +static void dp_netdev_del_pmd(struct dp_netdev *dp,
>>> +                              struct dp_netdev_pmd_thread *pmd);
>>> static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
>>> static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
>>> static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
>>> @@ -1181,10 +1187,17 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
>>>     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
>>>
>>>     cmap_init(&dp->poll_threads);
>>> +
>>> +    ovs_mutex_init(&dp->tx_qid_pool_mutex);
>>> +    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
>>> +    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
>>> +
>>>     ovs_mutex_init_recursive(&dp->non_pmd_mutex);
>>>     ovsthread_key_create(&dp->per_pmd_key, NULL);
>>>
>>>     ovs_mutex_lock(&dp->port_mutex);
>>> +    /* non-PMD will be created before all other threads and will
>>> +     * allocate static_tx_qid = 0. */
>>>     dp_netdev_set_nonpmd(dp);
>>>
>>>     error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
>>> @@ -1279,6 +1292,9 @@ dp_netdev_free(struct dp_netdev *dp)
>>>     dp_netdev_destroy_all_pmds(dp, true);
>>>     cmap_destroy(&dp->poll_threads);
>>>
>>> +    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
>>> +    id_pool_destroy(dp->tx_qid_pool);
>>> +
>>>     ovs_mutex_destroy(&dp->non_pmd_mutex);
>>>     ovsthread_key_delete(dp->per_pmd_key);
>>>
>>> @@ -3302,12 +3318,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp-
>>>> port_mutex)
>>> }
>>>
>>> static void
>>> +reload_affected_pmds(struct dp_netdev *dp)
>>> +{
>>> +    struct dp_netdev_pmd_thread *pmd;
>>> +
>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>> +        if (pmd->need_reload) {
>>> +            dp_netdev_reload_pmd__(pmd);
>>> +            pmd->need_reload = false;
>>> +        }
>>> +    }
>>> +}
>>> +
>>> +static void
>>> reconfigure_pmd_threads(struct dp_netdev *dp)
>>>     OVS_REQUIRES(dp->port_mutex)
>>> {
>>>     struct dp_netdev_pmd_thread *pmd;
>>>     struct ovs_numa_dump *pmd_cores;
>>> +    struct ovs_numa_info_core *core;
>>> +    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
>>> +    struct hmapx_node *node;
>>>     bool changed = false;
>>> +    bool need_to_adjust_static_tx_qids = false;
>>>
>>>     /* The pmd threads should be started only if there's a pmd port in the
>>>      * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
>>> @@ -3320,40 +3353,61 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
>>>         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
>>>     }
>>>
>>> -    /* Check for changed configuration */
>>> -    if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - 1) {
>>> -        changed = true;
>>> -    } else {
>>> -        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>> -            if (pmd->core_id != NON_PMD_CORE_ID
>>> -                && !ovs_numa_dump_contains_core(pmd_cores,
>>> -                                                pmd->numa_id,
>>> -                                                pmd->core_id)) {
>>> -                changed = true;
>>> -                break;
>>> -            }
>>> +    /* We need to adjust 'static_tx_qid's only if we're reducing number of
>>> +     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
>>
>> Can you elaborate on why the static_tx_qids need to be adjusted - it's still not clear to
>me from the comment.
>
>I think, currently, the main case for keeping tx queue ids sequential
>is the vhost-user ports with big number of queues but disabled by
>driver in guest.
>
>If vhost-user device has enough number of queues to not use dynamic
>distribution (XPS) and some of queues will be disabled inside the
>guest, we will face almost same issue as described in
>347ba9bb9b7a ("dpif-netdev: Unique and sequential tx_qids.")
>
>For example:
>
>	We may have 4 threads and vhost-user port with 4 queues.
>	Let's assume that static tx qids are distributed as follows:
>		thread #0 --> queue #0
>		thread #1 --> queue #1
>		thread #2 --> queue #3
>		thread #3 --> queue #2
>	Now guest disables queue #3 (ethtool -L eth0 combined 3)
>	And we're manually changing pmd-cpu-mask to remove thread #3.
>	(You need to be sure that threads weren't reloaded here.)
>	At this point, if tx queue ids weren't adjusted, we will have:
>		thread #0 --> queue #0 --> queue #0
>		thread #1 --> queue #1 --> queue #1
>		thread #2 --> queue #3 --> queue #0
>	Where the last column is the result of remapping inside
>	__netdev_dpdk_vhost_send().
>	So, in this case, there will be 3 queues available, but
>	only 2 used by 3 threads. --> unused queue #2 and locking
>	on access to queue #0.

Okay, I got it - thanks for the detailed explanation Ilya.

With that, Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com> 


>
>>> +    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
>>> +        need_to_adjust_static_tx_qids = true;
>>> +    }
>>> +
>>> +    /* Check for unwanted pmd threads */
>>> +    CMAP_FOR_EACH(pmd, node, &dp->poll_threads) {
>>> +        if (pmd->core_id == NON_PMD_CORE_ID) {
>>> +            continue;
>>> +        }
>>> +        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
>>> +                                                    pmd->core_id)) {
>>> +            hmapx_add(&to_delete, pmd);
>>> +        } else if (need_to_adjust_static_tx_qids) {
>>> +            pmd->need_reload = true;
>>>         }
>>>     }
>>>
>>> -    /* Destroy the old and recreate the new pmd threads.  We don't perform an
>>> -     * incremental update because we would have to adjust 'static_tx_qid'. */
>>> -    if (changed) {
>>> -        struct ovs_numa_info_core *core;
>>> -        struct ovs_numa_info_numa *numa;
>>> +    HMAPX_FOR_EACH (node, &to_delete) {
>>> +        pmd = (struct dp_netdev_pmd_thread *) node->data;
>>> +        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
>>> +                  pmd->numa_id, pmd->core_id);
>>> +        dp_netdev_del_pmd(dp, pmd);
>>> +    }
>>> +    changed = !hmapx_is_empty(&to_delete);
>>> +    hmapx_destroy(&to_delete);
>>>
>>> -        /* Do not destroy the non pmd thread. */
>>> -        dp_netdev_destroy_all_pmds(dp, false);
>>> -        FOR_EACH_CORE_ON_DUMP (core, pmd_cores) {
>>> -            struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
>>> +    if (need_to_adjust_static_tx_qids) {
>>> +        /* 'static_tx_qid's are not sequential now.
>>> +         * Reload remaining threads to fix this. */
>>> +        reload_affected_pmds(dp);
>>> +    }
>>>
>>> +    /* Check for required new pmd threads */
>>> +    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
>>> +        pmd = dp_netdev_get_pmd(dp, core->core_id);
>>> +        if (!pmd) {
>>> +            pmd = xzalloc(sizeof *pmd);
>>>             dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
>>> -
>>>             pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
>>> +            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
>>> +                      pmd->numa_id, pmd->core_id);
>>> +            changed = true;
>>> +        } else {
>>> +            dp_netdev_pmd_unref(pmd);
>>>         }
>>> +    }
>>> +
>>> +    if (changed) {
>>> +        struct ovs_numa_info_numa *numa;
>>>
>>>         /* Log the number of pmd threads per numa node. */
>>>         FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
>>> -            VLOG_INFO("Created %"PRIuSIZE" pmd threads on numa node %d",
>>> +            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
>>>                       numa->n_cores, numa->numa_id);
>>>         }
>>>     }
>>> @@ -3362,19 +3416,6 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
>>> }
>>>
>>> static void
>>> -reload_affected_pmds(struct dp_netdev *dp)
>>> -{
>>> -    struct dp_netdev_pmd_thread *pmd;
>>> -
>>> -    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>> -        if (pmd->need_reload) {
>>> -            dp_netdev_reload_pmd__(pmd);
>>> -            pmd->need_reload = false;
>>> -        }
>>> -    }
>>> -}
>>> -
>>> -static void
>>> pmd_remove_stale_ports(struct dp_netdev *dp,
>>>                        struct dp_netdev_pmd_thread *pmd)
>>>     OVS_EXCLUDED(pmd->port_mutex)
>>> @@ -3673,6 +3714,28 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
>>>     }
>>> }
>>>
>>> +static void
>>> +pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
>>> +{
>>> +    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
>>> +    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
>>> +        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
>>> +                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
>>> +    }
>>> +    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
>>> +
>>> +    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
>>> +             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
>>> +}
>>> +
>>> +static void
>>> +pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
>>> +{
>>> +    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
>>> +    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
>>> +    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
>>> +}
>>> +
>>> static int
>>> pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
>>>                           struct polled_queue **ppoll_list)
>>> @@ -3718,6 +3781,7 @@ pmd_thread_main(void *f_)
>>>     dpdk_set_lcore_id(pmd->core_id);
>>>     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>>> reload:
>>> +    pmd_alloc_static_tx_qid(pmd);
>>>     emc_cache_init(&pmd->flow_cache);
>>>
>>>     /* List port/core affinity */
>>> @@ -3766,6 +3830,7 @@ reload:
>>>     dp_netdev_pmd_reload_done(pmd);
>>>
>>>     emc_cache_uninit(&pmd->flow_cache);
>>> +    pmd_free_static_tx_qid(pmd);
>>>
>>>     if (!exiting) {
>>>         goto reload;
>>> @@ -4176,8 +4241,6 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct
>>> dp_netdev *dp,
>>>     pmd->numa_id = numa_id;
>>>     pmd->need_reload = false;
>>>
>>> -    *CONST_CAST(int *, &pmd->static_tx_qid) = cmap_count(&dp->poll_threads);
>>> -
>>>     ovs_refcount_init(&pmd->ref_cnt);
>>>     latch_init(&pmd->exit_latch);
>>>     pmd->reload_seq = seq_create();
>>> @@ -4198,6 +4261,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct
>>> dp_netdev *dp,
>>>      * actual thread created for NON_PMD_CORE_ID. */
>>>     if (core_id == NON_PMD_CORE_ID) {
>>>         emc_cache_init(&pmd->flow_cache);
>>> +        pmd_alloc_static_tx_qid(pmd);
>>>     }
>>>     cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
>>>                 hash_int(core_id, 0));
>>> @@ -4240,6 +4304,7 @@ dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread
>>> *pmd)
>>>         ovs_mutex_lock(&dp->non_pmd_mutex);
>>>         emc_cache_uninit(&pmd->flow_cache);
>>>         pmd_free_cached_ports(pmd);
>>> +        pmd_free_static_tx_qid(pmd);
>>>         ovs_mutex_unlock(&dp->non_pmd_mutex);
>>>     } else {
>>>         latch_set(&pmd->exit_latch);
>>> diff --git a/tests/pmd.at b/tests/pmd.at
>>> index 05755b3..39a3645 100644
>>> --- a/tests/pmd.at
>>> +++ b/tests/pmd.at
>>> @@ -38,7 +38,7 @@ dnl
>>> dnl Whaits for creation of 'n_threads' or at least 1 thread if $1 not
>>> dnl passed. Checking starts from line number 'line' in ovs-vswithd.log .
>>> m4_define([CHECK_PMD_THREADS_CREATED], [
>>> -    PATTERN="Created [[0-9]]* pmd threads on numa node $2"
>>> +    PATTERN="There are [[0-9]]* pmd threads on numa node $2"
>>>     line_st=$3
>>>     if [[ -z "$line_st" ]]
>>>     then
>>> --
>>> 2.7.4
>>>
>>> _______________________________________________
>>> dev mailing list
>>> dev@openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>
>>
>>
Stokes, Ian June 15, 2017, 8:24 a.m. UTC | #4
> >From: Ilya Maximets [mailto:i.maximets@samsung.com]
> >Sent: Monday, June 5, 2017 9:18 AM
> >To: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; dev@openvswitch.org;
> >Darrell Ball <dball@vmware.com>
> >Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Daniele Di Proietto
> ><diproiettod@ovn.org>; Ben Pfaff <blp@ovn.org>; Pravin Shelar
> ><pshelar@ovn.org>; Loftus, Ciara <ciara.loftus@intel.com>; Stokes, Ian
> ><ian.stokes@intel.com>; Kevin Traynor <ktraynor@redhat.com>
> >Subject: Re: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental
> >addition/deletion of PMD threads.
> >
> >Thanks for testing, Mark.
> >
> >Comments inline.
> >
> >Best regards, Ilya Maximets.
> >
> >On 01.06.2017 17:30, Kavanagh, Mark B wrote:
> >>> From: ovs-dev-bounces@openvswitch.org
> >>> [mailto:ovs-dev-bounces@openvswitch.org] On Behalf
> >Of
> >>> Ilya Maximets
> >>> Sent: Tuesday, May 30, 2017 3:12 PM
> >>> To: dev@openvswitch.org; Darrell Ball <dball@vmware.com>
> >>> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Ilya Maximets
> >>> <i.maximets@samsung.com>
> >>> Subject: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental
> >>> addition/deletion of PMD
> >threads.
> >>>
> >>> Currently, change of 'pmd-cpu-mask' is very heavy operation.
> >>> It requires destroying of all the PMD threads and creating them
> >>> back. After that, all the threads will sleep until ports'
> >>> redistribution finished.
> >>>
> >>> This patch adds ability to not stop the datapath while adjusting
> >>> number/placement of PMD threads. All not affected threads will
> >>> forward traffic without any additional latencies.
> >>>
> >>> id-pool created for static tx queue ids to keep them sequential in a
> >>> flexible way. non-PMD thread will always have static_tx_qid = 0 as
> >>> it was before.
> >>>
> >>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
> >>
> >> Hi Ilya,
> >>
> >> Patch LGTM - just one comment inline.
> >>
> >> I conducted some light testing and observed the following:
> >>     - temporary dip in forwarding rate (~50%) when modifying
> >> pmd-cpu-mask (in previous
> >implementation, forwarding stopped completely)
> >>         - occasional drop to zero in forwarding rate when modifying
> pmd-cpu-mask (e.g.
> >change from '2' to '12')
> >
> >I guess, drop to zero in this case could be fixed by second patch of
> >the series. (If it caused by port reconfiguration)
> 
> Yes, that's true - my colleague, Ian Stokes, will review and test that
> patch shortly.

Just a follow up here, I've tested the avoid port reconfiguration patch and it removes the drop caused by reconfiguration.

https://mail.openvswitch.org/pipermail/ovs-dev/2017-June/334123.html

Ian
> 
> >
> >>     - temporary drop of ~1mpps when adding/deleting port (similar to
> >> existing
> >implementation)
> >>
> >> I've also conducted the following sanity checks on the patch, and found
> no issues:
> >>  - checkpatch.py
> >>  - compilation with clang
> >>  - compilation with gcc
> >>  - sparse
> >>  - make check (no additional tests fail after application of this
> >> patch)
> >>
> >> Tested-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> >>
> >> Thanks,
> >> Mark
> >>
> >>> ---
> >>> lib/dpif-netdev.c | 143 +++++++++++++++++++++++++++++++++++++++-------
> --------
> >>> tests/pmd.at      |   2 +-
> >>> 2 files changed, 105 insertions(+), 40 deletions(-)
> >>>
> >>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index
> >>> b50164b..596d133 100644
> >>> --- a/lib/dpif-netdev.c
> >>> +++ b/lib/dpif-netdev.c
> >>> @@ -48,6 +48,7 @@
> >>> #include "fat-rwlock.h"
> >>> #include "flow.h"
> >>> #include "hmapx.h"
> >>> +#include "id-pool.h"
> >>> #include "latch.h"
> >>> #include "netdev.h"
> >>> #include "netdev-vport.h"
> >>> @@ -277,6 +278,9 @@ struct dp_netdev {
> >>>
> >>>     /* Stores all 'struct dp_netdev_pmd_thread's. */
> >>>     struct cmap poll_threads;
> >>> +    /* id pool for per thread static_tx_qid. */
> >>> +    struct id_pool *tx_qid_pool;
> >>> +    struct ovs_mutex tx_qid_pool_mutex;
> >>>
> >>>     /* Protects the access of the 'struct dp_netdev_pmd_thread'
> >>>      * instance for non-pmd thread. */ @@ -562,7 +566,7 @@ struct
> >>> dp_netdev_pmd_thread {
> >>>     /* Queue id used by this pmd thread to send packets on all netdevs
> if
> >>>      * XPS disabled for this netdev. All static_tx_qid's are unique
> and less
> >>>      * than 'cmap_count(dp->poll_threads)'. */
> >>> -    const int static_tx_qid;
> >>> +    uint32_t static_tx_qid;
> >>>
> >>>     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and
> 'tx_ports'. */
> >>>     /* List of rx queues to poll. */ @@ -642,6 +646,8 @@ static
> >>> struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
> >>>                                                       unsigned
> >>> core_id); static struct dp_netdev_pmd_thread *
> >>> dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position
> >>> *pos);
> >>> +static void dp_netdev_del_pmd(struct dp_netdev *dp,
> >>> +                              struct dp_netdev_pmd_thread *pmd);
> >>> static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool
> >>> non_pmd); static void dp_netdev_pmd_clear_ports(struct
> >>> dp_netdev_pmd_thread *pmd); static void
> >>> dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd, @@ -
> 1181,10 +1187,17 @@ create_dp_netdev(const char *name, const struct
> dpif_class *class,
> >>>     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
> >>>
> >>>     cmap_init(&dp->poll_threads);
> >>> +
> >>> +    ovs_mutex_init(&dp->tx_qid_pool_mutex);
> >>> +    /* We need 1 Tx queue for each possible core + 1 for non-PMD
> threads. */
> >>> +    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() +
> >>> + 1);
> >>> +
> >>>     ovs_mutex_init_recursive(&dp->non_pmd_mutex);
> >>>     ovsthread_key_create(&dp->per_pmd_key, NULL);
> >>>
> >>>     ovs_mutex_lock(&dp->port_mutex);
> >>> +    /* non-PMD will be created before all other threads and will
> >>> +     * allocate static_tx_qid = 0. */
> >>>     dp_netdev_set_nonpmd(dp);
> >>>
> >>>     error = do_add_port(dp, name,
> >>> dpif_netdev_port_open_type(dp->class,
> >>> @@ -1279,6 +1292,9 @@ dp_netdev_free(struct dp_netdev *dp)
> >>>     dp_netdev_destroy_all_pmds(dp, true);
> >>>     cmap_destroy(&dp->poll_threads);
> >>>
> >>> +    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
> >>> +    id_pool_destroy(dp->tx_qid_pool);
> >>> +
> >>>     ovs_mutex_destroy(&dp->non_pmd_mutex);
> >>>     ovsthread_key_delete(dp->per_pmd_key);
> >>>
> >>> @@ -3302,12 +3318,29 @@ rxq_scheduling(struct dp_netdev *dp, bool
> >>> pinned) OVS_REQUIRES(dp-
> >>>> port_mutex)
> >>> }
> >>>
> >>> static void
> >>> +reload_affected_pmds(struct dp_netdev *dp) {
> >>> +    struct dp_netdev_pmd_thread *pmd;
> >>> +
> >>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> >>> +        if (pmd->need_reload) {
> >>> +            dp_netdev_reload_pmd__(pmd);
> >>> +            pmd->need_reload = false;
> >>> +        }
> >>> +    }
> >>> +}
> >>> +
> >>> +static void
> >>> reconfigure_pmd_threads(struct dp_netdev *dp)
> >>>     OVS_REQUIRES(dp->port_mutex)
> >>> {
> >>>     struct dp_netdev_pmd_thread *pmd;
> >>>     struct ovs_numa_dump *pmd_cores;
> >>> +    struct ovs_numa_info_core *core;
> >>> +    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
> >>> +    struct hmapx_node *node;
> >>>     bool changed = false;
> >>> +    bool need_to_adjust_static_tx_qids = false;
> >>>
> >>>     /* The pmd threads should be started only if there's a pmd port in
> the
> >>>      * datapath.  If the user didn't provide any "pmd-cpu-mask", we
> >>> start @@ -3320,40 +3353,61 @@ reconfigure_pmd_threads(struct dp_netdev
> *dp)
> >>>         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
> >>>     }
> >>>
> >>> -    /* Check for changed configuration */
> >>> -    if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp-
> >poll_threads) - 1) {
> >>> -        changed = true;
> >>> -    } else {
> >>> -        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> >>> -            if (pmd->core_id != NON_PMD_CORE_ID
> >>> -                && !ovs_numa_dump_contains_core(pmd_cores,
> >>> -                                                pmd->numa_id,
> >>> -                                                pmd->core_id)) {
> >>> -                changed = true;
> >>> -                break;
> >>> -            }
> >>> +    /* We need to adjust 'static_tx_qid's only if we're reducing
> number of
> >>> +     * PMD threads. Otherwise, new threads will allocate all the
> >>> + freed ids. */
> >>
> >> Can you elaborate on why the static_tx_qids need to be adjusted -
> >> it's still not clear to
> >me from the comment.
> >
> >I think, currently, the main case for keeping tx queue ids sequential
> >is the vhost-user ports with big number of queues but disabled by
> >driver in guest.
> >
> >If vhost-user device has enough number of queues to not use dynamic
> >distribution (XPS) and some of queues will be disabled inside the
> >guest, we will face almost same issue as described in 347ba9bb9b7a
> >("dpif-netdev: Unique and sequential tx_qids.")
> >
> >For example:
> >
> >	We may have 4 threads and vhost-user port with 4 queues.
> >	Let's assume that static tx qids are distributed as follows:
> >		thread #0 --> queue #0
> >		thread #1 --> queue #1
> >		thread #2 --> queue #3
> >		thread #3 --> queue #2
> >	Now guest disables queue #3 (ethtool -L eth0 combined 3)
> >	And we're manually changing pmd-cpu-mask to remove thread #3.
> >	(You need to be sure that threads weren't reloaded here.)
> >	At this point, if tx queue ids weren't adjusted, we will have:
> >		thread #0 --> queue #0 --> queue #0
> >		thread #1 --> queue #1 --> queue #1
> >		thread #2 --> queue #3 --> queue #0
> >	Where the last column is the result of remapping inside
> >	__netdev_dpdk_vhost_send().
> >	So, in this case, there will be 3 queues available, but
> >	only 2 used by 3 threads. --> unused queue #2 and locking
> >	on access to queue #0.
> 
> Okay, I got it - thanks for the detailed explanation Ilya.
> 
> With that, Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
> 
> 
> >
> >>> +    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp-
> >poll_threads) - 1) {
> >>> +        need_to_adjust_static_tx_qids = true;
> >>> +    }
> >>> +
> >>> +    /* Check for unwanted pmd threads */
> >>> +    CMAP_FOR_EACH(pmd, node, &dp->poll_threads) {
> >>> +        if (pmd->core_id == NON_PMD_CORE_ID) {
> >>> +            continue;
> >>> +        }
> >>> +        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
> >>> +                                                    pmd->core_id)) {
> >>> +            hmapx_add(&to_delete, pmd);
> >>> +        } else if (need_to_adjust_static_tx_qids) {
> >>> +            pmd->need_reload = true;
> >>>         }
> >>>     }
> >>>
> >>> -    /* Destroy the old and recreate the new pmd threads.  We don't
> perform an
> >>> -     * incremental update because we would have to adjust
> 'static_tx_qid'. */
> >>> -    if (changed) {
> >>> -        struct ovs_numa_info_core *core;
> >>> -        struct ovs_numa_info_numa *numa;
> >>> +    HMAPX_FOR_EACH (node, &to_delete) {
> >>> +        pmd = (struct dp_netdev_pmd_thread *) node->data;
> >>> +        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d
> destroyed.",
> >>> +                  pmd->numa_id, pmd->core_id);
> >>> +        dp_netdev_del_pmd(dp, pmd);
> >>> +    }
> >>> +    changed = !hmapx_is_empty(&to_delete);
> >>> +    hmapx_destroy(&to_delete);
> >>>
> >>> -        /* Do not destroy the non pmd thread. */
> >>> -        dp_netdev_destroy_all_pmds(dp, false);
> >>> -        FOR_EACH_CORE_ON_DUMP (core, pmd_cores) {
> >>> -            struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
> >>> +    if (need_to_adjust_static_tx_qids) {
> >>> +        /* 'static_tx_qid's are not sequential now.
> >>> +         * Reload remaining threads to fix this. */
> >>> +        reload_affected_pmds(dp);
> >>> +    }
> >>>
> >>> +    /* Check for required new pmd threads */
> >>> +    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
> >>> +        pmd = dp_netdev_get_pmd(dp, core->core_id);
> >>> +        if (!pmd) {
> >>> +            pmd = xzalloc(sizeof *pmd);
> >>>             dp_netdev_configure_pmd(pmd, dp, core->core_id,
> >>> core->numa_id);
> >>> -
> >>>             pmd->thread = ovs_thread_create("pmd", pmd_thread_main,
> >>> pmd);
> >>> +            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d
> created.",
> >>> +                      pmd->numa_id, pmd->core_id);
> >>> +            changed = true;
> >>> +        } else {
> >>> +            dp_netdev_pmd_unref(pmd);
> >>>         }
> >>> +    }
> >>> +
> >>> +    if (changed) {
> >>> +        struct ovs_numa_info_numa *numa;
> >>>
> >>>         /* Log the number of pmd threads per numa node. */
> >>>         FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
> >>> -            VLOG_INFO("Created %"PRIuSIZE" pmd threads on numa node
> %d",
> >>> +            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa
> >>> + node %d",
> >>>                       numa->n_cores, numa->numa_id);
> >>>         }
> >>>     }
> >>> @@ -3362,19 +3416,6 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
> >>> }
> >>>
> >>> static void
> >>> -reload_affected_pmds(struct dp_netdev *dp) -{
> >>> -    struct dp_netdev_pmd_thread *pmd;
> >>> -
> >>> -    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
> >>> -        if (pmd->need_reload) {
> >>> -            dp_netdev_reload_pmd__(pmd);
> >>> -            pmd->need_reload = false;
> >>> -        }
> >>> -    }
> >>> -}
> >>> -
> >>> -static void
> >>> pmd_remove_stale_ports(struct dp_netdev *dp,
> >>>                        struct dp_netdev_pmd_thread *pmd)
> >>>     OVS_EXCLUDED(pmd->port_mutex)
> >>> @@ -3673,6 +3714,28 @@ pmd_load_cached_ports(struct
> dp_netdev_pmd_thread *pmd)
> >>>     }
> >>> }
> >>>
> >>> +static void
> >>> +pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd) {
> >>> +    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
> >>> +    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid))
> {
> >>> +        VLOG_ABORT("static_tx_qid allocation failed for PMD on core
> %2d"
> >>> +                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
> >>> +    }
> >>> +    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
> >>> +
> >>> +    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core
> %2d"
> >>> +             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id,
> >>> +pmd->numa_id); }
> >>> +
> >>> +static void
> >>> +pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd) {
> >>> +    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
> >>> +    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
> >>> +    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
> >>> +}
> >>> +
> >>> static int
> >>> pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
> >>>                           struct polled_queue **ppoll_list) @@
> >>> -3718,6 +3781,7 @@ pmd_thread_main(void *f_)
> >>>     dpdk_set_lcore_id(pmd->core_id);
> >>>     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
> >>> reload:
> >>> +    pmd_alloc_static_tx_qid(pmd);
> >>>     emc_cache_init(&pmd->flow_cache);
> >>>
> >>>     /* List port/core affinity */
> >>> @@ -3766,6 +3830,7 @@ reload:
> >>>     dp_netdev_pmd_reload_done(pmd);
> >>>
> >>>     emc_cache_uninit(&pmd->flow_cache);
> >>> +    pmd_free_static_tx_qid(pmd);
> >>>
> >>>     if (!exiting) {
> >>>         goto reload;
> >>> @@ -4176,8 +4241,6 @@ dp_netdev_configure_pmd(struct
> >>> dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
> >>>     pmd->numa_id = numa_id;
> >>>     pmd->need_reload = false;
> >>>
> >>> -    *CONST_CAST(int *, &pmd->static_tx_qid) = cmap_count(&dp-
> >poll_threads);
> >>> -
> >>>     ovs_refcount_init(&pmd->ref_cnt);
> >>>     latch_init(&pmd->exit_latch);
> >>>     pmd->reload_seq = seq_create();
> >>> @@ -4198,6 +4261,7 @@ dp_netdev_configure_pmd(struct
> >>> dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
> >>>      * actual thread created for NON_PMD_CORE_ID. */
> >>>     if (core_id == NON_PMD_CORE_ID) {
> >>>         emc_cache_init(&pmd->flow_cache);
> >>> +        pmd_alloc_static_tx_qid(pmd);
> >>>     }
> >>>     cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *,
> &pmd->node),
> >>>                 hash_int(core_id, 0)); @@ -4240,6 +4304,7 @@
> >>> dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread
> >>> *pmd)
> >>>         ovs_mutex_lock(&dp->non_pmd_mutex);
> >>>         emc_cache_uninit(&pmd->flow_cache);
> >>>         pmd_free_cached_ports(pmd);
> >>> +        pmd_free_static_tx_qid(pmd);
> >>>         ovs_mutex_unlock(&dp->non_pmd_mutex);
> >>>     } else {
> >>>         latch_set(&pmd->exit_latch); diff --git a/tests/pmd.at
> >>> b/tests/pmd.at index 05755b3..39a3645 100644
> >>> --- a/tests/pmd.at
> >>> +++ b/tests/pmd.at
> >>> @@ -38,7 +38,7 @@ dnl
> >>> dnl Whaits for creation of 'n_threads' or at least 1 thread if $1
> >>> not dnl passed. Checking starts from line number 'line' in ovs-
> vswithd.log .
> >>> m4_define([CHECK_PMD_THREADS_CREATED], [
> >>> -    PATTERN="Created [[0-9]]* pmd threads on numa node $2"
> >>> +    PATTERN="There are [[0-9]]* pmd threads on numa node $2"
> >>>     line_st=$3
> >>>     if [[ -z "$line_st" ]]
> >>>     then
> >>> --
> >>> 2.7.4
> >>>
> >>> _______________________________________________
> >>> dev mailing list
> >>> dev@openvswitch.org
> >>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
> >>
> >>
> >>
Ilya Maximets June 15, 2017, 10:06 a.m. UTC | #5
On 06.06.2017 13:02, Kavanagh, Mark B wrote:
> 
> 
>> From: Ilya Maximets [mailto:i.maximets@samsung.com]
>> Sent: Monday, June 5, 2017 9:18 AM
>> To: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; dev@openvswitch.org; Darrell Ball
>> <dball@vmware.com>
>> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Daniele Di Proietto <diproiettod@ovn.org>; Ben
>> Pfaff <blp@ovn.org>; Pravin Shelar <pshelar@ovn.org>; Loftus, Ciara <ciara.loftus@intel.com>;
>> Stokes, Ian <ian.stokes@intel.com>; Kevin Traynor <ktraynor@redhat.com>
>> Subject: Re: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental addition/deletion of PMD
>> threads.
>>
>> Thanks for testing, Mark.
>>
>> Comments inline.
>>
>> Best regards, Ilya Maximets.
>>
>> On 01.06.2017 17:30, Kavanagh, Mark B wrote:
>>>> From: ovs-dev-bounces@openvswitch.org [mailto:ovs-dev-bounces@openvswitch.org] On Behalf
>> Of
>>>> Ilya Maximets
>>>> Sent: Tuesday, May 30, 2017 3:12 PM
>>>> To: dev@openvswitch.org; Darrell Ball <dball@vmware.com>
>>>> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Ilya Maximets <i.maximets@samsung.com>
>>>> Subject: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental addition/deletion of PMD
>> threads.
>>>>
>>>> Currently, change of 'pmd-cpu-mask' is very heavy operation.
>>>> It requires destroying of all the PMD threads and creating
>>>> them back. After that, all the threads will sleep until
>>>> ports' redistribution finished.
>>>>
>>>> This patch adds ability to not stop the datapath while
>>>> adjusting number/placement of PMD threads. All not affected
>>>> threads will forward traffic without any additional latencies.
>>>>
>>>> id-pool created for static tx queue ids to keep them sequential
>>>> in a flexible way. non-PMD thread will always have
>>>> static_tx_qid = 0 as it was before.
>>>>
>>>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
>>>
>>> Hi Ilya,
>>>
>>> Patch LGTM - just one comment inline.
>>>
>>> I conducted some light testing and observed the following:
>>>     - temporary dip in forwarding rate (~50%) when modifying pmd-cpu-mask (in previous
>> implementation, forwarding stopped completely)
>>>         - occasional drop to zero in forwarding rate when modifying pmd-cpu-mask (e.g.
>> change from '2' to '12')
>>
>> I guess, drop to zero in this case could be fixed by second
>> patch of the series. (If it caused by port reconfiguration)
> 
> Yes, that's true - my colleague, Ian Stokes, will review and test that patch shortly.
> 
>>
>>>     - temporary drop of ~1mpps when adding/deleting port (similar to existing
>> implementation)
>>>
>>> I've also conducted the following sanity checks on the patch, and found no issues:
>>>  - checkpatch.py
>>>  - compilation with clang
>>>  - compilation with gcc
>>>  - sparse
>>>  - make check (no additional tests fail after application of this patch)
>>>
>>> Tested-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
>>>
>>> Thanks,
>>> Mark
>>>
>>>> ---
>>>> lib/dpif-netdev.c | 143 +++++++++++++++++++++++++++++++++++++++---------------
>>>> tests/pmd.at      |   2 +-
>>>> 2 files changed, 105 insertions(+), 40 deletions(-)
>>>>
>>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>>>> index b50164b..596d133 100644
>>>> --- a/lib/dpif-netdev.c
>>>> +++ b/lib/dpif-netdev.c
>>>> @@ -48,6 +48,7 @@
>>>> #include "fat-rwlock.h"
>>>> #include "flow.h"
>>>> #include "hmapx.h"
>>>> +#include "id-pool.h"
>>>> #include "latch.h"
>>>> #include "netdev.h"
>>>> #include "netdev-vport.h"
>>>> @@ -277,6 +278,9 @@ struct dp_netdev {
>>>>
>>>>     /* Stores all 'struct dp_netdev_pmd_thread's. */
>>>>     struct cmap poll_threads;
>>>> +    /* id pool for per thread static_tx_qid. */
>>>> +    struct id_pool *tx_qid_pool;
>>>> +    struct ovs_mutex tx_qid_pool_mutex;
>>>>
>>>>     /* Protects the access of the 'struct dp_netdev_pmd_thread'
>>>>      * instance for non-pmd thread. */
>>>> @@ -562,7 +566,7 @@ struct dp_netdev_pmd_thread {
>>>>     /* Queue id used by this pmd thread to send packets on all netdevs if
>>>>      * XPS disabled for this netdev. All static_tx_qid's are unique and less
>>>>      * than 'cmap_count(dp->poll_threads)'. */
>>>> -    const int static_tx_qid;
>>>> +    uint32_t static_tx_qid;
>>>>
>>>>     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>>>>     /* List of rx queues to poll. */
>>>> @@ -642,6 +646,8 @@ static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev
>>>> *dp,
>>>>                                                       unsigned core_id);
>>>> static struct dp_netdev_pmd_thread *
>>>> dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
>>>> +static void dp_netdev_del_pmd(struct dp_netdev *dp,
>>>> +                              struct dp_netdev_pmd_thread *pmd);
>>>> static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
>>>> static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
>>>> static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
>>>> @@ -1181,10 +1187,17 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
>>>>     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
>>>>
>>>>     cmap_init(&dp->poll_threads);
>>>> +
>>>> +    ovs_mutex_init(&dp->tx_qid_pool_mutex);
>>>> +    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
>>>> +    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
>>>> +
>>>>     ovs_mutex_init_recursive(&dp->non_pmd_mutex);
>>>>     ovsthread_key_create(&dp->per_pmd_key, NULL);
>>>>
>>>>     ovs_mutex_lock(&dp->port_mutex);
>>>> +    /* non-PMD will be created before all other threads and will
>>>> +     * allocate static_tx_qid = 0. */
>>>>     dp_netdev_set_nonpmd(dp);
>>>>
>>>>     error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
>>>> @@ -1279,6 +1292,9 @@ dp_netdev_free(struct dp_netdev *dp)
>>>>     dp_netdev_destroy_all_pmds(dp, true);
>>>>     cmap_destroy(&dp->poll_threads);
>>>>
>>>> +    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
>>>> +    id_pool_destroy(dp->tx_qid_pool);
>>>> +
>>>>     ovs_mutex_destroy(&dp->non_pmd_mutex);
>>>>     ovsthread_key_delete(dp->per_pmd_key);
>>>>
>>>> @@ -3302,12 +3318,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp-
>>>>> port_mutex)
>>>> }
>>>>
>>>> static void
>>>> +reload_affected_pmds(struct dp_netdev *dp)
>>>> +{
>>>> +    struct dp_netdev_pmd_thread *pmd;
>>>> +
>>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>>> +        if (pmd->need_reload) {
>>>> +            dp_netdev_reload_pmd__(pmd);
>>>> +            pmd->need_reload = false;
>>>> +        }
>>>> +    }
>>>> +}
>>>> +
>>>> +static void
>>>> reconfigure_pmd_threads(struct dp_netdev *dp)
>>>>     OVS_REQUIRES(dp->port_mutex)
>>>> {
>>>>     struct dp_netdev_pmd_thread *pmd;
>>>>     struct ovs_numa_dump *pmd_cores;
>>>> +    struct ovs_numa_info_core *core;
>>>> +    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
>>>> +    struct hmapx_node *node;
>>>>     bool changed = false;
>>>> +    bool need_to_adjust_static_tx_qids = false;
>>>>
>>>>     /* The pmd threads should be started only if there's a pmd port in the
>>>>      * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
>>>> @@ -3320,40 +3353,61 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
>>>>         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
>>>>     }
>>>>
>>>> -    /* Check for changed configuration */
>>>> -    if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - 1) {
>>>> -        changed = true;
>>>> -    } else {
>>>> -        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>>> -            if (pmd->core_id != NON_PMD_CORE_ID
>>>> -                && !ovs_numa_dump_contains_core(pmd_cores,
>>>> -                                                pmd->numa_id,
>>>> -                                                pmd->core_id)) {
>>>> -                changed = true;
>>>> -                break;
>>>> -            }
>>>> +    /* We need to adjust 'static_tx_qid's only if we're reducing number of
>>>> +     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
>>>
>>> Can you elaborate on why the static_tx_qids need to be adjusted - it's still not clear to
>> me from the comment.
>>
>> I think, currently, the main case for keeping tx queue ids sequential
>> is the vhost-user ports with big number of queues but disabled by
>> driver in guest.
>>
>> If vhost-user device has enough number of queues to not use dynamic
>> distribution (XPS) and some of queues will be disabled inside the
>> guest, we will face almost same issue as described in
>> 347ba9bb9b7a ("dpif-netdev: Unique and sequential tx_qids.")
>>
>> For example:
>>
>> 	We may have 4 threads and vhost-user port with 4 queues.
>> 	Lets assume that static tx qids distributed as follows:
>> 		thread #0 --> queue #0
>> 		thread #1 --> queue #1
>> 		thread #2 --> queue #3
>> 		thread #3 --> queue #2
>> 	Now guest disables queue #3 (ethtool -L eth0 combined 3)
>> 	And we're manually changing pmd-cpu-mask to remove thread #3.
>> 	(You need to be sure that the threads weren't reloaded here.)
>> 	At this point, if tx queue ids wasn't adjusted, we will have:
>> 		thread #0 --> queue #0 --> queue #0
>> 		thread #1 --> queue #1 --> queue #1
>> 		thread #2 --> queue #3 --> queue #0
>> 	Where the last column is the result of remapping inside
>> 	__netdev_dpdk_vhost_send().
>> 	So, in this case, there will be 3 queues available, but
>> 	only 2 used by 3 threads. --> unused queue #2 and locking
>> 	on access to queue #0.
> 
> Okay, I got it - thanks for the detailed explanation Ilya.
> 
> With that, Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com> 

Hi Mark. Do you think I need to add some comment about txq adjustment
in the code? I'm going to respin the series because of compiler
warning in the second patch found by Ian and want to know your
opinion.

Also, I'm going to keep your Tested-by tag since no code changes are
planned in this patch. I'll add your Acked-by if no additional comment
is needed, or if the example below looks good to you.

I can add something like this:

    /* We need to adjust 'static_tx_qid's only if we're reducing number of
     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
        /* Adjustment is required to keep 'static_tx_qid's sequential and
         * avoid possible issues, for example, imbalanced tx queue usage
         * and unnecessary locking caused by remapping on netdev level. */
        need_to_adjust_static_tx_qids = true;
    }

What do you think?

>>
>>>> +    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
>>>> +        need_to_adjust_static_tx_qids = true;
>>>> +    }
>>>> +
>>>> +    /* Check for unwanted pmd threads */
>>>> +    CMAP_FOR_EACH(pmd, node, &dp->poll_threads) {
>>>> +        if (pmd->core_id == NON_PMD_CORE_ID) {
>>>> +            continue;
>>>> +        }
>>>> +        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
>>>> +                                                    pmd->core_id)) {
>>>> +            hmapx_add(&to_delete, pmd);
>>>> +        } else if (need_to_adjust_static_tx_qids) {
>>>> +            pmd->need_reload = true;
>>>>         }
>>>>     }
>>>>
>>>> -    /* Destroy the old and recreate the new pmd threads.  We don't perform an
>>>> -     * incremental update because we would have to adjust 'static_tx_qid'. */
>>>> -    if (changed) {
>>>> -        struct ovs_numa_info_core *core;
>>>> -        struct ovs_numa_info_numa *numa;
>>>> +    HMAPX_FOR_EACH (node, &to_delete) {
>>>> +        pmd = (struct dp_netdev_pmd_thread *) node->data;
>>>> +        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
>>>> +                  pmd->numa_id, pmd->core_id);
>>>> +        dp_netdev_del_pmd(dp, pmd);
>>>> +    }
>>>> +    changed = !hmapx_is_empty(&to_delete);
>>>> +    hmapx_destroy(&to_delete);
>>>>
>>>> -        /* Do not destroy the non pmd thread. */
>>>> -        dp_netdev_destroy_all_pmds(dp, false);
>>>> -        FOR_EACH_CORE_ON_DUMP (core, pmd_cores) {
>>>> -            struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
>>>> +    if (need_to_adjust_static_tx_qids) {
>>>> +        /* 'static_tx_qid's are not sequential now.
>>>> +         * Reload remaining threads to fix this. */
>>>> +        reload_affected_pmds(dp);
>>>> +    }
>>>>
>>>> +    /* Check for required new pmd threads */
>>>> +    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
>>>> +        pmd = dp_netdev_get_pmd(dp, core->core_id);
>>>> +        if (!pmd) {
>>>> +            pmd = xzalloc(sizeof *pmd);
>>>>             dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
>>>> -
>>>>             pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
>>>> +            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
>>>> +                      pmd->numa_id, pmd->core_id);
>>>> +            changed = true;
>>>> +        } else {
>>>> +            dp_netdev_pmd_unref(pmd);
>>>>         }
>>>> +    }
>>>> +
>>>> +    if (changed) {
>>>> +        struct ovs_numa_info_numa *numa;
>>>>
>>>>         /* Log the number of pmd threads per numa node. */
>>>>         FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
>>>> -            VLOG_INFO("Created %"PRIuSIZE" pmd threads on numa node %d",
>>>> +            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
>>>>                       numa->n_cores, numa->numa_id);
>>>>         }
>>>>     }
>>>> @@ -3362,19 +3416,6 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
>>>> }
>>>>
>>>> static void
>>>> -reload_affected_pmds(struct dp_netdev *dp)
>>>> -{
>>>> -    struct dp_netdev_pmd_thread *pmd;
>>>> -
>>>> -    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>>> -        if (pmd->need_reload) {
>>>> -            dp_netdev_reload_pmd__(pmd);
>>>> -            pmd->need_reload = false;
>>>> -        }
>>>> -    }
>>>> -}
>>>> -
>>>> -static void
>>>> pmd_remove_stale_ports(struct dp_netdev *dp,
>>>>                        struct dp_netdev_pmd_thread *pmd)
>>>>     OVS_EXCLUDED(pmd->port_mutex)
>>>> @@ -3673,6 +3714,28 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
>>>>     }
>>>> }
>>>>
>>>> +static void
>>>> +pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
>>>> +{
>>>> +    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
>>>> +    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
>>>> +        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
>>>> +                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
>>>> +    }
>>>> +    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
>>>> +
>>>> +    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
>>>> +             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
>>>> +}
>>>> +
>>>> +static void
>>>> +pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
>>>> +{
>>>> +    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
>>>> +    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
>>>> +    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
>>>> +}
>>>> +
>>>> static int
>>>> pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
>>>>                           struct polled_queue **ppoll_list)
>>>> @@ -3718,6 +3781,7 @@ pmd_thread_main(void *f_)
>>>>     dpdk_set_lcore_id(pmd->core_id);
>>>>     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>>>> reload:
>>>> +    pmd_alloc_static_tx_qid(pmd);
>>>>     emc_cache_init(&pmd->flow_cache);
>>>>
>>>>     /* List port/core affinity */
>>>> @@ -3766,6 +3830,7 @@ reload:
>>>>     dp_netdev_pmd_reload_done(pmd);
>>>>
>>>>     emc_cache_uninit(&pmd->flow_cache);
>>>> +    pmd_free_static_tx_qid(pmd);
>>>>
>>>>     if (!exiting) {
>>>>         goto reload;
>>>> @@ -4176,8 +4241,6 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct
>>>> dp_netdev *dp,
>>>>     pmd->numa_id = numa_id;
>>>>     pmd->need_reload = false;
>>>>
>>>> -    *CONST_CAST(int *, &pmd->static_tx_qid) = cmap_count(&dp->poll_threads);
>>>> -
>>>>     ovs_refcount_init(&pmd->ref_cnt);
>>>>     latch_init(&pmd->exit_latch);
>>>>     pmd->reload_seq = seq_create();
>>>> @@ -4198,6 +4261,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct
>>>> dp_netdev *dp,
>>>>      * actual thread created for NON_PMD_CORE_ID. */
>>>>     if (core_id == NON_PMD_CORE_ID) {
>>>>         emc_cache_init(&pmd->flow_cache);
>>>> +        pmd_alloc_static_tx_qid(pmd);
>>>>     }
>>>>     cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
>>>>                 hash_int(core_id, 0));
>>>> @@ -4240,6 +4304,7 @@ dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread
>>>> *pmd)
>>>>         ovs_mutex_lock(&dp->non_pmd_mutex);
>>>>         emc_cache_uninit(&pmd->flow_cache);
>>>>         pmd_free_cached_ports(pmd);
>>>> +        pmd_free_static_tx_qid(pmd);
>>>>         ovs_mutex_unlock(&dp->non_pmd_mutex);
>>>>     } else {
>>>>         latch_set(&pmd->exit_latch);
>>>> diff --git a/tests/pmd.at b/tests/pmd.at
>>>> index 05755b3..39a3645 100644
>>>> --- a/tests/pmd.at
>>>> +++ b/tests/pmd.at
>>>> @@ -38,7 +38,7 @@ dnl
>>>> dnl Whaits for creation of 'n_threads' or at least 1 thread if $1 not
>>>> dnl passed. Checking starts from line number 'line' in ovs-vswithd.log .
>>>> m4_define([CHECK_PMD_THREADS_CREATED], [
>>>> -    PATTERN="Created [[0-9]]* pmd threads on numa node $2"
>>>> +    PATTERN="There are [[0-9]]* pmd threads on numa node $2"
>>>>     line_st=$3
>>>>     if [[ -z "$line_st" ]]
>>>>     then
>>>> --
>>>> 2.7.4
>>>>
>>>> _______________________________________________
>>>> dev mailing list
>>>> dev@openvswitch.org
>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>
>>>
>>>
> 
> 
>
Mark Kavanagh June 15, 2017, 10:11 a.m. UTC | #6
>From: Ilya Maximets [mailto:i.maximets@samsung.com]
>Sent: Thursday, June 15, 2017 11:07 AM
>To: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; dev@openvswitch.org; Darrell Ball
><dball@vmware.com>
>Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Daniele Di Proietto <diproiettod@ovn.org>; Ben
>Pfaff <blp@ovn.org>; Pravin Shelar <pshelar@ovn.org>; Loftus, Ciara <ciara.loftus@intel.com>;
>Stokes, Ian <ian.stokes@intel.com>; Kevin Traynor <ktraynor@redhat.com>
>Subject: Re: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental addition/deletion of PMD
>threads.
>
>On 06.06.2017 13:02, Kavanagh, Mark B wrote:
>>
>>
>>> From: Ilya Maximets [mailto:i.maximets@samsung.com]
>>> Sent: Monday, June 5, 2017 9:18 AM
>>> To: Kavanagh, Mark B <mark.b.kavanagh@intel.com>; dev@openvswitch.org; Darrell Ball
>>> <dball@vmware.com>
>>> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Daniele Di Proietto <diproiettod@ovn.org>; Ben
>>> Pfaff <blp@ovn.org>; Pravin Shelar <pshelar@ovn.org>; Loftus, Ciara
><ciara.loftus@intel.com>;
>>> Stokes, Ian <ian.stokes@intel.com>; Kevin Traynor <ktraynor@redhat.com>
>>> Subject: Re: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental addition/deletion of PMD
>>> threads.
>>>
>>> Thanks for testing, Mark.
>>>
>>> Comments inline.
>>>
>>> Best regards, Ilya Maximets.
>>>
>>> On 01.06.2017 17:30, Kavanagh, Mark B wrote:
>>>>> From: ovs-dev-bounces@openvswitch.org [mailto:ovs-dev-bounces@openvswitch.org] On Behalf
>>> Of
>>>>> Ilya Maximets
>>>>> Sent: Tuesday, May 30, 2017 3:12 PM
>>>>> To: dev@openvswitch.org; Darrell Ball <dball@vmware.com>
>>>>> Cc: Heetae Ahn <heetae82.ahn@samsung.com>; Ilya Maximets <i.maximets@samsung.com>
>>>>> Subject: [ovs-dev] [PATCH v2 1/3] dpif-netdev: Incremental addition/deletion of PMD
>>> threads.
>>>>>
>>>>> Currently, change of 'pmd-cpu-mask' is very heavy operation.
>>>>> It requires destroying of all the PMD threads and creating
>>>>> them back. After that, all the threads will sleep until
>>>>> ports' redistribution finished.
>>>>>
>>>>> This patch adds ability to not stop the datapath while
>>>>> adjusting number/placement of PMD threads. All not affected
>>>>> threads will forward traffic without any additional latencies.
>>>>>
>>>>> id-pool created for static tx queue ids to keep them sequential
>>>>> in a flexible way. non-PMD thread will always have
>>>>> static_tx_qid = 0 as it was before.
>>>>>
>>>>> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
>>>>
>>>> Hi Ilya,
>>>>
>>>> Patch LGTM - just one comment inline.
>>>>
>>>> I conducted some light testing and observed the following:
>>>>     - temporary dip in forwarding rate (~50%) when modifying pmd-cpu-mask (in previous
>>> implementation, forwarding stopped completely)
>>>>         - occasional drop to zero in forwarding rate when modifying pmd-cpu-mask (e.g.
>>> change from '2' to '12')
>>>
>>> I guess, drop to zero in this case could be fixed by second
>>> patch of the series. (If it caused by port reconfiguration)
>>
>> Yes, that's true - my colleague, Ian Stokes, will review and test that patch shortly.
>>
>>>
>>>>     - temporary drop of ~1mpps when adding/deleting port (similar to existing
>>> implementation)
>>>>
>>>> I've also conducted the following sanity checks on the patch, and found no issues:
>>>>  - checkpatch.py
>>>>  - compilation with clang
>>>>  - compilation with gcc
>>>>  - sparse
>>>>  - make check (no additional tests fail after application of this patch)
>>>>
>>>> Tested-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
>>>>
>>>> Thanks,
>>>> Mark
>>>>
>>>>> ---
>>>>> lib/dpif-netdev.c | 143 +++++++++++++++++++++++++++++++++++++++---------------
>>>>> tests/pmd.at      |   2 +-
>>>>> 2 files changed, 105 insertions(+), 40 deletions(-)
>>>>>
>>>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>>>>> index b50164b..596d133 100644
>>>>> --- a/lib/dpif-netdev.c
>>>>> +++ b/lib/dpif-netdev.c
>>>>> @@ -48,6 +48,7 @@
>>>>> #include "fat-rwlock.h"
>>>>> #include "flow.h"
>>>>> #include "hmapx.h"
>>>>> +#include "id-pool.h"
>>>>> #include "latch.h"
>>>>> #include "netdev.h"
>>>>> #include "netdev-vport.h"
>>>>> @@ -277,6 +278,9 @@ struct dp_netdev {
>>>>>
>>>>>     /* Stores all 'struct dp_netdev_pmd_thread's. */
>>>>>     struct cmap poll_threads;
>>>>> +    /* id pool for per thread static_tx_qid. */
>>>>> +    struct id_pool *tx_qid_pool;
>>>>> +    struct ovs_mutex tx_qid_pool_mutex;
>>>>>
>>>>>     /* Protects the access of the 'struct dp_netdev_pmd_thread'
>>>>>      * instance for non-pmd thread. */
>>>>> @@ -562,7 +566,7 @@ struct dp_netdev_pmd_thread {
>>>>>     /* Queue id used by this pmd thread to send packets on all netdevs if
>>>>>      * XPS disabled for this netdev. All static_tx_qid's are unique and less
>>>>>      * than 'cmap_count(dp->poll_threads)'. */
>>>>> -    const int static_tx_qid;
>>>>> +    uint32_t static_tx_qid;
>>>>>
>>>>>     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
>>>>>     /* List of rx queues to poll. */
>>>>> @@ -642,6 +646,8 @@ static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct
>dp_netdev
>>>>> *dp,
>>>>>                                                       unsigned core_id);
>>>>> static struct dp_netdev_pmd_thread *
>>>>> dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
>>>> +static void dp_netdev_del_pmd(struct dp_netdev *dp,
>>>>> +                              struct dp_netdev_pmd_thread *pmd);
>>>>> static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
>>>>> static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
>>>>> static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
>>>>> @@ -1181,10 +1187,17 @@ create_dp_netdev(const char *name, const struct dpif_class
>*class,
>>>>>     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
>>>>>
>>>>>     cmap_init(&dp->poll_threads);
>>>>> +
>>>>> +    ovs_mutex_init(&dp->tx_qid_pool_mutex);
>>>>> +    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
>>>>> +    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
>>>>> +
>>>>>     ovs_mutex_init_recursive(&dp->non_pmd_mutex);
>>>>>     ovsthread_key_create(&dp->per_pmd_key, NULL);
>>>>>
>>>>>     ovs_mutex_lock(&dp->port_mutex);
>>>>> +    /* non-PMD will be created before all other threads and will
>>>>> +     * allocate static_tx_qid = 0. */
>>>>>     dp_netdev_set_nonpmd(dp);
>>>>>
>>>>>     error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
>>>>> @@ -1279,6 +1292,9 @@ dp_netdev_free(struct dp_netdev *dp)
>>>>>     dp_netdev_destroy_all_pmds(dp, true);
>>>>>     cmap_destroy(&dp->poll_threads);
>>>>>
>>>>> +    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
>>>>> +    id_pool_destroy(dp->tx_qid_pool);
>>>>> +
>>>>>     ovs_mutex_destroy(&dp->non_pmd_mutex);
>>>>>     ovsthread_key_delete(dp->per_pmd_key);
>>>>>
>>>>> @@ -3302,12 +3318,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned)
>OVS_REQUIRES(dp-
>>>>>> port_mutex)
>>>>> }
>>>>>
>>>>> static void
>>>>> +reload_affected_pmds(struct dp_netdev *dp)
>>>>> +{
>>>>> +    struct dp_netdev_pmd_thread *pmd;
>>>>> +
>>>>> +    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>>>> +        if (pmd->need_reload) {
>>>>> +            dp_netdev_reload_pmd__(pmd);
>>>>> +            pmd->need_reload = false;
>>>>> +        }
>>>>> +    }
>>>>> +}
>>>>> +
>>>>> +static void
>>>>> reconfigure_pmd_threads(struct dp_netdev *dp)
>>>>>     OVS_REQUIRES(dp->port_mutex)
>>>>> {
>>>>>     struct dp_netdev_pmd_thread *pmd;
>>>>>     struct ovs_numa_dump *pmd_cores;
>>>>> +    struct ovs_numa_info_core *core;
>>>>> +    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
>>>>> +    struct hmapx_node *node;
>>>>>     bool changed = false;
>>>>> +    bool need_to_adjust_static_tx_qids = false;
>>>>>
>>>>>     /* The pmd threads should be started only if there's a pmd port in the
>>>>>      * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
>>>>> @@ -3320,40 +3353,61 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
>>>>>         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
>>>>>     }
>>>>>
>>>>> -    /* Check for changed configuration */
>>>>> -    if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - 1) {
>>>>> -        changed = true;
>>>>> -    } else {
>>>>> -        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>>>> -            if (pmd->core_id != NON_PMD_CORE_ID
>>>>> -                && !ovs_numa_dump_contains_core(pmd_cores,
>>>>> -                                                pmd->numa_id,
>>>>> -                                                pmd->core_id)) {
>>>>> -                changed = true;
>>>>> -                break;
>>>>> -            }
>>>>> +    /* We need to adjust 'static_tx_qid's only if we're reducing number of
>>>>> +     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
>>>>
>>>> Can you elaborate on why the static_tx_qids need to be adjusted - it's still not clear to
>>> me from the comment.
>>>
>>> I think, currently, the main case for keeping tx queue ids sequential
>>> is the vhost-user ports with big number of queues but disabled by
>>> driver in guest.
>>>
>>> If vhost-user device has enough number of queues to not use dynamic
>>> distribution (XPS) and some of queues will be disabled inside the
>>> guest, we will face almost same issue as described in
>>> 347ba9bb9b7a ("dpif-netdev: Unique and sequential tx_qids.")
>>>
>>> For example:
>>>
>>> 	We may have 4 threads and vhost-user port with 4 queues.
>>> 	Lets assume that static tx qids distributed as follows:
>>> 		thread #0 --> queue #0
>>> 		thread #1 --> queue #1
>>> 		thread #2 --> queue #3
>>> 		thread #3 --> queue #2
>>> 	Now guest disables queue #3 (ethtool -L eth0 combined 3)
>>> 	And we're manually changing pmd-cpu-mask to remove thread #3.
>>> 	(You need to be sure that threads wasn't reloaded here.)
>>> 	At this point, if tx queue ids wasn't adjusted, we will have:
>>> 		thread #0 --> queue #0 --> queue #0
>>> 		thread #1 --> queue #1 --> queue #1
>>> 		thread #2 --> queue #3 --> queue #0
>>> 	Where the last column is the result of remapping inside
>>> 	__netdev_dpdk_vhost_send().
>>> 	So, in this case, there will be 3 queues available, but
>>> 	only 2 used by 3 threads. --> unused queue #2 and locking
>>> 	on access to queue #0.
>>
>> Okay, I got it - thanks for the detailed explanation Ilya.
>>
>> With that, Acked-by: Mark Kavanagh <mark.b.kavanagh@intel.com>
>
>Hi Mark. Do you think I need to add some comment about txq adjustment
>in the code? I'm going to respin the series because of compiler
>warning in the second patch found by Ian and want to know your
>opinion.
>
>Also, I'm going to keep your Tested-by tag because of no code changes
>in this patch planned. I'll add your Acked-by in case no additional
>comment needed or if below example looks good to you.
>
>I can add something like this:
>
>    /* We need to adjust 'static_tx_qid's only if we're reducing number of
>     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
>    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
>        /* Adjustment is required to keep 'static_tx_qid's sequential and
>         * avoid possible issues, for example, imbalanced tx queue usage
>         * and unnecessary locking caused by remapping on netdev level. */
>        need_to_adjust_static_tx_qids = true;
>    }
>
>What do you think?

Hey Ilya,

Sounds good to me - feel free to add my [acked|tested]-by tags on the V3 patch.

Thanks!
Mark


>
>>>
>>>>> +    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
>>>>> +        need_to_adjust_static_tx_qids = true;
>>>>> +    }
>>>>> +
>>>>> +    /* Check for unwanted pmd threads */
>>>>> +    CMAP_FOR_EACH(pmd, node, &dp->poll_threads) {
>>>>> +        if (pmd->core_id == NON_PMD_CORE_ID) {
>>>>> +            continue;
>>>>> +        }
>>>>> +        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
>>>>> +                                                    pmd->core_id)) {
>>>>> +            hmapx_add(&to_delete, pmd);
>>>>> +        } else if (need_to_adjust_static_tx_qids) {
>>>>> +            pmd->need_reload = true;
>>>>>         }
>>>>>     }
>>>>>
>>>>> -    /* Destroy the old and recreate the new pmd threads.  We don't perform an
>>>>> -     * incremental update because we would have to adjust 'static_tx_qid'. */
>>>>> -    if (changed) {
>>>>> -        struct ovs_numa_info_core *core;
>>>>> -        struct ovs_numa_info_numa *numa;
>>>>> +    HMAPX_FOR_EACH (node, &to_delete) {
>>>>> +        pmd = (struct dp_netdev_pmd_thread *) node->data;
>>>>> +        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
>>>>> +                  pmd->numa_id, pmd->core_id);
>>>>> +        dp_netdev_del_pmd(dp, pmd);
>>>>> +    }
>>>>> +    changed = !hmapx_is_empty(&to_delete);
>>>>> +    hmapx_destroy(&to_delete);
>>>>>
>>>>> -        /* Do not destroy the non pmd thread. */
>>>>> -        dp_netdev_destroy_all_pmds(dp, false);
>>>>> -        FOR_EACH_CORE_ON_DUMP (core, pmd_cores) {
>>>>> -            struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
>>>>> +    if (need_to_adjust_static_tx_qids) {
>>>>> +        /* 'static_tx_qid's are not sequential now.
>>>>> +         * Reload remaining threads to fix this. */
>>>>> +        reload_affected_pmds(dp);
>>>>> +    }
>>>>>
>>>>> +    /* Check for required new pmd threads */
>>>>> +    FOR_EACH_CORE_ON_DUMP(core, pmd_cores) {
>>>>> +        pmd = dp_netdev_get_pmd(dp, core->core_id);
>>>>> +        if (!pmd) {
>>>>> +            pmd = xzalloc(sizeof *pmd);
>>>>>             dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
>>>>> -
>>>>>             pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
>>>>> +            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
>>>>> +                      pmd->numa_id, pmd->core_id);
>>>>> +            changed = true;
>>>>> +        } else {
>>>>> +            dp_netdev_pmd_unref(pmd);
>>>>>         }
>>>>> +    }
>>>>> +
>>>>> +    if (changed) {
>>>>> +        struct ovs_numa_info_numa *numa;
>>>>>
>>>>>         /* Log the number of pmd threads per numa node. */
>>>>>         FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
>>>>> -            VLOG_INFO("Created %"PRIuSIZE" pmd threads on numa node %d",
>>>>> +            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
>>>>>                       numa->n_cores, numa->numa_id);
>>>>>         }
>>>>>     }
>>>>> @@ -3362,19 +3416,6 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
>>>>> }
>>>>>
>>>>> static void
>>>>> -reload_affected_pmds(struct dp_netdev *dp)
>>>>> -{
>>>>> -    struct dp_netdev_pmd_thread *pmd;
>>>>> -
>>>>> -    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
>>>>> -        if (pmd->need_reload) {
>>>>> -            dp_netdev_reload_pmd__(pmd);
>>>>> -            pmd->need_reload = false;
>>>>> -        }
>>>>> -    }
>>>>> -}
>>>>> -
>>>>> -static void
>>>>> pmd_remove_stale_ports(struct dp_netdev *dp,
>>>>>                        struct dp_netdev_pmd_thread *pmd)
>>>>>     OVS_EXCLUDED(pmd->port_mutex)
>>>>> @@ -3673,6 +3714,28 @@ pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
>>>>>     }
>>>>> }
>>>>>
>>>>> +static void
>>>>> +pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
>>>>> +{
>>>>> +    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
>>>>> +    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
>>>>> +        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
>>>>> +                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
>>>>> +    }
>>>>> +    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
>>>>> +
>>>>> +    VLOG_DBG("static_tx_qid = %d allocated for PMD thread on core %2d"
>>>>> +             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
>>>>> +}
>>>>> +
>>>>> +static void
>>>>> +pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
>>>>> +{
>>>>> +    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
>>>>> +    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
>>>>> +    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
>>>>> +}
>>>>> +
>>>>> static int
>>>>> pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
>>>>>                           struct polled_queue **ppoll_list)
>>>>> @@ -3718,6 +3781,7 @@ pmd_thread_main(void *f_)
>>>>>     dpdk_set_lcore_id(pmd->core_id);
>>>>>     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>>>>> reload:
>>>>> +    pmd_alloc_static_tx_qid(pmd);
>>>>>     emc_cache_init(&pmd->flow_cache);
>>>>>
>>>>>     /* List port/core affinity */
>>>>> @@ -3766,6 +3830,7 @@ reload:
>>>>>     dp_netdev_pmd_reload_done(pmd);
>>>>>
>>>>>     emc_cache_uninit(&pmd->flow_cache);
>>>>> +    pmd_free_static_tx_qid(pmd);
>>>>>
>>>>>     if (!exiting) {
>>>>>         goto reload;
>>>>> @@ -4176,8 +4241,6 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct
>>>>> dp_netdev *dp,
>>>>>     pmd->numa_id = numa_id;
>>>>>     pmd->need_reload = false;
>>>>>
>>>>> -    *CONST_CAST(int *, &pmd->static_tx_qid) = cmap_count(&dp->poll_threads);
>>>>> -
>>>>>     ovs_refcount_init(&pmd->ref_cnt);
>>>>>     latch_init(&pmd->exit_latch);
>>>>>     pmd->reload_seq = seq_create();
>>>>> @@ -4198,6 +4261,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct
>>>>> dp_netdev *dp,
>>>>>      * actual thread created for NON_PMD_CORE_ID. */
>>>>>     if (core_id == NON_PMD_CORE_ID) {
>>>>>         emc_cache_init(&pmd->flow_cache);
>>>>> +        pmd_alloc_static_tx_qid(pmd);
>>>>>     }
>>>>>     cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
>>>>>                 hash_int(core_id, 0));
>>>>> @@ -4240,6 +4304,7 @@ dp_netdev_del_pmd(struct dp_netdev *dp, struct
>dp_netdev_pmd_thread
>>>>> *pmd)
>>>>>         ovs_mutex_lock(&dp->non_pmd_mutex);
>>>>>         emc_cache_uninit(&pmd->flow_cache);
>>>>>         pmd_free_cached_ports(pmd);
>>>>> +        pmd_free_static_tx_qid(pmd);
>>>>>         ovs_mutex_unlock(&dp->non_pmd_mutex);
>>>>>     } else {
>>>>>         latch_set(&pmd->exit_latch);
>>>>> diff --git a/tests/pmd.at b/tests/pmd.at
>>>>> index 05755b3..39a3645 100644
>>>>> --- a/tests/pmd.at
>>>>> +++ b/tests/pmd.at
>>>>> @@ -38,7 +38,7 @@ dnl
>>>>> dnl Whaits for creation of 'n_threads' or at least 1 thread if $1 not
>>>>> dnl passed. Checking starts from line number 'line' in ovs-vswithd.log .
>>>>> m4_define([CHECK_PMD_THREADS_CREATED], [
>>>>> -    PATTERN="Created [[0-9]]* pmd threads on numa node $2"
>>>>> +    PATTERN="There are [[0-9]]* pmd threads on numa node $2"
>>>>>     line_st=$3
>>>>>     if [[ -z "$line_st" ]]
>>>>>     then
>>>>> --
>>>>> 2.7.4
>>>>>
>>>>> _______________________________________________
>>>>> dev mailing list
>>>>> dev@openvswitch.org
>>>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
>>>>
>>>>
>>>>
>>
>>
>>
diff mbox

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b50164b..596d133 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -48,6 +48,7 @@ 
 #include "fat-rwlock.h"
 #include "flow.h"
 #include "hmapx.h"
+#include "id-pool.h"
 #include "latch.h"
 #include "netdev.h"
 #include "netdev-vport.h"
@@ -277,6 +278,9 @@  struct dp_netdev {
 
     /* Stores all 'struct dp_netdev_pmd_thread's. */
     struct cmap poll_threads;
+    /* id pool for per thread static_tx_qid. */
+    struct id_pool *tx_qid_pool;
+    struct ovs_mutex tx_qid_pool_mutex;
 
     /* Protects the access of the 'struct dp_netdev_pmd_thread'
      * instance for non-pmd thread. */
@@ -562,7 +566,7 @@  struct dp_netdev_pmd_thread {
     /* Queue id used by this pmd thread to send packets on all netdevs if
      * XPS disabled for this netdev. All static_tx_qid's are unique and less
      * than 'cmap_count(dp->poll_threads)'. */
-    const int static_tx_qid;
+    uint32_t static_tx_qid;
 
     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
     /* List of rx queues to poll. */
@@ -642,6 +646,8 @@  static struct dp_netdev_pmd_thread *dp_netdev_get_pmd(struct dp_netdev *dp,
                                                       unsigned core_id);
 static struct dp_netdev_pmd_thread *
 dp_netdev_pmd_get_next(struct dp_netdev *dp, struct cmap_position *pos);
+static void dp_netdev_del_pmd(struct dp_netdev *dp,
+                              struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_destroy_all_pmds(struct dp_netdev *dp, bool non_pmd);
 static void dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd);
 static void dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
@@ -1181,10 +1187,17 @@  create_dp_netdev(const char *name, const struct dpif_class *class,
     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
 
     cmap_init(&dp->poll_threads);
+
+    ovs_mutex_init(&dp->tx_qid_pool_mutex);
+    /* We need 1 Tx queue for each possible core + 1 for non-PMD threads. */
+    dp->tx_qid_pool = id_pool_create(0, ovs_numa_get_n_cores() + 1);
+
     ovs_mutex_init_recursive(&dp->non_pmd_mutex);
     ovsthread_key_create(&dp->per_pmd_key, NULL);
 
     ovs_mutex_lock(&dp->port_mutex);
+    /* non-PMD will be created before all other threads and will
+     * allocate static_tx_qid = 0. */
     dp_netdev_set_nonpmd(dp);
 
     error = do_add_port(dp, name, dpif_netdev_port_open_type(dp->class,
@@ -1279,6 +1292,9 @@  dp_netdev_free(struct dp_netdev *dp)
     dp_netdev_destroy_all_pmds(dp, true);
     cmap_destroy(&dp->poll_threads);
 
+    ovs_mutex_destroy(&dp->tx_qid_pool_mutex);
+    id_pool_destroy(dp->tx_qid_pool);
+
     ovs_mutex_destroy(&dp->non_pmd_mutex);
     ovsthread_key_delete(dp->per_pmd_key);
 
@@ -3302,12 +3318,29 @@  rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
 }
 
 static void
+reload_affected_pmds(struct dp_netdev *dp)
+{
+    struct dp_netdev_pmd_thread *pmd;
+
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        if (pmd->need_reload) {
+            dp_netdev_reload_pmd__(pmd);
+            pmd->need_reload = false;
+        }
+    }
+}
+
+static void
 reconfigure_pmd_threads(struct dp_netdev *dp)
     OVS_REQUIRES(dp->port_mutex)
 {
     struct dp_netdev_pmd_thread *pmd;
     struct ovs_numa_dump *pmd_cores;
+    struct ovs_numa_info_core *core;
+    struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
+    struct hmapx_node *node;
     bool changed = false;
+    bool need_to_adjust_static_tx_qids = false;
 
     /* The pmd threads should be started only if there's a pmd port in the
      * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
@@ -3320,40 +3353,61 @@  reconfigure_pmd_threads(struct dp_netdev *dp)
         pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
     }
 
-    /* Check for changed configuration */
-    if (ovs_numa_dump_count(pmd_cores) != cmap_count(&dp->poll_threads) - 1) {
-        changed = true;
-    } else {
-        CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-            if (pmd->core_id != NON_PMD_CORE_ID
-                && !ovs_numa_dump_contains_core(pmd_cores,
-                                                pmd->numa_id,
-                                                pmd->core_id)) {
-                changed = true;
-                break;
-            }
+    /* We need to adjust 'static_tx_qid's only if we're reducing number of
+     * PMD threads. Otherwise, new threads will allocate all the freed ids. */
+    if (ovs_numa_dump_count(pmd_cores) < cmap_count(&dp->poll_threads) - 1) {
+        need_to_adjust_static_tx_qids = true;
+    }
+
+    /* Check for unwanted pmd threads */
+    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+        if (pmd->core_id == NON_PMD_CORE_ID) {
+            continue;
+        }
+        if (!ovs_numa_dump_contains_core(pmd_cores, pmd->numa_id,
+                                         pmd->core_id)) {
+            hmapx_add(&to_delete, pmd);
+        } else if (need_to_adjust_static_tx_qids) {
+            pmd->need_reload = true;
         }
     }
 
-    /* Destroy the old and recreate the new pmd threads.  We don't perform an
-     * incremental update because we would have to adjust 'static_tx_qid'. */
-    if (changed) {
-        struct ovs_numa_info_core *core;
-        struct ovs_numa_info_numa *numa;
+    HMAPX_FOR_EACH (node, &to_delete) {
+        pmd = (struct dp_netdev_pmd_thread *) node->data;
+        VLOG_INFO("PMD thread on numa_id: %d, core id: %2d destroyed.",
+                  pmd->numa_id, pmd->core_id);
+        dp_netdev_del_pmd(dp, pmd);
+    }
+    changed = !hmapx_is_empty(&to_delete);
+    hmapx_destroy(&to_delete);
 
-        /* Do not destroy the non pmd thread. */
-        dp_netdev_destroy_all_pmds(dp, false);
-        FOR_EACH_CORE_ON_DUMP (core, pmd_cores) {
-            struct dp_netdev_pmd_thread *pmd = xzalloc(sizeof *pmd);
+    if (need_to_adjust_static_tx_qids) {
+        /* 'static_tx_qid's are not sequential now.
+         * Reload remaining threads to fix this. */
+        reload_affected_pmds(dp);
+    }
 
+    /* Check for required new pmd threads */
+    FOR_EACH_CORE_ON_DUMP (core, pmd_cores) {
+        pmd = dp_netdev_get_pmd(dp, core->core_id);
+        if (!pmd) {
+            pmd = xzalloc(sizeof *pmd);
             dp_netdev_configure_pmd(pmd, dp, core->core_id, core->numa_id);
-
             pmd->thread = ovs_thread_create("pmd", pmd_thread_main, pmd);
+            VLOG_INFO("PMD thread on numa_id: %d, core id: %2d created.",
+                      pmd->numa_id, pmd->core_id);
+            changed = true;
+        } else {
+            dp_netdev_pmd_unref(pmd);
         }
+    }
+
+    if (changed) {
+        struct ovs_numa_info_numa *numa;
 
         /* Log the number of pmd threads per numa node. */
         FOR_EACH_NUMA_ON_DUMP (numa, pmd_cores) {
-            VLOG_INFO("Created %"PRIuSIZE" pmd threads on numa node %d",
+            VLOG_INFO("There are %"PRIuSIZE" pmd threads on numa node %d",
                       numa->n_cores, numa->numa_id);
         }
     }
@@ -3362,19 +3416,6 @@  reconfigure_pmd_threads(struct dp_netdev *dp)
 }
 
 static void
-reload_affected_pmds(struct dp_netdev *dp)
-{
-    struct dp_netdev_pmd_thread *pmd;
-
-    CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-        if (pmd->need_reload) {
-            dp_netdev_reload_pmd__(pmd);
-            pmd->need_reload = false;
-        }
-    }
-}
-
-static void
 pmd_remove_stale_ports(struct dp_netdev *dp,
                        struct dp_netdev_pmd_thread *pmd)
     OVS_EXCLUDED(pmd->port_mutex)
@@ -3673,6 +3714,28 @@  pmd_load_cached_ports(struct dp_netdev_pmd_thread *pmd)
     }
 }
 
+static void
+pmd_alloc_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
+{
+    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
+    if (!id_pool_alloc_id(pmd->dp->tx_qid_pool, &pmd->static_tx_qid)) {
+        VLOG_ABORT("static_tx_qid allocation failed for PMD on core %2d"
+                   ", numa_id %d.", pmd->core_id, pmd->numa_id);
+    }
+    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
+
+    VLOG_DBG("static_tx_qid = %"PRIu32" allocated for PMD thread on core %2d"
+             ", numa_id %d.", pmd->static_tx_qid, pmd->core_id, pmd->numa_id);
+}
+
+static void
+pmd_free_static_tx_qid(struct dp_netdev_pmd_thread *pmd)
+{
+    ovs_mutex_lock(&pmd->dp->tx_qid_pool_mutex);
+    id_pool_free_id(pmd->dp->tx_qid_pool, pmd->static_tx_qid);
+    ovs_mutex_unlock(&pmd->dp->tx_qid_pool_mutex);
+}
+
 static int
 pmd_load_queues_and_ports(struct dp_netdev_pmd_thread *pmd,
                           struct polled_queue **ppoll_list)
@@ -3718,6 +3781,7 @@  pmd_thread_main(void *f_)
     dpdk_set_lcore_id(pmd->core_id);
     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
 reload:
+    pmd_alloc_static_tx_qid(pmd);
     emc_cache_init(&pmd->flow_cache);
 
     /* List port/core affinity */
@@ -3766,6 +3830,7 @@  reload:
     dp_netdev_pmd_reload_done(pmd);
 
     emc_cache_uninit(&pmd->flow_cache);
+    pmd_free_static_tx_qid(pmd);
 
     if (!exiting) {
         goto reload;
@@ -4176,8 +4241,6 @@  dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     pmd->numa_id = numa_id;
     pmd->need_reload = false;
 
-    *CONST_CAST(int *, &pmd->static_tx_qid) = cmap_count(&dp->poll_threads);
-
     ovs_refcount_init(&pmd->ref_cnt);
     latch_init(&pmd->exit_latch);
     pmd->reload_seq = seq_create();
@@ -4198,6 +4261,7 @@  dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
      * actual thread created for NON_PMD_CORE_ID. */
     if (core_id == NON_PMD_CORE_ID) {
         emc_cache_init(&pmd->flow_cache);
+        pmd_alloc_static_tx_qid(pmd);
     }
     cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
                 hash_int(core_id, 0));
@@ -4240,6 +4304,7 @@  dp_netdev_del_pmd(struct dp_netdev *dp, struct dp_netdev_pmd_thread *pmd)
         ovs_mutex_lock(&dp->non_pmd_mutex);
         emc_cache_uninit(&pmd->flow_cache);
         pmd_free_cached_ports(pmd);
+        pmd_free_static_tx_qid(pmd);
         ovs_mutex_unlock(&dp->non_pmd_mutex);
     } else {
         latch_set(&pmd->exit_latch);
diff --git a/tests/pmd.at b/tests/pmd.at
index 05755b3..39a3645 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -38,7 +38,7 @@  dnl
 dnl Whaits for creation of 'n_threads' or at least 1 thread if $1 not
 dnl passed. Checking starts from line number 'line' in ovs-vswithd.log .
 m4_define([CHECK_PMD_THREADS_CREATED], [
-    PATTERN="Created [[0-9]]* pmd threads on numa node $2"
+    PATTERN="There are [[0-9]]* pmd threads on numa node $2"
     line_st=$3
     if [[ -z "$line_st" ]]
     then