Message ID | 20191202160330.14413-1-david.marchand@redhat.com |
---|---|
State | Superseded |
Headers | show |
Series | [ovs-dev] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE. | expand |
On 02.12.2019 17:03, David Marchand wrote: > Most DPDK components make the assumption that rte_lcore_id() returns > a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of > the LCORE_ID_ANY special value). Do you think that makes a practical sense? In a real virtualization environments users are usually using cpus with lower numbers and it's most likely possible to change NUMA layout to have CPUs from all the nodes enumerated from the low CPU numbers. It's uncommon also to use all the CPUs for just OVS on a big system with huge number of cores. Another thing is can we really do this on a DPDK level? Maybe it'll be a step to dynamic registering/unregistering of non-EAL threads? Since you're wiping off the meaning of a lcore as a CPU core, it becomes just a thread_id in a DPDK point of view. So, maybe application could call a new API function instead of managing these strange mappings by itself? One comment inline (not a full review). Best regards, Ilya Maximets. > > OVS does not currently check which value is set in > RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK > side. > > Introduce a lcore allocator in OVS for PMD threads and map them to > unused lcores from DPDK à la --lcores. > > The physical cores on which the PMD threads are running still > constitutes an important information when debugging, so still keep > those in the PMD thread names but add a new debug log when starting > them. > > Synchronize DPDK internals on numa and cpuset for the PMD threads by > registering them via the rte_thread_set_affinity() helper. > > Signed-off-by: David Marchand <david.marchand@redhat.com> > --- > lib/dpdk-stub.c | 8 +++++- > lib/dpdk.c | 69 +++++++++++++++++++++++++++++++++++++++++++---- > lib/dpdk.h | 3 ++- > lib/dpif-netdev.c | 3 ++- > 4 files changed, 75 insertions(+), 8 deletions(-) > > diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c > index c332c217c..90473bc8e 100644 > --- a/lib/dpdk-stub.c > +++ b/lib/dpdk-stub.c > @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config) > } > > void > -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED) > +dpdk_init_thread_context(unsigned cpu OVS_UNUSED) > +{ > + /* Nothing */ > +} > + > +void > +dpdk_uninit_thread_context(void) > { > /* Nothing */ > } > diff --git a/lib/dpdk.c b/lib/dpdk.c > index 21dd47e80..771baa413 100644 > --- a/lib/dpdk.c > +++ b/lib/dpdk.c > @@ -33,6 +33,7 @@ > > #include "dirs.h" > #include "fatal-signal.h" > +#include "id-pool.h" > #include "netdev-dpdk.h" > #include "netdev-offload-provider.h" > #include "openvswitch/dynamic-string.h" > @@ -55,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization > * of DPDK. */ > static bool per_port_memory = false; /* Status of per port memory support */ > > +static struct id_pool *lcore_id_pool; > +static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER; > + > static int > process_vhost_flags(char *flag, const char *default_val, int size, > const struct smap *ovs_other_config, > @@ -346,7 +350,8 @@ dpdk_init__(const struct smap *ovs_other_config) > } > } > > - if (args_contains(&args, "-c") || args_contains(&args, "-l")) { > + if (args_contains(&args, "-c") || args_contains(&args, "-l") || > + args_contains(&args, "--lcores")) { > auto_determine = false; > } > > @@ -372,8 +377,8 @@ dpdk_init__(const struct smap *ovs_other_config) > * thread affintity - default to core #0 */ > VLOG_ERR("Thread getaffinity failed. Using core #0"); > } > - svec_add(&args, "-l"); > - svec_add_nocopy(&args, xasprintf("%d", cpu)); > + svec_add(&args, "--lcores"); > + svec_add_nocopy(&args, xasprintf("0@%d", cpu)); > } > > svec_terminate(&args); > @@ -429,6 +434,23 @@ dpdk_init__(const struct smap *ovs_other_config) > } > } > > + ovs_mutex_lock(&lcore_id_pool_mutex); > + lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE); > + /* Empty the whole pool... */ > + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { > + uint32_t lcore_id; > + > + id_pool_alloc_id(lcore_id_pool, &lcore_id); > + } > + /* ...and release the unused spots. */ > + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { > + if (rte_eal_lcore_role(lcore) != ROLE_OFF) { > + continue; > + } > + id_pool_free_id(lcore_id_pool, lcore); > + } > + ovs_mutex_unlock(&lcore_id_pool_mutex); > + > /* We are called from the main thread here */ > RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID; > > @@ -522,11 +544,48 @@ dpdk_available(void) > } > > void > -dpdk_set_lcore_id(unsigned cpu) > +dpdk_init_thread_context(unsigned cpu) > { > + cpu_set_t cpuset; > + unsigned lcore; > + int err; > + > /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */ > ovs_assert(cpu != NON_PMD_CORE_ID); > - RTE_PER_LCORE(_lcore_id) = cpu; > + > + ovs_mutex_lock(&lcore_id_pool_mutex); > + if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) { > + lcore = NON_PMD_CORE_ID; If PMD thread will get NON_PMD_CORE_ID it will smash the non-PMD mempool caches since it doesn't take a 'non_pmd_mutex'. > + } > + ovs_mutex_unlock(&lcore_id_pool_mutex); > + > + RTE_PER_LCORE(_lcore_id) = lcore; > + > + /* DPDK is not initialised, nothing more to do. */ > + if (lcore == NON_PMD_CORE_ID) { > + return; > + } > + > + CPU_ZERO(&cpuset); > + err = pthread_getaffinity_np(pthread_self(), sizeof cpuset, &cpuset); > + if (err) { > + VLOG_ABORT("Thread getaffinity error: %s", ovs_strerror(err)); > + } > + > + rte_thread_set_affinity(&cpuset); > + VLOG_INFO("Initialised lcore %u for core %u", lcore, cpu); > +} > + > +void > +dpdk_uninit_thread_context(void) > +{ > + if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) { > + return; > + } > + > + ovs_mutex_lock(&lcore_id_pool_mutex); > + id_pool_free_id(lcore_id_pool, RTE_PER_LCORE(_lcore_id)); > + ovs_mutex_unlock(&lcore_id_pool_mutex); > } > > void > diff --git a/lib/dpdk.h b/lib/dpdk.h > index 736a64279..404ac1a4b 100644 > --- a/lib/dpdk.h > +++ b/lib/dpdk.h > @@ -36,7 +36,8 @@ struct smap; > struct ovsrec_open_vswitch; > > void dpdk_init(const struct smap *ovs_other_config); > -void dpdk_set_lcore_id(unsigned cpu); > +void dpdk_init_thread_context(unsigned cpu); > +void dpdk_uninit_thread_context(void); > const char *dpdk_get_vhost_sock_dir(void); > bool dpdk_vhost_iommu_enabled(void); > bool dpdk_vhost_postcopy_enabled(void); > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 5142bad1d..c40031a78 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -5472,7 +5472,7 @@ pmd_thread_main(void *f_) > /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ > ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); > ovs_numa_thread_setaffinity_core(pmd->core_id); > - dpdk_set_lcore_id(pmd->core_id); > + dpdk_init_thread_context(pmd->core_id); > poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); > dfc_cache_init(&pmd->flow_cache); > pmd_alloc_static_tx_qid(pmd); > @@ -5592,6 +5592,7 @@ reload: > dfc_cache_uninit(&pmd->flow_cache); > free(poll_list); > pmd_free_cached_ports(pmd); > + dpdk_uninit_thread_context(); > return NULL; > } > >
On Tue, Dec 3, 2019 at 12:34 PM Ilya Maximets <i.maximets@ovn.org> wrote: > > On 02.12.2019 17:03, David Marchand wrote: > > Most DPDK components make the assumption that rte_lcore_id() returns > > a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of > > the LCORE_ID_ANY special value). > > Do you think that makes a practical sense? In a real virtualization > environments users are usually using cpus with lower numbers and it's > most likely possible to change NUMA layout to have CPUs from all the > nodes enumerated from the low CPU numbers. That is if you can reconfigure the NUMA layout. How do you achieve this? Bios/firmware configuration? Kernel boot options (did not find)? I imagine it would be a pain to enforce this on a fleet of servers with different hw specs. > It's uncommon also to use all the CPUs for just OVS on a big system > with huge number of cores. Yes, obviously, you don't want to use all your cores for OVS :-). > > Another thing is can we really do this on a DPDK level? Maybe it'll > be a step to dynamic registering/unregistering of non-EAL threads? I'd like to ultimately get to a dynamic register mechanism. But I first wanted to fix the existing code... > Since you're wiping off the meaning of a lcore as a CPU core, it > becomes just a thread_id in a DPDK point of view. So, maybe application > could call a new API function instead of managing these strange > mappings by itself? ... as it is unlikely we will backport an experimental API to previous dpdk versions. > > One comment inline (not a full review). > > Best regards, Ilya Maximets. > > > > > OVS does not currently check which value is set in > > RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK > > side. > > > > Introduce a lcore allocator in OVS for PMD threads and map them to > > unused lcores from DPDK à la --lcores. > > > > The physical cores on which the PMD threads are running still > > constitutes an important information when debugging, so still keep > > those in the PMD thread names but add a new debug log when starting > > them. > > > > Synchronize DPDK internals on numa and cpuset for the PMD threads by > > registering them via the rte_thread_set_affinity() helper. > > > > Signed-off-by: David Marchand <david.marchand@redhat.com> > > --- > > lib/dpdk-stub.c | 8 +++++- > > lib/dpdk.c | 69 +++++++++++++++++++++++++++++++++++++++++++---- > > lib/dpdk.h | 3 ++- > > lib/dpif-netdev.c | 3 ++- > > 4 files changed, 75 insertions(+), 8 deletions(-) > > > > diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c > > index c332c217c..90473bc8e 100644 > > --- a/lib/dpdk-stub.c > > +++ b/lib/dpdk-stub.c > > @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config) > > } > > > > void > > -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED) > > +dpdk_init_thread_context(unsigned cpu OVS_UNUSED) > > +{ > > + /* Nothing */ > > +} > > + > > +void > > +dpdk_uninit_thread_context(void) > > { > > /* Nothing */ > > } > > diff --git a/lib/dpdk.c b/lib/dpdk.c > > index 21dd47e80..771baa413 100644 > > --- a/lib/dpdk.c > > +++ b/lib/dpdk.c > > @@ -33,6 +33,7 @@ > > > > #include "dirs.h" > > #include "fatal-signal.h" > > +#include "id-pool.h" > > #include "netdev-dpdk.h" > > #include "netdev-offload-provider.h" > > #include "openvswitch/dynamic-string.h" > > @@ -55,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization > > * of DPDK. */ > > static bool per_port_memory = false; /* Status of per port memory support */ > > > > +static struct id_pool *lcore_id_pool; > > +static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER; > > + > > static int > > process_vhost_flags(char *flag, const char *default_val, int size, > > const struct smap *ovs_other_config, > > @@ -346,7 +350,8 @@ dpdk_init__(const struct smap *ovs_other_config) > > } > > } > > > > - if (args_contains(&args, "-c") || args_contains(&args, "-l")) { > > + if (args_contains(&args, "-c") || args_contains(&args, "-l") || > > + args_contains(&args, "--lcores")) { > > auto_determine = false; > > } > > > > @@ -372,8 +377,8 @@ dpdk_init__(const struct smap *ovs_other_config) > > * thread affintity - default to core #0 */ > > VLOG_ERR("Thread getaffinity failed. Using core #0"); > > } > > - svec_add(&args, "-l"); > > - svec_add_nocopy(&args, xasprintf("%d", cpu)); > > + svec_add(&args, "--lcores"); > > + svec_add_nocopy(&args, xasprintf("0@%d", cpu)); > > } > > > > svec_terminate(&args); > > @@ -429,6 +434,23 @@ dpdk_init__(const struct smap *ovs_other_config) > > } > > } > > > > + ovs_mutex_lock(&lcore_id_pool_mutex); > > + lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE); > > + /* Empty the whole pool... */ > > + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { > > + uint32_t lcore_id; > > + > > + id_pool_alloc_id(lcore_id_pool, &lcore_id); > > + } > > + /* ...and release the unused spots. */ > > + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { > > + if (rte_eal_lcore_role(lcore) != ROLE_OFF) { > > + continue; > > + } > > + id_pool_free_id(lcore_id_pool, lcore); > > + } > > + ovs_mutex_unlock(&lcore_id_pool_mutex); > > + > > /* We are called from the main thread here */ > > RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID; > > > > @@ -522,11 +544,48 @@ dpdk_available(void) > > } > > > > void > > -dpdk_set_lcore_id(unsigned cpu) > > +dpdk_init_thread_context(unsigned cpu) > > { > > + cpu_set_t cpuset; > > + unsigned lcore; > > + int err; > > + > > /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */ > > ovs_assert(cpu != NON_PMD_CORE_ID); > > - RTE_PER_LCORE(_lcore_id) = cpu; > > + > > + ovs_mutex_lock(&lcore_id_pool_mutex); > > + if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) { > > + lcore = NON_PMD_CORE_ID; > > If PMD thread will get NON_PMD_CORE_ID it will smash the non-PMD > mempool caches since it doesn't take a 'non_pmd_mutex'. Ok, need to look at this again. I missed something. I don't understand how you would create PMD threads with cpu != NON_PMD_CORE_ID without dpdk enabled but still call the mempool code. -- David Marchand
On Mon, Dec 02, 2019 at 05:03:30PM +0100, David Marchand wrote: > Most DPDK components make the assumption that rte_lcore_id() returns > a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of > the LCORE_ID_ANY special value). > > OVS does not currently check which value is set in > RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK > side. > > Introduce a lcore allocator in OVS for PMD threads and map them to > unused lcores from DPDK à la --lcores. > > The physical cores on which the PMD threads are running still > constitutes an important information when debugging, so still keep > those in the PMD thread names but add a new debug log when starting > them. > > Synchronize DPDK internals on numa and cpuset for the PMD threads by > registering them via the rte_thread_set_affinity() helper. > > Signed-off-by: David Marchand <david.marchand@redhat.com> > --- I liked the idea because maybe we could lower RTE_MAX_LCORE and save some memory. It's unusual to require the default amount anyways. I changed RTE_MAX_LCORE to 4, and ran some tests with CPU 8: 2019-12-04T18:27:22.797Z|00001|dpdk(pmd-c08/id:3)|INFO|Initialised lcore 1 for core 8 I didn't find any issues with either the patch or with the tests. Acked-by: Flavio Leitner <fbl@sysclose.org> Thanks fbl
On Wed, Dec 4, 2019 at 8:52 PM Flavio Leitner <fbl@sysclose.org> wrote: > > On Mon, Dec 02, 2019 at 05:03:30PM +0100, David Marchand wrote: > > Most DPDK components make the assumption that rte_lcore_id() returns > > a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of > > the LCORE_ID_ANY special value). > > > > OVS does not currently check which value is set in > > RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK > > side. > > > > Introduce a lcore allocator in OVS for PMD threads and map them to > > unused lcores from DPDK à la --lcores. > > > > The physical cores on which the PMD threads are running still > > constitutes an important information when debugging, so still keep > > those in the PMD thread names but add a new debug log when starting > > them. > > > > Synchronize DPDK internals on numa and cpuset for the PMD threads by > > registering them via the rte_thread_set_affinity() helper. > > > > Signed-off-by: David Marchand <david.marchand@redhat.com> > > --- > > I liked the idea because maybe we could lower RTE_MAX_LCORE and save > some memory. It's unusual to require the default amount anyways. About decreasing RTE_MAX_LCORE, I had that in mind, but the problem is existing deployments with -c/-l in ovsdb. We could try to parse this, reformat and only pass --lcores to dpdk. Still feels a bit dangerous. > > I changed RTE_MAX_LCORE to 4, and ran some tests with CPU 8: > 2019-12-04T18:27:22.797Z|00001|dpdk(pmd-c08/id:3)|INFO|Initialised lcore 1 for core 8 > > I didn't find any issues with either the patch or with the tests. > > Acked-by: Flavio Leitner <fbl@sysclose.org> Thanks for the review and test!
On Wed, Dec 04, 2019 at 09:09:42PM +0100, David Marchand wrote: > On Wed, Dec 4, 2019 at 8:52 PM Flavio Leitner <fbl@sysclose.org> wrote: > > > > On Mon, Dec 02, 2019 at 05:03:30PM +0100, David Marchand wrote: > > > Most DPDK components make the assumption that rte_lcore_id() returns > > > a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of > > > the LCORE_ID_ANY special value). > > > > > > OVS does not currently check which value is set in > > > RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK > > > side. > > > > > > Introduce a lcore allocator in OVS for PMD threads and map them to > > > unused lcores from DPDK à la --lcores. > > > > > > The physical cores on which the PMD threads are running still > > > constitutes an important information when debugging, so still keep > > > those in the PMD thread names but add a new debug log when starting > > > them. > > > > > > Synchronize DPDK internals on numa and cpuset for the PMD threads by > > > registering them via the rte_thread_set_affinity() helper. > > > > > > Signed-off-by: David Marchand <david.marchand@redhat.com> > > > --- > > > > I liked the idea because maybe we could lower RTE_MAX_LCORE and save > > some memory. It's unusual to require the default amount anyways. > > About decreasing RTE_MAX_LCORE, I had that in mind, but the problem is > existing deployments with -c/-l in ovsdb. > We could try to parse this, reformat and only pass --lcores to dpdk. > Still feels a bit dangerous. Always the legacy stuff... :) > > I changed RTE_MAX_LCORE to 4, and ran some tests with CPU 8: > > 2019-12-04T18:27:22.797Z|00001|dpdk(pmd-c08/id:3)|INFO|Initialised lcore 1 for core 8 > > > > I didn't find any issues with either the patch or with the tests. > > > > Acked-by: Flavio Leitner <fbl@sysclose.org> > > Thanks for the review and test! BTW, it would be nice to have an ovs-appctl command to dump a list of IDs and CPUs in case the logs are rotated, config changed and whatnot. For instance, sos could get an accurate report dumping a list.
On Wed, Dec 4, 2019 at 9:21 PM Flavio Leitner <fbl@sysclose.org> wrote: > BTW, it would be nice to have an ovs-appctl command to dump a list > of IDs and CPUs in case the logs are rotated, config changed and > whatnot. For instance, sos could get an accurate report dumping a > list. Good idea. I will look at this once I get to the other point raised by Ilya. Thanks. -- David Marchand
On 03.12.2019 14:00, David Marchand wrote: > On Tue, Dec 3, 2019 at 12:34 PM Ilya Maximets <i.maximets@ovn.org> wrote: >> >> On 02.12.2019 17:03, David Marchand wrote: >>> Most DPDK components make the assumption that rte_lcore_id() returns >>> a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of >>> the LCORE_ID_ANY special value). >> >> Do you think that makes a practical sense? In a real virtualization >> environments users are usually using cpus with lower numbers and it's >> most likely possible to change NUMA layout to have CPUs from all the >> nodes enumerated from the low CPU numbers. > > That is if you can reconfigure the NUMA layout. > How do you achieve this? > Bios/firmware configuration? It's usually a BIOS configuration. Modern servers provides configurations for 2-4 types of core enumeration along with choosing NUMA topology. > Kernel boot options (did not find)? > > I imagine it would be a pain to enforce this on a fleet of servers > with different hw specs. From my experience, you almost never could just take a new server and run your virtualization software. You'll have to enable some HW capabilities or enable "Highly Optimized Virtualization Configuration Set" in BIOS anyway. > > >> It's uncommon also to use all the CPUs for just OVS on a big system >> with huge number of cores. > > Yes, obviously, you don't want to use all your cores for OVS :-). > > >> >> Another thing is can we really do this on a DPDK level? Maybe it'll >> be a step to dynamic registering/unregistering of non-EAL threads? > > I'd like to ultimately get to a dynamic register mechanism. > But I first wanted to fix the existing code... > > >> Since you're wiping off the meaning of a lcore as a CPU core, it >> becomes just a thread_id in a DPDK point of view. So, maybe application >> could call a new API function instead of managing these strange >> mappings by itself? > > ... as it is unlikely we will backport an experimental API to previous > dpdk versions. > > >> >> One comment inline (not a full review). >> >> Best regards, Ilya Maximets. >> >>> >>> OVS does not currently check which value is set in >>> RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK >>> side. >>> >>> Introduce a lcore allocator in OVS for PMD threads and map them to >>> unused lcores from DPDK à la --lcores. >>> >>> The physical cores on which the PMD threads are running still >>> constitutes an important information when debugging, so still keep >>> those in the PMD thread names but add a new debug log when starting >>> them. >>> >>> Synchronize DPDK internals on numa and cpuset for the PMD threads by >>> registering them via the rte_thread_set_affinity() helper. >>> >>> Signed-off-by: David Marchand <david.marchand@redhat.com> >>> --- >>> lib/dpdk-stub.c | 8 +++++- >>> lib/dpdk.c | 69 +++++++++++++++++++++++++++++++++++++++++++---- >>> lib/dpdk.h | 3 ++- >>> lib/dpif-netdev.c | 3 ++- >>> 4 files changed, 75 insertions(+), 8 deletions(-) >>> >>> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c >>> index c332c217c..90473bc8e 100644 >>> --- a/lib/dpdk-stub.c >>> +++ b/lib/dpdk-stub.c >>> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config) >>> } >>> >>> void >>> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED) >>> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED) >>> +{ >>> + /* Nothing */ >>> +} >>> + >>> +void >>> +dpdk_uninit_thread_context(void) >>> { >>> /* Nothing */ >>> } >>> diff --git a/lib/dpdk.c b/lib/dpdk.c >>> index 21dd47e80..771baa413 100644 >>> --- a/lib/dpdk.c >>> +++ b/lib/dpdk.c >>> @@ -33,6 +33,7 @@ >>> >>> #include "dirs.h" >>> #include "fatal-signal.h" >>> +#include "id-pool.h" >>> #include "netdev-dpdk.h" >>> #include "netdev-offload-provider.h" >>> #include "openvswitch/dynamic-string.h" >>> @@ -55,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization >>> * of DPDK. */ >>> static bool per_port_memory = false; /* Status of per port memory support */ >>> >>> +static struct id_pool *lcore_id_pool; OVS_GUARDED_BY() would be nice. >>> +static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER; >>> + >>> static int >>> process_vhost_flags(char *flag, const char *default_val, int size, >>> const struct smap *ovs_other_config, >>> @@ -346,7 +350,8 @@ dpdk_init__(const struct smap *ovs_other_config) >>> } >>> } >>> >>> - if (args_contains(&args, "-c") || args_contains(&args, "-l")) { >>> + if (args_contains(&args, "-c") || args_contains(&args, "-l") || >>> + args_contains(&args, "--lcores")) { >>> auto_determine = false; >>> } >>> >>> @@ -372,8 +377,8 @@ dpdk_init__(const struct smap *ovs_other_config) >>> * thread affintity - default to core #0 */ >>> VLOG_ERR("Thread getaffinity failed. Using core #0"); >>> } >>> - svec_add(&args, "-l"); >>> - svec_add_nocopy(&args, xasprintf("%d", cpu)); >>> + svec_add(&args, "--lcores"); >>> + svec_add_nocopy(&args, xasprintf("0@%d", cpu)); >>> } >>> >>> svec_terminate(&args); >>> @@ -429,6 +434,23 @@ dpdk_init__(const struct smap *ovs_other_config) >>> } >>> } >>> >>> + ovs_mutex_lock(&lcore_id_pool_mutex); >>> + lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE); >>> + /* Empty the whole pool... */ >>> + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { >>> + uint32_t lcore_id; >>> + >>> + id_pool_alloc_id(lcore_id_pool, &lcore_id); >>> + } >>> + /* ...and release the unused spots. */ >>> + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { >>> + if (rte_eal_lcore_role(lcore) != ROLE_OFF) { >>> + continue; >>> + } >>> + id_pool_free_id(lcore_id_pool, lcore); >>> + } >>> + ovs_mutex_unlock(&lcore_id_pool_mutex); >>> + >>> /* We are called from the main thread here */ >>> RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID; >>> >>> @@ -522,11 +544,48 @@ dpdk_available(void) >>> } >>> >>> void >>> -dpdk_set_lcore_id(unsigned cpu) >>> +dpdk_init_thread_context(unsigned cpu) >>> { >>> + cpu_set_t cpuset; I think it should be rte_cpuset_t. For portability and also to follow DPDK API. >>> + unsigned lcore; >>> + int err; >>> + >>> /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */ >>> ovs_assert(cpu != NON_PMD_CORE_ID); >>> - RTE_PER_LCORE(_lcore_id) = cpu; >>> + >>> + ovs_mutex_lock(&lcore_id_pool_mutex); >>> + if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) { >>> + lcore = NON_PMD_CORE_ID; >> >> If PMD thread will get NON_PMD_CORE_ID it will smash the non-PMD >> mempool caches since it doesn't take a 'non_pmd_mutex'. > > Ok, need to look at this again. > I missed something. > I don't understand how you would create PMD threads with cpu != > NON_PMD_CORE_ID without dpdk enabled but still call the mempool code. I meant that if you'll try to create more PMD threads than RTE_MAX_LCORE. For example, if RTE_MAX_LCORE == 32 and you want to create 40 PMD threads (assuming your system has 64 CPUs), 8 PMD threads and non-PMD thread(s) will have lcore == NON_PMD_CORE_ID == LCORE_ID_ANY. I re-checked and this will not cause sharing of the same mempool caches because threads with LCORE_ID_ANY doesn't use it, however I think that simultaneous calls of DPDK API might still be somehow dangerous. Anyway, performance will suffer. At least, we should warn about that. > + } > + ovs_mutex_unlock(&lcore_id_pool_mutex); > + > + RTE_PER_LCORE(_lcore_id) = lcore; > + > + /* DPDK is not initialised, nothing more to do. */ > + if (lcore == NON_PMD_CORE_ID) { > + return; > + } > + > + CPU_ZERO(&cpuset); > + err = pthread_getaffinity_np(pthread_self(), sizeof cpuset, &cpuset); We know the core already. No need to make additional syscall. We could just set it with CPU_SET(). This will also help with portability in the future. > + if (err) { > + VLOG_ABORT("Thread getaffinity error: %s", ovs_strerror(err)); > + } > + > + rte_thread_set_affinity(&cpuset); > + VLOG_INFO("Initialised lcore %u for core %u", lcore, cpu); > +} > + > +void > +dpdk_uninit_thread_context(void) > +{ > + if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) { > + return; > + } > + > + ovs_mutex_lock(&lcore_id_pool_mutex); > + id_pool_free_id(lcore_id_pool, RTE_PER_LCORE(_lcore_id)); > + ovs_mutex_unlock(&lcore_id_pool_mutex); > } > > void > diff --git a/lib/dpdk.h b/lib/dpdk.h > index 736a64279..404ac1a4b 100644 > --- a/lib/dpdk.h > +++ b/lib/dpdk.h > @@ -36,7 +36,8 @@ struct smap; > struct ovsrec_open_vswitch; > > void dpdk_init(const struct smap *ovs_other_config); > -void dpdk_set_lcore_id(unsigned cpu); > +void dpdk_init_thread_context(unsigned cpu); > +void dpdk_uninit_thread_context(void); > const char *dpdk_get_vhost_sock_dir(void); > bool dpdk_vhost_iommu_enabled(void); > bool dpdk_vhost_postcopy_enabled(void); > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 5142bad1d..c40031a78 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -5472,7 +5472,7 @@ pmd_thread_main(void *f_) > /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ > ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); > ovs_numa_thread_setaffinity_core(pmd->core_id); > - dpdk_set_lcore_id(pmd->core_id); > + dpdk_init_thread_context(pmd->core_id); > poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); > dfc_cache_init(&pmd->flow_cache); > pmd_alloc_static_tx_qid(pmd); > @@ -5592,6 +5592,7 @@ reload: > dfc_cache_uninit(&pmd->flow_cache); > free(poll_list); > pmd_free_cached_ports(pmd); > + dpdk_uninit_thread_context(); > return NULL; > }
diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c index c332c217c..90473bc8e 100644 --- a/lib/dpdk-stub.c +++ b/lib/dpdk-stub.c @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config) } void -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED) +dpdk_init_thread_context(unsigned cpu OVS_UNUSED) +{ + /* Nothing */ +} + +void +dpdk_uninit_thread_context(void) { /* Nothing */ } diff --git a/lib/dpdk.c b/lib/dpdk.c index 21dd47e80..771baa413 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -33,6 +33,7 @@ #include "dirs.h" #include "fatal-signal.h" +#include "id-pool.h" #include "netdev-dpdk.h" #include "netdev-offload-provider.h" #include "openvswitch/dynamic-string.h" @@ -55,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization * of DPDK. */ static bool per_port_memory = false; /* Status of per port memory support */ +static struct id_pool *lcore_id_pool; +static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER; + static int process_vhost_flags(char *flag, const char *default_val, int size, const struct smap *ovs_other_config, @@ -346,7 +350,8 @@ dpdk_init__(const struct smap *ovs_other_config) } } - if (args_contains(&args, "-c") || args_contains(&args, "-l")) { + if (args_contains(&args, "-c") || args_contains(&args, "-l") || + args_contains(&args, "--lcores")) { auto_determine = false; } @@ -372,8 +377,8 @@ dpdk_init__(const struct smap *ovs_other_config) * thread affintity - default to core #0 */ VLOG_ERR("Thread getaffinity failed. Using core #0"); } - svec_add(&args, "-l"); - svec_add_nocopy(&args, xasprintf("%d", cpu)); + svec_add(&args, "--lcores"); + svec_add_nocopy(&args, xasprintf("0@%d", cpu)); } svec_terminate(&args); @@ -429,6 +434,23 @@ dpdk_init__(const struct smap *ovs_other_config) } } + ovs_mutex_lock(&lcore_id_pool_mutex); + lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE); + /* Empty the whole pool... */ + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { + uint32_t lcore_id; + + id_pool_alloc_id(lcore_id_pool, &lcore_id); + } + /* ...and release the unused spots. */ + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { + if (rte_eal_lcore_role(lcore) != ROLE_OFF) { + continue; + } + id_pool_free_id(lcore_id_pool, lcore); + } + ovs_mutex_unlock(&lcore_id_pool_mutex); + /* We are called from the main thread here */ RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID; @@ -522,11 +544,48 @@ dpdk_available(void) } void -dpdk_set_lcore_id(unsigned cpu) +dpdk_init_thread_context(unsigned cpu) { + cpu_set_t cpuset; + unsigned lcore; + int err; + /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */ ovs_assert(cpu != NON_PMD_CORE_ID); - RTE_PER_LCORE(_lcore_id) = cpu; + + ovs_mutex_lock(&lcore_id_pool_mutex); + if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) { + lcore = NON_PMD_CORE_ID; + } + ovs_mutex_unlock(&lcore_id_pool_mutex); + + RTE_PER_LCORE(_lcore_id) = lcore; + + /* DPDK is not initialised, nothing more to do. */ + if (lcore == NON_PMD_CORE_ID) { + return; + } + + CPU_ZERO(&cpuset); + err = pthread_getaffinity_np(pthread_self(), sizeof cpuset, &cpuset); + if (err) { + VLOG_ABORT("Thread getaffinity error: %s", ovs_strerror(err)); + } + + rte_thread_set_affinity(&cpuset); + VLOG_INFO("Initialised lcore %u for core %u", lcore, cpu); +} + +void +dpdk_uninit_thread_context(void) +{ + if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) { + return; + } + + ovs_mutex_lock(&lcore_id_pool_mutex); + id_pool_free_id(lcore_id_pool, RTE_PER_LCORE(_lcore_id)); + ovs_mutex_unlock(&lcore_id_pool_mutex); } void diff --git a/lib/dpdk.h b/lib/dpdk.h index 736a64279..404ac1a4b 100644 --- a/lib/dpdk.h +++ b/lib/dpdk.h @@ -36,7 +36,8 @@ struct smap; struct ovsrec_open_vswitch; void dpdk_init(const struct smap *ovs_other_config); -void dpdk_set_lcore_id(unsigned cpu); +void dpdk_init_thread_context(unsigned cpu); +void dpdk_uninit_thread_context(void); const char *dpdk_get_vhost_sock_dir(void); bool dpdk_vhost_iommu_enabled(void); bool dpdk_vhost_postcopy_enabled(void); diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 5142bad1d..c40031a78 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -5472,7 +5472,7 @@ pmd_thread_main(void *f_) /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); ovs_numa_thread_setaffinity_core(pmd->core_id); - dpdk_set_lcore_id(pmd->core_id); + dpdk_init_thread_context(pmd->core_id); poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); dfc_cache_init(&pmd->flow_cache); pmd_alloc_static_tx_qid(pmd); @@ -5592,6 +5592,7 @@ reload: dfc_cache_uninit(&pmd->flow_cache); free(poll_list); pmd_free_cached_ports(pmd); + dpdk_uninit_thread_context(); return NULL; }
Most DPDK components make the assumption that rte_lcore_id() returns a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of the LCORE_ID_ANY special value). OVS does not currently check which value is set in RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK side. Introduce a lcore allocator in OVS for PMD threads and map them to unused lcores from DPDK à la --lcores. The physical cores on which the PMD threads are running still constitutes an important information when debugging, so still keep those in the PMD thread names but add a new debug log when starting them. Synchronize DPDK internals on numa and cpuset for the PMD threads by registering them via the rte_thread_set_affinity() helper. Signed-off-by: David Marchand <david.marchand@redhat.com> --- lib/dpdk-stub.c | 8 +++++- lib/dpdk.c | 69 +++++++++++++++++++++++++++++++++++++++++++---- lib/dpdk.h | 3 ++- lib/dpif-netdev.c | 3 ++- 4 files changed, 75 insertions(+), 8 deletions(-)