
[ovs-dev] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE.

Message ID 20191202160330.14413-1-david.marchand@redhat.com
State Superseded
Series [ovs-dev] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE.

Commit Message

David Marchand Dec. 2, 2019, 4:03 p.m. UTC
Most DPDK components make the assumption that rte_lcore_id() returns
a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of
the LCORE_ID_ANY special value).

OVS does not currently check which value is set in
RTE_PER_LCORE(_lcore_id), which exposes us to potential crashes on the
DPDK side.

Introduce a lcore allocator in OVS for PMD threads and map them to
unused lcores from DPDK à la --lcores.

The physical cores on which the PMD threads are running still
constitute important information when debugging, so keep those in the
PMD thread names, but add a new debug log when starting them.

Synchronize DPDK internals on NUMA and cpuset for the PMD threads by
registering them via the rte_thread_set_affinity() helper.

Signed-off-by: David Marchand <david.marchand@redhat.com>
---
 lib/dpdk-stub.c   |  8 +++++-
 lib/dpdk.c        | 69 +++++++++++++++++++++++++++++++++++++++++++----
 lib/dpdk.h        |  3 ++-
 lib/dpif-netdev.c |  3 ++-
 4 files changed, 75 insertions(+), 8 deletions(-)

Comments

Ilya Maximets Dec. 3, 2019, 11:34 a.m. UTC | #1
On 02.12.2019 17:03, David Marchand wrote:
> Most DPDK components make the assumption that rte_lcore_id() returns
> a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of
> the LCORE_ID_ANY special value).

Do you think that makes practical sense?  In real virtualization
environments users usually use CPUs with lower numbers, and it's
most likely possible to change the NUMA layout to have CPUs from all
the nodes enumerated from the low CPU numbers.

It's also uncommon to use all the CPUs just for OVS on a big system
with a huge number of cores.

Another thing: can we really do this at the DPDK level?  Maybe it'll
be a step towards dynamic registering/unregistering of non-EAL threads?
Since you're wiping out the meaning of a lcore as a CPU core, it
becomes just a thread_id from DPDK's point of view.  So maybe the
application could call a new API function instead of managing these
strange mappings by itself?
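Something along these lines, purely as a sketch (these helpers are
hypothetical and do not exist in the DPDK releases discussed here):

    /* Hypothetical EAL helpers: assign a free lcore id to the calling
     * non-EAL thread and release it again, so the application does not
     * manage the mapping itself. */
    int rte_thread_register(void);      /* 0 on success, assigns a free
                                         * lcore id to the caller. */
    void rte_thread_unregister(void);   /* Releases the caller's lcore id. */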

One comment inline (not a full review).

Best regards, Ilya Maximets.

> 
> OVS does not currently check which value is set in
> RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK
> side.
> 
> Introduce a lcore allocator in OVS for PMD threads and map them to
> unused lcores from DPDK à la --lcores.
> 
> The physical cores on which the PMD threads are running still
> constitutes an important information when debugging, so still keep
> those in the PMD thread names but add a new debug log when starting
> them.
> 
> Synchronize DPDK internals on numa and cpuset for the PMD threads by
> registering them via the rte_thread_set_affinity() helper.
> 
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---
>  lib/dpdk-stub.c   |  8 +++++-
>  lib/dpdk.c        | 69 +++++++++++++++++++++++++++++++++++++++++++----
>  lib/dpdk.h        |  3 ++-
>  lib/dpif-netdev.c |  3 ++-
>  4 files changed, 75 insertions(+), 8 deletions(-)
> 
> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
> index c332c217c..90473bc8e 100644
> --- a/lib/dpdk-stub.c
> +++ b/lib/dpdk-stub.c
> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
> +{
> +    /* Nothing */
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
>  {
>      /* Nothing */
>  }
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 21dd47e80..771baa413 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -33,6 +33,7 @@
>  
>  #include "dirs.h"
>  #include "fatal-signal.h"
> +#include "id-pool.h"
>  #include "netdev-dpdk.h"
>  #include "netdev-offload-provider.h"
>  #include "openvswitch/dynamic-string.h"
> @@ -55,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
>                                         * of DPDK. */
>  static bool per_port_memory = false; /* Status of per port memory support */
>  
> +static struct id_pool *lcore_id_pool;
> +static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER;
> +
>  static int
>  process_vhost_flags(char *flag, const char *default_val, int size,
>                      const struct smap *ovs_other_config,
> @@ -346,7 +350,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>          }
>      }
>  
> -    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
> +    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
> +        args_contains(&args, "--lcores")) {
>          auto_determine = false;
>      }
>  
> @@ -372,8 +377,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>               * thread affintity - default to core #0 */
>              VLOG_ERR("Thread getaffinity failed. Using core #0");
>          }
> -        svec_add(&args, "-l");
> -        svec_add_nocopy(&args, xasprintf("%d", cpu));
> +        svec_add(&args, "--lcores");
> +        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
>      }
>  
>      svec_terminate(&args);
> @@ -429,6 +434,23 @@ dpdk_init__(const struct smap *ovs_other_config)
>          }
>      }
>  
> +    ovs_mutex_lock(&lcore_id_pool_mutex);
> +    lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE);
> +    /* Empty the whole pool... */
> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
> +        uint32_t lcore_id;
> +
> +        id_pool_alloc_id(lcore_id_pool, &lcore_id);
> +    }
> +    /* ...and release the unused spots. */
> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
> +        if (rte_eal_lcore_role(lcore) != ROLE_OFF) {
> +             continue;
> +        }
> +        id_pool_free_id(lcore_id_pool, lcore);
> +    }
> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
> +
>      /* We are called from the main thread here */
>      RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
>  
> @@ -522,11 +544,48 @@ dpdk_available(void)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu)
> +dpdk_init_thread_context(unsigned cpu)
>  {
> +    cpu_set_t cpuset;
> +    unsigned lcore;
> +    int err;
> +
>      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>      ovs_assert(cpu != NON_PMD_CORE_ID);
> -    RTE_PER_LCORE(_lcore_id) = cpu;
> +
> +    ovs_mutex_lock(&lcore_id_pool_mutex);
> +    if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) {
> +        lcore = NON_PMD_CORE_ID;

If a PMD thread gets NON_PMD_CORE_ID, it will smash the non-PMD
mempool caches, since it doesn't take 'non_pmd_mutex'.

> +    }
> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
> +
> +    RTE_PER_LCORE(_lcore_id) = lcore;
> +
> +    /* DPDK is not initialised, nothing more to do. */
> +    if (lcore == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    CPU_ZERO(&cpuset);
> +    err = pthread_getaffinity_np(pthread_self(), sizeof cpuset, &cpuset);
> +    if (err) {
> +        VLOG_ABORT("Thread getaffinity error: %s", ovs_strerror(err));
> +    }
> +
> +    rte_thread_set_affinity(&cpuset);
> +    VLOG_INFO("Initialised lcore %u for core %u", lcore, cpu);
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
> +{
> +    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    ovs_mutex_lock(&lcore_id_pool_mutex);
> +    id_pool_free_id(lcore_id_pool, RTE_PER_LCORE(_lcore_id));
> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
>  }
>  
>  void
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 736a64279..404ac1a4b 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -36,7 +36,8 @@ struct smap;
>  struct ovsrec_open_vswitch;
>  
>  void dpdk_init(const struct smap *ovs_other_config);
> -void dpdk_set_lcore_id(unsigned cpu);
> +void dpdk_init_thread_context(unsigned cpu);
> +void dpdk_uninit_thread_context(void);
>  const char *dpdk_get_vhost_sock_dir(void);
>  bool dpdk_vhost_iommu_enabled(void);
>  bool dpdk_vhost_postcopy_enabled(void);
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 5142bad1d..c40031a78 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5472,7 +5472,7 @@ pmd_thread_main(void *f_)
>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>      ovs_numa_thread_setaffinity_core(pmd->core_id);
> -    dpdk_set_lcore_id(pmd->core_id);
> +    dpdk_init_thread_context(pmd->core_id);
>      poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>      dfc_cache_init(&pmd->flow_cache);
>      pmd_alloc_static_tx_qid(pmd);
> @@ -5592,6 +5592,7 @@ reload:
>      dfc_cache_uninit(&pmd->flow_cache);
>      free(poll_list);
>      pmd_free_cached_ports(pmd);
> +    dpdk_uninit_thread_context();
>      return NULL;
>  }
>  
>
David Marchand Dec. 3, 2019, 1 p.m. UTC | #2
On Tue, Dec 3, 2019 at 12:34 PM Ilya Maximets <i.maximets@ovn.org> wrote:
>
> On 02.12.2019 17:03, David Marchand wrote:
> > Most DPDK components make the assumption that rte_lcore_id() returns
> > a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of
> > the LCORE_ID_ANY special value).
>
> Do you think that makes a practical sense?  In a real virtualization
> environments users are usually using cpus with lower numbers and it's
> most likely possible to change NUMA layout to have CPUs from all the
> nodes enumerated from the low CPU numbers.

That is, if you can reconfigure the NUMA layout.
How do you achieve this?
BIOS/firmware configuration?
Kernel boot options (I did not find any)?

I imagine it would be a pain to enforce this on a fleet of servers
with different hw specs.


> It's uncommon also to use all the CPUs for just OVS on a big system
> with huge number of cores.

Yes, obviously, you don't want to use all your cores for OVS :-).


>
> Another thing is can we really do this on a DPDK level?  Maybe it'll
> be a step to dynamic registering/unregistering of non-EAL threads?

I'd like to ultimately get to a dynamic register mechanism.
But I first wanted to fix the existing code...


> Since you're wiping off the meaning of a lcore as a CPU core, it
> becomes just a thread_id in a DPDK point of view.  So, maybe application
> could call a new API function instead of managing these strange
> mappings by itself?

... as it is unlikely we would backport an experimental API to previous
DPDK versions.


>
> One comment inline (not a full review).
>
> Best regards, Ilya Maximets.
>
> >
> > OVS does not currently check which value is set in
> > RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK
> > side.
> >
> > Introduce a lcore allocator in OVS for PMD threads and map them to
> > unused lcores from DPDK à la --lcores.
> >
> > The physical cores on which the PMD threads are running still
> > constitutes an important information when debugging, so still keep
> > those in the PMD thread names but add a new debug log when starting
> > them.
> >
> > Synchronize DPDK internals on numa and cpuset for the PMD threads by
> > registering them via the rte_thread_set_affinity() helper.
> >
> > Signed-off-by: David Marchand <david.marchand@redhat.com>
> > ---
> >  lib/dpdk-stub.c   |  8 +++++-
> >  lib/dpdk.c        | 69 +++++++++++++++++++++++++++++++++++++++++++----
> >  lib/dpdk.h        |  3 ++-
> >  lib/dpif-netdev.c |  3 ++-
> >  4 files changed, 75 insertions(+), 8 deletions(-)
> >
> > diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
> > index c332c217c..90473bc8e 100644
> > --- a/lib/dpdk-stub.c
> > +++ b/lib/dpdk-stub.c
> > @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
> >  }
> >
> >  void
> > -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
> > +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
> > +{
> > +    /* Nothing */
> > +}
> > +
> > +void
> > +dpdk_uninit_thread_context(void)
> >  {
> >      /* Nothing */
> >  }
> > diff --git a/lib/dpdk.c b/lib/dpdk.c
> > index 21dd47e80..771baa413 100644
> > --- a/lib/dpdk.c
> > +++ b/lib/dpdk.c
> > @@ -33,6 +33,7 @@
> >
> >  #include "dirs.h"
> >  #include "fatal-signal.h"
> > +#include "id-pool.h"
> >  #include "netdev-dpdk.h"
> >  #include "netdev-offload-provider.h"
> >  #include "openvswitch/dynamic-string.h"
> > @@ -55,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
> >                                         * of DPDK. */
> >  static bool per_port_memory = false; /* Status of per port memory support */
> >
> > +static struct id_pool *lcore_id_pool;
> > +static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER;
> > +
> >  static int
> >  process_vhost_flags(char *flag, const char *default_val, int size,
> >                      const struct smap *ovs_other_config,
> > @@ -346,7 +350,8 @@ dpdk_init__(const struct smap *ovs_other_config)
> >          }
> >      }
> >
> > -    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
> > +    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
> > +        args_contains(&args, "--lcores")) {
> >          auto_determine = false;
> >      }
> >
> > @@ -372,8 +377,8 @@ dpdk_init__(const struct smap *ovs_other_config)
> >               * thread affintity - default to core #0 */
> >              VLOG_ERR("Thread getaffinity failed. Using core #0");
> >          }
> > -        svec_add(&args, "-l");
> > -        svec_add_nocopy(&args, xasprintf("%d", cpu));
> > +        svec_add(&args, "--lcores");
> > +        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
> >      }
> >
> >      svec_terminate(&args);
> > @@ -429,6 +434,23 @@ dpdk_init__(const struct smap *ovs_other_config)
> >          }
> >      }
> >
> > +    ovs_mutex_lock(&lcore_id_pool_mutex);
> > +    lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE);
> > +    /* Empty the whole pool... */
> > +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
> > +        uint32_t lcore_id;
> > +
> > +        id_pool_alloc_id(lcore_id_pool, &lcore_id);
> > +    }
> > +    /* ...and release the unused spots. */
> > +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
> > +        if (rte_eal_lcore_role(lcore) != ROLE_OFF) {
> > +             continue;
> > +        }
> > +        id_pool_free_id(lcore_id_pool, lcore);
> > +    }
> > +    ovs_mutex_unlock(&lcore_id_pool_mutex);
> > +
> >      /* We are called from the main thread here */
> >      RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
> >
> > @@ -522,11 +544,48 @@ dpdk_available(void)
> >  }
> >
> >  void
> > -dpdk_set_lcore_id(unsigned cpu)
> > +dpdk_init_thread_context(unsigned cpu)
> >  {
> > +    cpu_set_t cpuset;
> > +    unsigned lcore;
> > +    int err;
> > +
> >      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
> >      ovs_assert(cpu != NON_PMD_CORE_ID);
> > -    RTE_PER_LCORE(_lcore_id) = cpu;
> > +
> > +    ovs_mutex_lock(&lcore_id_pool_mutex);
> > +    if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) {
> > +        lcore = NON_PMD_CORE_ID;
>
> If PMD thread will get NON_PMD_CORE_ID it will smash the non-PMD
> mempool caches since it doesn't take a 'non_pmd_mutex'.

OK, I need to look at this again; I missed something.
I don't understand how you would create PMD threads with cpu !=
NON_PMD_CORE_ID without DPDK enabled but still call the mempool code.


--
David Marchand
Flavio Leitner Dec. 4, 2019, 7:52 p.m. UTC | #3
On Mon, Dec 02, 2019 at 05:03:30PM +0100, David Marchand wrote:
> Most DPDK components make the assumption that rte_lcore_id() returns
> a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of
> the LCORE_ID_ANY special value).
> 
> OVS does not currently check which value is set in
> RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK
> side.
> 
> Introduce a lcore allocator in OVS for PMD threads and map them to
> unused lcores from DPDK à la --lcores.
> 
> The physical cores on which the PMD threads are running still
> constitutes an important information when debugging, so still keep
> those in the PMD thread names but add a new debug log when starting
> them.
> 
> Synchronize DPDK internals on numa and cpuset for the PMD threads by
> registering them via the rte_thread_set_affinity() helper.
> 
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---

I liked the idea because maybe we could lower RTE_MAX_LCORE and save
some memory. It's unusual to require the default amount anyway.

I changed RTE_MAX_LCORE to 4, and ran some tests with CPU 8:
2019-12-04T18:27:22.797Z|00001|dpdk(pmd-c08/id:3)|INFO|Initialised lcore 1 for core 8

I didn't find any issues with either the patch or with the tests.

Acked-by: Flavio Leitner <fbl@sysclose.org>

Thanks
fbl
David Marchand Dec. 4, 2019, 8:09 p.m. UTC | #4
On Wed, Dec 4, 2019 at 8:52 PM Flavio Leitner <fbl@sysclose.org> wrote:
>
> On Mon, Dec 02, 2019 at 05:03:30PM +0100, David Marchand wrote:
> > Most DPDK components make the assumption that rte_lcore_id() returns
> > a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of
> > the LCORE_ID_ANY special value).
> >
> > OVS does not currently check which value is set in
> > RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK
> > side.
> >
> > Introduce a lcore allocator in OVS for PMD threads and map them to
> > unused lcores from DPDK à la --lcores.
> >
> > The physical cores on which the PMD threads are running still
> > constitutes an important information when debugging, so still keep
> > those in the PMD thread names but add a new debug log when starting
> > them.
> >
> > Synchronize DPDK internals on numa and cpuset for the PMD threads by
> > registering them via the rte_thread_set_affinity() helper.
> >
> > Signed-off-by: David Marchand <david.marchand@redhat.com>
> > ---
>
> I liked the idea because maybe we could lower RTE_MAX_LCORE and save
> some memory. It's unusual to require the default amount anyways.

About decreasing RTE_MAX_LCORE, I had that in mind, but the problem is
existing deployments with -c/-l in the ovsdb.
We could try to parse those, reformat them, and only pass --lcores to DPDK.
It still feels a bit dangerous.
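For illustration only, a sketch of that reformatting using the existing
svec/dynamic-string helpers ('cpus'/'n_cpus' are assumed to come from
parsing the legacy -c/-l value), so that e.g. "-l 8,10" would turn into
"--lcores 0@8,1@10":

    struct ds lcores = DS_EMPTY_INITIALIZER;

    /* Map each requested physical CPU to a low lcore id. */
    for (int i = 0; i < n_cpus; i++) {
        ds_put_format(&lcores, "%s%d@%d", i ? "," : "", i, cpus[i]);
    }
    svec_add(&args, "--lcores");
    svec_add_nocopy(&args, ds_steal_cstr(&lcores));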


>
> I changed RTE_MAX_LCORE to 4, and ran some tests with CPU 8:
> 2019-12-04T18:27:22.797Z|00001|dpdk(pmd-c08/id:3)|INFO|Initialised lcore 1 for core 8
>
> I didn't find any issues with either the patch or with the tests.
>
> Acked-by: Flavio Leitner <fbl@sysclose.org>

Thanks for the review and test!
Flavio Leitner Dec. 4, 2019, 8:21 p.m. UTC | #5
On Wed, Dec 04, 2019 at 09:09:42PM +0100, David Marchand wrote:
> On Wed, Dec 4, 2019 at 8:52 PM Flavio Leitner <fbl@sysclose.org> wrote:
> >
> > On Mon, Dec 02, 2019 at 05:03:30PM +0100, David Marchand wrote:
> > > Most DPDK components make the assumption that rte_lcore_id() returns
> > > a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of
> > > the LCORE_ID_ANY special value).
> > >
> > > OVS does not currently check which value is set in
> > > RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK
> > > side.
> > >
> > > Introduce a lcore allocator in OVS for PMD threads and map them to
> > > unused lcores from DPDK à la --lcores.
> > >
> > > The physical cores on which the PMD threads are running still
> > > constitutes an important information when debugging, so still keep
> > > those in the PMD thread names but add a new debug log when starting
> > > them.
> > >
> > > Synchronize DPDK internals on numa and cpuset for the PMD threads by
> > > registering them via the rte_thread_set_affinity() helper.
> > >
> > > Signed-off-by: David Marchand <david.marchand@redhat.com>
> > > ---
> >
> > I liked the idea because maybe we could lower RTE_MAX_LCORE and save
> > some memory. It's unusual to require the default amount anyways.
> 
> About decreasing RTE_MAX_LCORE, I had that in mind, but the problem is
> existing deployments with -c/-l in ovsdb.
> We could try to parse this, reformat and only pass --lcores to dpdk.
> Still feels a bit dangerous.

Always the legacy stuff... :)

> > I changed RTE_MAX_LCORE to 4, and ran some tests with CPU 8:
> > 2019-12-04T18:27:22.797Z|00001|dpdk(pmd-c08/id:3)|INFO|Initialised lcore 1 for core 8
> >
> > I didn't find any issues with either the patch or with the tests.
> >
> > Acked-by: Flavio Leitner <fbl@sysclose.org>
> 
> Thanks for the review and test!

BTW, it would be nice to have an ovs-appctl command to dump a list
of lcore IDs and CPUs in case the logs are rotated, the config is
changed, and whatnot. For instance, sos could get an accurate report
by dumping such a list.
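A minimal sketch of what such a command could look like, assuming the
allocator also records which physical core each lcore id was handed to
(the 'lcore_to_core[]' array below is hypothetical; the current patch
only keeps an id-pool):

    static void
    dpdk_lcore_list(struct unixctl_conn *conn, int argc OVS_UNUSED,
                    const char *argv[] OVS_UNUSED, void *aux OVS_UNUSED)
    {
        struct ds reply = DS_EMPTY_INITIALIZER;

        ovs_mutex_lock(&lcore_id_pool_mutex);
        for (int lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
            if (lcore_to_core[lcore] != -1) {
                ds_put_format(&reply, "lcore %d: core %d\n",
                              lcore, lcore_to_core[lcore]);
            }
        }
        ovs_mutex_unlock(&lcore_id_pool_mutex);

        unixctl_command_reply(conn, ds_cstr(&reply));
        ds_destroy(&reply);
    }

    /* Registered once from dpdk_init__(), e.g.:
     * unixctl_command_register("dpdk/lcore-list", "", 0, 0,
     *                          dpdk_lcore_list, NULL); */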
David Marchand Dec. 4, 2019, 8:34 p.m. UTC | #6
On Wed, Dec 4, 2019 at 9:21 PM Flavio Leitner <fbl@sysclose.org> wrote:
> BTW, it would be nice to have an ovs-appctl command to dump a list
> of IDs and CPUs in case the logs are rotated, config changed and
> whatnot. For instance, sos could get an accurate report dumping a
> list.

Good idea.
I will look at this once I get to the other point raised by Ilya.

Thanks.


--
David Marchand
Ilya Maximets Dec. 4, 2019, 9:22 p.m. UTC | #7
On 03.12.2019 14:00, David Marchand wrote:
> On Tue, Dec 3, 2019 at 12:34 PM Ilya Maximets <i.maximets@ovn.org> wrote:
>>
>> On 02.12.2019 17:03, David Marchand wrote:
>>> Most DPDK components make the assumption that rte_lcore_id() returns
>>> a valid lcore_id in [0..RTE_MAX_LCORE] range (with the exception of
>>> the LCORE_ID_ANY special value).
>>
>> Do you think that makes a practical sense?  In a real virtualization
>> environments users are usually using cpus with lower numbers and it's
>> most likely possible to change NUMA layout to have CPUs from all the
>> nodes enumerated from the low CPU numbers.
> 
> That is if you can reconfigure the NUMA layout.
> How do you achieve this?
> Bios/firmware configuration?

It's usually a BIOS configuration.  Modern servers provide configuration
options for 2-4 types of core enumeration, along with the choice of NUMA
topology.

> Kernel boot options (did not find)?
> 
> I imagine it would be a pain to enforce this on a fleet of servers
> with different hw specs.

In my experience, you can almost never just take a new server and run
your virtualization software.  You'll have to enable some HW capabilities
or enable a "Highly Optimized Virtualization Configuration Set" in the
BIOS anyway.


> 
> 
>> It's uncommon also to use all the CPUs for just OVS on a big system
>> with huge number of cores.
> 
> Yes, obviously, you don't want to use all your cores for OVS :-).
> 
> 
>>
>> Another thing is can we really do this on a DPDK level?  Maybe it'll
>> be a step to dynamic registering/unregistering of non-EAL threads?
> 
> I'd like to ultimately get to a dynamic register mechanism.
> But I first wanted to fix the existing code...
> 
> 
>> Since you're wiping off the meaning of a lcore as a CPU core, it
>> becomes just a thread_id in a DPDK point of view.  So, maybe application
>> could call a new API function instead of managing these strange
>> mappings by itself?
> 
> ... as it is unlikely we will backport an experimental API to previous
> dpdk versions.
> 
> 
>>
>> One comment inline (not a full review).
>>
>> Best regards, Ilya Maximets.
>>
>>>
>>> OVS does not currently check which value is set in
>>> RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK
>>> side.
>>>
>>> Introduce a lcore allocator in OVS for PMD threads and map them to
>>> unused lcores from DPDK à la --lcores.
>>>
>>> The physical cores on which the PMD threads are running still
>>> constitutes an important information when debugging, so still keep
>>> those in the PMD thread names but add a new debug log when starting
>>> them.
>>>
>>> Synchronize DPDK internals on numa and cpuset for the PMD threads by
>>> registering them via the rte_thread_set_affinity() helper.
>>>
>>> Signed-off-by: David Marchand <david.marchand@redhat.com>
>>> ---
>>>  lib/dpdk-stub.c   |  8 +++++-
>>>  lib/dpdk.c        | 69 +++++++++++++++++++++++++++++++++++++++++++----
>>>  lib/dpdk.h        |  3 ++-
>>>  lib/dpif-netdev.c |  3 ++-
>>>  4 files changed, 75 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
>>> index c332c217c..90473bc8e 100644
>>> --- a/lib/dpdk-stub.c
>>> +++ b/lib/dpdk-stub.c
>>> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
>>>  }
>>>
>>>  void
>>> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
>>> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
>>> +{
>>> +    /* Nothing */
>>> +}
>>> +
>>> +void
>>> +dpdk_uninit_thread_context(void)
>>>  {
>>>      /* Nothing */
>>>  }
>>> diff --git a/lib/dpdk.c b/lib/dpdk.c
>>> index 21dd47e80..771baa413 100644
>>> --- a/lib/dpdk.c
>>> +++ b/lib/dpdk.c
>>> @@ -33,6 +33,7 @@
>>>
>>>  #include "dirs.h"
>>>  #include "fatal-signal.h"
>>> +#include "id-pool.h"
>>>  #include "netdev-dpdk.h"
>>>  #include "netdev-offload-provider.h"
>>>  #include "openvswitch/dynamic-string.h"
>>> @@ -55,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
>>>                                         * of DPDK. */
>>>  static bool per_port_memory = false; /* Status of per port memory support */
>>>
>>> +static struct id_pool *lcore_id_pool;

OVS_GUARDED_BY() would be nice.
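I.e., something like this (sketch; the mutex would need to be declared
first so that the annotation can reference it):

    static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER;
    static struct id_pool *lcore_id_pool OVS_GUARDED_BY(lcore_id_pool_mutex);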

>>> +static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER;
>>> +
>>>  static int
>>>  process_vhost_flags(char *flag, const char *default_val, int size,
>>>                      const struct smap *ovs_other_config,
>>> @@ -346,7 +350,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>>>          }
>>>      }
>>>
>>> -    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
>>> +    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
>>> +        args_contains(&args, "--lcores")) {
>>>          auto_determine = false;
>>>      }
>>>
>>> @@ -372,8 +377,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>>>               * thread affintity - default to core #0 */
>>>              VLOG_ERR("Thread getaffinity failed. Using core #0");
>>>          }
>>> -        svec_add(&args, "-l");
>>> -        svec_add_nocopy(&args, xasprintf("%d", cpu));
>>> +        svec_add(&args, "--lcores");
>>> +        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
>>>      }
>>>
>>>      svec_terminate(&args);
>>> @@ -429,6 +434,23 @@ dpdk_init__(const struct smap *ovs_other_config)
>>>          }
>>>      }
>>>
>>> +    ovs_mutex_lock(&lcore_id_pool_mutex);
>>> +    lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE);
>>> +    /* Empty the whole pool... */
>>> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>>> +        uint32_t lcore_id;
>>> +
>>> +        id_pool_alloc_id(lcore_id_pool, &lcore_id);
>>> +    }
>>> +    /* ...and release the unused spots. */
>>> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>>> +        if (rte_eal_lcore_role(lcore) != ROLE_OFF) {
>>> +             continue;
>>> +        }
>>> +        id_pool_free_id(lcore_id_pool, lcore);
>>> +    }
>>> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
>>> +
>>>      /* We are called from the main thread here */
>>>      RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
>>>
>>> @@ -522,11 +544,48 @@ dpdk_available(void)
>>>  }
>>>
>>>  void
>>> -dpdk_set_lcore_id(unsigned cpu)
>>> +dpdk_init_thread_context(unsigned cpu)
>>>  {
>>> +    cpu_set_t cpuset;

I think it should be rte_cpuset_t, for portability and also to follow
the DPDK API.

>>> +    unsigned lcore;
>>> +    int err;
>>> +
>>>      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>>>      ovs_assert(cpu != NON_PMD_CORE_ID);
>>> -    RTE_PER_LCORE(_lcore_id) = cpu;
>>> +
>>> +    ovs_mutex_lock(&lcore_id_pool_mutex);
>>> +    if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) {
>>> +        lcore = NON_PMD_CORE_ID;
>>
>> If PMD thread will get NON_PMD_CORE_ID it will smash the non-PMD
>> mempool caches since it doesn't take a 'non_pmd_mutex'.
> 
> Ok, need to look at this again.
> I missed something.
> I don't understand how you would create PMD threads with cpu !=
> NON_PMD_CORE_ID without dpdk enabled but still call the mempool code.

I meant the case where you try to create more PMD threads than
RTE_MAX_LCORE.  For example, if RTE_MAX_LCORE == 32 and you want to
create 40 PMD threads (assuming your system has 64 CPUs), 8 PMD threads
and the non-PMD thread(s) will have lcore == NON_PMD_CORE_ID ==
LCORE_ID_ANY.  I re-checked and this will not cause sharing of the same
mempool caches, because threads with LCORE_ID_ANY don't use them;
however, I think that simultaneous calls into the DPDK API might still
be somehow dangerous.  In any case, performance will suffer.  At the
very least, we should warn about that.
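E.g., a sketch of such a warning in dpdk_init_thread_context():

    if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) {
        lcore = NON_PMD_CORE_ID;
        /* Pool exhausted (or DPDK not initialised): the PMD thread falls
         * back to LCORE_ID_ANY. */
        VLOG_WARN("Out of DPDK lcore ids: pmd thread for core %u will run "
                  "unregistered, expect reduced performance.", cpu);
    }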

> +    }
> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
> +
> +    RTE_PER_LCORE(_lcore_id) = lcore;
> +
> +    /* DPDK is not initialised, nothing more to do. */
> +    if (lcore == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    CPU_ZERO(&cpuset);
> +    err = pthread_getaffinity_np(pthread_self(), sizeof cpuset, &cpuset);

We already know the core, so there is no need for an additional syscall.
We could just set it with CPU_SET().  This will also help with
portability in the future.
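I.e., roughly this (a sketch combining it with the rte_cpuset_t remark
above):

    rte_cpuset_t cpuset;

    /* Build the cpuset from the already-known 'cpu' instead of querying
     * the thread's affinity. */
    CPU_ZERO(&cpuset);
    CPU_SET(cpu, &cpuset);
    rte_thread_set_affinity(&cpuset);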

> +    if (err) {
> +        VLOG_ABORT("Thread getaffinity error: %s", ovs_strerror(err));
> +    }
> +
> +    rte_thread_set_affinity(&cpuset);
> +    VLOG_INFO("Initialised lcore %u for core %u", lcore, cpu);
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
> +{
> +    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    ovs_mutex_lock(&lcore_id_pool_mutex);
> +    id_pool_free_id(lcore_id_pool, RTE_PER_LCORE(_lcore_id));
> +    ovs_mutex_unlock(&lcore_id_pool_mutex);
>  }
>  
>  void
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 736a64279..404ac1a4b 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -36,7 +36,8 @@ struct smap;
>  struct ovsrec_open_vswitch;
>  
>  void dpdk_init(const struct smap *ovs_other_config);
> -void dpdk_set_lcore_id(unsigned cpu);
> +void dpdk_init_thread_context(unsigned cpu);
> +void dpdk_uninit_thread_context(void);
>  const char *dpdk_get_vhost_sock_dir(void);
>  bool dpdk_vhost_iommu_enabled(void);
>  bool dpdk_vhost_postcopy_enabled(void);
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 5142bad1d..c40031a78 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5472,7 +5472,7 @@ pmd_thread_main(void *f_)
>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>      ovs_numa_thread_setaffinity_core(pmd->core_id);
> -    dpdk_set_lcore_id(pmd->core_id);
> +    dpdk_init_thread_context(pmd->core_id);
>      poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>      dfc_cache_init(&pmd->flow_cache);
>      pmd_alloc_static_tx_qid(pmd);
> @@ -5592,6 +5592,7 @@ reload:
>      dfc_cache_uninit(&pmd->flow_cache);
>      free(poll_list);
>      pmd_free_cached_ports(pmd);
> +    dpdk_uninit_thread_context();
>      return NULL;
>  }

Patch

diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
index c332c217c..90473bc8e 100644
--- a/lib/dpdk-stub.c
+++ b/lib/dpdk-stub.c
@@ -39,7 +39,13 @@  dpdk_init(const struct smap *ovs_other_config)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
+dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
+{
+    /* Nothing */
+}
+
+void
+dpdk_uninit_thread_context(void)
 {
     /* Nothing */
 }
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 21dd47e80..771baa413 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -33,6 +33,7 @@ 
 
 #include "dirs.h"
 #include "fatal-signal.h"
+#include "id-pool.h"
 #include "netdev-dpdk.h"
 #include "netdev-offload-provider.h"
 #include "openvswitch/dynamic-string.h"
@@ -55,6 +56,9 @@  static bool dpdk_initialized = false; /* Indicates successful initialization
                                        * of DPDK. */
 static bool per_port_memory = false; /* Status of per port memory support */
 
+static struct id_pool *lcore_id_pool;
+static struct ovs_mutex lcore_id_pool_mutex = OVS_MUTEX_INITIALIZER;
+
 static int
 process_vhost_flags(char *flag, const char *default_val, int size,
                     const struct smap *ovs_other_config,
@@ -346,7 +350,8 @@  dpdk_init__(const struct smap *ovs_other_config)
         }
     }
 
-    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
+    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
+        args_contains(&args, "--lcores")) {
         auto_determine = false;
     }
 
@@ -372,8 +377,8 @@  dpdk_init__(const struct smap *ovs_other_config)
              * thread affintity - default to core #0 */
             VLOG_ERR("Thread getaffinity failed. Using core #0");
         }
-        svec_add(&args, "-l");
-        svec_add_nocopy(&args, xasprintf("%d", cpu));
+        svec_add(&args, "--lcores");
+        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
     }
 
     svec_terminate(&args);
@@ -429,6 +434,23 @@  dpdk_init__(const struct smap *ovs_other_config)
         }
     }
 
+    ovs_mutex_lock(&lcore_id_pool_mutex);
+    lcore_id_pool = id_pool_create(0, RTE_MAX_LCORE);
+    /* Empty the whole pool... */
+    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+        uint32_t lcore_id;
+
+        id_pool_alloc_id(lcore_id_pool, &lcore_id);
+    }
+    /* ...and release the unused spots. */
+    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+        if (rte_eal_lcore_role(lcore) != ROLE_OFF) {
+             continue;
+        }
+        id_pool_free_id(lcore_id_pool, lcore);
+    }
+    ovs_mutex_unlock(&lcore_id_pool_mutex);
+
     /* We are called from the main thread here */
     RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
 
@@ -522,11 +544,48 @@  dpdk_available(void)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu)
+dpdk_init_thread_context(unsigned cpu)
 {
+    cpu_set_t cpuset;
+    unsigned lcore;
+    int err;
+
     /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
     ovs_assert(cpu != NON_PMD_CORE_ID);
-    RTE_PER_LCORE(_lcore_id) = cpu;
+
+    ovs_mutex_lock(&lcore_id_pool_mutex);
+    if (lcore_id_pool == NULL || !id_pool_alloc_id(lcore_id_pool, &lcore)) {
+        lcore = NON_PMD_CORE_ID;
+    }
+    ovs_mutex_unlock(&lcore_id_pool_mutex);
+
+    RTE_PER_LCORE(_lcore_id) = lcore;
+
+    /* DPDK is not initialised, nothing more to do. */
+    if (lcore == NON_PMD_CORE_ID) {
+        return;
+    }
+
+    CPU_ZERO(&cpuset);
+    err = pthread_getaffinity_np(pthread_self(), sizeof cpuset, &cpuset);
+    if (err) {
+        VLOG_ABORT("Thread getaffinity error: %s", ovs_strerror(err));
+    }
+
+    rte_thread_set_affinity(&cpuset);
+    VLOG_INFO("Initialised lcore %u for core %u", lcore, cpu);
+}
+
+void
+dpdk_uninit_thread_context(void)
+{
+    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
+        return;
+    }
+
+    ovs_mutex_lock(&lcore_id_pool_mutex);
+    id_pool_free_id(lcore_id_pool, RTE_PER_LCORE(_lcore_id));
+    ovs_mutex_unlock(&lcore_id_pool_mutex);
 }
 
 void
diff --git a/lib/dpdk.h b/lib/dpdk.h
index 736a64279..404ac1a4b 100644
--- a/lib/dpdk.h
+++ b/lib/dpdk.h
@@ -36,7 +36,8 @@  struct smap;
 struct ovsrec_open_vswitch;
 
 void dpdk_init(const struct smap *ovs_other_config);
-void dpdk_set_lcore_id(unsigned cpu);
+void dpdk_init_thread_context(unsigned cpu);
+void dpdk_uninit_thread_context(void);
 const char *dpdk_get_vhost_sock_dir(void);
 bool dpdk_vhost_iommu_enabled(void);
 bool dpdk_vhost_postcopy_enabled(void);
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 5142bad1d..c40031a78 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5472,7 +5472,7 @@  pmd_thread_main(void *f_)
     /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
     ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
     ovs_numa_thread_setaffinity_core(pmd->core_id);
-    dpdk_set_lcore_id(pmd->core_id);
+    dpdk_init_thread_context(pmd->core_id);
     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
     dfc_cache_init(&pmd->flow_cache);
     pmd_alloc_static_tx_qid(pmd);
@@ -5592,6 +5592,7 @@  reload:
     dfc_cache_uninit(&pmd->flow_cache);
     free(poll_list);
     pmd_free_cached_ports(pmd);
+    dpdk_uninit_thread_context();
     return NULL;
 }