[ovs-dev,v2] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE.

Message ID 20191210101012.8824-1-david.marchand@redhat.com
State Superseded
Series [ovs-dev,v2] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE.

Commit Message

David Marchand Dec. 10, 2019, 10:10 a.m. UTC
Most DPDK components make the assumption that rte_lcore_id() returns
a valid lcore_id in the [0..RTE_MAX_LCORE[ range (with the exception
of the LCORE_ID_ANY special value).

OVS does not currently check which value is set in
RTE_PER_LCORE(_lcore_id), which exposes us to potential crashes on the
DPDK side.

Introduce an lcore allocator in OVS for PMD threads and map them to
unused lcores from DPDK à la --lcores.

The physical cores on which the PMD threads are running still
constitute important information when debugging, so keep those in the
PMD thread names but add a new debug log when starting them.
Mapping between OVS threads and DPDK lcores can be dumped with a new
dpdk/dump-lcores command.
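
Example output (hypothetical configuration; the exact cores shown
depend on the dpdk-lcore-mask and pmd-cpu-mask in use):

    # ovs-appctl dpdk/dump-lcores
    lcore0 (DPDK) is running on core 0
    lcore1 (OVS) is running on core 1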

Synchronize DPDK internals on numa and cpuset for the PMD threads by
registering them via the rte_thread_set_affinity() helper.
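
For context, the crash risk comes from DPDK's pervasive use of
per-lcore arrays sized by RTE_MAX_LCORE. A minimal sketch of the
pattern (illustrative only, not DPDK's actual code) that goes out of
bounds when rte_lcore_id() returns a raw cpu id >= RTE_MAX_LCORE:

    /* Illustrative pattern, not actual DPDK code. */
    struct per_lcore_state {
        uint64_t counters[8];
    };
    static struct per_lcore_state state[RTE_MAX_LCORE];

    static inline struct per_lcore_state *
    my_lcore_state(void)
    {
        /* Out-of-bounds read/write if _lcore_id holds a raw cpu id
         * such as 130 while RTE_MAX_LCORE is 128. */
        return &state[rte_lcore_id()];
    }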

Signed-off-by: David Marchand <david.marchand@redhat.com>
---
Changelog since v1:
- rewired existing configuration 'dpdk-lcore-mask' to use --lcores,
- switched to a bitmap to track lcores,
- added a command to dump current mapping (Flavio): used an experimental
  API to get DPDK lcores cpuset since it is the most reliable/portable
  information,
- used the same code for the logs when starting DPDK/PMD threads,
- addressed Ilya's comments.

---
 lib/dpdk-stub.c   |   8 ++-
 lib/dpdk.c        | 163 ++++++++++++++++++++++++++++++++++++++++++++--
 lib/dpdk.h        |   5 +-
 lib/dpif-netdev.c |   3 +-
 4 files changed, 170 insertions(+), 9 deletions(-)

Comments

Ilya Maximets Dec. 13, 2019, 5:34 p.m. UTC | #1
On 10.12.2019 11:10, David Marchand wrote:
> Most DPDK components make the assumption that rte_lcore_id() returns
> a valid lcore_id in the [0..RTE_MAX_LCORE[ range (with the exception
> of the LCORE_ID_ANY special value).
> 
> OVS does not currently check which value is set in
> RTE_PER_LCORE(_lcore_id), which exposes us to potential crashes on the
> DPDK side.
> 
> Introduce an lcore allocator in OVS for PMD threads and map them to
> unused lcores from DPDK à la --lcores.
> 
> The physical cores on which the PMD threads are running still
> constitute important information when debugging, so keep those in the
> PMD thread names but add a new debug log when starting them.
> Mapping between OVS threads and DPDK lcores can be dumped with a new
> dpdk/dump-lcores command.
> 
> Synchronize DPDK internals on numa and cpuset for the PMD threads by
> registering them via the rte_thread_set_affinity() helper.
> 
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---
> Changelog since v1:
> - rewired existing configuration 'dpdk-lcore-mask' to use --lcores,
> - switched to a bitmap to track lcores,
> - added a command to dump current mapping (Flavio): used an experimental
>   API to get DPDK lcores cpuset since it is the most reliable/portable
>   information,
> - used the same code for the logs when starting DPDK/PMD threads,
> - addressed Ilya's comments.
> 

This version looks scary.
Some comments inline.

> ---
>  lib/dpdk-stub.c   |   8 ++-
>  lib/dpdk.c        | 163 ++++++++++++++++++++++++++++++++++++++++++++--
>  lib/dpdk.h        |   5 +-
>  lib/dpif-netdev.c |   3 +-
>  4 files changed, 170 insertions(+), 9 deletions(-)
> 
> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
> index c332c217c..90473bc8e 100644
> --- a/lib/dpdk-stub.c
> +++ b/lib/dpdk-stub.c
> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
> +{
> +    /* Nothing */
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
>  {
>      /* Nothing */
>  }
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 37ea2973c..0173366a0 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -30,6 +30,7 @@
>  #include <rte_pdump.h>
>  #endif
>  
> +#include "bitmap.h"
>  #include "dirs.h"
>  #include "fatal-signal.h"
>  #include "netdev-dpdk.h"
> @@ -39,6 +40,7 @@
>  #include "ovs-numa.h"
>  #include "smap.h"
>  #include "svec.h"
> +#include "unixctl.h"
>  #include "util.h"
>  #include "vswitch-idl.h"
>  
> @@ -54,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
>                                         * of DPDK. */
>  static bool per_port_memory = false; /* Status of per port memory support */
>  
> +static struct ovs_mutex lcore_bitmap_mutex = OVS_MUTEX_INITIALIZER;
> +static unsigned long *lcore_bitmap OVS_GUARDED_BY(lcore_bitmap_mutex);
> +
>  static int
>  process_vhost_flags(char *flag, const char *default_val, int size,
>                      const struct smap *ovs_other_config,
> @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
>      return false;
>  }
>  
> +static void
> +construct_dpdk_lcore_option(const struct smap *ovs_other_config,
> +                            struct svec *args)
> +{
> +    const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
> +    struct svec lcores = SVEC_EMPTY_INITIALIZER;
> +    struct ovs_numa_info_core *core;
> +    struct ovs_numa_dump *cores;
> +    int index = 0;
> +
> +    if (!cmask) {
> +        return;
> +    }
> +    if (args_contains(args, "-c") || args_contains(args, "-l") ||
> +        args_contains(args, "--lcores")) {
> +                VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' "
> +                          "due to dpdk-extra config");
> +        return;
> +    }
> +
> +    cores = ovs_numa_dump_cores_with_cmask(cmask);
> +    FOR_EACH_CORE_ON_DUMP(core, cores) {
> +        svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));

What's the point of re-mapping these cores?
Just let DPDK fail initialization and the user will adjust the cores.
This automatic re-mapping only complicates everything.

IMHO, the user should not configure DPDK cores at all.  So, it's the
user's problem if the cores are configured incorrectly.

> +        index++;
> +    }
> +    svec_terminate(&lcores);
> +    ovs_numa_dump_destroy(cores);
> +    svec_add(args, "--lcores");
> +    svec_add_nocopy(args, svec_join(&lcores, ",", ""));
> +    svec_destroy(&lcores);
> +}
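
(For illustration: with a hypothetical dpdk-lcore-mask=0x5, i.e. cores
0 and 2, the code above would emit an EAL argument along the lines of:

    --lcores 0@0,1@2

modulo the FOR_EACH_CORE_ON_DUMP iteration order, which is not
guaranteed to be sorted.)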
> +
>  static void
>  construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
>  {
> @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
>          bool default_enabled;
>          const char *default_value;
>      } opts[] = {
> -        {"dpdk-lcore-mask",   "-c",             false, NULL},
>          {"dpdk-hugepage-dir", "--huge-dir",     false, NULL},
>          {"dpdk-socket-limit", "--socket-limit", false, NULL},
>      };
> @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args)
>          svec_parse_words(args, extra_configuration);
>      }
>  
> +    construct_dpdk_lcore_option(ovs_other_config, args);
>      construct_dpdk_options(ovs_other_config, args);
>      construct_dpdk_mutex_options(ovs_other_config, args);
>  }
> @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
>      .write = dpdk_log_write,
>  };
>  
> +static void
> +dpdk_dump_lcore(struct ds *ds, unsigned lcore)
> +{
> +    struct svec cores = SVEC_EMPTY_INITIALIZER;
> +    rte_cpuset_t cpuset;
> +    unsigned cpu;
> +
> +    cpuset = rte_lcore_cpuset(lcore);
> +    for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
> +        if (!CPU_ISSET(cpu, &cpuset)) {
> +            continue;
> +        }
> +        svec_add_nocopy(&cores, xasprintf("%u", cpu));
> +    }
> +    svec_terminate(&cores);
> +    ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
> +                  rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
> +                  svec_join(&cores, ",", ""));
> +    svec_destroy(&cores);
> +}
> +
> +static void
> +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
> +                 void *aux OVS_UNUSED)
> +{
> +    struct ds ds = DS_EMPTY_INITIALIZER;
> +    unsigned lcore;
> +
> +    ovs_mutex_lock(&lcore_bitmap_mutex);
> +    if (lcore_bitmap == NULL) {
> +        unixctl_command_reply_error(conn, "DPDK has not been initialised");
> +        goto out;
> +    }
> +    if (argc > 1) {
> +        if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
> +            !bitmap_is_set(lcore_bitmap, lcore)) {
> +            unixctl_command_reply_error(conn, "incorrect lcoreid");
> +            goto out;
> +        }
> +        dpdk_dump_lcore(&ds, lcore);
> +    } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {

Not a good style to have 'else for'.

DPDK lcore iterator?
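
(I.e., something like this sketch, using the RTE_LCORE_FOREACH macro
from rte_lcore.h:)

    /* Sketch: only visit the lcores that the EAL knows about. */
    unsigned iter_lcore;

    RTE_LCORE_FOREACH(iter_lcore) {
        dpdk_dump_lcore(&ds, iter_lcore);
    }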

> +        if (!bitmap_is_set(lcore_bitmap, lcore)) {
> +            continue;
> +        }
> +        dpdk_dump_lcore(&ds, lcore);
> +    }
> +    unixctl_command_reply(conn, ds_cstr(&ds));
> +    ds_destroy(&ds);
> +out:
> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
> +}
> +
>  static bool
>  dpdk_init__(const struct smap *ovs_other_config)
>  {
> @@ -345,7 +434,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>          }
>      }
>  
> -    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
> +    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
> +        args_contains(&args, "--lcores")) {
>          auto_determine = false;
>      }
>  
> @@ -371,8 +461,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>               * thread affintity - default to core #0 */
>              VLOG_ERR("Thread getaffinity failed. Using core #0");
>          }
> -        svec_add(&args, "-l");
> -        svec_add_nocopy(&args, xasprintf("%d", cpu));
> +        svec_add(&args, "--lcores");
> +        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
>      }
>  
>      svec_terminate(&args);
> @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config)
>          }
>      }
>  
> +    ovs_mutex_lock(&lcore_bitmap_mutex);
> +    lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE);
> +    /* Mark DPDK threads. */
> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {

Shouldn't we use DPDK lcore iterator instead?

> +        struct ds ds = DS_EMPTY_INITIALIZER;
> +
> +        if (rte_eal_lcore_role(lcore) == ROLE_OFF) {
> +             continue;
> +        }
> +        bitmap_set1(lcore_bitmap, lcore);
> +        dpdk_dump_lcore(&ds, lcore);
> +        VLOG_INFO("%s", ds_cstr(&ds));
> +        ds_destroy(&ds);
> +    }
> +    unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1,
> +                             dpdk_dump_lcores, NULL);

We might need this command if we're doing automatic re-mapping
of lcores from the EAL arguments, but if not, I don't think we need it.
Simple logging on PMD start should be enough.

> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
> +
>      /* We are called from the main thread here */
>      RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
>  
> @@ -512,11 +620,54 @@ dpdk_available(void)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu)
> +dpdk_init_thread_context(unsigned cpu)
>  {
> +    struct ds ds = DS_EMPTY_INITIALIZER;
> +    rte_cpuset_t cpuset;
> +    unsigned lcore;
> +
>      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>      ovs_assert(cpu != NON_PMD_CORE_ID);
> -    RTE_PER_LCORE(_lcore_id) = cpu;
> +
> +    ovs_mutex_lock(&lcore_bitmap_mutex);
> +    if (lcore_bitmap == NULL) {
> +        lcore = NON_PMD_CORE_ID;
> +    } else {
> +        lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE);
> +        if (lcore == RTE_MAX_LCORE) {
> +            VLOG_WARN("Reached maximum number of DPDK lcores, core %u will "
> +                      "have lower performance", cpu);
> +            lcore = NON_PMD_CORE_ID;
> +        } else {
> +            bitmap_set1(lcore_bitmap, lcore);
> +        }
> +    }
> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
> +
> +    RTE_PER_LCORE(_lcore_id) = lcore;
> +
> +    if (lcore == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    CPU_ZERO(&cpuset);
> +    CPU_SET(cpu, &cpuset);
> +    rte_thread_set_affinity(&cpuset);
> +    dpdk_dump_lcore(&ds, lcore);
> +    VLOG_INFO("%s", ds_cstr(&ds));
> +    ds_destroy(&ds);
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
> +{
> +    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    ovs_mutex_lock(&lcore_bitmap_mutex);
> +    bitmap_set0(lcore_bitmap, RTE_PER_LCORE(_lcore_id));
> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
>  }
>  
>  void
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 736a64279..eab7a926b 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -22,7 +22,9 @@
>  #ifdef DPDK_NETDEV
>  
>  #include <rte_config.h>
> +#define ALLOW_EXPERIMENTAL_API

Not for this patch, but for DPDK API:
I think, ALLOW_EXPERIMENTAL_API is too generic name for
external definition.  It should have some special prefix.

>  #include <rte_lcore.h>
> +#undef ALLOW_EXPERIMENTAL_API
>  
>  #define NON_PMD_CORE_ID LCORE_ID_ANY
>  
> @@ -36,7 +38,8 @@ struct smap;
>  struct ovsrec_open_vswitch;
>  
>  void dpdk_init(const struct smap *ovs_other_config);
> -void dpdk_set_lcore_id(unsigned cpu);
> +void dpdk_init_thread_context(unsigned cpu);
> +void dpdk_uninit_thread_context(void);
>  const char *dpdk_get_vhost_sock_dir(void);
>  bool dpdk_vhost_iommu_enabled(void);
>  bool dpdk_vhost_postcopy_enabled(void);
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 7ebf4ef87..91e98919c 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_)
>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>      ovs_numa_thread_setaffinity_core(pmd->core_id);
> -    dpdk_set_lcore_id(pmd->core_id);
> +    dpdk_init_thread_context(pmd->core_id);

Since this function will actually change cpu affinity, which is not
expected by OVS, it must be called before setting affinity by
ovs_numa_thread_setaffinity_core().  This way OVS will guarantee
correct affinity for its threads.
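
I.e., something like this sketch of the suggested order:

    /* Let DPDK register the thread and touch affinity first, then let
     * OVS set the affinity last so that it has the final say. */
    dpdk_init_thread_context(pmd->core_id);
    ovs_numa_thread_setaffinity_core(pmd->core_id);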

>      poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>      dfc_cache_init(&pmd->flow_cache);
>      pmd_alloc_static_tx_qid(pmd);
> @@ -5590,6 +5590,7 @@ reload:
>      dfc_cache_uninit(&pmd->flow_cache);
>      free(poll_list);
>      pmd_free_cached_ports(pmd);
> +    dpdk_uninit_thread_context();
>      return NULL;
>  }
>  
>
David Marchand Dec. 17, 2019, 9:22 a.m. UTC | #2
On Fri, Dec 13, 2019 at 6:34 PM Ilya Maximets <i.maximets@ovn.org> wrote:
> > @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
> >      return false;
> >  }
> >
> > +static void
> > +construct_dpdk_lcore_option(const struct smap *ovs_other_config,
> > +                            struct svec *args)
> > +{
> > +    const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
> > +    struct svec lcores = SVEC_EMPTY_INITIALIZER;
> > +    struct ovs_numa_info_core *core;
> > +    struct ovs_numa_dump *cores;
> > +    int index = 0;
> > +
> > +    if (!cmask) {
> > +        return;
> > +    }
> > +    if (args_contains(args, "-c") || args_contains(args, "-l") ||
> > +        args_contains(args, "--lcores")) {
> > +                VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' "
> > +                          "due to dpdk-extra config");
> > +        return;
> > +    }
> > +
> > +    cores = ovs_numa_dump_cores_with_cmask(cmask);
> > +    FOR_EACH_CORE_ON_DUMP(core, cores) {
> > +        svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
>
> What's the point of re-mapping these cores?
> Just let DPDK fail initialization and the user will adjust the cores.
> This automatic re-mapping only complicates everything.

Deal with existing users of this parameter.
Example: https://bugzilla.redhat.com/show_bug.cgi?id=1753432


>
> IMHO, the user should not configure DPDK cores at all.  So, it's the
> user's problem if the cores are configured incorrectly.

If the user had explicitly set a dpdk option, ok, this is his problem.
But here, this is an OVS option, so handling this internally makes more
sense to me.


> > +        index++;
> > +    }
> > +    svec_terminate(&lcores);
> > +    ovs_numa_dump_destroy(cores);
> > +    svec_add(args, "--lcores");
> > +    svec_add_nocopy(args, svec_join(&lcores, ",", ""));
> > +    svec_destroy(&lcores);
> > +}
> > +
> >  static void
> >  construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
> >  {
> > @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
> >          bool default_enabled;
> >          const char *default_value;
> >      } opts[] = {
> > -        {"dpdk-lcore-mask",   "-c",             false, NULL},
> >          {"dpdk-hugepage-dir", "--huge-dir",     false, NULL},
> >          {"dpdk-socket-limit", "--socket-limit", false, NULL},
> >      };
> > @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args)
> >          svec_parse_words(args, extra_configuration);
> >      }
> >
> > +    construct_dpdk_lcore_option(ovs_other_config, args);
> >      construct_dpdk_options(ovs_other_config, args);
> >      construct_dpdk_mutex_options(ovs_other_config, args);
> >  }
> > @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
> >      .write = dpdk_log_write,
> >  };
> >
> > +static void
> > +dpdk_dump_lcore(struct ds *ds, unsigned lcore)
> > +{
> > +    struct svec cores = SVEC_EMPTY_INITIALIZER;
> > +    rte_cpuset_t cpuset;
> > +    unsigned cpu;
> > +
> > +    cpuset = rte_lcore_cpuset(lcore);
> > +    for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
> > +        if (!CPU_ISSET(cpu, &cpuset)) {
> > +            continue;
> > +        }
> > +        svec_add_nocopy(&cores, xasprintf("%u", cpu));
> > +    }
> > +    svec_terminate(&cores);
> > +    ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
> > +                  rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
> > +                  svec_join(&cores, ",", ""));
> > +    svec_destroy(&cores);
> > +}
> > +
> > +static void
> > +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
> > +                 void *aux OVS_UNUSED)
> > +{
> > +    struct ds ds = DS_EMPTY_INITIALIZER;
> > +    unsigned lcore;
> > +
> > +    ovs_mutex_lock(&lcore_bitmap_mutex);
> > +    if (lcore_bitmap == NULL) {
> > +        unixctl_command_reply_error(conn, "DPDK has not been initialised");
> > +        goto out;
> > +    }
> > +    if (argc > 1) {
> > +        if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
> > +            !bitmap_is_set(lcore_bitmap, lcore)) {
> > +            unixctl_command_reply_error(conn, "incorrect lcoreid");
> > +            goto out;
> > +        }
> > +        dpdk_dump_lcore(&ds, lcore);
> > +    } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>
> Not a good style to have 'else for'.

Tried to be creative :-) (the additional indent is really unnecessary).
But, if this is a blocker for this patch, ok.


>
> DPDK lcore iterator?

We are iterating over lcores owned by DPDK and OVS.

Example:
other_config        : {dpdk-init="true", dpdk-lcore-mask="0x1",
pmd-cpu-mask="0x00008002"}

# ovs-appctl dpdk/dump-lcores
lcore0 (DPDK) is running on core 0
lcore1 (OVS) is running on core 1
lcore2 (OVS) is running on core 15


>
> > +        if (!bitmap_is_set(lcore_bitmap, lcore)) {
> > +            continue;
> > +        }
> > +        dpdk_dump_lcore(&ds, lcore);
> > +    }
> > +    unixctl_command_reply(conn, ds_cstr(&ds));
> > +    ds_destroy(&ds);
> > +out:
> > +    ovs_mutex_unlock(&lcore_bitmap_mutex);
> > +}
> > +
> >  static bool
> >  dpdk_init__(const struct smap *ovs_other_config)
> >  {


> > @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config)
> >          }
> >      }
> >
> > +    ovs_mutex_lock(&lcore_bitmap_mutex);
> > +    lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE);
> > +    /* Mark DPDK threads. */
> > +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>
> Shouldn't we use DPDK lcore iterator instead?

The DPDK lcore iterator skips "service" cores.
We would end up reusing those lcores if the user had set something exotic.

Example:
other_config        : {dpdk-extra="-c 0x5 -s 0x4", dpdk-init="true",
pmd-cpu-mask="0x00008002"}
# ovs-appctl dpdk/dump-lcores
lcore0 (DPDK) is running on core 0
lcore1 (OVS) is running on core 1
lcore2 (DPDK) is running on core 2
lcore3 (OVS) is running on core 15


>
> > +        struct ds ds = DS_EMPTY_INITIALIZER;
> > +
> > +        if (rte_eal_lcore_role(lcore) == ROLE_OFF) {
> > +             continue;
> > +        }
> > +        bitmap_set1(lcore_bitmap, lcore);
> > +        dpdk_dump_lcore(&ds, lcore);
> > +        VLOG_INFO("%s", ds_cstr(&ds));
> > +        ds_destroy(&ds);
> > +    }
> > +    unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1,
> > +                             dpdk_dump_lcores, NULL);
>
> We might need this command if we're doing automatic re-mapping
> of lcores from the EAL arguments, but if not, I don't think we need it.
> Simple logging on PMD start should be enough.

Flavio mentioned that the logs could have been rotated.


> > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> > index 7ebf4ef87..91e98919c 100644
> > --- a/lib/dpif-netdev.c
> > +++ b/lib/dpif-netdev.c
> > @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_)
> >      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
> >      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
> >      ovs_numa_thread_setaffinity_core(pmd->core_id);
> > -    dpdk_set_lcore_id(pmd->core_id);
> > +    dpdk_init_thread_context(pmd->core_id);
>
> Since this function will actually change cpu affinity, which is not

This is the same affinity as what OVS just applied.
The call to pthread_setaffinity_np in dpdk should be a no-op.


> expected by OVS, it must be called before setting affinity by
> ovs_numa_thread_setaffinity_core().  This way OVS will guarantee
> correct affinity for its threads.

ovs_numa_thread_setaffinity_core() return value is not checked.
And I can't see what calling this function changes in the ovs-numa internals.
We could even remove it and only rely on dpdk.


--
David Marchand
Ilya Maximets Dec. 17, 2019, 9:10 p.m. UTC | #3
On 17.12.2019 10:22, David Marchand wrote:
> On Fri, Dec 13, 2019 at 6:34 PM Ilya Maximets <i.maximets@ovn.org> wrote:
>>> @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
>>>      return false;
>>>  }
>>>
>>> +static void
>>> +construct_dpdk_lcore_option(const struct smap *ovs_other_config,
>>> +                            struct svec *args)
>>> +{
>>> +    const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
>>> +    struct svec lcores = SVEC_EMPTY_INITIALIZER;
>>> +    struct ovs_numa_info_core *core;
>>> +    struct ovs_numa_dump *cores;
>>> +    int index = 0;
>>> +
>>> +    if (!cmask) {
>>> +        return;
>>> +    }
>>> +    if (args_contains(args, "-c") || args_contains(args, "-l") ||
>>> +        args_contains(args, "--lcores")) {
>>> +                VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' "
>>> +                          "due to dpdk-extra config");
>>> +        return;
>>> +    }
>>> +
>>> +    cores = ovs_numa_dump_cores_with_cmask(cmask);
>>> +    FOR_EACH_CORE_ON_DUMP(core, cores) {
>>> +        svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
>>
>> What's the point of re-mapping these cores?
>> Just let DPDK fail initialization and the user will adjust the cores.
>> This automatic re-mapping only complicates everything.
> 
> Deal with existing user of this parameter.
> Example: https://bugzilla.redhat.com/show_bug.cgi?id=1753432

So, why not just rebuild DPDK with more than 128 cores in the config,
since you're going to update the OVS package for this issue anyway?
I bet the kernel is built with NR_CPUS equal to something like 8192.

<"support up to 128 threads" limitation> doesn't sound feasible
as you can't actually enforce that.

The '--lcores' option made DPDK really confusing, IMHO.
Looking at the DPDK API and wondering how rte_lcore_to_cpu_id()
should work and what it returns if the lcore-to-cpu relation is
one-to-many...  I'm lost.

BTW, looking at the cpu mask in the bug, what are these 8 cores
(4 cores from each numa) from the lcore mask supposed to do?

> 
> 
>>
>> IMHO, the user should not configure DPDK cores at all.  So, it's the
>> user's problem if the cores are configured incorrectly.
> 
> If the user had explicitly set a dpdk option, ok, this is his problem.
> But here, this is an OVS option, so handling this internally makes more
> sense to me.

Sounds like another reason to deprecate and remove those configuration knobs:
https://patchwork.ozlabs.org/project/openvswitch/list/?series=89529&state=*

> 
> 
>>> +        index++;
>>> +    }
>>> +    svec_terminate(&lcores);
>>> +    ovs_numa_dump_destroy(cores);
>>> +    svec_add(args, "--lcores");
>>> +    svec_add_nocopy(args, svec_join(&lcores, ",", ""));
>>> +    svec_destroy(&lcores);
>>> +}
>>> +
>>>  static void
>>>  construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
>>>  {
>>> @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
>>>          bool default_enabled;
>>>          const char *default_value;
>>>      } opts[] = {
>>> -        {"dpdk-lcore-mask",   "-c",             false, NULL},
>>>          {"dpdk-hugepage-dir", "--huge-dir",     false, NULL},
>>>          {"dpdk-socket-limit", "--socket-limit", false, NULL},
>>>      };
>>> @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args)
>>>          svec_parse_words(args, extra_configuration);
>>>      }
>>>
>>> +    construct_dpdk_lcore_option(ovs_other_config, args);
>>>      construct_dpdk_options(ovs_other_config, args);
>>>      construct_dpdk_mutex_options(ovs_other_config, args);
>>>  }
>>> @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
>>>      .write = dpdk_log_write,
>>>  };
>>>
>>> +static void
>>> +dpdk_dump_lcore(struct ds *ds, unsigned lcore)
>>> +{
>>> +    struct svec cores = SVEC_EMPTY_INITIALIZER;
>>> +    rte_cpuset_t cpuset;
>>> +    unsigned cpu;
>>> +
>>> +    cpuset = rte_lcore_cpuset(lcore);
>>> +    for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
>>> +        if (!CPU_ISSET(cpu, &cpuset)) {
>>> +            continue;
>>> +        }
>>> +        svec_add_nocopy(&cores, xasprintf("%u", cpu));
>>> +    }
>>> +    svec_terminate(&cores);
>>> +    ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
>>> +                  rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
>>> +                  svec_join(&cores, ",", ""));
>>> +    svec_destroy(&cores);
>>> +}
>>> +
>>> +static void
>>> +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
>>> +                 void *aux OVS_UNUSED)
>>> +{
>>> +    struct ds ds = DS_EMPTY_INITIALIZER;
>>> +    unsigned lcore;
>>> +
>>> +    ovs_mutex_lock(&lcore_bitmap_mutex);
>>> +    if (lcore_bitmap == NULL) {
>>> +        unixctl_command_reply_error(conn, "DPDK has not been initialised");
>>> +        goto out;
>>> +    }
>>> +    if (argc > 1) {
>>> +        if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
>>> +            !bitmap_is_set(lcore_bitmap, lcore)) {
>>> +            unixctl_command_reply_error(conn, "incorrect lcoreid");
>>> +            goto out;
>>> +        }
>>> +        dpdk_dump_lcore(&ds, lcore);
>>> +    } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>>
>> Not a good style to have 'else for'.
> 
> Tried to be creative :-) (the additional indent is really unnecessary).
> But, if this is a blocker for this patch, ok.
> 
> 
>>
>> DPDK lcore iterator?
> 
> We are iterating over lcores owned by DPDK and OVS.
> 
> Example:
> other_config        : {dpdk-init="true", dpdk-lcore-mask="0x1",
> pmd-cpu-mask="0x00008002"}
> 
> # ovs-appctl dpdk/dump-lcores
> lcore0 (DPDK) is running on core 0
> lcore1 (OVS) is running on core 1
> lcore2 (OVS) is running on core 15
> 
> 
>>
>>> +        if (!bitmap_is_set(lcore_bitmap, lcore)) {
>>> +            continue;
>>> +        }
>>> +        dpdk_dump_lcore(&ds, lcore);
>>> +    }
>>> +    unixctl_command_reply(conn, ds_cstr(&ds));
>>> +    ds_destroy(&ds);
>>> +out:
>>> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
>>> +}
>>> +
>>>  static bool
>>>  dpdk_init__(const struct smap *ovs_other_config)
>>>  {
> 
> 
>>> @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config)
>>>          }
>>>      }
>>>
>>> +    ovs_mutex_lock(&lcore_bitmap_mutex);
>>> +    lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE);
>>> +    /* Mark DPDK threads. */
>>> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
>>
>> Shouldn't we use DPDK lcore iterator instead?
> 
> The DPDK lcore iterator skips "service" cores.
> We would end up reusing those lcores if the user had set something exotic.

What if the user maps lcores so that they float across different cpu
sets?  How should we treat that?

> 
> Example:
> other_config        : {dpdk-extra="-c 0x5 -s 0x4", dpdk-init="true",
> pmd-cpu-mask="0x00008002"}
> # ovs-appctl dpdk/dump-lcores
> lcore0 (DPDK) is running on core 0
> lcore1 (OVS) is running on core 1
> lcore2 (DPDK) is running on core 2
> lcore3 (OVS) is running on core 15
> 
> 
>>
>>> +        struct ds ds = DS_EMPTY_INITIALIZER;
>>> +
>>> +        if (rte_eal_lcore_role(lcore) == ROLE_OFF) {
>>> +             continue;
>>> +        }
>>> +        bitmap_set1(lcore_bitmap, lcore);
>>> +        dpdk_dump_lcore(&ds, lcore);
>>> +        VLOG_INFO("%s", ds_cstr(&ds));
>>> +        ds_destroy(&ds);
>>> +    }
>>> +    unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1,
>>> +                             dpdk_dump_lcores, NULL);
>>
>> We might need this command if we're doing automatic re-mapping
>> of lcores from the EAL arguments, but if not, I don't think we need it.
>> Simple logging on PMD start should be enough.
> 
> Flavio mentioned that the logs could have been rotated.
> 
> 
>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>>> index 7ebf4ef87..91e98919c 100644
>>> --- a/lib/dpif-netdev.c
>>> +++ b/lib/dpif-netdev.c
>>> @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_)
>>>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>>>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>>>      ovs_numa_thread_setaffinity_core(pmd->core_id);
>>> -    dpdk_set_lcore_id(pmd->core_id);
>>> +    dpdk_init_thread_context(pmd->core_id);
>>
>> Since this function will actually change cpu affinity, which is not
> 
> This is the same affinity as what OVS just applied.
> The call to pthread_setaffinity_np in dpdk should be a no-op.

Should be, but we should not depend on DPDK internals.  Changing the
order will guarantee the affinity from the OVS point of view.

> 
> 
>> expected by OVS, it must be called before setting affinity by
>> ovs_numa_thread_setaffinity_core().  This way OVS will guarantee
>> correct affinity for its threads.
> 
> ovs_numa_thread_setaffinity_core() return value is not checked.

Yes. That is the issue we need to fix in OVS thread management.
Right now it's not possible to undo thread creation.

> And I can't see what calling this function changes in the ovs-numa internals.
> We could even remove it and only rely on dpdk.

While it doesn't change the internal state of ovs-numa, it is the
generic OVS way to set affinity.  And we can't remove it since this is
not a 'DPDK datapath'; it is a generic userspace datapath, and the DPDK
calls here are just a bunch of workarounds that should not be there
at all in an ideal world.


To be clear, I'm not against this patch in general, I just think that
it's too heavy for a temporary solution.

Best regards, Ilya Maximets.
Kevin Traynor Dec. 19, 2019, 12:02 p.m. UTC | #4
On 10/12/2019 10:10, David Marchand wrote:
> Most DPDK components make the assumption that rte_lcore_id() returns
> a valid lcore_id in the [0..RTE_MAX_LCORE[ range (with the exception
> of the LCORE_ID_ANY special value).
> 
> OVS does not currently check which value is set in
> RTE_PER_LCORE(_lcore_id), which exposes us to potential crashes on the
> DPDK side.
> 
> Introduce an lcore allocator in OVS for PMD threads and map them to
> unused lcores from DPDK à la --lcores.
> 
> The physical cores on which the PMD threads are running still
> constitute important information when debugging, so keep those in the
> PMD thread names but add a new debug log when starting them.
> Mapping between OVS threads and DPDK lcores can be dumped with a new
> dpdk/dump-lcores command.
> 
> Synchronize DPDK internals on numa and cpuset for the PMD threads by
> registering them via the rte_thread_set_affinity() helper.
> 
> Signed-off-by: David Marchand <david.marchand@redhat.com>

Hi David, it is a big improvement over just failing, thanks. Some
comments below.

> ---
> Changelog since v1:
> - rewired existing configuration 'dpdk-lcore-mask' to use --lcores,
> - switched to a bitmap to track lcores,
> - added a command to dump current mapping (Flavio): used an experimental
>   API to get DPDK lcores cpuset since it is the most reliable/portable
>   information,
> - used the same code for the logs when starting DPDK/PMD threads,
> - addressed Ilya's comments.
> 
> ---
>  lib/dpdk-stub.c   |   8 ++-
>  lib/dpdk.c        | 163 ++++++++++++++++++++++++++++++++++++++++++++--
>  lib/dpdk.h        |   5 +-
>  lib/dpif-netdev.c |   3 +-
>  4 files changed, 170 insertions(+), 9 deletions(-)
> 
> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
> index c332c217c..90473bc8e 100644
> --- a/lib/dpdk-stub.c
> +++ b/lib/dpdk-stub.c
> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
> +{
> +    /* Nothing */
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
>  {
>      /* Nothing */
>  }
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 37ea2973c..0173366a0 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -30,6 +30,7 @@
>  #include <rte_pdump.h>
>  #endif
>  
> +#include "bitmap.h"
>  #include "dirs.h"
>  #include "fatal-signal.h"
>  #include "netdev-dpdk.h"
> @@ -39,6 +40,7 @@
>  #include "ovs-numa.h"
>  #include "smap.h"
>  #include "svec.h"
> +#include "unixctl.h"
>  #include "util.h"
>  #include "vswitch-idl.h"
>  
> @@ -54,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
>                                         * of DPDK. */
>  static bool per_port_memory = false; /* Status of per port memory support */
>  
> +static struct ovs_mutex lcore_bitmap_mutex = OVS_MUTEX_INITIALIZER;
> +static unsigned long *lcore_bitmap OVS_GUARDED_BY(lcore_bitmap_mutex);
> +
>  static int
>  process_vhost_flags(char *flag, const char *default_val, int size,
>                      const struct smap *ovs_other_config,
> @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
>      return false;
>  }
>  
> +static void
> +construct_dpdk_lcore_option(const struct smap *ovs_other_config,
> +                            struct svec *args)
> +{
> +    const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
> +    struct svec lcores = SVEC_EMPTY_INITIALIZER;
> +    struct ovs_numa_info_core *core;
> +    struct ovs_numa_dump *cores;
> +    int index = 0;
> +
> +    if (!cmask) {
> +        return;
> +    }
> +    if (args_contains(args, "-c") || args_contains(args, "-l") ||
> +        args_contains(args, "--lcores")) {
> +                VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' "
> +                          "due to dpdk-extra config");
> +        return;
> +    }
> +
> +    cores = ovs_numa_dump_cores_with_cmask(cmask);
> +    FOR_EACH_CORE_ON_DUMP(core, cores) {

Ran a test with 0x55 as dpdk-lcore-mask and got

|00017|dpdk|INFO|EAL ARGS: ovs-vswitchd --lcores 0@6,1@0,2@2,3@4
--socket-mem 4096,4096 --socket-limit 4096,4096.

which means handlers/revalidators will run on core 6, whereas
previously they were running on core 0. I think the existing behaviour
of running on the lowest core should be kept if possible.

2019-12-18T20:23:11Z|00035|dpdk|INFO|lcore0 (DPDK) is running on core 6
2019-12-18T20:23:17Z|00036|dpdk|INFO|lcore1 (DPDK) is running on core 0
2019-12-18T20:23:19Z|00037|dpdk|INFO|lcore2 (DPDK) is running on core 2
2019-12-18T20:23:22Z|00038|dpdk|INFO|lcore3 (DPDK) is running on core 4

Core 6 is labelled as a DPDK thread, but probably OVS
handlers/revalidators will be running on it. Do we need to distinguish
DPDK vs OVS threads in the logs? From the user's point of view, they
just run OVS.

Also, are logs needed for cores 0,2,4 when they will do nothing? Not
sure, but it's free to log, I suppose.

> +        svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
> +        index++;
> +    }
> +    svec_terminate(&lcores);
> +    ovs_numa_dump_destroy(cores);
> +    svec_add(args, "--lcores");
> +    svec_add_nocopy(args, svec_join(&lcores, ",", ""));
> +    svec_destroy(&lcores);
> +}
> +
>  static void
>  construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
>  {
> @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
>          bool default_enabled;
>          const char *default_value;
>      } opts[] = {
> -        {"dpdk-lcore-mask",   "-c",             false, NULL},
>          {"dpdk-hugepage-dir", "--huge-dir",     false, NULL},
>          {"dpdk-socket-limit", "--socket-limit", false, NULL},
>      };
> @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args)
>          svec_parse_words(args, extra_configuration);
>      }
>  
> +    construct_dpdk_lcore_option(ovs_other_config, args);
>      construct_dpdk_options(ovs_other_config, args);
>      construct_dpdk_mutex_options(ovs_other_config, args);
>  }
> @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
>      .write = dpdk_log_write,
>  };
>  
> +static void
> +dpdk_dump_lcore(struct ds *ds, unsigned lcore)
> +{
> +    struct svec cores = SVEC_EMPTY_INITIALIZER;
> +    rte_cpuset_t cpuset;
> +    unsigned cpu;
> +
> +    cpuset = rte_lcore_cpuset(lcore);
> +    for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
> +        if (!CPU_ISSET(cpu, &cpuset)) {
> +            continue;
> +        }
> +        svec_add_nocopy(&cores, xasprintf("%u", cpu));
> +    }
> +    svec_terminate(&cores);
> +    ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
> +                  rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
> +                  svec_join(&cores, ",", ""));

Ran a test with core mask of 0x5 and adding one pmd on core 8,

|00035|dpdk|INFO|lcore0 (DPDK) is running on core 0
|00036|dpdk|INFO|lcore1 (DPDK) is running on core 2

|00084|dpif_netdev|INFO|PMD thread on numa_id: 0, core id:  8 created.
|00085|dpif_netdev|INFO|There are 1 pmd threads on numa node 0

|00001|dpdk(pmd-c08/id:31)|INFO|lcore2 (OVS) is running on core 8

I think users are used to "pmd cores" and "lcores" through the masks. A
pmd logging its "lcore" may be confusing. At least maybe there's a way
to make it not appear related to the "lcores" in dpdk-lcore-mask.

In another test, I purposely ran out of cores:

|00244|dpif_netdev|INFO|PMD thread on numa_id: 0, core id: 16 created.
|00001|dpdk(pmd-c16/id:36)|WARN|Reached maximum number of DPDK lcores,
core 16 will have lower performance

^^^ Good to get a warning about reduced performance, but it says
core 16 when really it's running on the non-pmd core.
<snip>

|00247|dpif_netdev|INFO|There are 6 pmd threads on numa node 0
|00248|dpif_netdev|INFO|There are 2 pmd threads on numa node 1

^^^ this is not true anymore - these are mostly on the non-pmd core,
which in my case is on numa 0.

Hmm, it's unlikely that a user will use >128 cores in total, so rather
than try to survive it by falling back to the non-pmd thread and having
to think about all the corner cases after that, maybe for now that
could still be a failure. Otherwise, you'd have to change the logs, but
also, does it make sense to equally distribute queues if some of the
pmds are on the non-pmd lcore, and what about numa, etc.?

> +    svec_destroy(&cores);
> +}
> +
> +static void
> +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
> +                 void *aux OVS_UNUSED)
> +{
> +    struct ds ds = DS_EMPTY_INITIALIZER;
> +    unsigned lcore;
> +
> +    ovs_mutex_lock(&lcore_bitmap_mutex);
> +    if (lcore_bitmap == NULL) {
> +        unixctl_command_reply_error(conn, "DPDK has not been initialised");
> +        goto out;
> +    }
> +    if (argc > 1) {
> +        if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
> +            !bitmap_is_set(lcore_bitmap, lcore)) {
> +            unixctl_command_reply_error(conn, "incorrect lcoreid");
> +            goto out;
> +        }
> +        dpdk_dump_lcore(&ds, lcore);
> +    } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
> +        if (!bitmap_is_set(lcore_bitmap, lcore)) {
> +            continue;
> +        }
> +        dpdk_dump_lcore(&ds, lcore);
> +    }
> +    unixctl_command_reply(conn, ds_cstr(&ds));
> +    ds_destroy(&ds);
> +out:
> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
> +}
> +
>  static bool
>  dpdk_init__(const struct smap *ovs_other_config)
>  {
> @@ -345,7 +434,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>          }
>      }
>  
> -    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
> +    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
> +        args_contains(&args, "--lcores")) {
>          auto_determine = false;
>      }
>  
> @@ -371,8 +461,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>               * thread affintity - default to core #0 */
>              VLOG_ERR("Thread getaffinity failed. Using core #0");
>          }
> -        svec_add(&args, "-l");
> -        svec_add_nocopy(&args, xasprintf("%d", cpu));
> +        svec_add(&args, "--lcores");
> +        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
>      }
>  
>      svec_terminate(&args);
> @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config)
>          }
>      }
>  
> +    ovs_mutex_lock(&lcore_bitmap_mutex);
> +    lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE);
> +    /* Mark DPDK threads. */
> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
> +        struct ds ds = DS_EMPTY_INITIALIZER;
> +
> +        if (rte_eal_lcore_role(lcore) == ROLE_OFF) {
> +             continue;
> +        }
> +        bitmap_set1(lcore_bitmap, lcore);
> +        dpdk_dump_lcore(&ds, lcore);
> +        VLOG_INFO("%s", ds_cstr(&ds));
> +        ds_destroy(&ds);
> +    }
> +    unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1,
> +                             dpdk_dump_lcores, NULL);
> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
> +
>      /* We are called from the main thread here */
>      RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
>  
> @@ -512,11 +620,54 @@ dpdk_available(void)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu)
> +dpdk_init_thread_context(unsigned cpu)
>  {
> +    struct ds ds = DS_EMPTY_INITIALIZER;
> +    rte_cpuset_t cpuset;
> +    unsigned lcore;
> +
>      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>      ovs_assert(cpu != NON_PMD_CORE_ID);
> -    RTE_PER_LCORE(_lcore_id) = cpu;
> +
> +    ovs_mutex_lock(&lcore_bitmap_mutex);

This means a lock between PMDs - any concern about when queues are being
redistributed, pmds stopped/started and traffic flowing?

Not thought through at all, but would it be possible to only use the
dpdk lcore allocator if core values >RTE_MAX_LCORE are actually used?
Something like the tx queue re-mapping comes to mind, where locking is
only done if required. You should know about the dpdk-lcore-mask at
eal_init time and about the pmd cores during reconfigure.
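
A rough sketch of that idea (hypothetical, and the bitmap updates would
still need the mutex to stay consistent):

    /* Keep the identity mapping when the physical core id is itself a
     * valid, unused lcore id; only fall back to the allocator
     * otherwise. */
    if (cpu < RTE_MAX_LCORE && !bitmap_is_set(lcore_bitmap, cpu)) {
        lcore = cpu;
    } else {
        lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE);
    }
    if (lcore != RTE_MAX_LCORE) {
        bitmap_set1(lcore_bitmap, lcore);
    }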

thanks,
Kevin.

> +    if (lcore_bitmap == NULL) {
> +        lcore = NON_PMD_CORE_ID;
> +    } else {
> +        lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE);
> +        if (lcore == RTE_MAX_LCORE) {
> +            VLOG_WARN("Reached maximum number of DPDK lcores, core %u will "
> +                      "have lower performance", cpu);
> +            lcore = NON_PMD_CORE_ID;
> +        } else {
> +            bitmap_set1(lcore_bitmap, lcore);
> +        }
> +    }
> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
> +
> +    RTE_PER_LCORE(_lcore_id) = lcore;
> +
> +    if (lcore == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    CPU_ZERO(&cpuset);
> +    CPU_SET(cpu, &cpuset);
> +    rte_thread_set_affinity(&cpuset);
> +    dpdk_dump_lcore(&ds, lcore);
> +    VLOG_INFO("%s", ds_cstr(&ds));
> +    ds_destroy(&ds);
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
> +{
> +    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    ovs_mutex_lock(&lcore_bitmap_mutex);
> +    bitmap_set0(lcore_bitmap, RTE_PER_LCORE(_lcore_id));
> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
>  }
>  
>  void
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 736a64279..eab7a926b 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -22,7 +22,9 @@
>  #ifdef DPDK_NETDEV
>  
>  #include <rte_config.h>
> +#define ALLOW_EXPERIMENTAL_API
>  #include <rte_lcore.h>
> +#undef ALLOW_EXPERIMENTAL_API
>  
>  #define NON_PMD_CORE_ID LCORE_ID_ANY
>  
> @@ -36,7 +38,8 @@ struct smap;
>  struct ovsrec_open_vswitch;
>  
>  void dpdk_init(const struct smap *ovs_other_config);
> -void dpdk_set_lcore_id(unsigned cpu);
> +void dpdk_init_thread_context(unsigned cpu);
> +void dpdk_uninit_thread_context(void);
>  const char *dpdk_get_vhost_sock_dir(void);
>  bool dpdk_vhost_iommu_enabled(void);
>  bool dpdk_vhost_postcopy_enabled(void);
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 7ebf4ef87..91e98919c 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_)
>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>      ovs_numa_thread_setaffinity_core(pmd->core_id);
> -    dpdk_set_lcore_id(pmd->core_id);
> +    dpdk_init_thread_context(pmd->core_id);
>      poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>      dfc_cache_init(&pmd->flow_cache);
>      pmd_alloc_static_tx_qid(pmd);
> @@ -5590,6 +5590,7 @@ reload:
>      dfc_cache_uninit(&pmd->flow_cache);
>      free(poll_list);
>      pmd_free_cached_ports(pmd);
> +    dpdk_uninit_thread_context();
>      return NULL;
>  }
>  
>
Aaron Conole Jan. 29, 2020, 8:05 p.m. UTC | #5
David Marchand <david.marchand@redhat.com> writes:

> Most DPDK components make the assumption that rte_lcore_id() returns
> a valid lcore_id in the [0..RTE_MAX_LCORE[ range (with the exception
> of the LCORE_ID_ANY special value).
> OVS does not currently check which value is set in
> RTE_PER_LCORE(_lcore_id), which exposes us to potential crashes on the
> DPDK side.
> Introduce an lcore allocator in OVS for PMD threads and map them to
> unused lcores from DPDK à la --lcores.
> The physical cores on which the PMD threads are running still
> constitute important information when debugging, so keep those in the
> PMD thread names but add a new debug log when starting them.
> Mapping between OVS threads and DPDK lcores can be dumped with a new
> dpdk/dump-lcores command.
> Synchronize DPDK internals on numa and cpuset for the PMD threads by
> registering them via the rte_thread_set_affinity() helper.
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---

Thanks for this work.  The failure condition that exists today is
confusing for users.

I do prefer having OvS handle part of this internally - it will only
be used with dpdk ports when the EAL gets initialized, so I dislike
documenting some kinds of options.  This is where I like having the
abstraction layer for option passing, because we can fix things up for
the user.

> Changelog since v1:
> - rewired existing configuration 'dpdk-lcore-mask' to use --lcores,
> - switched to a bitmap to track lcores,
> - added a command to dump current mapping (Flavio): used an experimental
>   API to get DPDK lcores cpuset since it is the most reliable/portable
>   information,
> - used the same code for the logs when starting DPDK/PMD threads,
> - addressed Ilya's comments.
> ---
>  lib/dpdk-stub.c   |   8 ++-
>  lib/dpdk.c        | 163 ++++++++++++++++++++++++++++++++++++++++++++--
>  lib/dpdk.h        |   5 +-
>  lib/dpif-netdev.c |   3 +-
>  4 files changed, 170 insertions(+), 9 deletions(-)
> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
> index c332c217c..90473bc8e 100644
> --- a/lib/dpdk-stub.c
> +++ b/lib/dpdk-stub.c
> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
>  }
>  void
> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
> +{
> +    /* Nothing */
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
>  {
>      /* Nothing */
>  }
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 37ea2973c..0173366a0 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -30,6 +30,7 @@
>  #include <rte_pdump.h>
>  #endif
> +#include "bitmap.h"
>  #include "dirs.h"
>  #include "fatal-signal.h"
>  #include "netdev-dpdk.h"
> @@ -39,6 +40,7 @@
>  #include "ovs-numa.h"
>  #include "smap.h"
>  #include "svec.h"
> +#include "unixctl.h"
>  #include "util.h"
>  #include "vswitch-idl.h"
> @@ -54,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
>                                         * of DPDK. */
>  static bool per_port_memory = false; /* Status of per port memory support */
> +static struct ovs_mutex lcore_bitmap_mutex = OVS_MUTEX_INITIALIZER;
> +static unsigned long *lcore_bitmap OVS_GUARDED_BY(lcore_bitmap_mutex);
> +
>  static int
>  process_vhost_flags(char *flag, const char *default_val, int size,
>                      const struct smap *ovs_other_config,
> @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
>      return false;
>  }
> +static void
> +construct_dpdk_lcore_option(const struct smap *ovs_other_config,
> +                            struct svec *args)
> +{
> +    const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
> +    struct svec lcores = SVEC_EMPTY_INITIALIZER;
> +    struct ovs_numa_info_core *core;
> +    struct ovs_numa_dump *cores;
> +    int index = 0;
> +
> +    if (!cmask) {
> +        return;
> +    }
> +    if (args_contains(args, "-c") || args_contains(args, "-l") ||
> +        args_contains(args, "--lcores")) {
> +                VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' "
> +                          "due to dpdk-extra config");
> +        return;
> +    }
> +
> +    cores = ovs_numa_dump_cores_with_cmask(cmask);
> +    FOR_EACH_CORE_ON_DUMP(core, cores) {
> +        svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
> +        index++;
> +    }
> +    svec_terminate(&lcores);
> +    ovs_numa_dump_destroy(cores);
> +    svec_add(args, "--lcores");
> +    svec_add_nocopy(args, svec_join(&lcores, ",", ""));
> +    svec_destroy(&lcores);
> +}
> +
>  static void
>  construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
>  {
> @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
>          bool default_enabled;
>          const char *default_value;
>      } opts[] = {
> -        {"dpdk-lcore-mask",   "-c",             false, NULL},
>          {"dpdk-hugepage-dir", "--huge-dir",     false, NULL},
>          {"dpdk-socket-limit", "--socket-limit", false, NULL},
>      };
> @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args)
>          svec_parse_words(args, extra_configuration);
>      }
> +    construct_dpdk_lcore_option(ovs_other_config, args);
>      construct_dpdk_options(ovs_other_config, args);
>      construct_dpdk_mutex_options(ovs_other_config, args);
>  }
> @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
>      .write = dpdk_log_write,
>  };
> +static void
> +dpdk_dump_lcore(struct ds *ds, unsigned lcore)
> +{
> +    struct svec cores = SVEC_EMPTY_INITIALIZER;
> +    rte_cpuset_t cpuset;
> +    unsigned cpu;
> +
> +    cpuset = rte_lcore_cpuset(lcore);

I guess we don't have any way of calling pthread_getaffinity_np() on
the linux platform, so we need to rely on this call.
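
(For the current thread, a sketch of the pthread-based alternative
would look roughly like this, assuming _GNU_SOURCE on Linux; it does
not help for dumping *other* lcores' cpusets, since we don't track
their pthread_t handles:)

    /* Sketch: query this thread's affinity without going through
     * the DPDK experimental API. */
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    if (!pthread_getaffinity_np(pthread_self(), sizeof cpuset,
                                &cpuset)) {
        /* 'cpuset' now holds the cpus this thread may run on. */
    }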

> +    for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
> +        if (!CPU_ISSET(cpu, &cpuset)) {
> +            continue;
> +        }
> +        svec_add_nocopy(&cores, xasprintf("%u", cpu));
> +    }
> +    svec_terminate(&cores);
> +    ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
> +                  rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
> +                  svec_join(&cores, ",", ""));
> +    svec_destroy(&cores);
> +}
> +
> +static void
> +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
> +                 void *aux OVS_UNUSED)
> +{
> +    struct ds ds = DS_EMPTY_INITIALIZER;
> +    unsigned lcore;
> +
> +    ovs_mutex_lock(&lcore_bitmap_mutex);
> +    if (lcore_bitmap == NULL) {
> +        unixctl_command_reply_error(conn, "DPDK has not been initialised");
> +        goto out;
> +    }
> +    if (argc > 1) {
> +        if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
> +            !bitmap_is_set(lcore_bitmap, lcore)) {
> +            unixctl_command_reply_error(conn, "incorrect lcoreid");
> +            goto out;
> +        }
> +        dpdk_dump_lcore(&ds, lcore);
> +    } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {

Neat.  I've never seen code constructed like this.

> +        if (!bitmap_is_set(lcore_bitmap, lcore)) {
> +            continue;
> +        }
> +        dpdk_dump_lcore(&ds, lcore);
> +    }
> +    unixctl_command_reply(conn, ds_cstr(&ds));
> +    ds_destroy(&ds);
> +out:
> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
> +}
> +
>  static bool
>  dpdk_init__(const struct smap *ovs_other_config)
>  {
> @@ -345,7 +434,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>          }
>      }
> -    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
> +    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
> +        args_contains(&args, "--lcores")) {
>          auto_determine = false;
>      }
> @@ -371,8 +461,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>               * thread affintity - default to core #0 */
>              VLOG_ERR("Thread getaffinity failed. Using core #0");
>          }
> -        svec_add(&args, "-l");
> -        svec_add_nocopy(&args, xasprintf("%d", cpu));
> +        svec_add(&args, "--lcores");
> +        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
>      }
>      svec_terminate(&args);
> @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config)
>          }
>      }
> +    ovs_mutex_lock(&lcore_bitmap_mutex);
> +    lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE);
> +    /* Mark DPDK threads. */
> +    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
> +        struct ds ds = DS_EMPTY_INITIALIZER;
> +
> +        if (rte_eal_lcore_role(lcore) == ROLE_OFF) {
> +             continue;
> +        }
> +        bitmap_set1(lcore_bitmap, lcore);
> +        dpdk_dump_lcore(&ds, lcore);
> +        VLOG_INFO("%s", ds_cstr(&ds));
> +        ds_destroy(&ds);
> +    }
> +    unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1,
> +                             dpdk_dump_lcores, NULL);
> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
> +
>      /* We are called from the main thread here */
>      RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
> @@ -512,11 +620,54 @@ dpdk_available(void)
>  }
>  void
> -dpdk_set_lcore_id(unsigned cpu)
> +dpdk_init_thread_context(unsigned cpu)
>  {
> +    struct ds ds = DS_EMPTY_INITIALIZER;
> +    rte_cpuset_t cpuset;
> +    unsigned lcore;
> +
>      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>      ovs_assert(cpu != NON_PMD_CORE_ID);
> -    RTE_PER_LCORE(_lcore_id) = cpu;
> +
> +    ovs_mutex_lock(&lcore_bitmap_mutex);
> +    if (lcore_bitmap == NULL) {
> +        lcore = NON_PMD_CORE_ID;
> +    } else {
> +        lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE);
> +        if (lcore == RTE_MAX_LCORE) {
> +            VLOG_WARN("Reached maximum number of DPDK lcores, core %u will "
> +                      "have lower performance", cpu);

I wouldn't put anything predictive about performance in this warning.
Something like

"core %u will not be mapped in accordance with the mask"

would be more accurate.
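
In code that would be roughly (just a sketch of the rewording):

    VLOG_WARN("Reached maximum number of DPDK lcores, core %u will "
              "not be mapped in accordance with the mask", cpu);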

> +            lcore = NON_PMD_CORE_ID;
> +        } else {
> +            bitmap_set1(lcore_bitmap, lcore);
> +        }
> +    }
> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
> +
> +    RTE_PER_LCORE(_lcore_id) = lcore;
> +
> +    if (lcore == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    CPU_ZERO(&cpuset);
> +    CPU_SET(cpu, &cpuset);
> +    rte_thread_set_affinity(&cpuset);
> +    dpdk_dump_lcore(&ds, lcore);
> +    VLOG_INFO("%s", ds_cstr(&ds));
> +    ds_destroy(&ds);
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
> +{
> +    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
> +        return;
> +    }
> +
> +    ovs_mutex_lock(&lcore_bitmap_mutex);
> +    bitmap_set0(lcore_bitmap, RTE_PER_LCORE(_lcore_id));
> +    ovs_mutex_unlock(&lcore_bitmap_mutex);
>  }
>  void
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 736a64279..eab7a926b 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -22,7 +22,9 @@
>  #ifdef DPDK_NETDEV
>  #include <rte_config.h>
> +#define ALLOW_EXPERIMENTAL_API
>  #include <rte_lcore.h>
> +#undef ALLOW_EXPERIMENTAL_API

This is kind of ugly.  Isn't it supposed to be done by the library
with the rte_config.h?  What happens if the library is compiled without
experimental support?  I didn't check this, but it doesn't /look/ nice.

Maybe it's better to add a feature-test macro and avoid these APIs when
the DPDK build doesn't support experimental APIs.  WDYT?
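
To illustrate, a rough sketch of what I mean; HAVE_RTE_LCORE_CPUSET
here stands for a hypothetical configure-time check for the
experimental symbol, not an existing macro:

    #ifdef HAVE_RTE_LCORE_CPUSET
        /* Experimental API available: query the real cpuset. */
        cpuset = rte_lcore_cpuset(lcore);
    #else
        /* Without the experimental API, report an empty cpuset. */
        CPU_ZERO(&cpuset);
    #endif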

>  #define NON_PMD_CORE_ID LCORE_ID_ANY
> @@ -36,7 +38,8 @@ struct smap;
>  struct ovsrec_open_vswitch;
>  void dpdk_init(const struct smap *ovs_other_config);
> -void dpdk_set_lcore_id(unsigned cpu);
> +void dpdk_init_thread_context(unsigned cpu);
> +void dpdk_uninit_thread_context(void);
>  const char *dpdk_get_vhost_sock_dir(void);
>  bool dpdk_vhost_iommu_enabled(void);
>  bool dpdk_vhost_postcopy_enabled(void);
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 7ebf4ef87..91e98919c 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_)
>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>      ovs_numa_thread_setaffinity_core(pmd->core_id);
> -    dpdk_set_lcore_id(pmd->core_id);
> +    dpdk_init_thread_context(pmd->core_id);
>      poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>      dfc_cache_init(&pmd->flow_cache);
>      pmd_alloc_static_tx_qid(pmd);
> @@ -5590,6 +5590,7 @@ reload:
>      dfc_cache_uninit(&pmd->flow_cache);
>      free(poll_list);
>      pmd_free_cached_ports(pmd);
> +    dpdk_uninit_thread_context();
>      return NULL;
>  }

Patch

diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
index c332c217c..90473bc8e 100644
--- a/lib/dpdk-stub.c
+++ b/lib/dpdk-stub.c
@@ -39,7 +39,13 @@  dpdk_init(const struct smap *ovs_other_config)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
+dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
+{
+    /* Nothing */
+}
+
+void
+dpdk_uninit_thread_context(void)
 {
     /* Nothing */
 }
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 37ea2973c..0173366a0 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -30,6 +30,7 @@ 
 #include <rte_pdump.h>
 #endif
 
+#include "bitmap.h"
 #include "dirs.h"
 #include "fatal-signal.h"
 #include "netdev-dpdk.h"
@@ -39,6 +40,7 @@ 
 #include "ovs-numa.h"
 #include "smap.h"
 #include "svec.h"
+#include "unixctl.h"
 #include "util.h"
 #include "vswitch-idl.h"
 
@@ -54,6 +56,9 @@  static bool dpdk_initialized = false; /* Indicates successful initialization
                                        * of DPDK. */
 static bool per_port_memory = false; /* Status of per port memory support */
 
+static struct ovs_mutex lcore_bitmap_mutex = OVS_MUTEX_INITIALIZER;
+static unsigned long *lcore_bitmap OVS_GUARDED_BY(lcore_bitmap_mutex);
+
 static int
 process_vhost_flags(char *flag, const char *default_val, int size,
                     const struct smap *ovs_other_config,
@@ -94,6 +99,38 @@  args_contains(const struct svec *args, const char *value)
     return false;
 }
 
+static void
+construct_dpdk_lcore_option(const struct smap *ovs_other_config,
+                            struct svec *args)
+{
+    const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
+    struct svec lcores = SVEC_EMPTY_INITIALIZER;
+    struct ovs_numa_info_core *core;
+    struct ovs_numa_dump *cores;
+    int index = 0;
+
+    if (!cmask) {
+        return;
+    }
+    if (args_contains(args, "-c") || args_contains(args, "-l") ||
+        args_contains(args, "--lcores")) {
+        VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' "
+                  "due to dpdk-extra config");
+        return;
+    }
+
+    cores = ovs_numa_dump_cores_with_cmask(cmask);
+    FOR_EACH_CORE_ON_DUMP(core, cores) {
+        svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
+        index++;
+    }
+    svec_terminate(&lcores);
+    ovs_numa_dump_destroy(cores);
+    svec_add(args, "--lcores");
+    svec_add_nocopy(args, svec_join(&lcores, ",", ""));
+    svec_destroy(&lcores);
+}
+
 static void
 construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
 {
@@ -103,7 +140,6 @@  construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
         bool default_enabled;
         const char *default_value;
     } opts[] = {
-        {"dpdk-lcore-mask",   "-c",             false, NULL},
         {"dpdk-hugepage-dir", "--huge-dir",     false, NULL},
         {"dpdk-socket-limit", "--socket-limit", false, NULL},
     };
@@ -224,6 +260,7 @@  construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args)
         svec_parse_words(args, extra_configuration);
     }
 
+    construct_dpdk_lcore_option(ovs_other_config, args);
     construct_dpdk_options(ovs_other_config, args);
     construct_dpdk_mutex_options(ovs_other_config, args);
 }
@@ -264,6 +301,58 @@  static cookie_io_functions_t dpdk_log_func = {
     .write = dpdk_log_write,
 };
 
+static void
+dpdk_dump_lcore(struct ds *ds, unsigned lcore)
+{
+    struct svec cores = SVEC_EMPTY_INITIALIZER;
+    rte_cpuset_t cpuset;
+    char *cores_str;
+    unsigned cpu;
+
+    cpuset = rte_lcore_cpuset(lcore);
+    for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+        if (!CPU_ISSET(cpu, &cpuset)) {
+            continue;
+        }
+        svec_add_nocopy(&cores, xasprintf("%u", cpu));
+    }
+    svec_terminate(&cores);
+    cores_str = svec_join(&cores, ",", "");
+    ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
+                  rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
+                  cores_str);
+    free(cores_str);
+    svec_destroy(&cores);
+}
+
+static void
+dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
+                 void *aux OVS_UNUSED)
+{
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    unsigned lcore;
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    if (lcore_bitmap == NULL) {
+        unixctl_command_reply_error(conn, "DPDK has not been initialised");
+        goto out;
+    }
+    if (argc > 1) {
+        if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
+            !bitmap_is_set(lcore_bitmap, lcore)) {
+            unixctl_command_reply_error(conn, "incorrect lcoreid");
+            goto out;
+        }
+        dpdk_dump_lcore(&ds, lcore);
+    } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+        if (!bitmap_is_set(lcore_bitmap, lcore)) {
+            continue;
+        }
+        dpdk_dump_lcore(&ds, lcore);
+    }
+    unixctl_command_reply(conn, ds_cstr(&ds));
+    ds_destroy(&ds);
+out:
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+}
+
 static bool
 dpdk_init__(const struct smap *ovs_other_config)
 {
@@ -345,7 +434,8 @@  dpdk_init__(const struct smap *ovs_other_config)
         }
     }
 
-    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
+    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
+        args_contains(&args, "--lcores")) {
         auto_determine = false;
     }
 
@@ -371,8 +461,8 @@  dpdk_init__(const struct smap *ovs_other_config)
             * thread affinity - default to core #0 */
             VLOG_ERR("Thread getaffinity failed. Using core #0");
         }
-        svec_add(&args, "-l");
-        svec_add_nocopy(&args, xasprintf("%d", cpu));
+        svec_add(&args, "--lcores");
+        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
     }
 
     svec_terminate(&args);
@@ -428,6 +518,24 @@  dpdk_init__(const struct smap *ovs_other_config)
         }
     }
 
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE);
+    /* Mark DPDK threads. */
+    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+
+        if (rte_eal_lcore_role(lcore) == ROLE_OFF) {
+             continue;
+        }
+        bitmap_set1(lcore_bitmap, lcore);
+        dpdk_dump_lcore(&ds, lcore);
+        VLOG_INFO("%s", ds_cstr(&ds));
+        ds_destroy(&ds);
+    }
+    unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1,
+                             dpdk_dump_lcores, NULL);
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+
     /* We are called from the main thread here */
     RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
 
@@ -512,11 +620,54 @@  dpdk_available(void)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu)
+dpdk_init_thread_context(unsigned cpu)
 {
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    rte_cpuset_t cpuset;
+    unsigned lcore;
+
     /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
     ovs_assert(cpu != NON_PMD_CORE_ID);
-    RTE_PER_LCORE(_lcore_id) = cpu;
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    if (lcore_bitmap == NULL) {
+        lcore = NON_PMD_CORE_ID;
+    } else {
+        lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE);
+        if (lcore == RTE_MAX_LCORE) {
+            VLOG_WARN("Reached maximum number of DPDK lcores, core %u will "
+                      "have lower performance", cpu);
+            lcore = NON_PMD_CORE_ID;
+        } else {
+            bitmap_set1(lcore_bitmap, lcore);
+        }
+    }
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+
+    RTE_PER_LCORE(_lcore_id) = lcore;
+
+    if (lcore == NON_PMD_CORE_ID) {
+        return;
+    }
+
+    CPU_ZERO(&cpuset);
+    CPU_SET(cpu, &cpuset);
+    rte_thread_set_affinity(&cpuset);
+    dpdk_dump_lcore(&ds, lcore);
+    VLOG_INFO("%s", ds_cstr(&ds));
+    ds_destroy(&ds);
+}
+
+void
+dpdk_uninit_thread_context(void)
+{
+    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
+        return;
+    }
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    bitmap_set0(lcore_bitmap, RTE_PER_LCORE(_lcore_id));
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
 }
 
 void
diff --git a/lib/dpdk.h b/lib/dpdk.h
index 736a64279..eab7a926b 100644
--- a/lib/dpdk.h
+++ b/lib/dpdk.h
@@ -22,7 +22,9 @@ 
 #ifdef DPDK_NETDEV
 
 #include <rte_config.h>
+#define ALLOW_EXPERIMENTAL_API
 #include <rte_lcore.h>
+#undef ALLOW_EXPERIMENTAL_API
 
 #define NON_PMD_CORE_ID LCORE_ID_ANY
 
@@ -36,7 +38,8 @@  struct smap;
 struct ovsrec_open_vswitch;
 
 void dpdk_init(const struct smap *ovs_other_config);
-void dpdk_set_lcore_id(unsigned cpu);
+void dpdk_init_thread_context(unsigned cpu);
+void dpdk_uninit_thread_context(void);
 const char *dpdk_get_vhost_sock_dir(void);
 bool dpdk_vhost_iommu_enabled(void);
 bool dpdk_vhost_postcopy_enabled(void);
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 7ebf4ef87..91e98919c 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5470,7 +5470,7 @@  pmd_thread_main(void *f_)
     /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
     ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
     ovs_numa_thread_setaffinity_core(pmd->core_id);
-    dpdk_set_lcore_id(pmd->core_id);
+    dpdk_init_thread_context(pmd->core_id);
     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
     dfc_cache_init(&pmd->flow_cache);
     pmd_alloc_static_tx_qid(pmd);
@@ -5590,6 +5590,7 @@  reload:
     dfc_cache_uninit(&pmd->flow_cache);
     free(poll_list);
     pmd_free_cached_ports(pmd);
+    dpdk_uninit_thread_context();
     return NULL;
 }