Message ID | 20191210101012.8824-1-david.marchand@redhat.com |
---|---|
State | Superseded |
Series | [ovs-dev,v2] dpdk: Support running PMD threads on cores > RTE_MAX_LCORE. |
On 10.12.2019 11:10, David Marchand wrote: > Most DPDK components make the assumption that rte_lcore_id() returns > a valid lcore_id in [0..RTE_MAX_LCORE[ range (with the exception of > the LCORE_ID_ANY special value). > > OVS does not currently check which value is set in > RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK > side. > > Introduce a lcore allocator in OVS for PMD threads and map them to > unused lcores from DPDK à la --lcores. > > The physical cores on which the PMD threads are running still > constitutes an important information when debugging, so still keep > those in the PMD thread names but add a new debug log when starting > them. > Mapping between OVS threads and DPDK lcores can be dumped with a new > dpdk/dump-lcores command. > > Synchronize DPDK internals on numa and cpuset for the PMD threads by > registering them via the rte_thread_set_affinity() helper. > > Signed-off-by: David Marchand <david.marchand@redhat.com> > --- > Changelog since v1: > - rewired existing configuration 'dpdk-lcore-mask' to use --lcores, > - switched to a bitmap to track lcores, > - added a command to dump current mapping (Flavio): used an experimental > API to get DPDK lcores cpuset since it is the most reliable/portable > information, > - used the same code for the logs when starting DPDK/PMD threads, > - addressed Ilya comments, > This version looks scary. Some comments inline. > --- > lib/dpdk-stub.c | 8 ++- > lib/dpdk.c | 163 ++++++++++++++++++++++++++++++++++++++++++++-- > lib/dpdk.h | 5 +- > lib/dpif-netdev.c | 3 +- > 4 files changed, 170 insertions(+), 9 deletions(-) > > diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c > index c332c217c..90473bc8e 100644 > --- a/lib/dpdk-stub.c > +++ b/lib/dpdk-stub.c > @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config) > } > > void > -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED) > +dpdk_init_thread_context(unsigned cpu OVS_UNUSED) > +{ > + /* Nothing */ > +} > + > +void > +dpdk_uninit_thread_context(void) > { > /* Nothing */ > } > diff --git a/lib/dpdk.c b/lib/dpdk.c > index 37ea2973c..0173366a0 100644 > --- a/lib/dpdk.c > +++ b/lib/dpdk.c > @@ -30,6 +30,7 @@ > #include <rte_pdump.h> > #endif > > +#include "bitmap.h" > #include "dirs.h" > #include "fatal-signal.h" > #include "netdev-dpdk.h" > @@ -39,6 +40,7 @@ > #include "ovs-numa.h" > #include "smap.h" > #include "svec.h" > +#include "unixctl.h" > #include "util.h" > #include "vswitch-idl.h" > > @@ -54,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization > * of DPDK. 
*/ > static bool per_port_memory = false; /* Status of per port memory support */ > > +static struct ovs_mutex lcore_bitmap_mutex = OVS_MUTEX_INITIALIZER; > +static unsigned long *lcore_bitmap OVS_GUARDED_BY(lcore_bitmap_mutex); > + > static int > process_vhost_flags(char *flag, const char *default_val, int size, > const struct smap *ovs_other_config, > @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value) > return false; > } > > +static void > +construct_dpdk_lcore_option(const struct smap *ovs_other_config, > + struct svec *args) > +{ > + const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask"); > + struct svec lcores = SVEC_EMPTY_INITIALIZER; > + struct ovs_numa_info_core *core; > + struct ovs_numa_dump *cores; > + int index = 0; > + > + if (!cmask) { > + return; > + } > + if (args_contains(args, "-c") || args_contains(args, "-l") || > + args_contains(args, "--lcores")) { > + VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' " > + "due to dpdk-extra config"); > + return; > + } > + > + cores = ovs_numa_dump_cores_with_cmask(cmask); > + FOR_EACH_CORE_ON_DUMP(core, cores) { > + svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id)); What's the point of re-mapping these cores? Just let DPDK to fail initialization and user will adjust cores. This automatic re-mapping only complicates everything. IMHO, user should not configure DPDK cores at all. So, it's users' problem if cores configured incorrectly. > + index++; > + } > + svec_terminate(&lcores); > + ovs_numa_dump_destroy(cores); > + svec_add(args, "--lcores"); > + svec_add_nocopy(args, svec_join(&lcores, ",", "")); > + svec_destroy(&lcores); > +} > + > static void > construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) > { > @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) > bool default_enabled; > const char *default_value; > } opts[] = { > - {"dpdk-lcore-mask", "-c", false, NULL}, > {"dpdk-hugepage-dir", "--huge-dir", false, NULL}, > {"dpdk-socket-limit", "--socket-limit", false, NULL}, > }; > @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args) > svec_parse_words(args, extra_configuration); > } > > + construct_dpdk_lcore_option(ovs_other_config, args); > construct_dpdk_options(ovs_other_config, args); > construct_dpdk_mutex_options(ovs_other_config, args); > } > @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = { > .write = dpdk_log_write, > }; > > +static void > +dpdk_dump_lcore(struct ds *ds, unsigned lcore) > +{ > + struct svec cores = SVEC_EMPTY_INITIALIZER; > + rte_cpuset_t cpuset; > + unsigned cpu; > + > + cpuset = rte_lcore_cpuset(lcore); > + for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { > + if (!CPU_ISSET(cpu, &cpuset)) { > + continue; > + } > + svec_add_nocopy(&cores, xasprintf("%u", cpu)); > + }> + svec_terminate(&cores); > + ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore, > + rte_eal_lcore_role(lcore) != ROLE_OFF ? 
"DPDK" : "OVS", > + svec_join(&cores, ",", "")); > + svec_destroy(&cores); > +} > + > +static void > +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[], > + void *aux OVS_UNUSED) > +{ > + struct ds ds = DS_EMPTY_INITIALIZER; > + unsigned lcore; > + > + ovs_mutex_lock(&lcore_bitmap_mutex); > + if (lcore_bitmap == NULL) { > + unixctl_command_reply_error(conn, "DPDK has not been initialised"); > + goto out; > + } > + if (argc > 1) { > + if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE || > + !bitmap_is_set(lcore_bitmap, lcore)) { > + unixctl_command_reply_error(conn, "incorrect lcoreid"); > + goto out; > + } > + dpdk_dump_lcore(&ds, lcore); > + } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { Not a good style to have 'else for'. DPDK lcore iterator? > + if (!bitmap_is_set(lcore_bitmap, lcore)) { > + continue; > + } > + dpdk_dump_lcore(&ds, lcore); > + } > + unixctl_command_reply(conn, ds_cstr(&ds)); > + ds_destroy(&ds); > +out: > + ovs_mutex_unlock(&lcore_bitmap_mutex); > +} > + > static bool > dpdk_init__(const struct smap *ovs_other_config) > { > @@ -345,7 +434,8 @@ dpdk_init__(const struct smap *ovs_other_config) > } > } > > - if (args_contains(&args, "-c") || args_contains(&args, "-l")) { > + if (args_contains(&args, "-c") || args_contains(&args, "-l") || > + args_contains(&args, "--lcores")) { > auto_determine = false; > } > > @@ -371,8 +461,8 @@ dpdk_init__(const struct smap *ovs_other_config) > * thread affintity - default to core #0 */ > VLOG_ERR("Thread getaffinity failed. Using core #0"); > } > - svec_add(&args, "-l"); > - svec_add_nocopy(&args, xasprintf("%d", cpu)); > + svec_add(&args, "--lcores"); > + svec_add_nocopy(&args, xasprintf("0@%d", cpu)); > } > > svec_terminate(&args); > @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config) > } > } > > + ovs_mutex_lock(&lcore_bitmap_mutex); > + lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE); > + /* Mark DPDK threads. */ > + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { Shouldn't we use DPDK lcore iterator instead? > + struct ds ds = DS_EMPTY_INITIALIZER; > + > + if (rte_eal_lcore_role(lcore) == ROLE_OFF) { > + continue; > + } > + bitmap_set1(lcore_bitmap, lcore); > + dpdk_dump_lcore(&ds, lcore); > + VLOG_INFO("%s", ds_cstr(&ds)); > + ds_destroy(&ds); > + } > + unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1, > + dpdk_dump_lcores, NULL); We might need this command if we're doing automatic re-mapping of lcores from the EAL arguments, but if not, I don't think we need it. Simple logging on PMD start should be enough. > + ovs_mutex_unlock(&lcore_bitmap_mutex); > + > /* We are called from the main thread here */ > RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID; > > @@ -512,11 +620,54 @@ dpdk_available(void) > } > > void > -dpdk_set_lcore_id(unsigned cpu) > +dpdk_init_thread_context(unsigned cpu) > { > + struct ds ds = DS_EMPTY_INITIALIZER; > + rte_cpuset_t cpuset; > + unsigned lcore; > + > /* NON_PMD_CORE_ID is reserved for use by non pmd threads. 
*/ > ovs_assert(cpu != NON_PMD_CORE_ID); > - RTE_PER_LCORE(_lcore_id) = cpu; > + > + ovs_mutex_lock(&lcore_bitmap_mutex); > + if (lcore_bitmap == NULL) { > + lcore = NON_PMD_CORE_ID; > + } else { > + lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE); > + if (lcore == RTE_MAX_LCORE) { > + VLOG_WARN("Reached maximum number of DPDK lcores, core %u will " > + "have lower performance", cpu); > + lcore = NON_PMD_CORE_ID; > + } else { > + bitmap_set1(lcore_bitmap, lcore); > + } > + } > + ovs_mutex_unlock(&lcore_bitmap_mutex); > + > + RTE_PER_LCORE(_lcore_id) = lcore; > + > + if (lcore == NON_PMD_CORE_ID) { > + return; > + } > + > + CPU_ZERO(&cpuset); > + CPU_SET(cpu, &cpuset); > + rte_thread_set_affinity(&cpuset); > + dpdk_dump_lcore(&ds, lcore); > + VLOG_INFO("%s", ds_cstr(&ds)); > + ds_destroy(&ds); > +} > + > +void > +dpdk_uninit_thread_context(void) > +{ > + if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) { > + return; > + } > + > + ovs_mutex_lock(&lcore_bitmap_mutex); > + bitmap_set0(lcore_bitmap, RTE_PER_LCORE(_lcore_id)); > + ovs_mutex_unlock(&lcore_bitmap_mutex); > } > > void > diff --git a/lib/dpdk.h b/lib/dpdk.h > index 736a64279..eab7a926b 100644 > --- a/lib/dpdk.h > +++ b/lib/dpdk.h > @@ -22,7 +22,9 @@ > #ifdef DPDK_NETDEV > > #include <rte_config.h> > +#define ALLOW_EXPERIMENTAL_API Not for this patch, but for DPDK API: I think, ALLOW_EXPERIMENTAL_API is too generic name for external definition. It should have some special prefix. > #include <rte_lcore.h> > +#undef ALLOW_EXPERIMENTAL_API > > #define NON_PMD_CORE_ID LCORE_ID_ANY > > @@ -36,7 +38,8 @@ struct smap; > struct ovsrec_open_vswitch; > > void dpdk_init(const struct smap *ovs_other_config); > -void dpdk_set_lcore_id(unsigned cpu); > +void dpdk_init_thread_context(unsigned cpu); > +void dpdk_uninit_thread_context(void); > const char *dpdk_get_vhost_sock_dir(void); > bool dpdk_vhost_iommu_enabled(void); > bool dpdk_vhost_postcopy_enabled(void); > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 7ebf4ef87..91e98919c 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_) > /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ > ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); > ovs_numa_thread_setaffinity_core(pmd->core_id); > - dpdk_set_lcore_id(pmd->core_id); > + dpdk_init_thread_context(pmd->core_id); Since this function will atually change cpu affinity that is not expected by OVS it must be called before setting affinity by ovs_numa_thread_setaffinity_core(). This way OVS will guarantee correct affinity for its threads. > poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); > dfc_cache_init(&pmd->flow_cache); > pmd_alloc_static_tx_qid(pmd); > @@ -5590,6 +5590,7 @@ reload: > dfc_cache_uninit(&pmd->flow_cache); > free(poll_list); > pmd_free_cached_ports(pmd); > + dpdk_uninit_thread_context(); > return NULL; > } > >
On Fri, Dec 13, 2019 at 6:34 PM Ilya Maximets <i.maximets@ovn.org> wrote: > > @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value) > > return false; > > } > > > > +static void > > +construct_dpdk_lcore_option(const struct smap *ovs_other_config, > > + struct svec *args) > > +{ > > + const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask"); > > + struct svec lcores = SVEC_EMPTY_INITIALIZER; > > + struct ovs_numa_info_core *core; > > + struct ovs_numa_dump *cores; > > + int index = 0; > > + > > + if (!cmask) { > > + return; > > + } > > + if (args_contains(args, "-c") || args_contains(args, "-l") || > > + args_contains(args, "--lcores")) { > > + VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' " > > + "due to dpdk-extra config"); > > + return; > > + } > > + > > + cores = ovs_numa_dump_cores_with_cmask(cmask); > > + FOR_EACH_CORE_ON_DUMP(core, cores) { > > + svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id)); > > What's the point of re-mapping these cores? > Just let DPDK to fail initialization and user will adjust cores. > This automatic re-mapping only complicates everything. Deal with existing user of this parameter. Example: https://bugzilla.redhat.com/show_bug.cgi?id=1753432 > > IMHO, user should not configure DPDK cores at all. So, it's users' problem > if cores configured incorrectly. If the user explicitly had set a dpdk option, ok, this is his problem. But here, this is an OVS option, handling this internally makes more sense to me. > > + index++; > > + } > > + svec_terminate(&lcores); > > + ovs_numa_dump_destroy(cores); > > + svec_add(args, "--lcores"); > > + svec_add_nocopy(args, svec_join(&lcores, ",", "")); > > + svec_destroy(&lcores); > > +} > > + > > static void > > construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) > > { > > @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) > > bool default_enabled; > > const char *default_value; > > } opts[] = { > > - {"dpdk-lcore-mask", "-c", false, NULL}, > > {"dpdk-hugepage-dir", "--huge-dir", false, NULL}, > > {"dpdk-socket-limit", "--socket-limit", false, NULL}, > > }; > > @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args) > > svec_parse_words(args, extra_configuration); > > } > > > > + construct_dpdk_lcore_option(ovs_other_config, args); > > construct_dpdk_options(ovs_other_config, args); > > construct_dpdk_mutex_options(ovs_other_config, args); > > } > > @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = { > > .write = dpdk_log_write, > > }; > > > > +static void > > +dpdk_dump_lcore(struct ds *ds, unsigned lcore) > > +{ > > + struct svec cores = SVEC_EMPTY_INITIALIZER; > > + rte_cpuset_t cpuset; > > + unsigned cpu; > > + > > + cpuset = rte_lcore_cpuset(lcore); > > + for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { > > + if (!CPU_ISSET(cpu, &cpuset)) { > > + continue; > > + } > > + svec_add_nocopy(&cores, xasprintf("%u", cpu)); > > + }> + svec_terminate(&cores); > > + ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore, > > + rte_eal_lcore_role(lcore) != ROLE_OFF ? 
"DPDK" : "OVS", > > + svec_join(&cores, ",", "")); > > + svec_destroy(&cores); > > +} > > + > > +static void > > +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[], > > + void *aux OVS_UNUSED) > > +{ > > + struct ds ds = DS_EMPTY_INITIALIZER; > > + unsigned lcore; > > + > > + ovs_mutex_lock(&lcore_bitmap_mutex); > > + if (lcore_bitmap == NULL) { > > + unixctl_command_reply_error(conn, "DPDK has not been initialised"); > > + goto out; > > + } > > + if (argc > 1) { > > + if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE || > > + !bitmap_is_set(lcore_bitmap, lcore)) { > > + unixctl_command_reply_error(conn, "incorrect lcoreid"); > > + goto out; > > + } > > + dpdk_dump_lcore(&ds, lcore); > > + } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { > > Not a good style to have 'else for'. Tried to be creative :-) (the additional indent is really unnecessary). But, if this is a blocker for this patch, ok. > > DPDK lcore iterator? We are iterating over lcores owned by DPDK and OVS. Example: other_config : {dpdk-init="true", dpdk-lcore-mask="0x1", pmd-cpu-mask="0x00008002"} # ovs-appctl dpdk/dump-lcores lcore0 (DPDK) is running on core 0 lcore1 (OVS) is running on core 1 lcore2 (OVS) is running on core 15 > > > + if (!bitmap_is_set(lcore_bitmap, lcore)) { > > + continue; > > + } > > + dpdk_dump_lcore(&ds, lcore); > > + } > > + unixctl_command_reply(conn, ds_cstr(&ds)); > > + ds_destroy(&ds); > > +out: > > + ovs_mutex_unlock(&lcore_bitmap_mutex); > > +} > > + > > static bool > > dpdk_init__(const struct smap *ovs_other_config) > > { > > @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config) > > } > > } > > > > + ovs_mutex_lock(&lcore_bitmap_mutex); > > + lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE); > > + /* Mark DPDK threads. */ > > + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { > > Shouldn't we use DPDK lcore iterator instead? The DPDK lcore iterator skips "service" cores. We would end up reusing those lcores if the user had set something exotic. Example: other_config : {dpdk-extra="-c 0x5 -s 0x4", dpdk-init="true", pmd-cpu-mask="0x00008002"} # ovs-appctl dpdk/dump-lcores lcore0 (DPDK) is running on core 0 lcore1 (OVS) is running on core 1 lcore2 (DPDK) is running on core 2 lcore3 (OVS) is running on core 15 > > > + struct ds ds = DS_EMPTY_INITIALIZER; > > + > > + if (rte_eal_lcore_role(lcore) == ROLE_OFF) { > > + continue; > > + } > > + bitmap_set1(lcore_bitmap, lcore); > > + dpdk_dump_lcore(&ds, lcore); > > + VLOG_INFO("%s", ds_cstr(&ds)); > > + ds_destroy(&ds); > > + } > > + unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1, > > + dpdk_dump_lcores, NULL); > > We might need this command if we're doing automatic re-mapping > of lcores from the EAL arguments, but if not, I don't think we need it. > Simple logging on PMD start should be enough. Flavio mentioned that the logs could have been rotated. > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > > index 7ebf4ef87..91e98919c 100644 > > --- a/lib/dpif-netdev.c > > +++ b/lib/dpif-netdev.c > > @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_) > > /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ > > ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); > > ovs_numa_thread_setaffinity_core(pmd->core_id); > > - dpdk_set_lcore_id(pmd->core_id); > > + dpdk_init_thread_context(pmd->core_id); > > Since this function will atually change cpu affinity that is not This is the same affinity than what OVS just applied. 
The call to pthread_setaffinity_np() in DPDK should be a noop.

> expected by OVS it must be called before setting affinity by
> ovs_numa_thread_setaffinity_core(). This way OVS will guarantee
> correct affinity for its threads.

The ovs_numa_thread_setaffinity_core() return value is not checked.
And I can't see what calling this function changes in the ovs-numa
internals. We could even remove it and only rely on DPDK.


--
David Marchand
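To make the service-core point concrete, here is a minimal sketch
(against a 19.11-era rte_lcore.h; the helper name and the 'in_use'
array are illustrative, not from the patch) contrasting the stock
DPDK iterator with the full-range scan the patch uses:

#include <stdbool.h>
#include <rte_lcore.h>

static void
mark_dpdk_lcores(bool in_use[RTE_MAX_LCORE])
{
    unsigned lcore;

    /* Iterator style: RTE_LCORE_FOREACH() only visits ROLE_RTE lcores,
     * so a service lcore set up with e.g. "-s 0x4" is never visited and
     * the allocator could later hand it out to a PMD thread. */
    RTE_LCORE_FOREACH(lcore) {
        in_use[lcore] = true;
    }

    /* Full-range style, as the patch does: everything that is not
     * ROLE_OFF (worker, master or service lcore) gets reserved. */
    for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
        if (rte_eal_lcore_role(lcore) != ROLE_OFF) {
            in_use[lcore] = true;
        }
    }
}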
On 17.12.2019 10:22, David Marchand wrote: > On Fri, Dec 13, 2019 at 6:34 PM Ilya Maximets <i.maximets@ovn.org> wrote: >>> @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value) >>> return false; >>> } >>> >>> +static void >>> +construct_dpdk_lcore_option(const struct smap *ovs_other_config, >>> + struct svec *args) >>> +{ >>> + const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask"); >>> + struct svec lcores = SVEC_EMPTY_INITIALIZER; >>> + struct ovs_numa_info_core *core; >>> + struct ovs_numa_dump *cores; >>> + int index = 0; >>> + >>> + if (!cmask) { >>> + return; >>> + } >>> + if (args_contains(args, "-c") || args_contains(args, "-l") || >>> + args_contains(args, "--lcores")) { >>> + VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' " >>> + "due to dpdk-extra config"); >>> + return; >>> + } >>> + >>> + cores = ovs_numa_dump_cores_with_cmask(cmask); >>> + FOR_EACH_CORE_ON_DUMP(core, cores) { >>> + svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id)); >> >> What's the point of re-mapping these cores? >> Just let DPDK to fail initialization and user will adjust cores. >> This automatic re-mapping only complicates everything. > > Deal with existing user of this parameter. > Example: https://bugzilla.redhat.com/show_bug.cgi?id=1753432 So, why not just rebuild DPDK with more than 128 cores in config since you're going to update OVS package for this issue? I bet kernel is built with NR_CPUS equal to something like 8192. <"support up to 128 threads" limitation> doesn't sound feasible as you can't actually enforce that. '--lcores' option made DPDK really confusing, IMHO. Looking at the DPDK API and wondering how the rte_lcore_to_cpu_id() should work and what it returns if lcore to cpu relation is one to many... I'm lost. BTW, looking at the cpu mask in the bug, what are these 8 cores (4 cores from each numa) from the lcore mask are supposed to do? > > >> >> IMHO, user should not configure DPDK cores at all. So, it's users' problem >> if cores configured incorrectly. > > If the user explicitly had set a dpdk option, ok, this is his problem. > But here, this is an OVS option, handling this internally makes more > sense to me. 
Sounds like another reason to deprecate and remove those configuration knobs: https://patchwork.ozlabs.org/project/openvswitch/list/?series=89529&state=* > > >>> + index++; >>> + } >>> + svec_terminate(&lcores); >>> + ovs_numa_dump_destroy(cores); >>> + svec_add(args, "--lcores"); >>> + svec_add_nocopy(args, svec_join(&lcores, ",", "")); >>> + svec_destroy(&lcores); >>> +} >>> + >>> static void >>> construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) >>> { >>> @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) >>> bool default_enabled; >>> const char *default_value; >>> } opts[] = { >>> - {"dpdk-lcore-mask", "-c", false, NULL}, >>> {"dpdk-hugepage-dir", "--huge-dir", false, NULL}, >>> {"dpdk-socket-limit", "--socket-limit", false, NULL}, >>> }; >>> @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args) >>> svec_parse_words(args, extra_configuration); >>> } >>> >>> + construct_dpdk_lcore_option(ovs_other_config, args); >>> construct_dpdk_options(ovs_other_config, args); >>> construct_dpdk_mutex_options(ovs_other_config, args); >>> } >>> @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = { >>> .write = dpdk_log_write, >>> }; >>> >>> +static void >>> +dpdk_dump_lcore(struct ds *ds, unsigned lcore) >>> +{ >>> + struct svec cores = SVEC_EMPTY_INITIALIZER; >>> + rte_cpuset_t cpuset; >>> + unsigned cpu; >>> + >>> + cpuset = rte_lcore_cpuset(lcore); >>> + for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { >>> + if (!CPU_ISSET(cpu, &cpuset)) { >>> + continue; >>> + } >>> + svec_add_nocopy(&cores, xasprintf("%u", cpu)); >>> + }> + svec_terminate(&cores); >>> + ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore, >>> + rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS", >>> + svec_join(&cores, ",", "")); >>> + svec_destroy(&cores); >>> +} >>> + >>> +static void >>> +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[], >>> + void *aux OVS_UNUSED) >>> +{ >>> + struct ds ds = DS_EMPTY_INITIALIZER; >>> + unsigned lcore; >>> + >>> + ovs_mutex_lock(&lcore_bitmap_mutex); >>> + if (lcore_bitmap == NULL) { >>> + unixctl_command_reply_error(conn, "DPDK has not been initialised"); >>> + goto out; >>> + } >>> + if (argc > 1) { >>> + if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE || >>> + !bitmap_is_set(lcore_bitmap, lcore)) { >>> + unixctl_command_reply_error(conn, "incorrect lcoreid"); >>> + goto out; >>> + } >>> + dpdk_dump_lcore(&ds, lcore); >>> + } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { >> >> Not a good style to have 'else for'. > > Tried to be creative :-) (the additional indent is really unnecessary). > But, if this is a blocker for this patch, ok. > > >> >> DPDK lcore iterator? > > We are iterating over lcores owned by DPDK and OVS. 
> > Example: > other_config : {dpdk-init="true", dpdk-lcore-mask="0x1", > pmd-cpu-mask="0x00008002"} > > # ovs-appctl dpdk/dump-lcores > lcore0 (DPDK) is running on core 0 > lcore1 (OVS) is running on core 1 > lcore2 (OVS) is running on core 15 > > >> >>> + if (!bitmap_is_set(lcore_bitmap, lcore)) { >>> + continue; >>> + } >>> + dpdk_dump_lcore(&ds, lcore); >>> + } >>> + unixctl_command_reply(conn, ds_cstr(&ds)); >>> + ds_destroy(&ds); >>> +out: >>> + ovs_mutex_unlock(&lcore_bitmap_mutex); >>> +} >>> + >>> static bool >>> dpdk_init__(const struct smap *ovs_other_config) >>> { > > >>> @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config) >>> } >>> } >>> >>> + ovs_mutex_lock(&lcore_bitmap_mutex); >>> + lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE); >>> + /* Mark DPDK threads. */ >>> + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { >> >> Shouldn't we use DPDK lcore iterator instead? > > The DPDK lcore iterator skips "service" cores. > We would end up reusing those lcores if the user had set something exotic. What if user will map lcores to float across different cpu sets? How to treat that? > > Example: > other_config : {dpdk-extra="-c 0x5 -s 0x4", dpdk-init="true", > pmd-cpu-mask="0x00008002"} > # ovs-appctl dpdk/dump-lcores > lcore0 (DPDK) is running on core 0 > lcore1 (OVS) is running on core 1 > lcore2 (DPDK) is running on core 2 > lcore3 (OVS) is running on core 15 > > >> >>> + struct ds ds = DS_EMPTY_INITIALIZER; >>> + >>> + if (rte_eal_lcore_role(lcore) == ROLE_OFF) { >>> + continue; >>> + } >>> + bitmap_set1(lcore_bitmap, lcore); >>> + dpdk_dump_lcore(&ds, lcore); >>> + VLOG_INFO("%s", ds_cstr(&ds)); >>> + ds_destroy(&ds); >>> + } >>> + unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1, >>> + dpdk_dump_lcores, NULL); >> >> We might need this command if we're doing automatic re-mapping >> of lcores from the EAL arguments, but if not, I don't think we need it. >> Simple logging on PMD start should be enough. > > Flavio mentioned that the logs could have been rotated. > > >>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c >>> index 7ebf4ef87..91e98919c 100644 >>> --- a/lib/dpif-netdev.c >>> +++ b/lib/dpif-netdev.c >>> @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_) >>> /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ >>> ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); >>> ovs_numa_thread_setaffinity_core(pmd->core_id); >>> - dpdk_set_lcore_id(pmd->core_id); >>> + dpdk_init_thread_context(pmd->core_id); >> >> Since this function will atually change cpu affinity that is not > > This is the same affinity than what OVS just applied. > The call to pthread_setaffinity_np in dpdk should be a noop. Should be, but we should not depend on DPDK internals. Order changing will guarantee the affinity from the OVS point of view. > > >> expected by OVS it must be called before setting affinity by >> ovs_numa_thread_setaffinity_core(). This way OVS will guarantee >> correct affinity for its threads. > > ovs_numa_thread_setaffinity_core() return value is not checked. Yes. That is the issue we need to fix in OVS thread management. Right now it's not possible to undone thread creation. > And I can't see what calling this function changes to the ovs-numa internals. > We could even remove it and only rely on dpdk. While it doesn't change the internal state of a ovs-numa it is the generic OVS way to set affinity. 
And we can't remove it, since this is not a 'DPDK datapath': it is a
generic userspace datapath, and the DPDK calls here are just a bunch
of workarounds that should not be there at all in an ideal world.

To be clear, I'm not against this patch in general, I just think that
it's too heavy for a temporary solution.

Best regards, Ilya Maximets.
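The affinity-ordering point above boils down to "last writer wins".
A small standalone illustration, where pin_self() is a hypothetical
helper, not OVS code:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

/* Pin the calling thread to a single cpu. */
static int
pin_self(int cpu)
{
    cpu_set_t set;

    CPU_ZERO(&set);
    CPU_SET(cpu, &set);
    return pthread_setaffinity_np(pthread_self(), sizeof set, &set);
}

int
main(void)
{
    /* Both ovs_numa_thread_setaffinity_core() and DPDK's
     * rte_thread_set_affinity() come down to a call like this one, and
     * whichever runs last determines where the thread actually runs.
     * Hence the request to call dpdk_init_thread_context() first and
     * let the OVS affinity call have the final word. */
    pin_self(0);
    return 0;
}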
On 10/12/2019 10:10, David Marchand wrote: > Most DPDK components make the assumption that rte_lcore_id() returns > a valid lcore_id in [0..RTE_MAX_LCORE[ range (with the exception of > the LCORE_ID_ANY special value). > > OVS does not currently check which value is set in > RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK > side. > > Introduce a lcore allocator in OVS for PMD threads and map them to > unused lcores from DPDK à la --lcores. > > The physical cores on which the PMD threads are running still > constitutes an important information when debugging, so still keep > those in the PMD thread names but add a new debug log when starting > them. > Mapping between OVS threads and DPDK lcores can be dumped with a new > dpdk/dump-lcores command. > > Synchronize DPDK internals on numa and cpuset for the PMD threads by > registering them via the rte_thread_set_affinity() helper. > > Signed-off-by: David Marchand <david.marchand@redhat.com> Hi David, it is a big improvement over just failing, thanks. Some comments below. > --- > Changelog since v1: > - rewired existing configuration 'dpdk-lcore-mask' to use --lcores, > - switched to a bitmap to track lcores, > - added a command to dump current mapping (Flavio): used an experimental > API to get DPDK lcores cpuset since it is the most reliable/portable > information, > - used the same code for the logs when starting DPDK/PMD threads, > - addressed Ilya comments, > > --- > lib/dpdk-stub.c | 8 ++- > lib/dpdk.c | 163 ++++++++++++++++++++++++++++++++++++++++++++-- > lib/dpdk.h | 5 +- > lib/dpif-netdev.c | 3 +- > 4 files changed, 170 insertions(+), 9 deletions(-) > > diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c > index c332c217c..90473bc8e 100644 > --- a/lib/dpdk-stub.c > +++ b/lib/dpdk-stub.c > @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config) > } > > void > -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED) > +dpdk_init_thread_context(unsigned cpu OVS_UNUSED) > +{ > + /* Nothing */ > +} > + > +void > +dpdk_uninit_thread_context(void) > { > /* Nothing */ > } > diff --git a/lib/dpdk.c b/lib/dpdk.c > index 37ea2973c..0173366a0 100644 > --- a/lib/dpdk.c > +++ b/lib/dpdk.c > @@ -30,6 +30,7 @@ > #include <rte_pdump.h> > #endif > > +#include "bitmap.h" > #include "dirs.h" > #include "fatal-signal.h" > #include "netdev-dpdk.h" > @@ -39,6 +40,7 @@ > #include "ovs-numa.h" > #include "smap.h" > #include "svec.h" > +#include "unixctl.h" > #include "util.h" > #include "vswitch-idl.h" > > @@ -54,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization > * of DPDK. 
*/ > static bool per_port_memory = false; /* Status of per port memory support */ > > +static struct ovs_mutex lcore_bitmap_mutex = OVS_MUTEX_INITIALIZER; > +static unsigned long *lcore_bitmap OVS_GUARDED_BY(lcore_bitmap_mutex); > + > static int > process_vhost_flags(char *flag, const char *default_val, int size, > const struct smap *ovs_other_config, > @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value) > return false; > } > > +static void > +construct_dpdk_lcore_option(const struct smap *ovs_other_config, > + struct svec *args) > +{ > + const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask"); > + struct svec lcores = SVEC_EMPTY_INITIALIZER; > + struct ovs_numa_info_core *core; > + struct ovs_numa_dump *cores; > + int index = 0; > + > + if (!cmask) { > + return; > + } > + if (args_contains(args, "-c") || args_contains(args, "-l") || > + args_contains(args, "--lcores")) { > + VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' " > + "due to dpdk-extra config"); > + return; > + } > + > + cores = ovs_numa_dump_cores_with_cmask(cmask); > + FOR_EACH_CORE_ON_DUMP(core, cores) { Ran a test with 0x55 as dpdk-lcore-mask and got |00017|dpdk|INFO|EAL ARGS: ovs-vswitchd --lcores 0@6,1@0,2@2,3@4 --socket-mem 4096,4096 --socket-limit 4096,4096. which means handler/revalidators will run on core 6, whereas previously were running on core 0. I think the existing behaviour of running on the lowest core should be kept if possible. 2019-12-18T20:23:11Z|00035|dpdk|INFO|lcore0 (DPDK) is running on core 6 2019-12-18T20:23:17Z|00036|dpdk|INFO|lcore1 (DPDK) is running on core 0 2019-12-18T20:23:19Z|00037|dpdk|INFO|lcore2 (DPDK) is running on core 2 2019-12-18T20:23:22Z|00038|dpdk|INFO|lcore3 (DPDK) is running on core 4 Core 6, it's labelled as a DPDK thread, but probably OVS handlers/revalidators will be running on this. Do we need DPDK vs OVS threads in logs? I think from user, they just run OVS. Also, are logs needed for cores 0,2,4, when they will do nothing - not sure, but it's free to log i suppose. 
> + svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id)); > + index++; > + } > + svec_terminate(&lcores); > + ovs_numa_dump_destroy(cores); > + svec_add(args, "--lcores"); > + svec_add_nocopy(args, svec_join(&lcores, ",", "")); > + svec_destroy(&lcores); > +} > + > static void > construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) > { > @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) > bool default_enabled; > const char *default_value; > } opts[] = { > - {"dpdk-lcore-mask", "-c", false, NULL}, > {"dpdk-hugepage-dir", "--huge-dir", false, NULL}, > {"dpdk-socket-limit", "--socket-limit", false, NULL}, > }; > @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args) > svec_parse_words(args, extra_configuration); > } > > + construct_dpdk_lcore_option(ovs_other_config, args); > construct_dpdk_options(ovs_other_config, args); > construct_dpdk_mutex_options(ovs_other_config, args); > } > @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = { > .write = dpdk_log_write, > }; > > +static void > +dpdk_dump_lcore(struct ds *ds, unsigned lcore) > +{ > + struct svec cores = SVEC_EMPTY_INITIALIZER; > + rte_cpuset_t cpuset; > + unsigned cpu; > + > + cpuset = rte_lcore_cpuset(lcore); > + for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { > + if (!CPU_ISSET(cpu, &cpuset)) { > + continue; > + } > + svec_add_nocopy(&cores, xasprintf("%u", cpu)); > + } > + svec_terminate(&cores); > + ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore, > + rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS", > + svec_join(&cores, ",", "")); Ran a test with core mask of 0x5 and adding one pmd on core 8, |00035|dpdk|INFO|lcore0 (DPDK) is running on core 0 |00036|dpdk|INFO|lcore1 (DPDK) is running on core 2 |00084|dpif_netdev|INFO|PMD thread on numa_id: 0, core id: 8 created. |00085|dpif_netdev|INFO|There are 1 pmd threads on numa node 0 |00001|dpdk(pmd-c08/id:31)|INFO|lcore2 (OVS) is running on core 8 I think users are used to "pmd cores" and "lcores" through the masks. A pmd logging it's "lcore" may be confusing. At least maybe there's a way so that it doesn't appear related to "lcores" in dpdk-lcore-mask. In another test, I purposely ran out of cores: |00244|dpif_netdev|INFO|PMD thread on numa_id: 0, core id: 16 created. |00001|dpdk(pmd-c16/id:36)|WARN|Reached maximum number of DPDK lcores, core 16 will have lower performance ^^^ Good to get a warning about reduced performance, but it's saying core 16 where really it's running on the non-pmd core <snip> |00247|dpif_netdev|INFO|There are 6 pmd threads on numa node 0 |00248|dpif_netdev|INFO|There are 2 pmd threads on numa node 1 ^^^ it's not true anymore - these are mostly on the non-pmd core which in my case is on numa 0 Hmm, it's unlikely that a user will use >128 total num of cores, so rather than try to survive it by using non-pmd thread and have to think about all the corner cases after that, maybe for now that could still be a fail. Otherwise, you'd have to change the logs, but also does it make sense to equally distribute queues if some of the pmds are on non-pmd lcore, what about numa etc. 
> + svec_destroy(&cores); > +} > + > +static void > +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[], > + void *aux OVS_UNUSED) > +{ > + struct ds ds = DS_EMPTY_INITIALIZER; > + unsigned lcore; > + > + ovs_mutex_lock(&lcore_bitmap_mutex); > + if (lcore_bitmap == NULL) { > + unixctl_command_reply_error(conn, "DPDK has not been initialised"); > + goto out; > + } > + if (argc > 1) { > + if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE || > + !bitmap_is_set(lcore_bitmap, lcore)) { > + unixctl_command_reply_error(conn, "incorrect lcoreid"); > + goto out; > + } > + dpdk_dump_lcore(&ds, lcore); > + } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { > + if (!bitmap_is_set(lcore_bitmap, lcore)) { > + continue; > + } > + dpdk_dump_lcore(&ds, lcore); > + } > + unixctl_command_reply(conn, ds_cstr(&ds)); > + ds_destroy(&ds); > +out: > + ovs_mutex_unlock(&lcore_bitmap_mutex); > +} > + > static bool > dpdk_init__(const struct smap *ovs_other_config) > { > @@ -345,7 +434,8 @@ dpdk_init__(const struct smap *ovs_other_config) > } > } > > - if (args_contains(&args, "-c") || args_contains(&args, "-l")) { > + if (args_contains(&args, "-c") || args_contains(&args, "-l") || > + args_contains(&args, "--lcores")) { > auto_determine = false; > } > > @@ -371,8 +461,8 @@ dpdk_init__(const struct smap *ovs_other_config) > * thread affintity - default to core #0 */ > VLOG_ERR("Thread getaffinity failed. Using core #0"); > } > - svec_add(&args, "-l"); > - svec_add_nocopy(&args, xasprintf("%d", cpu)); > + svec_add(&args, "--lcores"); > + svec_add_nocopy(&args, xasprintf("0@%d", cpu)); > } > > svec_terminate(&args); > @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config) > } > } > > + ovs_mutex_lock(&lcore_bitmap_mutex); > + lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE); > + /* Mark DPDK threads. */ > + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { > + struct ds ds = DS_EMPTY_INITIALIZER; > + > + if (rte_eal_lcore_role(lcore) == ROLE_OFF) { > + continue; > + } > + bitmap_set1(lcore_bitmap, lcore); > + dpdk_dump_lcore(&ds, lcore); > + VLOG_INFO("%s", ds_cstr(&ds)); > + ds_destroy(&ds); > + } > + unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1, > + dpdk_dump_lcores, NULL); > + ovs_mutex_unlock(&lcore_bitmap_mutex); > + > /* We are called from the main thread here */ > RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID; > > @@ -512,11 +620,54 @@ dpdk_available(void) > } > > void > -dpdk_set_lcore_id(unsigned cpu) > +dpdk_init_thread_context(unsigned cpu) > { > + struct ds ds = DS_EMPTY_INITIALIZER; > + rte_cpuset_t cpuset; > + unsigned lcore; > + > /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */ > ovs_assert(cpu != NON_PMD_CORE_ID); > - RTE_PER_LCORE(_lcore_id) = cpu; > + > + ovs_mutex_lock(&lcore_bitmap_mutex); This means a lock between PMDs - any concern about when queues are being redistributed, pmds stopped/started and traffic flowing? Not thought through at all, but would it be possible to only use the dpdk lcores if values of >RTE_MAX_LCORE are actually used? Something like the tx queue re-mapping comes to mind, where locking is only done if required. You should know at eal_init time about the dpdk-lcore-mask and during reconfigure about the pmd cores. thanks, Kevin. 
> + if (lcore_bitmap == NULL) { > + lcore = NON_PMD_CORE_ID; > + } else { > + lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE); > + if (lcore == RTE_MAX_LCORE) { > + VLOG_WARN("Reached maximum number of DPDK lcores, core %u will " > + "have lower performance", cpu); > + lcore = NON_PMD_CORE_ID; > + } else { > + bitmap_set1(lcore_bitmap, lcore); > + } > + } > + ovs_mutex_unlock(&lcore_bitmap_mutex); > + > + RTE_PER_LCORE(_lcore_id) = lcore; > + > + if (lcore == NON_PMD_CORE_ID) { > + return; > + } > + > + CPU_ZERO(&cpuset); > + CPU_SET(cpu, &cpuset); > + rte_thread_set_affinity(&cpuset); > + dpdk_dump_lcore(&ds, lcore); > + VLOG_INFO("%s", ds_cstr(&ds)); > + ds_destroy(&ds); > +} > + > +void > +dpdk_uninit_thread_context(void) > +{ > + if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) { > + return; > + } > + > + ovs_mutex_lock(&lcore_bitmap_mutex); > + bitmap_set0(lcore_bitmap, RTE_PER_LCORE(_lcore_id)); > + ovs_mutex_unlock(&lcore_bitmap_mutex); > } > > void > diff --git a/lib/dpdk.h b/lib/dpdk.h > index 736a64279..eab7a926b 100644 > --- a/lib/dpdk.h > +++ b/lib/dpdk.h > @@ -22,7 +22,9 @@ > #ifdef DPDK_NETDEV > > #include <rte_config.h> > +#define ALLOW_EXPERIMENTAL_API > #include <rte_lcore.h> > +#undef ALLOW_EXPERIMENTAL_API > > #define NON_PMD_CORE_ID LCORE_ID_ANY > > @@ -36,7 +38,8 @@ struct smap; > struct ovsrec_open_vswitch; > > void dpdk_init(const struct smap *ovs_other_config); > -void dpdk_set_lcore_id(unsigned cpu); > +void dpdk_init_thread_context(unsigned cpu); > +void dpdk_uninit_thread_context(void); > const char *dpdk_get_vhost_sock_dir(void); > bool dpdk_vhost_iommu_enabled(void); > bool dpdk_vhost_postcopy_enabled(void); > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 7ebf4ef87..91e98919c 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_) > /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ > ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); > ovs_numa_thread_setaffinity_core(pmd->core_id); > - dpdk_set_lcore_id(pmd->core_id); > + dpdk_init_thread_context(pmd->core_id); > poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); > dfc_cache_init(&pmd->flow_cache); > pmd_alloc_static_tx_qid(pmd); > @@ -5590,6 +5590,7 @@ reload: > dfc_cache_uninit(&pmd->flow_cache); > free(poll_list); > pmd_free_cached_ports(pmd); > + dpdk_uninit_thread_context(); > return NULL; > } > >
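A minimal sketch of the ordering fix Kevin asks for: sort the cores
from the mask before numbering them, so lcore0 always lands on the
lowest core. Plain arrays and qsort() stand in for the OVS numa dump
and svec helpers, and the dump order {6, 0, 2, 4} mirrors the 0x55
example above:

#include <stdio.h>
#include <stdlib.h>

static int
cmp_int(const void *a, const void *b)
{
    return *(const int *) a - *(const int *) b;
}

int
main(void)
{
    int cores[] = { 6, 0, 2, 4 };   /* mask 0x55, unsorted dump order */
    char arg[128];
    int off = 0;

    qsort(cores, 4, sizeof cores[0], cmp_int);
    for (int i = 0; i < 4; i++) {
        off += snprintf(arg + off, sizeof arg - off, "%s%d@%d",
                        i ? "," : "", i, cores[i]);
    }
    printf("--lcores %s\n", arg);   /* --lcores 0@0,1@2,2@4,3@6 */
    return 0;
}

With this, the handler/revalidator threads stay on the lowest core of
the mask (core 0 in the example) as before.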
David Marchand <david.marchand@redhat.com> writes: > Most DPDK components make the assumption that rte_lcore_id() returns > a valid lcore_id in [0..RTE_MAX_LCORE[ range (with the exception of > the LCORE_ID_ANY special value). > OVS does not currently check which value is set in > RTE_PER_LCORE(_lcore_id) which exposes us to potential crashes on DPDK > side. > Introduce a lcore allocator in OVS for PMD threads and map them to > unused lcores from DPDK à la --lcores. > The physical cores on which the PMD threads are running still > constitutes an important information when debugging, so still keep > those in the PMD thread names but add a new debug log when starting > them. > Mapping between OVS threads and DPDK lcores can be dumped with a new > dpdk/dump-lcores command. > Synchronize DPDK internals on numa and cpuset for the PMD threads by > registering them via the rte_thread_set_affinity() helper. > Signed-off-by: David Marchand <david.marchand@redhat.com> > --- Thanks for this work. The current failure condition that exists today gets confusing for users. I do prefer having part of OvS internally handling this - it will only be used with dpdk ports when the EAL gets initialized, so I dislike documenting some kinds of options. This is where I like having the abstraction layer for option passing because we can fix things up for the user. > Changelog since v1: > - rewired existing configuration 'dpdk-lcore-mask' to use --lcores, > - switched to a bitmap to track lcores, > - added a command to dump current mapping (Flavio): used an experimental > API to get DPDK lcores cpuset since it is the most reliable/portable > information, > - used the same code for the logs when starting DPDK/PMD threads, > - addressed Ilya comments, > --- > lib/dpdk-stub.c | 8 ++- > lib/dpdk.c | 163 ++++++++++++++++++++++++++++++++++++++++++++-- > lib/dpdk.h | 5 +- > lib/dpif-netdev.c | 3 +- > 4 files changed, 170 insertions(+), 9 deletions(-) > diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c > index c332c217c..90473bc8e 100644 > --- a/lib/dpdk-stub.c > +++ b/lib/dpdk-stub.c > @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config) > } > void > -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED) > +dpdk_init_thread_context(unsigned cpu OVS_UNUSED) > +{ > + /* Nothing */ > +} > + > +void > +dpdk_uninit_thread_context(void) > { > /* Nothing */ > } > diff --git a/lib/dpdk.c b/lib/dpdk.c > index 37ea2973c..0173366a0 100644 > --- a/lib/dpdk.c > +++ b/lib/dpdk.c > @@ -30,6 +30,7 @@ > #include <rte_pdump.h> > #endif > +#include "bitmap.h" > #include "dirs.h" > #include "fatal-signal.h" > #include "netdev-dpdk.h" > @@ -39,6 +40,7 @@ > #include "ovs-numa.h" > #include "smap.h" > #include "svec.h" > +#include "unixctl.h" > #include "util.h" > #include "vswitch-idl.h" > @@ -54,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization > * of DPDK. 
*/ > static bool per_port_memory = false; /* Status of per port memory support */ > +static struct ovs_mutex lcore_bitmap_mutex = OVS_MUTEX_INITIALIZER; > +static unsigned long *lcore_bitmap OVS_GUARDED_BY(lcore_bitmap_mutex); > + > static int > process_vhost_flags(char *flag, const char *default_val, int size, > const struct smap *ovs_other_config, > @@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value) > return false; > } > +static void > +construct_dpdk_lcore_option(const struct smap *ovs_other_config, > + struct svec *args) > +{ > + const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask"); > + struct svec lcores = SVEC_EMPTY_INITIALIZER; > + struct ovs_numa_info_core *core; > + struct ovs_numa_dump *cores; > + int index = 0; > + > + if (!cmask) { > + return; > + } > + if (args_contains(args, "-c") || args_contains(args, "-l") || > + args_contains(args, "--lcores")) { > + VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' " > + "due to dpdk-extra config"); > + return; > + } > + > + cores = ovs_numa_dump_cores_with_cmask(cmask); > + FOR_EACH_CORE_ON_DUMP(core, cores) { > + svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id)); > + index++; > + } > + svec_terminate(&lcores); > + ovs_numa_dump_destroy(cores); > + svec_add(args, "--lcores"); > + svec_add_nocopy(args, svec_join(&lcores, ",", "")); > + svec_destroy(&lcores); > +} > + > static void > construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) > { > @@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) > bool default_enabled; > const char *default_value; > } opts[] = { > - {"dpdk-lcore-mask", "-c", false, NULL}, > {"dpdk-hugepage-dir", "--huge-dir", false, NULL}, > {"dpdk-socket-limit", "--socket-limit", false, NULL}, > }; > @@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args) > svec_parse_words(args, extra_configuration); > } > + construct_dpdk_lcore_option(ovs_other_config, args); > construct_dpdk_options(ovs_other_config, args); > construct_dpdk_mutex_options(ovs_other_config, args); > } > @@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = { > .write = dpdk_log_write, > }; > +static void > +dpdk_dump_lcore(struct ds *ds, unsigned lcore) > +{ > + struct svec cores = SVEC_EMPTY_INITIALIZER; > + rte_cpuset_t cpuset; > + unsigned cpu; > + > + cpuset = rte_lcore_cpuset(lcore); I guess we don't have any way of calling to pthread_getaffinity_np() on linux platform so we need to rely on this call. > + for (cpu = 0; cpu < CPU_SETSIZE; cpu++) { > + if (!CPU_ISSET(cpu, &cpuset)) { > + continue; > + } > + svec_add_nocopy(&cores, xasprintf("%u", cpu)); > + } > + svec_terminate(&cores); > + ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore, > + rte_eal_lcore_role(lcore) != ROLE_OFF ? 
"DPDK" : "OVS", > + svec_join(&cores, ",", "")); > + svec_destroy(&cores); > +} > + > +static void > +dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[], > + void *aux OVS_UNUSED) > +{ > + struct ds ds = DS_EMPTY_INITIALIZER; > + unsigned lcore; > + > + ovs_mutex_lock(&lcore_bitmap_mutex); > + if (lcore_bitmap == NULL) { > + unixctl_command_reply_error(conn, "DPDK has not been initialised"); > + goto out; > + } > + if (argc > 1) { > + if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE || > + !bitmap_is_set(lcore_bitmap, lcore)) { > + unixctl_command_reply_error(conn, "incorrect lcoreid"); > + goto out; > + } > + dpdk_dump_lcore(&ds, lcore); > + } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { Neat. I've never seen code constructed like this. > + if (!bitmap_is_set(lcore_bitmap, lcore)) { > + continue; > + } > + dpdk_dump_lcore(&ds, lcore); > + } > + unixctl_command_reply(conn, ds_cstr(&ds)); > + ds_destroy(&ds); > +out: > + ovs_mutex_unlock(&lcore_bitmap_mutex); > +} > + > static bool > dpdk_init__(const struct smap *ovs_other_config) > { > @@ -345,7 +434,8 @@ dpdk_init__(const struct smap *ovs_other_config) > } > } > - if (args_contains(&args, "-c") || args_contains(&args, "-l")) { > + if (args_contains(&args, "-c") || args_contains(&args, "-l") || > + args_contains(&args, "--lcores")) { > auto_determine = false; > } > @@ -371,8 +461,8 @@ dpdk_init__(const struct smap *ovs_other_config) > * thread affintity - default to core #0 */ > VLOG_ERR("Thread getaffinity failed. Using core #0"); > } > - svec_add(&args, "-l"); > - svec_add_nocopy(&args, xasprintf("%d", cpu)); > + svec_add(&args, "--lcores"); > + svec_add_nocopy(&args, xasprintf("0@%d", cpu)); > } > svec_terminate(&args); > @@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config) > } > } > + ovs_mutex_lock(&lcore_bitmap_mutex); > + lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE); > + /* Mark DPDK threads. */ > + for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) { > + struct ds ds = DS_EMPTY_INITIALIZER; > + > + if (rte_eal_lcore_role(lcore) == ROLE_OFF) { > + continue; > + } > + bitmap_set1(lcore_bitmap, lcore); > + dpdk_dump_lcore(&ds, lcore); > + VLOG_INFO("%s", ds_cstr(&ds)); > + ds_destroy(&ds); > + } > + unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1, > + dpdk_dump_lcores, NULL); > + ovs_mutex_unlock(&lcore_bitmap_mutex); > + > /* We are called from the main thread here */ > RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID; > @@ -512,11 +620,54 @@ dpdk_available(void) > } > void > -dpdk_set_lcore_id(unsigned cpu) > +dpdk_init_thread_context(unsigned cpu) > { > + struct ds ds = DS_EMPTY_INITIALIZER; > + rte_cpuset_t cpuset; > + unsigned lcore; > + > /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */ > ovs_assert(cpu != NON_PMD_CORE_ID); > - RTE_PER_LCORE(_lcore_id) = cpu; > + > + ovs_mutex_lock(&lcore_bitmap_mutex); > + if (lcore_bitmap == NULL) { > + lcore = NON_PMD_CORE_ID; > + } else { > + lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE); > + if (lcore == RTE_MAX_LCORE) { > + VLOG_WARN("Reached maximum number of DPDK lcores, core %u will " > + "have lower performance", cpu); I wouldn't write anything predictive about the condition here. "core %u will not be mapped in accordance with the mask" is most accurate. 
> + lcore = NON_PMD_CORE_ID; > + } else { > + bitmap_set1(lcore_bitmap, lcore); > + } > + } > + ovs_mutex_unlock(&lcore_bitmap_mutex); > + > + RTE_PER_LCORE(_lcore_id) = lcore; > + > + if (lcore == NON_PMD_CORE_ID) { > + return; > + } > + > + CPU_ZERO(&cpuset); > + CPU_SET(cpu, &cpuset); > + rte_thread_set_affinity(&cpuset); > + dpdk_dump_lcore(&ds, lcore); > + VLOG_INFO("%s", ds_cstr(&ds)); > + ds_destroy(&ds); > +} > + > +void > +dpdk_uninit_thread_context(void) > +{ > + if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) { > + return; > + } > + > + ovs_mutex_lock(&lcore_bitmap_mutex); > + bitmap_set0(lcore_bitmap, RTE_PER_LCORE(_lcore_id)); > + ovs_mutex_unlock(&lcore_bitmap_mutex); > } > void > diff --git a/lib/dpdk.h b/lib/dpdk.h > index 736a64279..eab7a926b 100644 > --- a/lib/dpdk.h > +++ b/lib/dpdk.h > @@ -22,7 +22,9 @@ > #ifdef DPDK_NETDEV > #include <rte_config.h> > +#define ALLOW_EXPERIMENTAL_API > #include <rte_lcore.h> > +#undef ALLOW_EXPERIMENTAL_API This is kindof ugly. Isn't it supposed to be done by the library with the rte_config.h? What happens if the library is compiled without experimental support? I didn't check this, but it doesn't /look/ nice. Maybe it's better to do a feature test macro and not use these APIs if the compiled version of DPDK doesn't support experimental APIs. WDYT? > #define NON_PMD_CORE_ID LCORE_ID_ANY > @@ -36,7 +38,8 @@ struct smap; > struct ovsrec_open_vswitch; > void dpdk_init(const struct smap *ovs_other_config); > -void dpdk_set_lcore_id(unsigned cpu); > +void dpdk_init_thread_context(unsigned cpu); > +void dpdk_uninit_thread_context(void); > const char *dpdk_get_vhost_sock_dir(void); > bool dpdk_vhost_iommu_enabled(void); > bool dpdk_vhost_postcopy_enabled(void); > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 7ebf4ef87..91e98919c 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_) > /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */ > ovsthread_setspecific(pmd->dp->per_pmd_key, pmd); > ovs_numa_thread_setaffinity_core(pmd->core_id); > - dpdk_set_lcore_id(pmd->core_id); > + dpdk_init_thread_context(pmd->core_id); > poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list); > dfc_cache_init(&pmd->flow_cache); > pmd_alloc_static_tx_qid(pmd); > @@ -5590,6 +5590,7 @@ reload: > dfc_cache_uninit(&pmd->flow_cache); > free(poll_list); > pmd_free_cached_ports(pmd); > + dpdk_uninit_thread_context(); > return NULL; > }
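One hedged way to do the feature test Aaron suggests is to key the
experimental include off the DPDK version; the 19.08 cutoff for
rte_lcore_cpuset() below is an assumption, not something verified
here:

#include <rte_config.h>
#include <rte_version.h>

#if RTE_VERSION >= RTE_VERSION_NUM(19, 8, 0, 0)
/* rte_lcore_cpuset() assumed present (still experimental). */
#define ALLOW_EXPERIMENTAL_API
#include <rte_lcore.h>
#undef ALLOW_EXPERIMENTAL_API
#define HAVE_RTE_LCORE_CPUSET 1
#else
#include <rte_lcore.h>
#define HAVE_RTE_LCORE_CPUSET 0
#endif

Callers could then guard the dump code with HAVE_RTE_LCORE_CPUSET and
fall back to, e.g., pthread_getaffinity_np() when it is 0.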
diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
index c332c217c..90473bc8e 100644
--- a/lib/dpdk-stub.c
+++ b/lib/dpdk-stub.c
@@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
+dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
+{
+    /* Nothing */
+}
+
+void
+dpdk_uninit_thread_context(void)
 {
     /* Nothing */
 }
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 37ea2973c..0173366a0 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -30,6 +30,7 @@
 #include <rte_pdump.h>
 #endif
 
+#include "bitmap.h"
 #include "dirs.h"
 #include "fatal-signal.h"
 #include "netdev-dpdk.h"
@@ -39,6 +40,7 @@
 #include "ovs-numa.h"
 #include "smap.h"
 #include "svec.h"
+#include "unixctl.h"
 #include "util.h"
 #include "vswitch-idl.h"
 
@@ -54,6 +56,9 @@ static bool dpdk_initialized = false; /* Indicates successful initialization
                                        * of DPDK. */
 static bool per_port_memory = false; /* Status of per port memory support */
 
+static struct ovs_mutex lcore_bitmap_mutex = OVS_MUTEX_INITIALIZER;
+static unsigned long *lcore_bitmap OVS_GUARDED_BY(lcore_bitmap_mutex);
+
 static int
 process_vhost_flags(char *flag, const char *default_val, int size,
                     const struct smap *ovs_other_config,
@@ -94,6 +99,38 @@ args_contains(const struct svec *args, const char *value)
     return false;
 }
 
+static void
+construct_dpdk_lcore_option(const struct smap *ovs_other_config,
+                            struct svec *args)
+{
+    const char *cmask = smap_get(ovs_other_config, "dpdk-lcore-mask");
+    struct svec lcores = SVEC_EMPTY_INITIALIZER;
+    struct ovs_numa_info_core *core;
+    struct ovs_numa_dump *cores;
+    int index = 0;
+
+    if (!cmask) {
+        return;
+    }
+    if (args_contains(args, "-c") || args_contains(args, "-l") ||
+        args_contains(args, "--lcores")) {
+        VLOG_WARN("Ignoring database defined option 'dpdk-lcore-mask' "
+                  "due to dpdk-extra config");
+        return;
+    }
+
+    cores = ovs_numa_dump_cores_with_cmask(cmask);
+    FOR_EACH_CORE_ON_DUMP(core, cores) {
+        svec_add_nocopy(&lcores, xasprintf("%d@%d", index, core->core_id));
+        index++;
+    }
+    svec_terminate(&lcores);
+    ovs_numa_dump_destroy(cores);
+    svec_add(args, "--lcores");
+    svec_add_nocopy(args, svec_join(&lcores, ",", ""));
+    svec_destroy(&lcores);
+}
+
 static void
 construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
 {
@@ -103,7 +140,6 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
         bool default_enabled;
         const char *default_value;
     } opts[] = {
-        {"dpdk-lcore-mask",   "-c",             false, NULL},
         {"dpdk-hugepage-dir", "--huge-dir",     false, NULL},
         {"dpdk-socket-limit", "--socket-limit", false, NULL},
     };
@@ -224,6 +260,7 @@ construct_dpdk_args(const struct smap *ovs_other_config, struct svec *args)
         svec_parse_words(args, extra_configuration);
     }
 
+    construct_dpdk_lcore_option(ovs_other_config, args);
     construct_dpdk_options(ovs_other_config, args);
     construct_dpdk_mutex_options(ovs_other_config, args);
 }
@@ -264,6 +301,58 @@ static cookie_io_functions_t dpdk_log_func = {
     .write = dpdk_log_write,
 };
 
+static void
+dpdk_dump_lcore(struct ds *ds, unsigned lcore)
+{
+    struct svec cores = SVEC_EMPTY_INITIALIZER;
+    rte_cpuset_t cpuset;
+    unsigned cpu;
+
+    cpuset = rte_lcore_cpuset(lcore);
+    for (cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+        if (!CPU_ISSET(cpu, &cpuset)) {
+            continue;
+        }
+        svec_add_nocopy(&cores, xasprintf("%u", cpu));
+    }
+    svec_terminate(&cores);
+    ds_put_format(ds, "lcore%u (%s) is running on core %s\n", lcore,
+                  rte_eal_lcore_role(lcore) != ROLE_OFF ? "DPDK" : "OVS",
+                  svec_join(&cores, ",", ""));
+    svec_destroy(&cores);
+}
+
+static void
+dpdk_dump_lcores(struct unixctl_conn *conn, int argc, const char *argv[],
+                 void *aux OVS_UNUSED)
+{
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    unsigned lcore;
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    if (lcore_bitmap == NULL) {
+        unixctl_command_reply_error(conn, "DPDK has not been initialised");
+        goto out;
+    }
+    if (argc > 1) {
+        if (!str_to_uint(argv[1], 0, &lcore) || lcore >= RTE_MAX_LCORE ||
+            !bitmap_is_set(lcore_bitmap, lcore)) {
+            unixctl_command_reply_error(conn, "incorrect lcoreid");
+            goto out;
+        }
+        dpdk_dump_lcore(&ds, lcore);
+    } else for (lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+        if (!bitmap_is_set(lcore_bitmap, lcore)) {
+            continue;
+        }
+        dpdk_dump_lcore(&ds, lcore);
+    }
+    unixctl_command_reply(conn, ds_cstr(&ds));
+    ds_destroy(&ds);
+out:
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+}
+
 static bool
 dpdk_init__(const struct smap *ovs_other_config)
 {
@@ -345,7 +434,8 @@ dpdk_init__(const struct smap *ovs_other_config)
         }
     }
 
-    if (args_contains(&args, "-c") || args_contains(&args, "-l")) {
+    if (args_contains(&args, "-c") || args_contains(&args, "-l") ||
+        args_contains(&args, "--lcores")) {
         auto_determine = false;
     }
 
@@ -371,8 +461,8 @@ dpdk_init__(const struct smap *ovs_other_config)
              * thread affintity - default to core #0 */
             VLOG_ERR("Thread getaffinity failed. Using core #0");
         }
-        svec_add(&args, "-l");
-        svec_add_nocopy(&args, xasprintf("%d", cpu));
+        svec_add(&args, "--lcores");
+        svec_add_nocopy(&args, xasprintf("0@%d", cpu));
     }
 
     svec_terminate(&args);
@@ -428,6 +518,24 @@ dpdk_init__(const struct smap *ovs_other_config)
         }
     }
 
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    lcore_bitmap = bitmap_allocate(RTE_MAX_LCORE);
+    /* Mark DPDK threads. */
+    for (uint32_t lcore = 0; lcore < RTE_MAX_LCORE; lcore++) {
+        struct ds ds = DS_EMPTY_INITIALIZER;
+
+        if (rte_eal_lcore_role(lcore) == ROLE_OFF) {
+            continue;
+        }
+        bitmap_set1(lcore_bitmap, lcore);
+        dpdk_dump_lcore(&ds, lcore);
+        VLOG_INFO("%s", ds_cstr(&ds));
+        ds_destroy(&ds);
+    }
+    unixctl_command_register("dpdk/dump-lcores", "[lcore]", 0, 1,
+                             dpdk_dump_lcores, NULL);
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+
     /* We are called from the main thread here */
     RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
 
@@ -512,11 +620,54 @@ dpdk_available(void)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu)
+dpdk_init_thread_context(unsigned cpu)
 {
+    struct ds ds = DS_EMPTY_INITIALIZER;
+    rte_cpuset_t cpuset;
+    unsigned lcore;
+
     /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
     ovs_assert(cpu != NON_PMD_CORE_ID);
-    RTE_PER_LCORE(_lcore_id) = cpu;
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    if (lcore_bitmap == NULL) {
+        lcore = NON_PMD_CORE_ID;
+    } else {
+        lcore = bitmap_scan(lcore_bitmap, 0, 0, RTE_MAX_LCORE);
+        if (lcore == RTE_MAX_LCORE) {
+            VLOG_WARN("Reached maximum number of DPDK lcores, core %u will "
+                      "have lower performance", cpu);
+            lcore = NON_PMD_CORE_ID;
+        } else {
+            bitmap_set1(lcore_bitmap, lcore);
+        }
+    }
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
+
+    RTE_PER_LCORE(_lcore_id) = lcore;
+
+    if (lcore == NON_PMD_CORE_ID) {
+        return;
+    }
+
+    CPU_ZERO(&cpuset);
+    CPU_SET(cpu, &cpuset);
+    rte_thread_set_affinity(&cpuset);
+    dpdk_dump_lcore(&ds, lcore);
+    VLOG_INFO("%s", ds_cstr(&ds));
+    ds_destroy(&ds);
+}
+
+void
+dpdk_uninit_thread_context(void)
+{
+    if (RTE_PER_LCORE(_lcore_id) == NON_PMD_CORE_ID) {
+        return;
+    }
+
+    ovs_mutex_lock(&lcore_bitmap_mutex);
+    bitmap_set0(lcore_bitmap, RTE_PER_LCORE(_lcore_id));
+    ovs_mutex_unlock(&lcore_bitmap_mutex);
 }
 
 void
diff --git a/lib/dpdk.h b/lib/dpdk.h
index 736a64279..eab7a926b 100644
--- a/lib/dpdk.h
+++ b/lib/dpdk.h
@@ -22,7 +22,9 @@
 #ifdef DPDK_NETDEV
 
 #include <rte_config.h>
+#define ALLOW_EXPERIMENTAL_API
 #include <rte_lcore.h>
+#undef ALLOW_EXPERIMENTAL_API
 
 #define NON_PMD_CORE_ID LCORE_ID_ANY
 
@@ -36,7 +38,8 @@ struct smap;
 struct ovsrec_open_vswitch;
 
 void dpdk_init(const struct smap *ovs_other_config);
-void dpdk_set_lcore_id(unsigned cpu);
+void dpdk_init_thread_context(unsigned cpu);
+void dpdk_uninit_thread_context(void);
 const char *dpdk_get_vhost_sock_dir(void);
 bool dpdk_vhost_iommu_enabled(void);
 bool dpdk_vhost_postcopy_enabled(void);
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 7ebf4ef87..91e98919c 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5470,7 +5470,7 @@ pmd_thread_main(void *f_)
     /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
     ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
     ovs_numa_thread_setaffinity_core(pmd->core_id);
-    dpdk_set_lcore_id(pmd->core_id);
+    dpdk_init_thread_context(pmd->core_id);
     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
     dfc_cache_init(&pmd->flow_cache);
     pmd_alloc_static_tx_qid(pmd);
@@ -5590,6 +5590,7 @@ reload:
     dfc_cache_uninit(&pmd->flow_cache);
     free(poll_list);
     pmd_free_cached_ports(pmd);
+    dpdk_uninit_thread_context();
     return NULL;
 }
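As a sanity check of what the patch guarantees, a registered PMD
thread could assert the invariant from the commit message; the helper
below is hypothetical, not part of the patch:

#include <assert.h>
#include <rte_lcore.h>

/* After dpdk_init_thread_context(), rte_lcore_id() is either a valid
 * index (safe for DPDK's per-lcore arrays) or LCORE_ID_ANY when the
 * allocator ran out -- never an out-of-range physical core number. */
static void
check_thread_lcore(void)
{
    unsigned lcore = rte_lcore_id();

    assert(lcore < RTE_MAX_LCORE || lcore == LCORE_ID_ANY);
}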
Most DPDK components make the assumption that rte_lcore_id() returns
a valid lcore_id in [0..RTE_MAX_LCORE[ range (with the exception of
the LCORE_ID_ANY special value).

OVS does not currently check which value is set in
RTE_PER_LCORE(_lcore_id), which exposes us to potential crashes on
the DPDK side.

Introduce an lcore allocator in OVS for PMD threads and map them to
unused lcores from DPDK à la --lcores.

The physical cores on which the PMD threads are running still
constitute important information when debugging, so keep those in the
PMD thread names, but add a new debug log when starting them.
The mapping between OVS threads and DPDK lcores can be dumped with a
new dpdk/dump-lcores command.

Synchronize DPDK internals on numa and cpuset for the PMD threads by
registering them via the rte_thread_set_affinity() helper.

Signed-off-by: David Marchand <david.marchand@redhat.com>
---
Changelog since v1:
- rewired existing configuration 'dpdk-lcore-mask' to use --lcores,
- switched to a bitmap to track lcores,
- added a command to dump current mapping (Flavio): used an
  experimental API to get DPDK lcores cpuset since it is the most
  reliable/portable information,
- used the same code for the logs when starting DPDK/PMD threads,
- addressed Ilya comments.

---
 lib/dpdk-stub.c   |   8 ++-
 lib/dpdk.c        | 163 ++++++++++++++++++++++++++++++++++++++++++++--
 lib/dpdk.h        |   5 +-
 lib/dpif-netdev.c |   3 +-
 4 files changed, 170 insertions(+), 9 deletions(-)
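For readers who want the allocator in isolation, here is a small
self-contained model of the scheme described above: the first free
slot is claimed under a mutex when a PMD thread starts and released
when it exits. pthread primitives and a plain array stand in for
ovs_mutex and lib/bitmap.h, and all names are illustrative:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define MAX_LCORE 128              /* stands in for RTE_MAX_LCORE */
#define NO_LCORE  (unsigned) -1    /* stands in for NON_PMD_CORE_ID */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static bool in_use[MAX_LCORE];     /* stands in for the OVS bitmap */

/* Claim the first free lcore, as dpdk_init_thread_context() does with
 * bitmap_scan()/bitmap_set1(); NO_LCORE means the allocator ran out. */
static unsigned
lcore_alloc(void)
{
    unsigned lcore = NO_LCORE;

    pthread_mutex_lock(&lock);
    for (unsigned i = 0; i < MAX_LCORE; i++) {
        if (!in_use[i]) {
            in_use[i] = true;
            lcore = i;
            break;
        }
    }
    pthread_mutex_unlock(&lock);
    return lcore;
}

/* Release an lcore at PMD exit, as dpdk_uninit_thread_context() does. */
static void
lcore_free(unsigned lcore)
{
    if (lcore == NO_LCORE) {
        return;
    }
    pthread_mutex_lock(&lock);
    in_use[lcore] = false;
    pthread_mutex_unlock(&lock);
}

int
main(void)
{
    unsigned a = lcore_alloc();
    unsigned b = lcore_alloc();

    printf("allocated lcore%u and lcore%u\n", a, b);   /* lcore0, lcore1 */
    lcore_free(a);
    lcore_free(b);
    return 0;
}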