diff mbox series

[ovs-dev] dpdk: Refuse running on cpu >= RTE_MAX_LCORE.

Message ID 20200626122738.28163-1-david.marchand@redhat.com
State New
Headers show
Series [ovs-dev] dpdk: Refuse running on cpu >= RTE_MAX_LCORE. | expand

Commit Message

David Marchand June 26, 2020, 12:27 p.m. UTC
DPDK lcores range is [0, RTE_MAX_LCORE[.
Trying to use a lcore out of this range expose OVS to crashes in DPDK mempool
local cache array.
Prefer a "clean" abort so that users know that something is wrong with
their configuration.

Signed-off-by: David Marchand <david.marchand@redhat.com>
---
 lib/dpdk.c | 7 +++++++
 1 file changed, 7 insertions(+)

Comments

Ilya Maximets July 2, 2020, 1:09 p.m. UTC | #1
On 6/26/20 2:27 PM, David Marchand wrote:
> DPDK lcores range is [0, RTE_MAX_LCORE[.
> Trying to use a lcore out of this range expose OVS to crashes in DPDK mempool
> local cache array.
> Prefer a "clean" abort so that users know that something is wrong with
> their configuration.
> 
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---
>  lib/dpdk.c | 7 +++++++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 98d0e2643e..55ce9a9221 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -617,7 +617,14 @@ dpdk_set_lcore_id(unsigned cpu)
>  {
>      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>      ovs_assert(cpu != NON_PMD_CORE_ID);
> +    if (cpu >= RTE_MAX_LCORE) {
> +        cpu = LCORE_ID_ANY;
> +    }
>      RTE_PER_LCORE(_lcore_id) = cpu;
> +    if (rte_lcore_id() == LCORE_ID_ANY) {
> +        ovs_abort(0, "PMD thread init failed, trying to use more cores than "
> +                  "DPDK supports (RTE_MAX_LCORE %u).", RTE_MAX_LCORE);

I don't think that random OVS abort after pmd-cpu-mask change or port addition
is a good thing to have.  Maybe it's better to limit CPU core discovery?
This way OVS will never try to create thread on unwanted cores.

Since ovs-numa module is mostly used by userspace datapath this will not
affect non-DPDK setups.

Something like this (not tested):

diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
index 6d0a68522..7c8a3b036 100644
--- a/lib/ovs-numa.c
+++ b/lib/ovs-numa.c
@@ -27,6 +27,7 @@
 #include <unistd.h>
 #endif /* __linux__ */
 
+#include "dpdk.h"
 #include "hash.h"
 #include "openvswitch/hmap.h"
 #include "openvswitch/list.h"
@@ -58,6 +59,13 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa);
 
 #define MAX_NUMA_NODES 128
 
+#ifdef DPDK_NETDEV
+/* DPDK will not work correctly for cores with core_id >= RTE_MAX_LCORE. */
+#define MAX_CORE_ID    (RTE_MAX_LCORE - 1)
+#else
+#define MAX_CORE_ID    INT_MAX
+#endif
+
 /* numa node. */
 struct numa_node {
     struct hmap_node hmap_node;     /* In the 'all_numa_nodes'. */
@@ -112,6 +120,12 @@ insert_new_numa_node(int numa_id)
 static struct cpu_core *
 insert_new_cpu_core(struct numa_node *n, unsigned core_id)
 {
+    if (core_id > MAX_CORE_ID) {
+        VLOG_DBG("Ignoring CPU core with id %u ( %u > %d).",
+                 core_id, core_id, MAX_CORE_ID);
+        return NULL;
+    }
+
     struct cpu_core *c = xzalloc(sizeof *c);
 
     hmap_insert(&all_cpu_cores, &c->hmap_node, hash_int(core_id, 0));
@@ -278,6 +292,11 @@ ovs_numa_init(void)
     if (ovsthread_once_start(&once)) {
         const struct numa_node *n;
 
+        if (MAX_CORE_ID != INT_MAX) {
+            VLOG_INFO("Maximum allowed CPU core id is %d. "
+                      "Other cores will not be available.", MAX_CORE_ID);
+        }
+
         if (dummy_numa) {
             discover_numa_and_core_dummy();
         } else {
---

What do you think?

Best regards, Ilya Maximets.
David Marchand July 2, 2020, 1:56 p.m. UTC | #2
On Thu, Jul 2, 2020 at 3:09 PM Ilya Maximets <i.maximets@ovn.org> wrote:
>
> On 6/26/20 2:27 PM, David Marchand wrote:
> > DPDK lcores range is [0, RTE_MAX_LCORE[.
> > Trying to use a lcore out of this range expose OVS to crashes in DPDK mempool
> > local cache array.
> > Prefer a "clean" abort so that users know that something is wrong with
> > their configuration.
> >
> > Signed-off-by: David Marchand <david.marchand@redhat.com>
> > ---
> >  lib/dpdk.c | 7 +++++++
> >  1 file changed, 7 insertions(+)
> >
> > diff --git a/lib/dpdk.c b/lib/dpdk.c
> > index 98d0e2643e..55ce9a9221 100644
> > --- a/lib/dpdk.c
> > +++ b/lib/dpdk.c
> > @@ -617,7 +617,14 @@ dpdk_set_lcore_id(unsigned cpu)
> >  {
> >      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
> >      ovs_assert(cpu != NON_PMD_CORE_ID);
> > +    if (cpu >= RTE_MAX_LCORE) {
> > +        cpu = LCORE_ID_ANY;
> > +    }
> >      RTE_PER_LCORE(_lcore_id) = cpu;
> > +    if (rte_lcore_id() == LCORE_ID_ANY) {
> > +        ovs_abort(0, "PMD thread init failed, trying to use more cores than "
> > +                  "DPDK supports (RTE_MAX_LCORE %u).", RTE_MAX_LCORE);
>
> I don't think that random OVS abort after pmd-cpu-mask change or port addition
> is a good thing to have.  Maybe it's better to limit CPU core discovery?
> This way OVS will never try to create thread on unwanted cores.
>
> Since ovs-numa module is mostly used by userspace datapath this will not
> affect non-DPDK setups.
>
> Something like this (not tested):
>
> diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
> index 6d0a68522..7c8a3b036 100644
> --- a/lib/ovs-numa.c
> +++ b/lib/ovs-numa.c
> @@ -27,6 +27,7 @@
>  #include <unistd.h>
>  #endif /* __linux__ */
>
> +#include "dpdk.h"
>  #include "hash.h"
>  #include "openvswitch/hmap.h"
>  #include "openvswitch/list.h"
> @@ -58,6 +59,13 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa);
>
>  #define MAX_NUMA_NODES 128
>
> +#ifdef DPDK_NETDEV
> +/* DPDK will not work correctly for cores with core_id >= RTE_MAX_LCORE. */
> +#define MAX_CORE_ID    (RTE_MAX_LCORE - 1)
> +#else
> +#define MAX_CORE_ID    INT_MAX
> +#endif
> +
>  /* numa node. */
>  struct numa_node {
>      struct hmap_node hmap_node;     /* In the 'all_numa_nodes'. */
> @@ -112,6 +120,12 @@ insert_new_numa_node(int numa_id)
>  static struct cpu_core *
>  insert_new_cpu_core(struct numa_node *n, unsigned core_id)
>  {
> +    if (core_id > MAX_CORE_ID) {
> +        VLOG_DBG("Ignoring CPU core with id %u ( %u > %d).",
> +                 core_id, core_id, MAX_CORE_ID);
> +        return NULL;
> +    }
> +
>      struct cpu_core *c = xzalloc(sizeof *c);
>
>      hmap_insert(&all_cpu_cores, &c->hmap_node, hash_int(core_id, 0));
> @@ -278,6 +292,11 @@ ovs_numa_init(void)
>      if (ovsthread_once_start(&once)) {
>          const struct numa_node *n;
>
> +        if (MAX_CORE_ID != INT_MAX) {
> +            VLOG_INFO("Maximum allowed CPU core id is %d. "
> +                      "Other cores will not be available.", MAX_CORE_ID);
> +        }
> +
>          if (dummy_numa) {
>              discover_numa_and_core_dummy();
>          } else {
> ---
>
> What do you think?

pmd threads will never run on any core >= RTE_MAX_LCORE ok, it works.

Though, it binds OVS core id and DPDK lcore id.
Should OVS really care about this?
Ilya Maximets July 2, 2020, 3:53 p.m. UTC | #3
On 7/2/20 3:56 PM, David Marchand wrote:
> On Thu, Jul 2, 2020 at 3:09 PM Ilya Maximets <i.maximets@ovn.org> wrote:
>>
>> On 6/26/20 2:27 PM, David Marchand wrote:
>>> DPDK lcores range is [0, RTE_MAX_LCORE[.
>>> Trying to use a lcore out of this range expose OVS to crashes in DPDK mempool
>>> local cache array.
>>> Prefer a "clean" abort so that users know that something is wrong with
>>> their configuration.
>>>
>>> Signed-off-by: David Marchand <david.marchand@redhat.com>
>>> ---
>>>  lib/dpdk.c | 7 +++++++
>>>  1 file changed, 7 insertions(+)
>>>
>>> diff --git a/lib/dpdk.c b/lib/dpdk.c
>>> index 98d0e2643e..55ce9a9221 100644
>>> --- a/lib/dpdk.c
>>> +++ b/lib/dpdk.c
>>> @@ -617,7 +617,14 @@ dpdk_set_lcore_id(unsigned cpu)
>>>  {
>>>      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>>>      ovs_assert(cpu != NON_PMD_CORE_ID);
>>> +    if (cpu >= RTE_MAX_LCORE) {
>>> +        cpu = LCORE_ID_ANY;
>>> +    }
>>>      RTE_PER_LCORE(_lcore_id) = cpu;
>>> +    if (rte_lcore_id() == LCORE_ID_ANY) {
>>> +        ovs_abort(0, "PMD thread init failed, trying to use more cores than "
>>> +                  "DPDK supports (RTE_MAX_LCORE %u).", RTE_MAX_LCORE);
>>
>> I don't think that random OVS abort after pmd-cpu-mask change or port addition
>> is a good thing to have.  Maybe it's better to limit CPU core discovery?
>> This way OVS will never try to create thread on unwanted cores.
>>
>> Since ovs-numa module is mostly used by userspace datapath this will not
>> affect non-DPDK setups.
>>
>> Something like this (not tested):
>>
>> diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
>> index 6d0a68522..7c8a3b036 100644
>> --- a/lib/ovs-numa.c
>> +++ b/lib/ovs-numa.c
>> @@ -27,6 +27,7 @@
>>  #include <unistd.h>
>>  #endif /* __linux__ */
>>
>> +#include "dpdk.h"
>>  #include "hash.h"
>>  #include "openvswitch/hmap.h"
>>  #include "openvswitch/list.h"
>> @@ -58,6 +59,13 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa);
>>
>>  #define MAX_NUMA_NODES 128
>>
>> +#ifdef DPDK_NETDEV
>> +/* DPDK will not work correctly for cores with core_id >= RTE_MAX_LCORE. */
>> +#define MAX_CORE_ID    (RTE_MAX_LCORE - 1)
>> +#else
>> +#define MAX_CORE_ID    INT_MAX
>> +#endif
>> +
>>  /* numa node. */
>>  struct numa_node {
>>      struct hmap_node hmap_node;     /* In the 'all_numa_nodes'. */
>> @@ -112,6 +120,12 @@ insert_new_numa_node(int numa_id)
>>  static struct cpu_core *
>>  insert_new_cpu_core(struct numa_node *n, unsigned core_id)
>>  {
>> +    if (core_id > MAX_CORE_ID) {
>> +        VLOG_DBG("Ignoring CPU core with id %u ( %u > %d).",
>> +                 core_id, core_id, MAX_CORE_ID);
>> +        return NULL;
>> +    }
>> +
>>      struct cpu_core *c = xzalloc(sizeof *c);
>>
>>      hmap_insert(&all_cpu_cores, &c->hmap_node, hash_int(core_id, 0));
>> @@ -278,6 +292,11 @@ ovs_numa_init(void)
>>      if (ovsthread_once_start(&once)) {
>>          const struct numa_node *n;
>>
>> +        if (MAX_CORE_ID != INT_MAX) {
>> +            VLOG_INFO("Maximum allowed CPU core id is %d. "
>> +                      "Other cores will not be available.", MAX_CORE_ID);
>> +        }
>> +
>>          if (dummy_numa) {
>>              discover_numa_and_core_dummy();
>>          } else {
>> ---
>>
>> What do you think?
> 
> pmd threads will never run on any core >= RTE_MAX_LCORE ok, it works.
> 
> Though, it binds OVS core id and DPDK lcore id.
> Should OVS really care about this?

I think this is better than random abort() in production environment.
And, in current code, OVS cores bound to lcores as 1 to 1.

I understand that there will be no direct binding with the new
rte_thread_register API, and we will need to figure out how to work
with it correctly.  With new API above changes will need to be reverted
and some new handling introduced to limit total number of cores, not
their IDs.  Maybe something like this (not tested):

diff --git a/lib/dpdk.h b/lib/dpdk.h
index 736a64279..f738a90f6 100644
--- a/lib/dpdk.h
+++ b/lib/dpdk.h
@@ -25,10 +25,12 @@
 #include <rte_lcore.h>
 
 #define NON_PMD_CORE_ID LCORE_ID_ANY
+#define MAX_PMD_THREADS RTE_MAX_LCORE
 
 #else
 
 #define NON_PMD_CORE_ID UINT32_MAX
+#define MAX_PMD_THREADS INT_MAX
 
 #endif /* DPDK_NETDEV */
 
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 1086efd47..245c9b2de 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -4906,18 +4906,28 @@ reconfigure_pmd_threads(struct dp_netdev *dp)
     struct ovs_numa_info_core *core;
     struct hmapx to_delete = HMAPX_INITIALIZER(&to_delete);
     struct hmapx_node *node;
-    bool changed = false;
+    bool limited, changed = false;
     bool need_to_adjust_static_tx_qids = false;
 
     /* The pmd threads should be started only if there's a pmd port in the
      * datapath.  If the user didn't provide any "pmd-cpu-mask", we start
      * NR_PMD_THREADS per numa node. */
     if (!has_pmd_port(dp)) {
-        pmd_cores = ovs_numa_dump_n_cores_per_numa(0);
+        pmd_cores = ovs_numa_dump_n_cores_per_numa(0, INT_MAX, &limited);
     } else if (dp->pmd_cmask && dp->pmd_cmask[0]) {
-        pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask);
+        pmd_cores = ovs_numa_dump_cores_with_cmask(dp->pmd_cmask,
+                                                   MAX_PMD_THREADS, &limited);
+        if (limimted) {
+            VLOG_WARN("Too many cores in pmd-cpu-mask. Only %d will be used.",
+                      MAX_PMD_THREADS);
+        }
     } else {
-        pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS);
+        pmd_cores = ovs_numa_dump_n_cores_per_numa(NR_PMD_THREADS,
+                                                   MAX_PMD_THREADS, &limited);
+        if (limited) {
+            VLOG_WARN("Can't create %d cores per numa node: Too many cores. "
+                      "Only %d cores in total will be used.", MAX_PMD_THREADS);
+        }
     }
 
     /* We need to adjust 'static_tx_qid's only if we're reducing number of
diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
index 6d0a68522..4e88d435f 100644
--- a/lib/ovs-numa.c
+++ b/lib/ovs-numa.c
@@ -424,12 +443,14 @@ ovs_numa_dump_cores_on_numa(int numa_id)
 }
 
 struct ovs_numa_dump *
-ovs_numa_dump_cores_with_cmask(const char *cmask)
+ovs_numa_dump_cores_with_cmask(const char *cmask, int limit, bool *limited)
 {
     struct ovs_numa_dump *dump = ovs_numa_dump_create();
     int core_id = 0;
     int end_idx;
 
+    *limited = false;
+
     /* Ignore leading 0x. */
     end_idx = 0;
     if (!strncmp(cmask, "0x", 2) || !strncmp(cmask, "0X", 2)) {
@@ -450,10 +471,15 @@ ovs_numa_dump_cores_with_cmask(const char *cmask)
             if ((bin >> j) & 0x1) {
                 struct cpu_core *core = get_core_by_core_id(core_id);
 
-                if (core) {
+                if (core && ovs_numa_dump_count(dump) < limit) {
                     ovs_numa_dump_add(dump,
                                       core->numa->numa_id,
                                       core->core_id);
+                } else if (ovs_numa_dump_count(dump) >= limit) {
+                    VLOG_DBG("Reached the limit for number of CPUs in cmask. "
+                             "Core %d on numa %d from cmask will not be used.",
+                             core->core_id, core->numa->numa_id);
+                    *limited = true;
                 }
             }
 
@@ -465,11 +491,12 @@ ovs_numa_dump_cores_with_cmask(const char *cmask)
 }
 
 struct ovs_numa_dump *
-ovs_numa_dump_n_cores_per_numa(int cores_per_numa)
+ovs_numa_dump_n_cores_per_numa(int cores_per_numa, int limit, bool *limited)
 {
     struct ovs_numa_dump *dump = ovs_numa_dump_create();
     const struct numa_node *n;
 
+    *limited = false;
     HMAP_FOR_EACH (n, hmap_node, &all_numa_nodes) {
         const struct cpu_core *core;
         int i = 0;
@@ -478,6 +505,10 @@ ovs_numa_dump_n_cores_per_numa(int cores_per_numa)
             if (i++ >= cores_per_numa) {
                 break;
             }
+            if (ovs_numa_dump_count(dump) >= limit) {
+                *limited = true;
+                return dump;
+            }
 
             ovs_numa_dump_add(dump, core->numa->numa_id, core->core_id);
         }
---
David Marchand July 2, 2020, 4:46 p.m. UTC | #4
On Thu, Jul 2, 2020 at 5:53 PM Ilya Maximets <i.maximets@ovn.org> wrote:
> > pmd threads will never run on any core >= RTE_MAX_LCORE ok, it works.
> >
> > Though, it binds OVS core id and DPDK lcore id.
> > Should OVS really care about this?
>
> I think this is better than random abort() in production environment.
> And, in current code, OVS cores bound to lcores as 1 to 1.

Yes, I did not like either the abort(), I added it wrt to a comment from Kevin.
But rereading it, I might have misunderstood.


>
> I understand that there will be no direct binding with the new
> rte_thread_register API, and we will need to figure out how to work
> with it correctly.  With new API above changes will need to be reverted
> and some new handling introduced to limit total number of cores, not
> their IDs.  Maybe something like this (not tested):

I just wanted to make sure you had the other patch in mind.
I'll have a try with your proposal.

Thanks.
diff mbox series

Patch

diff --git a/lib/dpdk.c b/lib/dpdk.c
index 98d0e2643e..55ce9a9221 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -617,7 +617,14 @@  dpdk_set_lcore_id(unsigned cpu)
 {
     /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
     ovs_assert(cpu != NON_PMD_CORE_ID);
+    if (cpu >= RTE_MAX_LCORE) {
+        cpu = LCORE_ID_ANY;
+    }
     RTE_PER_LCORE(_lcore_id) = cpu;
+    if (rte_lcore_id() == LCORE_ID_ANY) {
+        ovs_abort(0, "PMD thread init failed, trying to use more cores than "
+                  "DPDK supports (RTE_MAX_LCORE %u).", RTE_MAX_LCORE);
+    }
 }
 
 void