diff mbox series

[ovs-dev,dpdk-latest,v4] dpdk: Support running PMD threads on any core.

Message ID 20201102131045.10410-1-david.marchand@redhat.com
State New
Headers show
Series [ovs-dev,dpdk-latest,v4] dpdk: Support running PMD threads on any core. | expand

Commit Message

David Marchand Nov. 2, 2020, 1:10 p.m. UTC
DPDK 20.08 introduced a new API that associates a non-EAL thread to a free
lcore. This new API does not change the thread characteristics (like CPU
affinity).
Using this new API, there is no assumption on lcore X running on cpu X
anymore which leaves OVS free from running its PMD thread on any cpu.

DPDK still limits the number of lcores to RTE_MAX_LCORE (128 on x86_64)
which should be enough for OVS pmd threads (hopefully).

DPDK lcore/OVS pmd threads mapping are logged at threads creation and
destruction.
A new command is added to help get DPDK point of view of the DPDK lcores:

$ ovs-appctl dpdk/lcores-list
lcore 0, socket 0, role RTE, cpuset 0
lcore 1, socket 0, role NON_EAL, cpuset 1
lcore 2, socket 0, role NON_EAL, cpuset 15

Signed-off-by: David Marchand <david.marchand@redhat.com>
---
Changes since v3:
- rebased on current HEAD,
- switched back to simple warning rather than abort when registering a
  thread fails,

Changes since v2:
- introduced a new api in DPDK 20.08 (still being discussed), inbox thread at
  http://inbox.dpdk.org/dev/20200610144506.30505-1-david.marchand@redhat.com/T/#t
- this current patch depends on a patch on master I sent:
  https://patchwork.ozlabs.org/project/openvswitch/patch/20200626122738.28163-1-david.marchand@redhat.com/
- dropped 'dpdk-lcore-mask' compat handling,

Changes since v1:
- rewired existing configuration 'dpdk-lcore-mask' to use --lcores,
- switched to a bitmap to track lcores,
- added a command to dump current mapping (Flavio): used an experimental
  API to get DPDK lcores cpuset since it is the most reliable/portable
  information,
- used the same code for the logs when starting DPDK/PMD threads,
- addressed Ilya comments,

---
 lib/dpdk-stub.c   |  8 +++++++-
 lib/dpdk.c        | 36 ++++++++++++++++++++++++++++++++++--
 lib/dpdk.h        |  3 ++-
 lib/dpif-netdev.c |  3 ++-
 4 files changed, 45 insertions(+), 5 deletions(-)

Comments

Aaron Conole Nov. 2, 2020, 2:40 p.m. UTC | #1
David Marchand <david.marchand@redhat.com> writes:

> DPDK 20.08 introduced a new API that associates a non-EAL thread to a free
> lcore. This new API does not change the thread characteristics (like CPU
> affinity).
> Using this new API, there is no assumption on lcore X running on cpu X
> anymore which leaves OVS free from running its PMD thread on any cpu.
>
> DPDK still limits the number of lcores to RTE_MAX_LCORE (128 on x86_64)
> which should be enough for OVS pmd threads (hopefully).
>
> DPDK lcore/OVS pmd threads mapping are logged at threads creation and
> destruction.
> A new command is added to help get DPDK point of view of the DPDK lcores:
>
> $ ovs-appctl dpdk/lcores-list
> lcore 0, socket 0, role RTE, cpuset 0
> lcore 1, socket 0, role NON_EAL, cpuset 1
> lcore 2, socket 0, role NON_EAL, cpuset 15
>
> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---
> Changes since v3:
> - rebased on current HEAD,
> - switched back to simple warning rather than abort when registering a
>   thread fails,
>
> Changes since v2:
> - introduced a new api in DPDK 20.08 (still being discussed), inbox thread at
>   http://inbox.dpdk.org/dev/20200610144506.30505-1-david.marchand@redhat.com/T/#t
> - this current patch depends on a patch on master I sent:
>   https://patchwork.ozlabs.org/project/openvswitch/patch/20200626122738.28163-1-david.marchand@redhat.com/
> - dropped 'dpdk-lcore-mask' compat handling,
>
> Changes since v1:
> - rewired existing configuration 'dpdk-lcore-mask' to use --lcores,
> - switched to a bitmap to track lcores,
> - added a command to dump current mapping (Flavio): used an experimental
>   API to get DPDK lcores cpuset since it is the most reliable/portable
>   information,
> - used the same code for the logs when starting DPDK/PMD threads,
> - addressed Ilya comments,
>
> ---
>  lib/dpdk-stub.c   |  8 +++++++-
>  lib/dpdk.c        | 36 ++++++++++++++++++++++++++++++++++--
>  lib/dpdk.h        |  3 ++-
>  lib/dpif-netdev.c |  3 ++-
>  4 files changed, 45 insertions(+), 5 deletions(-)
>
> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
> index b7d577870d..5bc996b665 100644
> --- a/lib/dpdk-stub.c
> +++ b/lib/dpdk-stub.c
> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
> +{
> +    /* Nothing */
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
>  {
>      /* Nothing */
>  }
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 319540394b..7d3c0637c9 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -14,6 +14,10 @@
>   * limitations under the License.
>   */
>  
> +#ifndef ALLOW_EXPERIMENTAL_API
> +#define ALLOW_EXPERIMENTAL_API
> +#endif
> +

We already have a #define for this in netdev-dpdk.c, maybe we should
follow that (and also, possibly combine them).

Just a quick nit.

>  #include <config.h>
>  #include "dpdk.h"
>  
> @@ -525,6 +529,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>                               dpdk_unixctl_mem_stream, rte_log_dump);
>      unixctl_command_register("dpdk/log-set", "{level | pattern:level}", 0,
>                               INT_MAX, dpdk_unixctl_log_set, NULL);
> +    unixctl_command_register("dpdk/lcores-list", "", 0, 0,
> +                             dpdk_unixctl_mem_stream, rte_lcore_dump);
>  
>      /* We are called from the main thread here */
>      RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
> @@ -601,11 +607,37 @@ dpdk_available(void)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu)
> +dpdk_init_thread_context(unsigned cpu)
>  {
>      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>      ovs_assert(cpu != NON_PMD_CORE_ID);
> -    RTE_PER_LCORE(_lcore_id) = cpu;
> +
> +    if (!dpdk_available()) {
> +        return;
> +    }
> +
> +    if (rte_thread_register() < 0) {
> +        VLOG_WARN("This OVS pmd thread will share resources with the non-pmd "
> +                  "thread: %s.", rte_strerror(rte_errno));
> +    } else {
> +        VLOG_INFO("PMD thread uses DPDK lcore %u.", rte_lcore_id());
> +    }
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
> +{
> +    unsigned int lcore_id;
> +
> +    if (!dpdk_available()) {
> +        return;
> +    }
> +
> +    lcore_id = rte_lcore_id();
> +    rte_thread_unregister();
> +    if (lcore_id != LCORE_ID_ANY) {
> +        VLOG_INFO("PMD thread released DPDK lcore %u.", lcore_id);
> +    }
>  }
>  
>  void
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 445a51d065..1bd16b31db 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -36,7 +36,8 @@ struct smap;
>  struct ovsrec_open_vswitch;
>  
>  void dpdk_init(const struct smap *ovs_other_config);
> -void dpdk_set_lcore_id(unsigned cpu);
> +void dpdk_init_thread_context(unsigned cpu);
> +void dpdk_uninit_thread_context(void);
>  const char *dpdk_get_vhost_sock_dir(void);
>  bool dpdk_vhost_iommu_enabled(void);
>  bool dpdk_vhost_postcopy_enabled(void);
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 300861ca59..da9e1e8fcf 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5926,7 +5926,7 @@ pmd_thread_main(void *f_)
>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>      ovs_numa_thread_setaffinity_core(pmd->core_id);
> -    dpdk_set_lcore_id(pmd->core_id);
> +    dpdk_init_thread_context(pmd->core_id);
>      poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>      dfc_cache_init(&pmd->flow_cache);
>      pmd_alloc_static_tx_qid(pmd);
> @@ -6061,6 +6061,7 @@ reload:
>      dfc_cache_uninit(&pmd->flow_cache);
>      free(poll_list);
>      pmd_free_cached_ports(pmd);
> +    dpdk_uninit_thread_context();
>      return NULL;
>  }
David Marchand Nov. 2, 2020, 8:34 p.m. UTC | #2
On Mon, Nov 2, 2020 at 3:40 PM Aaron Conole <aconole@redhat.com> wrote:
> > diff --git a/lib/dpdk.c b/lib/dpdk.c
> > index 319540394b..7d3c0637c9 100644
> > --- a/lib/dpdk.c
> > +++ b/lib/dpdk.c
> > @@ -14,6 +14,10 @@
> >   * limitations under the License.
> >   */
> >
> > +#ifndef ALLOW_EXPERIMENTAL_API
> > +#define ALLOW_EXPERIMENTAL_API
> > +#endif
> > +
>
> We already have a #define for this in netdev-dpdk.c, maybe we should
> follow that (and also, possibly combine them).
>
> Just a quick nit.

- In the dpdk-latest branch, there is no trace of ALLOW_EXPERIMENTAL_API since:
https://github.com/openvswitch/ovs/commit/0730d7786a40d80a45c8d587ebfdd92ceb4b689d

On the why the check on already defined ALLOW_EXPERIMENTAL_API, this
is because I added a job with it enabled to catch potential build
issues.


- About combining (I'd say dpdk.h would be the best location), I am not sure.
I like turning this knob on only when needed.
Kevin Traynor Nov. 20, 2020, 5:47 p.m. UTC | #3
On 02/11/2020 13:10, David Marchand wrote:
> DPDK 20.08 introduced a new API that associates a non-EAL thread to a free
> lcore. This new API does not change the thread characteristics (like CPU
> affinity).
> Using this new API, there is no assumption on lcore X running on cpu X
> anymore which leaves OVS free from running its PMD thread on any cpu.
> 
> DPDK still limits the number of lcores to RTE_MAX_LCORE (128 on x86_64)
> which should be enough for OVS pmd threads (hopefully).
> 

This feature won't work with multi-process and will disable it, which is
probably fine - just need to be sure that others are ok with that.

It would be a lot cleaner to use this if mp is not allowed. If people
think mp is needed, then a mp user option (default disable) could be
added to enable it before the PMD threads start. In that case, this
lcore feature won't work, but it could fall back to previous model.

If mp is explicitly not allowed now, it would need a doc update.

> DPDK lcore/OVS pmd threads mapping are logged at threads creation and
> destruction.
> A new command is added to help get DPDK point of view of the DPDK lcores:
> 
> $ ovs-appctl dpdk/lcores-list
> lcore 0, socket 0, role RTE, cpuset 0
> lcore 1, socket 0, role NON_EAL, cpuset 1
> lcore 2, socket 0, role NON_EAL, cpuset 15
> 

Could probably add this in the docs too, not sure the current thinking
about documenting these appctl commands.

> Signed-off-by: David Marchand <david.marchand@redhat.com>
> ---
> Changes since v3:
> - rebased on current HEAD,
> - switched back to simple warning rather than abort when registering a
>   thread fails,
> 
> Changes since v2:
> - introduced a new api in DPDK 20.08 (still being discussed), inbox thread at
>   http://inbox.dpdk.org/dev/20200610144506.30505-1-david.marchand@redhat.com/T/#t
> - this current patch depends on a patch on master I sent:
>   https://patchwork.ozlabs.org/project/openvswitch/patch/20200626122738.28163-1-david.marchand@redhat.com/
> - dropped 'dpdk-lcore-mask' compat handling,
> 
> Changes since v1:
> - rewired existing configuration 'dpdk-lcore-mask' to use --lcores,
> - switched to a bitmap to track lcores,
> - added a command to dump current mapping (Flavio): used an experimental
>   API to get DPDK lcores cpuset since it is the most reliable/portable
>   information,
> - used the same code for the logs when starting DPDK/PMD threads,
> - addressed Ilya comments,
> 
> ---
>  lib/dpdk-stub.c   |  8 +++++++-
>  lib/dpdk.c        | 36 ++++++++++++++++++++++++++++++++++--
>  lib/dpdk.h        |  3 ++-
>  lib/dpif-netdev.c |  3 ++-
>  4 files changed, 45 insertions(+), 5 deletions(-)
> 
> diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
> index b7d577870d..5bc996b665 100644
> --- a/lib/dpdk-stub.c
> +++ b/lib/dpdk-stub.c
> @@ -39,7 +39,13 @@ dpdk_init(const struct smap *ovs_other_config)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
> +dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
> +{
> +    /* Nothing */
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
>  {
>      /* Nothing */
>  }
> diff --git a/lib/dpdk.c b/lib/dpdk.c
> index 319540394b..7d3c0637c9 100644
> --- a/lib/dpdk.c
> +++ b/lib/dpdk.c
> @@ -14,6 +14,10 @@
>   * limitations under the License.
>   */
>  
> +#ifndef ALLOW_EXPERIMENTAL_API
> +#define ALLOW_EXPERIMENTAL_API
> +#endif
> +
>  #include <config.h>
>  #include "dpdk.h"
>  
> @@ -525,6 +529,8 @@ dpdk_init__(const struct smap *ovs_other_config)
>                               dpdk_unixctl_mem_stream, rte_log_dump);
>      unixctl_command_register("dpdk/log-set", "{level | pattern:level}", 0,
>                               INT_MAX, dpdk_unixctl_log_set, NULL);
> +    unixctl_command_register("dpdk/lcores-list", "", 0, 0,
> +                             dpdk_unixctl_mem_stream, rte_lcore_dump);
>  
>      /* We are called from the main thread here */
>      RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
> @@ -601,11 +607,37 @@ dpdk_available(void)
>  }
>  
>  void
> -dpdk_set_lcore_id(unsigned cpu)
> +dpdk_init_thread_context(unsigned cpu)
>  {
>      /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
>      ovs_assert(cpu != NON_PMD_CORE_ID);
> -    RTE_PER_LCORE(_lcore_id) = cpu;
> +
> +    if (!dpdk_available()) {
> +        return;
> +    }
> +
> +    if (rte_thread_register() < 0) {
> +        VLOG_WARN("This OVS pmd thread will share resources with the non-pmd "
> +                  "thread: %s.", rte_strerror(rte_errno));

There will always be some PMD thread started early on which will disable
mp, but if this failed because mp was enabled before the PMD started,
some different action is needed.

If there was a user flag for allowing mp, that could be checked first.
Otherwise you could distinguish between EINVAL and ENOMEM. For EINVAL,
fall back to old scheme because mp is enabled. For ENOMEM, print warning
and continue as per the current code.

> +    } else {
> +        VLOG_INFO("PMD thread uses DPDK lcore %u.", rte_lcore_id());
> +    }
> +}
> +
> +void
> +dpdk_uninit_thread_context(void)
> +{
> +    unsigned int lcore_id;
> +
> +    if (!dpdk_available()) {
> +        return;
> +    }
> +
> +    lcore_id = rte_lcore_id();
> +    rte_thread_unregister();
> +    if (lcore_id != LCORE_ID_ANY) {
> +        VLOG_INFO("PMD thread released DPDK lcore %u.", lcore_id);
> +    }
>  }
>  
>  void
> diff --git a/lib/dpdk.h b/lib/dpdk.h
> index 445a51d065..1bd16b31db 100644
> --- a/lib/dpdk.h
> +++ b/lib/dpdk.h
> @@ -36,7 +36,8 @@ struct smap;
>  struct ovsrec_open_vswitch;
>  
>  void dpdk_init(const struct smap *ovs_other_config);
> -void dpdk_set_lcore_id(unsigned cpu);
> +void dpdk_init_thread_context(unsigned cpu);
> +void dpdk_uninit_thread_context(void);
>  const char *dpdk_get_vhost_sock_dir(void);
>  bool dpdk_vhost_iommu_enabled(void);
>  bool dpdk_vhost_postcopy_enabled(void);
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 300861ca59..da9e1e8fcf 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -5926,7 +5926,7 @@ pmd_thread_main(void *f_)
>      /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
>      ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
>      ovs_numa_thread_setaffinity_core(pmd->core_id);
> -    dpdk_set_lcore_id(pmd->core_id);
> +    dpdk_init_thread_context(pmd->core_id);
>      poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
>      dfc_cache_init(&pmd->flow_cache);
>      pmd_alloc_static_tx_qid(pmd);
> @@ -6061,6 +6061,7 @@ reload:
>      dfc_cache_uninit(&pmd->flow_cache);
>      free(poll_list);
>      pmd_free_cached_ports(pmd);
> +    dpdk_uninit_thread_context();
>      return NULL;
>  }
>  
>
diff mbox series

Patch

diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
index b7d577870d..5bc996b665 100644
--- a/lib/dpdk-stub.c
+++ b/lib/dpdk-stub.c
@@ -39,7 +39,13 @@  dpdk_init(const struct smap *ovs_other_config)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu OVS_UNUSED)
+dpdk_init_thread_context(unsigned cpu OVS_UNUSED)
+{
+    /* Nothing */
+}
+
+void
+dpdk_uninit_thread_context(void)
 {
     /* Nothing */
 }
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 319540394b..7d3c0637c9 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -14,6 +14,10 @@ 
  * limitations under the License.
  */
 
+#ifndef ALLOW_EXPERIMENTAL_API
+#define ALLOW_EXPERIMENTAL_API
+#endif
+
 #include <config.h>
 #include "dpdk.h"
 
@@ -525,6 +529,8 @@  dpdk_init__(const struct smap *ovs_other_config)
                              dpdk_unixctl_mem_stream, rte_log_dump);
     unixctl_command_register("dpdk/log-set", "{level | pattern:level}", 0,
                              INT_MAX, dpdk_unixctl_log_set, NULL);
+    unixctl_command_register("dpdk/lcores-list", "", 0, 0,
+                             dpdk_unixctl_mem_stream, rte_lcore_dump);
 
     /* We are called from the main thread here */
     RTE_PER_LCORE(_lcore_id) = NON_PMD_CORE_ID;
@@ -601,11 +607,37 @@  dpdk_available(void)
 }
 
 void
-dpdk_set_lcore_id(unsigned cpu)
+dpdk_init_thread_context(unsigned cpu)
 {
     /* NON_PMD_CORE_ID is reserved for use by non pmd threads. */
     ovs_assert(cpu != NON_PMD_CORE_ID);
-    RTE_PER_LCORE(_lcore_id) = cpu;
+
+    if (!dpdk_available()) {
+        return;
+    }
+
+    if (rte_thread_register() < 0) {
+        VLOG_WARN("This OVS pmd thread will share resources with the non-pmd "
+                  "thread: %s.", rte_strerror(rte_errno));
+    } else {
+        VLOG_INFO("PMD thread uses DPDK lcore %u.", rte_lcore_id());
+    }
+}
+
+void
+dpdk_uninit_thread_context(void)
+{
+    unsigned int lcore_id;
+
+    if (!dpdk_available()) {
+        return;
+    }
+
+    lcore_id = rte_lcore_id();
+    rte_thread_unregister();
+    if (lcore_id != LCORE_ID_ANY) {
+        VLOG_INFO("PMD thread released DPDK lcore %u.", lcore_id);
+    }
 }
 
 void
diff --git a/lib/dpdk.h b/lib/dpdk.h
index 445a51d065..1bd16b31db 100644
--- a/lib/dpdk.h
+++ b/lib/dpdk.h
@@ -36,7 +36,8 @@  struct smap;
 struct ovsrec_open_vswitch;
 
 void dpdk_init(const struct smap *ovs_other_config);
-void dpdk_set_lcore_id(unsigned cpu);
+void dpdk_init_thread_context(unsigned cpu);
+void dpdk_uninit_thread_context(void);
 const char *dpdk_get_vhost_sock_dir(void);
 bool dpdk_vhost_iommu_enabled(void);
 bool dpdk_vhost_postcopy_enabled(void);
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 300861ca59..da9e1e8fcf 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -5926,7 +5926,7 @@  pmd_thread_main(void *f_)
     /* Stores the pmd thread's 'pmd' to 'per_pmd_key'. */
     ovsthread_setspecific(pmd->dp->per_pmd_key, pmd);
     ovs_numa_thread_setaffinity_core(pmd->core_id);
-    dpdk_set_lcore_id(pmd->core_id);
+    dpdk_init_thread_context(pmd->core_id);
     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
     dfc_cache_init(&pmd->flow_cache);
     pmd_alloc_static_tx_qid(pmd);
@@ -6061,6 +6061,7 @@  reload:
     dfc_cache_uninit(&pmd->flow_cache);
     free(poll_list);
     pmd_free_cached_ports(pmd);
+    dpdk_uninit_thread_context();
     return NULL;
 }