
[ovs-dev,v2] dpif-netdev: Assign ports to pmds on non-local numa node.

Message ID 1490875385-5417-1-git-send-email-billy.o.mahony@intel.com
State Changes Requested
Delegated to: Darrell Ball

Commit Message

Billy O'Mahony March 30, 2017, 12:03 p.m. UTC
From: billyom <billy.o.mahony@intel.com>

Previously, if there was no available (non-isolated) pmd on the numa node
for a port, then the port was not polled at all. This could result in a
non-operational system until such time as nics were physically
repositioned. It is preferable to operate with a pmd on the 'wrong' numa
node, albeit with lower performance. Local pmds are still chosen when
available.

Signed-off-by: Billy O'Mahony <billy.o.mahony@intel.com>
---
 Documentation/intro/install/dpdk.rst | 10 ++++++++++
 lib/dpif-netdev.c                    | 36 ++++++++++++++++++++++++++++++++----
 2 files changed, 42 insertions(+), 4 deletions(-)

Comments

Stokes, Ian April 24, 2017, 4:33 p.m. UTC | #1
> Previously if there is no available (non-isolated) pmd on the numa node
> for a port then the port is not polled at all. This can result in a non-
> operational system until such time as nics are physically repositioned. It
> is preferable to operate with a pmd on the 'wrong' numa node albeit with
> lower performance. Local pmds are still chosen when available.
> 
> Signed-off-by: Billy O'Mahony <billy.o.mahony@intel.com>

Thanks for the patch, Billy, a few comments below.

> ---
>  Documentation/intro/install/dpdk.rst | 10 ++++++++++
>  lib/dpif-netdev.c                    | 36 ++++++++++++++++++++++++++++++++----
>  2 files changed, 42 insertions(+), 4 deletions(-)
> 
> diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst
> index b947bd5..0e0b9f0 100644
> --- a/Documentation/intro/install/dpdk.rst
> +++ b/Documentation/intro/install/dpdk.rst
> @@ -450,6 +450,16 @@ affinitized accordingly.
>      pmd thread on a NUMA node is only created if there is at least one DPDK
>      interface from that NUMA node added to OVS.
> 
> +  .. note::
> +   On NUMA systems PCI devices are also local to a NUMA node.  Rx queues for
> +   PCI device will assigned to a pmd on it's local NUMA node if pmd-cpu-mask
> +   has created a pmd thread on that NUMA node.  If not the queue will be
> +   assigned to a pmd on a remote NUMA node.  This will result in reduced
> +   maximum throughput on that device.  In the case such a queue assingment

Typo in assignment.

> +   is made a warning message will be logged: "There's no available
> +   (non isolated) pmd thread on numa node N. Queue Q on port P will be assigned
> +   to a pmd on numa node M. Expect reduced performance."
> +
>  - QEMU vCPU thread Affinity
> 
>    A VM performing simple packet forwarding or running complex packet pipelines
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index a14a2eb..c6570ba 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -3149,10 +3149,13 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
>  }
> 
>  static void
> -rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
> +rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr,
> +                      int *all_numa_ids, unsigned all_numa_ids_sz,
> +                      int *num_ids_written)
>  {
>      struct dp_netdev_pmd_thread *pmd;
>      struct rr_numa *numa;
> +    unsigned idx = 0;
> 
>      hmap_init(&rr->numas);
> 
> @@ -3170,7 +3173,11 @@ rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
>          numa->n_pmds++;
>          numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
>          numa->pmds[numa->n_pmds - 1] = pmd;
> +
> +        all_numa_ids[idx % all_numa_ids_sz] = pmd->numa_id;
> +        idx++;
>      }
> +    *num_ids_written = idx;
>  }
> 
>  static struct dp_netdev_pmd_thread *
> @@ -3202,8 +3209,15 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
>  {
>      struct dp_netdev_port *port;
>      struct rr_numa_list rr;
> +    int all_numa_ids [64];
> +    int all_numa_ids_sz = sizeof all_numa_ids / sizeof all_numa_ids[0];
> +    unsigned all_numa_ids_idx = 0;
> +    int all_numa_ids_max_idx = 0;
> +    int num_numa_ids = 0;
> 
> -    rr_numa_list_populate(dp, &rr);
> +    rr_numa_list_populate(dp, &rr, all_numa_ids, all_numa_ids_sz,
> +                          &num_numa_ids);
> +    all_numa_ids_max_idx = MIN(num_numa_ids - 1, all_numa_ids_sz - 1);
> 
>      HMAP_FOR_EACH (port, node, &dp->ports) {
>          struct rr_numa *numa;
> @@ -3234,10 +3248,24 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
>                  }
>              } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
>                  if (!numa) {
> +                    if (all_numa_ids_max_idx < 0) {
> +                        VLOG_ERR("There are no pmd threads. "
> +                                 "Is pmd-cpu-mask set to zero?");

The VLOG_ERR is a bit general and misleading. A user could set a PMD on a numa node, add a DPDK device local to that node, isolate the PMD for that DPDK port, then add a second DPDK port that is local to the same numa node. This will trigger the ERR above.

Strictly speaking there is a PMD thread set on the numa node, but it has been isolated. I would change the message above to include isolation. I would also add the name of the port searching for a PMD to the message, so it is clear which port could not find a thread. Something like below:

if (all_numa_ids_max_idx < 0) {
    VLOG_ERR("There are no available pmd threads for %s. "
             "Is pmd-cpu-mask set to zero or isolated?",
             netdev_get_name(port->netdev));
    continue;
}

> +                        continue;
> +                    }
>                      VLOG_WARN("There's no available (non isolated) pmd thread "
>                                "on numa node %d. Queue %d on port \'%s\' will "
> -                              "not be polled.",
> -                              numa_id, qid, netdev_get_name(port->netdev));
> +                              "be assigned to a pmd on numa node %d. Expect "
> +                              "reduced performance.",
> +                              numa_id, qid, netdev_get_name(port->netdev),
> +                              all_numa_ids[all_numa_ids_idx]);

The warning above will say that queue 0 will be assigned to a pmd on the other numa node.
However, it's not just queue 0 that will be assigned but all the unpinned queues associated with the port (see 'q->pmd = rr_numa_get_pmd(numa)' below).

I'm thinking the warning should reflect this so users don't assume only queue 0 has been assigned to the non-local numa pmd.

> +                    numa_id = all_numa_ids[all_numa_ids_idx];
> +                    numa = rr_numa_list_lookup(&rr, numa_id);
> +                    q->pmd = rr_numa_get_pmd(numa);
> +                    all_numa_ids_idx++;
> +                    if (all_numa_ids_idx > all_numa_ids_max_idx) {
> +                        all_numa_ids_idx = 0;
> +                    }
>                  } else {
>                      q->pmd = rr_numa_get_pmd(numa);
>                  }
> --
> 2.7.4
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Billy O'Mahony April 25, 2017, 10:15 a.m. UTC | #2
Thanks Ian,

I'll incorporate these suggestions in a new patch version. 

Cheers,
Billy. 

Billy O'Mahony May 3, 2017, 1:58 p.m. UTC | #3
Hi Ian,

Sorry for the delay responding. Some comments below.

> -----Original Message-----
> From: ovs-dev-bounces@openvswitch.org [mailto:ovs-dev-
> bounces@openvswitch.org] On Behalf Of O Mahony, Billy
> Sent: Tuesday, April 25, 2017 11:15 AM
> To: Stokes, Ian <ian.stokes@intel.com>; dev@openvswitch.org
> Subject: Re: [ovs-dev] [PATCH v2] dpif-netdev: Assign ports to pmds on non-
> local numa node.
> 
> > > +  .. note::
> > > +   On NUMA systems PCI devices are also local to a NUMA node.  Rx queues for
> > > +   PCI device will assigned to a pmd on it's local NUMA node if pmd-cpu-mask
> > > +   has created a pmd thread on that NUMA node.  If not the queue will be
> > > +   assigned to a pmd on a remote NUMA node.  This will result in reduced
> > > +   maximum throughput on that device.  In the case such a queue assingment
> >
> > Typo in assignment.
[[BO'M]] Thanks.
> >
> > >              } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
> > >                  if (!numa) {
> > > +                    if (all_numa_ids_max_idx < 0) {
> > > +                        VLOG_ERR("There are no pmd threads. "
> > > +                                 "Is pmd-cpu-mask set to zero?");
> >
> > The VLOG_ERR is a bit general and misleading. A user could set a PMD
> > on a numa node, add a DPDK device local to that node, isolate the PMD
> > for that DPDK port, then add a second DPDK port that is local to the
> > same numa node. This will trigger the ERR above.
> >
> > Strictly speaking there is a PMD thread set on the numa node but it
> > has been isolated. I would change the message above to include
> > isolation. I would also add the name of the port searching for a PMD
> > to the message so it is clear which port could not find a thread.
> > Something like below
> >
> > if (all_numa_ids_max_idx < 0) {
> >     VLOG_ERR("There are no available pmd threads for %s. "
> >              "Is pmd-cpu-mask set to zero or isolated?",
> >              netdev_get_name(port->netdev));
> >
[[BO'M]] I'll change this in v3.
> > > +                        continue;
> > > +                    }
> > >                      VLOG_WARN("There's no available (non isolated) pmd thread "
> > >                                "on numa node %d. Queue %d on port \'%s\' will "
> > > -                              "not be polled.",
> > > -                              numa_id, qid, netdev_get_name(port->netdev));
> > > +                              "be assigned to a pmd on numa node %d. Expect "
> > > +                              "reduced performance.",
> > > +                              numa_id, qid, netdev_get_name(port->netdev),
> > > +                              all_numa_ids[all_numa_ids_idx]);
> >
> > The warning above will say that queue 0 will be assigned to a pmd on
> > the other numa node. However, it's not just queue 0 that will be
> > assigned but all the unpinned queues associated with the port
> > (see 'q->pmd = rr_numa_get_pmd(numa)' below).
> >
> > I'm thinking the warning should reflect this so users don't assume
> > only queue 0 has been assigned to the non-local numa pmd.
[[BO'M]] AFAIK (though I didn't test multiqueue explicitly) pmd
assignment is done on a per-queue basis as opposed to a per-port basis,
so if there are two queues for dpdk0, say, then this warning should be
written twice: once for queue 0 and once for queue 1. (Assuming that
there is no PMD for dpdk0 on the local NUMA node.)

I'll verify this is the behavior of the patch when using multiple queues and report back to the ML.
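For reference, the per-queue behavior follows from the shape of the scheduling loop; the following is a rough paraphrase of the surrounding rxq_scheduling() loop in lib/dpif-netdev.c (a sketch for illustration, not the exact code):

    HMAP_FOR_EACH (port, node, &dp->ports) {
        int numa_id = netdev_get_numa_id(port->netdev);
        struct rr_numa *numa = rr_numa_list_lookup(&rr, numa_id);

        /* Each rx queue of the port is scheduled individually, so every
         * unpinned queue with no local pmd reaches the fallback branch
         * (and its warning) on its own. */
        for (int qid = 0; qid < port->n_rxq; qid++) {
            struct dp_netdev_rxq *q = &port->rxqs[qid];

            if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
                /* ... per-queue pmd selection happens here ... */
            }
        }
    }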
 
Stokes, Ian May 4, 2017, 1:18 p.m. UTC | #4
> [[BO'M]] AFAIK (though I didn't test multiqueue explicitly) pmd assignment
> is done on a per queue basis as opposed to a per port basis so if there
> are two queues for dpdk0 say then this error should be written twice. Once
> for queue 0 and once for queue 1. (Assuming that there is no PMD for dpdk0
> on the local NUMA node).
> 
> I'll verify this is the behavior of the patch when using multiple queues
> and report back to the ML.
> 

Hi Billy, you're right regarding the queue pinning behavior.

I've tested two MQ cases:

Case 1

PMD mask set to 0x4 (Core 2 on numa node 0).

Add DPDK port dpdk0 with multiple queues (Queues will all be assigned to PMD on core 2).

Change the PMD mask to 0x30004 (Core 2 on numa node 0, cores 16 & 17 on numa node 1).

Pin queue 0 of dpdk0 to core 2.

In this case the remaining queues of dpdk0 will be distributed to cores 16 & 17, but we only see a warning for 1 queue. A user would have to check the distribution using ovs-appctl dpif-netdev/pmd-rxq-show.

Case 2

Adding a second DPDK port to the scenario above, its queues will be distributed to multiple PMDs on numa node 1, but only 1 warning is flagged.

It might make sense to also mention in the warning the pmd core the queue has been distributed to. It could help a user debug performance issues from the logs.
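For illustration, a sketch of what that amended warning could look like (an assumption about a possible v3, not the committed change) — the remote pmd is chosen first so its core can be named for each queue:

    numa_id = all_numa_ids[all_numa_ids_idx];
    numa = rr_numa_list_lookup(&rr, numa_id);
    q->pmd = rr_numa_get_pmd(numa);
    /* Hypothetical v3-style message naming the destination core and node. */
    VLOG_WARN("There's no available (non isolated) pmd thread on numa node "
              "%d. Queue %d on port \'%s\' will be assigned to the pmd on "
              "core %d (numa node %d). Expect reduced performance.",
              netdev_get_numa_id(port->netdev), qid,
              netdev_get_name(port->netdev),
              q->pmd->core_id, q->pmd->numa_id);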

Billy O'Mahony May 4, 2017, 1:21 p.m. UTC | #5
Hi Ian,

> -----Original Message-----
> From: Stokes, Ian
> Sent: Thursday, May 4, 2017 2:19 PM
> To: O Mahony, Billy <billy.o.mahony@intel.com>; dev@openvswitch.org
> Subject: RE: [ovs-dev] [PATCH v2] dpif-netdev: Assign ports to pmds on non-
> local numa node.
> 
> Hi Billy, you're right regarding the queue pinning behavior.
> 
> I've tested two MQ cases:
> 
> Case 1
> 
> PMD mask set to 0x4 (Core 2 on numa node 0).
> 
> Add DPDK port dpdk0 with multiple queues (Queues will all be assigned to
> PMD on core 2).
> 
> Change the PMD mask to 0x30004 (Core 2 on numa node 0, cores 16 & 17 on
> numa node 1).
> 
> Pin queue 0 of dpdk0 to core 2.
> 
> In this case the remaining queues of dpdk0 will be distributed to cores 16 &
> 17. But we only see a warning for 1 queue. A user would have to check the
> distribution using ovs-appctl dpif-netdev/pmd-rxq-show.
> 
> Case 2
> 
> Adding a second DPDK port to the scenario above, its queues will be distributed
> to multiple PMDs on numa node 1 but only 1 warning is flagged.
[[BO'M]] Yep, I'm seeing the same thing. Good spot. Will include a fix in v3.
> 
> It might make sense to also mention the pmd core the queue has been
> distributed to in the warning? It could help with debugging performance
> issues for a user from the logs?
[[BO'M]] Will include this also.
Cheers,
Billy

Patch

diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst
index b947bd5..0e0b9f0 100644
--- a/Documentation/intro/install/dpdk.rst
+++ b/Documentation/intro/install/dpdk.rst
@@ -450,6 +450,16 @@  affinitized accordingly.
     pmd thread on a NUMA node is only created if there is at least one DPDK
     interface from that NUMA node added to OVS.
 
+  .. note::
+   On NUMA systems PCI devices are also local to a NUMA node.  Rx queues for
+   PCI device will assigned to a pmd on it's local NUMA node if pmd-cpu-mask
+   has created a pmd thread on that NUMA node.  If not the queue will be
+   assigned to a pmd on a remote NUMA node.  This will result in reduced
+   maximum throughput on that device.  In the case such a queue assingment
+   is made a warning message will be logged: "There's no available
+   (non isolated) pmd thread on numa node N. Queue Q on port P will be assigned
+   to a pmd on numa node M. Expect reduced performance."
+
 - QEMU vCPU thread Affinity
 
   A VM performing simple packet forwarding or running complex packet pipelines
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index a14a2eb..c6570ba 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3149,10 +3149,13 @@  rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
 }
 
 static void
-rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
+rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr,
+                      int *all_numa_ids, unsigned all_numa_ids_sz,
+                      int *num_ids_written)
 {
     struct dp_netdev_pmd_thread *pmd;
     struct rr_numa *numa;
+    unsigned idx = 0;
 
     hmap_init(&rr->numas);
 
@@ -3170,7 +3173,11 @@  rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
         numa->n_pmds++;
         numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
         numa->pmds[numa->n_pmds - 1] = pmd;
+
+        all_numa_ids[idx % all_numa_ids_sz] = pmd->numa_id;
+        idx++;
     }
+    *num_ids_written = idx;
 }
 
 static struct dp_netdev_pmd_thread *
@@ -3202,8 +3209,15 @@  rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
 {
     struct dp_netdev_port *port;
     struct rr_numa_list rr;
+    int all_numa_ids [64];
+    int all_numa_ids_sz = sizeof all_numa_ids / sizeof all_numa_ids[0];
+    unsigned all_numa_ids_idx = 0;
+    int all_numa_ids_max_idx = 0;
+    int num_numa_ids = 0;
 
-    rr_numa_list_populate(dp, &rr);
+    rr_numa_list_populate(dp, &rr, all_numa_ids, all_numa_ids_sz,
+                          &num_numa_ids);
+    all_numa_ids_max_idx = MIN(num_numa_ids - 1, all_numa_ids_sz - 1);
 
     HMAP_FOR_EACH (port, node, &dp->ports) {
         struct rr_numa *numa;
@@ -3234,10 +3248,24 @@  rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
                 }
             } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
                 if (!numa) {
+                    if (all_numa_ids_max_idx < 0) {
+                        VLOG_ERR("There are no pmd threads. "
+                                 "Is pmd-cpu-mask set to zero?");
+                        continue;
+                    }
                     VLOG_WARN("There's no available (non isolated) pmd thread "
                               "on numa node %d. Queue %d on port \'%s\' will "
-                              "not be polled.",
-                              numa_id, qid, netdev_get_name(port->netdev));
+                              "be assigned to a pmd on numa node %d. Expect "
+                              "reduced performance.",
+                              numa_id, qid, netdev_get_name(port->netdev),
+                              all_numa_ids[all_numa_ids_idx]);
+                    numa_id = all_numa_ids[all_numa_ids_idx];
+                    numa = rr_numa_list_lookup(&rr, numa_id);
+                    q->pmd = rr_numa_get_pmd(numa);
+                    all_numa_ids_idx++;
+                    if (all_numa_ids_idx > all_numa_ids_max_idx) {
+                        all_numa_ids_idx = 0;
+                    }
                 } else {
                     q->pmd = rr_numa_get_pmd(numa);
                 }
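As a closing illustration, here is a minimal, self-contained sketch of the round-robin fallback the patch implements (illustrative only; the names mirror the patch, everything else is simplified):

    #include <stdio.h>

    #define MAX_IDS 64
    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int
    main(void)
    {
        /* Stand-in for rr_numa_list_populate(): one entry per pmd thread.
         * Here: two pmds on numa node 0 and one pmd on numa node 1, so
         * node 0 receives twice the share of fallback queues. */
        int all_numa_ids[MAX_IDS];
        int num_numa_ids = 0;

        all_numa_ids[num_numa_ids++] = 0;
        all_numa_ids[num_numa_ids++] = 0;
        all_numa_ids[num_numa_ids++] = 1;

        int max_idx = MIN(num_numa_ids - 1, MAX_IDS - 1);
        unsigned idx = 0;

        if (max_idx < 0) {
            /* Corresponds to the patch's VLOG_ERR: no pmds exist at all. */
            fprintf(stderr, "no pmd threads; is pmd-cpu-mask set to zero?\n");
            return 1;
        }

        /* Four queues on a numa node with no usable pmd are spread
         * round-robin over the pmd numa ids: 0, 0, 1, 0. */
        for (int qid = 0; qid < 4; qid++) {
            printf("queue %d -> pmd on numa node %d\n", qid, all_numa_ids[idx]);
            idx++;
            if (idx > (unsigned) max_idx) {
                idx = 0;
            }
        }
        return 0;
    }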