
[ovs-dev,v6] dpif-netdev: Assign ports to pmds on non-local numa node.

Message ID 96db6de9-111b-6d81-60fd-b91a68577239@samsung.com
State Not Applicable

Commit Message

Ilya Maximets June 26, 2017, 1:48 p.m. UTC
I don't like the implementation. The bunch of these all_numa_ids*
variables looks completely unreadable.
'all_numa_ids[64]' contains the same numa_ids repeatedly and will be
overwritten from the start if we have more than 64 PMD threads =>
possibly broken logic and mapping of all the non-local ports to the
same node.
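
To see the overwrite concretely, here is a minimal standalone sketch of
the effect (hypothetical PMD counts and NUMA layout, not OVS code): with
80 PMD threads split evenly across two nodes, the 64-entry window ends
up holding a skewed 24/40 split instead of the true 40/40.

>------------------------------------------------------------------<
#include <stdio.h>

int main(void)
{
    int all_numa_ids[64];
    const int n_pmds = 80;              /* assumption: 80 PMD threads */
    int counts[2] = {0, 0};

    for (int idx = 0; idx < n_pmds; idx++) {
        /* assumption: PMDs 0..39 on node 0, PMDs 40..79 on node 1 */
        int numa_id = (idx < 40) ? 0 : 1;
        /* the same modulo store as in the patch: once idx reaches 64,
         * slots 0..15 are silently overwritten */
        all_numa_ids[idx % 64] = numa_id;
    }
    for (int i = 0; i < 64; i++) {
        counts[all_numa_ids[i]]++;
    }
    /* prints "node 0: 24 entries, node 1: 40 entries" */
    printf("node 0: %d entries, node 1: %d entries\n",
           counts[0], counts[1]);
    return 0;
}
>------------------------------------------------------------------<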

Also, the main concern is that we already have all the required
information about NUMA nodes in 'rr_numa_list rr'. All we need is
to properly iterate over it.


How about something like this (not fully tested):

>------------------------------------------------------------------<
[inline diff stripped by the archive; see the "Patch" section at the
bottom of this page for the full suggested change]
>------------------------------------------------------------------<


Best regards, Ilya Maximets.

On 13.06.2017 12:01, O Mahony, Billy wrote:
> Hi All,
> 
> Does anyone else have any comments on this patch?
> 
> I'm adding Ilya and Jan to the CC as I believe you both had comments on this previously. Apologies if I've forgotten anyone else that commented from the CC!
> 
> Regards,
> /Billy
> 
>> -----Original Message-----
>> From: Stokes, Ian
>> Sent: Thursday, May 11, 2017 12:09 PM
>> To: O Mahony, Billy <billy.o.mahony@intel.com>; dev@openvswitch.org
>> Subject: RE: [ovs-dev] [PATCH v6] dpif-netdev: Assign ports to pmds on non-
>> local numa node.
>>
>>> Previously if there is no available (non-isolated) pmd on the numa
>>> node for a port then the port is not polled at all. This can result in
>>> a non-operational system until such time as NICs are physically
>>> repositioned. It is preferable to operate with a pmd on the 'wrong'
>>> numa node albeit with lower performance. Local pmds are still chosen
>>> when available.
>>>
>>> Signed-off-by: Billy O'Mahony <billy.o.mahony@intel.com>
>>> ---
>>> v6: Change 'port' to 'queue' in a warning msg
>>> v5: Fix warning msg; Update same in docs
>>> v4: Fix a checkpatch error
>>> v3: Fix warning messages not appearing when using multiqueue
>>> v2: Add details of warning messages into docs
>>>
>>>  Documentation/intro/install/dpdk.rst | 10 +++++++++
>>>  lib/dpif-netdev.c                    | 43 +++++++++++++++++++++++++++++++-----
>>>  2 files changed, 48 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst
>>> index d1c0e65..7a66bff 100644
>>> --- a/Documentation/intro/install/dpdk.rst
>>> +++ b/Documentation/intro/install/dpdk.rst
>>> @@ -460,6 +460,16 @@ affinitized accordingly.
>>>      pmd thread on a NUMA node is only created if there is at least one DPDK
>>>      interface from that NUMA node added to OVS.
>>>
>>> +  .. note::
>>> +   On NUMA systems PCI devices are also local to a NUMA node.  Rx queues for
>>> +   PCI device will assigned to a pmd on it's local NUMA node if pmd-cpu-mask
>>> +   has created a pmd thread on that NUMA node.  If not the queue will be
>>> +   assigned to a pmd on a remote NUMA node.  This will result in reduced
>>> +   maximum throughput on that device.  In the case such a queue assignment
>>> +   is made a warning message will be logged: "There's no available (non-
>>> +   isolated) pmd thread on numa node N. Queue Q on port P will be assigned to
>>> +   the pmd on core C (numa node N'). Expect reduced performance."
>>> +
>>>  - QEMU vCPU thread Affinity
>>>
>>>    A VM performing simple packet forwarding or running complex packet pipelines
>>>
>>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>>> index b3a0806..34f1963 100644
>>> --- a/lib/dpif-netdev.c
>>> +++ b/lib/dpif-netdev.c
>>> @@ -3149,10 +3149,13 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
>>>  }
>>>
>>>  static void
>>> -rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
>>> +rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr,
>>> +                      int *all_numa_ids, unsigned all_numa_ids_sz,
>>> +                      int *num_ids_written)
>>>  {
>>>      struct dp_netdev_pmd_thread *pmd;
>>>      struct rr_numa *numa;
>>> +    unsigned idx = 0;
>>>
>>>      hmap_init(&rr->numas);
>>>
>>> @@ -3170,7 +3173,11 @@ rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
>>>          numa->n_pmds++;
>>>          numa->pmds = xrealloc(numa->pmds, numa->n_pmds * sizeof *numa->pmds);
>>>          numa->pmds[numa->n_pmds - 1] = pmd;
>>> +
>>> +        all_numa_ids[idx % all_numa_ids_sz] = pmd->numa_id;
>>> +        idx++;
>>>      }
>>> +    *num_ids_written = idx;
>>>  }
>>>
>>>  static struct dp_netdev_pmd_thread *
>>> @@ -3202,8 +3209,15 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
>>>  {
>>>      struct dp_netdev_port *port;
>>>      struct rr_numa_list rr;
>>> +    int all_numa_ids [64];
>>> +    int all_numa_ids_sz = sizeof all_numa_ids / sizeof all_numa_ids[0];
>>> +    unsigned all_numa_ids_idx = 0;
>>> +    int all_numa_ids_max_idx = 0;
>>> +    int num_numa_ids = 0;
>>>
>>> -    rr_numa_list_populate(dp, &rr);
>>> +    rr_numa_list_populate(dp, &rr, all_numa_ids, all_numa_ids_sz,
>>> +                          &num_numa_ids);
>>> +    all_numa_ids_max_idx = MIN(num_numa_ids - 1, all_numa_ids_sz - 1);
>>>
>>>      HMAP_FOR_EACH (port, node, &dp->ports) {
>>>          struct rr_numa *numa;
>>> @@ -3234,10 +3248,29 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
>>>                  }
>>>              } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
>>>                  if (!numa) {
>>> -                    VLOG_WARN("There's no available (non isolated) pmd thread "
>>> +                    if (all_numa_ids_max_idx < 0) {
>>> +                        VLOG_ERR("There is no available (non-isolated) pmd "
>>> +                                 "thread for port \'%s\' queue %d. This queue "
>>> +                                 "will not be polled. Is pmd-cpu-mask set to "
>>> +                                 "zero? Or are all PMDs isolated to other "
>>> +                                 "queues?", netdev_get_name(port->netdev),
>>> +                                 qid);
>>> +                        continue;
>>> +                    }
>>> +                    int alt_numa_id = all_numa_ids[all_numa_ids_idx];
>>> +                    struct rr_numa *alt_numa;
>>> +                    alt_numa = rr_numa_list_lookup(&rr, alt_numa_id);
>>> +                    q->pmd = rr_numa_get_pmd(alt_numa);
>>> +                    VLOG_WARN("There's no available (non-isolated) pmd thread "
>>>                                "on numa node %d. Queue %d on port \'%s\' will "
>>> -                              "not be polled.",
>>> -                              numa_id, qid, netdev_get_name(port->netdev));
>>> +                              "be assigned to the pmd on core %d "
>>> +                              "(numa node %d). Expect reduced performance.",
>>> +                              numa_id, qid, netdev_get_name(port->netdev),
>>> +                              q->pmd->core_id, q->pmd->numa_id);
>>> +                    all_numa_ids_idx++;
>>> +                    if (all_numa_ids_idx > all_numa_ids_max_idx) {
>>> +                        all_numa_ids_idx = 0;
>>> +                    }
>>>                  } else {
>>>                      q->pmd = rr_numa_get_pmd(numa);
>>>                  }
>>> --
>>> 2.7.4
>>>
>>
>> Thanks Billy, LGTM and tested ok.
>>
>> Acked-by: Ian Stokes <ian.stokes@intel.com>
>>
>>> _______________________________________________
>>> dev mailing list
>>> dev@openvswitch.org
>>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Comments

Billy O'Mahony June 26, 2017, 2:49 p.m. UTC | #1
Hi Ilya,

Thanks for further reviewing this patch, which we had previously discussed in February https://mail.openvswitch.org/pipermail/ovs-dev/2017-February/329182.html. 

You are suggesting adding an iterator function to identify the non-isolated cores as the assignment progresses, rather than building a full list up front. That would be much cleaner. I'll have a look and use that idea if it's suitable (I think it will be).

Some more comments below.

Regards,
/Billy.

> -----Original Message-----
> From: Ilya Maximets [mailto:i.maximets@samsung.com]
> Sent: Monday, June 26, 2017 2:49 PM
> To: O Mahony, Billy <billy.o.mahony@intel.com>; dev@openvswitch.org
> Cc: Stokes, Ian <ian.stokes@intel.com>; 'Jan Scheurich'
> <jan.scheurich@ericsson.com>; Darrell Ball <dball@vmware.com>
> Subject: Re: [ovs-dev] [PATCH v6] dpif-netdev: Assign ports to pmds on non-
> local numa node.
> 
> I don't like the implementation. The bunch of these all_numa_ids* variables
> looks completely unreadable.
> 'all_numa_ids[64]' contains the same numa_ids repeatedly and will be
> overwritten from the start if we have more than 64 PMD threads =>
> possibly broken logic and mapping of all the non-local ports to the same node.

 [[BO'M]]
The wrap-around of all_numa_ids is there to assign queues to non-local numa nodes in a fair fashion. So if one non-local numa node has many PMDs and another has few, the queues with no local PMD are mainly assigned to the former rather than the latter.

As thread counts increase over the coming years, or on pure packet-switch devices (with many PMDs), and depending on precisely how rr_numa_list is populated (i.e. whether it is in numa id order), a limited (i.e. wrapped-around) list of alternative threads does mean there could be situations where the allocation of queues to non-local PMDs is unfair. Though nothing as drastic as broken logic.
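
As a toy illustration of that weighting (hypothetical layout, not OVS
code): if node 0 has six non-isolated PMDs and node 1 has two, then
all_numa_ids holds node 0 six times and node 1 twice, so cycling over it
assigns roughly three queues to node 0 for every one to node 1.

>------------------------------------------------------------------<
#include <stdio.h>

int main(void)
{
    /* assumption: node 0 has 6 PMDs, node 1 has 2 */
    int all_numa_ids[] = {0, 0, 0, 0, 0, 0, 1, 1};
    int n = sizeof all_numa_ids / sizeof all_numa_ids[0];
    int counts[2] = {0, 0};

    /* 16 queues whose local node has no usable PMD */
    for (int q = 0; q < 16; q++) {
        counts[all_numa_ids[q % n]]++;
    }
    /* prints "node 0: 12 queues, node 1: 4 queues" */
    printf("node 0: %d queues, node 1: %d queues\n",
           counts[0], counts[1]);
    return 0;
}
>------------------------------------------------------------------<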

> 
> Also, the main concern is that we already have all the required information
> about NUMA nodes in 'rr_numa_list rr'. All we need is to properly iterate
> over it.
> 
> 
> How about something like this (not fully tested):
> 
> >------------------------------------------------------------------<
> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index 4e29085..d17d7e4 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -3195,6 +3195,23 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
>      return NULL;
>  }
>
> +/* Returns next NUMA from rr list in round-robin fashion. Returns the first
> + * NUMA node if 'NULL' or the last node passed, and 'NULL' if list is empty. */
> +static struct rr_numa *
> +rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
> +{
> +    struct hmap_node *node = NULL;
> +
> +    if (numa) {
> +        node = hmap_next(&rr->numas, &numa->node);
> +    }
> +    if (!node) {
> +        node = hmap_first(&rr->numas);
> +    }
> +
> +    return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
> +}
> +
>  static void
>  rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
>  {
> @@ -3249,6 +3266,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
>  {
>      struct dp_netdev_port *port;
>      struct rr_numa_list rr;
> +    struct rr_numa *last_used_nonlocal_numa = NULL;
>
>      rr_numa_list_populate(dp, &rr);
>
> @@ -3281,10 +3299,26 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
>                  }
>              } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
>                  if (!numa) {
> -                    VLOG_WARN("There's no available (non isolated) pmd thread "
> +                    numa = rr_numa_list_next(&rr, last_used_nonlocal_numa);
> +                    if (!numa) {
> +                        VLOG_ERR("There is no available (non-isolated) pmd "
> +                                 "thread for port \'%s\' queue %d. This queue "
> +                                 "will not be polled. Is pmd-cpu-mask set to "
> +                                 "zero? Or are all PMDs isolated to other "
> +                                 "queues?", netdev_get_name(port->netdev),
> +                                 qid);
> +                        continue;
> +                    }
> +
> +                    q->pmd = rr_numa_get_pmd(numa);
> +                    VLOG_WARN("There's no available (non-isolated) pmd thread "
>                                "on numa node %d. Queue %d on port \'%s\' will "
> -                              "not be polled.",
> -                              numa_id, qid, netdev_get_name(port->netdev));
> +                              "be assigned to the pmd on core %d "
> +                              "(numa node %d). Expect reduced performance.",
> +                              numa_id, qid, netdev_get_name(port->netdev),
> +                              q->pmd->core_id, q->pmd->numa_id);
> +
> +                    last_used_nonlocal_numa = numa;
>                  } else {
>                      q->pmd = rr_numa_get_pmd(numa);
>                  }
> 
> >------------------------------------------------------------------<
> 
> 
> Best regards, Ilya Maximets.
> 
> [...]
Ilya Maximets June 26, 2017, 3:19 p.m. UTC | #2
On 26.06.2017 17:49, O Mahony, Billy wrote:
> Hi Ilya,
> 
> Thanks for further reviewing this patch, which we had previously discussed in February https://mail.openvswitch.org/pipermail/ovs-dev/2017-February/329182.html. 
> 
> You are suggesting adding an iterator function to identify the non-isolated cores as the assignment progresses, rather than building a full list up front. That would be much cleaner. I'll have a look and use that idea if it's suitable (I think it will be).
> 
> Some more comments below.
> 
> Regards,
> /Billy.
> 
>> -----Original Message-----
>> From: Ilya Maximets [mailto:i.maximets@samsung.com]
>> Sent: Monday, June 26, 2017 2:49 PM
>> To: O Mahony, Billy <billy.o.mahony@intel.com>; dev@openvswitch.org
>> Cc: Stokes, Ian <ian.stokes@intel.com>; 'Jan Scheurich'
>> <jan.scheurich@ericsson.com>; Darrell Ball <dball@vmware.com>
>> Subject: Re: [ovs-dev] [PATCH v6] dpif-netdev: Assign ports to pmds on non-
>> local numa node.
>>
>> I don't like the implementation. The bunch of these all_numa_ids* variables
>> looks completely unreadable.
>> 'all_numa_ids[64]' contains the same numa_ids repeatedly and will be
>> overwritten from the start if we have more than 64 PMD threads =>
>> possibly broken logic and mapping of all the non-local ports to the same node.
> 
>  [[BO'M]]
> The wrap-around of all_numa_ids is there to assign queues to non-local numa
> nodes in a fair fashion. So if one non-local numa node has many PMDs and
> another has few, the queues with no local PMD are mainly assigned to the
> former rather than the latter.

It will not be fair anyway. To make it fair you would have to track the
number of ports assigned to each node's local threads. You may have a
large number of PMD threads on some NUMA node, but all of them may be
highly loaded by their local ports. I see no sense in trying to achieve
fairness your way unless you also calculate each NUMA node's load from
the number of local ports assigned. And even then the distribution will
be unfair, because we don't know how much traffic flows through those
ports and have no way to measure the impact of a non-local port mapping
on a particular PMD thread.
So, I think that a simple round-robin over NUMA nodes is fair enough for
an "Expect reduced performance." configuration. Other solutions look
like overkill.
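
For contrast, a simplified standalone sketch of that per-node
round-robin (a plain array instead of OVS's hmap; names and node count
are hypothetical): each fallback assignment just advances to the next
node, so nodes are visited evenly regardless of how many PMDs each one
has.

>------------------------------------------------------------------<
#include <stdio.h>

#define N_NUMAS 3

/* next node in round-robin order; -1 means "start from the first",
 * loosely mirroring the suggested rr_numa_list_next() */
static int rr_next(int cur)
{
    return (cur < 0 || cur + 1 >= N_NUMAS) ? 0 : cur + 1;
}

int main(void)
{
    int last_used_nonlocal_numa = -1;

    /* 7 queues with no usable local PMD cycle 0, 1, 2, 0, 1, 2, 0 */
    for (int q = 0; q < 7; q++) {
        last_used_nonlocal_numa = rr_next(last_used_nonlocal_numa);
        printf("queue %d -> numa %d\n", q, last_used_nonlocal_numa);
    }
    return 0;
}
>------------------------------------------------------------------<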

> [...]

Patch

diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 4e29085..d17d7e4 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -3195,6 +3195,23 @@  rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id)
     return NULL;
 }
 
+/* Returns next NUMA from rr list in round-robin fashion. Returns the first
+ * NUMA node if 'NULL' or the last node passed, and 'NULL' if list is empty. */
+static struct rr_numa *
+rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa)
+{
+    struct hmap_node *node = NULL;
+
+    if (numa) {
+        node = hmap_next(&rr->numas, &numa->node);
+    }
+    if (!node) {
+        node = hmap_first(&rr->numas);
+    }
+
+    return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL;
+}
+
 static void
 rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr)
 {
@@ -3249,6 +3266,7 @@  rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
 {
     struct dp_netdev_port *port;
     struct rr_numa_list rr;
+    struct rr_numa *last_used_nonlocal_numa = NULL;
 
     rr_numa_list_populate(dp, &rr);
 
@@ -3281,10 +3299,26 @@  rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
                 }
             } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
                 if (!numa) {
-                    VLOG_WARN("There's no available (non isolated) pmd thread "
+                    numa = rr_numa_list_next(&rr, last_used_nonlocal_numa);
+                    if (!numa) {
+                        VLOG_ERR("There is no available (non-isolated) pmd "
+                                 "thread for port \'%s\' queue %d. This queue "
+                                 "will not be polled. Is pmd-cpu-mask set to "
+                                 "zero? Or are all PMDs isolated to other "
+                                 "queues?", netdev_get_name(port->netdev),
+                                 qid);
+                        continue;
+                    }
+
+                    q->pmd = rr_numa_get_pmd(numa);
+                    VLOG_WARN("There's no available (non-isolated) pmd thread "
                               "on numa node %d. Queue %d on port \'%s\' will "
-                              "not be polled.",
-                              numa_id, qid, netdev_get_name(port->netdev));
+                              "be assigned to the pmd on core %d "
+                              "(numa node %d). Expect reduced performance.",
+                              numa_id, qid, netdev_get_name(port->netdev),
+                              q->pmd->core_id, q->pmd->numa_id);
+
+                    last_used_nonlocal_numa = numa;
                 } else {
                     q->pmd = rr_numa_get_pmd(numa);
                 }