Message ID | 1498578394-4505-2-git-send-email-billy.o.mahony@intel.com |
---|---|
State | Superseded |
Headers | show |
On 27.06.2017 18:46, Billy O'Mahony wrote: > Previously if there is no available (non-isolated) pmd on the numa node > for a port then the port is not polled at all. This can result in a > non-operational system until such time as nics are physically > repositioned. It is preferable to operate with a pmd on the 'wrong' numa > node albeit with lower performance. Local pmds are still chosen when > available. > > Signed-off-by: Billy O'Mahony <billy.o.mahony@intel.com> > Signed-off-by: Ilya Maximets <i.maximets@samsung.com> > Co-authored-by: Ilya Maximets <i.maximets@samsung.com> > --- > v7: Incorporate review comments on docs and implementation > v6: Change 'port' to 'queue' in a warning msg > v5: Fix warning msg; Update same in docs > v4: Fix a checkpatch error > v3: Fix warning messages not appearing when using multiqueue > v2: Add details of warning messages into docs > > Documentation/intro/install/dpdk.rst | 18 +++++++++++++--- > lib/dpif-netdev.c | 42 ++++++++++++++++++++++++++++++++---- > 2 files changed, 53 insertions(+), 7 deletions(-) > > diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst > index e83f852..a760fb6 100644 > --- a/Documentation/intro/install/dpdk.rst > +++ b/Documentation/intro/install/dpdk.rst > @@ -449,7 +449,7 @@ affinitized accordingly. > > A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces > assigned to it. A pmd thread shall poll the ports for incoming packets, > - switch the packets and send to tx port. pmd thread is CPU bound, and needs > + switch the packets and send to tx port. A pmd thread is CPU bound, and needs > to be affinitized to isolated cores for optimum performance. > > By setting a bit in the mask, a pmd thread is created and pinned to the > @@ -458,8 +458,20 @@ affinitized accordingly. > $ ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x4 > > .. note:: > - pmd thread on a NUMA node is only created if there is at least one DPDK > - interface from that NUMA node added to OVS. > + While pmd threads are created based on pmd-cpu-mask, the thread only starts > + consuming CPU cycles if there is least one receive queue assigned to the > + pmd. > + > + .. note:: > + > + On NUMA systems PCI devices are also local to a NUMA node. Unbound Rx > + queues for PCI device will assigned to a pmd on it's local NUMA node if > + pmd-cpu-mask has created a pmd thread on that NUMA node. If not the queue > + will be assigned to a pmd on a remote NUMA node. This will result in > + reduced maximum throughput on that device. And possibly on other devices assigned to that pmd thread. > In case such a queue assignment > + is made a warning message will be logged: "There's no available (non- > + isolated) pmd thread on numa node N. Queue Q on port P will be assigned to > + the pmd on core C (numa node N'). Expect reduced performance." > > - QEMU vCPU thread Affinity > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index 4e29085..38a0fd3 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -3195,6 +3195,23 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id) > return NULL; > } > > +/* Returns next NUMA from rr list in round-robin fashion. Returns the first > + * NUMA node if 'NULL' or the last node passed, and 'NULL' if list is empty. */ > +static struct rr_numa * > +rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa) > +{ > + struct hmap_node *node = NULL; > + > + if (numa) { > + node = hmap_next(&rr->numas, &numa->node); > + } > + if (!node) { > + node = hmap_first(&rr->numas); > + } > + > + return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL; > +} > + > static void > rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr) > { > @@ -3249,6 +3266,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) > { > struct dp_netdev_port *port; > struct rr_numa_list rr; > + struct rr_numa * non_local_numa = NULL; '*' should be near to variable. > > rr_numa_list_populate(dp, &rr); > > @@ -3262,7 +3280,6 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) > > numa_id = netdev_get_numa_id(port->netdev); > numa = rr_numa_list_lookup(&rr, numa_id); > - > for (int qid = 0; qid < port->n_rxq; qid++) { > struct dp_netdev_rxq *q = &port->rxqs[qid]; I prefer to keep that blank line. > @@ -3281,11 +3298,28 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) > } > } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) { > if (!numa) { > - VLOG_WARN("There's no available (non isolated) pmd thread " > + /* There are no pmds on the queue's local NUMA node. > + Round-robin on the NUMA nodes that do have pmds. */ > + non_local_numa = rr_numa_list_next(&rr, non_local_numa); > + if (!non_local_numa) { > + VLOG_ERR("There is no available (non-isolated) pmd " > + "thread for port \'%s\' queue %d. This queue " > + "will not be polled. Is pmd-cpu-mask set to " > + "zero? Or are all PMDs isolated to other " > + "queues?", netdev_get_name(port->netdev), > + qid); > + continue; > + } > + q->pmd = rr_numa_get_pmd(non_local_numa); > + VLOG_WARN("There's no available (non-isolated) pmd thread " > "on numa node %d. Queue %d on port \'%s\' will " > - "not be polled.", > - numa_id, qid, netdev_get_name(port->netdev)); > + "be assigned to the pmd on core %d " > + "(numa node %d). Expect reduced performance.", > + numa_id, qid, netdev_get_name(port->netdev), > + q->pmd->core_id, q->pmd->numa_id); > } else { > + /* Assign queue to the next (round-robin) PMD on it's local > + NUMA node. */ > q->pmd = rr_numa_get_pmd(numa); > } > } >
I'll give Darrell a chance to comment before rev'ing. > -----Original Message----- > From: Ilya Maximets [mailto:i.maximets@samsung.com] > Sent: Tuesday, June 27, 2017 5:11 PM > To: O Mahony, Billy <billy.o.mahony@intel.com>; dev@openvswitch.org > Cc: dlu998@gmail.com > Subject: Re: [PATCH v7] dpif-netdev: Assign ports to pmds on non-local numa > node. > > On 27.06.2017 18:46, Billy O'Mahony wrote: > > Previously if there is no available (non-isolated) pmd on the numa > > node for a port then the port is not polled at all. This can result in > > a non-operational system until such time as nics are physically > > repositioned. It is preferable to operate with a pmd on the 'wrong' > > numa node albeit with lower performance. Local pmds are still chosen > > when available. > > > > Signed-off-by: Billy O'Mahony <billy.o.mahony@intel.com> > > Signed-off-by: Ilya Maximets <i.maximets@samsung.com> > > Co-authored-by: Ilya Maximets <i.maximets@samsung.com> > > --- > > v7: Incorporate review comments on docs and implementation > > v6: Change 'port' to 'queue' in a warning msg > > v5: Fix warning msg; Update same in docs > > v4: Fix a checkpatch error > > v3: Fix warning messages not appearing when using multiqueue > > v2: Add details of warning messages into docs > > > > Documentation/intro/install/dpdk.rst | 18 +++++++++++++--- > > lib/dpif-netdev.c | 42 ++++++++++++++++++++++++++++++++-- > -- > > 2 files changed, 53 insertions(+), 7 deletions(-) > > > > diff --git a/Documentation/intro/install/dpdk.rst > > b/Documentation/intro/install/dpdk.rst > > index e83f852..a760fb6 100644 > > --- a/Documentation/intro/install/dpdk.rst > > +++ b/Documentation/intro/install/dpdk.rst > > @@ -449,7 +449,7 @@ affinitized accordingly. > > > > A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces > > assigned to it. A pmd thread shall poll the ports for incoming > > packets, > > - switch the packets and send to tx port. pmd thread is CPU bound, > > and needs > > + switch the packets and send to tx port. A pmd thread is CPU bound, > > + and needs > > to be affinitized to isolated cores for optimum performance. > > > > By setting a bit in the mask, a pmd thread is created and pinned to > > the @@ -458,8 +458,20 @@ affinitized accordingly. > > $ ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x4 > > > > .. note:: > > - pmd thread on a NUMA node is only created if there is at least one > DPDK > > - interface from that NUMA node added to OVS. > > + While pmd threads are created based on pmd-cpu-mask, the thread > only starts > > + consuming CPU cycles if there is least one receive queue assigned to > the > > + pmd. > > + > > + .. note:: > > + > > + On NUMA systems PCI devices are also local to a NUMA node. > Unbound Rx > > + queues for PCI device will assigned to a pmd on it's local NUMA node if > > + pmd-cpu-mask has created a pmd thread on that NUMA node. If not > the queue > > + will be assigned to a pmd on a remote NUMA node. This will result in > > + reduced maximum throughput on that device. > > And possibly on other devices assigned to that pmd thread. > > > In case such a queue assignment > > + is made a warning message will be logged: "There's no available (non- > > + isolated) pmd thread on numa node N. Queue Q on port P will be > assigned to > > + the pmd on core C (numa node N'). Expect reduced performance." > > > > - QEMU vCPU thread Affinity > > > > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index > > 4e29085..38a0fd3 100644 > > --- a/lib/dpif-netdev.c > > +++ b/lib/dpif-netdev.c > > @@ -3195,6 +3195,23 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int > numa_id) > > return NULL; > > } > > > > +/* Returns next NUMA from rr list in round-robin fashion. Returns the > > +first > > + * NUMA node if 'NULL' or the last node passed, and 'NULL' if list is > > +empty. */ static struct rr_numa * rr_numa_list_next(struct > > +rr_numa_list *rr, const struct rr_numa *numa) { > > + struct hmap_node *node = NULL; > > + > > + if (numa) { > > + node = hmap_next(&rr->numas, &numa->node); > > + } > > + if (!node) { > > + node = hmap_first(&rr->numas); > > + } > > + > > + return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL; > > +} > > + > > static void > > rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr) > > { @@ -3249,6 +3266,7 @@ rxq_scheduling(struct dp_netdev *dp, bool > > pinned) OVS_REQUIRES(dp->port_mutex) { > > struct dp_netdev_port *port; > > struct rr_numa_list rr; > > + struct rr_numa * non_local_numa = NULL; > > '*' should be near to variable. > > > > > rr_numa_list_populate(dp, &rr); > > > > @@ -3262,7 +3280,6 @@ rxq_scheduling(struct dp_netdev *dp, bool > > pinned) OVS_REQUIRES(dp->port_mutex) > > > > numa_id = netdev_get_numa_id(port->netdev); > > numa = rr_numa_list_lookup(&rr, numa_id); > > - > > for (int qid = 0; qid < port->n_rxq; qid++) { > > struct dp_netdev_rxq *q = &port->rxqs[qid]; > > I prefer to keep that blank line. > > > @@ -3281,11 +3298,28 @@ rxq_scheduling(struct dp_netdev *dp, bool > pinned) OVS_REQUIRES(dp->port_mutex) > > } > > } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) { > > if (!numa) { > > - VLOG_WARN("There's no available (non isolated) pmd thread " > > + /* There are no pmds on the queue's local NUMA node. > > + Round-robin on the NUMA nodes that do have pmds. */ > > + non_local_numa = rr_numa_list_next(&rr, non_local_numa); > > + if (!non_local_numa) { > > + VLOG_ERR("There is no available (non-isolated) pmd " > > + "thread for port \'%s\' queue %d. This queue " > > + "will not be polled. Is pmd-cpu-mask set to " > > + "zero? Or are all PMDs isolated to other " > > + "queues?", netdev_get_name(port->netdev), > > + qid); > > + continue; > > + } > > + q->pmd = rr_numa_get_pmd(non_local_numa); > > + VLOG_WARN("There's no available (non-isolated) pmd thread " > > "on numa node %d. Queue %d on port \'%s\' will " > > - "not be polled.", > > - numa_id, qid, netdev_get_name(port->netdev)); > > + "be assigned to the pmd on core %d " > > + "(numa node %d). Expect reduced performance.", > > + numa_id, qid, netdev_get_name(port->netdev), > > + q->pmd->core_id, q->pmd->numa_id); > > } else { > > + /* Assign queue to the next (round-robin) PMD on it's local > > + NUMA node. */ > > q->pmd = rr_numa_get_pmd(numa); > > } > > } > >
Let us cover the documentation first On 6/27/17, 8:46 AM, "ovs-dev-bounces@openvswitch.org on behalf of Billy O'Mahony" <ovs-dev-bounces@openvswitch.org on behalf of billy.o.mahony@intel.com> wrote: Previously if there is no available (non-isolated) pmd on the numa node for a port then the port is not polled at all. This can result in a non-operational system until such time as nics are physically repositioned. It is preferable to operate with a pmd on the 'wrong' numa node albeit with lower performance. Local pmds are still chosen when available. Signed-off-by: Billy O'Mahony <billy.o.mahony@intel.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> --- v7: Incorporate review comments on docs and implementation v6: Change 'port' to 'queue' in a warning msg v5: Fix warning msg; Update same in docs v4: Fix a checkpatch error v3: Fix warning messages not appearing when using multiqueue v2: Add details of warning messages into docs Documentation/intro/install/dpdk.rst | 18 +++++++++++++--- lib/dpif-netdev.c | 42 ++++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst index e83f852..a760fb6 100644 --- a/Documentation/intro/install/dpdk.rst +++ b/Documentation/intro/install/dpdk.rst @@ -449,7 +449,7 @@ affinitized accordingly. A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces assigned to it. A pmd thread shall poll the ports for incoming packets, - switch the packets and send to tx port. pmd thread is CPU bound, and needs + switch the packets and send to tx port. A pmd thread is CPU bound, and needs to be affinitized to isolated cores for optimum performance. By setting a bit in the mask, a pmd thread is created and pinned to the @@ -458,8 +458,20 @@ affinitized accordingly. $ ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x4 .. note:: - pmd thread on a NUMA node is only created if there is at least one DPDK - interface from that NUMA node added to OVS. + While pmd threads are created based on pmd-cpu-mask, the thread only starts + consuming CPU cycles if there is least one receive queue assigned to the + pmd. + + .. note:: + + On NUMA systems PCI devices are also local to a NUMA node. Unbound Rx + queues for PCI device will assigned to a pmd on it's local NUMA node if + pmd-cpu-mask has created a pmd thread on that NUMA node. If not the queue + will be assigned to a pmd on a remote NUMA node. This will result in + reduced maximum throughput on that device. In case such a queue assignment + is made a warning message will be logged: "There's no available (non- + isolated) pmd thread on numa node N. Queue Q on port P will be assigned to + the pmd on core C (numa node N'). Expect reduced performance." I think we want to make it clear that PMD threads are created by default. I think we want to stress non-isolated a bit more I also folded in “And possibly on other devices assigned to that pmd thread.” How about: .. note:: A pmd thread on a NUMA node is only created if there is at least one DPDK interface from that NUMA node added to OVS. A pmd thread is created by default on a core of a NUMA node or when a specified pmd-cpu-mask has indicated so. While pmd threads are created based on pmd-cpu-mask, the thread only starts consuming CPU cycles if there is least one receive queue assigned to the pmd. .. note:: On NUMA systems PCI devices are also local to a NUMA node. Unbound Rx queues for a PCI device will assigned to a pmd on it's local NUMA node if a non-isolated PMD exists on that NUMA node. If not, the queue will be assigned to a non-isolated pmd on a remote NUMA node. This will result in reduced maximum throughput on that device and possibly on other devices assigned to that pmd thread. In the case such, a queue assignment is made a warning message will be logged: "There's no available (non-isolated) pmd thread on numa node N. Queue Q on port P will be assigned to the pmd on core C (numa node N'). Expect reduced performance." - QEMU vCPU thread Affinity diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4e29085..38a0fd3 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -3195,6 +3195,23 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id) return NULL; } +/* Returns next NUMA from rr list in round-robin fashion. Returns the first + * NUMA node if 'NULL' or the last node passed, and 'NULL' if list is empty. */ +static struct rr_numa * +rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa) +{ + struct hmap_node *node = NULL; + + if (numa) { + node = hmap_next(&rr->numas, &numa->node); + } + if (!node) { + node = hmap_first(&rr->numas); + } + + return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL; +} + static void rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr) { @@ -3249,6 +3266,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) { struct dp_netdev_port *port; struct rr_numa_list rr; + struct rr_numa * non_local_numa = NULL; rr_numa_list_populate(dp, &rr); @@ -3262,7 +3280,6 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) numa_id = netdev_get_numa_id(port->netdev); numa = rr_numa_list_lookup(&rr, numa_id); - for (int qid = 0; qid < port->n_rxq; qid++) { struct dp_netdev_rxq *q = &port->rxqs[qid]; @@ -3281,11 +3298,28 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) } } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) { if (!numa) { - VLOG_WARN("There's no available (non isolated) pmd thread " + /* There are no pmds on the queue's local NUMA node. + Round-robin on the NUMA nodes that do have pmds. */ + non_local_numa = rr_numa_list_next(&rr, non_local_numa); + if (!non_local_numa) { + VLOG_ERR("There is no available (non-isolated) pmd " + "thread for port \'%s\' queue %d. This queue " + "will not be polled. Is pmd-cpu-mask set to " + "zero? Or are all PMDs isolated to other " + "queues?", netdev_get_name(port->netdev), + qid); + continue; + } + q->pmd = rr_numa_get_pmd(non_local_numa); + VLOG_WARN("There's no available (non-isolated) pmd thread " "on numa node %d. Queue %d on port \'%s\' will " - "not be polled.", - numa_id, qid, netdev_get_name(port->netdev)); + "be assigned to the pmd on core %d " + "(numa node %d). Expect reduced performance.", + numa_id, qid, netdev_get_name(port->netdev), + q->pmd->core_id, q->pmd->numa_id); } else { + /* Assign queue to the next (round-robin) PMD on it's local + NUMA node. */ q->pmd = rr_numa_get_pmd(numa); } } -- 2.7.4 _______________________________________________ dev mailing list dev@openvswitch.org https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwICAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=xwTDizKqmzVLDp7yyP_I5R4aeN3APblBhE9vzyiJ70M&s=WdHF5dNtIutUuGCreAg0s8swpclUxADNrvdgPGNd-Fk&e=
One more suggestion: Change While pmd threads are created based on pmd-cpu-mask, the thread only starts consuming CPU cycles if there is least one receive queue assigned to the pmd. to Even though a PMD thread may exist, the thread only starts consuming CPU cycles if there is least one receive queue assigned to the pmd. On 6/27/17, 10:40 AM, "ovs-dev-bounces@openvswitch.org on behalf of Darrell Ball" <ovs-dev-bounces@openvswitch.org on behalf of dball@vmware.com> wrote: Let us cover the documentation first On 6/27/17, 8:46 AM, "ovs-dev-bounces@openvswitch.org on behalf of Billy O'Mahony" <ovs-dev-bounces@openvswitch.org on behalf of billy.o.mahony@intel.com> wrote: Previously if there is no available (non-isolated) pmd on the numa node for a port then the port is not polled at all. This can result in a non-operational system until such time as nics are physically repositioned. It is preferable to operate with a pmd on the 'wrong' numa node albeit with lower performance. Local pmds are still chosen when available. Signed-off-by: Billy O'Mahony <billy.o.mahony@intel.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com> Co-authored-by: Ilya Maximets <i.maximets@samsung.com> --- v7: Incorporate review comments on docs and implementation v6: Change 'port' to 'queue' in a warning msg v5: Fix warning msg; Update same in docs v4: Fix a checkpatch error v3: Fix warning messages not appearing when using multiqueue v2: Add details of warning messages into docs Documentation/intro/install/dpdk.rst | 18 +++++++++++++--- lib/dpif-netdev.c | 42 ++++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 7 deletions(-) diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst index e83f852..a760fb6 100644 --- a/Documentation/intro/install/dpdk.rst +++ b/Documentation/intro/install/dpdk.rst @@ -449,7 +449,7 @@ affinitized accordingly. A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces assigned to it. A pmd thread shall poll the ports for incoming packets, - switch the packets and send to tx port. pmd thread is CPU bound, and needs + switch the packets and send to tx port. A pmd thread is CPU bound, and needs to be affinitized to isolated cores for optimum performance. By setting a bit in the mask, a pmd thread is created and pinned to the @@ -458,8 +458,20 @@ affinitized accordingly. $ ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x4 .. note:: - pmd thread on a NUMA node is only created if there is at least one DPDK - interface from that NUMA node added to OVS. + While pmd threads are created based on pmd-cpu-mask, the thread only starts + consuming CPU cycles if there is least one receive queue assigned to the + pmd. + + .. note:: + + On NUMA systems PCI devices are also local to a NUMA node. Unbound Rx + queues for PCI device will assigned to a pmd on it's local NUMA node if + pmd-cpu-mask has created a pmd thread on that NUMA node. If not the queue + will be assigned to a pmd on a remote NUMA node. This will result in + reduced maximum throughput on that device. In case such a queue assignment + is made a warning message will be logged: "There's no available (non- + isolated) pmd thread on numa node N. Queue Q on port P will be assigned to + the pmd on core C (numa node N'). Expect reduced performance." I think we want to make it clear that PMD threads are created by default. I think we want to stress non-isolated a bit more I also folded in “And possibly on other devices assigned to that pmd thread.” How about: .. note:: A pmd thread on a NUMA node is only created if there is at least one DPDK interface from that NUMA node added to OVS. A pmd thread is created by default on a core of a NUMA node or when a specified pmd-cpu-mask has indicated so. While pmd threads are created based on pmd-cpu-mask, the thread only starts consuming CPU cycles if there is least one receive queue assigned to the pmd. .. note:: On NUMA systems PCI devices are also local to a NUMA node. Unbound Rx queues for a PCI device will assigned to a pmd on it's local NUMA node if a non-isolated PMD exists on that NUMA node. If not, the queue will be assigned to a non-isolated pmd on a remote NUMA node. This will result in reduced maximum throughput on that device and possibly on other devices assigned to that pmd thread. In the case such, a queue assignment is made a warning message will be logged: "There's no available (non-isolated) pmd thread on numa node N. Queue Q on port P will be assigned to the pmd on core C (numa node N'). Expect reduced performance." - QEMU vCPU thread Affinity diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4e29085..38a0fd3 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -3195,6 +3195,23 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id) return NULL; } +/* Returns next NUMA from rr list in round-robin fashion. Returns the first + * NUMA node if 'NULL' or the last node passed, and 'NULL' if list is empty. */ +static struct rr_numa * +rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa) +{ + struct hmap_node *node = NULL; + + if (numa) { + node = hmap_next(&rr->numas, &numa->node); + } + if (!node) { + node = hmap_first(&rr->numas); + } + + return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL; +} + static void rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr) { @@ -3249,6 +3266,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) { struct dp_netdev_port *port; struct rr_numa_list rr; + struct rr_numa * non_local_numa = NULL; rr_numa_list_populate(dp, &rr); @@ -3262,7 +3280,6 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) numa_id = netdev_get_numa_id(port->netdev); numa = rr_numa_list_lookup(&rr, numa_id); - for (int qid = 0; qid < port->n_rxq; qid++) { struct dp_netdev_rxq *q = &port->rxqs[qid]; @@ -3281,11 +3298,28 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) } } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) { if (!numa) { - VLOG_WARN("There's no available (non isolated) pmd thread " + /* There are no pmds on the queue's local NUMA node. + Round-robin on the NUMA nodes that do have pmds. */ + non_local_numa = rr_numa_list_next(&rr, non_local_numa); + if (!non_local_numa) { + VLOG_ERR("There is no available (non-isolated) pmd " + "thread for port \'%s\' queue %d. This queue " + "will not be polled. Is pmd-cpu-mask set to " + "zero? Or are all PMDs isolated to other " + "queues?", netdev_get_name(port->netdev), + qid); + continue; + } + q->pmd = rr_numa_get_pmd(non_local_numa); + VLOG_WARN("There's no available (non-isolated) pmd thread " "on numa node %d. Queue %d on port \'%s\' will " - "not be polled.", - numa_id, qid, netdev_get_name(port->netdev)); + "be assigned to the pmd on core %d " + "(numa node %d). Expect reduced performance.", + numa_id, qid, netdev_get_name(port->netdev), + q->pmd->core_id, q->pmd->numa_id); } else { + /* Assign queue to the next (round-robin) PMD on it's local + NUMA node. */ q->pmd = rr_numa_get_pmd(numa); } } -- 2.7.4 _______________________________________________ dev mailing list dev@openvswitch.org https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwICAg&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=xwTDizKqmzVLDp7yyP_I5R4aeN3APblBhE9vzyiJ70M&s=WdHF5dNtIutUuGCreAg0s8swpclUxADNrvdgPGNd-Fk&e= _______________________________________________ dev mailing list dev@openvswitch.org https://urldefense.proofpoint.com/v2/url?u=https-3A__mail.openvswitch.org_mailman_listinfo_ovs-2Ddev&d=DwIGaQ&c=uilaK90D4TOVoH58JNXRgQ&r=BVhFA09CGX7JQ5Ih-uZnsw&m=P59HA8d7P4t_kPx0wEBzeqJ6XAZu76mnbn5XoeA83KQ&s=IkAPN_X1NyG62jrEMJoLNPk5t8Pg5_jQ8rljyvsCyV4&e=
diff --git a/Documentation/intro/install/dpdk.rst b/Documentation/intro/install/dpdk.rst index e83f852..a760fb6 100644 --- a/Documentation/intro/install/dpdk.rst +++ b/Documentation/intro/install/dpdk.rst @@ -449,7 +449,7 @@ affinitized accordingly. A poll mode driver (pmd) thread handles the I/O of all DPDK interfaces assigned to it. A pmd thread shall poll the ports for incoming packets, - switch the packets and send to tx port. pmd thread is CPU bound, and needs + switch the packets and send to tx port. A pmd thread is CPU bound, and needs to be affinitized to isolated cores for optimum performance. By setting a bit in the mask, a pmd thread is created and pinned to the @@ -458,8 +458,20 @@ affinitized accordingly. $ ovs-vsctl set Open_vSwitch . other_config:pmd-cpu-mask=0x4 .. note:: - pmd thread on a NUMA node is only created if there is at least one DPDK - interface from that NUMA node added to OVS. + While pmd threads are created based on pmd-cpu-mask, the thread only starts + consuming CPU cycles if there is least one receive queue assigned to the + pmd. + + .. note:: + + On NUMA systems PCI devices are also local to a NUMA node. Unbound Rx + queues for PCI device will assigned to a pmd on it's local NUMA node if + pmd-cpu-mask has created a pmd thread on that NUMA node. If not the queue + will be assigned to a pmd on a remote NUMA node. This will result in + reduced maximum throughput on that device. In case such a queue assignment + is made a warning message will be logged: "There's no available (non- + isolated) pmd thread on numa node N. Queue Q on port P will be assigned to + the pmd on core C (numa node N'). Expect reduced performance." - QEMU vCPU thread Affinity diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 4e29085..38a0fd3 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -3195,6 +3195,23 @@ rr_numa_list_lookup(struct rr_numa_list *rr, int numa_id) return NULL; } +/* Returns next NUMA from rr list in round-robin fashion. Returns the first + * NUMA node if 'NULL' or the last node passed, and 'NULL' if list is empty. */ +static struct rr_numa * +rr_numa_list_next(struct rr_numa_list *rr, const struct rr_numa *numa) +{ + struct hmap_node *node = NULL; + + if (numa) { + node = hmap_next(&rr->numas, &numa->node); + } + if (!node) { + node = hmap_first(&rr->numas); + } + + return (node) ? CONTAINER_OF(node, struct rr_numa, node) : NULL; +} + static void rr_numa_list_populate(struct dp_netdev *dp, struct rr_numa_list *rr) { @@ -3249,6 +3266,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) { struct dp_netdev_port *port; struct rr_numa_list rr; + struct rr_numa * non_local_numa = NULL; rr_numa_list_populate(dp, &rr); @@ -3262,7 +3280,6 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) numa_id = netdev_get_numa_id(port->netdev); numa = rr_numa_list_lookup(&rr, numa_id); - for (int qid = 0; qid < port->n_rxq; qid++) { struct dp_netdev_rxq *q = &port->rxqs[qid]; @@ -3281,11 +3298,28 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) } } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) { if (!numa) { - VLOG_WARN("There's no available (non isolated) pmd thread " + /* There are no pmds on the queue's local NUMA node. + Round-robin on the NUMA nodes that do have pmds. */ + non_local_numa = rr_numa_list_next(&rr, non_local_numa); + if (!non_local_numa) { + VLOG_ERR("There is no available (non-isolated) pmd " + "thread for port \'%s\' queue %d. This queue " + "will not be polled. Is pmd-cpu-mask set to " + "zero? Or are all PMDs isolated to other " + "queues?", netdev_get_name(port->netdev), + qid); + continue; + } + q->pmd = rr_numa_get_pmd(non_local_numa); + VLOG_WARN("There's no available (non-isolated) pmd thread " "on numa node %d. Queue %d on port \'%s\' will " - "not be polled.", - numa_id, qid, netdev_get_name(port->netdev)); + "be assigned to the pmd on core %d " + "(numa node %d). Expect reduced performance.", + numa_id, qid, netdev_get_name(port->netdev), + q->pmd->core_id, q->pmd->numa_id); } else { + /* Assign queue to the next (round-robin) PMD on it's local + NUMA node. */ q->pmd = rr_numa_get_pmd(numa); } }