@@ -8,8 +8,11 @@ addons:
packages:
- bc
- gcc-multilib
+ - libnuma1
+ - libnuma-dev
- libssl-dev
- llvm-dev
+ - numactl
before_install: ./.travis/prepare.sh
@@ -33,6 +33,11 @@ on Debian/Ubuntu)
`CONFIG_RTE_BUILD_COMBINE_LIBS=y`
+ Optional: Enable NUMA-aware vHost by modifying the following in the same
+ file:
+
+ `CONFIG_RTE_LIBRTE_VHOST_NUMA=y`
+
Then run `make install` to build and install the library.
For default install without IVSHMEM:
@@ -403,7 +408,10 @@ Performance Tuning:
It is good practice to ensure that threads that are in the datapath are
pinned to cores in the same NUMA area. e.g. pmd threads and QEMU vCPUs
- responsible for forwarding.
+ responsible for forwarding. If DPDK is built with
+ CONFIG_RTE_LIBRTE_VHOST_NUMA=y, vHost ports automatically detect the
+ NUMA socket of the QEMU vCPUs and will be serviced by a PMD from the
+ same node, provided a core on this node is enabled in the pmd-cpu-mask.
9. Rx Mergeable buffers
@@ -195,7 +195,7 @@ AC_DEFUN([OVS_CHECK_DPDK], [
found=false
save_LIBS=$LIBS
for extras in "" "-ldl"; do
- LIBS="$DPDK_LIB $extras $save_LIBS $DPDK_EXTRA_LIB"
+ LIBS="$DPDK_LIB $extras $save_LIBS $DPDK_EXTRA_LIB -lnuma"
AC_LINK_IFELSE(
[AC_LANG_PROGRAM([#include <rte_config.h>
#include <rte_eal.h>],
@@ -29,6 +29,7 @@
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
+#include <numaif.h>
#include "dirs.h"
#include "dp-packet.h"
@@ -344,6 +345,9 @@ struct netdev_dpdk {
* netdev_dpdk*_reconfigure() is called */
int requested_n_txq;
int requested_n_rxq;
+
+ /* Socket ID detected when vHost device is brought up */
+ int requested_socket_id;
};
struct netdev_rxq_dpdk {
@@ -706,6 +710,8 @@ netdev_dpdk_init(struct netdev *netdev_, unsigned int port_no,
}
netdev->socket_id = sid < 0 ? SOCKET0 : sid;
+ netdev->requested_socket_id = netdev->socket_id;
+
netdev->port_id = port_no;
netdev->type = type;
netdev->flags = 0;
@@ -2006,6 +2012,8 @@ new_device(struct virtio_net *dev)
{
struct netdev_dpdk *netdev;
bool exists = false;
+ int newnode = 0;
+ long err = 0;
ovs_mutex_lock(&dpdk_mutex);
/* Add device to the vhost port with the same name as that passed down. */
@@ -2019,6 +2027,18 @@ new_device(struct virtio_net *dev)
}
ovsrcu_set(&netdev->virtio_dev, dev);
exists = true;
+
+ /* Get NUMA information */
+ err = get_mempolicy(&newnode, NULL, 0, dev, MPOL_F_NODE | MPOL_F_ADDR);
+ if (err) {
+ VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
+ dev->ifname);
+ newnode = netdev->socket_id;
+ } else if (newnode != netdev->socket_id) {
+ netdev->requested_socket_id = newnode;
+ netdev_request_reconfigure(&netdev->up);
+ }
+
dev->flags |= VIRTIO_DEV_RUNNING;
/* Disable notifications. */
set_irq_status(dev);
@@ -2035,8 +2055,8 @@ new_device(struct virtio_net *dev)
return -1;
}
- VLOG_INFO("vHost Device '%s' %"PRIu64" has been added", dev->ifname,
- dev->device_fh);
+ VLOG_INFO("vHost Device '%s' %"PRIu64" has been added on socket %i",
+ dev->ifname, dev->device_fh, newnode);
return 0;
}
@@ -2615,6 +2635,7 @@ static int
netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev_)
{
struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
+ int err = 0;
ovs_mutex_lock(&dpdk_mutex);
ovs_mutex_lock(&netdev->mutex);
@@ -2622,10 +2643,20 @@ netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev_)
netdev->up.n_txq = netdev->requested_n_txq;
netdev->up.n_rxq = netdev->requested_n_rxq;
+ if (netdev->requested_socket_id != netdev->socket_id) {
+ netdev->socket_id = netdev->requested_socket_id;
+ /* Change mempool to new NUMA Node */
+ dpdk_mp_put(netdev->dpdk_mp);
+ netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, netdev->mtu);
+ if (!netdev->dpdk_mp) {
+ err = ENOMEM;
+ }
+ }
+
ovs_mutex_unlock(&netdev->mutex);
ovs_mutex_unlock(&dpdk_mutex);
- return 0;
+ return err;
}
static int
@@ -54,6 +54,7 @@ BuildRequires: libcap-ng libcap-ng-devel
%endif
%if %{with dpdk}
BuildRequires: dpdk-devel >= 2.2.0
+BuildRequires: numactl numactl-devel numactl-libs
Provides: %{name}-dpdk = %{version}-%{release}
%endif
This commit allows for vHost memory from QEMU, DPDK and OVS, as well as the servicing PMD, to all come from the same socket. DPDK v2.2 introduces a new configuration option: CONFIG_RTE_LIBRTE_VHOST_NUMA. If enabled, DPDK detects the socket from which a vhost device's memory has been allocated by QEMU, and accordingly reallocates device memory managed by DPDK to that same socket. OVS by default sets the socket id of a vhost port to that of the master lcore. This commit introduces the ability to update the socket id of the port if it is detected (during VM boot) that the port memory is not on the default NUMA node. If this is the case, the mempool of the port is also changed to the new node, and a PMD thread currently servicing the port will stop servicing it, in favour of a thread from the new node (if enabled in the CPU mask). Signed-off-by: Ciara Loftus <ciara.loftus@intel.com> --- .travis.yml | 3 +++ INSTALL.DPDK.md | 10 +++++++++- acinclude.m4 | 2 +- lib/netdev-dpdk.c | 37 ++++++++++++++++++++++++++++++++++--- rhel/openvswitch-fedora.spec.in | 1 + 5 files changed, 48 insertions(+), 5 deletions(-)