diff mbox

[ovs-dev,RFC,v2] netdev-dpdk: NUMA Aware vHost

Message ID 1458139015-30599-2-git-send-email-ciara.loftus@intel.com
State Changes Requested
Headers show

Commit Message

Ciara Loftus March 16, 2016, 2:36 p.m. UTC
This commit allows for vHost memory from QEMU, DPDK and OVS, as well
as the servicing PMD, to all come from the same socket.

DPDK v2.2 introduces a new configuration option:
CONFIG_RTE_LIBRTE_VHOST_NUMA. If enabled, DPDK detects the socket
from which a vhost device's memory has been allocated by QEMU, and
accordingly reallocates device memory managed by DPDK to that same
socket.

OVS by default sets the socket id of a vhost port to that of the
master lcore. This commit introduces the ability to update the
socket id of the port if it is detected (during VM boot) that the
port memory is not on the default NUMA node. If this is the case, the
mempool of the port is also changed to the new node, and any PMD
thread currently servicing the port stops doing so, in favour of a
thread from the new node (if one is enabled in the CPU mask).

Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>
---
 .travis.yml                     |  3 +++
 INSTALL.DPDK.md                 | 10 +++++++++-
 acinclude.m4                    |  2 +-
 lib/netdev-dpdk.c               | 37 ++++++++++++++++++++++++++++++++++---
 rhel/openvswitch-fedora.spec.in |  1 +
 5 files changed, 48 insertions(+), 5 deletions(-)

Comments

Ben Pfaff March 30, 2016, 5:50 p.m. UTC | #1
On Wed, Mar 16, 2016 at 02:36:55PM +0000, Ciara Loftus wrote:
> This commit allows for vHost memory from QEMU, DPDK and OVS, as well
> as the servicing PMD, to all come from the same socket.
> 
> DPDK v2.2 introduces a new configuration option:
> CONFIG_RTE_LIBRTE_VHOST_NUMA. If enabled, DPDK detects the socket
> from which a vhost device's memory has been allocated by QEMU, and
> accordingly reallocates device memory managed by DPDK to that same
> socket.
> 
> OVS by default sets the socket id of a vhost port to that of the
> master lcore. This commit introduces the ability to update the
> socket id of the port if it is detected (during VM boot) that the
> port memory is not on the default NUMA node. If this is the case, the
> mempool of the port is also changed to the new node, and any PMD
> thread currently servicing the port stops doing so, in favour of a
> thread from the new node (if one is enabled in the CPU mask).
> 
> Signed-off-by: Ciara Loftus <ciara.loftus@intel.com>

The change below to acinclude.m4 makes it appear that libnuma is now
mandatory for building with DPDK support, but I did not notice any
update to INSTALL.DPDK.md to explain that the user must now install
libnuma.

> diff --git a/acinclude.m4 b/acinclude.m4
> index 74f0494..e06ace9 100644
> --- a/acinclude.m4
> +++ b/acinclude.m4
> @@ -195,7 +195,7 @@ AC_DEFUN([OVS_CHECK_DPDK], [
>      found=false
>      save_LIBS=$LIBS
>      for extras in "" "-ldl"; do
> -        LIBS="$DPDK_LIB $extras $save_LIBS $DPDK_EXTRA_LIB"
> +        LIBS="$DPDK_LIB $extras $save_LIBS $DPDK_EXTRA_LIB -lnuma"
>          AC_LINK_IFELSE(
>             [AC_LANG_PROGRAM([#include <rte_config.h>
>                               #include <rte_eal.h>],

I didn't otherwise review the patch.

Thanks,

Ben.
diff mbox

Patch

diff --git a/.travis.yml b/.travis.yml
index 2b262e4..841f534 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,8 +8,11 @@  addons:
     packages:
       - bc
       - gcc-multilib
+      - libnuma1
+      - libnuma-dev
       - libssl-dev
       - llvm-dev
+      - numactl
 
 before_install: ./.travis/prepare.sh
 
diff --git a/INSTALL.DPDK.md b/INSTALL.DPDK.md
index 9ec8bf6..d9513e0 100644
--- a/INSTALL.DPDK.md
+++ b/INSTALL.DPDK.md
@@ -33,6 +33,11 @@  on Debian/Ubuntu)
 
      `CONFIG_RTE_BUILD_COMBINE_LIBS=y`
 
+     Optional: Enable NUMA-aware vHost by modifying the following in the same
+     file:
+
+     `CONFIG_RTE_LIBRTE_VHOST_NUMA=y`
+
      Then run `make install` to build and install the library.
      For default install without IVSHMEM:
 
@@ -403,7 +408,10 @@  Performance Tuning:
 
 	It is good practice to ensure that threads that are in the datapath are
 	pinned to cores in the same NUMA area. e.g. pmd threads and QEMU vCPUs
-	responsible for forwarding.
+	responsible for forwarding. If DPDK is built with
+	CONFIG_RTE_LIBRTE_VHOST_NUMA=y, vHost ports automatically detect the
+	NUMA node on which QEMU allocated the device memory and will be serviced
+	by a PMD from that node, provided one of its cores is in the pmd-cpu-mask.
 
   9. Rx Mergeable buffers
 
diff --git a/acinclude.m4 b/acinclude.m4
index 74f0494..e06ace9 100644
--- a/acinclude.m4
+++ b/acinclude.m4
@@ -195,7 +195,7 @@  AC_DEFUN([OVS_CHECK_DPDK], [
     found=false
     save_LIBS=$LIBS
     for extras in "" "-ldl"; do
-        LIBS="$DPDK_LIB $extras $save_LIBS $DPDK_EXTRA_LIB"
+        LIBS="$DPDK_LIB $extras $save_LIBS $DPDK_EXTRA_LIB -lnuma"
         AC_LINK_IFELSE(
            [AC_LANG_PROGRAM([#include <rte_config.h>
                              #include <rte_eal.h>],
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index 7d3e275..cea6f4b 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -29,6 +29,7 @@ 
 #include <stdio.h>
 #include <sys/types.h>
 #include <sys/stat.h>
+#include <numaif.h>
 
 #include "dirs.h"
 #include "dp-packet.h"
@@ -344,6 +345,9 @@  struct netdev_dpdk {
      * netdev_dpdk*_reconfigure() is called */
     int requested_n_txq;
     int requested_n_rxq;
+
+    /* Socket ID detected when vHost device is brought up */
+    int requested_socket_id;
 };
 
 struct netdev_rxq_dpdk {
@@ -706,6 +710,8 @@  netdev_dpdk_init(struct netdev *netdev_, unsigned int port_no,
     }
 
     netdev->socket_id = sid < 0 ? SOCKET0 : sid;
+    netdev->requested_socket_id = netdev->socket_id;
+
     netdev->port_id = port_no;
     netdev->type = type;
     netdev->flags = 0;
@@ -2006,6 +2012,8 @@  new_device(struct virtio_net *dev)
 {
     struct netdev_dpdk *netdev;
     bool exists = false;
+    int newnode = 0;
+    long err = 0;
 
     ovs_mutex_lock(&dpdk_mutex);
     /* Add device to the vhost port with the same name as that passed down. */
@@ -2019,6 +2027,18 @@  new_device(struct virtio_net *dev)
             }
             ovsrcu_set(&netdev->virtio_dev, dev);
             exists = true;
+
+            /* Get NUMA information */
+            err = get_mempolicy(&newnode, NULL, 0, dev, MPOL_F_NODE | MPOL_F_ADDR);
+            if (err) {
+                VLOG_INFO("Error getting NUMA info for vHost Device '%s'",
+                        dev->ifname);
+                newnode = netdev->socket_id;
+            } else if (newnode != netdev->socket_id) {
+                netdev->requested_socket_id = newnode;
+                netdev_request_reconfigure(&netdev->up);
+            }
+
             dev->flags |= VIRTIO_DEV_RUNNING;
             /* Disable notifications. */
             set_irq_status(dev);
@@ -2035,8 +2055,8 @@  new_device(struct virtio_net *dev)
         return -1;
     }
 
-    VLOG_INFO("vHost Device '%s' %"PRIu64" has been added", dev->ifname,
-              dev->device_fh);
+    VLOG_INFO("vHost Device '%s' %"PRIu64" has been added on socket %i",
+              dev->ifname, dev->device_fh, newnode);
     return 0;
 }
 
@@ -2615,6 +2635,7 @@  static int
 netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev_)
 {
     struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_);
+    int err = 0;
 
     ovs_mutex_lock(&dpdk_mutex);
     ovs_mutex_lock(&netdev->mutex);
@@ -2622,10 +2643,20 @@  netdev_dpdk_vhost_user_reconfigure(struct netdev *netdev_)
     netdev->up.n_txq = netdev->requested_n_txq;
     netdev->up.n_rxq = netdev->requested_n_rxq;
 
+    if (netdev->requested_socket_id != netdev->socket_id) {
+        netdev->socket_id = netdev->requested_socket_id;
+        /* Change mempool to new NUMA Node */
+        dpdk_mp_put(netdev->dpdk_mp);
+        netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, netdev->mtu);
+        if (!netdev->dpdk_mp) {
+            err = ENOMEM;
+        }
+    }
+
     ovs_mutex_unlock(&netdev->mutex);
     ovs_mutex_unlock(&dpdk_mutex);
 
-    return 0;
+    return err;
 }
 
 static int
diff --git a/rhel/openvswitch-fedora.spec.in b/rhel/openvswitch-fedora.spec.in
index 065d384..8145706 100644
--- a/rhel/openvswitch-fedora.spec.in
+++ b/rhel/openvswitch-fedora.spec.in
@@ -54,6 +54,7 @@  BuildRequires: libcap-ng libcap-ng-devel
 %endif
 %if %{with dpdk}
 BuildRequires: dpdk-devel >= 2.2.0
+BuildRequires: numactl numactl-devel numactl-libs
 Provides: %{name}-dpdk = %{version}-%{release}
 %endif