Patchwork [V7,07/11] NUMA: set guest numa nodes memory policy

login
register
mail settings
Submitter Wanlong Gao
Date July 31, 2013, 6:42 a.m.
Message ID <1375252936-28496-8-git-send-email-gaowanlong@cn.fujitsu.com>
Download mbox | patch
Permalink /patch/263593/
State New
Headers show

Comments

Wanlong Gao - July 31, 2013, 6:42 a.m.
Set the guest numa nodes memory policies using the mbind(2)
system call node by node.
After this patch, we are able to set the guest nodes' memory policies
through the QEMU options; this aims to solve the guest cross-node
memory access performance issue.
And as you all know, if PCI passthrough is used, the
direct-attached device uses DMA transfers between the device and the QEMU
process. All pages of the guest will be pinned by get_user_pages().

KVM_ASSIGN_PCI_DEVICE ioctl
  kvm_vm_ioctl_assign_device()
    =>kvm_assign_device()
      => kvm_iommu_map_memslots()
        => kvm_iommu_map_pages()
           => kvm_pin_pages()

So, with a direct-attached device, the page count of every guest page is
incremented by one and page migration will not work. AutoNUMA won't work either.

So, we should set the guest nodes memory allocation policies before
the pages are really mapped.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
---
 numa.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 89 insertions(+)

Patch

diff --git a/numa.c b/numa.c
index 436b8e0..b2c0048 100644
--- a/numa.c
+++ b/numa.c
@@ -28,6 +28,16 @@ 
 #include "qapi-visit.h"
 #include "qapi/opts-visitor.h"
 #include "qapi/dealloc-visitor.h"
+#include "exec/memory.h"
+
+#ifdef CONFIG_NUMA
+#include <numa.h>
+#include <numaif.h>
+#ifndef MPOL_F_RELATIVE_NODES
+#define MPOL_F_RELATIVE_NODES (1 << 14)
+#define MPOL_F_STATIC_NODES   (1 << 15)
+#endif
+#endif
 
 QemuOptsList qemu_numa_opts = {
     .name = "numa",
@@ -209,6 +219,78 @@  void set_numa_nodes(void)
     }
 }
 
+#ifdef CONFIG_NUMA
+/* Translate guest node @nodeid's configured policy into an mbind(2) mode.
+ * Default/unknown policies yield plain MPOL_DEFAULT; every other policy is
+ * OR-ed with the relative- or static-nodes flag, per the node's setting. */
+static int node_parse_bind_mode(unsigned int nodeid)
+{
+    int mode;
+
+    switch (numa_info[nodeid].policy) {
+    case NUMA_NODE_POLICY_MEMBIND:
+        mode = MPOL_BIND;
+        break;
+    case NUMA_NODE_POLICY_INTERLEAVE:
+        mode = MPOL_INTERLEAVE;
+        break;
+    case NUMA_NODE_POLICY_PREFERRED:
+        mode = MPOL_PREFERRED;
+        break;
+    default:
+        return MPOL_DEFAULT;
+    }
+    if (numa_info[nodeid].relative) {
+        return mode | MPOL_F_RELATIVE_NODES;
+    }
+    return mode | MPOL_F_STATIC_NODES;
+}
+#endif
+
+/* Bind guest node @nodeid's slice of the "pc.ram" block per its configured
+ * host policy via mbind(2).  Returns 0 on success or when built without
+ * CONFIG_NUMA; -1 if "pc.ram" is missing or unmapped, or if mbind fails. */
+static int set_node_mem_policy(int nodeid)
+{
+#ifdef CONFIG_NUMA
+    void *ram_ptr;
+    RAMBlock *block;
+    ram_addr_t len, ram_offset = 0;
+    int bind_mode, i;
+
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+        if (!strcmp(block->mr->name, "pc.ram")) {
+            break;
+        }
+    }
+
+    /* QTAILQ_FOREACH leaves block NULL when no "pc.ram" block was found. */
+    if (block == NULL || block->host == NULL) {
+        return -1;
+    }
+
+    /* This node's offset is the sum of the sizes of all preceding nodes. */
+    ram_ptr = block->host;
+    for (i = 0; i < nodeid; i++) {
+        ram_offset += numa_info[i].node_mem;
+    }
+    len = numa_info[i].node_mem;
+    bind_mode = node_parse_bind_mode(i);
+
+    /* Workaround for a long standing bug in Linux' mbind implementation,
+     * which cuts off the last specified node.  To stay compatible should
+     * this bug be fixed, we specify one more node and zero this one out. */
+    clear_bit(numa_num_configured_nodes() + 1, numa_info[i].host_mem);
+    if (mbind(ram_ptr + ram_offset, len, bind_mode,
+              numa_info[i].host_mem, numa_num_configured_nodes() + 1, 0)) {
+        perror("mbind");
+        return -1;
+    }
+#endif
+
+    return 0;
+}
+
 void set_numa_modes(void)
 {
     CPUState *cpu;
@@ -221,4 +303,11 @@  void set_numa_modes(void)
             }
         }
     }
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        if (set_node_mem_policy(i) == -1) {
+            fprintf(stderr,
+                    "qemu: can not set host memory policy for node%d\n", i);
+        }
+    }
 }