
[5/5] memory: able to pin guest node memory to host node manually

Message ID 1369298842-6295-5-git-send-email-gaowanlong@cn.fujitsu.com
State New

Commit Message

Wanlong Gao May 23, 2013, 8:47 a.m. UTC
Use mbind to pin guest NUMA node memory to host nodes manually.

If we are not able to pin guest memory to host nodes, we may suffer
cross-node memory access performance regressions.

With this patch, we can specify host-node pinning manually, like this:
-m 1024 -numa node,cpus=0,nodeid=0,mem=512,pin=0 -numa node,nodeid=1,cpus=1,mem=512,pin=1

Also, when PCI passthrough is used, the directly attached device does DMA
transfers between the device and the qemu process, so all pages of the guest
are pinned by get_user_pages():

KVM_ASSIGN_PCI_DEVICE ioctl
  kvm_vm_ioctl_assign_device()
    => kvm_assign_device()
      => kvm_iommu_map_memslots()
        => kvm_iommu_map_pages()
           => kvm_pin_pages()

So, with a directly attached device, every guest page's reference count is
raised by one and page migration will not work. AutoNUMA won't work either,
and pinning directions given by libvirt are *ignored*.

Therefore, we need to pin memory to host nodes manually to avoid
such cross-node memory access performance regressions.
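
For reference, qemu_mbind() used in the patch below is presumably a thin
wrapper (introduced earlier in this series) around the mbind(2) syscall. A
minimal sketch of the underlying operation, illustrative only and not part
of this patch, binding an anonymous 512MB mapping to host node 0:

    /* build with: gcc -o pin pin.c -lnuma */
    #include <numaif.h>                     /* mbind(), MPOL_BIND */
    #include <sys/mman.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
        size_t len = 512UL << 20;           /* 512MB, as in the example above */
        unsigned long nodemask = 1UL << 0;  /* bit 0 = host node 0 */
        void *ram = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (ram == MAP_FAILED) {
            perror("mmap");
            exit(1);
        }
        /* every page of [ram, ram+len) must now come from node 0 */
        if (mbind(ram, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0)) {
            perror("mbind");
            exit(1);
        }
        return 0;
    }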

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
---
 exec.c                  | 21 +++++++++++++++++++++
 include/sysemu/sysemu.h |  1 +
 vl.c                    | 13 +++++++++++++
 3 files changed, 35 insertions(+)

Comments

Wanlong Gao May 24, 2013, 7:10 a.m. UTC | #1
On 05/23/2013 04:47 PM, Wanlong Gao wrote:
> [...]

Any comments?

Thanks,
Wanlong Gao

Wanlong Gao May 27, 2013, 2:57 a.m. UTC | #2
Ping............



> [...]
Wanlong Gao May 28, 2013, 2:27 a.m. UTC | #3
Any comments?


> [...]
Wanlong Gao May 30, 2013, 9:57 a.m. UTC | #4
Any comments?


> [...]
Eduardo Habkost May 30, 2013, 6:22 p.m. UTC | #5
On Thu, May 30, 2013 at 05:57:21PM +0800, Wanlong Gao wrote:
> > Use mbind to pin guest NUMA node memory to host nodes manually.
> > [...]

I believe a similar approach (letting QEMU do the pinning itself) was
already proposed and rejected. See:
http://article.gmane.org/gmane.comp.emulators.kvm.devel/58835
http://article.gmane.org/gmane.comp.emulators.kvm.devel/57684

An alternative approach was proposed at:
http://article.gmane.org/gmane.comp.emulators.qemu/123001
(exporting virtual address information directly)

and another one at:
http://article.gmane.org/gmane.comp.emulators.qemu/157741
(keeping the files inside -mem-path-dir so they could be pinned manually
by other programs; see the sketch below)

The approach I was planning to implement was the one proposed at:
http://article.gmane.org/gmane.comp.emulators.kvm.devel/93476
(exporting memory backing information through QMP, instead of depending
on predictable filenames on -mem-path-dir)
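
The -mem-path-dir trick works because shmem and hugetlbfs mappings keep a
shared NUMA policy on the backing object, so a second process can map the
same file and mbind it on the guest's behalf. A rough sketch of such an
external pinning tool; the backing file name is hypothetical, and it assumes
guest RAM is backed by a 512MB hugetlbfs file:

    /* build with: gcc -o extpin extpin.c -lnuma */
    #include <numaif.h>
    #include <sys/mman.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(void)
    {
        /* hypothetical file created by qemu under -mem-path-dir */
        const char *path = "/hugepages/qemu_back_mem.pc.ram";
        size_t len = 512UL << 20;
        unsigned long nodemask = 1UL << 1;  /* bit 1 = host node 1 */
        int fd = open(path, O_RDWR);
        if (fd < 0) {
            perror("open");
            exit(1);
        }
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED) {
            perror("mmap");
            exit(1);
        }
        /* for hugetlbfs/shmem the policy lands on the shared object, so it
         * also governs allocations done through qemu's own mapping */
        if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0)) {
            perror("mbind");
            exit(1);
        }
        munmap(p, len);
        close(fd);
        return 0;
    }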


Wanlong Gao May 31, 2013, 8:45 a.m. UTC | #6
On 05/31/2013 02:22 AM, Eduardo Habkost wrote:
> On Thu, May 30, 2013 at 05:57:21PM +0800, Wanlong Gao wrote:
>>> [...]
> 
> I believe a similar approach (letting QEMU do the pinning itself) was
> already proposed and rejected. See:
> http://article.gmane.org/gmane.comp.emulators.kvm.devel/58835
> http://article.gmane.org/gmane.comp.emulators.kvm.devel/57684
> 
> An alternative approach was proposed at:
> http://article.gmane.org/gmane.comp.emulators.qemu/123001
> (exporting virtual address information directly)
> 
> and another one at:
> http://article.gmane.org/gmane.comp.emulators.qemu/157741
> (keeping the files inside -mem-path-dir so they could be pinned manually
> by other programs)
> 
> The approach I was planning to implement was the one proposed at:
> http://article.gmane.org/gmane.comp.emulators.kvm.devel/93476
> (exporting memory backing information through QMP, instead of depending
> on predictable filenames on -mem-path-dir)

Your proposal seems good, but as I said above, when PCI passthrough is used,
the directly attached device does DMA transfers between the device and the
qemu process, and all pages of the guest are pinned by get_user_pages(). Then
the "numactl" directions through hugetlbfs files will not work, and neither
will AutoNUMA. We have to set the NUMA binding manually before assigning
PCI-passthrough devices; no external tool can solve this problem, only
pinning memory manually inside QEMU can.
So, IMO, we should both support pinning memory to nodes manually inside QEMU
and provide interfaces that allow external tools to decide the memory binding
policy.
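
A hypothetical way to observe this, not part of the patch: ask the kernel to
migrate one guest page of a running qemu process with move_pages(2) and look
at the per-page status. With an assigned device holding the page's reference
count up, I'd expect the kernel to refuse the move (typically -EBUSY):

    /* build with: gcc -o trymove trymove.c -lnuma
     * usage: trymove <qemu-pid> <guest-ram-vaddr>
     * (take the vaddr e.g. from /proc/<pid>/numa_maps; MPOL_MF_MOVE_ALL
     * needs CAP_SYS_NICE) */
    #include <numaif.h>
    #include <stdio.h>
    #include <stdlib.h>

    int main(int argc, char **argv)
    {
        if (argc < 3) {
            fprintf(stderr, "usage: %s <pid> <vaddr>\n", argv[0]);
            exit(1);
        }
        int pid = atoi(argv[1]);
        void *pages[1] = { (void *)strtoul(argv[2], NULL, 0) };
        int nodes[1] = { 1 };               /* try to move it to node 1 */
        int status[1];

        if (move_pages(pid, 1, pages, nodes, status, MPOL_MF_MOVE_ALL) < 0) {
            perror("move_pages");
            exit(1);
        }
        /* status[0] is the new node on success, or a negative errno
         * (e.g. -EBUSY for a pinned page) on failure */
        printf("status = %d\n", status[0]);
        return 0;
    }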

Thanks,
Wanlong Gao


Patch

diff --git a/exec.c b/exec.c
index aec65c5..fe929ef 100644
--- a/exec.c
+++ b/exec.c
@@ -36,6 +36,8 @@
 #include "qemu/config-file.h"
 #include "exec/memory.h"
 #include "sysemu/dma.h"
+#include "sysemu/sysemu.h"
+#include "qemu/bitops.h"
 #include "exec/address-spaces.h"
 #if defined(CONFIG_USER_ONLY)
 #include <qemu.h>
@@ -1081,6 +1083,25 @@ ram_addr_t qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
             memory_try_enable_merging(new_block->host, size);
         }
     }
+
+    if (nb_numa_nodes > 0 && !strcmp(mr->name, "pc.ram")) {
+        int i;
+        uint64_t nodes_mem = 0;
+        unsigned long *maskp = g_malloc0(sizeof(*maskp));
+        for (i = 0; i < nb_numa_nodes; i++) {
+            *maskp = 0;
+            if (node_pin[i] != -1) {
+                set_bit(node_pin[i], maskp);
+                if (qemu_mbind(new_block->host + nodes_mem, node_mem[i],
+                               QEMU_MPOL_BIND, maskp, MAX_NODES, 0)) {
+                    perror("qemu_mbind");
+                    exit(1);
+                }
+            }
+            nodes_mem += node_mem[i];
+        }
+    }
+
     new_block->length = size;
 
     /* Keep the list sorted from biggest to smallest block.  */
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 2fb71af..ebf6580 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -131,6 +131,7 @@ extern QEMUClock *rtc_clock;
 #define MAX_CPUMASK_BITS 255
 extern int nb_numa_nodes;
 extern uint64_t node_mem[MAX_NODES];
+extern int node_pin[MAX_NODES];
 extern unsigned long *node_cpumask[MAX_NODES];
 
 #define MAX_OPTION_ROMS 16
diff --git a/vl.c b/vl.c
index 5555b1d..3768002 100644
--- a/vl.c
+++ b/vl.c
@@ -253,6 +253,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order =
 
 int nb_numa_nodes;
 uint64_t node_mem[MAX_NODES];
+int node_pin[MAX_NODES];
 unsigned long *node_cpumask[MAX_NODES];
 
 uint8_t qemu_uuid[16];
@@ -1390,6 +1391,17 @@ static void numa_add(const char *optarg)
             }
             node_mem[nodenr] = sval;
         }
+
+        if (get_param_value(option, 128, "pin", optarg) != 0) {
+            int unsigned long long pin_node;
+            if (parse_uint_full(option, &pin_node, 10) < 0) {
+                fprintf(stderr, "qemu: Invalid pinning nodeid: %s\n", optarg);
+                exit(1);
+            } else {
+                node_pin[nodenr] = pin_node;
+            }
+        }
+
         if (get_param_value(option, 128, "cpus", optarg) != 0) {
             numa_node_parse_cpus(nodenr, option);
         }
@@ -2921,6 +2933,7 @@ int main(int argc, char **argv, char **envp)
 
     for (i = 0; i < MAX_NODES; i++) {
         node_mem[i] = 0;
+        node_pin[i] = -1;
         node_cpumask[i] = bitmap_new(MAX_CPUMASK_BITS);
     }