
[2/2] Add monitor command mem-nodes

Message ID 1370404705-4620-2-git-send-email-gaowanlong@cn.fujitsu.com
State New

Commit Message

Wanlong Gao June 5, 2013, 3:58 a.m. UTC
Add the monitor command mem-nodes to show which memory nodes the
huge-page-mapped memory is located on.

(qemu) info mem-nodes
/proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
/proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
/proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
/proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1

Refer to the proposal of Eduardo and Daniel.
http://article.gmane.org/gmane.comp.emulators.kvm.devel/93476

Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
---
 monitor.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

Comments

Eric Blake June 5, 2013, 12:39 p.m. UTC | #1
On 06/04/2013 09:58 PM, Wanlong Gao wrote:
> Add monitor command mem-nodes to show the huge mapped
> memory nodes locations.

Missing a QMP counterpart.  Libvirt would probably like to use this
command, and providing it HMP-only is not friendly.
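
For concreteness, a QMP counterpart would mean declaring a query command in the
QAPI schema and implementing it as a qmp_*() function returning generated types.
The sketch below only illustrates that shape: the query-mem-nodes name and the
MemNodeInfo/MemNodeInfoList type with its fields are hypothetical, not existing
QEMU interfaces.

    /* Hypothetical QAPI declaration this sketch assumes:
     *   { 'type': 'MemNodeInfo',
     *     'data': { 'path': 'str', 'start': 'uint64', 'end': 'uint64',
     *               'host-node': 'int' } }
     *   { 'command': 'query-mem-nodes', 'returns': ['MemNodeInfo'] }
     */
    MemNodeInfoList *qmp_query_mem_nodes(Error **errp)
    {
        MemNodeInfoList *head = NULL, **tail = &head;
        RAMBlock *block;

        QTAILQ_FOREACH(block, &ram_list.blocks, next) {
            MemNodeInfoList *entry;

            if (!block->fd) {            /* same filter as the HMP version */
                continue;
            }
            entry = g_malloc0(sizeof(*entry));
            entry->value = g_malloc0(sizeof(*entry->value));
            entry->value->path = g_strdup_printf("/proc/%d/fd/%d",
                                                 getpid(), block->fd);
            entry->value->start = (uint64_t)(uintptr_t)block->host;
            entry->value->end = entry->value->start + block->length;
            /* host-node lookup via get_mempolicy() omitted for brevity */
            *tail = entry;
            tail = &entry->next;
        }
        return head;
    }
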
Anthony Liguori June 5, 2013, 12:57 p.m. UTC | #2
Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:

> Add monitor command mem-nodes to show the huge mapped
> memory nodes locations.
>
> (qemu) info mem-nodes
> /proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
> /proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
> /proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
> /proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1

This creates an ABI that we don't currently support.  Memory hotplug or
a variety of things can break this mapping and then we'd have to provide
an interface to describe that the mapping was broken.

Also, it only works with hugetlbfs, which is probably not widely used
given the existence of THP.

I had hoped that we would get proper userspace interfaces for describing
memory groups but that appears to have stalled out.

Does anyone know if this is still on the table?

If we can't get a proper kernel interface, then perhaps we need to add
full libnuma support but that would really be unfortunate...
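
For reference, "full libnuma support" would mean QEMU linking against libnuma and
applying placement policies itself. A minimal sketch of what such a binding helper
might start from, assuming CONFIG_NUMA as in the patch; qemu_bind_ramblock() is a
hypothetical name, not an existing QEMU function:

    #include <numa.h>    /* libnuma */

    /* Bind one RAMBlock's host address range to a single host NUMA node. */
    static int qemu_bind_ramblock(void *host_addr, size_t length, int host_node)
    {
        if (numa_available() < 0) {
            return -1;                 /* no libnuma/NUMA support on this host */
        }
        numa_tonode_memory(host_addr, length, host_node);
        return 0;
    }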

Regards,

Anthony Liguori

>
> Refer to the proposal of Eduardo and Daniel.
> http://article.gmane.org/gmane.comp.emulators.kvm.devel/93476
>
> Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
> ---
>  monitor.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 45 insertions(+)
>
> diff --git a/monitor.c b/monitor.c
> index eefc7f0..85c865f 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -74,6 +74,10 @@
>  #endif
>  #include "hw/lm32/lm32_pic.h"
>  
> +#if defined(CONFIG_NUMA)
> +#include <numaif.h>
> +#endif
> +
>  //#define DEBUG
>  //#define DEBUG_COMPLETION
>  
> @@ -1759,6 +1763,38 @@ static void mem_info(Monitor *mon, const QDict *qdict)
>  }
>  #endif
>  
> +#if defined(CONFIG_NUMA)
> +static void mem_nodes(Monitor *mon, const QDict *qdict)
> +{
> +    RAMBlock *block;
> +    int prevnode, node;
> +    unsigned long long c, start, area;
> +    int fd;
> +    int pid = getpid();
> +    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +        if (!(fd = block->fd))
> +            continue;
> +        prevnode = -1;
> +        start = 0;
> +        area = (unsigned long long)block->host;
> +        for (c = 0; c < block->length; c += TARGET_PAGE_SIZE) {
> +            if (get_mempolicy(&node, NULL, 0, c + block->host,
> +                              MPOL_F_ADDR | MPOL_F_NODE) < 0)
> +                continue;
> +            if (node == prevnode)
> +                continue;
> +            if (prevnode != -1)
> +                monitor_printf(mon, "/proc/%d/fd/%d: %016Lx-%016Lx: node%d\n",
> +                               pid, fd, start + area, c + area, prevnode);
> +            prevnode = node;
> +            start = c;
> +         }
> +         monitor_printf(mon, "/proc/%d/fd/%d: %016Lx-%016Lx: node%d\n",
> +                        pid, fd, start + area, c + area, prevnode);
> +    }
> +}
> +#endif
> +
>  #if defined(TARGET_SH4)
>  
>  static void print_tlb(Monitor *mon, int idx, tlb_t *tlb)
> @@ -2567,6 +2603,15 @@ static mon_cmd_t info_cmds[] = {
>          .mhandler.cmd = mem_info,
>      },
>  #endif
> +#if defined(CONFIG_NUMA)
> +    {
> +        .name       = "mem-nodes",
> +        .args_type  = "",
> +        .params     = "",
> +        .help       = "show the huge mapped memory nodes location",
> +        .mhandler.cmd = mem_nodes,
> +    },
> +#endif
>      {
>          .name       = "mtree",
>          .args_type  = "",
> -- 
> 1.8.3.rc2.10.g0c2b1cf
Eduardo Habkost June 5, 2013, 1:46 p.m. UTC | #3
On Wed, Jun 05, 2013 at 11:58:25AM +0800, Wanlong Gao wrote:
> Add monitor command mem-nodes to show the huge mapped
> memory nodes locations.
> 

This is for machine consumption, so we need a QMP command.

> (qemu) info mem-nodes
> /proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
> /proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
> /proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
> /proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1

Are node0/node1 _host_ nodes?

How do I know what's the _guest_ address/node corresponding to each
file/range above?

What I am really looking for is:

 * The correspondence between guest (virtual) NUMA nodes and guest
   physical address ranges (it could be provided by the QMP version of
   "info numa")
 * The correspondence between guest physical address ranges and ranges
   inside the mapped files (so external tools could set the policy on
   those files instead of requiring QEMU to set it directly)

I understand that your use case may require additional information and
additional interfaces. But if we provide the information above we will
allow external components to set the policy on the hugetlbfs files before
we add new interfaces required for your use case.

Also, what about making it conditional to OSes where we really know
"/proc/<pid>/fd/<fd>" is available?


Eduardo Habkost June 5, 2013, 3:54 p.m. UTC | #4
On Wed, Jun 05, 2013 at 07:57:42AM -0500, Anthony Liguori wrote:
> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
> 
> > Add monitor command mem-nodes to show the huge mapped
> > memory nodes locations.
> >
> > (qemu) info mem-nodes
> > /proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
> > /proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
> > /proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
> > /proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1
> 
> This creates an ABI that we don't currently support.  Memory hotplug or
> a variety of things can break this mapping and then we'd have to provide
> an interface to describe that the mapping was broken.

What do you mean by "breaking this mapping", exactly? Would the backing
file of existing guest RAM ever change? (It would require a memory copy
from one file to another, why would QEMU ever do that?)

> 
> Also, it only works with hugetlbfs, which is probably not widely used
> given the existence of THP.

Quoting yourself at
http://article.gmane.org/gmane.comp.emulators.kvm.devel/58227:

>> It's extremely likely that if you're doing NUMA pinning, you're also 
>> doing large pages via hugetlbfs.  numactl can already set policies for 
>> files in hugetlbfs so all you need to do is have a separate hugetlbfs 
>> file for each numa node.
>> 
>> Then you have all the flexibility of numactl and you can implement node 
>> migration external to QEMU if you so desire.

And if we simply report where the backing files are and which offsets are
being used for guest RAM, one could simply use
'numactl --file --offset --length', so we don't even need separate
files/mem-paths for each node.
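
At the syscall level, that amounts to mapping the reported backing-file range and
attaching a policy to it, roughly what numactl's file mode does. A standalone
sketch (link with -lnuma), with the path, offset, length and node number as
placeholders for whatever QEMU would report:

    #include <fcntl.h>
    #include <numaif.h>
    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/types.h>
    #include <unistd.h>

    int main(void)
    {
        const char *path = "/dev/hugepages/qemu-guest-ram";   /* placeholder */
        off_t offset = 0;                                     /* placeholder */
        size_t length = 1UL << 30;                            /* placeholder */
        unsigned long nodemask = 1UL << 1;                    /* host node 1 */

        int fd = open(path, O_RDWR);
        if (fd < 0) {
            perror("open");
            return 1;
        }
        void *addr = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_SHARED,
                          fd, offset);
        if (addr == MAP_FAILED) {
            perror("mmap");
            return 1;
        }
        /* Attach a bind policy to this shared file range; whether pages that
         * QEMU faults in later actually follow it is exactly what is being
         * debated in this thread. */
        if (mbind(addr, length, MPOL_BIND, &nodemask,
                  sizeof(nodemask) * 8, 0) < 0) {
            perror("mbind");
            return 1;
        }
        munmap(addr, length);
        close(fd);
        return 0;
    }
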

Does THP work with tmpfs, already? If it does, people who don't want
hugetlbfs and want numa tuning to work with THP could just use tmpfs for
-mem-path.

> 
> I had hoped that we would get proper userspace interfaces for describing
> memory groups but that appears to have stalled out.

I would love to have it. But while we don't have it, sharing the
tmpfs/hugetlbfs backing files seems to work just fine as a mechanism to
let other tools manipulate guest memory policy. We just need to let
external tools know where the backing files are.

> 
> Does anyone know if this is still on the table?
> 
> If we can't get a proper kernel interface, then perhaps we need to add
> full libnuma support but that would really be unfortunate...

Why isn't the "info mem-nodes" solution (I mean: not this version, but a
proper QMP version that exposes all the information we need) an option?


Wanlong Gao June 6, 2013, 9:30 a.m. UTC | #5
On 06/05/2013 11:54 PM, Eduardo Habkost wrote:
> On Wed, Jun 05, 2013 at 07:57:42AM -0500, Anthony Liguori wrote:
>> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
>>
>>> Add monitor command mem-nodes to show the huge mapped
>>> memory nodes locations.
>>>
>>> (qemu) info mem-nodes
>>> /proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
>>> /proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
>>> /proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
>>> /proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1
>>
>> This creates an ABI that we don't currently support.  Memory hotplug or
>> a variety of things can break this mapping and then we'd have to provide
>> an interface to describe that the mapping was broken.
> 
> What do you mean by "breaking this mapping", exactly? Would the backing
> file of existing guest RAM ever change? (It would require a memory copy
> from one file to another, why would QEMU ever do that?)
> 
>>
>> Also, it only works with hugetlbfs, which is probably not widely used
>> given the existence of THP.
> 
> Quoting yourself at
> http://article.gmane.org/gmane.comp.emulators.kvm.devel/58227:
> 
>>> It's extremely likely that if you're doing NUMA pinning, you're also 
>>> doing large pages via hugetlbfs.  numactl can already set policies for 
>>> files in hugetlbfs so all you need to do is have a separate hugetlbfs 
>>> file for each numa node.
>>>
>>> Then you have all the flexibility of numactl and you can implement node 
>>> migration external to QEMU if you so desire.
> 
> And if we simply report where are the backing files and offsets being
> used for guest RAM, one could simply use
> 'numactl --file --offset --length', so we don't even need separate
> files/mem-paths for each node.

Does "numactl" work after QEMU process mapped the hugetlbfs file?
I'm afraid the mempolicy set after it will take no effect.

And, if PCI passthrough is used, the directly attached device uses DMA transfers
between the device and the QEMU process. All pages of the guest will be pinned by get_user_pages().

KVM_ASSIGN_PCI_DEVICE ioctl
  kvm_vm_ioctl_assign_device()
    =>kvm_assign_device()
      => kvm_iommu_map_memslots()
        => kvm_iommu_map_pages()
           => kvm_pin_pages()

So, with a directly attached device, every guest page's reference count is raised by one and
page migration will not work. AutoNUMA won't work either, so any NUMA placement directives
are also ignored.


Mel? Andrea?

> 
> Does THP work with tmpfs, already? If it does, people who don't want
> hugetlbfs and want numa tuning to work with THP could just use tmpfs for
> -mem-path.
> 
>>
>> I had hoped that we would get proper userspace interfaces for describing
>> memory groups but that appears to have stalled out.
> 
> I would love to have it. But while we don't have it, sharing the
> tmpfs/hugetlbfs backing files seem to work just fine as a mechanism to
> let other tools manipulate guest memory policy. We just need to let
> external tools know where the backing files are.
> 
>>
>> Does anyone know if this is still on the table?
>>
>> If we can't get a proper kernel interface, then perhaps we need to add
>> full libnuma support but that would really be unfortunate...
> 
> Why isn't the "info mem-nodes" solution (I mean: not this version, but a
> proper QMP version that exposes all the information we need) an option?
> 

And the shortcoming of hugetlbfs is that we can't know how many virtual
machines there will be, so we can't determine how many huge pages to reserve
for them.

In order to set NUMA mempolicies on THP or normal memory, as Anthony said,
I think we should add full libnuma support to QEMU, and allow mempolicies to be
set manually on the QEMU command line and through the QEMU monitor after
QEMU has started. External tools can't do exactly the right thing.

So IMO the right direction is the previously objected-to proposal:
 - Message-ID: <1281534738-8310-1-git-send-email-andre.przywara@amd.com>
   http://article.gmane.org/gmane.comp.emulators.kvm.devel/57684
 - Message-ID: <4C7D7C2A.7000205@codemonkey.ws>
   http://article.gmane.org/gmane.comp.emulators.kvm.devel/58835

If you agree, I'll refactor this patch set for review.

Thanks,
Wanlong Gao

Eduardo Habkost June 6, 2013, 4:15 p.m. UTC | #6
On Thu, Jun 06, 2013 at 05:30:16PM +0800, Wanlong Gao wrote:
> On 06/05/2013 11:54 PM, Eduardo Habkost wrote:
> > On Wed, Jun 05, 2013 at 07:57:42AM -0500, Anthony Liguori wrote:
> >> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
> >>
> >>> Add monitor command mem-nodes to show the huge mapped
> >>> memory nodes locations.
> >>>
> >>> (qemu) info mem-nodes
> >>> /proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
> >>> /proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
> >>> /proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
> >>> /proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1
> >>
> >> This creates an ABI that we don't currently support.  Memory hotplug or
> >> a variety of things can break this mapping and then we'd have to provide
> >> an interface to describe that the mapping was broken.
> > 
> > What do you mean by "breaking this mapping", exactly? Would the backing
> > file of existing guest RAM ever change? (It would require a memory copy
> > from one file to another, why would QEMU ever do that?)
> > 
> >>
> >> Also, it only works with hugetlbfs, which is probably not widely used
> >> given the existence of THP.
> > 
> > Quoting yourself at
> > http://article.gmane.org/gmane.comp.emulators.kvm.devel/58227:
> > 
> >>> It's extremely likely that if you're doing NUMA pinning, you're also 
> >>> doing large pages via hugetlbfs.  numactl can already set policies for 
> >>> files in hugetlbfs so all you need to do is have a separate hugetlbfs 
> >>> file for each numa node.
> >>>
> >>> Then you have all the flexibility of numactl and you can implement node 
> >>> migration external to QEMU if you so desire.
> > 
> > And if we simply report where are the backing files and offsets being
> > used for guest RAM, one could simply use
> > 'numactl --file --offset --length', so we don't even need separate
> > files/mem-paths for each node.
> 
> Does "numactl" work after QEMU process mapped the hugetlbfs file?
> I'm afraid the mempolicy set after it will take no effect.

I was always expecting it to work. I will make some tests to find out.

> 
> And, if PCI-passthrough is used, direct-attached-device uses DMA transfer
> between device and qemu process. All pages of the guest will be pinned by get_user_pages().
> 
> KVM_ASSIGN_PCI_DEVICE ioctl
>   kvm_vm_ioctl_assign_device()
>     =>kvm_assign_device()
>       => kvm_iommu_map_memslots()
>         => kvm_iommu_map_pages()
>            => kvm_pin_pages()
> 
> So, with direct-attached-device, all guest page's page count will be +1 and
> any page migration will not work. AutoNUMA won't too. Then any numa directions
> are also ignored.

We could have a new -mem-path-like option where we could tell QEMU
exactly which hugetlbfs/tmpfs files should be used for each memory
region. This way an external tool could create the files, set the policy
on them, and then tell QEMU to use them.
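
The external-tool side of that idea could be as simple as creating one backing
file per guest node and attaching a policy to it before QEMU maps it, using the
same mbind() mechanism as the numactl-style sketch earlier. A sketch with
hypothetical paths and sizes; the per-node mem-path option it assumes does not
exist yet:

    #include <fcntl.h>
    #include <numaif.h>
    #include <sys/mman.h>
    #include <unistd.h>

    /* Create a backing file of the given size and bind it to one host node. */
    static int prepare_node_file(const char *path, size_t size, int host_node)
    {
        unsigned long nodemask = 1UL << host_node;
        int fd = open(path, O_CREAT | O_RDWR, 0600);
        void *addr;
        int ret = -1;

        if (fd < 0) {
            return -1;
        }
        if (ftruncate(fd, size) == 0) {
            addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
            if (addr != MAP_FAILED) {
                ret = mbind(addr, size, MPOL_BIND, &nodemask,
                            sizeof(nodemask) * 8, 0);
                munmap(addr, size);
            }
        }
        close(fd);
        return ret;
    }

    int main(void)
    {
        /* e.g. guest node 0 -> host node 0, guest node 1 -> host node 1 */
        prepare_node_file("/dev/hugepages/vm1-node0", (size_t)2 << 30, 0);
        prepare_node_file("/dev/hugepages/vm1-node1", (size_t)2 << 30, 1);
        return 0;
    }
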


> 
> Mel? Andrea?
> 
> > 
> > Does THP work with tmpfs, already? If it does, people who don't want
> > hugetlbfs and want numa tuning to work with THP could just use tmpfs for
> > -mem-path.
> > 
> >>
> >> I had hoped that we would get proper userspace interfaces for describing
> >> memory groups but that appears to have stalled out.
> > 
> > I would love to have it. But while we don't have it, sharing the
> > tmpfs/hugetlbfs backing files seem to work just fine as a mechanism to
> > let other tools manipulate guest memory policy. We just need to let
> > external tools know where the backing files are.
> > 
> >>
> >> Does anyone know if this is still on the table?
> >>
> >> If we can't get a proper kernel interface, then perhaps we need to add
> >> full libnuma support but that would really be unfortunate...
> > 
> > Why isn't the "info mem-nodes" solution (I mean: not this version, but a
> > proper QMP version that exposes all the information we need) an option?
> > 
> 
> And the shortage of hugetlbfs is that we can't know how many virtual
> machines, so that we can't determine how many huge pages to be reserved
> for virtual machines.

Then we just need to have THP working on tmpfs, and people who don't
want hugetlbfs could use tmpfs for -mem-path.


> 
> In order to set numa mempolicies on THP or normal memories, as Anthony said,
> I think we should add full libnuma support into QEMU, and allow manually
> set mempolicies in the QEMU command line and through QEMU monitor after
> QEMU started. The external tools can't do exactly the right thing.
> 
> So IMO the right direction is the objected proposal
>  - Message-ID: <1281534738-8310-1-git-send-email-andre.przywara@amd.com>
>    http://article.gmane.org/gmane.comp.emulators.kvm.devel/57684
>  - Message-ID: <4C7D7C2A.7000205@codemonkey.ws>
>    http://article.gmane.org/gmane.comp.emulators.kvm.devel/58835
> 
> If you agree, I'll refactor this patch set for review.

I believe there are many different ways to avoid having QEMU
reimplement what numactl and other tools can already do, and we
should consider those solutions before going that way.


But my main question about this specific patch is: if you are not
exposing this information so other tools could set policy themselves and
QEMU can't set the policy itself yet, what's exactly the purpose of this
patch?

Wanlong Gao June 11, 2013, 7:22 a.m. UTC | #7
On 06/05/2013 09:46 PM, Eduardo Habkost wrote:
> On Wed, Jun 05, 2013 at 11:58:25AM +0800, Wanlong Gao wrote:
>> Add monitor command mem-nodes to show the huge mapped
>> memory nodes locations.
>>
> 
> This is for machine consumption, so we need a QMP command.
> 
>> (qemu) info mem-nodes
>> /proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
>> /proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
>> /proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
>> /proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1
> 
> Are node0/node1 _host_ nodes?
> 
> How do I know what's the _guest_ address/node corresponding to each
> file/range above?
> 
> What I am really looking for is:
> 
>  * The correspondence between guest (virtual) NUMA nodes and guest
>    physical address ranges (it could be provided by the QMP version of
>    "info numa")

AFAIK, the guest NUMA nodes and guest physical address ranges are set
by SeaBIOS; we can't get this information from QEMU, and I think this
information is useless for pinning memory ranges to the host.

>  * The correspondence between guest physical address ranges and ranges
>    inside the mapped files (so external tools could set the policy on
>    those files instead of requiring QEMU to set it directly)
> 
> I understand that your use case may require additional information and
> additional interfaces. But if we provide the information above we will
> allow external components set the policy on the hugetlbfs files before
> we add new interfaces required for your use case.

But file-backed memory is not good for a host which has many
virtual machines, and in this situation we can't handle anon THP yet.

And as I mentioned, the cross-NUMA-node access performance regression
is caused by PCI passthrough; it's a very long-standing bug, and we should
back-port the host memory pinning patch to old QEMU to resolve this performance
problem, too.

Thanks,
Wanlong Gao

Eduardo Habkost June 11, 2013, 1:40 p.m. UTC | #8
On Tue, Jun 11, 2013 at 03:22:13PM +0800, Wanlong Gao wrote:
> On 06/05/2013 09:46 PM, Eduardo Habkost wrote:
> > On Wed, Jun 05, 2013 at 11:58:25AM +0800, Wanlong Gao wrote:
> >> Add monitor command mem-nodes to show the huge mapped
> >> memory nodes locations.
> >>
> > 
> > This is for machine consumption, so we need a QMP command.
> > 
> >> (qemu) info mem-nodes
> >> /proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
> >> /proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
> >> /proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
> >> /proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1
> > 
> > Are node0/node1 _host_ nodes?
> > 
> > How do I know what's the _guest_ address/node corresponding to each
> > file/range above?
> > 
> > What I am really looking for is:
> > 
> >  * The correspondence between guest (virtual) NUMA nodes and guest
> >    physical address ranges (it could be provided by the QMP version of
> >    "info numa")
> 
> AFAIK, the guest NUMA nodes and guest physical address ranges are set
> by seabios, we can't get this information from QEMU,

QEMU _has_ to know about it, otherwise we would never be able to know
which virtual addresses inside the QEMU process (or offsets inside the
backing files) belong to which virtual NUMA node.

(After all, the NUMA wiring is a hardware feature, not something that
the BIOS can decide)


> and I think this
> information is useless for pinning memory range to host.

Well, we have to somehow identify each region of guest memory when
deciding how to pin it. How would you identify it without using guest
physical addresses? Guest physical addresses are more meaningful than
the QEMU virtual addresses your patch exposes (that are meaningless
outside QEMU).



> >  * The correspondence between guest physical address ranges and ranges
> >    inside the mapped files (so external tools could set the policy on
> >    those files instead of requiring QEMU to set it directly)
> > 
> > I understand that your use case may require additional information and
> > additional interfaces. But if we provide the information above we will
> > allow external components set the policy on the hugetlbfs files before
> > we add new interfaces required for your use case.
> 
> But the file backed memory is not good for the host which has many
> virtual machines, in this situation, we can't handle anon THP yet.

I don't understand what you mean, here. What prevents someone from using
file-backed memory with multiple virtual machines?

> 
> And as I mentioned, the cross numa node access performance regression
> is caused by pci-passthrough, it's a very long time bug, we should
> back port the host memory pinning patch to old QEMU to resolve this performance
> problem, too.

If it's a regression, what's the last version of QEMU where the bug
wasn't present?


> 
> Thanks,
> Wanlong Gao
> 
> > 
> > Also, what about making it conditional to OSes where we really know
> > "/proc/<pid>/fd/<fd>" is available?
> > 
> > 
> >>
> >> Refer to the proposal of Eduardo and Daniel.
> >> http://article.gmane.org/gmane.comp.emulators.kvm.devel/93476
> > 
> >>
> >> Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
> >> ---
> >>  monitor.c | 45 +++++++++++++++++++++++++++++++++++++++++++++
> >>  1 file changed, 45 insertions(+)
> >>
> >> diff --git a/monitor.c b/monitor.c
> >> index eefc7f0..85c865f 100644
> >> --- a/monitor.c
> >> +++ b/monitor.c
> >> @@ -74,6 +74,10 @@
> >>  #endif
> >>  #include "hw/lm32/lm32_pic.h"
> >>  
> >> +#if defined(CONFIG_NUMA)
> >> +#include <numaif.h>
> >> +#endif
> >> +
> >>  //#define DEBUG
> >>  //#define DEBUG_COMPLETION
> >>  
> >> @@ -1759,6 +1763,38 @@ static void mem_info(Monitor *mon, const QDict *qdict)
> >>  }
> >>  #endif
> >>  
> >> +#if defined(CONFIG_NUMA)
> >> +static void mem_nodes(Monitor *mon, const QDict *qdict)
> >> +{
> >> +    RAMBlock *block;
> >> +    int prevnode, node;
> >> +    unsigned long long c, start, area;
> >> +    int fd;
> >> +    int pid = getpid();
> >> +    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> >> +        if (!(fd = block->fd))
> >> +            continue;
> >> +        prevnode = -1;
> >> +        start = 0;
> >> +        area = (unsigned long long)block->host;
> >> +        for (c = 0; c < block->length; c += TARGET_PAGE_SIZE) {
> >> +            if (get_mempolicy(&node, NULL, 0, c + block->host,
> >> +                              MPOL_F_ADDR | MPOL_F_NODE) < 0)
> >> +                continue;
> >> +            if (node == prevnode)
> >> +                continue;
> >> +            if (prevnode != -1)
> >> +                monitor_printf(mon, "/proc/%d/fd/%d: %016Lx-%016Lx: node%d\n",
> >> +                               pid, fd, start + area, c + area, prevnode);
> >> +            prevnode = node;
> >> +            start = c;
> >> +         }
> >> +         monitor_printf(mon, "/proc/%d/fd/%d: %016Lx-%016Lx: node%d\n",
> >> +                        pid, fd, start + area, c + area, prevnode);
> >> +    }
> >> +}
> >> +#endif
> >> +
> >>  #if defined(TARGET_SH4)
> >>  
> >>  static void print_tlb(Monitor *mon, int idx, tlb_t *tlb)
> >> @@ -2567,6 +2603,15 @@ static mon_cmd_t info_cmds[] = {
> >>          .mhandler.cmd = mem_info,
> >>      },
> >>  #endif
> >> +#if defined(CONFIG_NUMA)
> >> +    {
> >> +        .name       = "mem-nodes",
> >> +        .args_type  = "",
> >> +        .params     = "",
> >> +        .help       = "show the huge mapped memory nodes location",
> >> +        .mhandler.cmd = mem_nodes,
> >> +    },
> >> +#endif
> >>      {
> >>          .name       = "mtree",
> >>          .args_type  = "",
> >> -- 
> >> 1.8.3.rc2.10.g0c2b1cf
> >>
> > 
>
Wanlong Gao June 13, 2013, 1:40 a.m. UTC | #9
On 06/11/2013 09:40 PM, Eduardo Habkost wrote:
> On Tue, Jun 11, 2013 at 03:22:13PM +0800, Wanlong Gao wrote:
>> On 06/05/2013 09:46 PM, Eduardo Habkost wrote:
>>> On Wed, Jun 05, 2013 at 11:58:25AM +0800, Wanlong Gao wrote:
>>>> Add monitor command mem-nodes to show the huge mapped
>>>> memory nodes locations.
>>>>
>>>
>>> This is for machine consumption, so we need a QMP command.
>>>
>>>> (qemu) info mem-nodes
>>>> /proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
>>>> /proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
>>>> /proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
>>>> /proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1
>>>
>>> Are node0/node1 _host_ nodes?
>>>
>>> How do I know what's the _guest_ address/node corresponding to each
>>> file/range above?
>>>
>>> What I am really looking for is:
>>>
>>>  * The correspondence between guest (virtual) NUMA nodes and guest
>>>    physical address ranges (it could be provided by the QMP version of
>>>    "info numa")
>>
>> AFAIK, the guest NUMA nodes and guest physical address ranges are set
>> by seabios, we can't get this information from QEMU,
> 
> QEMU _has_ to know about it, otherwise we would never be able to know
> which virtual addresses inside the QEMU process (or offsets inside the
> backing files) belong to which virtual NUMA node.

Nope, if I'm right, it's actually linear except that there are holes in
the physical address space. So we can know which node a guest virtual
address belongs to just from each NUMA node's size. It's enough for us if we
can provide a QMP interface from QEMU to let external tools like libvirt
set the host memory binding policies according to the QMP interface, and
we can also provide QEMU command line options to be able to set host
bindings before we start the QEMU process.
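
A tiny sketch of that "sizes as offsets" reasoning, using illustrative node
sizes (real memory maps have holes, as noted above, which this ignores):

    #include <inttypes.h>
    #include <stdio.h>

    int main(void)
    {
        /* per-node RAM sizes as given on the -numa command line (example values) */
        uint64_t node_mem[] = { 2ULL << 30, 2ULL << 30 };
        uint64_t offset = 0;

        for (int i = 0; i < 2; i++) {
            printf("node%d: backing-file offset 0x%010" PRIx64 "-0x%010" PRIx64 "\n",
                   i, offset, offset + node_mem[i]);
            offset += node_mem[i];
        }
        return 0;
    }
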

> 
> (After all, the NUMA wiring is a hardware feature, not something that
> the BIOS can decide)

But this is the ACPI table, which is written by SeaBIOS now. AFAIK, there is no
consensus about whether to move this part into QEMU, removing the QEMU interfaces
for SeaBIOS, or to just leave it where it is.


> 
> 
>> and I think this
>> information is useless for pinning memory range to host.
> 
> Well, we have to somehow identify each region of guest memory when
> deciding how to pin it. How would you identify it without using guest
> physical addresses? Guest physical addresses are more meaningful than
> the QEMU virtual addresses your patch exposes (that are meaningless
> outside QEMU).

As I mentioned above, we can know this just from the guest node memory sizes,
and can set the host bindings by treating these sizes as offsets.
And I think we only need to set the host memory binding policies for each
guest NUMA node. It's unnecessary to set policies on each region as you
said.

> 
> 
> 
>>>  * The correspondence between guest physical address ranges and ranges
>>>    inside the mapped files (so external tools could set the policy on
>>>    those files instead of requiring QEMU to set it directly)
>>>
>>> I understand that your use case may require additional information and
>>> additional interfaces. But if we provide the information above we will
>>> allow external components set the policy on the hugetlbfs files before
>>> we add new interfaces required for your use case.
>>
>> But the file backed memory is not good for the host which has many
>> virtual machines, in this situation, we can't handle anon THP yet.
> 
> I don't understand what you mean, here. What prevents someone from using
> file-backed memory with multiple virtual machines?

If we use hugetlbfs-backed memory, we have to know how many virtual machines
there will be and how much memory each VM will use, and then reserve those pages
for them. We even have to reserve more pages for external tools (numactl) to set
memory policies on. The memory reservation itself also has its own memory policies.
It's very hard to make it behave exactly the way we want.


> 
>>
>> And as I mentioned, the cross numa node access performance regression
>> is caused by pci-passthrough, it's a very long time bug, we should
>> back port the host memory pinning patch to old QEMU to resolve this performance
>> problem, too.
> 
> If it's a regression, what's the last version of QEMU where the bug
> wasn't present?
> 

As QEMU doesn't support host memory binding, I think
this has been present since guest NUMA support was added, and PCI passthrough made
it even worse.


Thanks,
Wanlong Gao
Eduardo Habkost June 13, 2013, 12:50 p.m. UTC | #10
On Thu, Jun 13, 2013 at 09:40:14AM +0800, Wanlong Gao wrote:
> On 06/11/2013 09:40 PM, Eduardo Habkost wrote:
> > On Tue, Jun 11, 2013 at 03:22:13PM +0800, Wanlong Gao wrote:
> >> On 06/05/2013 09:46 PM, Eduardo Habkost wrote:
> >>> On Wed, Jun 05, 2013 at 11:58:25AM +0800, Wanlong Gao wrote:
> >>>> Add monitor command mem-nodes to show the huge mapped
> >>>> memory nodes locations.
> >>>>
> >>>
> >>> This is for machine consumption, so we need a QMP command.
> >>>
> >>>> (qemu) info mem-nodes
> >>>> /proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
> >>>> /proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
> >>>> /proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
> >>>> /proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1
> >>>
> >>> Are node0/node1 _host_ nodes?
> >>>
> >>> How do I know what's the _guest_ address/node corresponding to each
> >>> file/range above?
> >>>
> >>> What I am really looking for is:
> >>>
> >>>  * The correspondence between guest (virtual) NUMA nodes and guest
> >>>    physical address ranges (it could be provided by the QMP version of
> >>>    "info numa")
> >>
> >> AFAIK, the guest NUMA nodes and guest physical address ranges are set
> >> by seabios, we can't get this information from QEMU,
> > 
> > QEMU _has_ to know about it, otherwise we would never be able to know
> > which virtual addresses inside the QEMU process (or offsets inside the
> > backing files) belong to which virtual NUMA node.
> 
> Nope, if I'm right, actually it's linear except that there are holes in
> the physical address spaces. So we can know which node the guest virtual
> address is included just by each numa node size.

You are just describing a way to accomplish the item I asked about
above: finding out the correspondence between guest physical addresses
and NUMA nodes.  :)

(But I would prefer to have something more explicit in the QMP interface
instead of something implicit that assumes a predefined binding)

> It's enough for us if we
> can provide a QMP interface from QEMU to let external tools like libvirt
> set the host memory binding policies according to the QMP interface, and
> we can also provide the QEMU command line option to be able to set host
> bindings through command line options before we start QEMU process.

And how would you identify memory regions through this memory binding
QMP interface, if not by guest physical addresses?


> 
> > 
> > (After all, the NUMA wiring is a hardware feature, not something that
> > the BIOS can decide)
> 
> But this is ACPI table which wrote by seabios now. AFAIK, there is no
> unified idea about moving this part to QEMU with the QEMU interfaces
> for seabios removed or just stay it there.

It doesn't matter who writes the ACPI table. QEMU must always know on
which virtual NUMA node each memory region is located.

> > 
> >> and I think this
> >> information is useless for pinning memory range to host.
> > 
> > Well, we have to somehow identify each region of guest memory when
> > deciding how to pin it. How would you identify it without using guest
> > physical addresses? Guest physical addresses are more meaningful than
> > the QEMU virtual addresses your patch exposes (that are meaningless
> > outside QEMU).
> 
> As I mentioned above, we can know this just by the guest node memory size,
> and can set the host bindings by treating this sizes as offsets.
> And I think we only need to set the host memory binding policies to each
> guest numa nodes. It's unnecessary to set policies to each region as you
> said.

I believe an interface based on guest physical memory addresses is more
flexible (and even simpler!) than one that only allows binding of whole
virtual NUMA nodes.

(And I still don't understand why you are exposing QEMU virtual memory
addresses in the new command, if they are useless).


> > 
> > 
> >>>  * The correspondence between guest physical address ranges and ranges
> >>>    inside the mapped files (so external tools could set the policy on
> >>>    those files instead of requiring QEMU to set it directly)
> >>>
> >>> I understand that your use case may require additional information and
> >>> additional interfaces. But if we provide the information above we will
> >>> allow external components set the policy on the hugetlbfs files before
> >>> we add new interfaces required for your use case.
> >>
> >> But the file backed memory is not good for the host which has many
> >> virtual machines, in this situation, we can't handle anon THP yet.
> > 
> > I don't understand what you mean, here. What prevents someone from using
> > file-backed memory with multiple virtual machines?
> 
> While if we use hugetlbfs backed memory, we should know how many virtual machines,
> how much memory each vm will use, then reserve these pages for them. And even
> should reserve more pages for external tools(numactl) to set memory policies.
> Even the memory reservation also has its own memory policies. It's very hard
> to control it to what we want to set.

Well, it's hard because we don't even have tools to help on that, yet.

Anyway, I understand that you want to make it work with THP as well. But
if THP works with tmpfs (does it?), people then could use exactly the
same file-based mechanisms with tmpfs and keep THP working.

(Right now I am doing some experiments to understand how the system
behaves when using numactl on hugetlbfs and tmpfs, before and after
getting the files mapped).


> > 
> >>
> >> And as I mentioned, the cross numa node access performance regression
> >> is caused by pci-passthrough, it's a very long time bug, we should
> >> back port the host memory pinning patch to old QEMU to resolve this performance
> >> problem, too.
> > 
> > If it's a regression, what's the last version of QEMU where the bug
> > wasn't present?
> > 
> 
>  As QEMU doesn't support host memory binding, I think
> this was present since we support guest NUMA, and the pci-passthrough made
> it even worse.

If the problem was always present, it is not a regression, is it?
Paolo Bonzini June 13, 2013, 10:32 p.m. UTC | #11
Il 13/06/2013 08:50, Eduardo Habkost ha scritto:
> I believe an interface based on guest physical memory addresses is more
> flexible (and even simpler!) than one that only allows binding of whole
> virtual NUMA nodes.

And "-numa node" is already one, what about just adding "mem-path=/foo"
or "host_node=NN" suboptions?  Then "-mem-path /foo" would be a shortcut
for "-numa node,mem-path=/foo".

I even had patches to convert -numa to QemuOpts; I can dig them out if
you're interested.

Paolo

Anthony Liguori June 14, 2013, 1:04 a.m. UTC | #12
Eduardo Habkost <ehabkost@redhat.com> writes:

> On Wed, Jun 05, 2013 at 07:57:42AM -0500, Anthony Liguori wrote:
>> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
>> 
>> > Add monitor command mem-nodes to show the huge mapped
>> > memory nodes locations.
>> >
>> > (qemu) info mem-nodes
>> > /proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
>> > /proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
>> > /proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
>> > /proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1
>> 
>> This creates an ABI that we don't currently support.  Memory hotplug or
>> a variety of things can break this mapping and then we'd have to provide
>> an interface to describe that the mapping was broken.
>
> What do you mean by "breaking this mapping", exactly? Would the backing
> file of existing guest RAM ever change? (It would require a memory copy
> from one file to another, why would QEMU ever do that?)

Memory hot-add will change the mapping.  Hot-remove (if ever
implemented) would break it.

>
>> 
>> Also, it only works with hugetlbfs, which is probably not widely used
>> given the existence of THP.
>
> Quoting yourself at
> http://article.gmane.org/gmane.comp.emulators.kvm.devel/58227:

Unfortunately the kernel side of the world has all but stalled making
progress here.  I don't think we have better choices.

>
>>> It's extremely likely that if you're doing NUMA pinning, you're also 
>>> doing large pages via hugetlbfs.  numactl can already set policies for 
>>> files in hugetlbfs so all you need to do is have a separate hugetlbfs 
>>> file for each numa node.
>>> 
>>> Then you have all the flexibility of numactl and you can implement node 
>>> migration external to QEMU if you so desire.
>
> And if we simply report where are the backing files and offsets being
> used for guest RAM, one could simply use
> 'numactl --file --offset --length', so we don't even need separate
> files/mem-paths for each node.
>
> Does THP work with tmpfs, already?

No.

> If it does, people who don't want
> hugetlbfs and want numa tuning to work with THP could just use tmpfs for
> -mem-path.
>
>> 
>> I had hoped that we would get proper userspace interfaces for describing
>> memory groups but that appears to have stalled out.
>
> I would love to have it. But while we don't have it, sharing the
> tmpfs/hugetlbfs backing files seem to work just fine as a mechanism to
> let other tools manipulate guest memory policy.  We just need to let
> external tools know where the backing files are.

Is this meant for numad?  Wouldn't you want numad to work without
hugetlbfs?

You have to preallocate pages to hugetlbfs.  It's very difficult to use
in practice.

>
>> 
>> Does anyone know if this is still on the table?
>> 
>> If we can't get a proper kernel interface, then perhaps we need to add
>> full libnuma support but that would really be unfortunate...
>
> Why isn't the "info mem-nodes" solution (I mean: not this version, but a
> proper QMP version that exposes all the information we need) an
> option?

We're exposing internal QEMU information (the HVA -> GPA mapping) as an
external stable interface.

Regards,

Anthony Liguori

Anthony Liguori June 14, 2013, 1:05 a.m. UTC | #13
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 13/06/2013 08:50, Eduardo Habkost wrote:
>> I believe an interface based on guest physical memory addresses is more
>> flexible (and even simpler!) than one that only allows binding of whole
>> virtual NUMA nodes.
>
> And "-numa node" is already one, what about just adding "mem-path=/foo"
> or "host_node=NN" suboptions?  Then "-mem-path /foo" would be a shortcut
> for "-numa node,mem-path=/foo".
>
> I even had patches to convert -numa to QemuOpts; I can dig them out if
> you're interested.

Ack.  This is a very reasonable thing to add.

Regards,

Anthony Liguori

>
> Paolo
>
>> (And I still don't understand why you are exposing QEMU virtual memory
>> addresses in the new command, if they are useless).
>> 
>> 
>>>>
>>>>
>>>>>>  * The correspondence between guest physical address ranges and ranges
>>>>>>    inside the mapped files (so external tools could set the policy on
>>>>>>    those files instead of requiring QEMU to set it directly)
>>>>>>
>>>>>> I understand that your use case may require additional information and
>>>>>> additional interfaces. But if we provide the information above we will
>>>>>> allow external components set the policy on the hugetlbfs files before
>>>>>> we add new interfaces required for your use case.
>>>>>
>>>>> But file-backed memory is not good for a host which has many
>>>>> virtual machines; in this situation, we can't handle anon THP yet.
>>>>
>>>> I don't understand what you mean, here. What prevents someone from using
>>>> file-backed memory with multiple virtual machines?
>>>
>>> But if we use hugetlbfs-backed memory, we have to know how many virtual machines
>>> there are and how much memory each VM will use, and then reserve those pages for them. We even
>>> have to reserve more pages for external tools (numactl) to set memory policies on.
>>> The memory reservation itself also has its own memory policies. It's very hard
>>> to control all of this to match what we want to set.
>> 
>> Well, it's hard because we don't even have tools to help on that, yet.
>> 
>> Anyway, I understand that you want to make it work with THP as well. But
>> if THP works with tmpfs (does it?), people then could use exactly the
>> same file-based mechanisms with tmpfs and keep THP working.
>> 
>> (Right now I am doing some experiments to understand how the system
>> behaves when using numactl on hugetlbfs and tmpfs, before and after
>> getting the files mapped).
>> 
>> 
>>>>
>>>>>
>>>>> And as I mentioned, the cross-NUMA-node access performance regression
>>>>> is caused by pci-passthrough; it's a long-standing bug, and we should
>>>>> backport the host memory pinning patch to older QEMU to resolve this performance
>>>>> problem, too.
>>>>
>>>> If it's a regression, what's the last version of QEMU where the bug
>>>> wasn't present?
>>>>
>>>
>>>  As QEMU doesn't support host memory binding, I think
>>> this has been present ever since guest NUMA support was added, and pci-passthrough made
>>> it even worse.
>> 
>> If the problem was always present, it is not a regression, is it?
>>
Wanlong Gao June 14, 2013, 1:16 a.m. UTC | #14
On 06/14/2013 09:05 AM, Anthony Liguori wrote:
> Paolo Bonzini <pbonzini@redhat.com> writes:
> 
>> On 13/06/2013 08:50, Eduardo Habkost wrote:
>>> I believe an interface based on guest physical memory addresses is more
>>> flexible (and even simpler!) than one that only allows binding of whole
>>> virtual NUMA nodes.
>>
>> And "-numa node" is already one, what about just adding "mem-path=/foo"
>> or "host_node=NN" suboptions?  Then "-mem-path /foo" would be a shortcut
>> for "-numa node,mem-path=/foo".
>>
>> I even had patches to convert -numa to QemuOpts; I can dig them out if
>> you're interested.
> 
> Ack.  This is a very reasonable thing to add.

How about adding "-numa node,membind=0"-style options, and also providing a
QMP interface "numa_set guest_node_id mempolicy", so that we can set
the mempolicy not only for file-backed memory but also for anon-mapped
guest NUMA nodes? That would be the full NUMA support in QEMU you mentioned. I'm working on
the patches now.


Thanks,
Wanlong Gao

> 
> Regards,
> 
> Anthony Liguori
> 
>>
>> Paolo
>>
>>> (And I still don't understand why you are exposing QEMU virtual memory
>>> addresses in the new command, if they are useless).
>>>
>>>
>>>>>
>>>>>
>>>>>>>  * The correspondence between guest physical address ranges and ranges
>>>>>>>    inside the mapped files (so external tools could set the policy on
>>>>>>>    those files instead of requiring QEMU to set it directly)
>>>>>>>
>>>>>>> I understand that your use case may require additional information and
>>>>>>> additional interfaces. But if we provide the information above we will
>>>>>>> allow external components set the policy on the hugetlbfs files before
>>>>>>> we add new interfaces required for your use case.
>>>>>>
>>>>>> But file-backed memory is not good for a host which has many
>>>>>> virtual machines; in this situation, we can't handle anon THP yet.
>>>>>
>>>>> I don't understand what you mean, here. What prevents someone from using
>>>>> file-backed memory with multiple virtual machines?
>>>>
>>>> But if we use hugetlbfs-backed memory, we have to know how many virtual machines
>>>> there are and how much memory each VM will use, and then reserve those pages for them. We even
>>>> have to reserve more pages for external tools (numactl) to set memory policies on.
>>>> The memory reservation itself also has its own memory policies. It's very hard
>>>> to control all of this to match what we want to set.
>>>
>>> Well, it's hard because we don't even have tools to help on that, yet.
>>>
>>> Anyway, I understand that you want to make it work with THP as well. But
>>> if THP works with tmpfs (does it?), people then could use exactly the
>>> same file-based mechanisms with tmpfs and keep THP working.
>>>
>>> (Right now I am doing some experiments to understand how the system
>>> behaves when using numactl on hugetlbfs and tmpfs, before and after
>>> getting the files mapped).
>>>
>>>
>>>>>
>>>>>>
>>>>>> And as I mentioned, the cross-NUMA-node access performance regression
>>>>>> is caused by pci-passthrough; it's a long-standing bug, and we should
>>>>>> backport the host memory pinning patch to older QEMU to resolve this performance
>>>>>> problem, too.
>>>>>
>>>>> If it's a regression, what's the last version of QEMU where the bug
>>>>> wasn't present?
>>>>>
>>>>
>>>>  As QEMU doesn't support host memory binding, I think
>>>> this has been present ever since guest NUMA support was added, and pci-passthrough made
>>>> it even worse.
>>>
>>> If the problem was always present, it is not a regression, is it?
>>>
> 
>
Eduardo Habkost June 14, 2013, 1:56 p.m. UTC | #15
On Thu, Jun 13, 2013 at 08:04:00PM -0500, Anthony Liguori wrote:
> Eduardo Habkost <ehabkost@redhat.com> writes:
> 
> > On Wed, Jun 05, 2013 at 07:57:42AM -0500, Anthony Liguori wrote:
> >> Wanlong Gao <gaowanlong@cn.fujitsu.com> writes:
> >> 
> >> > Add monitor command mem-nodes to show the huge mapped
> >> > memory nodes locations.
> >> >
> >> > (qemu) info mem-nodes
> >> > /proc/14132/fd/13: 00002aaaaac00000-00002aaaeac00000: node0
> >> > /proc/14132/fd/13: 00002aaaeac00000-00002aab2ac00000: node1
> >> > /proc/14132/fd/14: 00002aab2ac00000-00002aab2b000000: node0
> >> > /proc/14132/fd/14: 00002aab2b000000-00002aab2b400000: node1
> >> 
> >> This creates an ABI that we don't currently support.  Memory hotplug or
> >> a variety of things can break this mapping and then we'd have to provide
> >> an interface to describe that the mapping was broken.
> >
> > What do you mean by "breaking this mapping", exactly? Would the backing
> > file of existing guest RAM ever change? (It would require a memory copy
> > from one file to another, why would QEMU ever do that?)
> 
> Memory hot-add will change the mapping.  hot-remove (if ever
> implemented) would break it.

So, would the backing-file/offset of existing guest RAM ever change? (It
would require a memory copy from one file to another, why would QEMU
ever do that?)


[...]
> >
> > Does THP work with tmpfs, already?
> 
> No.

OK, that's a real problem.


> > If it does, people who don't want
> > hugetlbfs and want numa tuning to work with THP could just use tmpfs for
> > -mem-path.
> >
> >> 
> >> I had hoped that we would get proper userspace interfaces for describing
> >> memory groups but that appears to have stalled out.
> >
> > I would love to have it. But while we don't have it, sharing the
> > tmpfs/hugetlbfs backing files seems to work just fine as a mechanism to
> > let other tools manipulate guest memory policy.  We just need to let
> > external tools know where the backing files are.
> 
> Is this meant for numad?  Wouldn't you want numad to work without
> hugetlbfs?
> 
> You have to preallocate pages to hugetlbfs.  It's very difficult to use
> in practice.

If you don't want hugetlbfs you could use tmpfs, and set the policy on
the tmpfs files. What I am asking is: why do we need to ask the kernel
folks for interfaces to define and set policies on memory groups if we
can (in theory) do exactly the same thing using tmpfs and hugetlbfs files?

(But the fact that THP doesn't work with tmpfs is a real problem, as I
said above)

> >> 
> >> Does anyone know if this is still on the table?
> >> 
> >> If we can't get a proper kernel interface, then perhaps we need to add
> >> full libnuma support but that would really be unfortunate...
> >
> > Why isn't the "info mem-nodes" solution (I mean: not this version, but a
> > proper QMP version that exposes all the information we need) an
> > option?
> 
> We're exposing internal QEMU information (the HVA -> GPA mapping) as an
> external stable interface.

I never wanted to expose the HVA -> GPA mapping. What I want to expose
is:

 * The virtual-NUMA-node -> GPA-range mapping
 * The GPA -> mem-path file/offset mapping

(Alternatively, a simple virtual-NUMA-node -> mem-path file/offset
mapping would be enough, too)

We could even replace "mem-path file/offset mapping" with "memory
groups", if the kernel already had interfaces to deal with memory
groups.
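
To make the shape of that information concrete, the two mappings above could
be represented as something like the plain C structures below. This is not
QEMU's actual QAPI schema, just a sketch; every field name and the example
values are invented.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

typedef struct GuestMemRange {
    uint64_t gpa_start;       /* guest physical address of the range */
    uint64_t gpa_length;      /* length of the range in bytes */
    const char *mem_path;     /* backing file (hugetlbfs/tmpfs), if any */
    uint64_t file_offset;     /* offset of the range inside mem_path */
} GuestMemRange;

typedef struct GuestNumaNode {
    int node_id;              /* virtual NUMA node number */
    GuestMemRange *ranges;    /* GPA ranges assigned to this node */
    int num_ranges;
} GuestNumaNode;

int main(void)
{
    /* Purely hypothetical example: virtual node 0 backed by one 1 GB file. */
    GuestMemRange r = { 0x0, 1ULL << 30, "/dev/hugepages/guest-node0", 0x0 };
    GuestNumaNode n = { 0, &r, 1 };

    printf("node%d: GPA 0x%016" PRIx64 "-0x%016" PRIx64 " -> %s @ 0x%" PRIx64 "\n",
           n.node_id, r.gpa_start, r.gpa_start + r.gpa_length,
           r.mem_path, r.file_offset);
    return 0;
}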
Paolo Bonzini June 15, 2013, 5:23 p.m. UTC | #16
On 13/06/2013 21:16, Wanlong Gao wrote:
> On 06/14/2013 09:05 AM, Anthony Liguori wrote:
>> Paolo Bonzini <pbonzini@redhat.com> writes:
>>
>>> On 13/06/2013 08:50, Eduardo Habkost wrote:
>>>> I believe an interface based on guest physical memory addresses is more
>>>> flexible (and even simpler!) than one that only allows binding of whole
>>>> virtual NUMA nodes.
>>>
>>> And "-numa node" is already one, what about just adding "mem-path=/foo"
>>> or "host_node=NN" suboptions?  Then "-mem-path /foo" would be a shortcut
>>> for "-numa node,mem-path=/foo".
>>>
>>> I even had patches to convert -numa to QemuOpts; I can dig them out if
>>> you're interested.
>>
>> Ack.  This is a very reasonable thing to add.
> 
> How about adding "-numa node,membind=0"-style options, and also providing a
> QMP interface "numa_set guest_node_id mempolicy", so that we can set
> the mempolicy not only for file-backed memory but also for anon-mapped
> guest NUMA nodes? That would be the full NUMA support in QEMU you mentioned. I'm working on
> the patches now.

Yup, that's exactly what I called "host_node".  membind also makes
sense, but make it mem-bind or mem-host-node for consistency with mem-path.

Paolo
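
As a rough standalone sketch (not part of the submitted patch below), binding
a guest node's anonymous RAM to a host node while keeping THP usable could
look like the following. The helper name, sizes and node numbers are
invented, and a real implementation would go through QEMU's RAM allocation
and option-parsing code rather than plain mmap().

#include <numaif.h>                   /* mbind(); build with -lnuma */
#include <stdio.h>
#include <sys/mman.h>

/* Hypothetical helper: allocate one guest NUMA node's RAM anonymously
 * (so anon THP can still back it) and bind it to the given host node. */
static void *alloc_guest_node_ram(size_t size, int host_node)
{
    void *ram = mmap(NULL, size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (ram == MAP_FAILED) {
        return NULL;
    }

    /* Tell the kernel transparent huge pages are welcome for this range. */
    madvise(ram, size, MADV_HUGEPAGE);

    /* Restrict the whole range to the requested host node; pages are bound
     * lazily as they are first touched. */
    unsigned long nodemask = 1UL << host_node;
    if (mbind(ram, size, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0) < 0) {
        munmap(ram, size);
        return NULL;
    }
    return ram;
}

int main(void)
{
    /* e.g. what "-numa node,host-node=0" could mean for a 2 GB guest node. */
    void *ram = alloc_guest_node_ram(2UL << 30, 0);
    printf("guest node RAM at %p\n", ram);
    return ram ? 0 : 1;
}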
diff mbox

Patch

diff --git a/monitor.c b/monitor.c
index eefc7f0..85c865f 100644
--- a/monitor.c
+++ b/monitor.c
@@ -74,6 +74,10 @@ 
 #endif
 #include "hw/lm32/lm32_pic.h"
 
+#if defined(CONFIG_NUMA)
+#include <numaif.h>
+#endif
+
 //#define DEBUG
 //#define DEBUG_COMPLETION
 
@@ -1759,6 +1763,38 @@  static void mem_info(Monitor *mon, const QDict *qdict)
 }
 #endif
 
+#if defined(CONFIG_NUMA)
+static void mem_nodes(Monitor *mon, const QDict *qdict)
+{
+    RAMBlock *block;
+    int prevnode, node;
+    unsigned long long c, start, area;
+    int fd;
+    int pid = getpid();
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+        if (!(fd = block->fd))
+            continue;
+        prevnode = -1;
+        start = 0;
+        area = (unsigned long long)block->host;
+        for (c = 0; c < block->length; c += TARGET_PAGE_SIZE) {
+            if (get_mempolicy(&node, NULL, 0, c + block->host,
+                              MPOL_F_ADDR | MPOL_F_NODE) < 0)
+                continue;
+            if (node == prevnode)
+                continue;
+            if (prevnode != -1)
+                monitor_printf(mon, "/proc/%d/fd/%d: %016Lx-%016Lx: node%d\n",
+                               pid, fd, start + area, c + area, prevnode);
+            prevnode = node;
+            start = c;
+         }
+         monitor_printf(mon, "/proc/%d/fd/%d: %016Lx-%016Lx: node%d\n",
+                        pid, fd, start + area, c + area, prevnode);
+    }
+}
+#endif
+
 #if defined(TARGET_SH4)
 
 static void print_tlb(Monitor *mon, int idx, tlb_t *tlb)
@@ -2567,6 +2603,15 @@  static mon_cmd_t info_cmds[] = {
         .mhandler.cmd = mem_info,
     },
 #endif
+#if defined(CONFIG_NUMA)
+    {
+        .name       = "mem-nodes",
+        .args_type  = "",
+        .params     = "",
+        .help       = "show the huge mapped memory nodes location",
+        .mhandler.cmd = mem_nodes,
+    },
+#endif
     {
         .name       = "mtree",
         .args_type  = "",