diff mbox

[v3.1,25/31] hostmem: add properties for NUMA memory policy

Message ID 947fa24a92db11d593709f37d49f6603a0e93c69.1399365798.git.hutao@cn.fujitsu.com
State New
Headers show

Commit Message

Hu Tao May 6, 2014, 9:27 a.m. UTC
Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
[Raise errors on setting properties if !CONFIG_NUMA.  Add BUILD_BUG_ON
 checks. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
---
 backends/hostmem.c       | 109 ++++++++++++++++++++++++++++++++++++++++++++++-
 include/sysemu/hostmem.h |   4 ++
 qapi-schema.json         |  20 +++++++++
 3 files changed, 132 insertions(+), 1 deletion(-)

Comments

Eduardo Habkost May 19, 2014, 11:34 p.m. UTC | #1
On Tue, May 06, 2014 at 05:27:46PM +0800, Hu Tao wrote:
[...]
> @@ -203,6 +296,20 @@ host_memory_backend_memory_init(UserCreatable *uc, Error **errp)
>      if (backend->prealloc) {
>          os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz);
>      }
> +
> +#ifdef CONFIG_NUMA
> +    unsigned long maxnode = find_last_bit(backend->host_nodes, MAX_NODES);
> +
> +    /* This is a workaround for a long standing bug in Linux'
> +     * mbind implementation, which cuts off the last specified
> +     * node.
> +     */

What if the bug is fixed? mbind() documentation says "nodemask points to
a bit mask of nodes containing up to maxnode bits", so we must ensure
backend->host_nodes has the one extra bit.

Also, if no bit is set, we can pass nodemask=NULL or maxnode=0 as
argument.

We could address both issues, and do this:

    struct HostMemoryBackend { [...]
        DECLARE_BITMAP(host_nodes, MAX_NODES + 1);
    [...]
    lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
    /* lastbit == MAX_NODES means maxnode=0 */
    maxnode = (lastbit + 1) % (MAX_NODES + 1);
    /* We can have up to MAX_NODES nodes, but we need to pass maxnode+1
     * as argument to mbind() due to an old Linux bug (feature?) which
     * cuts off the last specified node. This means backend->host_nodes
     * must have MAX_NODES+1 bits available.
     */
    assert(sizeof(backend->host_nodes) >= BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
    assert(maxnode <= MAX_NODES);
    mbind(ptr, sz, policy, maxnode ? backend->host_nodes : NULL, maxnode + 1, flags);


(I am starting to wonder if it was worth dropping the libnuma
requirement and implementing our own mbind()-calling code.)

> +    if (mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 2, 0)) {
> +        error_setg_errno(errp, errno,
> +                         "cannot bind memory to host NUMA nodes");

Don't we want to set flags to MPOL_MF_STRICT here? I believe we
shouldn't have any pages preallocated at this point, but in case we do,
I would expect them to be moved instead of ignoring the policy set by
the user.

> +        return;
> +    }
> +#endif
>  }
>  
>  MemoryRegion *
> diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h
> index 819b72d..4e96298 100644
> --- a/include/sysemu/hostmem.h
> +++ b/include/sysemu/hostmem.h
> @@ -12,8 +12,10 @@
>  #ifndef QEMU_HOSTMEM_H
>  #define QEMU_HOSTMEM_H
>  
> +#include "sysemu/sysemu.h" /* for MAX_NODES */
>  #include "qom/object.h"
>  #include "exec/memory.h"
> +#include "qemu/bitmap.h"
>  
>  #define TYPE_MEMORY_BACKEND "memory"
>  #define MEMORY_BACKEND(obj) \
> @@ -52,6 +54,8 @@ struct HostMemoryBackend {
>      uint64_t size;
>      bool merge, dump;
>      bool prealloc, force_prealloc;
> +    DECLARE_BITMAP(host_nodes, MAX_NODES);
> +    HostMemPolicy policy;
>  
>      MemoryRegion mr;
>  };
> diff --git a/qapi-schema.json b/qapi-schema.json
> index 5dd30eb..bea3476 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -4732,3 +4732,23 @@
>     '*cpus':   ['uint16'],
>     '*mem':    'size',
>     '*memdev': 'str' }}
> +
> +##
> +# @HostMemPolicy
> +#
> +# Host memory policy types
> +#
> +# @default: restore default policy, remove any nondefault policy
> +#
> +# @preferred: set the preferred host nodes for allocation
> +#
> +# @bind: a strict policy that restricts memory allocation to the
> +#        host nodes specified
> +#
> +# @interleave: memory allocations are interleaved across the set
> +#              of host nodes specified
> +#
> +# Since 2.1
> +##
> +{ 'enum': 'HostMemPolicy',
> +  'data': [ 'default', 'preferred', 'bind', 'interleave' ] }
> -- 
> 1.8.5.2.229.g4448466
> 
>
Eduardo Habkost May 19, 2014, 11:45 p.m. UTC | #2
On Tue, May 06, 2014 at 05:27:46PM +0800, Hu Tao wrote:
[...]
> +static void
> +set_host_nodes(Object *obj, Visitor *v, void *opaque, const char *name,
> +               Error **errp)
> +{
> +#ifdef CONFIG_NUMA
> +    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
> +    uint16List *l = NULL;
> +
> +    visit_type_uint16List(v, &l, name, errp);
> +
> +    while (l) {
> +        bitmap_set(backend->host_nodes, l->value, 1);

Shouldn't we clear the existing bits, here?

> +        l = l->next;
> +    }
> +#else
> +    error_setg(errp, "NUMA node binding are not supported by this QEMU");
> +#endif
> +}
Hu Tao June 6, 2014, 3:37 a.m. UTC | #3
On Mon, May 19, 2014 at 08:34:54PM -0300, Eduardo Habkost wrote:
> On Tue, May 06, 2014 at 05:27:46PM +0800, Hu Tao wrote:
> [...]
> > @@ -203,6 +296,20 @@ host_memory_backend_memory_init(UserCreatable *uc, Error **errp)
> >      if (backend->prealloc) {
> >          os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz);
> >      }
> > +
> > +#ifdef CONFIG_NUMA
> > +    unsigned long maxnode = find_last_bit(backend->host_nodes, MAX_NODES);
> > +
> > +    /* This is a workaround for a long standing bug in Linux'
> > +     * mbind implementation, which cuts off the last specified
> > +     * node.
> > +     */
> 
> What if the bug is fixed? mbind() documentation says "nodemask points to

No, it won't be fixed; otherwise software that depends on mbind() would break.

> a bit mask of nodes containing up to maxnode bits", so we must ensure
> backend->host_nodes has the one extra bit.

Yes.

> 
> Also, if no bit is set, we can pass nodemask=NULL or maxnode=0 as
> argument.
> 
> We could address both issues, and do this:
> 
>     struct HostMemoryBackend { [...]
>         DECLARE_BITMAP(host_nodes, MAX_NODES + 1);
>     [...]
>     lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
>     /* lastbit == MAX_NODES means maxnode=0 */
>     maxnode = (lastbit + 1) % (MAX_NODES + 1);
>     /* We can have up to MAX_NODES nodes, but we need to pass maxnode+1
>      * as argument to mbind() due to an old Linux bug (feature?) which
>      * cuts off the last specified node. This means backend->host_nodes
>      * must have MAX_NODES+1 bits available.
>      */
>     assert(sizeof(backend->host_nodes) >= BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
>     assert(maxnode <= MAX_NODES);

I think we can just omit these two asserts since they are guaranteed to
be true.

>     mbind(ptr, sz, policy, maxnode ? backend->host_nodes : NULL, maxnode + 1, flags);
> 
> 
> (I am starting to wonder if it was worth dropping the libnuma
> requirement and implementing our own mbind()-calling code.)
> 
> > +    if (mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 2, 0)) {
> > +        error_setg_errno(errp, errno,
> > +                         "cannot bind memory to host NUMA nodes");
> 
> Don't we want to set flags to MPOL_MF_STRICT here? I believe we
> shouldn't have any pages preallocated at this point, but in case we do,
> I would expect them to be moved instead of ignoring the policy set by
> the user.

MPOL_MF_STRICT | MPOL_MF_MOVE to move. Actually in this version the
preallocation happens before mbind, which is fixed in v3.2.


> 
> > +        return;
> > +    }
> > +#endif
> >  }
> >  
> >  MemoryRegion *
> > diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h
> > index 819b72d..4e96298 100644
> > --- a/include/sysemu/hostmem.h
> > +++ b/include/sysemu/hostmem.h
> > @@ -12,8 +12,10 @@
> >  #ifndef QEMU_HOSTMEM_H
> >  #define QEMU_HOSTMEM_H
> >  
> > +#include "sysemu/sysemu.h" /* for MAX_NODES */
> >  #include "qom/object.h"
> >  #include "exec/memory.h"
> > +#include "qemu/bitmap.h"
> >  
> >  #define TYPE_MEMORY_BACKEND "memory"
> >  #define MEMORY_BACKEND(obj) \
> > @@ -52,6 +54,8 @@ struct HostMemoryBackend {
> >      uint64_t size;
> >      bool merge, dump;
> >      bool prealloc, force_prealloc;
> > +    DECLARE_BITMAP(host_nodes, MAX_NODES);
> > +    HostMemPolicy policy;
> >  
> >      MemoryRegion mr;
> >  };
> > diff --git a/qapi-schema.json b/qapi-schema.json
> > index 5dd30eb..bea3476 100644
> > --- a/qapi-schema.json
> > +++ b/qapi-schema.json
> > @@ -4732,3 +4732,23 @@
> >     '*cpus':   ['uint16'],
> >     '*mem':    'size',
> >     '*memdev': 'str' }}
> > +
> > +##
> > +# @HostMemPolicy
> > +#
> > +# Host memory policy types
> > +#
> > +# @default: restore default policy, remove any nondefault policy
> > +#
> > +# @preferred: set the preferred host nodes for allocation
> > +#
> > +# @bind: a strict policy that restricts memory allocation to the
> > +#        host nodes specified
> > +#
> > +# @interleave: memory allocations are interleaved across the set
> > +#              of host nodes specified
> > +#
> > +# Since 2.1
> > +##
> > +{ 'enum': 'HostMemPolicy',
> > +  'data': [ 'default', 'preferred', 'bind', 'interleave' ] }
> > -- 
> > 1.8.5.2.229.g4448466
> > 
> > 
> 
> -- 
> Eduardo
Eduardo Habkost June 6, 2014, 4:15 p.m. UTC | #4
On Fri, Jun 06, 2014 at 11:37:26AM +0800, Hu Tao wrote:
> On Mon, May 19, 2014 at 08:34:54PM -0300, Eduardo Habkost wrote:
> > On Tue, May 06, 2014 at 05:27:46PM +0800, Hu Tao wrote:
> > [...]
> > > @@ -203,6 +296,20 @@ host_memory_backend_memory_init(UserCreatable *uc, Error **errp)
> > >      if (backend->prealloc) {
> > >          os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz);
> > >      }
> > > +
> > > +#ifdef CONFIG_NUMA
> > > +    unsigned long maxnode = find_last_bit(backend->host_nodes, MAX_NODES);
> > > +
> > > +    /* This is a workaround for a long standing bug in Linux'
> > > +     * mbind implementation, which cuts off the last specified
> > > +     * node.
> > > +     */
> > 
> > What if the bug is fixed? mbind() documentation says "nodemask points to
> 
> No it won't, otherwise softwares depend on mbind() will break.
> 
> > a bit mask of nodes containing up to maxnode bits", so we must ensure
> > backend->host_nodes has the one extra bit.
> 
> Yes.
> 
> > 
> > Also, if no bit is set, we can pass nodemask=NULL or maxnode=0 as
> > argument.
> > 
> > We could address both issues, and do this:
> > 
> >     struct HostMemoryBackend { [...]
> >         DECLARE_BITMAP(host_nodes, MAX_NODES + 1);
> >     [...]
> >     lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
> >     /* lastbit == MAX_NODES means maxnode=0 */
> >     maxnode = (lastbit + 1) % (MAX_NODES + 1);
> >     /* We can have up to MAX_NODES nodes, but we need to pass maxnode+1
> >      * as argument to mbind() due to an old Linux bug (feature?) which
> >      * cuts off the last specified node. This means backend->host_nodes
> >      * must have MAX_NODES+1 bits available.
> >      */
> >     assert(sizeof(backend->host_nodes) >= BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
> >     assert(maxnode <= MAX_NODES);
> 
> I think we can just omit these two asserts since they are guaranteed to
> be true.

assert()s must always be guaranteed to be true; that's the whole point
of using them. They can detect subtle off-by-one bugs if somebody
introduces them in the future.

> 
> >     mbind(ptr, sz, policy, maxnode ? backend->host_nodes : NULL, maxnode + 1, flags);
> > 
> > 
> > (I am starting to wonder if it was worth dropping the libnuma
> > requirement and implementing our own mbind()-calling code.)
> > 
> > > +    if (mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 2, 0)) {
> > > +        error_setg_errno(errp, errno,
> > > +                         "cannot bind memory to host NUMA nodes");
> > 
> > Don't we want to set flags to MPOL_MF_STRICT here? I believe we
> > shouldn't have any pages preallocated at this point, but in case we do,
> > I would expect them to be moved instead of ignoring the policy set by
> > the user.
> 
> MPOL_MF_STRICT | MPOL_MF_MOVE to move. Actually in this version the
> preallocation happens before mbind, which is fixed in v3.2.

If memory was already allocated in a different node and has to be moved
that early, that's a bug we want to detect and fix (instead of
triggering useless memory moves). So I would use only MPOL_MF_STRICT.
Hu Tao June 9, 2014, 2:12 a.m. UTC | #5
On Fri, Jun 06, 2014 at 01:15:28PM -0300, Eduardo Habkost wrote:
> On Fri, Jun 06, 2014 at 11:37:26AM +0800, Hu Tao wrote:
> > On Mon, May 19, 2014 at 08:34:54PM -0300, Eduardo Habkost wrote:
> > > On Tue, May 06, 2014 at 05:27:46PM +0800, Hu Tao wrote:
> > > [...]
> > > > @@ -203,6 +296,20 @@ host_memory_backend_memory_init(UserCreatable *uc, Error **errp)
> > > >      if (backend->prealloc) {
> > > >          os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz);
> > > >      }
> > > > +
> > > > +#ifdef CONFIG_NUMA
> > > > +    unsigned long maxnode = find_last_bit(backend->host_nodes, MAX_NODES);
> > > > +
> > > > +    /* This is a workaround for a long standing bug in Linux'
> > > > +     * mbind implementation, which cuts off the last specified
> > > > +     * node.
> > > > +     */
> > > 
> > > What if the bug is fixed? mbind() documentation says "nodemask points to
> > 
> > No it won't, otherwise softwares depend on mbind() will break.
> > 
> > > a bit mask of nodes containing up to maxnode bits", so we must ensure
> > > backend->host_nodes has the one extra bit.
> > 
> > Yes.
> > 
> > > 
> > > Also, if no bit is set, we can pass nodemask=NULL or maxnode=0 as
> > > argument.
> > > 
> > > We could address both issues, and do this:
> > > 
> > >     struct HostMemoryBackend { [...]
> > >         DECLARE_BITMAP(host_nodes, MAX_NODES + 1);
> > >     [...]
> > >     lastbit = find_last_bit(backend->host_nodes, MAX_NODES);
> > >     /* lastbit == MAX_NODES means maxnode=0 */
> > >     maxnode = (lastbit + 1) % (MAX_NODES + 1);
> > >     /* We can have up to MAX_NODES nodes, but we need to pass maxnode+1
> > >      * as argument to mbind() due to an old Linux bug (feature?) which
> > >      * cuts off the last specified node. This means backend->host_nodes
> > >      * must have MAX_NODES+1 bits available.
> > >      */
> > >     assert(sizeof(backend->host_nodes) >= BITS_TO_LONGS(MAX_NODES + 1) * sizeof(unsigned long));
> > >     assert(maxnode <= MAX_NODES);
> > 
> > I think we can just omit these two asserts since they are guaranteed to
> > be true.
> 
> asserts() must be always guaranteed to be true, that's the whole point
> of using them. They can detect subtle off-by-one bugs if somebody
> introduces them in the future.
> 
> > 
> > >     mbind(ptr, sz, policy, maxnode ? backend->host_nodes : NULL, maxnode + 1, flags);
> > > 
> > > 
> > > (I am starting to wonder if it was worth dropping the libnuma
> > > requirement and implementing our own mbind()-calling code.)
> > > 
> > > > +    if (mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 2, 0)) {
> > > > +        error_setg_errno(errp, errno,
> > > > +                         "cannot bind memory to host NUMA nodes");
> > > 
> > > Don't we want to set flags to MPOL_MF_STRICT here? I believe we
> > > shouldn't have any pages preallocated at this point, but in case we do,
> > > I would expect them to be moved instead of ignoring the policy set by
> > > the user.
> > 
> > MPOL_MF_STRICT | MPOL_MF_MOVE to move. Actually in this version the
> > preallocation happens before mbind, which is fixed in v3.2.
> 
> If memory was already allocated in a different node and has to be moved
> that early, that's a bug we want to detect and fix (instead of
> triggering useles memory moves). So I would use only MPOL_MF_STRICT.

Fair enough. But what about huge pages? As the man page says, MPOL_MF_STRICT
is ignored on huge page mappings. Is leaving a comment at the place of
memory preallocation to warn people against allocating memory before mbind
(like it's done in v3.2) the only thing we can do?

Regards,
Hu
Eduardo Habkost June 10, 2014, 6:44 p.m. UTC | #6
On Mon, Jun 09, 2014 at 10:12:07AM +0800, Hu Tao wrote:
[...]
> > > 
> > > >     mbind(ptr, sz, policy, maxnode ? backend->host_nodes : NULL, maxnode + 1, flags);
> > > > 
> > > > 
> > > > (I am starting to wonder if it was worth dropping the libnuma
> > > > requirement and implementing our own mbind()-calling code.)
> > > > 
> > > > > +    if (mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 2, 0)) {
> > > > > +        error_setg_errno(errp, errno,
> > > > > +                         "cannot bind memory to host NUMA nodes");
> > > > 
> > > > Don't we want to set flags to MPOL_MF_STRICT here? I believe we
> > > > shouldn't have any pages preallocated at this point, but in case we do,
> > > > I would expect them to be moved instead of ignoring the policy set by
> > > > the user.
> > > 
> > > MPOL_MF_STRICT | MPOL_MF_MOVE to move. Actually in this version the
> > > preallocation happens before mbind, which is fixed in v3.2.
> > 
> > If memory was already allocated in a different node and has to be moved
> > that early, that's a bug we want to detect and fix (instead of
> > triggering useles memory moves). So I would use only MPOL_MF_STRICT.
> 
> Fair enough. But what about huge pages? As man page says, MPOL_MF_STRICT
> is ignored on huge page mappings. Is leaving a comment at the place of
> memory preallocation to warn people against alocating memory before mbind
> (like it's done in v3.2) the only thing we can do?

Well, maybe the kernel should be fixed to not ignore MPOL_MF_STRICT on
huge page mappings, then. Does anybody know if the warning on the
manpage still applies, and if this can be changed?

In the meantime, it looks like all we can do is to print a warning, or
refuse to preallocate before mbind().
diff mbox

Patch

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 738bb31..d3f8476 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -10,11 +10,20 @@ 
  * See the COPYING file in the top-level directory.
  */
 #include "sysemu/hostmem.h"
-#include "sysemu/sysemu.h"
 #include "qapi/visitor.h"
+#include "qapi-types.h"
+#include "qapi-visit.h"
 #include "qapi/qmp/qerror.h"
 #include "qom/object_interfaces.h"
 
+#ifdef CONFIG_NUMA
+#include <numaif.h>
+QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_DEFAULT != MPOL_DEFAULT);
+QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_PREFERRED != MPOL_PREFERRED);
+QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_BIND != MPOL_BIND);
+QEMU_BUILD_BUG_ON(HOST_MEM_POLICY_INTERLEAVE != MPOL_INTERLEAVE);
+#endif
+
 static void
 host_memory_backend_get_size(Object *obj, Visitor *v, void *opaque,
                             const char *name, Error **errp)
@@ -49,6 +58,84 @@  host_memory_backend_set_size(Object *obj, Visitor *v, void *opaque,
     backend->size = value;
 }
 
+static void
+get_host_nodes(Object *obj, Visitor *v, void *opaque, const char *name,
+               Error **errp)
+{
+    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
+    uint16List *host_nodes = NULL;
+    uint16List **node = &host_nodes;
+    unsigned long value;
+
+    value = find_first_bit(backend->host_nodes, MAX_NODES);
+    if (value == MAX_NODES) {
+        return;
+    }
+
+    *node = g_malloc0(sizeof(**node));
+    (*node)->value = value;
+    node = &(*node)->next;
+
+    do {
+        value = find_next_bit(backend->host_nodes, MAX_NODES, value + 1);
+        if (value == MAX_NODES) {
+            break;
+        }
+
+        *node = g_malloc0(sizeof(**node));
+        (*node)->value = value;
+        node = &(*node)->next;
+    } while (true);
+
+    visit_type_uint16List(v, &host_nodes, name, errp);
+}
+
+static void
+set_host_nodes(Object *obj, Visitor *v, void *opaque, const char *name,
+               Error **errp)
+{
+#ifdef CONFIG_NUMA
+    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
+    uint16List *l = NULL;
+
+    visit_type_uint16List(v, &l, name, errp);
+
+    while (l) {
+        bitmap_set(backend->host_nodes, l->value, 1);
+        l = l->next;
+    }
+#else
+    error_setg(errp, "NUMA node binding are not supported by this QEMU");
+#endif
+}
+
+static void
+get_policy(Object *obj, Visitor *v, void *opaque, const char *name,
+           Error **errp)
+{
+    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
+    int policy = backend->policy;
+
+    visit_type_enum(v, &policy, HostMemPolicy_lookup, NULL, name, errp);
+}
+
+static void
+set_policy(Object *obj, Visitor *v, void *opaque, const char *name,
+           Error **errp)
+{
+    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
+    int policy;
+
+    visit_type_enum(v, &policy, HostMemPolicy_lookup, NULL, name, errp);
+    backend->policy = policy;
+
+#ifndef CONFIG_NUMA
+    if (policy != HOST_MEM_POLICY_DEFAULT) {
+        error_setg(errp, "NUMA policies are not supported by this QEMU");
+    }
+#endif
+}
+
 static bool host_memory_backend_get_merge(Object *obj, Error **errp)
 {
     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
@@ -159,6 +246,12 @@  static void host_memory_backend_initfn(Object *obj)
     object_property_add(obj, "size", "int",
                         host_memory_backend_get_size,
                         host_memory_backend_set_size, NULL, NULL, NULL);
+    object_property_add(obj, "host-nodes", "int",
+                        get_host_nodes,
+                        set_host_nodes, NULL, NULL, NULL);
+    object_property_add(obj, "policy", "str",
+                        get_policy,
+                        set_policy, NULL, NULL, NULL);
 }
 
 static void host_memory_backend_finalize(Object *obj)
@@ -203,6 +296,20 @@  host_memory_backend_memory_init(UserCreatable *uc, Error **errp)
     if (backend->prealloc) {
         os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz);
     }
+
+#ifdef CONFIG_NUMA
+    unsigned long maxnode = find_last_bit(backend->host_nodes, MAX_NODES);
+
+    /* This is a workaround for a long standing bug in Linux'
+     * mbind implementation, which cuts off the last specified
+     * node.
+     */
+    if (mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 2, 0)) {
+        error_setg_errno(errp, errno,
+                         "cannot bind memory to host NUMA nodes");
+        return;
+    }
+#endif
 }
 
 MemoryRegion *
diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h
index 819b72d..4e96298 100644
--- a/include/sysemu/hostmem.h
+++ b/include/sysemu/hostmem.h
@@ -12,8 +12,10 @@ 
 #ifndef QEMU_HOSTMEM_H
 #define QEMU_HOSTMEM_H
 
+#include "sysemu/sysemu.h" /* for MAX_NODES */
 #include "qom/object.h"
 #include "exec/memory.h"
+#include "qemu/bitmap.h"
 
 #define TYPE_MEMORY_BACKEND "memory"
 #define MEMORY_BACKEND(obj) \
@@ -52,6 +54,8 @@  struct HostMemoryBackend {
     uint64_t size;
     bool merge, dump;
     bool prealloc, force_prealloc;
+    DECLARE_BITMAP(host_nodes, MAX_NODES);
+    HostMemPolicy policy;
 
     MemoryRegion mr;
 };
diff --git a/qapi-schema.json b/qapi-schema.json
index 5dd30eb..bea3476 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -4732,3 +4732,23 @@ 
    '*cpus':   ['uint16'],
    '*mem':    'size',
    '*memdev': 'str' }}
+
+##
+# @HostMemPolicy
+#
+# Host memory policy types
+#
+# @default: restore default policy, remove any nondefault policy
+#
+# @preferred: set the preferred host nodes for allocation
+#
+# @bind: a strict policy that restricts memory allocation to the
+#        host nodes specified
+#
+# @interleave: memory allocations are interleaved across the set
+#              of host nodes specified
+#
+# Since 2.1
+##
+{ 'enum': 'HostMemPolicy',
+  'data': [ 'default', 'preferred', 'bind', 'interleave' ] }