
[v18,13/14] memory backend: fill memory backend ram fields

Message ID fad899d2a898a9a4b487fd27791b32dee6ddeaff.1392794450.git.hutao@cn.fujitsu.com
State New

Commit Message

Hu Tao Feb. 19, 2014, 7:54 a.m. UTC
This lets the user control how memory is allocated for the ram backend.

Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
---
 backends/hostmem-ram.c  | 158 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/sysemu/sysemu.h |   2 +
 2 files changed, 160 insertions(+)

Comments

Paolo Bonzini Feb. 19, 2014, 9:03 a.m. UTC | #1
On 19/02/2014 08:54, Hu Tao wrote:
> Thus makes user control how to allocate memory for ram backend.
>
> Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
> ---
>  backends/hostmem-ram.c  | 158 ++++++++++++++++++++++++++++++++++++++++++++++++
>  include/sysemu/sysemu.h |   2 +
>  2 files changed, 160 insertions(+)
>
> diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c
> index a496dbd..2da9341 100644
> --- a/backends/hostmem-ram.c
> +++ b/backends/hostmem-ram.c
> @@ -10,23 +10,179 @@
>   * See the COPYING file in the top-level directory.
>   */
>  #include "sysemu/hostmem.h"
> +#include "sysemu/sysemu.h"
> +#include "qemu/bitmap.h"
> +#include "qapi-visit.h"
> +#include "qemu/config-file.h"
> +#include "qapi/opts-visitor.h"
>
>  #define TYPE_MEMORY_BACKEND_RAM "memory-ram"
> +#define MEMORY_BACKEND_RAM(obj) \
> +    OBJECT_CHECK(HostMemoryBackendRam, (obj), TYPE_MEMORY_BACKEND_RAM)
>
> +typedef struct HostMemoryBackendRam HostMemoryBackendRam;
> +
> +/**
> + * @HostMemoryBackendRam
> + *
> + * @parent: opaque parent object container
> + * @host_nodes: host nodes bitmap used for memory policy
> + * @policy: host memory policy
> + * @relative: if the host nodes bitmap is relative
> + */
> +struct HostMemoryBackendRam {
> +    /* private */
> +    HostMemoryBackend parent;
> +
> +    DECLARE_BITMAP(host_nodes, MAX_NODES);
> +    HostMemPolicy policy;
> +    bool relative;
> +};
> +
> +static void
> +get_host_nodes(Object *obj, Visitor *v, void *opaque, const char *name,
> +               Error **errp)
> +{
> +    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
> +    uint16List *host_nodes = NULL;
> +    uint16List **node = &host_nodes;
> +    unsigned long value;
> +
> +    value = find_first_bit(ram_backend->host_nodes, MAX_NODES);
> +    if (value == MAX_NODES) {
> +        return;
> +    }
> +
> +    *node = g_malloc0(sizeof(**node));
> +    (*node)->value = value;
> +    node = &(*node)->next;
> +
> +    do {
> +        value = find_next_bit(ram_backend->host_nodes, MAX_NODES, value + 1);
> +        if (value == MAX_NODES) {
> +            break;
> +        }
> +
> +        *node = g_malloc0(sizeof(**node));
> +        (*node)->value = value;
> +        node = &(*node)->next;
> +    } while (true);
> +
> +    visit_type_uint16List(v, &host_nodes, name, errp);
> +}
> +
> +static void
> +set_host_nodes(Object *obj, Visitor *v, void *opaque, const char *name,
> +               Error **errp)
> +{
> +    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
> +    uint16List *l = NULL;
> +
> +    visit_type_uint16List(v, &l, name, errp);
> +
> +    while (l) {
> +        bitmap_set(ram_backend->host_nodes, l->value, 1);
> +        l = l->next;
> +    }
> +}
> +
> +static const char *policies[HOST_MEM_POLICY_MAX + 1] = {
> +    [HOST_MEM_POLICY_DEFAULT] = "default",
> +    [HOST_MEM_POLICY_PREFERRED] = "preferred",
> +    [HOST_MEM_POLICY_MEMBIND] = "membind",
> +    [HOST_MEM_POLICY_INTERLEAVE] = "interleave",
> +    [HOST_MEM_POLICY_MAX] = NULL,
> +};

This is already available in qapi-types.c as HostMemPolicy_lookup.
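
For illustration, a minimal sketch of what reusing the generated table could
look like (assuming HostMemPolicy_lookup is the usual NULL-terminated array
indexed by the HOST_MEM_POLICY_* values, as QAPI generates it):

static void
get_policy(Object *obj, Visitor *v, void *opaque, const char *name,
           Error **errp)
{
    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
    int policy = ram_backend->policy;

    /* HostMemPolicy_lookup comes from the generated qapi-types.c,
     * so the local policies[] table can be dropped entirely. */
    visit_type_enum(v, &policy, HostMemPolicy_lookup, NULL, name, errp);
}

(set_policy would use the same table.)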

> +static void
> +get_policy(Object *obj, Visitor *v, void *opaque, const char *name,
> +           Error **errp)
> +{
> +    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
> +    int policy = ram_backend->policy;
> +
> +    visit_type_enum(v, &policy, policies, NULL, name, errp);
> +}
> +
> +static void
> +set_policy(Object *obj, Visitor *v, void *opaque, const char *name,
> +           Error **errp)
> +{
> +    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
> +    int policy;
> +
> +    visit_type_enum(v, &policy, policies, NULL, name, errp);
> +    ram_backend->policy = policy;

I think you need to set an error if backend->mr != NULL.
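
A minimal sketch of that check (assuming the MEMORY_BACKEND() cast macro and
error_setg()/error_propagate() are available here; mr is embedded, so the
test is on memory_region_size()):

static void
set_policy(Object *obj, Visitor *v, void *opaque, const char *name,
           Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(obj);
    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
    Error *local_err = NULL;
    int policy;

    /* refuse changes once the RAM region has been allocated */
    if (memory_region_size(&backend->mr)) {
        error_setg(errp, "cannot change policy of an initialized backend");
        return;
    }

    visit_type_enum(v, &policy, policies, NULL, name, &local_err);
    if (local_err) {
        error_propagate(errp, local_err);
        return;
    }
    ram_backend->policy = policy;
}

The same guard presumably belongs in the host-nodes and relative setters too.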

> +}
> +
> +
> +static bool get_relative(Object *obj, Error **errp)
> +{
> +    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
> +
> +    return ram_backend->relative;
> +}
> +
> +static void set_relative(Object *obj, bool value, Error **errp)
> +{
> +    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
> +
> +    ram_backend->relative = value;
> +}

Do we need relative vs. static?  Also, the default right now is static, 
while in Linux kernel this is a tri-state: relative, static, default.

I think that for now we should just omit this and only allow the default 
setting.  We can introduce an enum later without making the API
backwards-incompatible.
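
If/when the enum is added, it could simply mirror the kernel's three modes —
a hypothetical sketch, not part of this series:

/* hypothetical follow-up; the names are illustrative only */
typedef enum HostMemNodeSpec {
    HOST_MEM_NODE_SPEC_DEFAULT,    /* neither MPOL_F_* flag */
    HOST_MEM_NODE_SPEC_STATIC,     /* MPOL_F_STATIC_NODES */
    HOST_MEM_NODE_SPEC_RELATIVE,   /* MPOL_F_RELATIVE_NODES */
} HostMemNodeSpec;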

> +#include <sys/syscall.h>
> +#ifndef MPOL_F_RELATIVE_NODES
> +#define MPOL_F_RELATIVE_NODES (1 << 14)
> +#define MPOL_F_STATIC_NODES   (1 << 15)
> +#endif
>
>  static int
>  ram_backend_memory_init(HostMemoryBackend *backend, Error **errp)
>  {
> +    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(backend);
> +    int mode = ram_backend->policy;
> +    void *p;
> +    unsigned long maxnode;
> +
>      if (!memory_region_size(&backend->mr)) {
>          memory_region_init_ram(&backend->mr, OBJECT(backend),
>                                 object_get_canonical_path(OBJECT(backend)),
>                                 backend->size);
> +
> +        p = memory_region_get_ram_ptr(&backend->mr);
> +        maxnode = find_last_bit(ram_backend->host_nodes, MAX_NODES);
> +
> +        mode |= ram_backend->relative ? MPOL_F_RELATIVE_NODES :
> +            MPOL_F_STATIC_NODES;
> +        /* This is a workaround for a long standing bug in Linux'
> +         * mbind implementation, which cuts off the last specified
> +         * node. To stay compatible should this bug be fixed, we
> +         * specify one more node and zero this one out.
> +         */
> +        if (syscall(SYS_mbind, p, backend->size, mode,
> +                    ram_backend->host_nodes, maxnode + 2, 0)) {

This does not compile on non-Linux; also, does libnuma include the 
workaround?  If so, this is a hint that we should be using libnuma 
instead...

Finally, all this code should be in hostmem.c, not hostmem-ram.c, 
because the same policies can be applied to hugepage-backed memory.
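
As a rough sketch of the libnuma variant living in hostmem.c (hypothetical
helper name; this assumes the host_nodes/policy fields get lifted into
HostMemoryBackend as suggested, that the HostMemPolicy values still map
directly onto MPOL_* as in this patch, and that the build links against
-lnuma under a CONFIG_NUMA guard):

#ifdef CONFIG_NUMA
#include <numaif.h>    /* mbind() wrapper from libnuma */
#endif

static void
host_memory_backend_apply_policy(HostMemoryBackend *backend, Error **errp)
{
#ifdef CONFIG_NUMA
    void *p = memory_region_get_ram_ptr(&backend->mr);
    uint64_t sz = memory_region_size(&backend->mr);
    unsigned long maxnode;

    if (bitmap_empty(backend->host_nodes, MAX_NODES)) {
        return;    /* no host-nodes given, leave the default policy alone */
    }
    maxnode = find_last_bit(backend->host_nodes, MAX_NODES);

    /* maxnode + 2 keeps the off-by-one workaround from the patch;
     * per the test reported later in this thread, libnuma does not
     * hide it either. */
    if (mbind(p, sz, backend->policy, backend->host_nodes, maxnode + 2, 0)) {
        error_setg_errno(errp, errno, "mbind failed");
    }
#endif
}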

Currently host_memory_backend_get_memory is calling bc->memory_init. 
Probably the call should be replaced by something like

static void
host_memory_backend_alloc(HostMemoryBackend *backend, Error **errp)
{
     Error *local_err = NULL;
     bc->memory_init(backend, &local_err);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
         return;
     }

     ... set policy ...
}

...

     Error *local_err = NULL;
     host_memory_backend_alloc(backend, &local_err);
     if (local_err != NULL) {
         error_propagate(errp, local_err);
         return NULL;
     }

     assert(memory_region_size(&backend->mr) != 0);
     return &backend->mr;
}

> +            return -1;
> +        }
>      }
>
>      return 0;
>  }
>
>  static void
> +ram_backend_initfn(Object *obj)
> +{
> +    object_property_add(obj, "host-nodes", "int",
> +                        get_host_nodes,
> +                        set_host_nodes, NULL, NULL, NULL);
> +    object_property_add(obj, "policy", "string",

The convention is "str".

> +                        get_policy,
> +                        set_policy, NULL, NULL, NULL);
> +    object_property_add_bool(obj, "relative",
> +                             get_relative, set_relative, NULL);
> +}
> +
> +static void
>  ram_backend_class_init(ObjectClass *oc, void *data)
>  {
>      HostMemoryBackendClass *bc = MEMORY_BACKEND_CLASS(oc);
> @@ -38,6 +194,8 @@ static const TypeInfo ram_backend_info = {
>      .name = TYPE_MEMORY_BACKEND_RAM,
>      .parent = TYPE_MEMORY_BACKEND,
>      .class_init = ram_backend_class_init,
> +    .instance_size = sizeof(HostMemoryBackendRam),
> +    .instance_init = ram_backend_initfn,
>  };
>
>  static void register_types(void)
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index acfc0c7..a3d8c02 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -152,6 +152,8 @@ void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
>                                            const char *name,
>                                            QEMUMachineInitArgs *args);
>
> +extern QemuOptsList qemu_memdev_opts;
> +

Not needed anymore, I think.

Paolo

>  #define MAX_OPTION_ROMS 16
>  typedef struct QEMUOptionRom {
>      const char *name;
>
Igor Mammedov Feb. 19, 2014, 9:36 a.m. UTC | #2
On Wed, 19 Feb 2014 10:03:13 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

>   19/02/2014 08:54, Hu Tao ha scritto:
> > Thus makes user control how to allocate memory for ram backend.
> >
> > Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
> > ---
> >  backends/hostmem-ram.c  | 158 ++++++++++++++++++++++++++++++++++++++++++++++++
> >  include/sysemu/sysemu.h |   2 +
> >  2 files changed, 160 insertions(+)
> >
> > diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c
[...]

> >  static int
> >  ram_backend_memory_init(HostMemoryBackend *backend, Error **errp)
> >  {
> > +    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(backend);
> > +    int mode = ram_backend->policy;
> > +    void *p;
> > +    unsigned long maxnode;
> > +
> >      if (!memory_region_size(&backend->mr)) {
> >          memory_region_init_ram(&backend->mr, OBJECT(backend),
> >                                 object_get_canonical_path(OBJECT(backend)),
> >                                 backend->size);
> > +
> > +        p = memory_region_get_ram_ptr(&backend->mr);
> > +        maxnode = find_last_bit(ram_backend->host_nodes, MAX_NODES);
> > +
> > +        mode |= ram_backend->relative ? MPOL_F_RELATIVE_NODES :
> > +            MPOL_F_STATIC_NODES;
> > +        /* This is a workaround for a long standing bug in Linux'
> > +         * mbind implementation, which cuts off the last specified
> > +         * node. To stay compatible should this bug be fixed, we
> > +         * specify one more node and zero this one out.
> > +         */
> > +        if (syscall(SYS_mbind, p, backend->size, mode,
> > +                    ram_backend->host_nodes, maxnode + 2, 0)) {
> 
> This does not compile on non-Linux; also, does libnuma include the 
> workaround?  If so, this is a hint that we should be using libnuma 
> instead...
> 
> Finally, all this code should be in hostmem.c, not hostmem-ram.c, 
> because the same policies can be applied to hugepage-backed memory.
> 
> Currently host_memory_backend_get_memory is calling bc->memory_init. 
> Probably the call should be replaced by something like
I've pushed an updated version of memdev to github, where
host_memory_backend_get_memory() is just a convenience wrapper to get
access to memdev's internal MemoryRegion.

All initialization is now done in the user_creatable->complete() method,
which calls ram_backend_memory_init(), so leaving it as is should be fine.

> 
> static void
> host_memory_backend_alloc(HostMemoryBackend *backend, Error **errp)
> {
>      Error *local_err = NULL;
>      bc->memory_init(backend, &local_err);
>      if (local_err != NULL) {
>          error_propagate(errp, local_err);
>          return;
>      }
> 
>      ... set policy ...
> }
> 
> ...
> 
>      Error *local_err = NULL;
>      host_memory_backend_alloc(backend, &local_err);
>      if (local_err != NULL) {
>          error_propagate(errp, local_err);
>          return NULL;
>      }
> 
>      assert(memory_region_size(&backend->mr) != 0);
>      return &backend->mr;
> }
> 
[...]
Hu Tao Feb. 25, 2014, 10:09 a.m. UTC | #3
On Wed, Feb 19, 2014 at 10:03:13AM +0100, Paolo Bonzini wrote:

<...>

> > static int
> > ram_backend_memory_init(HostMemoryBackend *backend, Error **errp)
> > {
> >+    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(backend);
> >+    int mode = ram_backend->policy;
> >+    void *p;
> >+    unsigned long maxnode;
> >+
> >     if (!memory_region_size(&backend->mr)) {
> >         memory_region_init_ram(&backend->mr, OBJECT(backend),
> >                                object_get_canonical_path(OBJECT(backend)),
> >                                backend->size);
> >+
> >+        p = memory_region_get_ram_ptr(&backend->mr);
> >+        maxnode = find_last_bit(ram_backend->host_nodes, MAX_NODES);
> >+
> >+        mode |= ram_backend->relative ? MPOL_F_RELATIVE_NODES :
> >+            MPOL_F_STATIC_NODES;
> >+        /* This is a workaround for a long standing bug in Linux'
> >+         * mbind implementation, which cuts off the last specified
> >+         * node. To stay compatible should this bug be fixed, we
> >+         * specify one more node and zero this one out.
> >+         */
> >+        if (syscall(SYS_mbind, p, backend->size, mode,
> >+                    ram_backend->host_nodes, maxnode + 2, 0)) {
> 
> This does not compile on non-Linux; also, does libnuma include the
> workaround?  If so, this is a hint that we should be using libnuma
> instead...

Tested with libnuma and works fine without the workaround. Will use
libnuma in v19.
Hu Tao Feb. 25, 2014, 10:20 a.m. UTC | #4
On Wed, Feb 19, 2014 at 10:36:57AM +0100, Igor Mammedov wrote:
> On Wed, 19 Feb 2014 10:03:13 +0100
> Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> >   19/02/2014 08:54, Hu Tao ha scritto:
> > > Thus makes user control how to allocate memory for ram backend.
> > >
> > > Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
> > > ---
> > >  backends/hostmem-ram.c  | 158 ++++++++++++++++++++++++++++++++++++++++++++++++
> > >  include/sysemu/sysemu.h |   2 +
> > >  2 files changed, 160 insertions(+)
> > >
> > > diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c
> [...]
> 
> > >  static int
> > >  ram_backend_memory_init(HostMemoryBackend *backend, Error **errp)
> > >  {
> > > +    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(backend);
> > > +    int mode = ram_backend->policy;
> > > +    void *p;
> > > +    unsigned long maxnode;
> > > +
> > >      if (!memory_region_size(&backend->mr)) {
> > >          memory_region_init_ram(&backend->mr, OBJECT(backend),
> > >                                 object_get_canonical_path(OBJECT(backend)),
> > >                                 backend->size);
> > > +
> > > +        p = memory_region_get_ram_ptr(&backend->mr);
> > > +        maxnode = find_last_bit(ram_backend->host_nodes, MAX_NODES);
> > > +
> > > +        mode |= ram_backend->relative ? MPOL_F_RELATIVE_NODES :
> > > +            MPOL_F_STATIC_NODES;
> > > +        /* This is a workaround for a long standing bug in Linux'
> > > +         * mbind implementation, which cuts off the last specified
> > > +         * node. To stay compatible should this bug be fixed, we
> > > +         * specify one more node and zero this one out.
> > > +         */
> > > +        if (syscall(SYS_mbind, p, backend->size, mode,
> > > +                    ram_backend->host_nodes, maxnode + 2, 0)) {
> > 
> > This does not compile on non-Linux; also, does libnuma include the 
> > workaround?  If so, this is a hint that we should be using libnuma 
> > instead...
> > 
> > Finally, all this code should be in hostmem.c, not hostmem-ram.c, 
> > because the same policies can be applied to hugepage-backed memory.
> > 
> > Currently host_memory_backend_get_memory is calling bc->memory_init. 
> > Probably the call should be replaced by something like
> I've pushed to github updated version of memdev, where
> host_memory_backend_get_memory() is just convenience wrapper to get
> access to memdev's internal MemoryRegion.
> 
> All initialization now is done in user_creatable->complete() method
> which calls ram_backend_memory_init() so leaving it as is should be fine.

There is a problem: user_creatable_complete() is called before the
object is added to "/objects" (see object_create()), which triggers an
assertion failure when calling object_get_canonical_path() in
ram_backend_memory_init(). Any ideas?
Paolo Bonzini Feb. 25, 2014, 2:15 p.m. UTC | #5
On 25/02/2014 11:20, Hu Tao wrote:
> There is a problem that user_creatable_complete() is called before
> adding object to "/objects" (see object_create()), which triggers a
> assert failure when calling object_get_canonical_path() in
> ram_backend_memory_init(). Any ideas?

You can call object_property_add_child before calling 
user_creatable_complete, and then call object_unparent (in addition to 
object_unref) if creation fails.
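
Roughly (a sketch of how object_create() could be reordered; the helper name
is made up and the QOM signatures are the current ones, so they may differ):

static void object_create_one(const char *type, const char *id, Error **errp)
{
    Error *local_err = NULL;
    Object *obj = object_new(type);

    /* parent the object first, so object_get_canonical_path() works
     * inside the complete() callback */
    object_property_add_child(container_get(object_get_root(), "/objects"),
                              id, obj, &local_err);
    if (local_err == NULL) {
        user_creatable_complete(obj, &local_err);
        if (local_err != NULL) {
            object_unparent(obj);    /* undo the parenting on failure */
        }
    }
    object_unref(obj);               /* the child property keeps its own ref */
    error_propagate(errp, local_err);
}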

Paolo
Hu Tao Feb. 26, 2014, 5:57 a.m. UTC | #6
On Wed, Feb 19, 2014 at 10:36:57AM +0100, Igor Mammedov wrote:
> On Wed, 19 Feb 2014 10:03:13 +0100
> Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
> >   19/02/2014 08:54, Hu Tao ha scritto:
> > > Thus makes user control how to allocate memory for ram backend.
> > >
> > > Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
> > > ---
> > >  backends/hostmem-ram.c  | 158 ++++++++++++++++++++++++++++++++++++++++++++++++
> > >  include/sysemu/sysemu.h |   2 +
> > >  2 files changed, 160 insertions(+)
> > >
> > > diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c
> [...]
> 
> > >  static int
> > >  ram_backend_memory_init(HostMemoryBackend *backend, Error **errp)
> > >  {
> > > +    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(backend);
> > > +    int mode = ram_backend->policy;
> > > +    void *p;
> > > +    unsigned long maxnode;
> > > +
> > >      if (!memory_region_size(&backend->mr)) {
> > >          memory_region_init_ram(&backend->mr, OBJECT(backend),
> > >                                 object_get_canonical_path(OBJECT(backend)),
> > >                                 backend->size);
> > > +
> > > +        p = memory_region_get_ram_ptr(&backend->mr);
> > > +        maxnode = find_last_bit(ram_backend->host_nodes, MAX_NODES);
> > > +
> > > +        mode |= ram_backend->relative ? MPOL_F_RELATIVE_NODES :
> > > +            MPOL_F_STATIC_NODES;
> > > +        /* This is a workaround for a long standing bug in Linux'
> > > +         * mbind implementation, which cuts off the last specified
> > > +         * node. To stay compatible should this bug be fixed, we
> > > +         * specify one more node and zero this one out.
> > > +         */
> > > +        if (syscall(SYS_mbind, p, backend->size, mode,
> > > +                    ram_backend->host_nodes, maxnode + 2, 0)) {
> > 
> > This does not compile on non-Linux; also, does libnuma include the 
> > workaround?  If so, this is a hint that we should be using libnuma 
> > instead...
> > 
> > Finally, all this code should be in hostmem.c, not hostmem-ram.c, 
> > because the same policies can be applied to hugepage-backed memory.
> > 
> > Currently host_memory_backend_get_memory is calling bc->memory_init. 
> > Probably the call should be replaced by something like
> I've pushed to github updated version of memdev, where
> host_memory_backend_get_memory() is just convenience wrapper to get
> access to memdev's internal MemoryRegion.
> 
> All initialization now is done in user_creatable->complete() method
> which calls ram_backend_memory_init() so leaving it as is should be fine.

If the memory policy lines are moved up to hostmem.c, the only thing
left in ram_backend_memory_init() is calling memory_region_init_ram() to
allocate memory. Then the question becomes: when should the memory
policies be applied? Choices:

1. apply memory policies in hostmem.c, since this is the place the user sets
   memory policies. But user_creatable_complete() seems not to support
   this. (but correct me if I'm wrong)

2. cast to HostMemoryBackend in ram_backend_memory_init() (or in other
   memory backends) and add lines to apply memory policies.

3. provide an interface in HostMemoryBackendClass to do this and
   call it in subclasses. (this is basically the same as 2, except that
   we can reuse code)

Opinions?
Paolo Bonzini Feb. 26, 2014, 9:05 a.m. UTC | #7
On 26/02/2014 06:57, Hu Tao wrote:
> If lines about memory polices are moved up to hostmem.c, the only thing
> left in ram_backend_memory_init() is calling memory_region_init_ram() to
> allocate memory. Then it comes a problem that when to apply memory
> polices? Choices:
>
> 1. apply memory polices in hostmem.c since this is the place user sets
>    memory polices. But user_creatable_complete() seems not to support
>    this.( but fix me)
>
> 2. cast to HostMemoryBackend in ram_backend_memory_init() (or in other
>    memory backends) and add lines to apply memory polices.
>
> 3. provide an interface in HostMemoryBackendClass to do the thing and
>    call it in subclasses. (this is basically the same as 2 except that
>    we can reuse code)

I like (3).  I understand it's something like

void memory_backend_apply_mempolicy(HostMemoryBackend *be,
                                     void *addr, size_t len, Error **err)

?
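
i.e. the ram backend's init would shrink to roughly (a sketch, using the
hypothetical hook above):

static int
ram_backend_memory_init(HostMemoryBackend *backend, Error **errp)
{
    if (!memory_region_size(&backend->mr)) {
        memory_region_init_ram(&backend->mr, OBJECT(backend),
                               object_get_canonical_path(OBJECT(backend)),
                               backend->size);
        memory_backend_apply_mempolicy(backend,
                                       memory_region_get_ram_ptr(&backend->mr),
                                       backend->size, errp);
    }

    return 0;
}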

Paolo
Igor Mammedov Feb. 26, 2014, 9:10 a.m. UTC | #8
On Wed, 26 Feb 2014 13:57:06 +0800
Hu Tao <hutao@cn.fujitsu.com> wrote:

> On Wed, Feb 19, 2014 at 10:36:57AM +0100, Igor Mammedov wrote:
> > On Wed, 19 Feb 2014 10:03:13 +0100
> > Paolo Bonzini <pbonzini@redhat.com> wrote:
> > 
> > >   19/02/2014 08:54, Hu Tao ha scritto:
> > > > Thus makes user control how to allocate memory for ram backend.
> > > >
> > > > Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
> > > > ---
> > > >  backends/hostmem-ram.c  | 158 ++++++++++++++++++++++++++++++++++++++++++++++++
> > > >  include/sysemu/sysemu.h |   2 +
> > > >  2 files changed, 160 insertions(+)
> > > >
> > > > diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c
> > [...]
> > 
> > > >  static int
> > > >  ram_backend_memory_init(HostMemoryBackend *backend, Error **errp)
> > > >  {
> > > > +    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(backend);
> > > > +    int mode = ram_backend->policy;
> > > > +    void *p;
> > > > +    unsigned long maxnode;
> > > > +
> > > >      if (!memory_region_size(&backend->mr)) {
> > > >          memory_region_init_ram(&backend->mr, OBJECT(backend),
> > > >                                 object_get_canonical_path(OBJECT(backend)),
> > > >                                 backend->size);
> > > > +
> > > > +        p = memory_region_get_ram_ptr(&backend->mr);
> > > > +        maxnode = find_last_bit(ram_backend->host_nodes, MAX_NODES);
> > > > +
> > > > +        mode |= ram_backend->relative ? MPOL_F_RELATIVE_NODES :
> > > > +            MPOL_F_STATIC_NODES;
> > > > +        /* This is a workaround for a long standing bug in Linux'
> > > > +         * mbind implementation, which cuts off the last specified
> > > > +         * node. To stay compatible should this bug be fixed, we
> > > > +         * specify one more node and zero this one out.
> > > > +         */
> > > > +        if (syscall(SYS_mbind, p, backend->size, mode,
> > > > +                    ram_backend->host_nodes, maxnode + 2, 0)) {
> > > 
> > > This does not compile on non-Linux; also, does libnuma include the 
> > > workaround?  If so, this is a hint that we should be using libnuma 
> > > instead...
> > > 
> > > Finally, all this code should be in hostmem.c, not hostmem-ram.c, 
> > > because the same policies can be applied to hugepage-backed memory.
> > > 
> > > Currently host_memory_backend_get_memory is calling bc->memory_init. 
> > > Probably the call should be replaced by something like
> > I've pushed to github updated version of memdev, where
> > host_memory_backend_get_memory() is just convenience wrapper to get
> > access to memdev's internal MemoryRegion.
> > 
> > All initialization now is done in user_creatable->complete() method
> > which calls ram_backend_memory_init() so leaving it as is should be fine.
> 
> If lines about memory polices are moved up to hostmem.c, the only thing
> left in ram_backend_memory_init() is calling memory_region_init_ram() to
> allocate memory. Then it comes a problem that when to apply memory
> polices? Choices:
> 
> 1. apply memory polices in hostmem.c since this is the place user sets
>    memory polices. But user_creatable_complete() seems not to support
>    this.( but fix me)
if we assume that NUMA policies apply to every hostmem-derived backend,
then we could use the realize() approach used by DEVICE, i.e.
set NUMA policies in hostmem.c:hostmemory_backend_memory_init().

Add a parent_complete field to the ram-backend class and store the parent's
complete pointer there. Then we can do:

ram_backend_memory_init(UserCreatable *uc, Error **errp) {
    memory_region_init_ram();
    ...
    MEMORY_BACKEND_RAM_CLASS(uc)->parent_complete(uc, errp);
    ...
}
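
Filled out a bit as a sketch (MEMORY_BACKEND() and the
MEMORY_BACKEND_RAM_CLASS()/MEMORY_BACKEND_RAM_GET_CLASS() cast macros are
assumed, the TypeInfo would need .class_size = sizeof(HostMemoryBackendRamClass),
and this follows the usual QOM "save and chain up" pattern, assuming
hostmem.c's own complete() applies the NUMA policy):

typedef struct HostMemoryBackendRamClass {
    HostMemoryBackendClass parent_class;

    void (*parent_complete)(UserCreatable *uc, Error **errp);
} HostMemoryBackendRamClass;

static void
ram_backend_memory_init(UserCreatable *uc, Error **errp)
{
    HostMemoryBackend *backend = MEMORY_BACKEND(uc);

    if (!memory_region_size(&backend->mr)) {
        memory_region_init_ram(&backend->mr, OBJECT(backend),
                               object_get_canonical_path(OBJECT(backend)),
                               backend->size);
    }
    /* chain up: the parent's complete() applies the NUMA policy */
    MEMORY_BACKEND_RAM_GET_CLASS(uc)->parent_complete(uc, errp);
}

static void
ram_backend_class_init(ObjectClass *oc, void *data)
{
    UserCreatableClass *ucc = USER_CREATABLE_CLASS(oc);
    HostMemoryBackendRamClass *rbc = MEMORY_BACKEND_RAM_CLASS(oc);

    /* save the inherited complete() before overriding it */
    rbc->parent_complete = ucc->complete;
    ucc->complete = ram_backend_memory_init;
}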

> 
> 2. cast to HostMemoryBackend in ram_backend_memory_init() (or in other
>    memory backends) and add lines to apply memory polices.
> 
> 3. provide an interface in HostMemoryBackendClass to do the thing and
>    call it in subclasses. (this is basically the same as 2 except that
>    we can reuse code)
> 
> Opinions?
> 
>
Paolo Bonzini Feb. 26, 2014, 10:33 a.m. UTC | #9
On 26/02/2014 10:10, Igor Mammedov wrote:
> if we assume that NUMA policies apply to every hostmem derived backend,
> then we could realize() approach used by DEVICE. i.e.
> set NUMA policies in hostmem.c:hostmemory_backend_memory_init()
>
> Add parent_complete field to ram-backend class and store there parent's
> complete pointer. Then we can do:
>
> ram_backend_memory_init(UserCreatable *uc, Error **errp) {
>     memory_region_init_ram();
>     ...
>     MEMORY_BACKEND_RAM_CLASS(uc)->parent_complete(uc, errp);
>     ...
> }
>

The problem is that some backends might not be handled the same way. 
For example, not all backends might produce a single void*/size_t pair 
for the entire region.  Think of a "composite" backend that produces a 
large memory region from two smaller ones.

Paolo
Igor Mammedov Feb. 26, 2014, 12:31 p.m. UTC | #10
On Wed, 26 Feb 2014 11:33:30 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> Il 26/02/2014 10:10, Igor Mammedov ha scritto:
> > if we assume that NUMA policies apply to every hostmem derived backend,
> > then we could realize() approach used by DEVICE. i.e.
> > set NUMA policies in hostmem.c:hostmemory_backend_memory_init()
> >
> > Add parent_complete field to ram-backend class and store there parent's
> > complete pointer. Then we can do:
> >
> > ram_backend_memory_init(UserCreatable *uc, Error **errp) {
> >     memory_region_init_ram();
> >     ...
> >     MEMORY_BACKEND_RAM_CLASS(uc)->parent_complete(uc, errp);
> >     ...
> > }
> >
> 
> The problem is that some backends might not be handled the same way. 
> For example, not all backends might produce a single void*/size_t pair 
> for the entire region.  Think of a "composite" backend that produces a 
> large memory region from two smaller ones.

I'd prefer to keep backends simple, with 1:1 mapping to memory regions.
Is there a need for a composite one or something similar?

> 
> Paolo
Paolo Bonzini Feb. 26, 2014, 12:45 p.m. UTC | #11
On 26/02/2014 13:31, Igor Mammedov wrote:
>> > The problem is that some backends might not be handled the same way.
>> > For example, not all backends might produce a single void*/size_t pair
>> > for the entire region.  Think of a "composite" backend that produces a
>> > large memory region from two smaller ones.
> I'd prefer to keep backends simple, with 1:1 mapping to memory regions.

I agree.  However not all backends may have a mapping to a RAM memory 
region.  A composite backend could create a container memory region 
whose children are other HostMemoryBackend objects.

> Is there a need in composite one or something similar?

I've heard of users that want a node backed partially by hugetlbfs and 
partially by regular RAM.  Not sure why.

Paolo
Marcelo Tosatti Feb. 26, 2014, 12:58 p.m. UTC | #12
On Wed, Feb 26, 2014 at 01:45:38PM +0100, Paolo Bonzini wrote:
> Il 26/02/2014 13:31, Igor Mammedov ha scritto:
> >>> The problem is that some backends might not be handled the same way.
> >>> For example, not all backends might produce a single void*/size_t pair
> >>> for the entire region.  Think of a "composite" backend that produces a
> >>> large memory region from two smaller ones.
> >I'd prefer to keep backends simple, with 1:1 mapping to memory regions.
> 
> I agree.  However not all backends may have a mapping to a RAM
> memory region.  A composite backend could create a container memory
> region whose children are other HostMemoryBackend objects.
> 
> >Is there a need in composite one or something similar?
> 
> I've heard of users that want a node backed partially by hugetlbfs
> and partially by regular RAM.  Not sure why.
> 
> Paolo

To overcommit the non hugetlbfs backed guest RAM (think guest pagecache on that non
hugetlbfs backed memory, swappable and KSM-able).

The problem is, you have to somehow guarantee that the guest allocates
1GB pages out of the hugetlb-backed GPA ranges. Some thoughts
(honestly, I dislike all of them):

1) Boot guest with hugepages, allocate hugepages in guest,
later on hotplug 4K backed ranges. HV-unaware reboot might fail,
though.

2) Communicate hugepage GPAs to guest.

3) Create holes in non hugepage backed GPA range.
Paolo Bonzini Feb. 26, 2014, 1:14 p.m. UTC | #13
On 26/02/2014 13:58, Marcelo Tosatti wrote:
>>> I'd prefer to keep backends simple, with 1:1 mapping to memory regions.
>>
>> I agree.  However not all backends may have a mapping to a RAM
>> memory region.  A composite backend could create a container memory
>> region whose children are other HostMemoryBackend objects.
>>
>>> Is there a need in composite one or something similar?
>>
>> I've heard of users that want a node backed partially by hugetlbfs
>> and partially by regular RAM.  Not sure why.
>
> To overcommit the non hugetlbfs backed guest RAM (think guest pagecache on that non
> hugetlbfs backed memory, swappable and KSM-able).
>
> The problem is, you have to in someway guarantee the guest allocates
> 1GB pages out of the hugetlb backed GPA ranges. Some thoughts
> (honestly, dislike all of them):
>
> 1) Boot guest with hugepages, allocate hugepages in guest,
> later on hotplug 4K backed ranges. HV-unaware reboot might fail,
> though.
>
> 2) Communicate hugepage GPAs to guest.
>
> 3) Create holes in non hugepage backed GPA range.

I guess (2) is the only one I "like", and I like it just because it 
officially becomes Not Our Problem.

Paolo
Igor Mammedov Feb. 26, 2014, 1:43 p.m. UTC | #14
On Wed, 26 Feb 2014 13:45:38 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> Il 26/02/2014 13:31, Igor Mammedov ha scritto:
> >> > The problem is that some backends might not be handled the same way.
> >> > For example, not all backends might produce a single void*/size_t pair
> >> > for the entire region.  Think of a "composite" backend that produces a
> >> > large memory region from two smaller ones.
> > I'd prefer to keep backends simple, with 1:1 mapping to memory regions.
> 
> I agree.  However not all backends may have a mapping to a RAM memory 
> region.  A composite backend could create a container memory region 
> whose children are other HostMemoryBackend objects.
> 
> > Is there a need in composite one or something similar?
> 
> I've heard of users that want a node backed partially by hugetlbfs and 
> partially by regular RAM.  Not sure why.
Isn't the issue here how the backend is mapped into GPA space? Well, that is
not the backend's job.

Once one starts to encode layout (alignment, noncontiguously mapped
memory regions inside a container, ...), the HPA->GPA mapping gets complicated.

It would be better to use simple building blocks and model this as
2 separate backends (ram + hugetlbfs) and 2 corresponding DIMM devices.


> 
> Paolo
>
Paolo Bonzini Feb. 26, 2014, 1:47 p.m. UTC | #15
On 26/02/2014 14:43, Igor Mammedov wrote:
> On Wed, 26 Feb 2014 13:45:38 +0100
> Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>> Il 26/02/2014 13:31, Igor Mammedov ha scritto:
>>>>> The problem is that some backends might not be handled the same way.
>>>>> For example, not all backends might produce a single void*/size_t pair
>>>>> for the entire region.  Think of a "composite" backend that produces a
>>>>> large memory region from two smaller ones.
>>> I'd prefer to keep backends simple, with 1:1 mapping to memory regions.
>>
>> I agree.  However not all backends may have a mapping to a RAM memory
>> region.  A composite backend could create a container memory region
>> whose children are other HostMemoryBackend objects.
>>
>>> Is there a need in composite one or something similar?
>>
>> I've heard of users that want a node backed partially by hugetlbfs and
>> partially by regular RAM.  Not sure why.
> Isn't issue here in how backend is mapped into GPA? Well that is not
> backend's job.
>
> Once one starts to put layout (alignment, noncontinuously mapped
> memory regions inside of container, ...), mapping HPA->GPA gets complicated.
>
> It would be better to use simple building blocks and model as:
> 2 separate backends (ram + hugetlbfs) and 2 corresponding DIMM devices.

Right, I had forgotten that you can have cold-plugged DIMM devices. 
That's a nice solution, also because it simplifies passing the GPA 
configuration down to the guest.

How would that translate to sharing HostMemoryBackend code for memory 
policies?  Which of Hu Tao's proposals do you like best?

Paolo
Igor Mammedov Feb. 26, 2014, 2:25 p.m. UTC | #16
On Wed, 26 Feb 2014 14:47:28 +0100
Paolo Bonzini <pbonzini@redhat.com> wrote:

> Il 26/02/2014 14:43, Igor Mammedov ha scritto:
> > On Wed, 26 Feb 2014 13:45:38 +0100
> > Paolo Bonzini <pbonzini@redhat.com> wrote:
> >
> >> Il 26/02/2014 13:31, Igor Mammedov ha scritto:
> >>>>> The problem is that some backends might not be handled the same way.
> >>>>> For example, not all backends might produce a single void*/size_t pair
> >>>>> for the entire region.  Think of a "composite" backend that produces a
> >>>>> large memory region from two smaller ones.
> >>> I'd prefer to keep backends simple, with 1:1 mapping to memory regions.
> >>
> >> I agree.  However not all backends may have a mapping to a RAM memory
> >> region.  A composite backend could create a container memory region
> >> whose children are other HostMemoryBackend objects.
> >>
> >>> Is there a need in composite one or something similar?
> >>
> >> I've heard of users that want a node backed partially by hugetlbfs and
> >> partially by regular RAM.  Not sure why.
> > Isn't issue here in how backend is mapped into GPA? Well that is not
> > backend's job.
> >
> > Once one starts to put layout (alignment, noncontinuously mapped
> > memory regions inside of container, ...), mapping HPA->GPA gets complicated.
> >
> > It would be better to use simple building blocks and model as:
> > 2 separate backends (ram + hugetlbfs) and 2 corresponding DIMM devices.
> 
> Right, I had forgotten that you can have cold-plugged DIMM devices. 
> That's a nice solution, also because it simplifies passing the GPA 
> configuration down to the guest.
> 
> How would that translate to sharing HostMemoryBackend code for memory 
> policies?  Which of Hu Tao's proposals do you like best?
possible choices could be:

 1: 'realize' approach I suggested
      drawback is: assumption that all backends derived from HostMemoryBackend
                   will inherit NUMA controls even if backend shouldn't have
                   one (for example: fictional remote memory backend)
      plus: derived types from HostMemoryBackend, don't need to know anything
            about NUMA.
 2: #3 from Hu Tao's suggestion
      drawback is: every new backend has to explicitly call NUMA callbacks
      a mild plus is that non-NUMA-aware backends could ignore the NUMA callbacks,
      but they would still have NUMA properties available, which is confusing.

 3: a possibly over-engineered take on #1 from above: build a proper class hierarchy:
      HostMemoryBackend  -> NumaMemoryBackend -> RamBackend
                        |                     -> HugepageBackend
                        |-> whatever else

> 
> Paolo
Paolo Bonzini Feb. 26, 2014, 2:39 p.m. UTC | #17
On 26/02/2014 15:25, Igor Mammedov wrote:
>  1: 'realize' approach I suggested
>       drawback is: assumption that all backends derived from HostMemoryBackend
>                    will inherit NUMA controls even if backend shouldn't have
>                    one (for example: fictional remote memory backend)
>       plus: derived types from HostMemoryBackend, don't need to know anything
>             about NUMA.

Let's go with this for now.  It keeps things simple.

Paolo
Hu Tao March 3, 2014, 3:24 a.m. UTC | #18
On Tue, Feb 25, 2014 at 06:09:20PM +0800, Hu Tao wrote:
> On Wed, Feb 19, 2014 at 10:03:13AM +0100, Paolo Bonzini wrote:
> 
> <...>
> 
> > > static int
> > > ram_backend_memory_init(HostMemoryBackend *backend, Error **errp)
> > > {
> > >+    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(backend);
> > >+    int mode = ram_backend->policy;
> > >+    void *p;
> > >+    unsigned long maxnode;
> > >+
> > >     if (!memory_region_size(&backend->mr)) {
> > >         memory_region_init_ram(&backend->mr, OBJECT(backend),
> > >                                object_get_canonical_path(OBJECT(backend)),
> > >                                backend->size);
> > >+
> > >+        p = memory_region_get_ram_ptr(&backend->mr);
> > >+        maxnode = find_last_bit(ram_backend->host_nodes, MAX_NODES);
> > >+
> > >+        mode |= ram_backend->relative ? MPOL_F_RELATIVE_NODES :
> > >+            MPOL_F_STATIC_NODES;
> > >+        /* This is a workaround for a long standing bug in Linux'
> > >+         * mbind implementation, which cuts off the last specified
> > >+         * node. To stay compatible should this bug be fixed, we
> > >+         * specify one more node and zero this one out.
> > >+         */
> > >+        if (syscall(SYS_mbind, p, backend->size, mode,
> > >+                    ram_backend->host_nodes, maxnode + 2, 0)) {
> > 
> > This does not compile on non-Linux; also, does libnuma include the
> > workaround?  If so, this is a hint that we should be using libnuma
> > instead...
> 
> Tested with libnuma and works fine without the workaround. Will use
> libnuma in v19.

Sorry, I may have done something wrong in that test. With libnuma the
workaround is still needed. I checked numactl-2.0.9; it doesn't include
the workaround.

Patch

diff --git a/backends/hostmem-ram.c b/backends/hostmem-ram.c
index a496dbd..2da9341 100644
--- a/backends/hostmem-ram.c
+++ b/backends/hostmem-ram.c
@@ -10,23 +10,179 @@ 
  * See the COPYING file in the top-level directory.
  */
 #include "sysemu/hostmem.h"
+#include "sysemu/sysemu.h"
+#include "qemu/bitmap.h"
+#include "qapi-visit.h"
+#include "qemu/config-file.h"
+#include "qapi/opts-visitor.h"
 
 #define TYPE_MEMORY_BACKEND_RAM "memory-ram"
+#define MEMORY_BACKEND_RAM(obj) \
+    OBJECT_CHECK(HostMemoryBackendRam, (obj), TYPE_MEMORY_BACKEND_RAM)
 
+typedef struct HostMemoryBackendRam HostMemoryBackendRam;
+
+/**
+ * @HostMemoryBackendRam
+ *
+ * @parent: opaque parent object container
+ * @host_nodes: host nodes bitmap used for memory policy
+ * @policy: host memory policy
+ * @relative: if the host nodes bitmap is relative
+ */
+struct HostMemoryBackendRam {
+    /* private */
+    HostMemoryBackend parent;
+
+    DECLARE_BITMAP(host_nodes, MAX_NODES);
+    HostMemPolicy policy;
+    bool relative;
+};
+
+static void
+get_host_nodes(Object *obj, Visitor *v, void *opaque, const char *name,
+               Error **errp)
+{
+    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
+    uint16List *host_nodes = NULL;
+    uint16List **node = &host_nodes;
+    unsigned long value;
+
+    value = find_first_bit(ram_backend->host_nodes, MAX_NODES);
+    if (value == MAX_NODES) {
+        return;
+    }
+
+    *node = g_malloc0(sizeof(**node));
+    (*node)->value = value;
+    node = &(*node)->next;
+
+    do {
+        value = find_next_bit(ram_backend->host_nodes, MAX_NODES, value + 1);
+        if (value == MAX_NODES) {
+            break;
+        }
+
+        *node = g_malloc0(sizeof(**node));
+        (*node)->value = value;
+        node = &(*node)->next;
+    } while (true);
+
+    visit_type_uint16List(v, &host_nodes, name, errp);
+}
+
+static void
+set_host_nodes(Object *obj, Visitor *v, void *opaque, const char *name,
+               Error **errp)
+{
+    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
+    uint16List *l = NULL;
+
+    visit_type_uint16List(v, &l, name, errp);
+
+    while (l) {
+        bitmap_set(ram_backend->host_nodes, l->value, 1);
+        l = l->next;
+    }
+}
+
+static const char *policies[HOST_MEM_POLICY_MAX + 1] = {
+    [HOST_MEM_POLICY_DEFAULT] = "default",
+    [HOST_MEM_POLICY_PREFERRED] = "preferred",
+    [HOST_MEM_POLICY_MEMBIND] = "membind",
+    [HOST_MEM_POLICY_INTERLEAVE] = "interleave",
+    [HOST_MEM_POLICY_MAX] = NULL,
+};
+
+static void
+get_policy(Object *obj, Visitor *v, void *opaque, const char *name,
+           Error **errp)
+{
+    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
+    int policy = ram_backend->policy;
+
+    visit_type_enum(v, &policy, policies, NULL, name, errp);
+}
+
+static void
+set_policy(Object *obj, Visitor *v, void *opaque, const char *name,
+           Error **errp)
+{
+    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
+    int policy;
+
+    visit_type_enum(v, &policy, policies, NULL, name, errp);
+    ram_backend->policy = policy;
+}
+
+
+static bool get_relative(Object *obj, Error **errp)
+{
+    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
+
+    return ram_backend->relative;
+}
+
+static void set_relative(Object *obj, bool value, Error **errp)
+{
+    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(obj);
+
+    ram_backend->relative = value;
+}
+
+#include <sys/syscall.h>
+#ifndef MPOL_F_RELATIVE_NODES
+#define MPOL_F_RELATIVE_NODES (1 << 14)
+#define MPOL_F_STATIC_NODES   (1 << 15)
+#endif
 
 static int
 ram_backend_memory_init(HostMemoryBackend *backend, Error **errp)
 {
+    HostMemoryBackendRam *ram_backend = MEMORY_BACKEND_RAM(backend);
+    int mode = ram_backend->policy;
+    void *p;
+    unsigned long maxnode;
+
     if (!memory_region_size(&backend->mr)) {
         memory_region_init_ram(&backend->mr, OBJECT(backend),
                                object_get_canonical_path(OBJECT(backend)),
                                backend->size);
+
+        p = memory_region_get_ram_ptr(&backend->mr);
+        maxnode = find_last_bit(ram_backend->host_nodes, MAX_NODES);
+
+        mode |= ram_backend->relative ? MPOL_F_RELATIVE_NODES :
+            MPOL_F_STATIC_NODES;
+
+        /* This is a workaround for a long standing bug in Linux'
+         * mbind implementation, which cuts off the last specified
+         * node. To stay compatible should this bug be fixed, we
+         * specify one more node and zero this one out.
+         */
+        if (syscall(SYS_mbind, p, backend->size, mode,
+                    ram_backend->host_nodes, maxnode + 2, 0)) {
+            return -1;
+        }
     }
 
     return 0;
 }
 
 static void
+ram_backend_initfn(Object *obj)
+{
+    object_property_add(obj, "host-nodes", "int",
+                        get_host_nodes,
+                        set_host_nodes, NULL, NULL, NULL);
+    object_property_add(obj, "policy", "string",
+                        get_policy,
+                        set_policy, NULL, NULL, NULL);
+    object_property_add_bool(obj, "relative",
+                             get_relative, set_relative, NULL);
+}
+
+static void
 ram_backend_class_init(ObjectClass *oc, void *data)
 {
     HostMemoryBackendClass *bc = MEMORY_BACKEND_CLASS(oc);
@@ -38,6 +194,8 @@  static const TypeInfo ram_backend_info = {
     .name = TYPE_MEMORY_BACKEND_RAM,
     .parent = TYPE_MEMORY_BACKEND,
     .class_init = ram_backend_class_init,
+    .instance_size = sizeof(HostMemoryBackendRam),
+    .instance_init = ram_backend_initfn,
 };
 
 static void register_types(void)
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index acfc0c7..a3d8c02 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -152,6 +152,8 @@  void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
                                           const char *name,
                                           QEMUMachineInitArgs *args);
 
+extern QemuOptsList qemu_memdev_opts;
+
 #define MAX_OPTION_ROMS 16
 typedef struct QEMUOptionRom {
     const char *name;