diff mbox series

[v3,1/3] hostmem-file: add "align" option

Message ID 20171127043517.22441-2-haozhong.zhang@intel.com
State New
Headers show
Series nvdimm: fixes for (non-)dax backends | expand

Commit Message

Haozhong Zhang Nov. 27, 2017, 4:35 a.m. UTC
When mmap(2) the backend files, QEMU uses the host page size
(getpagesize(2)) by default as the alignment of mapping address.
However, some backends may require alignments different than the page
size. For example, mmap a device DAX (e.g., /dev/dax0.0) on Linux
kernel 4.13 to an address, which is 4K-aligned but not 2M-aligned,
fails with a kernel message like

[617494.969768] dax dax0.0: qemu-system-x86: dax_mmap: fail, unaligned vma (0x7fa37c579000 - 0x7fa43c579000, 0x1fffff)

Because there is no common approach to get such alignment requirement,
we add the 'align' option to 'memory-backend-file', so that users or
management utils, which have enough knowledge about the backend, can
specify a proper alignment via this option.

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
---
 backends/hostmem-file.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 docs/nvdimm.txt         | 16 ++++++++++++++++
 exec.c                  |  8 +++++++-
 include/exec/memory.h   |  3 +++
 memory.c                |  2 ++
 numa.c                  |  2 +-
 6 files changed, 69 insertions(+), 3 deletions(-)

Comments

Eduardo Habkost Nov. 28, 2017, 1:07 a.m. UTC | #1
On Mon, Nov 27, 2017 at 12:35:15PM +0800, Haozhong Zhang wrote:
> When mmap(2) the backend files, QEMU uses the host page size
> (getpagesize(2)) by default as the alignment of mapping address.
> However, some backends may require alignments different than the page
> size. For example, mmap a device DAX (e.g., /dev/dax0.0) on Linux
> kernel 4.13 to an address, which is 4K-aligned but not 2M-aligned,
> fails with a kernel message like
> 
> [617494.969768] dax dax0.0: qemu-system-x86: dax_mmap: fail, unaligned vma (0x7fa37c579000 - 0x7fa43c579000, 0x1fffff)
> 
> Because there is no common approach to get such alignment requirement,
> we add the 'align' option to 'memory-backend-file', so that users or
> management utils, which have enough knowledge about the backend, can
> specify a proper alignment via this option.
> 
> Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>

The new option needs to be documented on qemu-options.hx.

Except for that, the patch looks good to me.


> ---
>  backends/hostmem-file.c | 41 ++++++++++++++++++++++++++++++++++++++++-
>  docs/nvdimm.txt         | 16 ++++++++++++++++
>  exec.c                  |  8 +++++++-
>  include/exec/memory.h   |  3 +++
>  memory.c                |  2 ++
>  numa.c                  |  2 +-
>  6 files changed, 69 insertions(+), 3 deletions(-)
> 
> diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
> index e44c319915..e319ec1ad8 100644
> --- a/backends/hostmem-file.c
> +++ b/backends/hostmem-file.c
> @@ -34,6 +34,7 @@ struct HostMemoryBackendFile {
>      bool share;
>      bool discard_data;
>      char *mem_path;
> +    uint64_t align;
>  };
>  
>  static void
> @@ -58,7 +59,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
>          path = object_get_canonical_path(OBJECT(backend));
>          memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
>                                   path,
> -                                 backend->size, fb->share,
> +                                 backend->size, fb->align, fb->share,
>                                   fb->mem_path, errp);
>          g_free(path);
>      }
> @@ -115,6 +116,40 @@ static void file_memory_backend_set_discard_data(Object *o, bool value,
>      MEMORY_BACKEND_FILE(o)->discard_data = value;
>  }
>  
> +static void file_memory_backend_get_align(Object *o, Visitor *v,
> +                                          const char *name, void *opaque,
> +                                          Error **errp)
> +{
> +    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
> +    uint64_t val = fb->align;
> +
> +    visit_type_size(v, name, &val, errp);
> +}
> +
> +static void file_memory_backend_set_align(Object *o, Visitor *v,
> +                                          const char *name, void *opaque,
> +                                          Error **errp)
> +{
> +    HostMemoryBackend *backend = MEMORY_BACKEND(o);
> +    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
> +    Error *local_err = NULL;
> +    uint64_t val;
> +
> +    if (host_memory_backend_mr_inited(backend)) {
> +        error_setg(&local_err, "cannot change property value");
> +        goto out;
> +    }
> +
> +    visit_type_size(v, name, &val, &local_err);
> +    if (local_err) {
> +        goto out;
> +    }
> +    fb->align = val;
> +
> + out:
> +    error_propagate(errp, local_err);
> +}
> +
>  static void file_backend_unparent(Object *obj)
>  {
>      HostMemoryBackend *backend = MEMORY_BACKEND(obj);
> @@ -145,6 +180,10 @@ file_backend_class_init(ObjectClass *oc, void *data)
>      object_class_property_add_str(oc, "mem-path",
>          get_mem_path, set_mem_path,
>          &error_abort);
> +    object_class_property_add(oc, "align", "int",
> +        file_memory_backend_get_align,
> +        file_memory_backend_set_align,
> +        NULL, NULL, &error_abort);
>  }
>  
>  static void file_backend_instance_finalize(Object *o)
> diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt
> index 2d9f8c0e8c..21249dd062 100644
> --- a/docs/nvdimm.txt
> +++ b/docs/nvdimm.txt
> @@ -122,3 +122,19 @@ Note:
>       M >= size of RAM devices +
>            size of statically plugged vNVDIMM devices +
>            size of hotplugged vNVDIMM devices
> +
> +Alignment
> +---------
> +
> +QEMU uses mmap(2) to maps vNVDIMM backends and aligns the mapping
> +address to the page size (getpagesize(2)) by default. However, some
> +types of backends may require an alignment different than the page
> +size. In that case, QEMU v2.12.0 and later provide 'align' option to
> +memory-backend-file to allow users to specify the proper alignment.
> +
> +For example, device dax require the 2 MB alignment, so we can use
> +following QEMU command line options to use it (/dev/dax0.0) as the
> +backend of vNVDIMM:
> +
> + -object memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=4G,align=2M
> + -device nvdimm,id=nvdimm1,memdev=mem1
> diff --git a/exec.c b/exec.c
> index 03238a3449..90440efecd 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -1600,7 +1600,13 @@ static void *file_ram_alloc(RAMBlock *block,
>      void *area;
>  
>      block->page_size = qemu_fd_getpagesize(fd);
> -    block->mr->align = block->page_size;
> +    if (block->mr->align % block->page_size) {
> +        error_setg(errp, "aligment 0x%" PRIx64
> +                   " must be multiples of page size 0x%" PRIx64,
> +                   block->mr->align, block->page_size);
> +        return NULL;
> +    }
> +    block->mr->align = MAX(block->page_size, block->mr->align);
>  #if defined(__s390x__)
>      if (kvm_enabled()) {
>          block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 5ed4042f87..a1be8b06d3 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -465,6 +465,8 @@ void memory_region_init_resizeable_ram(MemoryRegion *mr,
>   * @name: Region name, becomes part of RAMBlock name used in migration stream
>   *        must be unique within any device
>   * @size: size of the region.
> + * @align: alignment of the region base address; if 0, the default alignment
> + *         (getpagesize()) will be used.
>   * @share: %true if memory must be mmaped with the MAP_SHARED flag
>   * @path: the path in which to allocate the RAM.
>   * @errp: pointer to Error*, to store an error if it happens.
> @@ -476,6 +478,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
>                                        struct Object *owner,
>                                        const char *name,
>                                        uint64_t size,
> +                                      uint64_t align,
>                                        bool share,
>                                        const char *path,
>                                        Error **errp);
> diff --git a/memory.c b/memory.c
> index e26e5a3b1d..2c80656e39 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -1570,6 +1570,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
>                                        struct Object *owner,
>                                        const char *name,
>                                        uint64_t size,
> +                                      uint64_t align,
>                                        bool share,
>                                        const char *path,
>                                        Error **errp)
> @@ -1578,6 +1579,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
>      mr->ram = true;
>      mr->terminates = true;
>      mr->destructor = memory_region_destructor_ram;
> +    mr->align = align;
>      mr->ram_block = qemu_ram_alloc_from_file(size, mr, share, path, errp);
>      mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0;
>  }
> diff --git a/numa.c b/numa.c
> index 7151b24d1c..b0fe22a60c 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -551,7 +551,7 @@ static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
>      if (mem_path) {
>  #ifdef __linux__
>          Error *err = NULL;
> -        memory_region_init_ram_from_file(mr, owner, name, ram_size, false,
> +        memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, false,
>                                           mem_path, &err);
>          if (err) {
>              error_report_err(err);
> -- 
> 2.14.1
>
Haozhong Zhang Nov. 29, 2017, 12:31 a.m. UTC | #2
copy mst

On 11/27/17 12:35 +0800, Haozhong Zhang wrote:
> When mmap(2) the backend files, QEMU uses the host page size
> (getpagesize(2)) by default as the alignment of mapping address.
> However, some backends may require alignments different than the page
> size. For example, mmap a device DAX (e.g., /dev/dax0.0) on Linux
> kernel 4.13 to an address, which is 4K-aligned but not 2M-aligned,
> fails with a kernel message like
> 
> [617494.969768] dax dax0.0: qemu-system-x86: dax_mmap: fail, unaligned vma (0x7fa37c579000 - 0x7fa43c579000, 0x1fffff)
> 
> Because there is no common approach to get such alignment requirement,
> we add the 'align' option to 'memory-backend-file', so that users or
> management utils, which have enough knowledge about the backend, can
> specify a proper alignment via this option.
> 
> Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> ---
>  backends/hostmem-file.c | 41 ++++++++++++++++++++++++++++++++++++++++-
>  docs/nvdimm.txt         | 16 ++++++++++++++++
>  exec.c                  |  8 +++++++-
>  include/exec/memory.h   |  3 +++
>  memory.c                |  2 ++
>  numa.c                  |  2 +-
>  6 files changed, 69 insertions(+), 3 deletions(-)
> 
> diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
> index e44c319915..e319ec1ad8 100644
> --- a/backends/hostmem-file.c
> +++ b/backends/hostmem-file.c
> @@ -34,6 +34,7 @@ struct HostMemoryBackendFile {
>      bool share;
>      bool discard_data;
>      char *mem_path;
> +    uint64_t align;
>  };
>  
>  static void
> @@ -58,7 +59,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
>          path = object_get_canonical_path(OBJECT(backend));
>          memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
>                                   path,
> -                                 backend->size, fb->share,
> +                                 backend->size, fb->align, fb->share,
>                                   fb->mem_path, errp);
>          g_free(path);
>      }
> @@ -115,6 +116,40 @@ static void file_memory_backend_set_discard_data(Object *o, bool value,
>      MEMORY_BACKEND_FILE(o)->discard_data = value;
>  }
>  
> +static void file_memory_backend_get_align(Object *o, Visitor *v,
> +                                          const char *name, void *opaque,
> +                                          Error **errp)
> +{
> +    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
> +    uint64_t val = fb->align;
> +
> +    visit_type_size(v, name, &val, errp);
> +}
> +
> +static void file_memory_backend_set_align(Object *o, Visitor *v,
> +                                          const char *name, void *opaque,
> +                                          Error **errp)
> +{
> +    HostMemoryBackend *backend = MEMORY_BACKEND(o);
> +    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
> +    Error *local_err = NULL;
> +    uint64_t val;
> +
> +    if (host_memory_backend_mr_inited(backend)) {
> +        error_setg(&local_err, "cannot change property value");
> +        goto out;
> +    }
> +
> +    visit_type_size(v, name, &val, &local_err);
> +    if (local_err) {
> +        goto out;
> +    }
> +    fb->align = val;
> +
> + out:
> +    error_propagate(errp, local_err);
> +}
> +
>  static void file_backend_unparent(Object *obj)
>  {
>      HostMemoryBackend *backend = MEMORY_BACKEND(obj);
> @@ -145,6 +180,10 @@ file_backend_class_init(ObjectClass *oc, void *data)
>      object_class_property_add_str(oc, "mem-path",
>          get_mem_path, set_mem_path,
>          &error_abort);
> +    object_class_property_add(oc, "align", "int",
> +        file_memory_backend_get_align,
> +        file_memory_backend_set_align,
> +        NULL, NULL, &error_abort);
>  }
>  
>  static void file_backend_instance_finalize(Object *o)
> diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt
> index 2d9f8c0e8c..21249dd062 100644
> --- a/docs/nvdimm.txt
> +++ b/docs/nvdimm.txt
> @@ -122,3 +122,19 @@ Note:
>       M >= size of RAM devices +
>            size of statically plugged vNVDIMM devices +
>            size of hotplugged vNVDIMM devices
> +
> +Alignment
> +---------
> +
> +QEMU uses mmap(2) to maps vNVDIMM backends and aligns the mapping
> +address to the page size (getpagesize(2)) by default. However, some
> +types of backends may require an alignment different than the page
> +size. In that case, QEMU v2.12.0 and later provide 'align' option to
> +memory-backend-file to allow users to specify the proper alignment.
> +
> +For example, device dax require the 2 MB alignment, so we can use
> +following QEMU command line options to use it (/dev/dax0.0) as the
> +backend of vNVDIMM:
> +
> + -object memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=4G,align=2M
> + -device nvdimm,id=nvdimm1,memdev=mem1
> diff --git a/exec.c b/exec.c
> index 03238a3449..90440efecd 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -1600,7 +1600,13 @@ static void *file_ram_alloc(RAMBlock *block,
>      void *area;
>  
>      block->page_size = qemu_fd_getpagesize(fd);
> -    block->mr->align = block->page_size;
> +    if (block->mr->align % block->page_size) {
> +        error_setg(errp, "aligment 0x%" PRIx64
> +                   " must be multiples of page size 0x%" PRIx64,
> +                   block->mr->align, block->page_size);
> +        return NULL;
> +    }
> +    block->mr->align = MAX(block->page_size, block->mr->align);
>  #if defined(__s390x__)
>      if (kvm_enabled()) {
>          block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 5ed4042f87..a1be8b06d3 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -465,6 +465,8 @@ void memory_region_init_resizeable_ram(MemoryRegion *mr,
>   * @name: Region name, becomes part of RAMBlock name used in migration stream
>   *        must be unique within any device
>   * @size: size of the region.
> + * @align: alignment of the region base address; if 0, the default alignment
> + *         (getpagesize()) will be used.
>   * @share: %true if memory must be mmaped with the MAP_SHARED flag
>   * @path: the path in which to allocate the RAM.
>   * @errp: pointer to Error*, to store an error if it happens.
> @@ -476,6 +478,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
>                                        struct Object *owner,
>                                        const char *name,
>                                        uint64_t size,
> +                                      uint64_t align,
>                                        bool share,
>                                        const char *path,
>                                        Error **errp);
> diff --git a/memory.c b/memory.c
> index e26e5a3b1d..2c80656e39 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -1570,6 +1570,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
>                                        struct Object *owner,
>                                        const char *name,
>                                        uint64_t size,
> +                                      uint64_t align,
>                                        bool share,
>                                        const char *path,
>                                        Error **errp)
> @@ -1578,6 +1579,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
>      mr->ram = true;
>      mr->terminates = true;
>      mr->destructor = memory_region_destructor_ram;
> +    mr->align = align;
>      mr->ram_block = qemu_ram_alloc_from_file(size, mr, share, path, errp);
>      mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0;
>  }
> diff --git a/numa.c b/numa.c
> index 7151b24d1c..b0fe22a60c 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -551,7 +551,7 @@ static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
>      if (mem_path) {
>  #ifdef __linux__
>          Error *err = NULL;
> -        memory_region_init_ram_from_file(mr, owner, name, ram_size, false,
> +        memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, false,
>                                           mem_path, &err);
>          if (err) {
>              error_report_err(err);
> -- 
> 2.14.1
>
Haozhong Zhang Nov. 29, 2017, 12:33 a.m. UTC | #3
On 11/27/17 23:07 -0200, Eduardo Habkost wrote:
> On Mon, Nov 27, 2017 at 12:35:15PM +0800, Haozhong Zhang wrote:
> > When mmap(2) the backend files, QEMU uses the host page size
> > (getpagesize(2)) by default as the alignment of mapping address.
> > However, some backends may require alignments different than the page
> > size. For example, mmap a device DAX (e.g., /dev/dax0.0) on Linux
> > kernel 4.13 to an address, which is 4K-aligned but not 2M-aligned,
> > fails with a kernel message like
> > 
> > [617494.969768] dax dax0.0: qemu-system-x86: dax_mmap: fail, unaligned vma (0x7fa37c579000 - 0x7fa43c579000, 0x1fffff)
> > 
> > Because there is no common approach to get such alignment requirement,
> > we add the 'align' option to 'memory-backend-file', so that users or
> > management utils, which have enough knowledge about the backend, can
> > specify a proper alignment via this option.
> > 
> > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> 
> The new option needs to be documented on qemu-options.hx.

will add in the next version.

Thanks,
Haozhong

> 
> Except for that, the patch looks good to me.
> 
> 
> > ---
> >  backends/hostmem-file.c | 41 ++++++++++++++++++++++++++++++++++++++++-
> >  docs/nvdimm.txt         | 16 ++++++++++++++++
> >  exec.c                  |  8 +++++++-
> >  include/exec/memory.h   |  3 +++
> >  memory.c                |  2 ++
> >  numa.c                  |  2 +-
> >  6 files changed, 69 insertions(+), 3 deletions(-)
> > 
> > diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
> > index e44c319915..e319ec1ad8 100644
> > --- a/backends/hostmem-file.c
> > +++ b/backends/hostmem-file.c
> > @@ -34,6 +34,7 @@ struct HostMemoryBackendFile {
> >      bool share;
> >      bool discard_data;
> >      char *mem_path;
> > +    uint64_t align;
> >  };
> >  
> >  static void
> > @@ -58,7 +59,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
> >          path = object_get_canonical_path(OBJECT(backend));
> >          memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
> >                                   path,
> > -                                 backend->size, fb->share,
> > +                                 backend->size, fb->align, fb->share,
> >                                   fb->mem_path, errp);
> >          g_free(path);
> >      }
> > @@ -115,6 +116,40 @@ static void file_memory_backend_set_discard_data(Object *o, bool value,
> >      MEMORY_BACKEND_FILE(o)->discard_data = value;
> >  }
> >  
> > +static void file_memory_backend_get_align(Object *o, Visitor *v,
> > +                                          const char *name, void *opaque,
> > +                                          Error **errp)
> > +{
> > +    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
> > +    uint64_t val = fb->align;
> > +
> > +    visit_type_size(v, name, &val, errp);
> > +}
> > +
> > +static void file_memory_backend_set_align(Object *o, Visitor *v,
> > +                                          const char *name, void *opaque,
> > +                                          Error **errp)
> > +{
> > +    HostMemoryBackend *backend = MEMORY_BACKEND(o);
> > +    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
> > +    Error *local_err = NULL;
> > +    uint64_t val;
> > +
> > +    if (host_memory_backend_mr_inited(backend)) {
> > +        error_setg(&local_err, "cannot change property value");
> > +        goto out;
> > +    }
> > +
> > +    visit_type_size(v, name, &val, &local_err);
> > +    if (local_err) {
> > +        goto out;
> > +    }
> > +    fb->align = val;
> > +
> > + out:
> > +    error_propagate(errp, local_err);
> > +}
> > +
> >  static void file_backend_unparent(Object *obj)
> >  {
> >      HostMemoryBackend *backend = MEMORY_BACKEND(obj);
> > @@ -145,6 +180,10 @@ file_backend_class_init(ObjectClass *oc, void *data)
> >      object_class_property_add_str(oc, "mem-path",
> >          get_mem_path, set_mem_path,
> >          &error_abort);
> > +    object_class_property_add(oc, "align", "int",
> > +        file_memory_backend_get_align,
> > +        file_memory_backend_set_align,
> > +        NULL, NULL, &error_abort);
> >  }
> >  
> >  static void file_backend_instance_finalize(Object *o)
> > diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt
> > index 2d9f8c0e8c..21249dd062 100644
> > --- a/docs/nvdimm.txt
> > +++ b/docs/nvdimm.txt
> > @@ -122,3 +122,19 @@ Note:
> >       M >= size of RAM devices +
> >            size of statically plugged vNVDIMM devices +
> >            size of hotplugged vNVDIMM devices
> > +
> > +Alignment
> > +---------
> > +
> > +QEMU uses mmap(2) to maps vNVDIMM backends and aligns the mapping
> > +address to the page size (getpagesize(2)) by default. However, some
> > +types of backends may require an alignment different than the page
> > +size. In that case, QEMU v2.12.0 and later provide 'align' option to
> > +memory-backend-file to allow users to specify the proper alignment.
> > +
> > +For example, device dax require the 2 MB alignment, so we can use
> > +following QEMU command line options to use it (/dev/dax0.0) as the
> > +backend of vNVDIMM:
> > +
> > + -object memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=4G,align=2M
> > + -device nvdimm,id=nvdimm1,memdev=mem1
> > diff --git a/exec.c b/exec.c
> > index 03238a3449..90440efecd 100644
> > --- a/exec.c
> > +++ b/exec.c
> > @@ -1600,7 +1600,13 @@ static void *file_ram_alloc(RAMBlock *block,
> >      void *area;
> >  
> >      block->page_size = qemu_fd_getpagesize(fd);
> > -    block->mr->align = block->page_size;
> > +    if (block->mr->align % block->page_size) {
> > +        error_setg(errp, "aligment 0x%" PRIx64
> > +                   " must be multiples of page size 0x%" PRIx64,
> > +                   block->mr->align, block->page_size);
> > +        return NULL;
> > +    }
> > +    block->mr->align = MAX(block->page_size, block->mr->align);
> >  #if defined(__s390x__)
> >      if (kvm_enabled()) {
> >          block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
> > diff --git a/include/exec/memory.h b/include/exec/memory.h
> > index 5ed4042f87..a1be8b06d3 100644
> > --- a/include/exec/memory.h
> > +++ b/include/exec/memory.h
> > @@ -465,6 +465,8 @@ void memory_region_init_resizeable_ram(MemoryRegion *mr,
> >   * @name: Region name, becomes part of RAMBlock name used in migration stream
> >   *        must be unique within any device
> >   * @size: size of the region.
> > + * @align: alignment of the region base address; if 0, the default alignment
> > + *         (getpagesize()) will be used.
> >   * @share: %true if memory must be mmaped with the MAP_SHARED flag
> >   * @path: the path in which to allocate the RAM.
> >   * @errp: pointer to Error*, to store an error if it happens.
> > @@ -476,6 +478,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
> >                                        struct Object *owner,
> >                                        const char *name,
> >                                        uint64_t size,
> > +                                      uint64_t align,
> >                                        bool share,
> >                                        const char *path,
> >                                        Error **errp);
> > diff --git a/memory.c b/memory.c
> > index e26e5a3b1d..2c80656e39 100644
> > --- a/memory.c
> > +++ b/memory.c
> > @@ -1570,6 +1570,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
> >                                        struct Object *owner,
> >                                        const char *name,
> >                                        uint64_t size,
> > +                                      uint64_t align,
> >                                        bool share,
> >                                        const char *path,
> >                                        Error **errp)
> > @@ -1578,6 +1579,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
> >      mr->ram = true;
> >      mr->terminates = true;
> >      mr->destructor = memory_region_destructor_ram;
> > +    mr->align = align;
> >      mr->ram_block = qemu_ram_alloc_from_file(size, mr, share, path, errp);
> >      mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0;
> >  }
> > diff --git a/numa.c b/numa.c
> > index 7151b24d1c..b0fe22a60c 100644
> > --- a/numa.c
> > +++ b/numa.c
> > @@ -551,7 +551,7 @@ static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
> >      if (mem_path) {
> >  #ifdef __linux__
> >          Error *err = NULL;
> > -        memory_region_init_ram_from_file(mr, owner, name, ram_size, false,
> > +        memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, false,
> >                                           mem_path, &err);
> >          if (err) {
> >              error_report_err(err);
> > -- 
> > 2.14.1
> > 
> 
> -- 
> Eduardo
Haozhong Zhang Nov. 29, 2017, 12:38 a.m. UTC | #4
[ it looks my email client had some glitch and failed to add mst in cc list ]

copy mst

On 11/27/17 12:35 +0800, Haozhong Zhang wrote:
> When mmap(2) the backend files, QEMU uses the host page size
> (getpagesize(2)) by default as the alignment of mapping address.
> However, some backends may require alignments different than the page
> size. For example, mmap a device DAX (e.g., /dev/dax0.0) on Linux
> kernel 4.13 to an address, which is 4K-aligned but not 2M-aligned,
> fails with a kernel message like
> 
> [617494.969768] dax dax0.0: qemu-system-x86: dax_mmap: fail, unaligned vma (0x7fa37c579000 - 0x7fa43c579000, 0x1fffff)
> 
> Because there is no common approach to get such alignment requirement,
> we add the 'align' option to 'memory-backend-file', so that users or
> management utils, which have enough knowledge about the backend, can
> specify a proper alignment via this option.
> 
> Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> ---
>  backends/hostmem-file.c | 41 ++++++++++++++++++++++++++++++++++++++++-
>  docs/nvdimm.txt         | 16 ++++++++++++++++
>  exec.c                  |  8 +++++++-
>  include/exec/memory.h   |  3 +++
>  memory.c                |  2 ++
>  numa.c                  |  2 +-
>  6 files changed, 69 insertions(+), 3 deletions(-)
> 
> diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
> index e44c319915..e319ec1ad8 100644
> --- a/backends/hostmem-file.c
> +++ b/backends/hostmem-file.c
> @@ -34,6 +34,7 @@ struct HostMemoryBackendFile {
>      bool share;
>      bool discard_data;
>      char *mem_path;
> +    uint64_t align;
>  };
>  
>  static void
> @@ -58,7 +59,7 @@ file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
>          path = object_get_canonical_path(OBJECT(backend));
>          memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
>                                   path,
> -                                 backend->size, fb->share,
> +                                 backend->size, fb->align, fb->share,
>                                   fb->mem_path, errp);
>          g_free(path);
>      }
> @@ -115,6 +116,40 @@ static void file_memory_backend_set_discard_data(Object *o, bool value,
>      MEMORY_BACKEND_FILE(o)->discard_data = value;
>  }
>  
> +static void file_memory_backend_get_align(Object *o, Visitor *v,
> +                                          const char *name, void *opaque,
> +                                          Error **errp)
> +{
> +    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
> +    uint64_t val = fb->align;
> +
> +    visit_type_size(v, name, &val, errp);
> +}
> +
> +static void file_memory_backend_set_align(Object *o, Visitor *v,
> +                                          const char *name, void *opaque,
> +                                          Error **errp)
> +{
> +    HostMemoryBackend *backend = MEMORY_BACKEND(o);
> +    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
> +    Error *local_err = NULL;
> +    uint64_t val;
> +
> +    if (host_memory_backend_mr_inited(backend)) {
> +        error_setg(&local_err, "cannot change property value");
> +        goto out;
> +    }
> +
> +    visit_type_size(v, name, &val, &local_err);
> +    if (local_err) {
> +        goto out;
> +    }
> +    fb->align = val;
> +
> + out:
> +    error_propagate(errp, local_err);
> +}
> +
>  static void file_backend_unparent(Object *obj)
>  {
>      HostMemoryBackend *backend = MEMORY_BACKEND(obj);
> @@ -145,6 +180,10 @@ file_backend_class_init(ObjectClass *oc, void *data)
>      object_class_property_add_str(oc, "mem-path",
>          get_mem_path, set_mem_path,
>          &error_abort);
> +    object_class_property_add(oc, "align", "int",
> +        file_memory_backend_get_align,
> +        file_memory_backend_set_align,
> +        NULL, NULL, &error_abort);
>  }
>  
>  static void file_backend_instance_finalize(Object *o)
> diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt
> index 2d9f8c0e8c..21249dd062 100644
> --- a/docs/nvdimm.txt
> +++ b/docs/nvdimm.txt
> @@ -122,3 +122,19 @@ Note:
>       M >= size of RAM devices +
>            size of statically plugged vNVDIMM devices +
>            size of hotplugged vNVDIMM devices
> +
> +Alignment
> +---------
> +
> +QEMU uses mmap(2) to maps vNVDIMM backends and aligns the mapping
> +address to the page size (getpagesize(2)) by default. However, some
> +types of backends may require an alignment different than the page
> +size. In that case, QEMU v2.12.0 and later provide 'align' option to
> +memory-backend-file to allow users to specify the proper alignment.
> +
> +For example, device dax require the 2 MB alignment, so we can use
> +following QEMU command line options to use it (/dev/dax0.0) as the
> +backend of vNVDIMM:
> +
> + -object memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=4G,align=2M
> + -device nvdimm,id=nvdimm1,memdev=mem1
> diff --git a/exec.c b/exec.c
> index 03238a3449..90440efecd 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -1600,7 +1600,13 @@ static void *file_ram_alloc(RAMBlock *block,
>      void *area;
>  
>      block->page_size = qemu_fd_getpagesize(fd);
> -    block->mr->align = block->page_size;
> +    if (block->mr->align % block->page_size) {
> +        error_setg(errp, "aligment 0x%" PRIx64
> +                   " must be multiples of page size 0x%" PRIx64,
> +                   block->mr->align, block->page_size);
> +        return NULL;
> +    }
> +    block->mr->align = MAX(block->page_size, block->mr->align);
>  #if defined(__s390x__)
>      if (kvm_enabled()) {
>          block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 5ed4042f87..a1be8b06d3 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -465,6 +465,8 @@ void memory_region_init_resizeable_ram(MemoryRegion *mr,
>   * @name: Region name, becomes part of RAMBlock name used in migration stream
>   *        must be unique within any device
>   * @size: size of the region.
> + * @align: alignment of the region base address; if 0, the default alignment
> + *         (getpagesize()) will be used.
>   * @share: %true if memory must be mmaped with the MAP_SHARED flag
>   * @path: the path in which to allocate the RAM.
>   * @errp: pointer to Error*, to store an error if it happens.
> @@ -476,6 +478,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
>                                        struct Object *owner,
>                                        const char *name,
>                                        uint64_t size,
> +                                      uint64_t align,
>                                        bool share,
>                                        const char *path,
>                                        Error **errp);
> diff --git a/memory.c b/memory.c
> index e26e5a3b1d..2c80656e39 100644
> --- a/memory.c
> +++ b/memory.c
> @@ -1570,6 +1570,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
>                                        struct Object *owner,
>                                        const char *name,
>                                        uint64_t size,
> +                                      uint64_t align,
>                                        bool share,
>                                        const char *path,
>                                        Error **errp)
> @@ -1578,6 +1579,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
>      mr->ram = true;
>      mr->terminates = true;
>      mr->destructor = memory_region_destructor_ram;
> +    mr->align = align;
>      mr->ram_block = qemu_ram_alloc_from_file(size, mr, share, path, errp);
>      mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0;
>  }
> diff --git a/numa.c b/numa.c
> index 7151b24d1c..b0fe22a60c 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -551,7 +551,7 @@ static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
>      if (mem_path) {
>  #ifdef __linux__
>          Error *err = NULL;
> -        memory_region_init_ram_from_file(mr, owner, name, ram_size, false,
> +        memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, false,
>                                           mem_path, &err);
>          if (err) {
>              error_report_err(err);
> -- 
> 2.14.1
>
Eduardo Habkost Nov. 29, 2017, 10:05 a.m. UTC | #5
On Wed, Nov 29, 2017 at 08:33:29AM +0800, Haozhong Zhang wrote:
> On 11/27/17 23:07 -0200, Eduardo Habkost wrote:
> > On Mon, Nov 27, 2017 at 12:35:15PM +0800, Haozhong Zhang wrote:
> > > When mmap(2) the backend files, QEMU uses the host page size
> > > (getpagesize(2)) by default as the alignment of mapping address.
> > > However, some backends may require alignments different than the page
> > > size. For example, mmap a device DAX (e.g., /dev/dax0.0) on Linux
> > > kernel 4.13 to an address, which is 4K-aligned but not 2M-aligned,
> > > fails with a kernel message like
> > > 
> > > [617494.969768] dax dax0.0: qemu-system-x86: dax_mmap: fail, unaligned vma (0x7fa37c579000 - 0x7fa43c579000, 0x1fffff)
> > > 
> > > Because there is no common approach to get such alignment requirement,
> > > we add the 'align' option to 'memory-backend-file', so that users or
> > > management utils, which have enough knowledge about the backend, can
> > > specify a proper alignment via this option.
> > > 
> > > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > 
> > The new option needs to be documented on qemu-options.hx.
> 
> will add in the next version.

Note that there are patches on on machine-next that change the memory backend
documentation.

See:
  git://github.com/ehabkost/qemu.git machine-next
  https://github.com/ehabkost/qemu/commit/869f7f61c557a46b46e41b5e38a61551d45b6d0d

But the conflicts should be trivial to solve if you document the new option in
a separate paragraph.
diff mbox series

Patch

diff --git a/backends/hostmem-file.c b/backends/hostmem-file.c
index e44c319915..e319ec1ad8 100644
--- a/backends/hostmem-file.c
+++ b/backends/hostmem-file.c
@@ -34,6 +34,7 @@  struct HostMemoryBackendFile {
     bool share;
     bool discard_data;
     char *mem_path;
+    uint64_t align;
 };
 
 static void
@@ -58,7 +59,7 @@  file_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
         path = object_get_canonical_path(OBJECT(backend));
         memory_region_init_ram_from_file(&backend->mr, OBJECT(backend),
                                  path,
-                                 backend->size, fb->share,
+                                 backend->size, fb->align, fb->share,
                                  fb->mem_path, errp);
         g_free(path);
     }
@@ -115,6 +116,40 @@  static void file_memory_backend_set_discard_data(Object *o, bool value,
     MEMORY_BACKEND_FILE(o)->discard_data = value;
 }
 
+static void file_memory_backend_get_align(Object *o, Visitor *v,
+                                          const char *name, void *opaque,
+                                          Error **errp)
+{
+    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
+    uint64_t val = fb->align;
+
+    visit_type_size(v, name, &val, errp);
+}
+
+static void file_memory_backend_set_align(Object *o, Visitor *v,
+                                          const char *name, void *opaque,
+                                          Error **errp)
+{
+    HostMemoryBackend *backend = MEMORY_BACKEND(o);
+    HostMemoryBackendFile *fb = MEMORY_BACKEND_FILE(o);
+    Error *local_err = NULL;
+    uint64_t val;
+
+    if (host_memory_backend_mr_inited(backend)) {
+        error_setg(&local_err, "cannot change property value");
+        goto out;
+    }
+
+    visit_type_size(v, name, &val, &local_err);
+    if (local_err) {
+        goto out;
+    }
+    fb->align = val;
+
+ out:
+    error_propagate(errp, local_err);
+}
+
 static void file_backend_unparent(Object *obj)
 {
     HostMemoryBackend *backend = MEMORY_BACKEND(obj);
@@ -145,6 +180,10 @@  file_backend_class_init(ObjectClass *oc, void *data)
     object_class_property_add_str(oc, "mem-path",
         get_mem_path, set_mem_path,
         &error_abort);
+    object_class_property_add(oc, "align", "int",
+        file_memory_backend_get_align,
+        file_memory_backend_set_align,
+        NULL, NULL, &error_abort);
 }
 
 static void file_backend_instance_finalize(Object *o)
diff --git a/docs/nvdimm.txt b/docs/nvdimm.txt
index 2d9f8c0e8c..21249dd062 100644
--- a/docs/nvdimm.txt
+++ b/docs/nvdimm.txt
@@ -122,3 +122,19 @@  Note:
      M >= size of RAM devices +
           size of statically plugged vNVDIMM devices +
           size of hotplugged vNVDIMM devices
+
+Alignment
+---------
+
+QEMU uses mmap(2) to maps vNVDIMM backends and aligns the mapping
+address to the page size (getpagesize(2)) by default. However, some
+types of backends may require an alignment different than the page
+size. In that case, QEMU v2.12.0 and later provide 'align' option to
+memory-backend-file to allow users to specify the proper alignment.
+
+For example, device dax require the 2 MB alignment, so we can use
+following QEMU command line options to use it (/dev/dax0.0) as the
+backend of vNVDIMM:
+
+ -object memory-backend-file,id=mem1,share=on,mem-path=/dev/dax0.0,size=4G,align=2M
+ -device nvdimm,id=nvdimm1,memdev=mem1
diff --git a/exec.c b/exec.c
index 03238a3449..90440efecd 100644
--- a/exec.c
+++ b/exec.c
@@ -1600,7 +1600,13 @@  static void *file_ram_alloc(RAMBlock *block,
     void *area;
 
     block->page_size = qemu_fd_getpagesize(fd);
-    block->mr->align = block->page_size;
+    if (block->mr->align % block->page_size) {
+        error_setg(errp, "aligment 0x%" PRIx64
+                   " must be multiples of page size 0x%" PRIx64,
+                   block->mr->align, block->page_size);
+        return NULL;
+    }
+    block->mr->align = MAX(block->page_size, block->mr->align);
 #if defined(__s390x__)
     if (kvm_enabled()) {
         block->mr->align = MAX(block->mr->align, QEMU_VMALLOC_ALIGN);
diff --git a/include/exec/memory.h b/include/exec/memory.h
index 5ed4042f87..a1be8b06d3 100644
--- a/include/exec/memory.h
+++ b/include/exec/memory.h
@@ -465,6 +465,8 @@  void memory_region_init_resizeable_ram(MemoryRegion *mr,
  * @name: Region name, becomes part of RAMBlock name used in migration stream
  *        must be unique within any device
  * @size: size of the region.
+ * @align: alignment of the region base address; if 0, the default alignment
+ *         (getpagesize()) will be used.
  * @share: %true if memory must be mmaped with the MAP_SHARED flag
  * @path: the path in which to allocate the RAM.
  * @errp: pointer to Error*, to store an error if it happens.
@@ -476,6 +478,7 @@  void memory_region_init_ram_from_file(MemoryRegion *mr,
                                       struct Object *owner,
                                       const char *name,
                                       uint64_t size,
+                                      uint64_t align,
                                       bool share,
                                       const char *path,
                                       Error **errp);
diff --git a/memory.c b/memory.c
index e26e5a3b1d..2c80656e39 100644
--- a/memory.c
+++ b/memory.c
@@ -1570,6 +1570,7 @@  void memory_region_init_ram_from_file(MemoryRegion *mr,
                                       struct Object *owner,
                                       const char *name,
                                       uint64_t size,
+                                      uint64_t align,
                                       bool share,
                                       const char *path,
                                       Error **errp)
@@ -1578,6 +1579,7 @@  void memory_region_init_ram_from_file(MemoryRegion *mr,
     mr->ram = true;
     mr->terminates = true;
     mr->destructor = memory_region_destructor_ram;
+    mr->align = align;
     mr->ram_block = qemu_ram_alloc_from_file(size, mr, share, path, errp);
     mr->dirty_log_mask = tcg_enabled() ? (1 << DIRTY_MEMORY_CODE) : 0;
 }
diff --git a/numa.c b/numa.c
index 7151b24d1c..b0fe22a60c 100644
--- a/numa.c
+++ b/numa.c
@@ -551,7 +551,7 @@  static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
     if (mem_path) {
 #ifdef __linux__
         Error *err = NULL;
-        memory_region_init_ram_from_file(mr, owner, name, ram_size, false,
+        memory_region_init_ram_from_file(mr, owner, name, ram_size, 0, false,
                                          mem_path, &err);
         if (err) {
             error_report_err(err);