diff mbox series

[V13,4/5] util/mmap-alloc: support MAP_SYNC in qemu_ram_mmap()

Message ID 5d07bf7e9a3e576f5a87e81d786e8886fb2bb551.1549555521.git.yi.z.zhang@linux.intel.com
State New
Headers show
Series support MAP_SYNC for memory-backend-file | expand

Commit Message

Zhang, Yi Feb. 8, 2019, 10:11 a.m. UTC
From: Zhang Yi <yi.z.zhang@linux.intel.com>

When a file supporting DAX is used as vNVDIMM backend, mmap it with
MAP_SYNC flag in addition which can ensure file system metadata
synced in each guest writes to the backend file, without other QEMU
actions (e.g., periodic fsync() by QEMU).

Current, We have below different possible use cases:

1. pmem=on is set, shared=on is set, MAP_SYNC supported:
   a: backend is a dax supporting file.
    - MAP_SYNC will active.
   b: backend is not a dax supporting file.
    - mmap will trigger a warning. then MAP_SYNC flag will be ignored

2. The rest of cases:
   - we will never pass the MAP_SYNC to mmap2

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
---
 util/mmap-alloc.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 44 insertions(+), 1 deletion(-)

Comments

Eduardo Habkost April 18, 2019, 10:05 p.m. UTC | #1
Hi,

I found out that this series missed QEMU 4.0 and I was going to
queue for 4.1, but unfortunately this patch conflicts with:

commit 2044c3e7116eeac0449dcb4a4130cc8f8b9310da
Author: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
Date:   Wed Jan 30 21:36:04 2019 -0200

    mmap-alloc: unfold qemu_ram_mmap()
    
    Unfold parts of qemu_ram_mmap() for the sake of understanding, moving
    declarations to the top, and keeping architecture-specifics in the
    ifdef-else blocks.  No changes in the function behaviour.
    
    Give ptr and ptr1 meaningful names:
      ptr  -> guardptr : pointer to the PROT_NONE guard region
      ptr1 -> ptr      : pointer to the mapped memory returned to caller
    
    Signed-off-by: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
    Reviewed-by: Greg Kurz <groug@kaod.org>
    Signed-off-by: David Gibson <david@gibson.dropbear.id.au>

I'm queueing patches 1-3 into machine-next[1], so only patches 4
and 5 need to be refreshed and resubmitted.

[1] https://github.com/ehabkost/qemu.git machine-next


On Fri, Feb 08, 2019 at 06:11:11PM +0800, Zhang, Yi wrote:
> From: Zhang Yi <yi.z.zhang@linux.intel.com>
> 
> When a file supporting DAX is used as vNVDIMM backend, mmap it with
> MAP_SYNC flag in addition which can ensure file system metadata
> synced in each guest writes to the backend file, without other QEMU
> actions (e.g., periodic fsync() by QEMU).
> 
> Current, We have below different possible use cases:
> 
> 1. pmem=on is set, shared=on is set, MAP_SYNC supported:
>    a: backend is a dax supporting file.
>     - MAP_SYNC will active.
>    b: backend is not a dax supporting file.
>     - mmap will trigger a warning. then MAP_SYNC flag will be ignored
> 
> 2. The rest of cases:
>    - we will never pass the MAP_SYNC to mmap2
> 
> Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
> ---
>  util/mmap-alloc.c | 45 ++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 44 insertions(+), 1 deletion(-)
> 
> diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
> index 97bbeed..2f21efd 100644
> --- a/util/mmap-alloc.c
> +++ b/util/mmap-alloc.c
> @@ -10,6 +10,13 @@
>   * later.  See the COPYING file in the top-level directory.
>   */
>  
> +#ifdef CONFIG_LINUX
> +#include <linux/mman.h>
> +#else  /* !CONFIG_LINUX */
> +#define MAP_SYNC              0x0
> +#define MAP_SHARED_VALIDATE   0x0
> +#endif /* CONFIG_LINUX */
> +
>  #include "qemu/osdep.h"
>  #include "qemu/mmap-alloc.h"
>  #include "qemu/host-utils.h"
> @@ -101,6 +108,8 @@ void *qemu_ram_mmap(int fd,
>  #else
>      void *ptr = mmap(0, total, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
>  #endif
> +    int mmap_flags;
> +    int map_sync_flags = 0;
>      size_t offset;
>      void *ptr1;
>  
> @@ -111,13 +120,47 @@ void *qemu_ram_mmap(int fd,
>      assert(is_power_of_2(align));
>      /* Always align to host page size */
>      assert(align >= getpagesize());
> +    mmap_flags = shared ? MAP_SHARED : MAP_PRIVATE;
> +    if (shared && is_pmem) {
> +        map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
> +        mmap_flags |= map_sync_flags;
> +    }
>  
>      offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr;
>      ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE,
>                  MAP_FIXED |
>                  (fd == -1 ? MAP_ANONYMOUS : 0) |
> -                (shared ? MAP_SHARED : MAP_PRIVATE),
> +                mmap_flags,
>                  fd, 0);
> +
> +
> +    if (ptr1 == MAP_FAILED && map_sync_flags) {
> +        if (errno == ENOTSUP) {
> +            char *proc_link, *file_name;
> +            int len;
> +            proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
> +            file_name = g_malloc0(PATH_MAX);
> +            len = readlink(proc_link, file_name, PATH_MAX - 1);
> +            if (len < 0) {
> +                len = 0;
> +            }
> +            file_name[len] = '\0';
> +            fprintf(stderr, "Warning: requesting persistence across crashes "
> +                    "for backend file %s failed. Proceeding without "
> +                    "persistence, data might become corrupted in case of host "
> +                    "crash.\n", file_name);
> +            g_free(proc_link);
> +            g_free(file_name);
> +        }
> +        /* if map failed with MAP_SHARED_VALIDATE | MAP_SYNC,
> +         * we will remove these flags to handle compatibility.
> +         */
> +        ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE,
> +                    MAP_FIXED |
> +                    (fd == -1 ? MAP_ANONYMOUS : 0) |
> +                    MAP_SHARED,
> +                    fd, 0);
> +    }
>      if (ptr1 == MAP_FAILED) {
>          munmap(ptr, total);
>          return MAP_FAILED;
> -- 
> 2.7.4
> 
>
Eduardo Habkost April 18, 2019, 10:33 p.m. UTC | #2
On Thu, Apr 18, 2019 at 07:05:16PM -0300, Eduardo Habkost wrote:
> Hi,
> 
> I found out that this series missed QEMU 4.0 and I was going to
> queue for 4.1, but unfortunately this patch conflicts with:
> 
> commit 2044c3e7116eeac0449dcb4a4130cc8f8b9310da
> Author: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
> Date:   Wed Jan 30 21:36:04 2019 -0200
> 
>     mmap-alloc: unfold qemu_ram_mmap()
[...]
> On Fri, Feb 08, 2019 at 06:11:11PM +0800, Zhang, Yi wrote:
> > From: Zhang Yi <yi.z.zhang@linux.intel.com>
> > 
> > When a file supporting DAX is used as vNVDIMM backend, mmap it with
> > MAP_SYNC flag in addition which can ensure file system metadata
> > synced in each guest writes to the backend file, without other QEMU
> > actions (e.g., periodic fsync() by QEMU).
> > 
> > Current, We have below different possible use cases:
> > 
> > 1. pmem=on is set, shared=on is set, MAP_SYNC supported:
> >    a: backend is a dax supporting file.
> >     - MAP_SYNC will active.
> >    b: backend is not a dax supporting file.
> >     - mmap will trigger a warning. then MAP_SYNC flag will be ignored
> > 
> > 2. The rest of cases:
> >    - we will never pass the MAP_SYNC to mmap2
> > 
> > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
> > Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>

Untested patch rebase is below.  Can Intel help test it and
submit v14?

Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
[ehabkost: Rebased patch to latest code on master]
Signed-off-by: Eduardo Habkost <ehabkost@redhat.com>
---
 util/mmap-alloc.c | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
index 9713f4b960..7063a44c7d 100644
--- a/util/mmap-alloc.c
+++ b/util/mmap-alloc.c
@@ -10,6 +10,13 @@
  * later.  See the COPYING file in the top-level directory.
  */
 
+#ifdef CONFIG_LINUX
+#include <linux/mman.h>
+#else  /* !CONFIG_LINUX */
+#define MAP_SYNC              0x0
+#define MAP_SHARED_VALIDATE   0x0
+#endif /* CONFIG_LINUX */
+
 #include "qemu/osdep.h"
 #include "qemu/mmap-alloc.h"
 #include "qemu/host-utils.h"
@@ -82,6 +89,7 @@ void *qemu_ram_mmap(int fd,
                     bool is_pmem)
 {
     int flags;
+    int map_sync_flags = 0;
     int guardfd;
     size_t offset;
     size_t pagesize;
@@ -132,10 +140,39 @@ void *qemu_ram_mmap(int fd,
     flags = MAP_FIXED;
     flags |= fd == -1 ? MAP_ANONYMOUS : 0;
     flags |= shared ? MAP_SHARED : MAP_PRIVATE;
+    if (shared && is_pmem) {
+        map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
+    }
     offset = QEMU_ALIGN_UP((uintptr_t)guardptr, align) - (uintptr_t)guardptr;
 
-    ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE, flags, fd, 0);
-
+    ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE,
+               flags | map_sync_flags, fd, 0);
+
+    if (ptr == MAP_FAILED && map_sync_flags) {
+        if (errno == ENOTSUP) {
+            char *proc_link, *file_name;
+            int len;
+            proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
+            file_name = g_malloc0(PATH_MAX);
+            len = readlink(proc_link, file_name, PATH_MAX - 1);
+            if (len < 0) {
+                len = 0;
+            }
+            file_name[len] = '\0';
+            fprintf(stderr, "Warning: requesting persistence across crashes "
+                    "for backend file %s failed. Proceeding without "
+                    "persistence, data might become corrupted in case of host "
+                    "crash.\n", file_name);
+            g_free(proc_link);
+            g_free(file_name);
+        }
+        /*
+         * if map failed with MAP_SHARED_VALIDATE | MAP_SYNC,
+         * we will remove these flags to handle compatibility.
+         */
+        ptr = mmap(guardptr + offset, size, PROT_READ | PROT_WRITE,
+                   flags, fd, 0);
+    }
     if (ptr == MAP_FAILED) {
         munmap(guardptr, total);
         return MAP_FAILED;
Wei Yang April 22, 2019, 12:37 a.m. UTC | #3
On Thu, Apr 18, 2019 at 07:33:01PM -0300, Eduardo Habkost wrote:
>On Thu, Apr 18, 2019 at 07:05:16PM -0300, Eduardo Habkost wrote:
>> Hi,
>> 
>> I found out that this series missed QEMU 4.0 and I was going to
>> queue for 4.1, but unfortunately this patch conflicts with:
>> 
>> commit 2044c3e7116eeac0449dcb4a4130cc8f8b9310da
>> Author: Murilo Opsfelder Araujo <muriloo@linux.ibm.com>
>> Date:   Wed Jan 30 21:36:04 2019 -0200
>> 
>>     mmap-alloc: unfold qemu_ram_mmap()
>[...]
>> On Fri, Feb 08, 2019 at 06:11:11PM +0800, Zhang, Yi wrote:
>> > From: Zhang Yi <yi.z.zhang@linux.intel.com>
>> > 
>> > When a file supporting DAX is used as vNVDIMM backend, mmap it with
>> > MAP_SYNC flag in addition which can ensure file system metadata
>> > synced in each guest writes to the backend file, without other QEMU
>> > actions (e.g., periodic fsync() by QEMU).
>> > 
>> > Current, We have below different possible use cases:
>> > 
>> > 1. pmem=on is set, shared=on is set, MAP_SYNC supported:
>> >    a: backend is a dax supporting file.
>> >     - MAP_SYNC will active.
>> >    b: backend is not a dax supporting file.
>> >     - mmap will trigger a warning. then MAP_SYNC flag will be ignored
>> > 
>> > 2. The rest of cases:
>> >    - we will never pass the MAP_SYNC to mmap2
>> > 
>> > Signed-off-by: Haozhong Zhang <haozhong.zhang@intel.com>
>> > Signed-off-by: Zhang Yi <yi.z.zhang@linux.intel.com>
>
>Untested patch rebase is below.  Can Intel help test it and
>submit v14?
>

Thanks Eduardo, the patch looks good to me and I have tested it.
diff mbox series

Patch

diff --git a/util/mmap-alloc.c b/util/mmap-alloc.c
index 97bbeed..2f21efd 100644
--- a/util/mmap-alloc.c
+++ b/util/mmap-alloc.c
@@ -10,6 +10,13 @@ 
  * later.  See the COPYING file in the top-level directory.
  */
 
+#ifdef CONFIG_LINUX
+#include <linux/mman.h>
+#else  /* !CONFIG_LINUX */
+#define MAP_SYNC              0x0
+#define MAP_SHARED_VALIDATE   0x0
+#endif /* CONFIG_LINUX */
+
 #include "qemu/osdep.h"
 #include "qemu/mmap-alloc.h"
 #include "qemu/host-utils.h"
@@ -101,6 +108,8 @@  void *qemu_ram_mmap(int fd,
 #else
     void *ptr = mmap(0, total, PROT_NONE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
 #endif
+    int mmap_flags;
+    int map_sync_flags = 0;
     size_t offset;
     void *ptr1;
 
@@ -111,13 +120,47 @@  void *qemu_ram_mmap(int fd,
     assert(is_power_of_2(align));
     /* Always align to host page size */
     assert(align >= getpagesize());
+    mmap_flags = shared ? MAP_SHARED : MAP_PRIVATE;
+    if (shared && is_pmem) {
+        map_sync_flags = MAP_SYNC | MAP_SHARED_VALIDATE;
+        mmap_flags |= map_sync_flags;
+    }
 
     offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr;
     ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE,
                 MAP_FIXED |
                 (fd == -1 ? MAP_ANONYMOUS : 0) |
-                (shared ? MAP_SHARED : MAP_PRIVATE),
+                mmap_flags,
                 fd, 0);
+
+
+    if (ptr1 == MAP_FAILED && map_sync_flags) {
+        if (errno == ENOTSUP) {
+            char *proc_link, *file_name;
+            int len;
+            proc_link = g_strdup_printf("/proc/self/fd/%d", fd);
+            file_name = g_malloc0(PATH_MAX);
+            len = readlink(proc_link, file_name, PATH_MAX - 1);
+            if (len < 0) {
+                len = 0;
+            }
+            file_name[len] = '\0';
+            fprintf(stderr, "Warning: requesting persistence across crashes "
+                    "for backend file %s failed. Proceeding without "
+                    "persistence, data might become corrupted in case of host "
+                    "crash.\n", file_name);
+            g_free(proc_link);
+            g_free(file_name);
+        }
+        /* if map failed with MAP_SHARED_VALIDATE | MAP_SYNC,
+         * we will remove these flags to handle compatibility.
+         */
+        ptr1 = mmap(ptr + offset, size, PROT_READ | PROT_WRITE,
+                    MAP_FIXED |
+                    (fd == -1 ? MAP_ANONYMOUS : 0) |
+                    MAP_SHARED,
+                    fd, 0);
+    }
     if (ptr1 == MAP_FAILED) {
         munmap(ptr, total);
         return MAP_FAILED;