diff mbox series

[RFC,2/2] block/file-posix: verify page cache is not used

Message ID 20180419075232.31407-3-stefanha@redhat.com
State New
Headers show
Series block/file-posix: allow -drive cache.direct=off live migration | expand

Commit Message

Stefan Hajnoczi April 19, 2018, 7:52 a.m. UTC
This commit is for debugging only.  Do not merge it.

mincore(2) checks whether pages are resident.  Use it to verify that
page cache has been dropped.

You can trigger a verification failure by mmapping the image file from
another process and loading a byte from a page so that it becomes
resident.  bdrv_co_invalidate_cache() will fail while the process is
alive.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/file-posix.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

Comments

Dr. David Alan Gilbert April 19, 2018, 9:05 a.m. UTC | #1
* Stefan Hajnoczi (stefanha@redhat.com) wrote:
> This commit is for debugging only.  Do not merge it.
> 
> mincore(2) checks whether pages are resident.  Use it to verify that
> page cache has been dropped.
> 
> You can trigger a verification failure by mmapping the image file from
> another process and loading a byte from a page so that it becomes
> resident.  bdrv_co_invalidate_cache() will fail while the process is
> alive.

It doesn't seem a bad diagnostic to keep in (with a switch to activate)
for when we're faced with some weird corruption on some weird storage
system.

Dave

> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  block/file-posix.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 71 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index df4f52919f..d3105269c6 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -2236,6 +2236,75 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
>      return ret | BDRV_BLOCK_OFFSET_VALID;
>  }
>  
> +static bool is_mincore(void *addr, size_t length)
> +{
> +    size_t vec_len = DIV_ROUND_UP(length, sysconf(_SC_PAGESIZE));
> +    unsigned char *vec;
> +    size_t i;
> +    int ret;
> +    bool incore = false;
> +
> +    vec = g_malloc(vec_len);
> +    ret = mincore(addr, length, vec);
> +    if (ret < 0) {
> +        incore = true;
> +        goto out;
> +    }
> +
> +    for (i = 0; i < vec_len; i++) {
> +        if (vec[i] & 0x1) {
> +            incore = true;
> +            break;
> +        }
> +    }
> +
> +out:
> +    g_free(vec);
> +    return incore;
> +}
> +
> +static void check_not_in_page_cache(BlockDriverState *bs, Error **errp)
> +{
> +    const size_t WINDOW_SIZE = 128 * 1024 * 1024;
> +    BDRVRawState *s = bs->opaque;
> +    void *window = NULL;
> +    size_t length = 0;
> +    off_t end;
> +    off_t offset;
> +
> +    end = raw_getlength(bs);
> +
> +    for (offset = 0; offset < end; offset += WINDOW_SIZE) {
> +        void *new_window;
> +        size_t new_length = MIN(end - offset, WINDOW_SIZE);
> +
> +        if (new_length != length) {
> +            munmap(window, length);
> +            window = NULL;
> +            length = 0;
> +        }
> +
> +        new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
> +                          s->fd, offset);
> +        if (new_window == MAP_FAILED) {
> +            error_setg_errno(errp, errno, "mmap failed");
> +            break;
> +        }
> +
> +        window = new_window;
> +        length = new_length;
> +
> +        if (is_mincore(window, length)) {
> +            error_setg(errp, "page cache still in use!");
> +            break;
> +        }
> +    }
> +
> +    if (window) {
> +        munmap(window, length);
> +    }
> +}
> +
>  static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
>                                                   Error **errp)
>  {
> @@ -2270,6 +2339,8 @@ static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
>          return;
>      }
>  #endif /* __linux__ */
> +
> +    check_not_in_page_cache(bs, errp);
>  }
>  
>  static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,
> -- 
> 2.14.3
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Stefan Hajnoczi April 20, 2018, 3:02 a.m. UTC | #2
On Thu, Apr 19, 2018 at 10:05:47AM +0100, Dr. David Alan Gilbert wrote:
> * Stefan Hajnoczi (stefanha@redhat.com) wrote:
> > This commit is for debugging only.  Do not merge it.
> > 
> > mincore(2) checks whether pages are resident.  Use it to verify that
> > page cache has been dropped.
> > 
> > You can trigger a verification failure by mmapping the image file from
> > another process and loading a byte from a page so that it becomes
> > resident.  bdrv_co_invalidate_cache() will fail while the process is
> > alive.
> 
> It doesn't seem a bad diagnostic to keep in (with a switch to activate)
> for when we're faced with some weird corruption on some weird storage
> system.

Okay.  It's very slow to mmap an entire image file and query mincore(2)
so it needs to be off by default.

Stefan
Kevin Wolf April 20, 2018, 6:25 a.m. UTC | #3
Am 20.04.2018 um 05:02 hat Stefan Hajnoczi geschrieben:
> On Thu, Apr 19, 2018 at 10:05:47AM +0100, Dr. David Alan Gilbert wrote:
> > * Stefan Hajnoczi (stefanha@redhat.com) wrote:
> > > This commit is for debugging only.  Do not merge it.
> > > 
> > > mincore(2) checks whether pages are resident.  Use it to verify that
> > > page cache has been dropped.
> > > 
> > > You can trigger a verification failure by mmapping the image file from
> > > another process and loading a byte from a page so that it becomes
> > > resident.  bdrv_co_invalidate_cache() will fail while the process is
> > > alive.
> > 
> > It doesn't seem a bad diagnostic to keep in (with a switch to activate)
> > for when we're faced with some weird corruption on some weird storage
> > system.
> 
> Okay.  It's very slow to mmap an entire image file and query mincore(2)
> so it needs to be off by default.

Also, having it enabled breaks localhost migration at least on tmpfs
(which was what I tried out first).

I wonder if the kernel would add some way to query whether the "advice"
was actually acted upon if we asked. Either with a new function that
returns an error if not everything is dropped (basically
.bdrv_invalidate_cache on the kernel level), or a function that just
queries if any page is allocated (or maybe the address of the first
allocated page in a given range) without having to use mincore() and
iterating over all the pages in userspace.

Kevin
Stefan Hajnoczi April 24, 2018, 2:04 p.m. UTC | #4
On Fri, Apr 20, 2018 at 08:25:13AM +0200, Kevin Wolf wrote:
> Am 20.04.2018 um 05:02 hat Stefan Hajnoczi geschrieben:
> > On Thu, Apr 19, 2018 at 10:05:47AM +0100, Dr. David Alan Gilbert wrote:
> > > * Stefan Hajnoczi (stefanha@redhat.com) wrote:
> > > > This commit is for debugging only.  Do not merge it.
> > > > 
> > > > mincore(2) checks whether pages are resident.  Use it to verify that
> > > > page cache has been dropped.
> > > > 
> > > > You can trigger a verification failure by mmapping the image file from
> > > > another process and loading a byte from a page so that it becomes
> > > > resident.  bdrv_co_invalidate_cache() will fail while the process is
> > > > alive.
> > > 
> > > It doesn't seem a bad diagnostic to keep in (with a switch to activate)
> > > for when we're faced with some weird corruption on some weird storage
> > > system.
> > 
> > Okay.  It's very slow to mmap an entire image file and query mincore(2)
> > so it needs to be off by default.
> 
> Also, having it enabled breaks localhost migration at least on tmpfs
> (which was what I tried out first).
> 
> I wonder if the kernel would add some way to query whether the "advice"
> was actually acted upon if we asked. Either with a new function that
> returns an error if not everything is dropped (basically
> .bdrv_invalidate_cache on the kernel level), or a function that just
> queries if any page is allocated (or maybe the address of the first
> allocated page in a given range) without having to use mincore() and
> iterating over all the pages in userspace.

I'm trying to figure out how to expose the optional mincore check on the
command-line/QMP:

1. Add a check_consistency bool argument to bdrv_invalidate_cache*().
   Add command-line/QMP option to -incoming and migrate_incoming.  This
   is messy and won't be easy to access for libvirt users.

2. Add a BlockdevOptionsFile *check-cache-consistency bool field.  This
   is specified at .bdrv_open() time.  It can be changed at runtime with
   .bdrv_reopen*().

3. Add a 'blockdev-check-cache-consistency' QMP command that calls a new
   .bdrv_check_cache_consistency() callback that is implemented by
   file-posix.c.  The problem is users might issue this command after
   I/O has resumed and pages have become resident again.  It only makes
   sense if the guest is still paused.  Probably a bad interface...

Have I missed a good way to expose this optional check functionality?

Which approach do you prefer?  I'm leaning towards #2.

Stefan
Kevin Wolf April 24, 2018, 2:29 p.m. UTC | #5
Am 24.04.2018 um 16:04 hat Stefan Hajnoczi geschrieben:
> On Fri, Apr 20, 2018 at 08:25:13AM +0200, Kevin Wolf wrote:
> > Am 20.04.2018 um 05:02 hat Stefan Hajnoczi geschrieben:
> > > On Thu, Apr 19, 2018 at 10:05:47AM +0100, Dr. David Alan Gilbert wrote:
> > > > * Stefan Hajnoczi (stefanha@redhat.com) wrote:
> > > > > This commit is for debugging only.  Do not merge it.
> > > > > 
> > > > > mincore(2) checks whether pages are resident.  Use it to verify that
> > > > > page cache has been dropped.
> > > > > 
> > > > > You can trigger a verification failure by mmapping the image file from
> > > > > another process and loading a byte from a page so that it becomes
> > > > > resident.  bdrv_co_invalidate_cache() will fail while the process is
> > > > > alive.
> > > > 
> > > > It doesn't seem a bad diagnostic to keep in (with a switch to activate)
> > > > for when we're faced with some weird corruption on some weird storage
> > > > system.
> > > 
> > > Okay.  It's very slow to mmap an entire image file and query mincore(2)
> > > so it needs to be off by default.
> > 
> > Also, having it enabled breaks localhost migration at least on tmpfs
> > (which was what I tried out first).
> > 
> > I wonder if the kernel would add some way to query whether the "advice"
> > was actually acted upon if we asked. Either with a new function that
> > returns an error if not everything is dropped (basically
> > .bdrv_invalidate_cache on the kernel level), or a function that just
> > queries if any page is allocated (or maybe the address of the first
> > allocated page in a given range) without having to use mincore() and
> > iterating over all the pages in userspace.
> 
> I'm trying to figure out how to expose the optional mincore check on the
> command-line/QMP:
> 
> 1. Add a check_consistency bool argument to bdrv_invalidate_cache*().
>    Add command-line/QMP option to -incoming and migrate_incoming.  This
>    is messy and won't be easy to access for libvirt users.
> 
> 2. Add a BlockdevOptionsFile *check-cache-consistency bool field.  This
>    is specified at .bdrv_open() time.  It can be changed at runtime with
>    .bdrv_reopen*().
> 
> 3. Add a 'blockdev-check-cache-consistency' QMP command that calls a new
>    .bdrv_check_cache_consistency() callback that is implemented by
>    file-posix.c.  The problem is users might issue this command after
>    I/O has resumed and pages have become resident again.  It only makes
>    sense if the guest is still paused.  Probably a bad interface...
> 
> Have I missed a good way to expose this optional check functionality?
> 
> Which approach do you prefer?  I'm leaning towards #2.

Yes, I think that makes the most sense.

In its current form, this can probably only be a debugging feature,
though, so maybe x-check-cache-consistency? I also don't think libvirt
should mess with this.

Kevin
Stefan Hajnoczi April 27, 2018, 10:06 a.m. UTC | #6
On Tue, Apr 24, 2018 at 04:29:15PM +0200, Kevin Wolf wrote:
> Am 24.04.2018 um 16:04 hat Stefan Hajnoczi geschrieben:
> > On Fri, Apr 20, 2018 at 08:25:13AM +0200, Kevin Wolf wrote:
> > > Am 20.04.2018 um 05:02 hat Stefan Hajnoczi geschrieben:
> > > > On Thu, Apr 19, 2018 at 10:05:47AM +0100, Dr. David Alan Gilbert wrote:
> > > > > * Stefan Hajnoczi (stefanha@redhat.com) wrote:
> > > > > > This commit is for debugging only.  Do not merge it.
> > > > > > 
> > > > > > mincore(2) checks whether pages are resident.  Use it to verify that
> > > > > > page cache has been dropped.
> > > > > > 
> > > > > > You can trigger a verification failure by mmapping the image file from
> > > > > > another process and loading a byte from a page so that it becomes
> > > > > > resident.  bdrv_co_invalidate_cache() will fail while the process is
> > > > > > alive.
> > > > > 
> > > > > It doesn't seem a bad diagnostic to keep in (with a switch to activate)
> > > > > for when we're faced with some weird corruption on some weird storage
> > > > > system.
> > > > 
> > > > Okay.  It's very slow to mmap an entire image file and query mincore(2)
> > > > so it needs to be off by default.
> > > 
> > > Also, having it enabled breaks localhost migration at least on tmpfs
> > > (which was what I tried out first).
> > > 
> > > I wonder if the kernel would add some way to query whether the "advice"
> > > was actually acted upon if we asked. Either with a new function that
> > > returns an error if not everything is dropped (basically
> > > .bdrv_invalidate_cache on the kernel level), or a function that just
> > > queries if any page is allocated (or maybe the address of the first
> > > allocated page in a given range) without having to use mincore() and
> > > iterating over all the pages in userspace.
> > 
> > I'm trying to figure out how to expose the optional mincore check on the
> > command-line/QMP:
> > 
> > 1. Add a check_consistency bool argument to bdrv_invalidate_cache*().
> >    Add command-line/QMP option to -incoming and migrate_incoming.  This
> >    is messy and won't be easy to access for libvirt users.
> > 
> > 2. Add a BlockdevOptionsFile *check-cache-consistency bool field.  This
> >    is specified at .bdrv_open() time.  It can be changed at runtime with
> >    .bdrv_reopen*().
> > 
> > 3. Add a 'blockdev-check-cache-consistency' QMP command that calls a new
> >    .bdrv_check_cache_consistency() callback that is implemented by
> >    file-posix.c.  The problem is users might issue this command after
> >    I/O has resumed and pages have become resident again.  It only makes
> >    sense if the guest is still paused.  Probably a bad interface...
> > 
> > Have I missed a good way to expose this optional check functionality?
> > 
> > Which approach do you prefer?  I'm leaning towards #2.
> 
> Yes, I think that makes the most sense.
> 
> In its current form, this can probably only be a debugging feature,
> though, so maybe x-check-cache-consistency? I also don't think libvirt
> should mess with this.

Cool, I will implement this.

Stefan
diff mbox series

Patch

diff --git a/block/file-posix.c b/block/file-posix.c
index df4f52919f..d3105269c6 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2236,6 +2236,75 @@  static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
     return ret | BDRV_BLOCK_OFFSET_VALID;
 }
 
+static bool is_mincore(void *addr, size_t length)
+{
+    size_t vec_len = DIV_ROUND_UP(length, sysconf(_SC_PAGESIZE));
+    unsigned char *vec;
+    size_t i;
+    int ret;
+    bool incore = false;
+
+    vec = g_malloc(vec_len);
+    ret = mincore(addr, length, vec);
+    if (ret < 0) {
+        incore = true;
+        goto out;
+    }
+
+    for (i = 0; i < vec_len; i++) {
+        if (vec[i] & 0x1) {
+            incore = true;
+            break;
+        }
+    }
+
+out:
+    g_free(vec);
+    return incore;
+}
+
+static void check_not_in_page_cache(BlockDriverState *bs, Error **errp)
+{
+    const size_t WINDOW_SIZE = 128 * 1024 * 1024;
+    BDRVRawState *s = bs->opaque;
+    void *window = NULL;
+    size_t length = 0;
+    off_t end;
+    off_t offset;
+
+    end = raw_getlength(bs);
+
+    for (offset = 0; offset < end; offset += WINDOW_SIZE) {
+        void *new_window;
+        size_t new_length = MIN(end - offset, WINDOW_SIZE);
+
+        if (new_length != length) {
+            munmap(window, length);
+            window = NULL;
+            length = 0;
+        }
+
+        new_window = mmap(window, new_length, PROT_NONE, MAP_PRIVATE,
+                          s->fd, offset);
+        if (new_window == MAP_FAILED) {
+            error_setg_errno(errp, errno, "mmap failed");
+            break;
+        }
+
+        window = new_window;
+        length = new_length;
+
+        if (is_mincore(window, length)) {
+            error_setg(errp, "page cache still in use!");
+            break;
+        }
+    }
+
+    if (window) {
+        munmap(window, length);
+    }
+}
+
 static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
                                                  Error **errp)
 {
@@ -2270,6 +2339,8 @@  static void coroutine_fn raw_co_invalidate_cache(BlockDriverState *bs,
         return;
     }
 #endif /* __linux__ */
+
+    check_not_in_page_cache(bs, errp);
 }
 
 static coroutine_fn BlockAIOCB *raw_aio_pdiscard(BlockDriverState *bs,