diff mbox

[RFC,14/17] block: support FALLOC_FL_PUNCH_HOLE trimming

Message ID 1331226917-6658-15-git-send-email-pbonzini@redhat.com
State New
Headers show

Commit Message

Paolo Bonzini March 8, 2012, 5:15 p.m. UTC
This will enable discard on more filesystems, including ext4.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 block/raw-posix.c |   29 ++++++++++++++++++++++++-----
 1 files changed, 24 insertions(+), 5 deletions(-)

Comments

Chris Wedgwood March 9, 2012, 8:20 a.m. UTC | #1
> This will enable discard on more filesystems, including ext4.

i think this should default to off


for preallocated files this will punch holes all over the place and
reallocation will cause fragmentation (guests will free and reallocat
blocks normally)

xfs has a mechanism to convert to unwritten extents that would avoid
that, but it's not implemented in other filesystems
Paolo Bonzini March 9, 2012, 8:31 a.m. UTC | #2
Il 09/03/2012 09:20, Chris Wedgwood ha scritto:
> for preallocated files this will punch holes all over the place and
> reallocation will cause fragmentation (guests will free and reallocat
> blocks normally)

SEEK_HOLE could provide a very simple heuristic to detect preallocated
files (unfortunately ext4 does not implement SEEK_HOLE yet).

> xfs has a mechanism to convert to unwritten extents that would avoid
> that, but it's not implemented in other filesystems

Interesting, thanks!

Paolo
Chris Wedgwood March 9, 2012, 8:35 a.m. UTC | #3
> SEEK_HOLE could provide a very simple heuristic to detect preallocated
> files (unfortunately ext4 does not implement SEEK_HOLE yet).

SEEK_HOLE is a weird (confusing and no intuitive) API (IMO).  There is
FIEMAP or whatever it's called which seems somewhat saner.

XFS specific there is also GETBMAP[X].


Simplest still compare the blocks allocated by the file
to it's length (ie. stat.st_blocks != stat.st_size>>9).
Paolo Bonzini March 9, 2012, 8:40 a.m. UTC | #4
Il 09/03/2012 09:35, Chris Wedgwood ha scritto:
>> SEEK_HOLE could provide a very simple heuristic to detect preallocated
>> files (unfortunately ext4 does not implement SEEK_HOLE yet).
> 
> SEEK_HOLE is a weird (confusing and no intuitive) API (IMO).  There is
> FIEMAP or whatever it's called which seems somewhat saner.

FIEMAP is a powertool, we need a butter knife only (cit.).

> Simplest still compare the blocks allocated by the file
> to it's length (ie. stat.st_blocks != stat.st_size>>9).

But yes, you're right about this.

Paolo
Stefan Hajnoczi March 9, 2012, 10:31 a.m. UTC | #5
On Thu, Mar 8, 2012 at 5:15 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> This will enable discard on more filesystems, including ext4.
>
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
>  block/raw-posix.c |   29 ++++++++++++++++++++++++-----
>  1 files changed, 24 insertions(+), 5 deletions(-)
>
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index 2d1bc13..f23d92b 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -52,6 +52,7 @@
>  #include <sys/param.h>
>  #include <linux/cdrom.h>
>  #include <linux/fd.h>
> +#include <linux/falloc.h>
>  #endif
>  #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
>  #include <sys/disk.h>
> @@ -131,6 +132,7 @@ typedef struct BDRVRawState {
>  #endif
>     uint8_t *aligned_buf;
>     unsigned aligned_buf_size;
> +    bool has_discard : 1;
>  #ifdef CONFIG_XFS
>     bool is_xfs : 1;
>  #endif
> @@ -257,11 +259,9 @@ static int raw_open_common(BlockDriverState *bs, const char *filename,
>     }
>
>  #ifdef CONFIG_XFS
> -    if (platform_test_xfs_fd(s->fd)) {
> -        s->is_xfs = 1;
> -    }
> +    s->is_xfs = platform_test_xfs_fd(s->fd);
>  #endif
> -
> +    s->has_discard = 1;
>     return 0;
>
>  out_free_buf:
> @@ -605,15 +605,34 @@ static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors)
>  static coroutine_fn int raw_co_discard(BlockDriverState *bs,
>     int64_t sector_num, int nb_sectors)
>  {
> -#ifdef CONFIG_XFS
>     BDRVRawState *s = bs->opaque;
> +    int retval;
>
> +    if (s->has_discard == 0) {
> +        return -ENOTSUP;
> +    }
> +#ifdef CONFIG_XFS
>     if (s->is_xfs) {
>         return xfs_discard(s, sector_num, nb_sectors);
>     }
>  #endif
>
> +#ifdef FALLOC_FL_PUNCH_HOLE
> +    retval = fallocate(s->fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
> +                       sector_num << 9, (int64_t)nb_sectors << 9);

I'm concerned about introducing blocking syscalls in coroutine code
paths.  This needs to be done asynchronously.

Stefan
Paolo Bonzini March 9, 2012, 10:43 a.m. UTC | #6
Il 09/03/2012 11:31, Stefan Hajnoczi ha scritto:
>> > +#ifdef FALLOC_FL_PUNCH_HOLE
>> > +    retval = fallocate(s->fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
>> > +                       sector_num << 9, (int64_t)nb_sectors << 9);
> I'm concerned about introducing blocking syscalls in coroutine code
> paths.  This needs to be done asynchronously.

Right; it is no worse than what is already there, except that XFS could
use paio_ioctl.  Alternatives are:

1) require a new-enough kernel and only use fallocate; return a NULL
aiocb if !has_discard and convert it to ENOTSUP.

2) extract now from my threads branch the work to generalize
posix-aio-compat into a more flexible threadpool, and move the AIO code
back to block/raw-posix.c.

Paolo
Stefan Hajnoczi March 9, 2012, 10:53 a.m. UTC | #7
On Fri, Mar 9, 2012 at 10:43 AM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> Il 09/03/2012 11:31, Stefan Hajnoczi ha scritto:
>>> > +#ifdef FALLOC_FL_PUNCH_HOLE
>>> > +    retval = fallocate(s->fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
>>> > +                       sector_num << 9, (int64_t)nb_sectors << 9);
>> I'm concerned about introducing blocking syscalls in coroutine code
>> paths.  This needs to be done asynchronously.
>
> Right; it is no worse than what is already there, except that XFS could
> use paio_ioctl.  Alternatives are:

It is worse in the sense that if you make this feature more widely
available then it deserves a production-quality implementation.

> 1) require a new-enough kernel and only use fallocate; return a NULL
> aiocb if !has_discard and convert it to ENOTSUP.
>
> 2) extract now from my threads branch the work to generalize
> posix-aio-compat into a more flexible threadpool, and move the AIO code
> back to block/raw-posix.c.

3) Add discard to posix-aio-compat.c as an AIO request?

Stefan
Paolo Bonzini March 9, 2012, 10:57 a.m. UTC | #8
Il 09/03/2012 11:53, Stefan Hajnoczi ha scritto:
> 
>> > 1) require a new-enough kernel and only use fallocate; return a NULL
>> > aiocb if !has_discard and convert it to ENOTSUP.
>> >
>> > 2) extract now from my threads branch the work to generalize
>> > posix-aio-compat into a more flexible threadpool, and move the AIO code
>> > back to block/raw-posix.c.
> 3) Add discard to posix-aio-compat.c as an AIO request?

That would be (1) or you would need to pass down is_xfs.

Paolo
Richard Laager March 9, 2012, 8:36 p.m. UTC | #9
I was just working on this as well, though your implementation is *far*
more complete than mine. (I was only looking at making changes to the
discard implementation in block/raw-posix.c.)

I've got several comments, which I've separated by logical topic...


-------- BLKDISCARD --------

fallocate() only supports files. In my patch, I was fstat()ing the fd
just after opening it and caching an is_device boolean. Then, when doing
discards, if !is_device, call fallocate(), else (i.e. for devices) do
the following (note: untested code):

        __uint64_t range[2];

        range[0] = sector_num << 9;
        range[1] = (__uint64_t)nb_sectors << 9;

        if (ioctl(s->fd, BLKDISCARD, &range) && errno != EOPNOTSUPP) {
             return -errno;
        }
        return 0;

Note that for BLKDISCARD, you probably do NOT want to clear has_discard
if you get EOPNOTSUPP. For example, if you have LVM across multiple
disks where some support discard and some do not, you'll get EOPNOTSUPP
for some discard requests, but not all. ext4 had a behavior where it
would stop issuing discards after one failed, but the LVM guys heavily
criticized that and asked them to get rid of that behavior. (I haven't
checked to make sure it actually happened.)

I'd imagine it's still fine to stop trying fallocate() hole punches
after the first failure; I'm not aware of any filesystem where discards
would be supported in only some ranges of a single file, nor would that
make much sense.


-------- sync requirements --------

I'm not sure if fallocate() and/or BLKDISCARD always guarantee that the
discard has made it to stable storage. If they don't, does O_DIRECT or
O_DSYNC on open() cause them to make any such guarantee? If not, should
you be calling fdatasync() or fsync() when the user has specified
cache=direct or cache=writethrough?

Note that the Illumos implementation (see below) has a flag to ask for
either behavior.


-------- non-Linux Implementations --------

FreeBSD and Illumos have ioctl()s similar to BLKDISCARD and I have
similar untested code for those as well:

/* FreeBSD */
#ifdef DIOCGDELETE
    if (s->is_device) {
        off_t range[2];

        range[0] = sector_num << 9;
        range[1] = (off_t)nb_sectors << 9;

        if (ioctl(s->fd, DIOCGDELETE, &range) != 0 && errno != ENOIOCTL)
        {
            return -errno;
        }
        return 0;
    }
#endif


/* Illumos */
#ifdef DKIOCFREE
    if (s->is_device) {
        dkioc_free_t df;

        /* TODO: Is this correct? Are the other discard ioctl()s sync or async? */
        df.df_flags = (s->open_flags & (O_DIRECT | O_DSYNC)) ?
            DF_WAIT_SYNC : 0;
        df.df_start = sector_num << 9;
        df.df_length = (diskaddr_t)nb_sectors << 9;

        if (ioctl(s->fd, DKIOCFREE, &range)) != 0 && errno != ENOTTY)
        {
            return -errno;
        }
        return 0;
    }
#endif



-------- Thin vs. Thick Provisioning --------

On Fri, 2012-03-09 at 00:35 -0800, Chris Wedgwood wrote:
> Simplest still compare the blocks allocated by the file
> to it's length (ie. stat.st_blocks != stat.st_size>>9).

I thought of this as well. It covers "99%" of the cases, but there's one
case where it breaks down. Imagine I have a sparse file backing my
virtual disk. In the guest, I fill the virtual disk 100%. Then, I
restart QEMU. Now it thinks that sparse file is non-sparse and stops
issuing hole punches.

To be completely correct, I suggest the following behavior:
     1. Add a discard boolean option to the disk layer.
     2. If discard is not specified:
              * For files, detect a true/false value by comparing
                stat.st_blocks != stat.st_size>>9.
              * For devices, assume a fixed value (true?).
     3. If discard is true, issue discards.
     4. If discard is false, do not issue discards.


-------- CONFIG_XFS --------

XFS has implemented the FALLOC_FL_PUNCH_HOLE interface since it was
created. So unless you're concerned about people compiling QEMU on newer
systems with FALLOC_FL_PUNCH_HOLE and then running that binary on older
kernels, there's no case where one needs both the CONFIG_XFS code and
FALLOC_FL_PUNCH_HOLE code.
Paolo Bonzini March 12, 2012, 9:34 a.m. UTC | #10
Il 09/03/2012 21:36, Richard Laager ha scritto:
> fallocate() only supports files. In my patch, I was fstat()ing the fd
> just after opening it and caching an is_device boolean. Then, when doing
> discards, if !is_device, call fallocate(), else (i.e. for devices) do
> the following (note: untested code):
> 
>         __uint64_t range[2];
> 
>         range[0] = sector_num << 9;
>         range[1] = (__uint64_t)nb_sectors << 9;
> 
>         if (ioctl(s->fd, BLKDISCARD, &range) && errno != EOPNOTSUPP) {
>              return -errno;
>         }
>         return 0;

You can instead put this code in a separate function hdev_discard.  hdev
device is used instead of file when the backend is a special file (block
or character).

> -------- sync requirements --------
> 
> I'm not sure if fallocate() and/or BLKDISCARD always guarantee that the
> discard has made it to stable storage. If they don't, does O_DIRECT or
> O_DSYNC on open() cause them to make any such guarantee?

O_DIRECT shouldn't have any effect; for O_DSYNC I don't think so.

> -------- Thin vs. Thick Provisioning --------
> 
> On Fri, 2012-03-09 at 00:35 -0800, Chris Wedgwood wrote:
>> Simplest still compare the blocks allocated by the file
>> to it's length (ie. stat.st_blocks != stat.st_size>>9).
> 
> I thought of this as well. It covers "99%" of the cases, but there's one
> case where it breaks down. Imagine I have a sparse file backing my
> virtual disk. In the guest, I fill the virtual disk 100%. Then, I
> restart QEMU. Now it thinks that sparse file is non-sparse and stops
> issuing hole punches.

I would not really have any problem with that, it seems like a
relatively minor case (also because newer guests will allocate the first
partition at 1 MB, so you might still have a small hole at the beginning).

> To be completely correct, I suggest the following behavior:
>      1. Add a discard boolean option to the disk layer.
>      2. If discard is not specified:
>               * For files, detect a true/false value by comparing
>                 stat.st_blocks != stat.st_size>>9.
>               * For devices, assume a fixed value (true?).
>      3. If discard is true, issue discards.
>      4. If discard is false, do not issue discards.

The problem is, who will use this interface?

Paolo
Christoph Hellwig March 24, 2012, 3:40 p.m. UTC | #11
On Fri, Mar 09, 2012 at 02:36:50PM -0600, Richard Laager wrote:
> I'm not sure if fallocate() and/or BLKDISCARD always guarantee that the
> discard has made it to stable storage. If they don't, does O_DIRECT or
> O_DSYNC on open() cause them to make any such guarantee? If not, should
> you be calling fdatasync() or fsync() when the user has specified
> cache=direct or cache=writethrough?
> 
> Note that the Illumos implementation (see below) has a flag to ask for
> either behavior.

For fallocate the current Linux behaviour is that you need an fdatasync
or the O_DSYNC flag to guarantee that it makes it's way to disk.  For
XFS the history implementation of both XFS_IOC_FREESP and hole punching
was that it always made it to disk, and for all other filesystems that
historic behaviour was that O_DSYNC was ignored, and depending on the
fs a fdatasync might or might not be able to give your a guarantee either.

For BLKDISCARD you'd strictly speaking need to issue a cache flush via
fdatasync, too, but I'm not sure if any actual devices require that.

> On Fri, 2012-03-09 at 00:35 -0800, Chris Wedgwood wrote:
> > Simplest still compare the blocks allocated by the file
> > to it's length (ie. stat.st_blocks != stat.st_size>>9).
> 
> I thought of this as well. It covers "99%" of the cases, but there's one
> case where it breaks down. Imagine I have a sparse file backing my
> virtual disk. In the guest, I fill the virtual disk 100%. Then, I
> restart QEMU. Now it thinks that sparse file is non-sparse and stops
> issuing hole punches.

Note that due to speculative preallocation in the filesystem stat.st_blocks
can easily be bigger than stat.st_size>>9, so the above comparism should
at least be a < instead of !=.
diff mbox

Patch

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 2d1bc13..f23d92b 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -52,6 +52,7 @@ 
 #include <sys/param.h>
 #include <linux/cdrom.h>
 #include <linux/fd.h>
+#include <linux/falloc.h>
 #endif
 #if defined (__FreeBSD__) || defined(__FreeBSD_kernel__)
 #include <sys/disk.h>
@@ -131,6 +132,7 @@  typedef struct BDRVRawState {
 #endif
     uint8_t *aligned_buf;
     unsigned aligned_buf_size;
+    bool has_discard : 1;
 #ifdef CONFIG_XFS
     bool is_xfs : 1;
 #endif
@@ -257,11 +259,9 @@  static int raw_open_common(BlockDriverState *bs, const char *filename,
     }
 
 #ifdef CONFIG_XFS
-    if (platform_test_xfs_fd(s->fd)) {
-        s->is_xfs = 1;
-    }
+    s->is_xfs = platform_test_xfs_fd(s->fd);
 #endif
-
+    s->has_discard = 1;
     return 0;
 
 out_free_buf:
@@ -605,15 +605,34 @@  static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors)
 static coroutine_fn int raw_co_discard(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors)
 {
-#ifdef CONFIG_XFS
     BDRVRawState *s = bs->opaque;
+    int retval;
 
+    if (s->has_discard == 0) {
+        return -ENOTSUP;
+    }
+#ifdef CONFIG_XFS
     if (s->is_xfs) {
         return xfs_discard(s, sector_num, nb_sectors);
     }
 #endif
 
+#ifdef FALLOC_FL_PUNCH_HOLE
+    retval = fallocate(s->fd, FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE,
+                       sector_num << 9, (int64_t)nb_sectors << 9);
+    if (retval == -1) {
+        if (errno == ENODEV || errno == ENOSYS || errno == EOPNOTSUPP) {
+            s->has_discard = 0;
+            return -ENOTSUP;
+        } else {
+            return -errno;
+        }
+    }
     return 0;
+#else
+    s->has_discard = 0;
+    return -ENOTSUP;
+#endif
 }
 
 static QEMUOptionParameter raw_create_options[] = {