diff mbox

[v13,5/6] raw-posix: Add full preallocation option

Message ID 67d97abc587e7c6985166dbe800686938ac8adb5.1409299732.git.hutao@cn.fujitsu.com
State New
Headers show

Commit Message

Hu Tao Aug. 29, 2014, 8:33 a.m. UTC
This patch adds a new option preallocation for raw format, and implements
full preallocation.

Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
---
 block/raw-posix.c | 92 +++++++++++++++++++++++++++++++++++++++++++------------
 qemu-doc.texi     |  8 +++++
 qemu-img.texi     |  8 +++++
 3 files changed, 88 insertions(+), 20 deletions(-)

Comments

Richard W.M. Jones Aug. 29, 2014, 8:48 a.m. UTC | #1
On Fri, Aug 29, 2014 at 04:33:12PM +0800, Hu Tao wrote:
> +    if (prealloc == PREALLOC_MODE_FULL) {
> +        /* posix_fallocate() doesn't set errno. */
> +        result = -posix_fallocate(fd, 0, total_size);
> +        if (result != 0) {

Is it better to test:

  result != ENOSYS && result != EOPNOTSUPP

here?

I think this is definitely the right approach.

Rich.
Max Reitz Sept. 2, 2014, 9:45 p.m. UTC | #2
On 29.08.2014 10:33, Hu Tao wrote:
> This patch adds a new option preallocation for raw format, and implements
> full preallocation.
>
> Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
> ---
>   block/raw-posix.c | 92 +++++++++++++++++++++++++++++++++++++++++++------------
>   qemu-doc.texi     |  8 +++++
>   qemu-img.texi     |  8 +++++
>   3 files changed, 88 insertions(+), 20 deletions(-)
>
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index abe0759..25f66f2 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -30,6 +30,7 @@
>   #include "block/thread-pool.h"
>   #include "qemu/iov.h"
>   #include "raw-aio.h"
> +#include "qapi/util.h"
>   
>   #if defined(__APPLE__) && (__MACH__)
>   #include <paths.h>
> @@ -1365,6 +1366,9 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
>       int result = 0;
>       int64_t total_size = 0;
>       bool nocow = false;
> +    PreallocMode prealloc = PREALLOC_MODE_OFF;
> +    char *buf = NULL;
> +    Error *local_err = NULL;
>   
>       strstart(filename, "file:", &filename);
>   
> @@ -1372,37 +1376,80 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
>       total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
>                             BDRV_SECTOR_SIZE);
>       nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
> +    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
> +    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
> +                               PREALLOC_MODE_MAX, PREALLOC_MODE_OFF,
> +                               &local_err);
> +    g_free(buf);
> +    if (local_err) {
> +        error_propagate(errp, local_err);
> +        result = -EINVAL;
> +        goto out;
> +    }
>   
>       fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
>                      0644);
>       if (fd < 0) {
>           result = -errno;
>           error_setg_errno(errp, -result, "Could not create file");
> -    } else {
> -        if (nocow) {
> +        goto out;
> +    }
> +
> +    if (nocow) {
>   #ifdef __linux__
> -            /* Set NOCOW flag to solve performance issue on fs like btrfs.
> -             * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
> -             * will be ignored since any failure of this operation should not
> -             * block the left work.
> -             */
> -            int attr;
> -            if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
> -                attr |= FS_NOCOW_FL;
> -                ioctl(fd, FS_IOC_SETFLAGS, &attr);
> -            }
> -#endif
> +        /* Set NOCOW flag to solve performance issue on fs like btrfs.
> +         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
> +         * will be ignored since any failure of this operation should not
> +         * block the left work.
> +         */
> +        int attr;
> +        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
> +            attr |= FS_NOCOW_FL;
> +            ioctl(fd, FS_IOC_SETFLAGS, &attr);
>           }
> +#endif
> +    }
>   
> -        if (ftruncate(fd, total_size) != 0) {
> -            result = -errno;
> -            error_setg_errno(errp, -result, "Could not resize file");
> -        }
> -        if (qemu_close(fd) != 0) {
> -            result = -errno;
> -            error_setg_errno(errp, -result, "Could not close the new file");
> +    if (ftruncate(fd, total_size) != 0) {
> +        result = -errno;
> +        error_setg_errno(errp, -result, "Could not resize file");
> +        goto out_close;
> +    }
> +
> +    if (prealloc == PREALLOC_MODE_FULL) {
> +        /* posix_fallocate() doesn't set errno. */
> +        result = -posix_fallocate(fd, 0, total_size);
> +        if (result != 0) {
> +            buf = g_malloc0(65536);
> +            int64_t num = 0, left = total_size;
> +
> +            while (left > 0) {
> +                num = MIN(left, 65536);
> +                result = write(fd, buf, num);
> +                if (result < 0) {
> +                    result = -errno;
> +                    error_setg_errno(errp, -result,
> +                                     "Could not write to the new file");
> +                    g_free(buf);
> +                    goto out_close;
> +                }
> +                left -= num;
> +            }
> +            fsync(fd);
> +            g_free(buf);
>           }
> +    } else if (prealloc != PREALLOC_MODE_OFF) {
> +        result = -1;

As for qcow2 in patch 4, I'd prefer -EINVAL.

> +        error_setg(errp, "Unsupported preallocation mode: %s",
> +                   PreallocMode_lookup[prealloc]);
> +    }
> +
> +out_close:
> +    if (qemu_close(fd) != 0 && result == 0) {
> +        result = -errno;
> +        error_setg_errno(errp, -result, "Could not close the new file");
>       }
> +out:
>       return result;
>   }
>   
> @@ -1585,6 +1632,11 @@ static QemuOptsList raw_create_opts = {
>               .type = QEMU_OPT_BOOL,
>               .help = "Turn off copy-on-write (valid only on btrfs)"
>           },
> +        {
> +            .name = BLOCK_OPT_PREALLOC,
> +            .type = QEMU_OPT_STRING,
> +            .help = "Preallocation mode (allowed values: off, full)"
> +        },
>           { /* end of list */ }
>       }
>   };
> diff --git a/qemu-doc.texi b/qemu-doc.texi
> index 2b232ae..2637765 100644
> --- a/qemu-doc.texi
> +++ b/qemu-doc.texi
> @@ -527,6 +527,14 @@ Linux or NTFS on Windows), then only the written sectors will reserve
>   space. Use @code{qemu-img info} to know the real size used by the
>   image or @code{ls -ls} on Unix/Linux.
>   
> +Supported options:
> +@table @code
> +@item preallocation
> +Preallocation mode(allowed values: @code{off}, @code{full}). An image is

Missing space in front of the opening bracket.

> +fully preallocated by calling posix_fallocate() if it's available, or by

Hm, I personally am not so happy about contractions ("it's") in 
persistent documentation (even source code comments). Although I know 
there are already some of them in qemu-doc.texi, I'd rather avoid them. 
But I'll leave this up to you as I'm no native speaker.

> +writing zeros to underlying storage.
> +@end table
> +
>   @item qcow2
>   QEMU image format, the most versatile format. Use it to have smaller
>   images (useful if your filesystem does not supports holes, for example
> diff --git a/qemu-img.texi b/qemu-img.texi
> index cb68948..063ec61 100644
> --- a/qemu-img.texi
> +++ b/qemu-img.texi
> @@ -418,6 +418,14 @@ Linux or NTFS on Windows), then only the written sectors will reserve
>   space. Use @code{qemu-img info} to know the real size used by the
>   image or @code{ls -ls} on Unix/Linux.
>   
> +Supported options:
> +@table @code
> +@item preallocation
> +Preallocation mode(allowed values: @code{off}, @code{full}). An image is
> +fully preallocated by calling posix_fallocate() if it's available, or by
> +writing zeros to underlying storage.
> +@end table
> +

Same as for qemu-doc.texi.

However, these are all minor, so with "result = -EINVAL" and the missing 
space added before the brackets:

Reviewed-by: Max Reitz <mreitz@redhat.com>

>   @item qcow2
>   QEMU image format, the most versatile format. Use it to have smaller
>   images (useful if your filesystem does not supports holes, for example
Hu Tao Sept. 3, 2014, 1:26 a.m. UTC | #3
On Fri, Aug 29, 2014 at 09:48:01AM +0100, Richard W.M. Jones wrote:
> On Fri, Aug 29, 2014 at 04:33:12PM +0800, Hu Tao wrote:
> > +    if (prealloc == PREALLOC_MODE_FULL) {
> > +        /* posix_fallocate() doesn't set errno. */
> > +        result = -posix_fallocate(fd, 0, total_size);
> > +        if (result != 0) {
> 
> Is it better to test:
> 
>   result != ENOSYS && result != EOPNOTSUPP
> 
> here?

posix_fallocate() doesn't return ENOSYS or EOPNOTSUPP. All the errors
returned by posix_fallocate() apply to writing zeros, too. that is, if
posix_fallocate() returns an error, we should not do writing zeros,
neither.

I'm wondering what is the right way to test if posix_fallocate() is
supported, something like AC_CHECK_FUNC? how?

Regards,
Hu

> 
> I think this is definitely the right approach.
> 
> Rich.
> 
> 
> -- 
> Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
> Read my programming and virtualization blog: http://rwmj.wordpress.com
> virt-p2v converts physical machines to virtual machines.  Boot with a
> live CD or over the network (PXE) and turn machines into KVM guests.
> http://libguestfs.org/virt-v2v
Hu Tao Sept. 3, 2014, 1:55 a.m. UTC | #4
On Tue, Sep 02, 2014 at 11:45:38PM +0200, Max Reitz wrote:
> On 29.08.2014 10:33, Hu Tao wrote:
> >This patch adds a new option preallocation for raw format, and implements
> >full preallocation.
> >
> >Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
> >---
> >  block/raw-posix.c | 92 +++++++++++++++++++++++++++++++++++++++++++------------
> >  qemu-doc.texi     |  8 +++++
> >  qemu-img.texi     |  8 +++++
> >  3 files changed, 88 insertions(+), 20 deletions(-)
> >
> >diff --git a/block/raw-posix.c b/block/raw-posix.c
> >index abe0759..25f66f2 100644
> >--- a/block/raw-posix.c
> >+++ b/block/raw-posix.c
> >@@ -30,6 +30,7 @@
> >  #include "block/thread-pool.h"
> >  #include "qemu/iov.h"
> >  #include "raw-aio.h"
> >+#include "qapi/util.h"
> >  #if defined(__APPLE__) && (__MACH__)
> >  #include <paths.h>
> >@@ -1365,6 +1366,9 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
> >      int result = 0;
> >      int64_t total_size = 0;
> >      bool nocow = false;
> >+    PreallocMode prealloc = PREALLOC_MODE_OFF;
> >+    char *buf = NULL;
> >+    Error *local_err = NULL;
> >      strstart(filename, "file:", &filename);
> >@@ -1372,37 +1376,80 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
> >      total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
> >                            BDRV_SECTOR_SIZE);
> >      nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
> >+    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
> >+    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
> >+                               PREALLOC_MODE_MAX, PREALLOC_MODE_OFF,
> >+                               &local_err);
> >+    g_free(buf);
> >+    if (local_err) {
> >+        error_propagate(errp, local_err);
> >+        result = -EINVAL;
> >+        goto out;
> >+    }
> >      fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
> >                     0644);
> >      if (fd < 0) {
> >          result = -errno;
> >          error_setg_errno(errp, -result, "Could not create file");
> >-    } else {
> >-        if (nocow) {
> >+        goto out;
> >+    }
> >+
> >+    if (nocow) {
> >  #ifdef __linux__
> >-            /* Set NOCOW flag to solve performance issue on fs like btrfs.
> >-             * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
> >-             * will be ignored since any failure of this operation should not
> >-             * block the left work.
> >-             */
> >-            int attr;
> >-            if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
> >-                attr |= FS_NOCOW_FL;
> >-                ioctl(fd, FS_IOC_SETFLAGS, &attr);
> >-            }
> >-#endif
> >+        /* Set NOCOW flag to solve performance issue on fs like btrfs.
> >+         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
> >+         * will be ignored since any failure of this operation should not
> >+         * block the left work.
> >+         */
> >+        int attr;
> >+        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
> >+            attr |= FS_NOCOW_FL;
> >+            ioctl(fd, FS_IOC_SETFLAGS, &attr);
> >          }
> >+#endif
> >+    }
> >-        if (ftruncate(fd, total_size) != 0) {
> >-            result = -errno;
> >-            error_setg_errno(errp, -result, "Could not resize file");
> >-        }
> >-        if (qemu_close(fd) != 0) {
> >-            result = -errno;
> >-            error_setg_errno(errp, -result, "Could not close the new file");
> >+    if (ftruncate(fd, total_size) != 0) {
> >+        result = -errno;
> >+        error_setg_errno(errp, -result, "Could not resize file");
> >+        goto out_close;
> >+    }
> >+
> >+    if (prealloc == PREALLOC_MODE_FULL) {
> >+        /* posix_fallocate() doesn't set errno. */
> >+        result = -posix_fallocate(fd, 0, total_size);
> >+        if (result != 0) {
> >+            buf = g_malloc0(65536);
> >+            int64_t num = 0, left = total_size;
> >+
> >+            while (left > 0) {
> >+                num = MIN(left, 65536);
> >+                result = write(fd, buf, num);
> >+                if (result < 0) {
> >+                    result = -errno;
> >+                    error_setg_errno(errp, -result,
> >+                                     "Could not write to the new file");
> >+                    g_free(buf);
> >+                    goto out_close;
> >+                }
> >+                left -= num;
> >+            }
> >+            fsync(fd);
> >+            g_free(buf);
> >          }
> >+    } else if (prealloc != PREALLOC_MODE_OFF) {
> >+        result = -1;
> 
> As for qcow2 in patch 4, I'd prefer -EINVAL.

Okay.

> 
> >+        error_setg(errp, "Unsupported preallocation mode: %s",
> >+                   PreallocMode_lookup[prealloc]);
> >+    }
> >+
> >+out_close:
> >+    if (qemu_close(fd) != 0 && result == 0) {
> >+        result = -errno;
> >+        error_setg_errno(errp, -result, "Could not close the new file");
> >      }
> >+out:
> >      return result;
> >  }
> >@@ -1585,6 +1632,11 @@ static QemuOptsList raw_create_opts = {
> >              .type = QEMU_OPT_BOOL,
> >              .help = "Turn off copy-on-write (valid only on btrfs)"
> >          },
> >+        {
> >+            .name = BLOCK_OPT_PREALLOC,
> >+            .type = QEMU_OPT_STRING,
> >+            .help = "Preallocation mode (allowed values: off, full)"
> >+        },
> >          { /* end of list */ }
> >      }
> >  };
> >diff --git a/qemu-doc.texi b/qemu-doc.texi
> >index 2b232ae..2637765 100644
> >--- a/qemu-doc.texi
> >+++ b/qemu-doc.texi
> >@@ -527,6 +527,14 @@ Linux or NTFS on Windows), then only the written sectors will reserve
> >  space. Use @code{qemu-img info} to know the real size used by the
> >  image or @code{ls -ls} on Unix/Linux.
> >+Supported options:
> >+@table @code
> >+@item preallocation
> >+Preallocation mode(allowed values: @code{off}, @code{full}). An image is
> 
> Missing space in front of the opening bracket.

Okay. I checked that in other places of opening bracket there is a space
before them.

> 
> >+fully preallocated by calling posix_fallocate() if it's available, or by
> 
> Hm, I personally am not so happy about contractions ("it's") in
> persistent documentation (even source code comments). Although I
> know there are already some of them in qemu-doc.texi, I'd rather
> avoid them. But I'll leave this up to you as I'm no native speaker.

I'm not, either. I don't know which one in "it's" and "it is" is more
common, but I can change the contraction if it makes you feel better:)

Regards,
Hu

> 
> >+writing zeros to underlying storage.
> >+@end table
> >+
> >  @item qcow2
> >  QEMU image format, the most versatile format. Use it to have smaller
> >  images (useful if your filesystem does not supports holes, for example
> >diff --git a/qemu-img.texi b/qemu-img.texi
> >index cb68948..063ec61 100644
> >--- a/qemu-img.texi
> >+++ b/qemu-img.texi
> >@@ -418,6 +418,14 @@ Linux or NTFS on Windows), then only the written sectors will reserve
> >  space. Use @code{qemu-img info} to know the real size used by the
> >  image or @code{ls -ls} on Unix/Linux.
> >+Supported options:
> >+@table @code
> >+@item preallocation
> >+Preallocation mode(allowed values: @code{off}, @code{full}). An image is
> >+fully preallocated by calling posix_fallocate() if it's available, or by
> >+writing zeros to underlying storage.
> >+@end table
> >+
> 
> Same as for qemu-doc.texi.
> 
> However, these are all minor, so with "result = -EINVAL" and the
> missing space added before the brackets:
> 
> Reviewed-by: Max Reitz <mreitz@redhat.com>
> 
> >  @item qcow2
> >  QEMU image format, the most versatile format. Use it to have smaller
> >  images (useful if your filesystem does not supports holes, for example
Hu Tao Sept. 4, 2014, 8:32 a.m. UTC | #5
On Fri, Aug 29, 2014 at 09:48:01AM +0100, Richard W.M. Jones wrote:
> On Fri, Aug 29, 2014 at 04:33:12PM +0800, Hu Tao wrote:
> > +    if (prealloc == PREALLOC_MODE_FULL) {
> > +        /* posix_fallocate() doesn't set errno. */
> > +        result = -posix_fallocate(fd, 0, total_size);
> > +        if (result != 0) {
> 
> Is it better to test:
> 
>   result != ENOSYS && result != EOPNOTSUPP
> 
> here?
> 
> I think this is definitely the right approach.

Hi Kevin,

How do you think this approach?

Regards,
Hu
Kevin Wolf Sept. 4, 2014, 12:35 p.m. UTC | #6
Am 29.08.2014 um 10:33 hat Hu Tao geschrieben:
> This patch adds a new option preallocation for raw format, and implements
> full preallocation.
> 
> Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>

v12 was better, it wasn't doing only metadata preallocation when you
told it to do full preallocation.

> +    if (prealloc == PREALLOC_MODE_FULL) {
> +        /* posix_fallocate() doesn't set errno. */
> +        result = -posix_fallocate(fd, 0, total_size);
> +        if (result != 0) {
> +            buf = g_malloc0(65536);
> +            int64_t num = 0, left = total_size;
> +
> +            while (left > 0) {
> +                num = MIN(left, 65536);
> +                result = write(fd, buf, num);
> +                if (result < 0) {
> +                    result = -errno;
> +                    error_setg_errno(errp, -result,
> +                                     "Could not write to the new file");
> +                    g_free(buf);
> +                    goto out_close;
> +                }
> +                left -= num;
> +            }
> +            fsync(fd);
> +            g_free(buf);
>          }

This is totally pointless. If the file system doesn't support fallocate,
posix_fallocate() will already try writing zeros.

Having code to write zeros in qemu only makes sense if you implement a
real full preallocation mode, which doesn't use fallocate even when it's
supported.

Please change the code to always write zeros for FULL, and either
reintroduce FALLOC or use METADATA for the fallocate (without a fallback
to zero writing code). In fact, for metadata preallocation we should
probably directly use fallocate(), which has no slow zero-write fallback
in the libc.

Kevin
Richard W.M. Jones Sept. 4, 2014, 12:45 p.m. UTC | #7
On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote:
> Please change the code to always write zeros for FULL,

How is this useful for anyone?  You don't know if the underlying SAN
is going to detect these zeroes or combine these blocks together.
It's just slow for no reason.

Rich.
Kevin Wolf Sept. 4, 2014, 12:52 p.m. UTC | #8
Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben:
> On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote:
> > Please change the code to always write zeros for FULL,
> 
> How is this useful for anyone?  You don't know if the underlying SAN
> is going to detect these zeroes or combine these blocks together.
> It's just slow for no reason.

It's slow for the reason that the user has requested it. Do you doubt
that users can know what their backend is doing? Why are you insisting
on providing only the functionality that you personally need?

I doubt it's a way to make many users happier, but if you insist, we can
leave full preallocation unsupported on raw-posix and allow only
metadata preallocation (which should still be fallocate() without a
zero-write fallback, so that you can rely on it being fast). But call it
by its name and don't say "full" when you only implement "metadata".

Kevin
Richard W.M. Jones Sept. 4, 2014, 1:07 p.m. UTC | #9
On Thu, Sep 04, 2014 at 02:52:57PM +0200, Kevin Wolf wrote:
> Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben:
> > On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote:
> > > Please change the code to always write zeros for FULL,
> > 
> > How is this useful for anyone?  You don't know if the underlying SAN
> > is going to detect these zeroes or combine these blocks together.
> > It's just slow for no reason.
> 
> It's slow for the reason that the user has requested it. Do you doubt
> that users can know what their backend is doing? Why are you insisting
> on providing only the functionality that you personally need?

I'm not!  I'm trying to make sure we don't end up with a qemu
interface which is useless for higher layers.  You're proposing
preallocation=full which will be slow but not actually give any
guarantees, or preallocation=meta which is going to be fast but may
not work, and I'm saying that's a dumb interface that's not useful.

Rich.
Daniel P. Berrangé Sept. 4, 2014, 1:13 p.m. UTC | #10
On Thu, Sep 04, 2014 at 02:07:14PM +0100, Richard W.M. Jones wrote:
> On Thu, Sep 04, 2014 at 02:52:57PM +0200, Kevin Wolf wrote:
> > Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben:
> > > On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote:
> > > > Please change the code to always write zeros for FULL,
> > > 
> > > How is this useful for anyone?  You don't know if the underlying SAN
> > > is going to detect these zeroes or combine these blocks together.
> > > It's just slow for no reason.
> > 
> > It's slow for the reason that the user has requested it. Do you doubt
> > that users can know what their backend is doing? Why are you insisting
> > on providing only the functionality that you personally need?
> 
> I'm not!  I'm trying to make sure we don't end up with a qemu
> interface which is useless for higher layers.  You're proposing
> preallocation=full which will be slow but not actually give any
> guarantees, or preallocation=meta which is going to be fast but may
> not work, and I'm saying that's a dumb interface that's not useful.

Agree, from my POV as an app maintainer, fallocate with automatically
fallback to zero'ing is really what we want to use. Having two separate
options which we have to explicitly try in turn, or only having the
slow option is really not what we want to use.

Regards,
Daniel
Kevin Wolf Sept. 4, 2014, 1:17 p.m. UTC | #11
Am 04.09.2014 um 15:07 hat Richard W.M. Jones geschrieben:
> On Thu, Sep 04, 2014 at 02:52:57PM +0200, Kevin Wolf wrote:
> > Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben:
> > > On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote:
> > > > Please change the code to always write zeros for FULL,
> > > 
> > > How is this useful for anyone?  You don't know if the underlying SAN
> > > is going to detect these zeroes or combine these blocks together.
> > > It's just slow for no reason.
> > 
> > It's slow for the reason that the user has requested it. Do you doubt
> > that users can know what their backend is doing? Why are you insisting
> > on providing only the functionality that you personally need?
> 
> I'm not!  I'm trying to make sure we don't end up with a qemu
> interface which is useless for higher layers.  You're proposing
> preallocation=full which will be slow but not actually give any
> guarantees, or preallocation=meta which is going to be fast but may
> not work, and I'm saying that's a dumb interface that's not useful.

So what you propose is an interface that combines both and may
unpredictably be slow or fast, and doesn't give any guarantees either.
Why would this be any better?

What is your specific use case of full preallocation that wants zero
writing, but only implicitly as a fallback? My expectation is that most
users want cheap preallocation if it's available, but don't bother to
write many gigabytes if it isn't.

Kevin
Richard W.M. Jones Sept. 4, 2014, 1:43 p.m. UTC | #12
On Thu, Sep 04, 2014 at 03:17:51PM +0200, Kevin Wolf wrote:
> Am 04.09.2014 um 15:07 hat Richard W.M. Jones geschrieben:
> > On Thu, Sep 04, 2014 at 02:52:57PM +0200, Kevin Wolf wrote:
> > > Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben:
> > > > On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote:
> > > > > Please change the code to always write zeros for FULL,
> > > > 
> > > > How is this useful for anyone?  You don't know if the underlying SAN
> > > > is going to detect these zeroes or combine these blocks together.
> > > > It's just slow for no reason.
> > > 
> > > It's slow for the reason that the user has requested it. Do you doubt
> > > that users can know what their backend is doing? Why are you insisting
> > > on providing only the functionality that you personally need?
> > 
> > I'm not!  I'm trying to make sure we don't end up with a qemu
> > interface which is useless for higher layers.  You're proposing
> > preallocation=full which will be slow but not actually give any
> > guarantees, or preallocation=meta which is going to be fast but may
> > not work, and I'm saying that's a dumb interface that's not useful.
> 
> So what you propose is an interface that combines both and may
> unpredictably be slow or fast, and doesn't give any guarantees either.
> Why would this be any better?
> 
> What is your specific use case of full preallocation that wants zero
> writing, but only implicitly as a fallback? My expectation is that most
> users want cheap preallocation if it's available, but don't bother to
> write many gigabytes if it isn't.

Stepping back, I think what we have are two general approaches:

 - do the exact thing I want

 - do the best effort you can

virt-manager, virt-install, virt-clone, libvirt, libguestfs, all
currently do preallocation with fallback to writing data (via
hand-written code).  The reason is that customers using simple disks
(not SANs etc) just want to be sure they're not going to run out of
space during OS installation.  For the vast majority of users,
posix_fallocate makes this fast, but people using ext2 or *BSD get the
slow-but-safe path.

Now it may be that some qemu users are going to want a very exact
preallocation mode, but that doesn't mean we can't have
preallocation=besteffort too.

And it's not just here.  qemu has other places where we'd like "do the
best thing", not "do the exact thing" ... off the top of my head:
selecting -cpu, discard support, O_DIRECT.  Libguestfs has to go
through hoops in these areas, often involving very hairy workarounds
which reduce reliability.  I'm not saying that more exact options
aren't also welcome, but doing the best effort is very useful too.

Rich.
Kevin Wolf Sept. 4, 2014, 3:23 p.m. UTC | #13
Am 04.09.2014 um 15:43 hat Richard W.M. Jones geschrieben:
> On Thu, Sep 04, 2014 at 03:17:51PM +0200, Kevin Wolf wrote:
> > Am 04.09.2014 um 15:07 hat Richard W.M. Jones geschrieben:
> > > On Thu, Sep 04, 2014 at 02:52:57PM +0200, Kevin Wolf wrote:
> > > > Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben:
> > > > > On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote:
> > > > > > Please change the code to always write zeros for FULL,
> > > > > 
> > > > > How is this useful for anyone?  You don't know if the underlying SAN
> > > > > is going to detect these zeroes or combine these blocks together.
> > > > > It's just slow for no reason.
> > > > 
> > > > It's slow for the reason that the user has requested it. Do you doubt
> > > > that users can know what their backend is doing? Why are you insisting
> > > > on providing only the functionality that you personally need?
> > > 
> > > I'm not!  I'm trying to make sure we don't end up with a qemu
> > > interface which is useless for higher layers.  You're proposing
> > > preallocation=full which will be slow but not actually give any
> > > guarantees, or preallocation=meta which is going to be fast but may
> > > not work, and I'm saying that's a dumb interface that's not useful.
> > 
> > So what you propose is an interface that combines both and may
> > unpredictably be slow or fast, and doesn't give any guarantees either.
> > Why would this be any better?
> > 
> > What is your specific use case of full preallocation that wants zero
> > writing, but only implicitly as a fallback? My expectation is that most
> > users want cheap preallocation if it's available, but don't bother to
> > write many gigabytes if it isn't.
> 
> Stepping back, I think what we have are two general approaches:
> 
>  - do the exact thing I want
> 
>  - do the best effort you can
>
> [...]
>
> And it's not just here.  qemu has other places where we'd like "do the
> best thing", not "do the exact thing" ... off the top of my head:
> selecting -cpu, discard support, O_DIRECT.  Libguestfs has to go
> through hoops in these areas, often involving very hairy workarounds
> which reduce reliability.  I'm not saying that more exact options
> aren't also welcome, but doing the best effort is very useful too.

The point is that different people have different ideas of what "do the
best thing" is. You may be surprised to learn that generally we don't
roll a dice to select our defaults, but do it based on what we think is
the best thing overall. And yes, because we have less information than
management tools, it often means that we have to take more conservative
defaults (works for everyone) than what you would want (all the features
that your shiny new guest can use).

> virt-manager, virt-install, virt-clone, libvirt, libguestfs, all
> currently do preallocation with fallback to writing data (via
> hand-written code).  The reason is that customers using simple disks
> (not SANs etc) just want to be sure they're not going to run out of
> space during OS installation.  For the vast majority of users,
> posix_fallocate makes this fast, but people using ext2 or *BSD get the
> slow-but-safe path.

Is this really a concern for many users? I think I never preallocated an
image to make sure it fits. If anything, I run df first, but generally I
assume that it works and if it doesn't, I check what happened.

Basically all users I saw asking for preallocation did it because they
were worried about performance and wanted to get rid of the overhead of
metadata updates during allocations.

> Now it may be that some qemu users are going to want a very exact
> preallocation mode, but that doesn't mean we can't have
> preallocation=besteffort too.

The definition of "besteffort" depends on what you want to achieve. It
is policy, and we generally try to keep policy out of qemu.

It isn't really clear that "make absolutely sure that you never get
-ENOSPC on the file system level, but in the layers below it's okay, and
the resulting performance doesn't matter" is the best thing for everyone.

Kevin
Richard W.M. Jones Sept. 4, 2014, 3:33 p.m. UTC | #14
On Thu, Sep 04, 2014 at 05:23:21PM +0200, Kevin Wolf wrote:
> The definition of "besteffort" depends on what you want to achieve. It
> is policy, and we generally try to keep policy out of qemu.

I think qemu *should* have a policy of make it work and don't fail - first -
and then offer a million knobs to optimize and customize things on top
of that.

But whatever ... what's one more thing that we have to provide fragile
workarounds for ...

Rich.
diff mbox

Patch

diff --git a/block/raw-posix.c b/block/raw-posix.c
index abe0759..25f66f2 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -30,6 +30,7 @@ 
 #include "block/thread-pool.h"
 #include "qemu/iov.h"
 #include "raw-aio.h"
+#include "qapi/util.h"
 
 #if defined(__APPLE__) && (__MACH__)
 #include <paths.h>
@@ -1365,6 +1366,9 @@  static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
     int result = 0;
     int64_t total_size = 0;
     bool nocow = false;
+    PreallocMode prealloc = PREALLOC_MODE_OFF;
+    char *buf = NULL;
+    Error *local_err = NULL;
 
     strstart(filename, "file:", &filename);
 
@@ -1372,37 +1376,80 @@  static int raw_create(const char *filename, QemuOpts *opts, Error **errp)
     total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0),
                           BDRV_SECTOR_SIZE);
     nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false);
+    buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
+    prealloc = qapi_enum_parse(PreallocMode_lookup, buf,
+                               PREALLOC_MODE_MAX, PREALLOC_MODE_OFF,
+                               &local_err);
+    g_free(buf);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        result = -EINVAL;
+        goto out;
+    }
 
     fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY,
                    0644);
     if (fd < 0) {
         result = -errno;
         error_setg_errno(errp, -result, "Could not create file");
-    } else {
-        if (nocow) {
+        goto out;
+    }
+
+    if (nocow) {
 #ifdef __linux__
-            /* Set NOCOW flag to solve performance issue on fs like btrfs.
-             * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
-             * will be ignored since any failure of this operation should not
-             * block the left work.
-             */
-            int attr;
-            if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
-                attr |= FS_NOCOW_FL;
-                ioctl(fd, FS_IOC_SETFLAGS, &attr);
-            }
-#endif
+        /* Set NOCOW flag to solve performance issue on fs like btrfs.
+         * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value
+         * will be ignored since any failure of this operation should not
+         * block the left work.
+         */
+        int attr;
+        if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) {
+            attr |= FS_NOCOW_FL;
+            ioctl(fd, FS_IOC_SETFLAGS, &attr);
         }
+#endif
+    }
 
-        if (ftruncate(fd, total_size) != 0) {
-            result = -errno;
-            error_setg_errno(errp, -result, "Could not resize file");
-        }
-        if (qemu_close(fd) != 0) {
-            result = -errno;
-            error_setg_errno(errp, -result, "Could not close the new file");
+    if (ftruncate(fd, total_size) != 0) {
+        result = -errno;
+        error_setg_errno(errp, -result, "Could not resize file");
+        goto out_close;
+    }
+
+    if (prealloc == PREALLOC_MODE_FULL) {
+        /* posix_fallocate() doesn't set errno. */
+        result = -posix_fallocate(fd, 0, total_size);
+        if (result != 0) {
+            buf = g_malloc0(65536);
+            int64_t num = 0, left = total_size;
+
+            while (left > 0) {
+                num = MIN(left, 65536);
+                result = write(fd, buf, num);
+                if (result < 0) {
+                    result = -errno;
+                    error_setg_errno(errp, -result,
+                                     "Could not write to the new file");
+                    g_free(buf);
+                    goto out_close;
+                }
+                left -= num;
+            }
+            fsync(fd);
+            g_free(buf);
         }
+    } else if (prealloc != PREALLOC_MODE_OFF) {
+        result = -1;
+        error_setg(errp, "Unsupported preallocation mode: %s",
+                   PreallocMode_lookup[prealloc]);
+    }
+
+out_close:
+    if (qemu_close(fd) != 0 && result == 0) {
+        result = -errno;
+        error_setg_errno(errp, -result, "Could not close the new file");
     }
+out:
     return result;
 }
 
@@ -1585,6 +1632,11 @@  static QemuOptsList raw_create_opts = {
             .type = QEMU_OPT_BOOL,
             .help = "Turn off copy-on-write (valid only on btrfs)"
         },
+        {
+            .name = BLOCK_OPT_PREALLOC,
+            .type = QEMU_OPT_STRING,
+            .help = "Preallocation mode (allowed values: off, full)"
+        },
         { /* end of list */ }
     }
 };
diff --git a/qemu-doc.texi b/qemu-doc.texi
index 2b232ae..2637765 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -527,6 +527,14 @@  Linux or NTFS on Windows), then only the written sectors will reserve
 space. Use @code{qemu-img info} to know the real size used by the
 image or @code{ls -ls} on Unix/Linux.
 
+Supported options:
+@table @code
+@item preallocation
+Preallocation mode(allowed values: @code{off}, @code{full}). An image is
+fully preallocated by calling posix_fallocate() if it's available, or by
+writing zeros to underlying storage.
+@end table
+
 @item qcow2
 QEMU image format, the most versatile format. Use it to have smaller
 images (useful if your filesystem does not supports holes, for example
diff --git a/qemu-img.texi b/qemu-img.texi
index cb68948..063ec61 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -418,6 +418,14 @@  Linux or NTFS on Windows), then only the written sectors will reserve
 space. Use @code{qemu-img info} to know the real size used by the
 image or @code{ls -ls} on Unix/Linux.
 
+Supported options:
+@table @code
+@item preallocation
+Preallocation mode(allowed values: @code{off}, @code{full}). An image is
+fully preallocated by calling posix_fallocate() if it's available, or by
+writing zeros to underlying storage.
+@end table
+
 @item qcow2
 QEMU image format, the most versatile format. Use it to have smaller
 images (useful if your filesystem does not supports holes, for example