Message ID | 67d97abc587e7c6985166dbe800686938ac8adb5.1409299732.git.hutao@cn.fujitsu.com |
---|---|
State | New |
Headers | show |
On Fri, Aug 29, 2014 at 04:33:12PM +0800, Hu Tao wrote: > + if (prealloc == PREALLOC_MODE_FULL) { > + /* posix_fallocate() doesn't set errno. */ > + result = -posix_fallocate(fd, 0, total_size); > + if (result != 0) { Is it better to test: result != ENOSYS && result != EOPNOTSUPP here? I think this is definitely the right approach. Rich.
On 29.08.2014 10:33, Hu Tao wrote: > This patch adds a new option preallocation for raw format, and implements > full preallocation. > > Signed-off-by: Hu Tao <hutao@cn.fujitsu.com> > --- > block/raw-posix.c | 92 +++++++++++++++++++++++++++++++++++++++++++------------ > qemu-doc.texi | 8 +++++ > qemu-img.texi | 8 +++++ > 3 files changed, 88 insertions(+), 20 deletions(-) > > diff --git a/block/raw-posix.c b/block/raw-posix.c > index abe0759..25f66f2 100644 > --- a/block/raw-posix.c > +++ b/block/raw-posix.c > @@ -30,6 +30,7 @@ > #include "block/thread-pool.h" > #include "qemu/iov.h" > #include "raw-aio.h" > +#include "qapi/util.h" > > #if defined(__APPLE__) && (__MACH__) > #include <paths.h> > @@ -1365,6 +1366,9 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) > int result = 0; > int64_t total_size = 0; > bool nocow = false; > + PreallocMode prealloc = PREALLOC_MODE_OFF; > + char *buf = NULL; > + Error *local_err = NULL; > > strstart(filename, "file:", &filename); > > @@ -1372,37 +1376,80 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) > total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), > BDRV_SECTOR_SIZE); > nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); > + buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); > + prealloc = qapi_enum_parse(PreallocMode_lookup, buf, > + PREALLOC_MODE_MAX, PREALLOC_MODE_OFF, > + &local_err); > + g_free(buf); > + if (local_err) { > + error_propagate(errp, local_err); > + result = -EINVAL; > + goto out; > + } > > fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, > 0644); > if (fd < 0) { > result = -errno; > error_setg_errno(errp, -result, "Could not create file"); > - } else { > - if (nocow) { > + goto out; > + } > + > + if (nocow) { > #ifdef __linux__ > - /* Set NOCOW flag to solve performance issue on fs like btrfs. > - * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value > - * will be ignored since any failure of this operation should not > - * block the left work. > - */ > - int attr; > - if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { > - attr |= FS_NOCOW_FL; > - ioctl(fd, FS_IOC_SETFLAGS, &attr); > - } > -#endif > + /* Set NOCOW flag to solve performance issue on fs like btrfs. > + * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value > + * will be ignored since any failure of this operation should not > + * block the left work. > + */ > + int attr; > + if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { > + attr |= FS_NOCOW_FL; > + ioctl(fd, FS_IOC_SETFLAGS, &attr); > } > +#endif > + } > > - if (ftruncate(fd, total_size) != 0) { > - result = -errno; > - error_setg_errno(errp, -result, "Could not resize file"); > - } > - if (qemu_close(fd) != 0) { > - result = -errno; > - error_setg_errno(errp, -result, "Could not close the new file"); > + if (ftruncate(fd, total_size) != 0) { > + result = -errno; > + error_setg_errno(errp, -result, "Could not resize file"); > + goto out_close; > + } > + > + if (prealloc == PREALLOC_MODE_FULL) { > + /* posix_fallocate() doesn't set errno. */ > + result = -posix_fallocate(fd, 0, total_size); > + if (result != 0) { > + buf = g_malloc0(65536); > + int64_t num = 0, left = total_size; > + > + while (left > 0) { > + num = MIN(left, 65536); > + result = write(fd, buf, num); > + if (result < 0) { > + result = -errno; > + error_setg_errno(errp, -result, > + "Could not write to the new file"); > + g_free(buf); > + goto out_close; > + } > + left -= num; > + } > + fsync(fd); > + g_free(buf); > } > + } else if (prealloc != PREALLOC_MODE_OFF) { > + result = -1; As for qcow2 in patch 4, I'd prefer -EINVAL. > + error_setg(errp, "Unsupported preallocation mode: %s", > + PreallocMode_lookup[prealloc]); > + } > + > +out_close: > + if (qemu_close(fd) != 0 && result == 0) { > + result = -errno; > + error_setg_errno(errp, -result, "Could not close the new file"); > } > +out: > return result; > } > > @@ -1585,6 +1632,11 @@ static QemuOptsList raw_create_opts = { > .type = QEMU_OPT_BOOL, > .help = "Turn off copy-on-write (valid only on btrfs)" > }, > + { > + .name = BLOCK_OPT_PREALLOC, > + .type = QEMU_OPT_STRING, > + .help = "Preallocation mode (allowed values: off, full)" > + }, > { /* end of list */ } > } > }; > diff --git a/qemu-doc.texi b/qemu-doc.texi > index 2b232ae..2637765 100644 > --- a/qemu-doc.texi > +++ b/qemu-doc.texi > @@ -527,6 +527,14 @@ Linux or NTFS on Windows), then only the written sectors will reserve > space. Use @code{qemu-img info} to know the real size used by the > image or @code{ls -ls} on Unix/Linux. > > +Supported options: > +@table @code > +@item preallocation > +Preallocation mode(allowed values: @code{off}, @code{full}). An image is Missing space in front of the opening bracket. > +fully preallocated by calling posix_fallocate() if it's available, or by Hm, I personally am not so happy about contractions ("it's") in persistent documentation (even source code comments). Although I know there are already some of them in qemu-doc.texi, I'd rather avoid them. But I'll leave this up to you as I'm no native speaker. > +writing zeros to underlying storage. > +@end table > + > @item qcow2 > QEMU image format, the most versatile format. Use it to have smaller > images (useful if your filesystem does not supports holes, for example > diff --git a/qemu-img.texi b/qemu-img.texi > index cb68948..063ec61 100644 > --- a/qemu-img.texi > +++ b/qemu-img.texi > @@ -418,6 +418,14 @@ Linux or NTFS on Windows), then only the written sectors will reserve > space. Use @code{qemu-img info} to know the real size used by the > image or @code{ls -ls} on Unix/Linux. > > +Supported options: > +@table @code > +@item preallocation > +Preallocation mode(allowed values: @code{off}, @code{full}). An image is > +fully preallocated by calling posix_fallocate() if it's available, or by > +writing zeros to underlying storage. > +@end table > + Same as for qemu-doc.texi. However, these are all minor, so with "result = -EINVAL" and the missing space added before the brackets: Reviewed-by: Max Reitz <mreitz@redhat.com> > @item qcow2 > QEMU image format, the most versatile format. Use it to have smaller > images (useful if your filesystem does not supports holes, for example
On Fri, Aug 29, 2014 at 09:48:01AM +0100, Richard W.M. Jones wrote: > On Fri, Aug 29, 2014 at 04:33:12PM +0800, Hu Tao wrote: > > + if (prealloc == PREALLOC_MODE_FULL) { > > + /* posix_fallocate() doesn't set errno. */ > > + result = -posix_fallocate(fd, 0, total_size); > > + if (result != 0) { > > Is it better to test: > > result != ENOSYS && result != EOPNOTSUPP > > here? posix_fallocate() doesn't return ENOSYS or EOPNOTSUPP. All the errors returned by posix_fallocate() apply to writing zeros, too. that is, if posix_fallocate() returns an error, we should not do writing zeros, neither. I'm wondering what is the right way to test if posix_fallocate() is supported, something like AC_CHECK_FUNC? how? Regards, Hu > > I think this is definitely the right approach. > > Rich. > > > -- > Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones > Read my programming and virtualization blog: http://rwmj.wordpress.com > virt-p2v converts physical machines to virtual machines. Boot with a > live CD or over the network (PXE) and turn machines into KVM guests. > http://libguestfs.org/virt-v2v
On Tue, Sep 02, 2014 at 11:45:38PM +0200, Max Reitz wrote: > On 29.08.2014 10:33, Hu Tao wrote: > >This patch adds a new option preallocation for raw format, and implements > >full preallocation. > > > >Signed-off-by: Hu Tao <hutao@cn.fujitsu.com> > >--- > > block/raw-posix.c | 92 +++++++++++++++++++++++++++++++++++++++++++------------ > > qemu-doc.texi | 8 +++++ > > qemu-img.texi | 8 +++++ > > 3 files changed, 88 insertions(+), 20 deletions(-) > > > >diff --git a/block/raw-posix.c b/block/raw-posix.c > >index abe0759..25f66f2 100644 > >--- a/block/raw-posix.c > >+++ b/block/raw-posix.c > >@@ -30,6 +30,7 @@ > > #include "block/thread-pool.h" > > #include "qemu/iov.h" > > #include "raw-aio.h" > >+#include "qapi/util.h" > > #if defined(__APPLE__) && (__MACH__) > > #include <paths.h> > >@@ -1365,6 +1366,9 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) > > int result = 0; > > int64_t total_size = 0; > > bool nocow = false; > >+ PreallocMode prealloc = PREALLOC_MODE_OFF; > >+ char *buf = NULL; > >+ Error *local_err = NULL; > > strstart(filename, "file:", &filename); > >@@ -1372,37 +1376,80 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) > > total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), > > BDRV_SECTOR_SIZE); > > nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); > >+ buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); > >+ prealloc = qapi_enum_parse(PreallocMode_lookup, buf, > >+ PREALLOC_MODE_MAX, PREALLOC_MODE_OFF, > >+ &local_err); > >+ g_free(buf); > >+ if (local_err) { > >+ error_propagate(errp, local_err); > >+ result = -EINVAL; > >+ goto out; > >+ } > > fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, > > 0644); > > if (fd < 0) { > > result = -errno; > > error_setg_errno(errp, -result, "Could not create file"); > >- } else { > >- if (nocow) { > >+ goto out; > >+ } > >+ > >+ if (nocow) { > > #ifdef __linux__ > >- /* Set NOCOW flag to solve performance issue on fs like btrfs. > >- * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value > >- * will be ignored since any failure of this operation should not > >- * block the left work. > >- */ > >- int attr; > >- if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { > >- attr |= FS_NOCOW_FL; > >- ioctl(fd, FS_IOC_SETFLAGS, &attr); > >- } > >-#endif > >+ /* Set NOCOW flag to solve performance issue on fs like btrfs. > >+ * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value > >+ * will be ignored since any failure of this operation should not > >+ * block the left work. > >+ */ > >+ int attr; > >+ if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { > >+ attr |= FS_NOCOW_FL; > >+ ioctl(fd, FS_IOC_SETFLAGS, &attr); > > } > >+#endif > >+ } > >- if (ftruncate(fd, total_size) != 0) { > >- result = -errno; > >- error_setg_errno(errp, -result, "Could not resize file"); > >- } > >- if (qemu_close(fd) != 0) { > >- result = -errno; > >- error_setg_errno(errp, -result, "Could not close the new file"); > >+ if (ftruncate(fd, total_size) != 0) { > >+ result = -errno; > >+ error_setg_errno(errp, -result, "Could not resize file"); > >+ goto out_close; > >+ } > >+ > >+ if (prealloc == PREALLOC_MODE_FULL) { > >+ /* posix_fallocate() doesn't set errno. */ > >+ result = -posix_fallocate(fd, 0, total_size); > >+ if (result != 0) { > >+ buf = g_malloc0(65536); > >+ int64_t num = 0, left = total_size; > >+ > >+ while (left > 0) { > >+ num = MIN(left, 65536); > >+ result = write(fd, buf, num); > >+ if (result < 0) { > >+ result = -errno; > >+ error_setg_errno(errp, -result, > >+ "Could not write to the new file"); > >+ g_free(buf); > >+ goto out_close; > >+ } > >+ left -= num; > >+ } > >+ fsync(fd); > >+ g_free(buf); > > } > >+ } else if (prealloc != PREALLOC_MODE_OFF) { > >+ result = -1; > > As for qcow2 in patch 4, I'd prefer -EINVAL. Okay. > > >+ error_setg(errp, "Unsupported preallocation mode: %s", > >+ PreallocMode_lookup[prealloc]); > >+ } > >+ > >+out_close: > >+ if (qemu_close(fd) != 0 && result == 0) { > >+ result = -errno; > >+ error_setg_errno(errp, -result, "Could not close the new file"); > > } > >+out: > > return result; > > } > >@@ -1585,6 +1632,11 @@ static QemuOptsList raw_create_opts = { > > .type = QEMU_OPT_BOOL, > > .help = "Turn off copy-on-write (valid only on btrfs)" > > }, > >+ { > >+ .name = BLOCK_OPT_PREALLOC, > >+ .type = QEMU_OPT_STRING, > >+ .help = "Preallocation mode (allowed values: off, full)" > >+ }, > > { /* end of list */ } > > } > > }; > >diff --git a/qemu-doc.texi b/qemu-doc.texi > >index 2b232ae..2637765 100644 > >--- a/qemu-doc.texi > >+++ b/qemu-doc.texi > >@@ -527,6 +527,14 @@ Linux or NTFS on Windows), then only the written sectors will reserve > > space. Use @code{qemu-img info} to know the real size used by the > > image or @code{ls -ls} on Unix/Linux. > >+Supported options: > >+@table @code > >+@item preallocation > >+Preallocation mode(allowed values: @code{off}, @code{full}). An image is > > Missing space in front of the opening bracket. Okay. I checked that in other places of opening bracket there is a space before them. > > >+fully preallocated by calling posix_fallocate() if it's available, or by > > Hm, I personally am not so happy about contractions ("it's") in > persistent documentation (even source code comments). Although I > know there are already some of them in qemu-doc.texi, I'd rather > avoid them. But I'll leave this up to you as I'm no native speaker. I'm not, either. I don't know which one in "it's" and "it is" is more common, but I can change the contraction if it makes you feel better:) Regards, Hu > > >+writing zeros to underlying storage. > >+@end table > >+ > > @item qcow2 > > QEMU image format, the most versatile format. Use it to have smaller > > images (useful if your filesystem does not supports holes, for example > >diff --git a/qemu-img.texi b/qemu-img.texi > >index cb68948..063ec61 100644 > >--- a/qemu-img.texi > >+++ b/qemu-img.texi > >@@ -418,6 +418,14 @@ Linux or NTFS on Windows), then only the written sectors will reserve > > space. Use @code{qemu-img info} to know the real size used by the > > image or @code{ls -ls} on Unix/Linux. > >+Supported options: > >+@table @code > >+@item preallocation > >+Preallocation mode(allowed values: @code{off}, @code{full}). An image is > >+fully preallocated by calling posix_fallocate() if it's available, or by > >+writing zeros to underlying storage. > >+@end table > >+ > > Same as for qemu-doc.texi. > > However, these are all minor, so with "result = -EINVAL" and the > missing space added before the brackets: > > Reviewed-by: Max Reitz <mreitz@redhat.com> > > > @item qcow2 > > QEMU image format, the most versatile format. Use it to have smaller > > images (useful if your filesystem does not supports holes, for example
On Fri, Aug 29, 2014 at 09:48:01AM +0100, Richard W.M. Jones wrote: > On Fri, Aug 29, 2014 at 04:33:12PM +0800, Hu Tao wrote: > > + if (prealloc == PREALLOC_MODE_FULL) { > > + /* posix_fallocate() doesn't set errno. */ > > + result = -posix_fallocate(fd, 0, total_size); > > + if (result != 0) { > > Is it better to test: > > result != ENOSYS && result != EOPNOTSUPP > > here? > > I think this is definitely the right approach. Hi Kevin, How do you think this approach? Regards, Hu
Am 29.08.2014 um 10:33 hat Hu Tao geschrieben: > This patch adds a new option preallocation for raw format, and implements > full preallocation. > > Signed-off-by: Hu Tao <hutao@cn.fujitsu.com> v12 was better, it wasn't doing only metadata preallocation when you told it to do full preallocation. > + if (prealloc == PREALLOC_MODE_FULL) { > + /* posix_fallocate() doesn't set errno. */ > + result = -posix_fallocate(fd, 0, total_size); > + if (result != 0) { > + buf = g_malloc0(65536); > + int64_t num = 0, left = total_size; > + > + while (left > 0) { > + num = MIN(left, 65536); > + result = write(fd, buf, num); > + if (result < 0) { > + result = -errno; > + error_setg_errno(errp, -result, > + "Could not write to the new file"); > + g_free(buf); > + goto out_close; > + } > + left -= num; > + } > + fsync(fd); > + g_free(buf); > } This is totally pointless. If the file system doesn't support fallocate, posix_fallocate() will already try writing zeros. Having code to write zeros in qemu only makes sense if you implement a real full preallocation mode, which doesn't use fallocate even when it's supported. Please change the code to always write zeros for FULL, and either reintroduce FALLOC or use METADATA for the fallocate (without a fallback to zero writing code). In fact, for metadata preallocation we should probably directly use fallocate(), which has no slow zero-write fallback in the libc. Kevin
On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote:
> Please change the code to always write zeros for FULL,
How is this useful for anyone? You don't know if the underlying SAN
is going to detect these zeroes or combine these blocks together.
It's just slow for no reason.
Rich.
Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben: > On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote: > > Please change the code to always write zeros for FULL, > > How is this useful for anyone? You don't know if the underlying SAN > is going to detect these zeroes or combine these blocks together. > It's just slow for no reason. It's slow for the reason that the user has requested it. Do you doubt that users can know what their backend is doing? Why are you insisting on providing only the functionality that you personally need? I doubt it's a way to make many users happier, but if you insist, we can leave full preallocation unsupported on raw-posix and allow only metadata preallocation (which should still be fallocate() without a zero-write fallback, so that you can rely on it being fast). But call it by its name and don't say "full" when you only implement "metadata". Kevin
On Thu, Sep 04, 2014 at 02:52:57PM +0200, Kevin Wolf wrote: > Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben: > > On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote: > > > Please change the code to always write zeros for FULL, > > > > How is this useful for anyone? You don't know if the underlying SAN > > is going to detect these zeroes or combine these blocks together. > > It's just slow for no reason. > > It's slow for the reason that the user has requested it. Do you doubt > that users can know what their backend is doing? Why are you insisting > on providing only the functionality that you personally need? I'm not! I'm trying to make sure we don't end up with a qemu interface which is useless for higher layers. You're proposing preallocation=full which will be slow but not actually give any guarantees, or preallocation=meta which is going to be fast but may not work, and I'm saying that's a dumb interface that's not useful. Rich.
On Thu, Sep 04, 2014 at 02:07:14PM +0100, Richard W.M. Jones wrote: > On Thu, Sep 04, 2014 at 02:52:57PM +0200, Kevin Wolf wrote: > > Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben: > > > On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote: > > > > Please change the code to always write zeros for FULL, > > > > > > How is this useful for anyone? You don't know if the underlying SAN > > > is going to detect these zeroes or combine these blocks together. > > > It's just slow for no reason. > > > > It's slow for the reason that the user has requested it. Do you doubt > > that users can know what their backend is doing? Why are you insisting > > on providing only the functionality that you personally need? > > I'm not! I'm trying to make sure we don't end up with a qemu > interface which is useless for higher layers. You're proposing > preallocation=full which will be slow but not actually give any > guarantees, or preallocation=meta which is going to be fast but may > not work, and I'm saying that's a dumb interface that's not useful. Agree, from my POV as an app maintainer, fallocate with automatically fallback to zero'ing is really what we want to use. Having two separate options which we have to explicitly try in turn, or only having the slow option is really not what we want to use. Regards, Daniel
Am 04.09.2014 um 15:07 hat Richard W.M. Jones geschrieben: > On Thu, Sep 04, 2014 at 02:52:57PM +0200, Kevin Wolf wrote: > > Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben: > > > On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote: > > > > Please change the code to always write zeros for FULL, > > > > > > How is this useful for anyone? You don't know if the underlying SAN > > > is going to detect these zeroes or combine these blocks together. > > > It's just slow for no reason. > > > > It's slow for the reason that the user has requested it. Do you doubt > > that users can know what their backend is doing? Why are you insisting > > on providing only the functionality that you personally need? > > I'm not! I'm trying to make sure we don't end up with a qemu > interface which is useless for higher layers. You're proposing > preallocation=full which will be slow but not actually give any > guarantees, or preallocation=meta which is going to be fast but may > not work, and I'm saying that's a dumb interface that's not useful. So what you propose is an interface that combines both and may unpredictably be slow or fast, and doesn't give any guarantees either. Why would this be any better? What is your specific use case of full preallocation that wants zero writing, but only implicitly as a fallback? My expectation is that most users want cheap preallocation if it's available, but don't bother to write many gigabytes if it isn't. Kevin
On Thu, Sep 04, 2014 at 03:17:51PM +0200, Kevin Wolf wrote: > Am 04.09.2014 um 15:07 hat Richard W.M. Jones geschrieben: > > On Thu, Sep 04, 2014 at 02:52:57PM +0200, Kevin Wolf wrote: > > > Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben: > > > > On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote: > > > > > Please change the code to always write zeros for FULL, > > > > > > > > How is this useful for anyone? You don't know if the underlying SAN > > > > is going to detect these zeroes or combine these blocks together. > > > > It's just slow for no reason. > > > > > > It's slow for the reason that the user has requested it. Do you doubt > > > that users can know what their backend is doing? Why are you insisting > > > on providing only the functionality that you personally need? > > > > I'm not! I'm trying to make sure we don't end up with a qemu > > interface which is useless for higher layers. You're proposing > > preallocation=full which will be slow but not actually give any > > guarantees, or preallocation=meta which is going to be fast but may > > not work, and I'm saying that's a dumb interface that's not useful. > > So what you propose is an interface that combines both and may > unpredictably be slow or fast, and doesn't give any guarantees either. > Why would this be any better? > > What is your specific use case of full preallocation that wants zero > writing, but only implicitly as a fallback? My expectation is that most > users want cheap preallocation if it's available, but don't bother to > write many gigabytes if it isn't. Stepping back, I think what we have are two general approaches: - do the exact thing I want - do the best effort you can virt-manager, virt-install, virt-clone, libvirt, libguestfs, all currently do preallocation with fallback to writing data (via hand-written code). The reason is that customers using simple disks (not SANs etc) just want to be sure they're not going to run out of space during OS installation. For the vast majority of users, posix_fallocate makes this fast, but people using ext2 or *BSD get the slow-but-safe path. Now it may be that some qemu users are going to want a very exact preallocation mode, but that doesn't mean we can't have preallocation=besteffort too. And it's not just here. qemu has other places where we'd like "do the best thing", not "do the exact thing" ... off the top of my head: selecting -cpu, discard support, O_DIRECT. Libguestfs has to go through hoops in these areas, often involving very hairy workarounds which reduce reliability. I'm not saying that more exact options aren't also welcome, but doing the best effort is very useful too. Rich.
Am 04.09.2014 um 15:43 hat Richard W.M. Jones geschrieben: > On Thu, Sep 04, 2014 at 03:17:51PM +0200, Kevin Wolf wrote: > > Am 04.09.2014 um 15:07 hat Richard W.M. Jones geschrieben: > > > On Thu, Sep 04, 2014 at 02:52:57PM +0200, Kevin Wolf wrote: > > > > Am 04.09.2014 um 14:45 hat Richard W.M. Jones geschrieben: > > > > > On Thu, Sep 04, 2014 at 02:35:22PM +0200, Kevin Wolf wrote: > > > > > > Please change the code to always write zeros for FULL, > > > > > > > > > > How is this useful for anyone? You don't know if the underlying SAN > > > > > is going to detect these zeroes or combine these blocks together. > > > > > It's just slow for no reason. > > > > > > > > It's slow for the reason that the user has requested it. Do you doubt > > > > that users can know what their backend is doing? Why are you insisting > > > > on providing only the functionality that you personally need? > > > > > > I'm not! I'm trying to make sure we don't end up with a qemu > > > interface which is useless for higher layers. You're proposing > > > preallocation=full which will be slow but not actually give any > > > guarantees, or preallocation=meta which is going to be fast but may > > > not work, and I'm saying that's a dumb interface that's not useful. > > > > So what you propose is an interface that combines both and may > > unpredictably be slow or fast, and doesn't give any guarantees either. > > Why would this be any better? > > > > What is your specific use case of full preallocation that wants zero > > writing, but only implicitly as a fallback? My expectation is that most > > users want cheap preallocation if it's available, but don't bother to > > write many gigabytes if it isn't. > > Stepping back, I think what we have are two general approaches: > > - do the exact thing I want > > - do the best effort you can > > [...] > > And it's not just here. qemu has other places where we'd like "do the > best thing", not "do the exact thing" ... off the top of my head: > selecting -cpu, discard support, O_DIRECT. Libguestfs has to go > through hoops in these areas, often involving very hairy workarounds > which reduce reliability. I'm not saying that more exact options > aren't also welcome, but doing the best effort is very useful too. The point is that different people have different ideas of what "do the best thing" is. You may be surprised to learn that generally we don't roll a dice to select our defaults, but do it based on what we think is the best thing overall. And yes, because we have less information than management tools, it often means that we have to take more conservative defaults (works for everyone) than what you would want (all the features that your shiny new guest can use). > virt-manager, virt-install, virt-clone, libvirt, libguestfs, all > currently do preallocation with fallback to writing data (via > hand-written code). The reason is that customers using simple disks > (not SANs etc) just want to be sure they're not going to run out of > space during OS installation. For the vast majority of users, > posix_fallocate makes this fast, but people using ext2 or *BSD get the > slow-but-safe path. Is this really a concern for many users? I think I never preallocated an image to make sure it fits. If anything, I run df first, but generally I assume that it works and if it doesn't, I check what happened. Basically all users I saw asking for preallocation did it because they were worried about performance and wanted to get rid of the overhead of metadata updates during allocations. > Now it may be that some qemu users are going to want a very exact > preallocation mode, but that doesn't mean we can't have > preallocation=besteffort too. The definition of "besteffort" depends on what you want to achieve. It is policy, and we generally try to keep policy out of qemu. It isn't really clear that "make absolutely sure that you never get -ENOSPC on the file system level, but in the layers below it's okay, and the resulting performance doesn't matter" is the best thing for everyone. Kevin
On Thu, Sep 04, 2014 at 05:23:21PM +0200, Kevin Wolf wrote: > The definition of "besteffort" depends on what you want to achieve. It > is policy, and we generally try to keep policy out of qemu. I think qemu *should* have a policy of make it work and don't fail - first - and then offer a million knobs to optimize and customize things on top of that. But whatever ... what's one more thing that we have to provide fragile workarounds for ... Rich.
diff --git a/block/raw-posix.c b/block/raw-posix.c index abe0759..25f66f2 100644 --- a/block/raw-posix.c +++ b/block/raw-posix.c @@ -30,6 +30,7 @@ #include "block/thread-pool.h" #include "qemu/iov.h" #include "raw-aio.h" +#include "qapi/util.h" #if defined(__APPLE__) && (__MACH__) #include <paths.h> @@ -1365,6 +1366,9 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) int result = 0; int64_t total_size = 0; bool nocow = false; + PreallocMode prealloc = PREALLOC_MODE_OFF; + char *buf = NULL; + Error *local_err = NULL; strstart(filename, "file:", &filename); @@ -1372,37 +1376,80 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), BDRV_SECTOR_SIZE); nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); + buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + prealloc = qapi_enum_parse(PreallocMode_lookup, buf, + PREALLOC_MODE_MAX, PREALLOC_MODE_OFF, + &local_err); + g_free(buf); + if (local_err) { + error_propagate(errp, local_err); + result = -EINVAL; + goto out; + } fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); if (fd < 0) { result = -errno; error_setg_errno(errp, -result, "Could not create file"); - } else { - if (nocow) { + goto out; + } + + if (nocow) { #ifdef __linux__ - /* Set NOCOW flag to solve performance issue on fs like btrfs. - * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value - * will be ignored since any failure of this operation should not - * block the left work. - */ - int attr; - if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { - attr |= FS_NOCOW_FL; - ioctl(fd, FS_IOC_SETFLAGS, &attr); - } -#endif + /* Set NOCOW flag to solve performance issue on fs like btrfs. + * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value + * will be ignored since any failure of this operation should not + * block the left work. + */ + int attr; + if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { + attr |= FS_NOCOW_FL; + ioctl(fd, FS_IOC_SETFLAGS, &attr); } +#endif + } - if (ftruncate(fd, total_size) != 0) { - result = -errno; - error_setg_errno(errp, -result, "Could not resize file"); - } - if (qemu_close(fd) != 0) { - result = -errno; - error_setg_errno(errp, -result, "Could not close the new file"); + if (ftruncate(fd, total_size) != 0) { + result = -errno; + error_setg_errno(errp, -result, "Could not resize file"); + goto out_close; + } + + if (prealloc == PREALLOC_MODE_FULL) { + /* posix_fallocate() doesn't set errno. */ + result = -posix_fallocate(fd, 0, total_size); + if (result != 0) { + buf = g_malloc0(65536); + int64_t num = 0, left = total_size; + + while (left > 0) { + num = MIN(left, 65536); + result = write(fd, buf, num); + if (result < 0) { + result = -errno; + error_setg_errno(errp, -result, + "Could not write to the new file"); + g_free(buf); + goto out_close; + } + left -= num; + } + fsync(fd); + g_free(buf); } + } else if (prealloc != PREALLOC_MODE_OFF) { + result = -1; + error_setg(errp, "Unsupported preallocation mode: %s", + PreallocMode_lookup[prealloc]); + } + +out_close: + if (qemu_close(fd) != 0 && result == 0) { + result = -errno; + error_setg_errno(errp, -result, "Could not close the new file"); } +out: return result; } @@ -1585,6 +1632,11 @@ static QemuOptsList raw_create_opts = { .type = QEMU_OPT_BOOL, .help = "Turn off copy-on-write (valid only on btrfs)" }, + { + .name = BLOCK_OPT_PREALLOC, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode (allowed values: off, full)" + }, { /* end of list */ } } }; diff --git a/qemu-doc.texi b/qemu-doc.texi index 2b232ae..2637765 100644 --- a/qemu-doc.texi +++ b/qemu-doc.texi @@ -527,6 +527,14 @@ Linux or NTFS on Windows), then only the written sectors will reserve space. Use @code{qemu-img info} to know the real size used by the image or @code{ls -ls} on Unix/Linux. +Supported options: +@table @code +@item preallocation +Preallocation mode(allowed values: @code{off}, @code{full}). An image is +fully preallocated by calling posix_fallocate() if it's available, or by +writing zeros to underlying storage. +@end table + @item qcow2 QEMU image format, the most versatile format. Use it to have smaller images (useful if your filesystem does not supports holes, for example diff --git a/qemu-img.texi b/qemu-img.texi index cb68948..063ec61 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -418,6 +418,14 @@ Linux or NTFS on Windows), then only the written sectors will reserve space. Use @code{qemu-img info} to know the real size used by the image or @code{ls -ls} on Unix/Linux. +Supported options: +@table @code +@item preallocation +Preallocation mode(allowed values: @code{off}, @code{full}). An image is +fully preallocated by calling posix_fallocate() if it's available, or by +writing zeros to underlying storage. +@end table + @item qcow2 QEMU image format, the most versatile format. Use it to have smaller images (useful if your filesystem does not supports holes, for example
This patch adds a new option preallocation for raw format, and implements full preallocation. Signed-off-by: Hu Tao <hutao@cn.fujitsu.com> --- block/raw-posix.c | 92 +++++++++++++++++++++++++++++++++++++++++++------------ qemu-doc.texi | 8 +++++ qemu-img.texi | 8 +++++ 3 files changed, 88 insertions(+), 20 deletions(-)