Patchwork qcow2: Add full image preallocation option

login
register
mail settings
Submitter Kevin Wolf
Date Jan. 27, 2011, 3:52 p.m.
Message ID <1296143534-13495-1-git-send-email-kwolf@redhat.com>
Download mbox | patch
Permalink /patch/80709/
State New
Headers show

Comments

Kevin Wolf - Jan. 27, 2011, 3:52 p.m.
This adds a preallocation=full mode to qcow2 image creation, which does not
only allocate metadata for the whole image, but also writes zeros to it,
creating a non-sparse image file.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2.c |   45 ++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 40 insertions(+), 5 deletions(-)
Daniel P. Berrange - Jan. 27, 2011, 3:58 p.m.
On Thu, Jan 27, 2011 at 04:52:14PM +0100, Kevin Wolf wrote:
> This adds a preallocation=full mode to qcow2 image creation, which does not
> only allocate metadata for the whole image, but also writes zeros to it,
> creating a non-sparse image file.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>  block/qcow2.c |   45 ++++++++++++++++++++++++++++++++++++++++-----
>  1 files changed, 40 insertions(+), 5 deletions(-)
> 
> diff --git a/block/qcow2.c b/block/qcow2.c
> index a1773e4..90cf2ca 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -838,7 +838,15 @@ static int qcow2_change_backing_file(BlockDriverState *bs,
>      return qcow2_update_ext_header(bs, backing_file, backing_fmt);
>  }
>  
> -static int preallocate(BlockDriverState *bs)
> +enum prealloc_mode {
> +    PREALLOC_OFF = 0,
> +    PREALLOC_METADATA,
> +    PREALLOC_FULL,
> +};
> +
> +#define IO_BUF_SIZE (2 * 1024 * 1024)
> +
> +static int preallocate(BlockDriverState *bs, enum prealloc_mode mode)
>  {
>      uint64_t nb_sectors;
>      uint64_t offset;
> @@ -846,11 +854,14 @@ static int preallocate(BlockDriverState *bs)
>      int ret;
>      QCowL2Meta meta;
>  
> +    assert(mode != PREALLOC_OFF);
> +
>      nb_sectors = bdrv_getlength(bs) >> 9;
>      offset = 0;
>      QLIST_INIT(&meta.dependent_requests);
>      meta.cluster_offset = 0;
>  
> +    /* First allocate metadata in _really_ big chunks */
>      while (nb_sectors) {
>          num = MIN(nb_sectors, INT_MAX >> 9);
>          ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
> @@ -874,6 +885,28 @@ static int preallocate(BlockDriverState *bs)
>          offset += num << 9;
>      }
>  
> +    /* Then write zeros to the cluster data, if requested */
> +    if (mode == PREALLOC_FULL) {
> +        void *buf = qemu_mallocz(IO_BUF_SIZE);
> +
> +        nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
> +        offset = 0;
> +
> +        while (nb_sectors) {
> +            num = MIN(nb_sectors, IO_BUF_SIZE / BDRV_SECTOR_SIZE);
> +            ret = bdrv_write(bs, offset >> BDRV_SECTOR_BITS, buf, num);

Is there a way you can calculate the total size of the qcow2
file upfront, and just use a single posix_fallocate() call to
do the zero-filled allocation of all the data blocks? It is
many orders of magnitude faster than truly writing blocks of
zero'd data on modern filesystems.  I guess if you're using
compression or encryption, we'd really have to go the slow
path, but for regular usage it'd be better to take a fast
path.

Daniel
Anthony Liguori - Jan. 27, 2011, 5:45 p.m.
On 01/27/2011 09:52 AM, Kevin Wolf wrote:
> This adds a preallocation=full mode to qcow2 image creation, which does not
> only allocate metadata for the whole image, but also writes zeros to it,
> creating a non-sparse image file.
>    

The writing zeros bit is in order to support physical devices?  Would it 
be better to have a flag in BlockDriverState that indicated whether 
uninitialized sectors could be assumed to be zero filled and key off of 
that?

Regards,

Anthony Liguori

> Signed-off-by: Kevin Wolf<kwolf@redhat.com>
> ---
>   block/qcow2.c |   45 ++++++++++++++++++++++++++++++++++++++++-----
>   1 files changed, 40 insertions(+), 5 deletions(-)
>
> diff --git a/block/qcow2.c b/block/qcow2.c
> index a1773e4..90cf2ca 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -838,7 +838,15 @@ static int qcow2_change_backing_file(BlockDriverState *bs,
>       return qcow2_update_ext_header(bs, backing_file, backing_fmt);
>   }
>
> -static int preallocate(BlockDriverState *bs)
> +enum prealloc_mode {
> +    PREALLOC_OFF = 0,
> +    PREALLOC_METADATA,
> +    PREALLOC_FULL,
> +};
> +
> +#define IO_BUF_SIZE (2 * 1024 * 1024)
> +
> +static int preallocate(BlockDriverState *bs, enum prealloc_mode mode)
>   {
>       uint64_t nb_sectors;
>       uint64_t offset;
> @@ -846,11 +854,14 @@ static int preallocate(BlockDriverState *bs)
>       int ret;
>       QCowL2Meta meta;
>
> +    assert(mode != PREALLOC_OFF);
> +
>       nb_sectors = bdrv_getlength(bs)>>  9;
>       offset = 0;
>       QLIST_INIT(&meta.dependent_requests);
>       meta.cluster_offset = 0;
>
> +    /* First allocate metadata in _really_ big chunks */
>       while (nb_sectors) {
>           num = MIN(nb_sectors, INT_MAX>>  9);
>           ret = qcow2_alloc_cluster_offset(bs, offset, 0, num,&num,&meta);
> @@ -874,6 +885,28 @@ static int preallocate(BlockDriverState *bs)
>           offset += num<<  9;
>       }
>
> +    /* Then write zeros to the cluster data, if requested */
> +    if (mode == PREALLOC_FULL) {
> +        void *buf = qemu_mallocz(IO_BUF_SIZE);
> +
> +        nb_sectors = bdrv_getlength(bs)>>  BDRV_SECTOR_BITS;
> +        offset = 0;
> +
> +        while (nb_sectors) {
> +            num = MIN(nb_sectors, IO_BUF_SIZE / BDRV_SECTOR_SIZE);
> +            ret = bdrv_write(bs, offset>>  BDRV_SECTOR_BITS, buf, num);
> +            if (ret<  0) {
> +                qemu_free(buf);
> +                return ret;
> +            }
> +
> +            nb_sectors -= num;
> +            offset += num<<  9;
> +        }
> +
> +        qemu_free(buf);
> +    }
> +
>       /*
>        * It is expected that the image file is large enough to actually contain
>        * all of the allocated clusters (otherwise we get failing reads after
> @@ -1006,7 +1039,7 @@ static int qcow2_create2(const char *filename, int64_t total_size,
>
>       /* And if we're supposed to preallocate metadata, do that now */
>       if (prealloc) {
> -        ret = preallocate(bs);
> +        ret = preallocate(bs, prealloc);
>           if (ret<  0) {
>               goto out;
>           }
> @@ -1043,9 +1076,11 @@ static int qcow2_create(const char *filename, QEMUOptionParameter *options)
>               }
>           } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
>               if (!options->value.s || !strcmp(options->value.s, "off")) {
> -                prealloc = 0;
> +                prealloc = PREALLOC_OFF;
>               } else if (!strcmp(options->value.s, "metadata")) {
> -                prealloc = 1;
> +                prealloc = PREALLOC_METADATA;
> +            } else if (!strcmp(options->value.s, "full")) {
> +                prealloc = PREALLOC_FULL;
>               } else {
>                   fprintf(stderr, "Invalid preallocation mode: '%s'\n",
>                       options->value.s);
> @@ -1336,7 +1371,7 @@ static QEMUOptionParameter qcow2_create_options[] = {
>       {
>           .name = BLOCK_OPT_PREALLOC,
>           .type = OPT_STRING,
> -        .help = "Preallocation mode (allowed values: off, metadata)"
> +        .help = "Preallocation mode (allowed values: off, metadata, full)"
>       },
>       { NULL }
>   };
>
Anthony Liguori - Jan. 27, 2011, 5:50 p.m.
On 01/27/2011 09:58 AM, Daniel P. Berrange wrote:
> On Thu, Jan 27, 2011 at 04:52:14PM +0100, Kevin Wolf wrote:
>    
>> This adds a preallocation=full mode to qcow2 image creation, which does not
>> only allocate metadata for the whole image, but also writes zeros to it,
>> creating a non-sparse image file.
>>
>> Signed-off-by: Kevin Wolf<kwolf@redhat.com>
>> ---
>>   block/qcow2.c |   45 ++++++++++++++++++++++++++++++++++++++++-----
>>   1 files changed, 40 insertions(+), 5 deletions(-)
>>
>> diff --git a/block/qcow2.c b/block/qcow2.c
>> index a1773e4..90cf2ca 100644
>> --- a/block/qcow2.c
>> +++ b/block/qcow2.c
>> @@ -838,7 +838,15 @@ static int qcow2_change_backing_file(BlockDriverState *bs,
>>       return qcow2_update_ext_header(bs, backing_file, backing_fmt);
>>   }
>>
>> -static int preallocate(BlockDriverState *bs)
>> +enum prealloc_mode {
>> +    PREALLOC_OFF = 0,
>> +    PREALLOC_METADATA,
>> +    PREALLOC_FULL,
>> +};
>> +
>> +#define IO_BUF_SIZE (2 * 1024 * 1024)
>> +
>> +static int preallocate(BlockDriverState *bs, enum prealloc_mode mode)
>>   {
>>       uint64_t nb_sectors;
>>       uint64_t offset;
>> @@ -846,11 +854,14 @@ static int preallocate(BlockDriverState *bs)
>>       int ret;
>>       QCowL2Meta meta;
>>
>> +    assert(mode != PREALLOC_OFF);
>> +
>>       nb_sectors = bdrv_getlength(bs)>>  9;
>>       offset = 0;
>>       QLIST_INIT(&meta.dependent_requests);
>>       meta.cluster_offset = 0;
>>
>> +    /* First allocate metadata in _really_ big chunks */
>>       while (nb_sectors) {
>>           num = MIN(nb_sectors, INT_MAX>>  9);
>>           ret = qcow2_alloc_cluster_offset(bs, offset, 0, num,&num,&meta);
>> @@ -874,6 +885,28 @@ static int preallocate(BlockDriverState *bs)
>>           offset += num<<  9;
>>       }
>>
>> +    /* Then write zeros to the cluster data, if requested */
>> +    if (mode == PREALLOC_FULL) {
>> +        void *buf = qemu_mallocz(IO_BUF_SIZE);
>> +
>> +        nb_sectors = bdrv_getlength(bs)>>  BDRV_SECTOR_BITS;
>> +        offset = 0;
>> +
>> +        while (nb_sectors) {
>> +            num = MIN(nb_sectors, IO_BUF_SIZE / BDRV_SECTOR_SIZE);
>> +            ret = bdrv_write(bs, offset>>  BDRV_SECTOR_BITS, buf, num);
>>      
> Is there a way you can calculate the total size of the qcow2
> file upfront, and just use a single posix_fallocate() call to
> do the zero-filled allocation of all the data blocks. It is
> many orders of magnitude faster than truly writing blocks of
> zero'd data on modern filesystems.  I guess if you're using
> compression or encryption, we'd really have to go the slow
> path, but for regular usage it'd be better to take a fast
> path.
>    

Hrm, so is the intention here to avoid sparse files or to not assume 
zero-fill?

Regards,

Anthony Liguori

> Daniel
>
>
Kevin Wolf - Jan. 28, 2011, 8:22 a.m.
Am 27.01.2011 16:58, schrieb Daniel P. Berrange:
> On Thu, Jan 27, 2011 at 04:52:14PM +0100, Kevin Wolf wrote:
>> This adds a preallocation=full mode to qcow2 image creation, which does not
>> only allocate metadata for the whole image, but also writes zeros to it,
>> creating a non-sparse image file.
>>
>> Signed-off-by: Kevin Wolf <kwolf@redhat.com>

> Is there a way you can calculate the total size of the qcow2
> file upfront, and just use a single posix_fallocate() call to
> do the zero-filled allocation of all the data blocks. It is
> many orders of magnitude faster than truly writing blocks of
> zero'd data on modern filesystems.  I guess if you're using
> compression or encryption, we'd really have to go the slow
> path, but for regular usage it'd be better to take a fast
> path.

Encryption doesn't really change anything with respect to cluster
allocations, but combining compression with preallocation doesn't make
any sense. We should probably forbid that.

To get the size of the image, it should be enough to get the offset of
the last cluster as the allocation is done sequentially. However, we
don't have a bdrv_fallocate (yet). I'm not sure how to emulate this for
drivers that don't support it directly, but maybe we could just ignore
it for them.

So yes, optimizing it should be possible from the qcow2 side of things,
but it requires at least some additional code in other places.

Kevin
Kevin Wolf - Jan. 28, 2011, 8:46 a.m.
Am 27.01.2011 18:50, schrieb Anthony Liguori:
> On 01/27/2011 09:58 AM, Daniel P. Berrange wrote:
>> On Thu, Jan 27, 2011 at 04:52:14PM +0100, Kevin Wolf wrote:
>>    
>>> This adds a preallocation=full mode to qcow2 image creation, which does not
>>> only allocate metadata for the whole image, but also writes zeros to it,
>>> creating a non-sparse image file.
>>>
>>> Signed-off-by: Kevin Wolf<kwolf@redhat.com>
>>> ---
>>>   block/qcow2.c |   45 ++++++++++++++++++++++++++++++++++++++++-----
>>>   1 files changed, 40 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/block/qcow2.c b/block/qcow2.c
>>> index a1773e4..90cf2ca 100644
>>> --- a/block/qcow2.c
>>> +++ b/block/qcow2.c
>>> @@ -838,7 +838,15 @@ static int qcow2_change_backing_file(BlockDriverState *bs,
>>>       return qcow2_update_ext_header(bs, backing_file, backing_fmt);
>>>   }
>>>
>>> -static int preallocate(BlockDriverState *bs)
>>> +enum prealloc_mode {
>>> +    PREALLOC_OFF = 0,
>>> +    PREALLOC_METADATA,
>>> +    PREALLOC_FULL,
>>> +};
>>> +
>>> +#define IO_BUF_SIZE (2 * 1024 * 1024)
>>> +
>>> +static int preallocate(BlockDriverState *bs, enum prealloc_mode mode)
>>>   {
>>>       uint64_t nb_sectors;
>>>       uint64_t offset;
>>> @@ -846,11 +854,14 @@ static int preallocate(BlockDriverState *bs)
>>>       int ret;
>>>       QCowL2Meta meta;
>>>
>>> +    assert(mode != PREALLOC_OFF);
>>> +
>>>       nb_sectors = bdrv_getlength(bs)>>  9;
>>>       offset = 0;
>>>       QLIST_INIT(&meta.dependent_requests);
>>>       meta.cluster_offset = 0;
>>>
>>> +    /* First allocate metadata in _really_ big chunks */
>>>       while (nb_sectors) {
>>>           num = MIN(nb_sectors, INT_MAX>>  9);
>>>           ret = qcow2_alloc_cluster_offset(bs, offset, 0, num,&num,&meta);
>>> @@ -874,6 +885,28 @@ static int preallocate(BlockDriverState *bs)
>>>           offset += num<<  9;
>>>       }
>>>
>>> +    /* Then write zeros to the cluster data, if requested */
>>> +    if (mode == PREALLOC_FULL) {
>>> +        void *buf = qemu_mallocz(IO_BUF_SIZE);
>>> +
>>> +        nb_sectors = bdrv_getlength(bs)>>  BDRV_SECTOR_BITS;
>>> +        offset = 0;
>>> +
>>> +        while (nb_sectors) {
>>> +            num = MIN(nb_sectors, IO_BUF_SIZE / BDRV_SECTOR_SIZE);
>>> +            ret = bdrv_write(bs, offset>>  BDRV_SECTOR_BITS, buf, num);
>>>      
>> Is there a way you can calculate the total size of the qcow2
>> file upfront, and just use a single posix_fallocate() call to
>> do the zero-filled allocation of all the data blocks. It is
>> many orders of magnitude faster than truly writing blocks of
>> zero'd data on modern filesystems.  I guess if you're using
>> compression or encryption, we'd really have to go the slow
>> path, but for regular usage it'd be better to take a fast
>> path.
>>    
> 
> Hrm, so is the intention here to avoid sparse files or to not assume 
> zero-fill?

The primary intention (as I understood our feature request ;-)) was to
avoid sparse files. In its current implementation you could also use it
to overwrite any left-over data. Maybe that's a point for not having a
bdrv_fallocate like Daniel suggested, but rather a bdrv_zero_init, which
could fallocate on files and write zeros on a block device.

Kevin
Daniel P. Berrange - Jan. 28, 2011, 10:49 a.m.
On Fri, Jan 28, 2011 at 09:22:47AM +0100, Kevin Wolf wrote:
> Am 27.01.2011 16:58, schrieb Daniel P. Berrange:
> > On Thu, Jan 27, 2011 at 04:52:14PM +0100, Kevin Wolf wrote:
> >> This adds a preallocation=full mode to qcow2 image creation, which does not
> >> only allocate metadata for the whole image, but also writes zeros to it,
> >> creating a non-sparse image file.
> >>
> >> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> 
> > Is there a way you can calculate the total size of the qcow2
> > file upfront, and just use a single posix_fallocate() call to
> > do the zero-filled allocation of all the data blocks. It is
> > many orders of magnitude faster than truly writing blocks of
> > zero'd data on modern filesystems.  I guess if you're using
> > compression or encryption, we'd really have to go the slow
> > path, but for regular usage it'd be better to take a fast
> > path.
> 
> Encryption doesn't really change anything with respect to cluster
> allocations, but combining compression with preallocation doesn't make
> any sense. We should probably forbid that.
> 
> To get the size of the image, it should be enough to get the offset of
> the last cluster as the allocation is done sequentially. However, we
> don't have a bdrv_fallocate (yet). I'm not sure how to emulate this for
> drivers that don't support it directly, but maybe we could just ignore
> it for them.

FWIW in libvirt code we ended up with 'posix_fallocate()' as our
first choice. If that wasn't available, then we do a sequence of
'ftruncate()+mmap()+memset()+munmap()' for the region as second
choice. And if mmap doesn't exist, as the catch-all portable
option for any OS we do a write() of 1MB chunks in a loop.

Regards,
Daniel
Stefan Hajnoczi - Feb. 4, 2011, 10:59 a.m.
On Thu, Jan 27, 2011 at 11:45:55AM -0600, Anthony Liguori wrote:
> On 01/27/2011 09:52 AM, Kevin Wolf wrote:
> >This adds a preallocation=full mode to qcow2 image creation, which does not
> >only allocate metadata for the whole image, but also writes zeros to it,
> >creating a non-sparse image file.
> 
> The writing zeros bit is in order to support physical devices?
> Would it be better to have a flag in BlockDriverState that indicated
> whether uninitialized sectors could be assumed to be zero filled and
> key off of that?

There is already:
int bdrv_has_zero_init(BlockDriverState *bs);

Stefan

Patch

diff --git a/block/qcow2.c b/block/qcow2.c
index a1773e4..90cf2ca 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -838,7 +838,15 @@  static int qcow2_change_backing_file(BlockDriverState *bs,
     return qcow2_update_ext_header(bs, backing_file, backing_fmt);
 }
 
-static int preallocate(BlockDriverState *bs)
+enum prealloc_mode {
+    PREALLOC_OFF = 0,
+    PREALLOC_METADATA,
+    PREALLOC_FULL,
+};
+
+#define IO_BUF_SIZE (2 * 1024 * 1024)
+
+static int preallocate(BlockDriverState *bs, enum prealloc_mode mode)
 {
     uint64_t nb_sectors;
     uint64_t offset;
@@ -846,11 +854,14 @@  static int preallocate(BlockDriverState *bs)
     int ret;
     QCowL2Meta meta;
 
+    assert(mode != PREALLOC_OFF);
+
     nb_sectors = bdrv_getlength(bs) >> 9;
     offset = 0;
     QLIST_INIT(&meta.dependent_requests);
     meta.cluster_offset = 0;
 
+    /* First allocate metadata in _really_ big chunks */
     while (nb_sectors) {
         num = MIN(nb_sectors, INT_MAX >> 9);
         ret = qcow2_alloc_cluster_offset(bs, offset, 0, num, &num, &meta);
@@ -874,6 +885,28 @@  static int preallocate(BlockDriverState *bs)
         offset += num << 9;
     }
 
+    /* Then write zeros to the cluster data, if requested */
+    if (mode == PREALLOC_FULL) {
+        void *buf = qemu_mallocz(IO_BUF_SIZE);
+
+        nb_sectors = bdrv_getlength(bs) >> BDRV_SECTOR_BITS;
+        offset = 0;
+
+        while (nb_sectors) {
+            num = MIN(nb_sectors, IO_BUF_SIZE / BDRV_SECTOR_SIZE);
+            ret = bdrv_write(bs, offset >> BDRV_SECTOR_BITS, buf, num);
+            if (ret < 0) {
+                qemu_free(buf);
+                return ret;
+            }
+
+            nb_sectors -= num;
+            offset += num << 9;
+        }
+
+        qemu_free(buf);
+    }
+
     /*
      * It is expected that the image file is large enough to actually contain
      * all of the allocated clusters (otherwise we get failing reads after
@@ -1006,7 +1039,7 @@  static int qcow2_create2(const char *filename, int64_t total_size,
 
     /* And if we're supposed to preallocate metadata, do that now */
     if (prealloc) {
-        ret = preallocate(bs);
+        ret = preallocate(bs, prealloc);
         if (ret < 0) {
             goto out;
         }
@@ -1043,9 +1076,11 @@  static int qcow2_create(const char *filename, QEMUOptionParameter *options)
             }
         } else if (!strcmp(options->name, BLOCK_OPT_PREALLOC)) {
             if (!options->value.s || !strcmp(options->value.s, "off")) {
-                prealloc = 0;
+                prealloc = PREALLOC_OFF;
             } else if (!strcmp(options->value.s, "metadata")) {
-                prealloc = 1;
+                prealloc = PREALLOC_METADATA;
+            } else if (!strcmp(options->value.s, "full")) {
+                prealloc = PREALLOC_FULL;
             } else {
                 fprintf(stderr, "Invalid preallocation mode: '%s'\n",
                     options->value.s);
@@ -1336,7 +1371,7 @@  static QEMUOptionParameter qcow2_create_options[] = {
     {
         .name = BLOCK_OPT_PREALLOC,
         .type = OPT_STRING,
-        .help = "Preallocation mode (allowed values: off, metadata)"
+        .help = "Preallocation mode (allowed values: off, metadata, full)"
     },
     { NULL }
 };