diff mbox

[1/8] block: prepare bdrv_co_do_write_zeroes to deal with large bl.max_write_zeroes

Message ID 1419931250-19259-2-git-send-email-den@openvz.org
State New
Headers show

Commit Message

Denis V. Lunev Dec. 30, 2014, 9:20 a.m. UTC
bdrv_co_do_write_zeroes split writes using bl.max_write_zeroes or
16 MiB as a chunk size. This is implemented in this way to tolerate
buggy block backends which do not accept too big requests.

Though if the bdrv_co_write_zeroes callback is not good enough, we
fall back to writing data explicitly using bdrv_co_writev and we
create a buffer to accommodate the zeroes. The size of this buffer
is the size of the chunk. Thus if the underlying layer has
bl.max_write_zeroes high enough, e.g. 4 GiB, the allocation can fail.

Actually, there is no need to allocate such a big amount of memory.
We could simply allocate a 1 MiB buffer and create an iovec whose
entries all point to the same memory.

Signed-off-by: Denis V. Lunev <den@openvz.org>
CC: Kevin Wolf <kwolf@redhat.com>
CC: Stefan Hajnoczi <stefanha@redhat.com>
CC: Peter Lieven <pl@kamp.de>
---
 block.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

Comments

Peter Lieven Jan. 5, 2015, 7:34 a.m. UTC | #1
On 30.12.2014 10:20, Denis V. Lunev wrote:
> bdrv_co_do_write_zeroes split writes using bl.max_write_zeroes or
> 16 MiB as a chunk size. This is implemented in this way to tolerate
> buggy block backends which do not accept too big requests.
>
> Though if the bdrv_co_write_zeroes callback is not good enough, we
> fallback to write data explicitely using bdrv_co_writev and we
> create buffer to accomodate zeroes inside. The size of this buffer
> is the size of the chunk. Thus if the underlying layer will have
> bl.max_write_zeroes high enough, f.e. 4 GiB, the allocation can fail.
>
> Actually, there is no need to allocate such a big amount of memory.
> We could simply allocate 1 MiB buffer and create iovec, which will
> point to the same memory.
>
> Signed-off-by: Denis V. Lunev <den@openvz.org>
> CC: Kevin Wolf <kwolf@redhat.com>
> CC: Stefan Hajnoczi <stefanha@redhat.com>
> CC: Peter Lieven <pl@kamp.de>
> ---
>   block.c | 35 ++++++++++++++++++++++++-----------
>   1 file changed, 24 insertions(+), 11 deletions(-)
>
> diff --git a/block.c b/block.c
> index 4165d42..d69c121 100644
> --- a/block.c
> +++ b/block.c
> @@ -3173,14 +3173,18 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
>    * of 32768 512-byte sectors (16 MiB) per request.
>    */
>   #define MAX_WRITE_ZEROES_DEFAULT 32768
> +/* allocate iovec with zeroes using 1 MiB chunks to avoid to big allocations */
> +#define MAX_ZEROES_CHUNK (1024 * 1024)
>   
>   static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
>       int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
>   {
>       BlockDriver *drv = bs->drv;
>       QEMUIOVector qiov;
> -    struct iovec iov = {0};
>       int ret = 0;
> +    void *chunk = NULL;
> +
> +    qemu_iovec_init(&qiov, 0);
>   
>       int max_write_zeroes = bs->bl.max_write_zeroes ?
>                              bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
> @@ -3217,27 +3221,35 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
>           }
>   
>           if (ret == -ENOTSUP) {
> +            int64_t num_bytes = (int64_t)num << BDRV_SECTOR_BITS;
> +            int chunk_size = MIN(MAX_ZEROES_CHUNK, num_bytes);
> +
>               /* Fall back to bounce buffer if write zeroes is unsupported */
> -            iov.iov_len = num * BDRV_SECTOR_SIZE;
> -            if (iov.iov_base == NULL) {
> -                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
> -                if (iov.iov_base == NULL) {
> +            if (chunk == NULL) {
> +                chunk = qemu_try_blockalign(bs, chunk_size);
> +                if (chunk == NULL) {
>                       ret = -ENOMEM;
>                       goto fail;
>                   }
> -                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
> +                memset(chunk, 0, chunk_size);
> +            }
> +
> +            while (num_bytes > 0) {
> +                int to_add = MIN(chunk_size, num_bytes);
> +                qemu_iovec_add(&qiov, chunk, to_add);

This can and likely will fail for big num_bytes if you exceed IOV_MAX vectors.

I would stick to the old method and limit the num to a reasonable value e.g. MAX_WRITE_ZEROES_DEFAULT.
This becomes necessary as you set INT_MAX for max_write_zeroes. That hasn't been considered before in
the original patch.

Peter

> +                num_bytes -= to_add;
>               }
> -            qemu_iovec_init_external(&qiov, &iov, 1);
>   
>               ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
>   
>               /* Keep bounce buffer around if it is big enough for all
>                * all future requests.
>                */
> -            if (num < max_write_zeroes) {
> -                qemu_vfree(iov.iov_base);
> -                iov.iov_base = NULL;
> +            if (chunk_size != MAX_ZEROES_CHUNK) {
> +                qemu_vfree(chunk);
> +                chunk = NULL;
>               }
> +            qemu_iovec_reset(&qiov);
>           }
>   
>           sector_num += num;
> @@ -3245,7 +3257,8 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
>       }
>   
>   fail:
> -    qemu_vfree(iov.iov_base);
> +    qemu_iovec_destroy(&qiov);
> +    qemu_vfree(chunk);
>       return ret;
>   }
>
Denis V. Lunev Jan. 5, 2015, 11:06 a.m. UTC | #2
On 05/01/15 10:34, Peter Lieven wrote:
> On 30.12.2014 10:20, Denis V. Lunev wrote:
>> bdrv_co_do_write_zeroes split writes using bl.max_write_zeroes or
>> 16 MiB as a chunk size. This is implemented in this way to tolerate
>> buggy block backends which do not accept too big requests.
>>
>> Though if the bdrv_co_write_zeroes callback is not good enough, we
>> fallback to write data explicitely using bdrv_co_writev and we
>> create buffer to accomodate zeroes inside. The size of this buffer
>> is the size of the chunk. Thus if the underlying layer will have
>> bl.max_write_zeroes high enough, f.e. 4 GiB, the allocation can fail.
>>
>> Actually, there is no need to allocate such a big amount of memory.
>> We could simply allocate 1 MiB buffer and create iovec, which will
>> point to the same memory.
>>
>> Signed-off-by: Denis V. Lunev <den@openvz.org>
>> CC: Kevin Wolf <kwolf@redhat.com>
>> CC: Stefan Hajnoczi <stefanha@redhat.com>
>> CC: Peter Lieven <pl@kamp.de>
>> ---
>>   block.c | 35 ++++++++++++++++++++++++-----------
>>   1 file changed, 24 insertions(+), 11 deletions(-)
>>
>> diff --git a/block.c b/block.c
>> index 4165d42..d69c121 100644
>> --- a/block.c
>> +++ b/block.c
>> @@ -3173,14 +3173,18 @@ int coroutine_fn 
>> bdrv_co_copy_on_readv(BlockDriverState *bs,
>>    * of 32768 512-byte sectors (16 MiB) per request.
>>    */
>>   #define MAX_WRITE_ZEROES_DEFAULT 32768
>> +/* allocate iovec with zeroes using 1 MiB chunks to avoid to big 
>> allocations */
>> +#define MAX_ZEROES_CHUNK (1024 * 1024)
>>     static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState 
>> *bs,
>>       int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
>>   {
>>       BlockDriver *drv = bs->drv;
>>       QEMUIOVector qiov;
>> -    struct iovec iov = {0};
>>       int ret = 0;
>> +    void *chunk = NULL;
>> +
>> +    qemu_iovec_init(&qiov, 0);
>>         int max_write_zeroes = bs->bl.max_write_zeroes ?
>>                              bs->bl.max_write_zeroes : 
>> MAX_WRITE_ZEROES_DEFAULT;
>> @@ -3217,27 +3221,35 @@ static int coroutine_fn 
>> bdrv_co_do_write_zeroes(BlockDriverState *bs,
>>           }
>>             if (ret == -ENOTSUP) {
>> +            int64_t num_bytes = (int64_t)num << BDRV_SECTOR_BITS;
>> +            int chunk_size = MIN(MAX_ZEROES_CHUNK, num_bytes);
>> +
>>               /* Fall back to bounce buffer if write zeroes is 
>> unsupported */
>> -            iov.iov_len = num * BDRV_SECTOR_SIZE;
>> -            if (iov.iov_base == NULL) {
>> -                iov.iov_base = qemu_try_blockalign(bs, num * 
>> BDRV_SECTOR_SIZE);
>> -                if (iov.iov_base == NULL) {
>> +            if (chunk == NULL) {
>> +                chunk = qemu_try_blockalign(bs, chunk_size);
>> +                if (chunk == NULL) {
>>                       ret = -ENOMEM;
>>                       goto fail;
>>                   }
>> -                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
>> +                memset(chunk, 0, chunk_size);
>> +            }
>> +
>> +            while (num_bytes > 0) {
>> +                int to_add = MIN(chunk_size, num_bytes);
>> +                qemu_iovec_add(&qiov, chunk, to_add);
>
> This can and likely will fail for big num_bytes if you exceed IOV_MAX 
> vectors.
>
> I would stick to the old method and limit the num to a reasonable 
> value e.g. MAX_WRITE_ZEROES_DEFAULT.
> This becomes necessary as you set INT_MAX for max_write_zeroes. That 
> hasn't been considered before in
> the original patch.
>
> Peter
>

hmm. You are right, but I think it would be better to limit the iovec
size to 32 entries, which would solve the problem. Allocating 32 MiB
on a loaded system could be a real problem.

What do you think of this? Maybe we could consider 16 as the limit...
diff mbox

Patch

diff --git a/block.c b/block.c
index 4165d42..d69c121 100644
--- a/block.c
+++ b/block.c
@@ -3173,14 +3173,18 @@  int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
  * of 32768 512-byte sectors (16 MiB) per request.
  */
 #define MAX_WRITE_ZEROES_DEFAULT 32768
+/* allocate iovec with zeroes using 1 MiB chunks to avoid to big allocations */
+#define MAX_ZEROES_CHUNK (1024 * 1024)
 
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
     BlockDriver *drv = bs->drv;
     QEMUIOVector qiov;
-    struct iovec iov = {0};
     int ret = 0;
+    void *chunk = NULL;
+
+    qemu_iovec_init(&qiov, 0);
 
     int max_write_zeroes = bs->bl.max_write_zeroes ?
                            bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
@@ -3217,27 +3221,35 @@  static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
         }
 
         if (ret == -ENOTSUP) {
+            int64_t num_bytes = (int64_t)num << BDRV_SECTOR_BITS;
+            int chunk_size = MIN(MAX_ZEROES_CHUNK, num_bytes);
+
             /* Fall back to bounce buffer if write zeroes is unsupported */
-            iov.iov_len = num * BDRV_SECTOR_SIZE;
-            if (iov.iov_base == NULL) {
-                iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE);
-                if (iov.iov_base == NULL) {
+            if (chunk == NULL) {
+                chunk = qemu_try_blockalign(bs, chunk_size);
+                if (chunk == NULL) {
                     ret = -ENOMEM;
                     goto fail;
                 }
-                memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE);
+                memset(chunk, 0, chunk_size);
+            }
+
+            while (num_bytes > 0) {
+                int to_add = MIN(chunk_size, num_bytes);
+                qemu_iovec_add(&qiov, chunk, to_add);
+                num_bytes -= to_add;
             }
-            qemu_iovec_init_external(&qiov, &iov, 1);
 
             ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov);
 
             /* Keep bounce buffer around if it is big enough for all
              * all future requests.
              */
-            if (num < max_write_zeroes) {
-                qemu_vfree(iov.iov_base);
-                iov.iov_base = NULL;
+            if (chunk_size != MAX_ZEROES_CHUNK) {
+                qemu_vfree(chunk);
+                chunk = NULL;
             }
+            qemu_iovec_reset(&qiov);
         }
 
         sector_num += num;
@@ -3245,7 +3257,8 @@  static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     }
 
 fail:
-    qemu_vfree(iov.iov_base);
+    qemu_iovec_destroy(&qiov);
+    qemu_vfree(chunk);
     return ret;
 }