Message ID | 1419931250-19259-2-git-send-email-den@openvz.org |
---|---|
State | New |
Headers | show |
On 30.12.2014 10:20, Denis V. Lunev wrote: > bdrv_co_do_write_zeroes split writes using bl.max_write_zeroes or > 16 MiB as a chunk size. This is implemented in this way to tolerate > buggy block backends which do not accept too big requests. > > Though if the bdrv_co_write_zeroes callback is not good enough, we > fallback to write data explicitely using bdrv_co_writev and we > create buffer to accomodate zeroes inside. The size of this buffer > is the size of the chunk. Thus if the underlying layer will have > bl.max_write_zeroes high enough, f.e. 4 GiB, the allocation can fail. > > Actually, there is no need to allocate such a big amount of memory. > We could simply allocate 1 MiB buffer and create iovec, which will > point to the same memory. > > Signed-off-by: Denis V. Lunev <den@openvz.org> > CC: Kevin Wolf <kwolf@redhat.com> > CC: Stefan Hajnoczi <stefanha@redhat.com> > CC: Peter Lieven <pl@kamp.de> > --- > block.c | 35 ++++++++++++++++++++++++----------- > 1 file changed, 24 insertions(+), 11 deletions(-) > > diff --git a/block.c b/block.c > index 4165d42..d69c121 100644 > --- a/block.c > +++ b/block.c > @@ -3173,14 +3173,18 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, > * of 32768 512-byte sectors (16 MiB) per request. > */ > #define MAX_WRITE_ZEROES_DEFAULT 32768 > +/* allocate iovec with zeroes using 1 MiB chunks to avoid to big allocations */ > +#define MAX_ZEROES_CHUNK (1024 * 1024) > > static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, > int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) > { > BlockDriver *drv = bs->drv; > QEMUIOVector qiov; > - struct iovec iov = {0}; > int ret = 0; > + void *chunk = NULL; > + > + qemu_iovec_init(&qiov, 0); > > int max_write_zeroes = bs->bl.max_write_zeroes ? 
> bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT; > @@ -3217,27 +3221,35 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, > } > > if (ret == -ENOTSUP) { > + int64_t num_bytes = (int64_t)num << BDRV_SECTOR_BITS; > + int chunk_size = MIN(MAX_ZEROES_CHUNK, num_bytes); > + > /* Fall back to bounce buffer if write zeroes is unsupported */ > - iov.iov_len = num * BDRV_SECTOR_SIZE; > - if (iov.iov_base == NULL) { > - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); > - if (iov.iov_base == NULL) { > + if (chunk == NULL) { > + chunk = qemu_try_blockalign(bs, chunk_size); > + if (chunk == NULL) { > ret = -ENOMEM; > goto fail; > } > - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); > + memset(chunk, 0, chunk_size); > + } > + > + while (num_bytes > 0) { > + int to_add = MIN(chunk_size, num_bytes); > + qemu_iovec_add(&qiov, chunk, to_add); This can and likely will fail for big num_bytes if you exceed IOV_MAX vectors. I would stick to the old method and limit the num to a reasonable value e.g. MAX_WRITE_ZEROES_DEFAULT. This becomes necessary as you set INT_MAX for max_write_zeroes. That hasn't been considered before in the original patch. Peter > + num_bytes -= to_add; > } > - qemu_iovec_init_external(&qiov, &iov, 1); > > ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); > > /* Keep bounce buffer around if it is big enough for all > * all future requests. > */ > - if (num < max_write_zeroes) { > - qemu_vfree(iov.iov_base); > - iov.iov_base = NULL; > + if (chunk_size != MAX_ZEROES_CHUNK) { > + qemu_vfree(chunk); > + chunk = NULL; > } > + qemu_iovec_reset(&qiov); > } > > sector_num += num; > @@ -3245,7 +3257,8 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, > } > > fail: > - qemu_vfree(iov.iov_base); > + qemu_iovec_destroy(&qiov); > + qemu_vfree(chunk); > return ret; > } >
On 05/01/15 10:34, Peter Lieven wrote: > On 30.12.2014 10:20, Denis V. Lunev wrote: >> bdrv_co_do_write_zeroes split writes using bl.max_write_zeroes or >> 16 MiB as a chunk size. This is implemented in this way to tolerate >> buggy block backends which do not accept too big requests. >> >> Though if the bdrv_co_write_zeroes callback is not good enough, we >> fallback to write data explicitely using bdrv_co_writev and we >> create buffer to accomodate zeroes inside. The size of this buffer >> is the size of the chunk. Thus if the underlying layer will have >> bl.max_write_zeroes high enough, f.e. 4 GiB, the allocation can fail. >> >> Actually, there is no need to allocate such a big amount of memory. >> We could simply allocate 1 MiB buffer and create iovec, which will >> point to the same memory. >> >> Signed-off-by: Denis V. Lunev <den@openvz.org> >> CC: Kevin Wolf <kwolf@redhat.com> >> CC: Stefan Hajnoczi <stefanha@redhat.com> >> CC: Peter Lieven <pl@kamp.de> >> --- >> block.c | 35 ++++++++++++++++++++++++----------- >> 1 file changed, 24 insertions(+), 11 deletions(-) >> >> diff --git a/block.c b/block.c >> index 4165d42..d69c121 100644 >> --- a/block.c >> +++ b/block.c >> @@ -3173,14 +3173,18 @@ int coroutine_fn >> bdrv_co_copy_on_readv(BlockDriverState *bs, >> * of 32768 512-byte sectors (16 MiB) per request. >> */ >> #define MAX_WRITE_ZEROES_DEFAULT 32768 >> +/* allocate iovec with zeroes using 1 MiB chunks to avoid to big >> allocations */ >> +#define MAX_ZEROES_CHUNK (1024 * 1024) >> static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState >> *bs, >> int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) >> { >> BlockDriver *drv = bs->drv; >> QEMUIOVector qiov; >> - struct iovec iov = {0}; >> int ret = 0; >> + void *chunk = NULL; >> + >> + qemu_iovec_init(&qiov, 0); >> int max_write_zeroes = bs->bl.max_write_zeroes ? 
>> bs->bl.max_write_zeroes : >> MAX_WRITE_ZEROES_DEFAULT; >> @@ -3217,27 +3221,35 @@ static int coroutine_fn >> bdrv_co_do_write_zeroes(BlockDriverState *bs, >> } >> if (ret == -ENOTSUP) { >> + int64_t num_bytes = (int64_t)num << BDRV_SECTOR_BITS; >> + int chunk_size = MIN(MAX_ZEROES_CHUNK, num_bytes); >> + >> /* Fall back to bounce buffer if write zeroes is >> unsupported */ >> - iov.iov_len = num * BDRV_SECTOR_SIZE; >> - if (iov.iov_base == NULL) { >> - iov.iov_base = qemu_try_blockalign(bs, num * >> BDRV_SECTOR_SIZE); >> - if (iov.iov_base == NULL) { >> + if (chunk == NULL) { >> + chunk = qemu_try_blockalign(bs, chunk_size); >> + if (chunk == NULL) { >> ret = -ENOMEM; >> goto fail; >> } >> - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); >> + memset(chunk, 0, chunk_size); >> + } >> + >> + while (num_bytes > 0) { >> + int to_add = MIN(chunk_size, num_bytes); >> + qemu_iovec_add(&qiov, chunk, to_add); > > This can and likely will fail for big num_bytes if you exceed IOV_MAX > vectors. > > I would stick to the old method and limit the num to a reasonable > value e.g. MAX_WRITE_ZEROES_DEFAULT. > This becomes necessary as you set INT_MAX for max_write_zeroes. That > hasn't been considered before in > the original patch. > > Peter > hmm. You are right, but I think that it would be better to limit iovec size to 32 and this will solve the problem. Allocation of 32 Mb could be a real problem on loaded system could be a problem. What do you think on this? May be we could consider 16 as a limit...
diff --git a/block.c b/block.c index 4165d42..d69c121 100644 --- a/block.c +++ b/block.c @@ -3173,14 +3173,18 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, * of 32768 512-byte sectors (16 MiB) per request. */ #define MAX_WRITE_ZEROES_DEFAULT 32768 +/* allocate iovec with zeroes using 1 MiB chunks to avoid to big allocations */ +#define MAX_ZEROES_CHUNK (1024 * 1024) static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { BlockDriver *drv = bs->drv; QEMUIOVector qiov; - struct iovec iov = {0}; int ret = 0; + void *chunk = NULL; + + qemu_iovec_init(&qiov, 0); int max_write_zeroes = bs->bl.max_write_zeroes ? bs->bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT; @@ -3217,27 +3221,35 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, } if (ret == -ENOTSUP) { + int64_t num_bytes = (int64_t)num << BDRV_SECTOR_BITS; + int chunk_size = MIN(MAX_ZEROES_CHUNK, num_bytes); + /* Fall back to bounce buffer if write zeroes is unsupported */ - iov.iov_len = num * BDRV_SECTOR_SIZE; - if (iov.iov_base == NULL) { - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); - if (iov.iov_base == NULL) { + if (chunk == NULL) { + chunk = qemu_try_blockalign(bs, chunk_size); + if (chunk == NULL) { ret = -ENOMEM; goto fail; } - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + memset(chunk, 0, chunk_size); + } + + while (num_bytes > 0) { + int to_add = MIN(chunk_size, num_bytes); + qemu_iovec_add(&qiov, chunk, to_add); + num_bytes -= to_add; } - qemu_iovec_init_external(&qiov, &iov, 1); ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); /* Keep bounce buffer around if it is big enough for all * all future requests. 
*/ - if (num < max_write_zeroes) { - qemu_vfree(iov.iov_base); - iov.iov_base = NULL; + if (chunk_size != MAX_ZEROES_CHUNK) { + qemu_vfree(chunk); + chunk = NULL; } + qemu_iovec_reset(&qiov); } sector_num += num; @@ -3245,7 +3257,8 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, } fail: - qemu_vfree(iov.iov_base); + qemu_iovec_destroy(&qiov); + qemu_vfree(chunk); return ret; }
bdrv_co_do_write_zeroes split writes using bl.max_write_zeroes or 16 MiB as a chunk size. This is implemented in this way to tolerate buggy block backends which do not accept too big requests. Though if the bdrv_co_write_zeroes callback is not good enough, we fall back to writing data explicitly using bdrv_co_writev and we create a buffer to accommodate zeroes inside. The size of this buffer is the size of the chunk. Thus if the underlying layer will have bl.max_write_zeroes high enough, f.e. 4 GiB, the allocation can fail. Actually, there is no need to allocate such a big amount of memory. We could simply allocate 1 MiB buffer and create iovec, which will point to the same memory. Signed-off-by: Denis V. Lunev <den@openvz.org> CC: Kevin Wolf <kwolf@redhat.com> CC: Stefan Hajnoczi <stefanha@redhat.com> CC: Peter Lieven <pl@kamp.de> --- block.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-)