[v6] migration/block: use blk_pwrite_zeroes for each zero cluster

Message ID 1492050868-16200-1-git-send-email-lidongchen@tencent.com
State New

Commit Message

858585 jemmy April 13, 2017, 2:34 a.m. UTC
From: Lidong Chen <lidongchen@tencent.com>

BLOCK_SIZE is (1 << 20), qcow2 cluster size is 65536 by default,
this may cause the qcow2 file size to be bigger after migration.
This patch checks each cluster, using blk_pwrite_zeroes for each
zero cluster.

Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
Signed-off-by: Lidong Chen <lidongchen@tencent.com>
---
v6 changelog:
Fix up some grammar in the comment.
---
 migration/block.c | 35 +++++++++++++++++++++++++++++++++--
 1 file changed, 33 insertions(+), 2 deletions(-)

Comments

Stefan Hajnoczi April 13, 2017, 2:16 p.m. UTC | #1
On Thu, Apr 13, 2017 at 10:34:28AM +0800, jemmy858585@gmail.com wrote:
> From: Lidong Chen <lidongchen@tencent.com>
> 
> BLOCK_SIZE is (1 << 20), qcow2 cluster size is 65536 by default,
> this may cause the qcow2 file size to be bigger after migration.
> This patch checks each cluster, using blk_pwrite_zeroes for each
> zero cluster.
> 
> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
> ---
> v6 changelog:
> Fix up some grammar in the comment.
> ---
>  migration/block.c | 35 +++++++++++++++++++++++++++++++++--
>  1 file changed, 33 insertions(+), 2 deletions(-)

I fixed the following gcc warning when merging the patch:

  migration/block.c:958:25: error: ‘cluster_size’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
                            buffer_is_zero(cur_buf, cluster_size)) {
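
(One plausible way to silence that warning, not necessarily the exact fixup
applied while merging, is to give the variable a safe default at its
declaration, since gcc cannot prove that the device-setup branch which assigns
it always runs before the write path:)

    BlockDriverInfo bdi;
    int cluster_size = BLOCK_SIZE;  /* default; refined from bdi.cluster_size
                                     * once the device has been opened */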

Thanks, applied to my block-next tree:
https://github.com/stefanha/qemu/commits/block-next

Stefan
858585 jemmy April 14, 2017, 12:57 a.m. UTC | #2
On Thu, Apr 13, 2017 at 10:16 PM, Stefan Hajnoczi <stefanha@redhat.com> wrote:
> On Thu, Apr 13, 2017 at 10:34:28AM +0800, jemmy858585@gmail.com wrote:
>> From: Lidong Chen <lidongchen@tencent.com>
>>
>> BLOCK_SIZE is (1 << 20), qcow2 cluster size is 65536 by default,
>> this may cause the qcow2 file size to be bigger after migration.
>> This patch checks each cluster, using blk_pwrite_zeroes for each
>> zero cluster.
>>
>> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
>> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
>> ---
>> v6 changelog:
>> Fix up some grammar in the comment.
>> ---
>>  migration/block.c | 35 +++++++++++++++++++++++++++++++++--
>>  1 file changed, 33 insertions(+), 2 deletions(-)
>
> I fixed the following gcc warning when merging the patch:
>
>   migration/block.c:958:25: error: ‘cluster_size’ may be used uninitialized in this function [-Werror=maybe-uninitialized]
>                             buffer_is_zero(cur_buf, cluster_size)) {

Thanks, I will check for gcc warnings next time.

>
> Thanks, applied to my block-next tree:
> https://github.com/stefanha/qemu/commits/block-next
>
> Stefan
Fam Zheng April 14, 2017, 6 a.m. UTC | #3
On Thu, 04/13 10:34, jemmy858585@gmail.com wrote:
> From: Lidong Chen <lidongchen@tencent.com>
> 
> BLOCK_SIZE is (1 << 20), qcow2 cluster size is 65536 by default,
> this may cause the qcow2 file size to be bigger after migration.
> This patch checks each cluster, using blk_pwrite_zeroes for each
> zero cluster.
> 
> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
> ---
> v6 changelog:
> Fix up some grammar in the comment.
> ---
>  migration/block.c | 35 +++++++++++++++++++++++++++++++++--
>  1 file changed, 33 insertions(+), 2 deletions(-)
> 
> diff --git a/migration/block.c b/migration/block.c
> index 7734ff7..41c7a55 100644
> --- a/migration/block.c
> +++ b/migration/block.c
> @@ -885,6 +885,8 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>      int64_t total_sectors = 0;
>      int nr_sectors;
>      int ret;
> +    BlockDriverInfo bdi;
> +    int cluster_size;
>  
>      do {
>          addr = qemu_get_be64(f);
> @@ -919,6 +921,15 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>                      error_report_err(local_err);
>                      return -EINVAL;
>                  }
> +
> +                ret = bdrv_get_info(blk_bs(blk), &bdi);
> +                if (ret == 0 && bdi.cluster_size > 0 &&
> +                    bdi.cluster_size <= BLOCK_SIZE &&
> +                    BLOCK_SIZE % bdi.cluster_size == 0) {
> +                    cluster_size = bdi.cluster_size;
> +                } else {
> +                    cluster_size = BLOCK_SIZE;
> +                }
>              }
>  
>              if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
> @@ -932,10 +943,30 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>                                          nr_sectors * BDRV_SECTOR_SIZE,
>                                          BDRV_REQ_MAY_UNMAP);
>              } else {
> +                int i;
> +                int64_t cur_addr;
> +                uint8_t *cur_buf;
> +
>                  buf = g_malloc(BLOCK_SIZE);
>                  qemu_get_buffer(f, buf, BLOCK_SIZE);
> -                ret = blk_pwrite(blk, addr * BDRV_SECTOR_SIZE, buf,
> -                                 nr_sectors * BDRV_SECTOR_SIZE, 0);
> +                for (i = 0; i < BLOCK_SIZE / cluster_size; i++) {
> +                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
> +                    cur_buf = buf + i * cluster_size;
> +
> +                    if ((!block_mig_state.zero_blocks ||
> +                        cluster_size < BLOCK_SIZE) &&
> +                        buffer_is_zero(cur_buf, cluster_size)) {
> +                        ret = blk_pwrite_zeroes(blk, cur_addr,
> +                                                cluster_size,
> +                                                BDRV_REQ_MAY_UNMAP);
> +                    } else {
> +                        ret = blk_pwrite(blk, cur_addr, cur_buf,
> +                                         cluster_size, 0);
> +                    }
> +                    if (ret < 0) {
> +                        break;
> +                    }
> +                }
>                  g_free(buf);
>              }

Sorry for asking this question so late, but, before it gets too late: did you
evaluate the performance impact of this change under a real-world workload?

Effectively, if no cluster is zero, this patch still splits a big write into
small ones, which is the opposite of the usual performance optimization (i.e.
trying to coalesce requests).

Fam
858585 jemmy April 14, 2017, 6:30 a.m. UTC | #4
On Fri, Apr 14, 2017 at 2:00 PM, Fam Zheng <famz@redhat.com> wrote:
> On Thu, 04/13 10:34, jemmy858585@gmail.com wrote:
>> From: Lidong Chen <lidongchen@tencent.com>
>>
>> BLOCK_SIZE is (1 << 20), qcow2 cluster size is 65536 by default,
>> this may cause the qcow2 file size to be bigger after migration.
>> This patch checks each cluster, using blk_pwrite_zeroes for each
>> zero cluster.
>>
>> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
>> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
>> ---
>> v6 changelog:
>> Fix up some grammar in the comment.
>> ---
>>  migration/block.c | 35 +++++++++++++++++++++++++++++++++--
>>  1 file changed, 33 insertions(+), 2 deletions(-)
>>
>> diff --git a/migration/block.c b/migration/block.c
>> index 7734ff7..41c7a55 100644
>> --- a/migration/block.c
>> +++ b/migration/block.c
>> @@ -885,6 +885,8 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>>      int64_t total_sectors = 0;
>>      int nr_sectors;
>>      int ret;
>> +    BlockDriverInfo bdi;
>> +    int cluster_size;
>>
>>      do {
>>          addr = qemu_get_be64(f);
>> @@ -919,6 +921,15 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>>                      error_report_err(local_err);
>>                      return -EINVAL;
>>                  }
>> +
>> +                ret = bdrv_get_info(blk_bs(blk), &bdi);
>> +                if (ret == 0 && bdi.cluster_size > 0 &&
>> +                    bdi.cluster_size <= BLOCK_SIZE &&
>> +                    BLOCK_SIZE % bdi.cluster_size == 0) {
>> +                    cluster_size = bdi.cluster_size;
>> +                } else {
>> +                    cluster_size = BLOCK_SIZE;
>> +                }
>>              }
>>
>>              if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
>> @@ -932,10 +943,30 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>>                                          nr_sectors * BDRV_SECTOR_SIZE,
>>                                          BDRV_REQ_MAY_UNMAP);
>>              } else {
>> +                int i;
>> +                int64_t cur_addr;
>> +                uint8_t *cur_buf;
>> +
>>                  buf = g_malloc(BLOCK_SIZE);
>>                  qemu_get_buffer(f, buf, BLOCK_SIZE);
>> -                ret = blk_pwrite(blk, addr * BDRV_SECTOR_SIZE, buf,
>> -                                 nr_sectors * BDRV_SECTOR_SIZE, 0);
>> +                for (i = 0; i < BLOCK_SIZE / cluster_size; i++) {
>> +                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
>> +                    cur_buf = buf + i * cluster_size;
>> +
>> +                    if ((!block_mig_state.zero_blocks ||
>> +                        cluster_size < BLOCK_SIZE) &&
>> +                        buffer_is_zero(cur_buf, cluster_size)) {
>> +                        ret = blk_pwrite_zeroes(blk, cur_addr,
>> +                                                cluster_size,
>> +                                                BDRV_REQ_MAY_UNMAP);
>> +                    } else {
>> +                        ret = blk_pwrite(blk, cur_addr, cur_buf,
>> +                                         cluster_size, 0);
>> +                    }
>> +                    if (ret < 0) {
>> +                        break;
>> +                    }
>> +                }
>>                  g_free(buf);
>>              }
>
> Sorry for asking this question so late, but, before it gets too late: did you
> evaluate the performance impact of this change under real world workload?
>
> Effectively, if no cluster is zero, this patch still splits a big write into
> small ones, which is the opposition of usual performance optimizations (i.e.
> trying to coalesce requests).

I tested this patch with qcow2; the migration speed is the same as before
applying this patch.

Do you know of any other format which has a very small cluster size?

>
> Fam
Stefan Hajnoczi April 14, 2017, 6:38 a.m. UTC | #5
On Fri, Apr 14, 2017 at 7:00 AM, Fam Zheng <famz@redhat.com> wrote:
> On Thu, 04/13 10:34, jemmy858585@gmail.com wrote:
>> From: Lidong Chen <lidongchen@tencent.com>
>>
>> BLOCK_SIZE is (1 << 20), qcow2 cluster size is 65536 by default,
>> this may cause the qcow2 file size to be bigger after migration.
>> This patch checks each cluster, using blk_pwrite_zeroes for each
>> zero cluster.
>>
>> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
>> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
>> ---
>> v6 changelog:
>> Fix up some grammar in the comment.
>> ---
>>  migration/block.c | 35 +++++++++++++++++++++++++++++++++--
>>  1 file changed, 33 insertions(+), 2 deletions(-)
>>
>> diff --git a/migration/block.c b/migration/block.c
>> index 7734ff7..41c7a55 100644
>> --- a/migration/block.c
>> +++ b/migration/block.c
>> @@ -885,6 +885,8 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>>      int64_t total_sectors = 0;
>>      int nr_sectors;
>>      int ret;
>> +    BlockDriverInfo bdi;
>> +    int cluster_size;
>>
>>      do {
>>          addr = qemu_get_be64(f);
>> @@ -919,6 +921,15 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>>                      error_report_err(local_err);
>>                      return -EINVAL;
>>                  }
>> +
>> +                ret = bdrv_get_info(blk_bs(blk), &bdi);
>> +                if (ret == 0 && bdi.cluster_size > 0 &&
>> +                    bdi.cluster_size <= BLOCK_SIZE &&
>> +                    BLOCK_SIZE % bdi.cluster_size == 0) {
>> +                    cluster_size = bdi.cluster_size;
>> +                } else {
>> +                    cluster_size = BLOCK_SIZE;
>> +                }
>>              }
>>
>>              if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
>> @@ -932,10 +943,30 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>>                                          nr_sectors * BDRV_SECTOR_SIZE,
>>                                          BDRV_REQ_MAY_UNMAP);
>>              } else {
>> +                int i;
>> +                int64_t cur_addr;
>> +                uint8_t *cur_buf;
>> +
>>                  buf = g_malloc(BLOCK_SIZE);
>>                  qemu_get_buffer(f, buf, BLOCK_SIZE);
>> -                ret = blk_pwrite(blk, addr * BDRV_SECTOR_SIZE, buf,
>> -                                 nr_sectors * BDRV_SECTOR_SIZE, 0);
>> +                for (i = 0; i < BLOCK_SIZE / cluster_size; i++) {
>> +                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
>> +                    cur_buf = buf + i * cluster_size;
>> +
>> +                    if ((!block_mig_state.zero_blocks ||
>> +                        cluster_size < BLOCK_SIZE) &&
>> +                        buffer_is_zero(cur_buf, cluster_size)) {
>> +                        ret = blk_pwrite_zeroes(blk, cur_addr,
>> +                                                cluster_size,
>> +                                                BDRV_REQ_MAY_UNMAP);
>> +                    } else {
>> +                        ret = blk_pwrite(blk, cur_addr, cur_buf,
>> +                                         cluster_size, 0);
>> +                    }
>> +                    if (ret < 0) {
>> +                        break;
>> +                    }
>> +                }
>>                  g_free(buf);
>>              }
>
> Sorry for asking this question so late, but, before it gets too late: did you
> evaluate the performance impact of this change under real world workload?
>
> Effectively, if no cluster is zero, this patch still splits a big write into
> small ones, which is the opposition of usual performance optimizations (i.e.
> trying to coalesce requests).

Good point!

Another patch can modify the loop to perform the largest writes
possible.  In other words, do not perform the write immediately and
keep a cluster counter instead.  When the zero/non-zero state changes,
perform the write for the accumulated cluster count.
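
For illustration, here is a minimal sketch of that accumulation idea, reusing
only the helpers the patch already calls (blk_pwrite, blk_pwrite_zeroes,
buffer_is_zero) and omitting the block_mig_state.zero_blocks special case for
brevity; the function name and factoring are made up for the sketch, this is
not the actual follow-up patch:

static int load_chunk_coalesced(BlockBackend *blk, uint8_t *buf,
                                int64_t addr, int cluster_size)
{
    int64_t run_start = addr * BDRV_SECTOR_SIZE; /* start of current run */
    uint8_t *run_buf = buf;                      /* data of current run */
    int64_t run_bytes = 0;                       /* bytes accumulated so far */
    bool run_zero = false;                       /* is the current run zero? */
    int i;
    int ret;

    for (i = 0; i < BLOCK_SIZE / cluster_size; i++) {
        uint8_t *cur_buf = buf + i * cluster_size;
        bool cur_zero = buffer_is_zero(cur_buf, cluster_size);

        if (run_bytes > 0 && cur_zero != run_zero) {
            /* Zero/non-zero state changed: flush the accumulated run. */
            ret = run_zero
                ? blk_pwrite_zeroes(blk, run_start, run_bytes,
                                    BDRV_REQ_MAY_UNMAP)
                : blk_pwrite(blk, run_start, run_buf, run_bytes, 0);
            if (ret < 0) {
                return ret;
            }
            run_start += run_bytes;
            run_buf = cur_buf;
            run_bytes = 0;
        }
        run_zero = cur_zero;
        run_bytes += cluster_size;
    }

    /* Flush the final run (there is always at least one cluster). */
    ret = run_zero
        ? blk_pwrite_zeroes(blk, run_start, run_bytes, BDRV_REQ_MAY_UNMAP)
        : blk_pwrite(blk, run_start, run_buf, run_bytes, 0);
    return ret < 0 ? ret : 0;
}

With this, a fully-zero 1 MB chunk still becomes a single blk_pwrite_zeroes()
call and a fully-allocated chunk a single blk_pwrite() call, regardless of the
cluster size.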

Stefan
858585 jemmy April 14, 2017, 6:48 a.m. UTC | #6
On Fri, Apr 14, 2017 at 2:38 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Fri, Apr 14, 2017 at 7:00 AM, Fam Zheng <famz@redhat.com> wrote:
>> On Thu, 04/13 10:34, jemmy858585@gmail.com wrote:
>>> From: Lidong Chen <lidongchen@tencent.com>
>>>
>>> BLOCK_SIZE is (1 << 20), qcow2 cluster size is 65536 by default,
>>> this may cause the qcow2 file size to be bigger after migration.
>>> This patch checks each cluster, using blk_pwrite_zeroes for each
>>> zero cluster.
>>>
>>> Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com>
>>> Signed-off-by: Lidong Chen <lidongchen@tencent.com>
>>> ---
>>> v6 changelog:
>>> Fix up some grammar in the comment.
>>> ---
>>>  migration/block.c | 35 +++++++++++++++++++++++++++++++++--
>>>  1 file changed, 33 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/migration/block.c b/migration/block.c
>>> index 7734ff7..41c7a55 100644
>>> --- a/migration/block.c
>>> +++ b/migration/block.c
>>> @@ -885,6 +885,8 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>>>      int64_t total_sectors = 0;
>>>      int nr_sectors;
>>>      int ret;
>>> +    BlockDriverInfo bdi;
>>> +    int cluster_size;
>>>
>>>      do {
>>>          addr = qemu_get_be64(f);
>>> @@ -919,6 +921,15 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>>>                      error_report_err(local_err);
>>>                      return -EINVAL;
>>>                  }
>>> +
>>> +                ret = bdrv_get_info(blk_bs(blk), &bdi);
>>> +                if (ret == 0 && bdi.cluster_size > 0 &&
>>> +                    bdi.cluster_size <= BLOCK_SIZE &&
>>> +                    BLOCK_SIZE % bdi.cluster_size == 0) {
>>> +                    cluster_size = bdi.cluster_size;
>>> +                } else {
>>> +                    cluster_size = BLOCK_SIZE;
>>> +                }
>>>              }
>>>
>>>              if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
>>> @@ -932,10 +943,30 @@ static int block_load(QEMUFile *f, void *opaque, int version_id)
>>>                                          nr_sectors * BDRV_SECTOR_SIZE,
>>>                                          BDRV_REQ_MAY_UNMAP);
>>>              } else {
>>> +                int i;
>>> +                int64_t cur_addr;
>>> +                uint8_t *cur_buf;
>>> +
>>>                  buf = g_malloc(BLOCK_SIZE);
>>>                  qemu_get_buffer(f, buf, BLOCK_SIZE);
>>> -                ret = blk_pwrite(blk, addr * BDRV_SECTOR_SIZE, buf,
>>> -                                 nr_sectors * BDRV_SECTOR_SIZE, 0);
>>> +                for (i = 0; i < BLOCK_SIZE / cluster_size; i++) {
>>> +                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
>>> +                    cur_buf = buf + i * cluster_size;
>>> +
>>> +                    if ((!block_mig_state.zero_blocks ||
>>> +                        cluster_size < BLOCK_SIZE) &&
>>> +                        buffer_is_zero(cur_buf, cluster_size)) {
>>> +                        ret = blk_pwrite_zeroes(blk, cur_addr,
>>> +                                                cluster_size,
>>> +                                                BDRV_REQ_MAY_UNMAP);
>>> +                    } else {
>>> +                        ret = blk_pwrite(blk, cur_addr, cur_buf,
>>> +                                         cluster_size, 0);
>>> +                    }
>>> +                    if (ret < 0) {
>>> +                        break;
>>> +                    }
>>> +                }
>>>                  g_free(buf);
>>>              }
>>
>> Sorry for asking this question so late, but, before it gets too late: did you
>> evaluate the performance impact of this change under real world workload?
>>
>> Effectively, if no cluster is zero, this patch still splits a big write into
>> small ones, which is the opposition of usual performance optimizations (i.e.
>> trying to coalesce requests).
>
> Good point!
>
> Another patch can modify the loop to perform the largest writes
> possible.  In other words, do not perform the write immediately and
> keep a cluster counter instead.  When the zero/non-zero state changes,
> perform the write for the accumulated cluster count.

If the zero/non-zero state changes very frequently, it will not work.

I also considered this approach before I submitted this patch,
but I found the performance is almost the same for qcow2, whose
cluster_size is 65536.

I worry about other formats which have a very small cluster_size, for
example 512, but I have not found any. Please tell me if you know of one,
and I will test it.

Do you think it is necessary, when the cluster_size is too small, to write
cluster_size * N bytes at a time instead?

Thanks.


>
> Stefan
Fam Zheng April 17, 2017, 3:47 a.m. UTC | #7
On Fri, 04/14 14:48, 858585 jemmy wrote:
> >> Effectively, if no cluster is zero, this patch still splits a big write into
> >> small ones, which is the opposition of usual performance optimizations (i.e.
> >> trying to coalesce requests).
> >
> > Good point!
> >
> > Another patch can modify the loop to perform the largest writes
> > possible.  In other words, do not perform the write immediately and
> > keep a cluster counter instead.  When the zero/non-zero state changes,
> > perform the write for the accumulated cluster count.
> 
> if the zero/non-zero state changes very frequently, it will not work.

It will work, it just won't improve anything in that case. I think this is a
worthwhile optimization to do.

Fam
Fam Zheng April 17, 2017, 3:49 a.m. UTC | #8
On Fri, 04/14 14:30, 858585 jemmy wrote:
> Do you know some other format which have very small cluster size?

64k is the default cluster size for qcow2, but it can be configured at image
creation time, for example to 512 bytes:

    $ qemu-img create -f qcow2 test.qcow2 -o cluster_size=512 1G

Fam
858585 jemmy April 17, 2017, 4 a.m. UTC | #9
On Mon, Apr 17, 2017 at 11:49 AM, Fam Zheng <famz@redhat.com> wrote:
> On Fri, 04/14 14:30, 858585 jemmy wrote:
>> Do you know some other format which have very small cluster size?
>
> 64k is the default cluster size for qcow2 but it can be configured at image
> creation time, as 512 bytes, for example:
>
>     $ qemu-img create -f qcow2 test.qcow2 -o cluster_size=512 1G

Thanks, I will test the performance again.
>
> Fam
858585 jemmy April 24, 2017, 7:40 a.m. UTC | #10
On Mon, Apr 17, 2017 at 12:00 PM, 858585 jemmy <jemmy858585@gmail.com> wrote:
> On Mon, Apr 17, 2017 at 11:49 AM, Fam Zheng <famz@redhat.com> wrote:
>> On Fri, 04/14 14:30, 858585 jemmy wrote:
>>> Do you know some other format which have very small cluster size?
>>
>> 64k is the default cluster size for qcow2 but it can be configured at image
>> creation time, as 512 bytes, for example:
>>
>>     $ qemu-img create -f qcow2 test.qcow2 -o cluster_size=512 1G
>
> Thanks, i will test the performance again.

I find the performance is reduced when the cluster size is 512.
I will optimize the performance and submit a patch later.
Thanks.

>>
>> Fam
858585 jemmy April 24, 2017, 11:54 a.m. UTC | #11
On Mon, Apr 24, 2017 at 3:40 PM, 858585 jemmy <jemmy858585@gmail.com> wrote:
> On Mon, Apr 17, 2017 at 12:00 PM, 858585 jemmy <jemmy858585@gmail.com> wrote:
>> On Mon, Apr 17, 2017 at 11:49 AM, Fam Zheng <famz@redhat.com> wrote:
>>> On Fri, 04/14 14:30, 858585 jemmy wrote:
>>>> Do you know some other format which have very small cluster size?
>>>
>>> 64k is the default cluster size for qcow2 but it can be configured at image
>>> creation time, as 512 bytes, for example:
>>>
>>>     $ qemu-img create -f qcow2 test.qcow2 -o cluster_size=512 1G
>>
>> Thanks, i will test the performance again.
>
> I find the performance reduce when cluster size is 512.
> I will optimize the performance and submit a patch later.
> Thanks.

After optimizing the code, I find the destination qemu process still has very
bad performance when cluster_size is 512. The cause is
qcow2_check_metadata_overlap.

If cluster_size is 512, the destination qemu process reaches 100% cpu usage,
and the perf top result is below:

Samples: 32K of event 'cycles', Event count (approx.): 20105269445
 91.68%  qemu-system-x86_64       [.] qcow2_check_metadata_overlap
  3.33%  qemu-system-x86_64       [.] range_get_last
  2.76%  qemu-system-x86_64       [.] ranges_overlap
  0.61%  qemu-system-x86_64       [.] qcow2_cache_do_get

The l1_size is very large:
(gdb) p s->l1_size
$3 = 1310720

(gdb) p s->max_refcount_table_index
$5 = 21905

The backtrace:

Breakpoint 1, qcow2_check_metadata_overlap (bs=0x16feb00, ign=0,
offset=440329728, size=4096) at block/qcow2-refcount.c:2344
2344    {
(gdb) bt
#0  qcow2_check_metadata_overlap (bs=0x16feb00, ign=0,
offset=440329728, size=4096) at block/qcow2-refcount.c:2344
#1  0x0000000000878d9f in qcow2_pre_write_overlap_check (bs=0x16feb00,
ign=0, offset=440329728, size=4096) at block/qcow2-refcount.c:2473
#2  0x000000000086e382 in qcow2_co_pwritev (bs=0x16feb00,
offset=771047424, bytes=704512, qiov=0x7fd026bfdb90, flags=0) at
block/qcow2.c:1653
#3  0x00000000008aeace in bdrv_driver_pwritev (bs=0x16feb00,
offset=770703360, bytes=1048576, qiov=0x7fd026bfdb90, flags=0) at
block/io.c:871
#4  0x00000000008b015c in bdrv_aligned_pwritev (child=0x171b630,
req=0x7fd026bfd980, offset=770703360, bytes=1048576, align=1,
qiov=0x7fd026bfdb90, flags=0) at block/io.c:1371
#5  0x00000000008b0d77 in bdrv_co_pwritev (child=0x171b630,
offset=770703360, bytes=1048576, qiov=0x7fd026bfdb90, flags=0) at
block/io.c:1622
#6  0x000000000089a76d in blk_co_pwritev (blk=0x16fe920,
offset=770703360, bytes=1048576, qiov=0x7fd026bfdb90, flags=0) at
block/block-backend.c:992
#7  0x000000000089a878 in blk_write_entry (opaque=0x7fd026bfdb70) at
block/block-backend.c:1017
#8  0x000000000089a95d in blk_prw (blk=0x16fe920, offset=770703360,
buf=0x362b050 "", bytes=1048576, co_entry=0x89a81a <blk_write_entry>,
flags=0) at block/block-backend.c:1045
#9  0x000000000089b222 in blk_pwrite (blk=0x16fe920, offset=770703360,
buf=0x362b050, count=1048576, flags=0) at block/block-backend.c:1208
#10 0x00000000007d480d in block_load (f=0x1784fa0, opaque=0xfd46a0,
version_id=1) at migration/block.c:992
#11 0x000000000049dc58 in vmstate_load (f=0x1784fa0, se=0x16fbdc0,
version_id=1) at /data/qemu/migration/savevm.c:730
#12 0x00000000004a0752 in qemu_loadvm_section_part_end (f=0x1784fa0,
mis=0xfd4160) at /data/qemu/migration/savevm.c:1923
#13 0x00000000004a0842 in qemu_loadvm_state_main (f=0x1784fa0,
mis=0xfd4160) at /data/qemu/migration/savevm.c:1954
#14 0x00000000004a0a33 in qemu_loadvm_state (f=0x1784fa0) at
/data/qemu/migration/savevm.c:2020
#15 0x00000000007c2d33 in process_incoming_migration_co
(opaque=0x1784fa0) at migration/migration.c:404
#16 0x0000000000966593 in coroutine_trampoline (i0=27108400, i1=0) at
util/coroutine-ucontext.c:79
#17 0x00007fd03946b8f0 in ?? () from /lib64/libc.so.6
#18 0x00007fff869c87e0 in ?? ()
#19 0x0000000000000000 in ?? ()

When the cluster_size is too small, the write performance is very bad.
How can this problem be solved? Any suggestions?
1. When the cluster_size is too small, do not invoke qcow2_pre_write_overlap_check.
2. Limit the qcow2 cluster_size range, and do not allow the cluster_size to be
set too small.
Which way is better?

>
>>>
>>> Fam
Fam Zheng April 24, 2017, 12:09 p.m. UTC | #12
On Mon, 04/24 19:54, 858585 jemmy wrote:
> On Mon, Apr 24, 2017 at 3:40 PM, 858585 jemmy <jemmy858585@gmail.com> wrote:
> > On Mon, Apr 17, 2017 at 12:00 PM, 858585 jemmy <jemmy858585@gmail.com> wrote:
> >> On Mon, Apr 17, 2017 at 11:49 AM, Fam Zheng <famz@redhat.com> wrote:
> >>> On Fri, 04/14 14:30, 858585 jemmy wrote:
> >>>> Do you know some other format which have very small cluster size?
> >>>
> >>> 64k is the default cluster size for qcow2 but it can be configured at image
> >>> creation time, as 512 bytes, for example:
> >>>
> >>>     $ qemu-img create -f qcow2 test.qcow2 -o cluster_size=512 1G
> >>
> >> Thanks, i will test the performance again.
> >
> > I find the performance reduce when cluster size is 512.
> > I will optimize the performance and submit a patch later.
> > Thanks.
> 
> after optimize the code, i find the destination qemu process still have very
> bad performance when cluster_size is 512. the reason is cause by
> qcow2_check_metadata_overlap.
> 
> if cluster_size is 512, the destination qemu process reach 100% cpu usage.
> and the perf top result is below:
> 
> Samples: 32K of event 'cycles', Event count (approx.): 20105269445
>  91.68%  qemu-system-x86_64       [.] qcow2_check_metadata_overlap
>   3.33%  qemu-system-x86_64       [.] range_get_last
>   2.76%  qemu-system-x86_64       [.] ranges_overlap
>   0.61%  qemu-system-x86_64       [.] qcow2_cache_do_get
> 
> very large l1_size.
> (gdb) p s->l1_size
> $3 = 1310720
> 
> (gdb) p s->max_refcount_table_index
> $5 = 21905
> 
> the backtrace:
> 
> Breakpoint 1, qcow2_check_metadata_overlap (bs=0x16feb00, ign=0,
> offset=440329728, size=4096) at block/qcow2-refcount.c:2344
> 2344    {
> (gdb) bt
> #0  qcow2_check_metadata_overlap (bs=0x16feb00, ign=0,
> offset=440329728, size=4096) at block/qcow2-refcount.c:2344
> #1  0x0000000000878d9f in qcow2_pre_write_overlap_check (bs=0x16feb00,
> ign=0, offset=440329728, size=4096) at block/qcow2-refcount.c:2473
> #2  0x000000000086e382 in qcow2_co_pwritev (bs=0x16feb00,
> offset=771047424, bytes=704512, qiov=0x7fd026bfdb90, flags=0) at
> block/qcow2.c:1653
> #3  0x00000000008aeace in bdrv_driver_pwritev (bs=0x16feb00,
> offset=770703360, bytes=1048576, qiov=0x7fd026bfdb90, flags=0) at
> block/io.c:871
> #4  0x00000000008b015c in bdrv_aligned_pwritev (child=0x171b630,
> req=0x7fd026bfd980, offset=770703360, bytes=1048576, align=1,
> qiov=0x7fd026bfdb90, flags=0) at block/io.c:1371
> #5  0x00000000008b0d77 in bdrv_co_pwritev (child=0x171b630,
> offset=770703360, bytes=1048576, qiov=0x7fd026bfdb90, flags=0) at
> block/io.c:1622
> #6  0x000000000089a76d in blk_co_pwritev (blk=0x16fe920,
> offset=770703360, bytes=1048576, qiov=0x7fd026bfdb90, flags=0) at
> block/block-backend.c:992
> #7  0x000000000089a878 in blk_write_entry (opaque=0x7fd026bfdb70) at
> block/block-backend.c:1017
> #8  0x000000000089a95d in blk_prw (blk=0x16fe920, offset=770703360,
> buf=0x362b050 "", bytes=1048576, co_entry=0x89a81a <blk_write_entry>,
> flags=0) at block/block-backend.c:1045
> #9  0x000000000089b222 in blk_pwrite (blk=0x16fe920, offset=770703360,
> buf=0x362b050, count=1048576, flags=0) at block/block-backend.c:1208
> #10 0x00000000007d480d in block_load (f=0x1784fa0, opaque=0xfd46a0,
> version_id=1) at migration/block.c:992
> #11 0x000000000049dc58 in vmstate_load (f=0x1784fa0, se=0x16fbdc0,
> version_id=1) at /data/qemu/migration/savevm.c:730
> #12 0x00000000004a0752 in qemu_loadvm_section_part_end (f=0x1784fa0,
> mis=0xfd4160) at /data/qemu/migration/savevm.c:1923
> #13 0x00000000004a0842 in qemu_loadvm_state_main (f=0x1784fa0,
> mis=0xfd4160) at /data/qemu/migration/savevm.c:1954
> #14 0x00000000004a0a33 in qemu_loadvm_state (f=0x1784fa0) at
> /data/qemu/migration/savevm.c:2020
> #15 0x00000000007c2d33 in process_incoming_migration_co
> (opaque=0x1784fa0) at migration/migration.c:404
> #16 0x0000000000966593 in coroutine_trampoline (i0=27108400, i1=0) at
> util/coroutine-ucontext.c:79
> #17 0x00007fd03946b8f0 in ?? () from /lib64/libc.so.6
> #18 0x00007fff869c87e0 in ?? ()
> #19 0x0000000000000000 in ?? ()
> 
> when the cluster_size is too small, the write performance is very bad.
> How to solve this problem? Any suggestion?
> 1. when the cluster_size is too small, not invoke qcow2_pre_write_overlap_check.
> 2.limit the qcow2 cluster_size range, don't allow set the cluster_size
> too small.
> which way is better?

It's a separate problem.

I think what should be done in this patch (or a follow-up) is coalescing the
same type of write as much as possible (by type I mean "zeroed" or "normal"
write).  With that, the cluster size won't matter that much.

Fam
858585 jemmy April 24, 2017, 12:15 p.m. UTC | #13
On Mon, Apr 24, 2017 at 8:09 PM, Fam Zheng <famz@redhat.com> wrote:
> On Mon, 04/24 19:54, 858585 jemmy wrote:
>> On Mon, Apr 24, 2017 at 3:40 PM, 858585 jemmy <jemmy858585@gmail.com> wrote:
>> > On Mon, Apr 17, 2017 at 12:00 PM, 858585 jemmy <jemmy858585@gmail.com> wrote:
>> >> On Mon, Apr 17, 2017 at 11:49 AM, Fam Zheng <famz@redhat.com> wrote:
>> >>> On Fri, 04/14 14:30, 858585 jemmy wrote:
>> >>>> Do you know some other format which have very small cluster size?
>> >>>
>> >>> 64k is the default cluster size for qcow2 but it can be configured at image
>> >>> creation time, as 512 bytes, for example:
>> >>>
>> >>>     $ qemu-img create -f qcow2 test.qcow2 -o cluster_size=512 1G
>> >>
>> >> Thanks, i will test the performance again.
>> >
>> > I find the performance reduce when cluster size is 512.
>> > I will optimize the performance and submit a patch later.
>> > Thanks.
>>
>> after optimize the code, i find the destination qemu process still have very
>> bad performance when cluster_size is 512. the reason is cause by
>> qcow2_check_metadata_overlap.
>>
>> if cluster_size is 512, the destination qemu process reach 100% cpu usage.
>> and the perf top result is below:
>>
>> Samples: 32K of event 'cycles', Event count (approx.): 20105269445
>>  91.68%  qemu-system-x86_64       [.] qcow2_check_metadata_overlap
>>   3.33%  qemu-system-x86_64       [.] range_get_last
>>   2.76%  qemu-system-x86_64       [.] ranges_overlap
>>   0.61%  qemu-system-x86_64       [.] qcow2_cache_do_get
>>
>> very large l1_size.
>> (gdb) p s->l1_size
>> $3 = 1310720
>>
>> (gdb) p s->max_refcount_table_index
>> $5 = 21905
>>
>> the backtrace:
>>
>> Breakpoint 1, qcow2_check_metadata_overlap (bs=0x16feb00, ign=0,
>> offset=440329728, size=4096) at block/qcow2-refcount.c:2344
>> 2344    {
>> (gdb) bt
>> #0  qcow2_check_metadata_overlap (bs=0x16feb00, ign=0,
>> offset=440329728, size=4096) at block/qcow2-refcount.c:2344
>> #1  0x0000000000878d9f in qcow2_pre_write_overlap_check (bs=0x16feb00,
>> ign=0, offset=440329728, size=4096) at block/qcow2-refcount.c:2473
>> #2  0x000000000086e382 in qcow2_co_pwritev (bs=0x16feb00,
>> offset=771047424, bytes=704512, qiov=0x7fd026bfdb90, flags=0) at
>> block/qcow2.c:1653
>> #3  0x00000000008aeace in bdrv_driver_pwritev (bs=0x16feb00,
>> offset=770703360, bytes=1048576, qiov=0x7fd026bfdb90, flags=0) at
>> block/io.c:871
>> #4  0x00000000008b015c in bdrv_aligned_pwritev (child=0x171b630,
>> req=0x7fd026bfd980, offset=770703360, bytes=1048576, align=1,
>> qiov=0x7fd026bfdb90, flags=0) at block/io.c:1371
>> #5  0x00000000008b0d77 in bdrv_co_pwritev (child=0x171b630,
>> offset=770703360, bytes=1048576, qiov=0x7fd026bfdb90, flags=0) at
>> block/io.c:1622
>> #6  0x000000000089a76d in blk_co_pwritev (blk=0x16fe920,
>> offset=770703360, bytes=1048576, qiov=0x7fd026bfdb90, flags=0) at
>> block/block-backend.c:992
>> #7  0x000000000089a878 in blk_write_entry (opaque=0x7fd026bfdb70) at
>> block/block-backend.c:1017
>> #8  0x000000000089a95d in blk_prw (blk=0x16fe920, offset=770703360,
>> buf=0x362b050 "", bytes=1048576, co_entry=0x89a81a <blk_write_entry>,
>> flags=0) at block/block-backend.c:1045
>> #9  0x000000000089b222 in blk_pwrite (blk=0x16fe920, offset=770703360,
>> buf=0x362b050, count=1048576, flags=0) at block/block-backend.c:1208
>> #10 0x00000000007d480d in block_load (f=0x1784fa0, opaque=0xfd46a0,
>> version_id=1) at migration/block.c:992
>> #11 0x000000000049dc58 in vmstate_load (f=0x1784fa0, se=0x16fbdc0,
>> version_id=1) at /data/qemu/migration/savevm.c:730
>> #12 0x00000000004a0752 in qemu_loadvm_section_part_end (f=0x1784fa0,
>> mis=0xfd4160) at /data/qemu/migration/savevm.c:1923
>> #13 0x00000000004a0842 in qemu_loadvm_state_main (f=0x1784fa0,
>> mis=0xfd4160) at /data/qemu/migration/savevm.c:1954
>> #14 0x00000000004a0a33 in qemu_loadvm_state (f=0x1784fa0) at
>> /data/qemu/migration/savevm.c:2020
>> #15 0x00000000007c2d33 in process_incoming_migration_co
>> (opaque=0x1784fa0) at migration/migration.c:404
>> #16 0x0000000000966593 in coroutine_trampoline (i0=27108400, i1=0) at
>> util/coroutine-ucontext.c:79
>> #17 0x00007fd03946b8f0 in ?? () from /lib64/libc.so.6
>> #18 0x00007fff869c87e0 in ?? ()
>> #19 0x0000000000000000 in ?? ()
>>
>> when the cluster_size is too small, the write performance is very bad.
>> How to solve this problem? Any suggestion?
>> 1. when the cluster_size is too small, not invoke qcow2_pre_write_overlap_check.
>> 2.limit the qcow2 cluster_size range, don't allow set the cluster_size
>> too small.
>> which way is better?
>
> It's a separate problem.
>
> I think what should be done in this patch (or a follow up) is coalescing the
> same type of write as much as possible (by type I mean "zeroed" or "normal"
> write).  With that, cluster size won't matter that much.
Yes, I have already optimized the code this way. I will send the patch later,
but the performance is still bad. It's a separate problem:
qcow2_check_metadata_overlap uses a lot of cpu.

After I optimized the code, blk_pwrite already coalesces writes of the same
type, as you can see in the backtrace:
#9  0x000000000089b222 in blk_pwrite (blk=0x16fe920, offset=770703360,
 buf=0x362b050, count=1048576, flags=0)

>
> Fam
Fam Zheng April 24, 2017, 12:19 p.m. UTC | #14
On Mon, 04/24 20:09, Fam Zheng wrote:
> It's a separate problem.

To be specific:

1) there is an option "overlap-check" that one can use to
disable the costly metadata check (see the example below);

2) qcow2 with cluster_size = 512 is probably too uncommon to be optimized for.
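
For reference, the "overlap-check" option mentioned in 1) can be set per drive
on the command line with something like the following (syntax quoted from
memory, so double-check it against the QEMU version in use):

    $ qemu-system-x86_64 ... -drive file=test.qcow2,format=qcow2,overlap-check=none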

Both are irrelevant to why and how this patch can be improved, IMO.

Fam
858585 jemmy April 24, 2017, 12:26 p.m. UTC | #15
On Mon, Apr 24, 2017 at 8:19 PM, Fam Zheng <famz@redhat.com> wrote:
> On Mon, 04/24 20:09, Fam Zheng wrote:
>> It's a separate problem.
>
> To be specific:
>
> 1) there is an option "overlap-check" that one can use to
> disable the costly metadata check;
Yes, I will disable the metadata check and test the performance again.

>
> 2) qcow2 with cluster_size = 512 is probably too uncommon to be optimized for.
If cluster_size is very small, should the metadata check be disabled by default?

>
> Both are irrelevant to why and how this patch can be improved, IMO.
>
> Fam
Fam Zheng April 24, 2017, 12:36 p.m. UTC | #16
On Mon, 04/24 20:26, 858585 jemmy wrote:
> > 2) qcow2 with cluster_size = 512 is probably too uncommon to be optimized for.
> if culster_size is very small, should disable metadata check default?
> 

No, I don't think it's worth the inconsistent behavior. People who want
performance shouldn't use 512 bytes anyway.

Fam
858585 jemmy April 24, 2017, 12:44 p.m. UTC | #17
On Mon, Apr 24, 2017 at 8:36 PM, Fam Zheng <famz@redhat.com> wrote:
> On Mon, 04/24 20:26, 858585 jemmy wrote:
>> > 2) qcow2 with cluster_size = 512 is probably too uncommon to be optimized for.
>> if culster_size is very small, should disable metadata check default?
>>
>
> No, I don't think it's worth the inconsistent behavior. People who want
> performance shouldn't use 512 bytes anyway.
OK, thanks.

>
> Fam

Patch

diff --git a/migration/block.c b/migration/block.c
index 7734ff7..41c7a55 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -885,6 +885,8 @@  static int block_load(QEMUFile *f, void *opaque, int version_id)
     int64_t total_sectors = 0;
     int nr_sectors;
     int ret;
+    BlockDriverInfo bdi;
+    int cluster_size;
 
     do {
         addr = qemu_get_be64(f);
@@ -919,6 +921,15 @@  static int block_load(QEMUFile *f, void *opaque, int version_id)
                     error_report_err(local_err);
                     return -EINVAL;
                 }
+
+                ret = bdrv_get_info(blk_bs(blk), &bdi);
+                if (ret == 0 && bdi.cluster_size > 0 &&
+                    bdi.cluster_size <= BLOCK_SIZE &&
+                    BLOCK_SIZE % bdi.cluster_size == 0) {
+                    cluster_size = bdi.cluster_size;
+                } else {
+                    cluster_size = BLOCK_SIZE;
+                }
             }
 
             if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) {
@@ -932,10 +943,30 @@  static int block_load(QEMUFile *f, void *opaque, int version_id)
                                         nr_sectors * BDRV_SECTOR_SIZE,
                                         BDRV_REQ_MAY_UNMAP);
             } else {
+                int i;
+                int64_t cur_addr;
+                uint8_t *cur_buf;
+
                 buf = g_malloc(BLOCK_SIZE);
                 qemu_get_buffer(f, buf, BLOCK_SIZE);
-                ret = blk_pwrite(blk, addr * BDRV_SECTOR_SIZE, buf,
-                                 nr_sectors * BDRV_SECTOR_SIZE, 0);
+                for (i = 0; i < BLOCK_SIZE / cluster_size; i++) {
+                    cur_addr = addr * BDRV_SECTOR_SIZE + i * cluster_size;
+                    cur_buf = buf + i * cluster_size;
+
+                    if ((!block_mig_state.zero_blocks ||
+                        cluster_size < BLOCK_SIZE) &&
+                        buffer_is_zero(cur_buf, cluster_size)) {
+                        ret = blk_pwrite_zeroes(blk, cur_addr,
+                                                cluster_size,
+                                                BDRV_REQ_MAY_UNMAP);
+                    } else {
+                        ret = blk_pwrite(blk, cur_addr, cur_buf,
+                                         cluster_size, 0);
+                    }
+                    if (ret < 0) {
+                        break;
+                    }
+                }
                 g_free(buf);
             }