[PATCHv2,07/11] iscsi: let bdrv_create conditionally zero out the device

Message ID 1372338695-411-8-git-send-email-pl@kamp.de
State New

Commit Message

Peter Lieven June 27, 2013, 1:11 p.m. UTC
If the device supports unmapping and unmapped blocks read as
zero, ensure that the whole device is unmapped and report
.has_zero_init = 1 in this case to speed up qemu-img convert.

Signed-off-by: Peter Lieven <pl@kamp.de>
---
 block/iscsi.c |   72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

Comments

Stefan Hajnoczi July 1, 2013, 1:58 p.m. UTC | #1
On Thu, Jun 27, 2013 at 03:11:31PM +0200, Peter Lieven wrote:
> @@ -1481,7 +1483,75 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options)
>      }
>  
>      ret = 0;
> +
> +    if (iscsilun->lbpu && iscsilun->lbprz) {

Moving this block of code into a function would be nice.
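
[For illustration, a minimal sketch of such a helper; the function name
iscsi_unmap_device() is hypothetical and the GET LBA STATUS skipping from
the patch is omitted for brevity:]

    /* Sketch, not code from the patch: unmap a whole LUN in chunks of
     * at most max_unmap blocks per UNMAP request. */
    static int iscsi_unmap_device(IscsiLun *iscsilun)
    {
        uint64_t lba = 0;

        while (lba < iscsilun->num_blocks) {
            struct unmap_list list = {
                .lba = lba,
                .num = MIN(iscsilun->max_unmap,
                           iscsilun->num_blocks - lba),
            };
            struct scsi_task *task = iscsi_unmap_sync(iscsilun->iscsi,
                                                      iscsilun->lun,
                                                      0, 0, &list, 1);
            if (task == NULL || task->status != SCSI_STATUS_GOOD) {
                if (task != NULL) {
                    scsi_free_scsi_task(task);
                }
                return -EINVAL;
            }
            scsi_free_scsi_task(task);
            lba += list.num;
        }
        return 0;
    }

[iscsi_create() would then boil down to ret = iscsi_unmap_device(iscsilun);
under the lbpu && lbprz guard.]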
Paolo Bonzini July 1, 2013, 8:20 p.m. UTC | #2
On 27/06/2013 15:11, Peter Lieven wrote:
> If the device supports unmapping and unmapped blocks read as
> zero, ensure that the whole device is unmapped and report
> .has_zero_init = 1 in this case to speed up qemu-img convert.

This can still take several minutes.  Do you need any special timeout in
libiscsi?

Perhaps we can have a new "discard_zeroes" field in bdrv_get_info, and
the unmap functionality can be moved up to qemu-img convert?
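
[To make the suggestion concrete, a hypothetical sketch assuming QEMU's
block layer headers; the discard_zeroes and max_discard fields do not
exist in the BlockDriverInfo of this QEMU version:]

    /* Sketch only: hypothetical BlockDriverInfo extension. */
    typedef struct BlockDriverInfo {
        int cluster_size;
        int64_t vm_state_offset;
        bool discard_zeroes;   /* discarded blocks read back as zero */
        int max_discard;       /* largest discard request, in sectors */
    } BlockDriverInfo;

    /* qemu-img convert could then check the target up front: */
    static bool target_discard_zeroes(BlockDriverState *out_bs)
    {
        BlockDriverInfo bdi;

        return bdrv_get_info(out_bs, &bdi) >= 0 && bdi.discard_zeroes;
    }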

Paolo

> 
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  block/iscsi.c |   72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 71 insertions(+), 1 deletion(-)
> 
> diff --git a/block/iscsi.c b/block/iscsi.c
> index 92e66a6..621ca40 100644
> --- a/block/iscsi.c
> +++ b/block/iscsi.c
> @@ -1436,7 +1436,8 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
>  
>  static int iscsi_has_zero_init(BlockDriverState *bs)
>  {
> -    return 0;
> +    IscsiLun *iscsilun = bs->opaque;
> +    return (iscsilun->lbpu && iscsilun->lbprz) ? 1 : 0;
>  }
>  
>  static int iscsi_create(const char *filename, QEMUOptionParameter *options)
> @@ -1446,6 +1447,7 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options)
>      BlockDriverState bs;
>      IscsiLun *iscsilun = NULL;
>      QDict *bs_options;
> +    struct scsi_task *task = NULL;
>  
>      memset(&bs, 0, sizeof(BlockDriverState));
>  
> @@ -1481,7 +1483,75 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options)
>      }
>  
>      ret = 0;
> +
> +    if (iscsilun->lbpu && iscsilun->lbprz) {
> +        uint64_t lba = 0;
> +        while (lba < iscsilun->num_blocks) {
> +            struct scsi_get_lba_status *lbas = NULL;
> +            struct scsi_lba_status_descriptor *lbasd = NULL;
> +            enum scsi_provisioning_type provisioning;
> +            uint32_t nb_sectors;
> +
> +            task = iscsi_get_lba_status_sync(iscsilun->iscsi, iscsilun->lun,
> +                                             lba, 8 + 16);
> +            if (task == NULL || task->status != SCSI_STATUS_GOOD) {
> +                error_report("iSCSI: Failed to get_lba_status on iSCSI lun. %s",
> +                             iscsi_get_error(iscsilun->iscsi));
> +                ret = -EINVAL;
> +                goto out;
> +            }
> +
> +            lbas = scsi_datain_unmarshall(task);
> +            if (lbas == NULL) {
> +                error_report("iSCSI: failed to unmarshall inquiry datain blob");
> +                ret = -EINVAL;
> +                goto out;
> +            }
> +            lbasd = &lbas->descriptors[0];
> +            if (lbasd->lba != lba) {
> +                ret = -EINVAL;
> +                goto out;
> +            }
> +            nb_sectors = lbasd->num_blocks;
> +            provisioning = lbasd->provisioning;
> +            scsi_free_scsi_task(task);
> +            task = NULL;
> +
> +            /* blocks from lba to lba + nb_sectors - 1 are not mapped
> +             * and read as zero (lbprz==1) so we can skip them */
> +            if (provisioning != SCSI_PROVISIONING_TYPE_MAPPED) {
> +                lba += nb_sectors;
> +                continue;
> +            }
> +
> +            uint64_t lba2 = lba + nb_sectors;
> +            while (lba < lba2) {
> +                struct unmap_list list[1];
> +                list[0].lba = lba;
> +                list[0].num = iscsilun->max_unmap;
> +                if (lba + list[0].num > iscsilun->num_blocks) {
> +                    list[0].num = iscsilun->num_blocks - lba;
> +                }
> +                task = iscsi_unmap_sync(iscsilun->iscsi,
> +                                        iscsilun->lun,
> +                                        0, 0, &list[0], 1);
> +                if (task == NULL || task->status != SCSI_STATUS_GOOD) {
> +                    error_report("iSCSI: Failed to unmap data on iSCSI lun. %s",
> +                                 iscsi_get_error(iscsilun->iscsi));
> +                    ret = -EINVAL;
> +                    goto out;
> +                }
> +                scsi_free_scsi_task(task);
> +                task = NULL;
> +                lba += list[0].num;
> +            }
> +        }
> +    }
> +
>  out:
> +    if (task) {
> +        scsi_free_scsi_task(task);
> +    }
>      if (iscsilun->iscsi != NULL) {
>          iscsi_destroy_context(iscsilun->iscsi);
>      }
>
Peter Lieven July 1, 2013, 9:36 p.m. UTC | #3
On 01.07.2013 at 22:20, Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 27/06/2013 15:11, Peter Lieven wrote:
>> If the device supports unmapping and unmapped blocks read as
>> zero, ensure that the whole device is unmapped and report
>> .has_zero_init = 1 in this case to speed up qemu-img convert.
> 
> This can still take several minutes.  Do you need any special timeout in
> libiscsi?

Not as far as I know. The number of unmapped sectors is bounded by iscsilun->max_unmap.
This is what the storage will handle in one request. For my storage it's the internal
page size (15 MB). It's very fast. Except for some broken storages, I expect that unmapping
on a thin-provisioned target means deleting pointers or something similar, not actually
writing anything to the disks. That is what the SANITIZE command does.

Do you have evidence that it is really taking minutes somewhere?

I tested it with a fully allocated 1 TB volume. In my case it was a matter of about 20 seconds.
I think this was mainly due to the thousands of sync requests that had to be sent.

> 
> Perhaps we can have a new "discard_zeroes" field in bdrv_get_info, and
> the unmap functionality can be moved up to qemu-img convert?

Is there any other storage protocol out there that could benefit from it?

Peter
Paolo Bonzini July 2, 2013, 9:22 a.m. UTC | #4
On 01/07/2013 23:36, Peter Lieven wrote:
> 
> On 01.07.2013 at 22:20, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
>> On 27/06/2013 15:11, Peter Lieven wrote:
>>> If the device supports unmapping and unmapped blocks read as
>>> zero, ensure that the whole device is unmapped and report
>>> .has_zero_init = 1 in this case to speed up qemu-img convert.
>>
>> This can still take several minutes.  Do you need any special timeout in
>> libiscsi?
> 
> Not as far as I know. The number of unmapped sectors is bounded by iscsilun->max_unmap.
> This is what the storage will handle in one request. For my storage it's the internal
> page size (15 MB). It's very fast. Except for some broken storages, I expect that unmapping
> on a thin-provisioned target means deleting pointers or something similar, not actually
> writing anything to the disks. That is what the SANITIZE command does.

Ok, I remember someone reporting timeouts when doing a discard with
virtio-scsi.  But maybe the problem there was that Linux parallelized
the requests excessively after splitting them.

> Do you have evidence that it is really taking minutes somewhere?
> 
> I tested it with a fully allocated 1 TB volume. In my case it was a matter of about 20 seconds.
> I think this was mainly due to the thousands of sync requests that had to be sent.
> 
>> Perhaps we can have a new "discard_zeroes" field in bdrv_get_info, and
>> the unmap functionality can be moved up to qemu-img convert?
> 
> Is there any other storage protocol out there that could benefit from it?

Definitely LVM.  Perhaps in the future gluster too, though right now it
only supports discard on files, not block devices.

Paolo
Peter Lieven July 2, 2013, 10:36 a.m. UTC | #5
On 02.07.2013 at 11:22, Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 01/07/2013 23:36, Peter Lieven wrote:
>> 
>> On 01.07.2013 at 22:20, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> 
>>> On 27/06/2013 15:11, Peter Lieven wrote:
>>>> If the device supports unmapping and unmapped blocks read as
>>>> zero, ensure that the whole device is unmapped and report
>>>> .has_zero_init = 1 in this case to speed up qemu-img convert.
>>> 
>>> This can still take several minutes.  Do you need any special timeout in
>>> libiscsi?
>> 
>> Not as far as I know. The number of unmapped sectors is bounded by iscsilun->max_unmap.
>> This is what the storage will handle in one request. For my storage it's the internal
>> page size (15 MB). It's very fast. Except for some broken storages, I expect that unmapping
>> on a thin-provisioned target means deleting pointers or something similar, not actually
>> writing anything to the disks. That is what the SANITIZE command does.
> 
> Ok, I remember someone reporting timeouts when doing a discard with
> virtio-scsi.  But maybe the problem there was that Linux parallelized
> the requests excessively after splitting them.
> 
>> Do you have evidence that it is really taking minutes somewhere?
>> 
>> I tested it with a fully allocated 1 TB volume. In my case it was a matter of about 20 seconds.
>> I think this was mainly due to the thousands of sync requests that had to be sent.
>> 
>>> Perhaps we can have a new "discard_zeroes" field in bdrv_get_info, and
>>> the unmap functionality can be moved up to qemu-img convert?
>> 
>> Is there any other storage protocol out there that could benefit from it?
> 
> Definitely LVM.  Perhaps in the future gluster too, though right now it
> only supports discard on files, not block devices.

Is discard on LVM something that is already implemented in QEMU?

Would you mind if we postponed the general approach to a later point?
It seems that this is much more complex than the iSCSI approach.
My intention was to fix the iSCSI thin-provisioning problem here.

Peter
Paolo Bonzini July 2, 2013, 10:49 a.m. UTC | #6
On 02/07/2013 12:36, Peter Lieven wrote:
>>>> Perhaps we can have a new "discard_zeroes" field in bdrv_get_info, and
>>>> the unmap functionality can be moved up to qemu-img convert?
>>>
>>> Is there any other storage protocol out there that could benefit from it?
>>
>> Definitely LVM.  Perhaps in the future gluster too, though right now it
>> only supports discard on files, not block devices.
> 
> Is discard on LVM something that is already implemented in QEMU?

Yes, it supports BLKDISCARD (see handle_aiocb_discard in
block/raw-posix.c).  Of course there is no way to query the host
discard_zeroes setting yet.
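
[Simplified sketch of what that path boils down to for a host block
device; the real handle_aiocb_discard covers more cases:]

    #include <errno.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>              /* BLKDISCARD */

    /* Discard a byte range on a block device via the BLKDISCARD ioctl. */
    static int blk_discard(int fd, uint64_t offset, uint64_t length)
    {
        uint64_t range[2] = { offset, length };   /* start/len in bytes */

        if (ioctl(fd, BLKDISCARD, range) < 0) {
            return -errno;
        }
        return 0;
    }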

But even if it weren't implemented in QEMU, you should aim at making it
easier (if it's not too much work, which it isn't), not harder.  If you
do it in block/iscsi.c, the next person who comes will have to basically
undo your work and reimplement+retest it with the right API.

> Would you mind if we postponed the general approach to a later point?
> It seems that this is much more complex than the iSCSI approach.

It shouldn't be more complex at all, actually.  You just need to pass
the maximum unmap sectors and lbprz parameters through bdrv_get_info.
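
[Illustration only: with the hypothetical BlockDriverInfo fields sketched
earlier in the thread, the iSCSI side could be a small .bdrv_get_info
callback like this:]

    static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    {
        IscsiLun *iscsilun = bs->opaque;

        /* unmapped blocks read back as zero only if lbpu && lbprz */
        bdi->discard_zeroes = iscsilun->lbpu && iscsilun->lbprz;
        bdi->max_discard    = iscsilun->max_unmap;   /* in blocks */
        return 0;
    }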

I'm not asking you to add support for BLKDISCARDZEROES and all that.
I'm asking you to do the work at the right level.

Paolo
Peter Lieven July 2, 2013, 10:56 a.m. UTC | #7
On 02.07.2013 at 12:49, Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 02/07/2013 12:36, Peter Lieven wrote:
>>>>> Perhaps we can have a new "discard_zeroes" field in bdrv_get_info, and
>>>>> the unmap functionality can be moved up to qemu-img convert?
>>>> 
>>>> Is there any other storage protocol out there that could benefit from it?
>>> 
>>> Definitely LVM.  Perhaps in the future gluster too, though right now it
>>> only supports discard on files, not block devices.
>> 
>> Is discard on LVM something that is already implemented in QEMU?
> 
> Yes, it supports BLKDISCARD (see handle_aiocb_discard in
> block/raw-posix.c).  Of course there is no way to query the host
> discard_zeroes setting yet.

No way in QEMU or no way at all?

> 
> But even if it weren't implemented in QEMU, you should aim at making it
> easier (if it's not too much work, which it isn't), not harder.  If you
> do it in block/iscsi.c, the next person who comes will have to basically
> undo your work and reimplement+retest it with the right API.
> 
>> Would you mind if we postponed the general approach to a later point?
>> It seems that this is much more complex than the iSCSI approach.
> 
> It shouldn't be more complex at all, actually.  You just need to pass
> the maximum unmap sectors and lbprz parameters through bdrv_get_info.
> 
> I'm not asking you to add support for BLKDISCARDZEROES and all that.
> I'm asking you to do the work at the right level.

You are right: if it's possible for other protocols as well, it should be done at a higher level.
I will at least look into integrating it into bdrv_get_info for iSCSI and implementing the
unmapping logic in qemu-img for the case that has_zero_init is 0.

Peter
Paolo Bonzini July 2, 2013, 11:04 a.m. UTC | #8
On 02/07/2013 12:56, Peter Lieven wrote:
> 
> On 02.07.2013 at 12:49, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
>> On 02/07/2013 12:36, Peter Lieven wrote:
>>>>>> Perhaps we can have a new "discard_zeroes" field in bdrv_get_info, and
>>>>>> the unmap functionality can be moved up to qemu-img convert?
>>>>>
>>>>> Is there any other storage protocol out there that could benefit from it?
>>>>
>>>> Definitely LVM.  Perhaps in the future gluster too, though right now it
>>>> only supports discard on files, not block devices.
>>>
>>> Is discard on LVM something that is already implemented in QEMU?
>>
>> Yes, it supports BLKDISCARD (see handle_aiocb_discard in
>> block/raw-posix.c).  Of course there is no way to query the host
>> discard_zeroes setting yet.
> 
> No way in QEMU or no way at all?

No way in QEMU.  Linux has the BLKDISCARDZEROES ioctl, it takes an int*
as the third argument and puts 0/1 in it.
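
[A sketch of the query; the kernel fills in 0 or 1 as described above:]

    #include <sys/ioctl.h>
    #include <linux/fs.h>              /* BLKDISCARDZEROES */

    /* Returns nonzero if discarded blocks on fd read back as zeroes. */
    static int host_discard_zeroes(int fd)
    {
        unsigned int val = 0;

        if (ioctl(fd, BLKDISCARDZEROES, &val) < 0) {
            return 0;   /* unknown: assume discard does not zero */
        }
        return val != 0;
    }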

Paolo
Peter Lieven July 2, 2013, 11:18 a.m. UTC | #9
On 02.07.2013 at 13:04, Paolo Bonzini <pbonzini@redhat.com> wrote:

> On 02/07/2013 12:56, Peter Lieven wrote:
>> 
>> On 02.07.2013 at 12:49, Paolo Bonzini <pbonzini@redhat.com> wrote:
>> 
>>> On 02/07/2013 12:36, Peter Lieven wrote:
>>>>>>> Perhaps we can have a new "discard_zeroes" field in bdrv_get_info, and
>>>>>>> the unmap functionality can be moved up to qemu-img convert?
>>>>>> 
>>>>>> Is there any other storage protocol out there that could benefit from it?
>>>>> 
>>>>> Definitely LVM.  Perhaps in the future gluster too, though right now it
>>>>> only supports discard on files, not block devices.
>>>> 
>>>> Is discard on LVM something that is already implemented in QEMU?
>>> 
>>> Yes, it supports BLKDISCARD (see handle_aiocb_discard in
>>> block/raw-posix.c).  Of course there is no way to query the host
>>> discard_zeroes setting yet.
>> 
>> No way in QEMU or no way at all?
> 
> No way in QEMU.  Linux has the BLKDISCARDZEROES ioctl, it takes an int*
> as the third argument and puts 0/1 in it.

Okay, thanks for the pointer.

If you all don't mind, I would split up my series and first send a v3 which adds the
unproblematic stuff and leaves

iscsi: let bdrv_create conditionally zero out the device
block-migration: efficiently encode zero blocks

out. Both seem to need more time and cycles to follow the better approaches.

Peter
Kevin Wolf July 10, 2013, 10:14 a.m. UTC | #10
On 27.06.2013 at 15:11, Peter Lieven wrote:
> If the device supports unmapping and unmapped blocks read as
> zero, ensure that the whole device is unmapped and report
> .has_zero_init = 1 in this case to speed up qemu-img convert.
> 
> Signed-off-by: Peter Lieven <pl@kamp.de>
> ---
>  block/iscsi.c |   72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 71 insertions(+), 1 deletion(-)
> 
> diff --git a/block/iscsi.c b/block/iscsi.c
> index 92e66a6..621ca40 100644
> --- a/block/iscsi.c
> +++ b/block/iscsi.c
> @@ -1436,7 +1436,8 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
>  
>  static int iscsi_has_zero_init(BlockDriverState *bs)
>  {
> -    return 0;
> +    IscsiLun *iscsilun = bs->opaque;
> +    return (iscsilun->lbpu && iscsilun->lbprz) ? 1 : 0;
>  }
>  
>  static int iscsi_create(const char *filename, QEMUOptionParameter *options)
> @@ -1446,6 +1447,7 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options)
>      BlockDriverState bs;
>      IscsiLun *iscsilun = NULL;
>      QDict *bs_options;
> +    struct scsi_task *task = NULL;
>  
>      memset(&bs, 0, sizeof(BlockDriverState));
>  
> @@ -1481,7 +1483,75 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options)
>      }
>  
>      ret = 0;

Noticed something independent of your patch; a bit more context:

    if (bs.total_sectors < total_size) {
        ret = -ENOSPC;
    }

    ret = 0;

The -ENOSPC case can't ever trigger; there's a 'goto out' missing. Maybe
you can include another patch to fix this in the next version of your
series?
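
[The fix is presumably just the missing jump:]

    if (bs.total_sectors < total_size) {
        ret = -ENOSPC;
        goto out;    /* previously missing */
    }

    ret = 0;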

Kevin
Peter Lieven July 10, 2013, 1:52 p.m. UTC | #11
On 10.07.2013 12:14, Kevin Wolf wrote:
> On 27.06.2013 at 15:11, Peter Lieven wrote:
>> If the device supports unmapping and unmapped blocks read as
>> zero, ensure that the whole device is unmapped and report
>> .has_zero_init = 1 in this case to speed up qemu-img convert.
>>
>> Signed-off-by: Peter Lieven <pl@kamp.de>
>> ---
>>  block/iscsi.c |   72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  1 file changed, 71 insertions(+), 1 deletion(-)
>>
>> diff --git a/block/iscsi.c b/block/iscsi.c
>> index 92e66a6..621ca40 100644
>> --- a/block/iscsi.c
>> +++ b/block/iscsi.c
>> @@ -1436,7 +1436,8 @@ static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
>>  
>>  static int iscsi_has_zero_init(BlockDriverState *bs)
>>  {
>> -    return 0;
>> +    IscsiLun *iscsilun = bs->opaque;
>> +    return (iscsilun->lbpu && iscsilun->lbprz) ? 1 : 0;
>>  }
>>  
>>  static int iscsi_create(const char *filename, QEMUOptionParameter *options)
>> @@ -1446,6 +1447,7 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options)
>>      BlockDriverState bs;
>>      IscsiLun *iscsilun = NULL;
>>      QDict *bs_options;
>> +    struct scsi_task *task = NULL;
>>  
>>      memset(&bs, 0, sizeof(BlockDriverState));
>>  
>> @@ -1481,7 +1483,75 @@ static int iscsi_create(const char *filename, QEMUOptionParameter *options)
>>      }
>>  
>>      ret = 0;
> Noticed something independent of your patch; a bit more context:
>
>     if (bs.total_sectors < total_size) {
>         ret = -ENOSPC;
>     }
>
>     ret = 0;
>
> The -ENOSPC case can't ever trigger; there's a 'goto out' missing. Maybe
> you can include another patch to fix this in the next version of your
> series?
Will do.

The patch 7/11 itself will not be included. I was asked to implement a general
approach that zeroes out the device in qemu-img if a device has discard_zeroes.

Peter

Patch

diff --git a/block/iscsi.c b/block/iscsi.c
index 92e66a6..621ca40 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1436,7 +1436,8 @@  static int iscsi_truncate(BlockDriverState *bs, int64_t offset)
 
 static int iscsi_has_zero_init(BlockDriverState *bs)
 {
-    return 0;
+    IscsiLun *iscsilun = bs->opaque;
+    return (iscsilun->lbpu && iscsilun->lbprz) ? 1 : 0;
 }
 
 static int iscsi_create(const char *filename, QEMUOptionParameter *options)
@@ -1446,6 +1447,7 @@  static int iscsi_create(const char *filename, QEMUOptionParameter *options)
     BlockDriverState bs;
     IscsiLun *iscsilun = NULL;
     QDict *bs_options;
+    struct scsi_task *task = NULL;
 
     memset(&bs, 0, sizeof(BlockDriverState));
 
@@ -1481,7 +1483,75 @@  static int iscsi_create(const char *filename, QEMUOptionParameter *options)
     }
 
     ret = 0;
+
+    if (iscsilun->lbpu && iscsilun->lbprz) {
+        uint64_t lba = 0;
+        while (lba < iscsilun->num_blocks) {
+            struct scsi_get_lba_status *lbas = NULL;
+            struct scsi_lba_status_descriptor *lbasd = NULL;
+            enum scsi_provisioning_type provisioning;
+            uint32_t nb_sectors;
+
+            task = iscsi_get_lba_status_sync(iscsilun->iscsi, iscsilun->lun,
+                                             lba, 8 + 16);
+            if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+                error_report("iSCSI: Failed to get_lba_status on iSCSI lun. %s",
+                             iscsi_get_error(iscsilun->iscsi));
+                ret = -EINVAL;
+                goto out;
+            }
+
+            lbas = scsi_datain_unmarshall(task);
+            if (lbas == NULL) {
+                error_report("iSCSI: failed to unmarshall inquiry datain blob");
+                ret = -EINVAL;
+                goto out;
+            }
+            lbasd = &lbas->descriptors[0];
+            if (lbasd->lba != lba) {
+                ret = -EINVAL;
+                goto out;
+            }
+            nb_sectors = lbasd->num_blocks;
+            provisioning = lbasd->provisioning;
+            scsi_free_scsi_task(task);
+            task = NULL;
+
+            /* blocks from lba to lba + nb_sectors - 1 are not mapped
+             * and read as zero (lbprz==1) so we can skip them */
+            if (provisioning != SCSI_PROVISIONING_TYPE_MAPPED) {
+                lba += nb_sectors;
+                continue;
+            }
+
+            uint64_t lba2 = lba + nb_sectors;
+            while (lba < lba2) {
+                struct unmap_list list[1];
+                list[0].lba = lba;
+                list[0].num = iscsilun->max_unmap;
+                if (lba + list[0].num > iscsilun->num_blocks) {
+                    list[0].num = iscsilun->num_blocks - lba;
+                }
+                task = iscsi_unmap_sync(iscsilun->iscsi,
+                                        iscsilun->lun,
+                                        0, 0, &list[0], 1);
+                if (task == NULL || task->status != SCSI_STATUS_GOOD) {
+                    error_report("iSCSI: Failed to unmap data on iSCSI lun. %s",
+                                 iscsi_get_error(iscsilun->iscsi));
+                    ret = -EINVAL;
+                    goto out;
+                }
+                scsi_free_scsi_task(task);
+                task = NULL;
+                lba += list[0].num;
+            }
+        }
+    }
+
 out:
+    if (task) {
+        scsi_free_scsi_task(task);
+    }
     if (iscsilun->iscsi != NULL) {
         iscsi_destroy_context(iscsilun->iscsi);
     }