diff mbox

[09/11] raw-posix: implement write_zeroes with MAY_UNMAP for block devices

Message ID 1384271389-20716-10-git-send-email-pbonzini@redhat.com
State New
Headers show

Commit Message

Paolo Bonzini Nov. 12, 2013, 3:49 p.m. UTC
See the next commit for the description of the Linux kernel problem
that is worked around in raw_open_common.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 block/raw-posix.c | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

Comments

Peter Lieven Nov. 13, 2013, 6:29 a.m. UTC | #1
Wouldn't it be good to add bdi->can_write_zeroes_with_unmap here as well?
This would automatically avoid full allocation when converting something to a host device
supporting BLKDISCARDZEROES.

Peter

Am 12.11.2013 um 16:49 schrieb Paolo Bonzini <pbonzini@redhat.com>:

> See the next commit for the description of the Linux kernel problem
> that is worked around in raw_open_common.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> ---
> block/raw-posix.c | 38 ++++++++++++++++++++++++++++++++++++++
> 1 file changed, 38 insertions(+)
> 
> diff --git a/block/raw-posix.c b/block/raw-posix.c
> index 830e109..5cb46f1 100644
> --- a/block/raw-posix.c
> +++ b/block/raw-posix.c
> @@ -335,6 +335,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
>     if (S_ISREG(st.st_mode)) {
>         s->discard_zeroes = true;
>     }
> +#ifdef BLKDISCARDZEROES
> +    if (S_ISBLK(st.st_mode)) {
> +        unsigned int arg;
> +        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
> +            s->discard_zeroes = true;
> +        }
> +    }
> +#endif
> +#ifdef CONFIG_LINUX
> +	/* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
> +	 * not rely on the contents of discarded blocks unless using O_DIRECT.
> +	 */
> +        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
> +            s->discard_zeroes = false;
> +        }
> +#endif
> +    }
> 
> #ifdef CONFIG_XFS
>     if (platform_test_xfs_fd(s->fd)) {
> @@ -1587,6 +1604,26 @@ static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs,
>                        cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
> }
> 
> +static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs,
> +    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
> +{
> +    BDRVRawState *s = bs->opaque;
> +    int rc;
> +
> +    rc = fd_open(bs);
> +    if (rc < 0) {
> +        return rc;
> +    }
> +    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
> +        return -ENOTSUP;
> +    }
> +    if (!s->discard_zeroes) {
> +        return -ENOTSUP;
> +    }
> +    return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
> +                          QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
> +}
> +
> static int hdev_create(const char *filename, QEMUOptionParameter *options,
>                        Error **errp)
> {
> @@ -1639,6 +1676,7 @@ static BlockDriver bdrv_host_device = {
>     .bdrv_reopen_abort   = raw_reopen_abort,
>     .bdrv_create        = hdev_create,
>     .create_options     = raw_create_options,
> +    .bdrv_co_write_zeroes = hdev_co_write_zeroes,
> 
>     .bdrv_aio_readv	= raw_aio_readv,
>     .bdrv_aio_writev	= raw_aio_writev,
> -- 
> 1.8.4.2
> 
>
Paolo Bonzini Nov. 13, 2013, 9:44 a.m. UTC | #2
Il 13/11/2013 07:29, Peter Lieven ha scritto:
> Wouldn't it be good to add bdi->can_write_zeroes_with_unmap here as well?

We do:

> +    bdi->unallocated_blocks_are_zero = s->discard_zeroes;
> +    bdi->can_write_zeroes_with_unmap = s->discard_zeroes;

> This would automatically avoid full allocation when converting something to a host device
> supporting BLKDISCARDZEROES.

Yes, that's (part of) the point of this patch.

Regarding the question you posed in the previous patch:

> does BLKDISCARDZEROES ioctl guarantee that a device is
> zero initialized or does it just guarantee that a discard may not
> fail and that it reads as zeroes afterwards?

Only the latter.  ".bdrv_has_zero_init" is only present in the bdrv_file
BlockDriver.

Paolo

> Peter
> 
> Am 12.11.2013 um 16:49 schrieb Paolo Bonzini <pbonzini@redhat.com>:
> 
>> See the next commit for the description of the Linux kernel problem
>> that is worked around in raw_open_common.
>>
>> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
>> ---
>> block/raw-posix.c | 38 ++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 38 insertions(+)
>>
>> diff --git a/block/raw-posix.c b/block/raw-posix.c
>> index 830e109..5cb46f1 100644
>> --- a/block/raw-posix.c
>> +++ b/block/raw-posix.c
>> @@ -335,6 +335,23 @@ static int raw_open_common(BlockDriverState *bs, QDict *options,
>>     if (S_ISREG(st.st_mode)) {
>>         s->discard_zeroes = true;
>>     }
>> +#ifdef BLKDISCARDZEROES
>> +    if (S_ISBLK(st.st_mode)) {
>> +        unsigned int arg;
>> +        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
>> +            s->discard_zeroes = true;
>> +        }
>> +    }
>> +#endif
>> +#ifdef CONFIG_LINUX
>> +	/* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
>> +	 * not rely on the contents of discarded blocks unless using O_DIRECT.
>> +	 */
>> +        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
>> +            s->discard_zeroes = false;
>> +        }
>> +#endif
>> +    }
>>
>> #ifdef CONFIG_XFS
>>     if (platform_test_xfs_fd(s->fd)) {
>> @@ -1587,6 +1604,26 @@ static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs,
>>                        cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
>> }
>>
>> +static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs,
>> +    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
>> +{
>> +    BDRVRawState *s = bs->opaque;
>> +    int rc;
>> +
>> +    rc = fd_open(bs);
>> +    if (rc < 0) {
>> +        return rc;
>> +    }
>> +    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
>> +        return -ENOTSUP;
>> +    }
>> +    if (!s->discard_zeroes) {
>> +        return -ENOTSUP;
>> +    }
>> +    return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
>> +                          QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
>> +}
>> +
>> static int hdev_create(const char *filename, QEMUOptionParameter *options,
>>                        Error **errp)
>> {
>> @@ -1639,6 +1676,7 @@ static BlockDriver bdrv_host_device = {
>>     .bdrv_reopen_abort   = raw_reopen_abort,
>>     .bdrv_create        = hdev_create,
>>     .create_options     = raw_create_options,
>> +    .bdrv_co_write_zeroes = hdev_co_write_zeroes,
>>
>>     .bdrv_aio_readv	= raw_aio_readv,
>>     .bdrv_aio_writev	= raw_aio_writev,
>> -- 
>> 1.8.4.2
>>
>>
> 
> 
>
Peter Lieven Nov. 13, 2013, 2:14 p.m. UTC | #3
Am 13.11.2013 um 10:44 schrieb Paolo Bonzini <pbonzini@redhat.com>:

> Il 13/11/2013 07:29, Peter Lieven ha scritto:
>> Wouldn't it be good to add bdi->can_write_zeroes_with_unmap here as well?
> 
> We do:
> 
>> +    bdi->unallocated_blocks_are_zero = s->discard_zeroes;
>> +    bdi->can_write_zeroes_with_unmap = s->discard_zeroes;

Sorry, I missed this.

> 
>> This would automatically avoid full allocation when converting something to a host device
>> supporting BLKDISCARDZEROES.
> 
> Yes, that's (part of) the point of this patch.
> 
> Regarding the question you posed in the previous patch:
> 
>> does BLKDISCARDZEROES ioctl guarantee that a device is
>> zero initialized or does it just guarantee that a discard may not
>> fail and that it reads as zeroes afterwards?
> 
> Only the latter.  ".bdrv_has_zero_init" is only present in the bdrv_file
> BlockDriver.

Then bdi->unallocated_blocks_are_zero must stay 0. .bdrv_has_zero_init's
semantic is to reflect the zero status of all blocks of the device right after bdrv_create
independently of their allocation status. bdi->unallocated_blocks_are_zero
reflects the zero status of every unallocated block regardless if it was
unallocated right from the beginning or became unallocated through a discard.

Peter
Paolo Bonzini Nov. 13, 2013, 2:39 p.m. UTC | #4
Il 13/11/2013 15:14, Peter Lieven ha scritto:
>>> >> does BLKDISCARDZEROES ioctl guarantee that a device is
>>> >> zero initialized or does it just guarantee that a discard may not
>>> >> fail and that it reads as zeroes afterwards?
>> > 
>> > Only the latter.  ".bdrv_has_zero_init" is only present in the bdrv_file
>> > BlockDriver.
> Then bdi->unallocated_blocks_are_zero must stay 0. .bdrv_has_zero_init's
> semantic is to reflect the zero status of all blocks of the device right after bdrv_create
> independently of their allocation status. bdi->unallocated_blocks_are_zero
> reflects the zero status of every unallocated block regardless if it was
> unallocated right from the beginning or became unallocated through a discard.

What we have is:

* bdi->unallocated_blocks_are_zero returns true

* bdrv_create doesn't ensure that every block starts unallocated

* hence bdrv_has_zero_init returns false

Blocks that (for any reason) are unallocated after bdrv_create *will* be
zero if BLKDISCARDZEROES returns true.

Paolo
Peter Lieven Nov. 13, 2013, 2:45 p.m. UTC | #5
> Am 13.11.2013 um 15:39 schrieb Paolo Bonzini <pbonzini@redhat.com>:
> 
> Il 13/11/2013 15:14, Peter Lieven ha scritto:
>>>>>> does BLKDISCARDZEROES ioctl guarantee that a device is
>>>>>> zero initialized or does it just guarantee that a discard may not
>>>>>> fail and that it reads as zeroes afterwards?
>>>> 
>>>> Only the latter.  ".bdrv_has_zero_init" is only present in the bdrv_file
>>>> BlockDriver.
>> Then bdi->unallocated_blocks_are_zero must stay 0. .bdrv_has_zero_init's
>> semantic is to reflect the zero status of all blocks of the device right after bdrv_create
>> independently of their allocation status. bdi->unallocated_blocks_are_zero
>> reflects the zero status of every unallocated block regardless if it was
>> unallocated right from the beginning or became unallocated through a discard.
> 
> What we have is:
> 
> * bdi->unallocated_blocks_are_zero returns true
> 
> * bdrv_create doesn't ensure that every block starts unallocated
> 
> * hence bdrv_has_zero_init returns false
> 
> Blocks that (for any reason) are unallocated after bdrv_create *will* be
> zero if BLKDISCARDZEROES returns true.

Ok, then i misunderstood your comment. If BLKDISCARDZEROES means lbprz == 1 SCSI speaking then you are right.

Peter

> 
> Paolo
diff mbox

Patch

diff --git a/block/raw-posix.c b/block/raw-posix.c
index 830e109..5cb46f1 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -335,6 +335,23 @@  static int raw_open_common(BlockDriverState *bs, QDict *options,
     if (S_ISREG(st.st_mode)) {
         s->discard_zeroes = true;
     }
+#ifdef BLKDISCARDZEROES
+    if (S_ISBLK(st.st_mode)) {
+        unsigned int arg;
+        if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) {
+            s->discard_zeroes = true;
+        }
+    }
+#endif
+#ifdef CONFIG_LINUX
+	/* On Linux 3.10, BLKDISCARD leaves stale data in the page cache.  Do
+	 * not rely on the contents of discarded blocks unless using O_DIRECT.
+	 */
+        if (!(bs->open_flags & BDRV_O_NOCACHE)) {
+            s->discard_zeroes = false;
+        }
+#endif
+    }
 
 #ifdef CONFIG_XFS
     if (platform_test_xfs_fd(s->fd)) {
@@ -1587,6 +1604,26 @@  static coroutine_fn BlockDriverAIOCB *hdev_aio_discard(BlockDriverState *bs,
                        cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
 }
 
+static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
+{
+    BDRVRawState *s = bs->opaque;
+    int rc;
+
+    rc = fd_open(bs);
+    if (rc < 0) {
+        return rc;
+    }
+    if (!(flags & BDRV_REQ_MAY_UNMAP)) {
+        return -ENOTSUP;
+    }
+    if (!s->discard_zeroes) {
+        return -ENOTSUP;
+    }
+    return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors,
+                          QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV);
+}
+
 static int hdev_create(const char *filename, QEMUOptionParameter *options,
                        Error **errp)
 {
@@ -1639,6 +1676,7 @@  static BlockDriver bdrv_host_device = {
     .bdrv_reopen_abort   = raw_reopen_abort,
     .bdrv_create        = hdev_create,
     .create_options     = raw_create_options,
+    .bdrv_co_write_zeroes = hdev_co_write_zeroes,
 
     .bdrv_aio_readv	= raw_aio_readv,
     .bdrv_aio_writev	= raw_aio_writev,