Patchwork [v2,7/8] block: core copy-on-read logic

login
register
mail settings
Submitter Stefan Hajnoczi
Date Nov. 17, 2011, 1:40 p.m.
Message ID <1321537232-799-8-git-send-email-stefanha@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/126230/
State New
Headers show

Comments

Stefan Hajnoczi - Nov. 17, 2011, 1:40 p.m.
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 block.c      |   72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 trace-events |    1 +
 2 files changed, 73 insertions(+), 0 deletions(-)
Zhiyong Wu - Nov. 23, 2011, 3:42 a.m.
On Thu, Nov 17, 2011 at 9:40 PM, Stefan Hajnoczi
<stefanha@linux.vnet.ibm.com> wrote:
> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
>  block.c      |   72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  trace-events |    1 +
>  2 files changed, 73 insertions(+), 0 deletions(-)
>
> diff --git a/block.c b/block.c
> index 0eef122..d5faa6c 100644
> --- a/block.c
> +++ b/block.c
> @@ -1464,6 +1464,61 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
>     return 0;
>  }
>
> +static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
> +        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
> +{
> +    /* Perform I/O through a temporary buffer so that users who scribble over
> +     * their read buffer while the operation is in progress do not end up
> +     * modifying the image file.  This is critical for zero-copy guest I/O
> +     * where anything might happen inside guest memory.
> +     */
> +    void *bounce_buffer;
> +
> +    struct iovec iov;
> +    QEMUIOVector bounce_qiov;
> +    int64_t cluster_sector_num;
> +    int cluster_nb_sectors;
> +    size_t skip_bytes;
> +    int ret;
> +
> +    /* Cover entire cluster so no additional backing file I/O is required when
> +     * allocating cluster in the image file.
> +     */
> +    round_to_clusters(bs, sector_num, nb_sectors,
> +                      &cluster_sector_num, &cluster_nb_sectors);
> +
> +    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors,
> +                                cluster_sector_num, cluster_nb_sectors);
> +
> +    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
> +    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
> +    qemu_iovec_init_external(&bounce_qiov, &iov, 1);
> +
> +    ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
> +                                 &bounce_qiov);
> +    if (ret < 0) {
> +        goto err;
> +    }
> +
> +    ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
> +                                  &bounce_qiov);
> +    if (ret < 0) {
> +        /* It might be okay to ignore write errors for guest requests.  If this
> +         * is a deliberate copy-on-read then we don't want to ignore the error.
> +         * Simply report it in all cases.
> +         */
> +        goto err;
> +    }
> +
> +    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
> +    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
> +                           nb_sectors * BDRV_SECTOR_SIZE);
> +
> +err:
> +    qemu_vfree(bounce_buffer);
> +    return ret;
> +}
> +
>  /*
>  * Handle a read request in coroutine context
>  */
> @@ -1491,7 +1546,24 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>     }
>
>     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
> +
> +    if (bs->copy_on_read) {
Why is tracked_request_begin/end() not put inside the brace around
bs->copy_on_read?
If this COR function is not enabled, I guess that the request tracing
function should not be needed.
> +        int pnum;
> +
> +        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
> +        if (ret < 0) {
> +            goto out;
> +        }
> +
> +        if (!ret || pnum != nb_sectors) {
> +            ret = bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, qiov);
> +            goto out;
> +        }
> +    }
> +
>     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
> +
> +out:
>     tracked_request_end(&req);
>     return ret;
>  }
> diff --git a/trace-events b/trace-events
> index 962caca..518b76b 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -69,6 +69,7 @@ bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
>  bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
>  bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
>  bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
> +bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
>
>  # hw/virtio-blk.c
>  virtio_blk_req_complete(void *req, int status) "req %p status %d"
> --
> 1.7.7.1
>
>
>
Zhiyong Wu - Nov. 23, 2011, 4:42 a.m.
On Thu, Nov 17, 2011 at 9:40 PM, Stefan Hajnoczi
<stefanha@linux.vnet.ibm.com> wrote:
> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
>  block.c      |   72 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  trace-events |    1 +
>  2 files changed, 73 insertions(+), 0 deletions(-)
>
> diff --git a/block.c b/block.c
> index 0eef122..d5faa6c 100644
> --- a/block.c
> +++ b/block.c
> @@ -1464,6 +1464,61 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
>     return 0;
>  }
>
> +static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
> +        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
> +{
> +    /* Perform I/O through a temporary buffer so that users who scribble over
> +     * their read buffer while the operation is in progress do not end up
> +     * modifying the image file.  This is critical for zero-copy guest I/O
> +     * where anything might happen inside guest memory.
> +     */
> +    void *bounce_buffer;
> +
> +    struct iovec iov;
> +    QEMUIOVector bounce_qiov;
> +    int64_t cluster_sector_num;
> +    int cluster_nb_sectors;
> +    size_t skip_bytes;
> +    int ret;
> +
> +    /* Cover entire cluster so no additional backing file I/O is required when
> +     * allocating cluster in the image file.
> +     */
> +    round_to_clusters(bs, sector_num, nb_sectors,
> +                      &cluster_sector_num, &cluster_nb_sectors);
Why need to round down/up sector_num/nb_sectors in this function? The
detection of race condition for write request has been done before
this function.

> +
> +    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors,
> +                                cluster_sector_num, cluster_nb_sectors);
> +
> +    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
> +    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
> +    qemu_iovec_init_external(&bounce_qiov, &iov, 1);
> +
> +    ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
> +                                 &bounce_qiov);
> +    if (ret < 0) {
> +        goto err;
> +    }
> +
> +    ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
> +                                  &bounce_qiov);
> +    if (ret < 0) {
> +        /* It might be okay to ignore write errors for guest requests.  If this
> +         * is a deliberate copy-on-read then we don't want to ignore the error.
> +         * Simply report it in all cases.
> +         */
> +        goto err;
> +    }
> +
> +    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
> +    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
> +                           nb_sectors * BDRV_SECTOR_SIZE);
> +
> +err:
> +    qemu_vfree(bounce_buffer);
> +    return ret;
> +}
> +
>  /*
>  * Handle a read request in coroutine context
>  */
> @@ -1491,7 +1546,24 @@ static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
>     }
>
>     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
> +
> +    if (bs->copy_on_read) {
> +        int pnum;
> +
> +        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
> +        if (ret < 0) {
> +            goto out;
> +        }
> +
> +        if (!ret || pnum != nb_sectors) {
> +            ret = bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, qiov);
> +            goto out;
> +        }
> +    }
> +
>     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
> +
> +out:
>     tracked_request_end(&req);
>     return ret;
>  }
> diff --git a/trace-events b/trace-events
> index 962caca..518b76b 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -69,6 +69,7 @@ bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
>  bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
>  bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
>  bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
> +bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
>
>  # hw/virtio-blk.c
>  virtio_blk_req_complete(void *req, int status) "req %p status %d"
> --
> 1.7.7.1
>
>
>
Stefan Hajnoczi - Nov. 23, 2011, 8:58 a.m.
On Wed, Nov 23, 2011 at 4:42 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
> On Thu, Nov 17, 2011 at 9:40 PM, Stefan Hajnoczi
> <stefanha@linux.vnet.ibm.com> wrote:
>> +    /* Cover entire cluster so no additional backing file I/O is required when
>> +     * allocating cluster in the image file.
>> +     */
>> +    round_to_clusters(bs, sector_num, nb_sectors,
>> +                      &cluster_sector_num, &cluster_nb_sectors);
> Why need to round down/up sector_num/nb_sectors in this function? The
> detection of race condition for write request has been done before
> this function.

If we write less than a cluster then the image format will have to
perform additional I/O to populate the regions of the cluster that we did
not touch.  So we touch entire clusters and therefore no extra I/O is
generated.

The comment right above the line you are asking about explains this.

Stefan
Stefan Hajnoczi - Nov. 23, 2011, 9 a.m.
On Wed, Nov 23, 2011 at 8:58 AM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Wed, Nov 23, 2011 at 4:42 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>> On Thu, Nov 17, 2011 at 9:40 PM, Stefan Hajnoczi
>> <stefanha@linux.vnet.ibm.com> wrote:
>>> +    /* Cover entire cluster so no additional backing file I/O is required when
>>> +     * allocating cluster in the image file.
>>> +     */
>>> +    round_to_clusters(bs, sector_num, nb_sectors,
>>> +                      &cluster_sector_num, &cluster_nb_sectors);
>> Why need to round down/up sector_num/nb_sectors in this function? The
>> detection of race condition for write request has been done before
>> this function.
>
> If we write less than a cluster then the image format will have to
> perform additional to populate the regions of the cluster that we did

"perform additional I/O"
Stefan Hajnoczi - Nov. 23, 2011, 9:05 a.m.
On Wed, Nov 23, 2011 at 3:42 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
> On Thu, Nov 17, 2011 at 9:40 PM, Stefan Hajnoczi
> <stefanha@linux.vnet.ibm.com> wrote:
>>     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
>> +
>> +    if (bs->copy_on_read) {
> Why is  tracked_request_begin/end() not put inside the brace around
> bs->copy_on_read?
> If this COR function is not enabled, i guess that request tracing
> function should not be need.

It's not safe to put the calls inside the "if (bs->copy_on_read) {"
body because turning off copy_on_read while a request is pending would
leave the request in the tracked list forever!

In a previous version of the series there was a flag to turn request
tracking on/off.  Pending requests would still remove themselves from
the list even after request tracking was disabled.

But request tracking is cheap - it involves filling in fields on the
stack and adding them to a linked list.  So to keep things simple we
always maintain this list.

Stefan
Zhiyong Wu - Nov. 23, 2011, 9:51 a.m.
On Wed, Nov 23, 2011 at 5:05 PM, Stefan Hajnoczi <stefanha@gmail.com> wrote:
> On Wed, Nov 23, 2011 at 3:42 AM, Zhi Yong Wu <zwu.kernel@gmail.com> wrote:
>> On Thu, Nov 17, 2011 at 9:40 PM, Stefan Hajnoczi
>> <stefanha@linux.vnet.ibm.com> wrote:
>>>     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
>>> +
>>> +    if (bs->copy_on_read) {
>> Why is  tracked_request_begin/end() not put inside the brace around
>> bs->copy_on_read?
>> If this COR function is not enabled, i guess that request tracing
>> function should not be need.
>
> It's not safe to put the calls inside the "if (bs->copy_on_read) {"
> body because turning off copy_on_read while a request is pending would
> leave the request in the tracked list forever!
GOT IT, thanks.
>
> In a previous version of the series there was a flag to turn request
> tracking on/off.  Pending requests would still remove themselves from
> the list even after request tracking was disabled.
>
> But request tracking is cheap - it involves filling in fields on the
> stack and adding them to a linked list.  So to keep things simple we
> always maintain this list.
>
> Stefan
>

Patch

diff --git a/block.c b/block.c
index 0eef122..d5faa6c 100644
--- a/block.c
+++ b/block.c
@@ -1464,6 +1464,61 @@  int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset,
     return 0;
 }
 
+static int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
+        int64_t sector_num, int nb_sectors, QEMUIOVector *qiov)
+{
+    /* Perform I/O through a temporary buffer so that users who scribble over
+     * their read buffer while the operation is in progress do not end up
+     * modifying the image file.  This is critical for zero-copy guest I/O
+     * where anything might happen inside guest memory.
+     */
+    void *bounce_buffer;
+
+    struct iovec iov;
+    QEMUIOVector bounce_qiov;
+    int64_t cluster_sector_num;
+    int cluster_nb_sectors;
+    size_t skip_bytes;
+    int ret;
+
+    /* Cover entire cluster so no additional backing file I/O is required when
+     * allocating cluster in the image file.
+     */
+    round_to_clusters(bs, sector_num, nb_sectors,
+                      &cluster_sector_num, &cluster_nb_sectors);
+
+    trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors,
+                                cluster_sector_num, cluster_nb_sectors);
+
+    iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE;
+    iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
+    qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+
+    ret = bs->drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors,
+                                 &bounce_qiov);
+    if (ret < 0) {
+        goto err;
+    }
+
+    ret = bs->drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors,
+                                  &bounce_qiov);
+    if (ret < 0) {
+        /* It might be okay to ignore write errors for guest requests.  If this
+         * is a deliberate copy-on-read then we don't want to ignore the error.
+         * Simply report it in all cases.
+         */
+        goto err;
+    }
+
+    skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE;
+    qemu_iovec_from_buffer(qiov, bounce_buffer + skip_bytes,
+                           nb_sectors * BDRV_SECTOR_SIZE);
+
+err:
+    qemu_vfree(bounce_buffer);
+    return ret;
+}
+
 /*
  * Handle a read request in coroutine context
  */
@@ -1491,7 +1546,24 @@  static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
     }
 
     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
+
+    if (bs->copy_on_read) {
+        int pnum;
+
+        ret = bdrv_co_is_allocated(bs, sector_num, nb_sectors, &pnum);
+        if (ret < 0) {
+            goto out;
+        }
+
+        if (!ret || pnum != nb_sectors) {
+            ret = bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, qiov);
+            goto out;
+        }
+    }
+
     ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
+
+out:
     tracked_request_end(&req);
     return ret;
 }
diff --git a/trace-events b/trace-events
index 962caca..518b76b 100644
--- a/trace-events
+++ b/trace-events
@@ -69,6 +69,7 @@  bdrv_lock_medium(void *bs, bool locked) "bs %p locked %d"
 bdrv_co_readv(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_writev(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_num %"PRId64" nb_sectors %d"
 bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
+bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
 
 # hw/virtio-blk.c
 virtio_blk_req_complete(void *req, int status) "req %p status %d"