Patchwork [v4,04/15] block: add image streaming block job

login
register
mail settings
Submitter Stefan Hajnoczi
Date Jan. 6, 2012, 2:01 p.m.
Message ID <1325858501-25741-5-git-send-email-stefanha@linux.vnet.ibm.com>
Download mbox | patch
Permalink /patch/134665/
State New
Headers show

Comments

Stefan Hajnoczi - Jan. 6, 2012, 2:01 p.m.
Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
---
 Makefile.objs  |    1 +
 block/stream.c |  119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 block_int.h    |    3 +
 trace-events   |    4 ++
 4 files changed, 127 insertions(+), 0 deletions(-)
 create mode 100644 block/stream.c
Luiz Capitulino - Jan. 11, 2012, 5:18 p.m.
On Fri,  6 Jan 2012 14:01:30 +0000
Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> wrote:

> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
>  Makefile.objs  |    1 +
>  block/stream.c |  119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h    |    3 +
>  trace-events   |    4 ++
>  4 files changed, 127 insertions(+), 0 deletions(-)
>  create mode 100644 block/stream.c
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index 64d84de..fde3769 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -35,6 +35,7 @@ block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow
>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-nested-y += qed-check.o
>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> +block-nested-y += stream.o
>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>  block-nested-$(CONFIG_LIBISCSI) += iscsi.o
> diff --git a/block/stream.c b/block/stream.c
> new file mode 100644
> index 0000000..8ff98cf
> --- /dev/null
> +++ b/block/stream.c
> @@ -0,0 +1,119 @@
> +/*
> + * Image streaming
> + *
> + * Copyright IBM, Corp. 2011
> + *
> + * Authors:
> + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +
> +#include "trace.h"
> +#include "block_int.h"
> +
> +enum {
> +    /*
> +     * Size of data buffer for populating the image file.  This should be large
> +     * enough to process multiple clusters in a single call, so that populating
> +     * contiguous regions of the image is efficient.
> +     */
> +    STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */
> +};
> +
> +typedef struct StreamBlockJob {
> +    BlockJob common;
> +    BlockDriverState *base;
> +} StreamBlockJob;
> +
> +static int coroutine_fn stream_populate(BlockDriverState *bs,
> +                                        int64_t sector_num, int nb_sectors,
> +                                        void *buf)
> +{
> +    struct iovec iov = {
> +        .iov_base = buf,
> +        .iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
> +    };
> +    QEMUIOVector qiov;
> +
> +    qemu_iovec_init_external(&qiov, &iov, 1);
> +
> +    /* Copy-on-read the unallocated clusters */
> +    return bdrv_co_readv(bs, sector_num, nb_sectors, &qiov);
> +}
> +
> +static void coroutine_fn stream_run(void *opaque)
> +{
> +    StreamBlockJob *s = opaque;
> +    BlockDriverState *bs = s->common.bs;
> +    int64_t sector_num, end;
> +    int ret = 0;
> +    int n;
> +    void *buf;
> +
> +    buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
> +    s->common.len = bdrv_getlength(bs);
> +    bdrv_get_geometry(bs, (uint64_t *)&end);
> +
> +    bdrv_enable_copy_on_read(bs);
> +
> +    for (sector_num = 0; sector_num < end; sector_num += n) {
> +        if (block_job_is_cancelled(&s->common)) {
> +            break;
> +        }
> +
> +        /* TODO rate-limit */
> +        /* Note that even when no rate limit is applied we need to yield with
> +         * no pending I/O here so that qemu_aio_flush() is able to return.
> +         */
> +        co_sleep_ns(rt_clock, 0);
> +
> +        ret = bdrv_co_is_allocated(bs, sector_num,
> +                                   STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
> +        trace_stream_one_iteration(s, sector_num, n, ret);
> +        if (ret == 0) {
> +            ret = stream_populate(bs, sector_num, n, buf);
> +        }
> +        if (ret < 0) {
> +            break;
> +        }
> +
> +        /* Publish progress */
> +        s->common.offset += n * BDRV_SECTOR_SIZE;
> +    }
> +
> +    bdrv_disable_copy_on_read(bs);
> +
> +    if (sector_num == end && ret == 0) {
> +        ret = bdrv_change_backing_file(bs, NULL, NULL);
> +    }
> +
> +    qemu_vfree(buf);
> +    block_job_complete(&s->common, ret);
> +}
> +
> +static BlockJobType stream_job_type = {
> +    .instance_size = sizeof(StreamBlockJob),
> +    .job_type      = "stream",
> +};
> +
> +int stream_start(BlockDriverState *bs, BlockDriverState *base,
> +                 BlockDriverCompletionFunc *cb, void *opaque)
> +{
> +    StreamBlockJob *s;
> +    Coroutine *co;
> +
> +    if (bs->job) {
> +        return -EBUSY;
> +    }
> +
> +    s = block_job_create(&stream_job_type, bs, cb, opaque);
> +    s->base = base;

This is missing a check against NULL.

> +
> +    co = qemu_coroutine_create(stream_run);
> +    trace_stream_start(bs, base, s, co, opaque);
> +    qemu_coroutine_enter(co, s);
> +    return 0;
> +}
> diff --git a/block_int.h b/block_int.h
> index 316443e..c7c9178 100644
> --- a/block_int.h
> +++ b/block_int.h
> @@ -332,4 +332,7 @@ int block_job_set_speed(BlockJob *job, int64_t value);
>  void block_job_cancel(BlockJob *job);
>  bool block_job_is_cancelled(BlockJob *job);
>  
> +int stream_start(BlockDriverState *bs, BlockDriverState *base,
> +                 BlockDriverCompletionFunc *cb, void *opaque);
> +
>  #endif /* BLOCK_INT_H */
> diff --git a/trace-events b/trace-events
> index 360f039..c5368fa 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -70,6 +70,10 @@ bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_
>  bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
>  bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
>  
> +# block/stream.c
> +stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
> +stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p base %p s %p co %p opaque %p"
> +
>  # hw/virtio-blk.c
>  virtio_blk_req_complete(void *req, int status) "req %p status %d"
>  virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"
Stefan Hajnoczi - Jan. 12, 2012, 9:11 a.m.
On Wed, Jan 11, 2012 at 03:18:28PM -0200, Luiz Capitulino wrote:
> On Fri,  6 Jan 2012 14:01:30 +0000
> Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> wrote:
> > +int stream_start(BlockDriverState *bs, BlockDriverState *base,
> > +                 BlockDriverCompletionFunc *cb, void *opaque)
> > +{
> > +    StreamBlockJob *s;
> > +    Coroutine *co;
> > +
> > +    if (bs->job) {
> > +        return -EBUSY;
> > +    }
> > +
> > +    s = block_job_create(&stream_job_type, bs, cb, opaque);
> > +    s->base = base;
> 
> This is missing a check against NULL.

Good catch.
Kevin Wolf - Jan. 12, 2012, 10:59 a.m.
Am 06.01.2012 15:01, schrieb Stefan Hajnoczi:
> Signed-off-by: Stefan Hajnoczi <stefanha@linux.vnet.ibm.com>
> ---
>  Makefile.objs  |    1 +
>  block/stream.c |  119 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  block_int.h    |    3 +
>  trace-events   |    4 ++
>  4 files changed, 127 insertions(+), 0 deletions(-)
>  create mode 100644 block/stream.c
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index 64d84de..fde3769 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -35,6 +35,7 @@ block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow
>  block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
>  block-nested-y += qed-check.o
>  block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
> +block-nested-y += stream.o
>  block-nested-$(CONFIG_WIN32) += raw-win32.o
>  block-nested-$(CONFIG_POSIX) += raw-posix.o
>  block-nested-$(CONFIG_LIBISCSI) += iscsi.o
> diff --git a/block/stream.c b/block/stream.c
> new file mode 100644
> index 0000000..8ff98cf
> --- /dev/null
> +++ b/block/stream.c
> @@ -0,0 +1,119 @@
> +/*
> + * Image streaming
> + *
> + * Copyright IBM, Corp. 2011
> + *
> + * Authors:
> + *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU LGPL, version 2 or later.
> + * See the COPYING.LIB file in the top-level directory.
> + *
> + */
> +
> +#include "trace.h"
> +#include "block_int.h"
> +
> +enum {
> +    /*
> +     * Size of data buffer for populating the image file.  This should be large
> +     * enough to process multiple clusters in a single call, so that populating
> +     * contiguous regions of the image is efficient.
> +     */
> +    STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */
> +};
> +
> +typedef struct StreamBlockJob {
> +    BlockJob common;
> +    BlockDriverState *base;
> +} StreamBlockJob;
> +
> +static int coroutine_fn stream_populate(BlockDriverState *bs,
> +                                        int64_t sector_num, int nb_sectors,
> +                                        void *buf)
> +{
> +    struct iovec iov = {
> +        .iov_base = buf,
> +        .iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
> +    };
> +    QEMUIOVector qiov;
> +
> +    qemu_iovec_init_external(&qiov, &iov, 1);
> +
> +    /* Copy-on-read the unallocated clusters */
> +    return bdrv_co_readv(bs, sector_num, nb_sectors, &qiov);
> +}
> +
> +static void coroutine_fn stream_run(void *opaque)
> +{
> +    StreamBlockJob *s = opaque;
> +    BlockDriverState *bs = s->common.bs;
> +    int64_t sector_num, end;
> +    int ret = 0;
> +    int n;
> +    void *buf;
> +
> +    buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
> +    s->common.len = bdrv_getlength(bs);

No error check?

> +    bdrv_get_geometry(bs, (uint64_t *)&end);

Why call bdrv_getlength() twice? end = s->common.len >> BDRV_SECTOR_BITS
should be the same.

Kevin
Stefan Hajnoczi - Jan. 12, 2012, 11:39 a.m.
On Thu, Jan 12, 2012 at 10:59 AM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 06.01.2012 15:01, schrieb Stefan Hajnoczi:
>> +    buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
>> +    s->common.len = bdrv_getlength(bs);
>
> No error check?

Will fix.

>> +    bdrv_get_geometry(bs, (uint64_t *)&end);
>
> Why call bdrv_getlength() twice? end = s->common.len >> BDRV_SECTOR_BITS
> should be the same.

Okay, I'll change it.  I got sick of BDRV_SECTOR_* and called twice instead.
Kevin Wolf - Jan. 12, 2012, 12:53 p.m.
Am 12.01.2012 12:39, schrieb Stefan Hajnoczi:
> On Thu, Jan 12, 2012 at 10:59 AM, Kevin Wolf <kwolf@redhat.com> wrote:
>> Am 06.01.2012 15:01, schrieb Stefan Hajnoczi:
>>> +    buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
>>> +    s->common.len = bdrv_getlength(bs);
>>
>> No error check?
> 
> Will fix.
> 
>>> +    bdrv_get_geometry(bs, (uint64_t *)&end);
>>
>> Why call bdrv_getlength() twice? end = s->common.len >> BDRV_SECTOR_BITS
>> should be the same.
> 
> Okay, I'll change it.  I got sick of BDRV_SECTOR_* and called twice instead.

Well, you can try and change everything in the streaming code to bytes
instead of sectors. We should probably do this sooner or later anyway.
Sectors of 512 bytes are a completely arbitrary unit that doesn't make
much sense generally.

Kevin
Stefan Hajnoczi - Jan. 12, 2012, 1:05 p.m.
On Thu, Jan 12, 2012 at 12:53 PM, Kevin Wolf <kwolf@redhat.com> wrote:
> Am 12.01.2012 12:39, schrieb Stefan Hajnoczi:
>> On Thu, Jan 12, 2012 at 10:59 AM, Kevin Wolf <kwolf@redhat.com> wrote:
>>> Am 06.01.2012 15:01, schrieb Stefan Hajnoczi:
>>>> +    buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
>>>> +    s->common.len = bdrv_getlength(bs);
>>>
>>> No error check?
>>
>> Will fix.
>>
>>>> +    bdrv_get_geometry(bs, (uint64_t *)&end);
>>>
>>> Why call bdrv_getlength() twice? end = s->common.len >> BDRV_SECTOR_BITS
>>> should be the same.
>>
>> Okay, I'll change it.  I got sick of BDRV_SECTOR_* and called twice instead.
>
> Well, you can try and change everything in the streaming code to bytes
> instead of sectors. We should probably do this sooner or later anyway.
> Sectors of 512 bytes are a completely arbitrary unit that doesn't make
> much sense generally.

That doesn't work because block layer interfaces use nb_sectors.  We
still need to convert.

Stefan
Kevin Wolf - Jan. 12, 2012, 1:17 p.m.
Am 12.01.2012 14:05, schrieb Stefan Hajnoczi:
> On Thu, Jan 12, 2012 at 12:53 PM, Kevin Wolf <kwolf@redhat.com> wrote:
>> Am 12.01.2012 12:39, schrieb Stefan Hajnoczi:
>>> On Thu, Jan 12, 2012 at 10:59 AM, Kevin Wolf <kwolf@redhat.com> wrote:
>>>> Am 06.01.2012 15:01, schrieb Stefan Hajnoczi:
>>>>> +    buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
>>>>> +    s->common.len = bdrv_getlength(bs);
>>>>
>>>> No error check?
>>>
>>> Will fix.
>>>
>>>>> +    bdrv_get_geometry(bs, (uint64_t *)&end);
>>>>
>>>> Why call bdrv_getlength() twice? end = s->common.len >> BDRV_SECTOR_BITS
>>>> should be the same.
>>>
>>> Okay, I'll change it.  I got sick of BDRV_SECTOR_* and called twice instead.
>>
>> Well, you can try and change everything in the streaming code to bytes
>> instead of sectors. We should probably do this sooner or later anyway.
>> Sectors of 512 bytes are a completely arbitrary unit that doesn't make
>> much sense generally.
> 
> That doesn't work because block layer interfaces use nb_sectors.  We
> still need to convert.

Sure, somewhere you'll have the conversion. You can only push it a bit
closer to the invocation of the block drivers if you like. Everything
else would be a major refactoring (but eventually I think we'll do it).

Kevin

Patch

diff --git a/Makefile.objs b/Makefile.objs
index 64d84de..fde3769 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -35,6 +35,7 @@  block-nested-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow
 block-nested-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o
 block-nested-y += qed-check.o
 block-nested-y += parallels.o nbd.o blkdebug.o sheepdog.o blkverify.o
+block-nested-y += stream.o
 block-nested-$(CONFIG_WIN32) += raw-win32.o
 block-nested-$(CONFIG_POSIX) += raw-posix.o
 block-nested-$(CONFIG_LIBISCSI) += iscsi.o
diff --git a/block/stream.c b/block/stream.c
new file mode 100644
index 0000000..8ff98cf
--- /dev/null
+++ b/block/stream.c
@@ -0,0 +1,119 @@ 
+/*
+ * Image streaming
+ *
+ * Copyright IBM, Corp. 2011
+ *
+ * Authors:
+ *  Stefan Hajnoczi   <stefanha@linux.vnet.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ *
+ */
+
+#include "trace.h"
+#include "block_int.h"
+
+enum {
+    /*
+     * Size of data buffer for populating the image file.  This should be large
+     * enough to process multiple clusters in a single call, so that populating
+     * contiguous regions of the image is efficient.
+     */
+    STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */
+};
+
+typedef struct StreamBlockJob {
+    BlockJob common;
+    BlockDriverState *base;
+} StreamBlockJob;
+
+static int coroutine_fn stream_populate(BlockDriverState *bs,
+                                        int64_t sector_num, int nb_sectors,
+                                        void *buf)
+{
+    struct iovec iov = {
+        .iov_base = buf,
+        .iov_len  = nb_sectors * BDRV_SECTOR_SIZE,
+    };
+    QEMUIOVector qiov;
+
+    qemu_iovec_init_external(&qiov, &iov, 1);
+
+    /* Copy-on-read the unallocated clusters */
+    return bdrv_co_readv(bs, sector_num, nb_sectors, &qiov);
+}
+
+static void coroutine_fn stream_run(void *opaque)
+{
+    StreamBlockJob *s = opaque;
+    BlockDriverState *bs = s->common.bs;
+    int64_t sector_num, end;
+    int ret = 0;
+    int n;
+    void *buf;
+
+    buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE);
+    s->common.len = bdrv_getlength(bs);
+    bdrv_get_geometry(bs, (uint64_t *)&end);
+
+    bdrv_enable_copy_on_read(bs);
+
+    for (sector_num = 0; sector_num < end; sector_num += n) {
+        if (block_job_is_cancelled(&s->common)) {
+            break;
+        }
+
+        /* TODO rate-limit */
+        /* Note that even when no rate limit is applied we need to yield with
+         * no pending I/O here so that qemu_aio_flush() is able to return.
+         */
+        co_sleep_ns(rt_clock, 0);
+
+        ret = bdrv_co_is_allocated(bs, sector_num,
+                                   STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n);
+        trace_stream_one_iteration(s, sector_num, n, ret);
+        if (ret == 0) {
+            ret = stream_populate(bs, sector_num, n, buf);
+        }
+        if (ret < 0) {
+            break;
+        }
+
+        /* Publish progress */
+        s->common.offset += n * BDRV_SECTOR_SIZE;
+    }
+
+    bdrv_disable_copy_on_read(bs);
+
+    if (sector_num == end && ret == 0) {
+        ret = bdrv_change_backing_file(bs, NULL, NULL);
+    }
+
+    qemu_vfree(buf);
+    block_job_complete(&s->common, ret);
+}
+
+static BlockJobType stream_job_type = {
+    .instance_size = sizeof(StreamBlockJob),
+    .job_type      = "stream",
+};
+
+int stream_start(BlockDriverState *bs, BlockDriverState *base,
+                 BlockDriverCompletionFunc *cb, void *opaque)
+{
+    StreamBlockJob *s;
+    Coroutine *co;
+
+    if (bs->job) {
+        return -EBUSY;
+    }
+
+    s = block_job_create(&stream_job_type, bs, cb, opaque);
+    s->base = base;
+
+    co = qemu_coroutine_create(stream_run);
+    trace_stream_start(bs, base, s, co, opaque);
+    qemu_coroutine_enter(co, s);
+    return 0;
+}
diff --git a/block_int.h b/block_int.h
index 316443e..c7c9178 100644
--- a/block_int.h
+++ b/block_int.h
@@ -332,4 +332,7 @@  int block_job_set_speed(BlockJob *job, int64_t value);
 void block_job_cancel(BlockJob *job);
 bool block_job_is_cancelled(BlockJob *job);
 
+int stream_start(BlockDriverState *bs, BlockDriverState *base,
+                 BlockDriverCompletionFunc *cb, void *opaque);
+
 #endif /* BLOCK_INT_H */
diff --git a/trace-events b/trace-events
index 360f039..c5368fa 100644
--- a/trace-events
+++ b/trace-events
@@ -70,6 +70,10 @@  bdrv_co_write_zeroes(void *bs, int64_t sector_num, int nb_sector) "bs %p sector_
 bdrv_co_io_em(void *bs, int64_t sector_num, int nb_sectors, int is_write, void *acb) "bs %p sector_num %"PRId64" nb_sectors %d is_write %d acb %p"
 bdrv_co_copy_on_readv(void *bs, int64_t sector_num, int nb_sectors, int64_t cluster_sector_num, int cluster_nb_sectors) "bs %p sector_num %"PRId64" nb_sectors %d cluster_sector_num %"PRId64" cluster_nb_sectors %d"
 
+# block/stream.c
+stream_one_iteration(void *s, int64_t sector_num, int nb_sectors, int is_allocated) "s %p sector_num %"PRId64" nb_sectors %d is_allocated %d"
+stream_start(void *bs, void *base, void *s, void *co, void *opaque) "bs %p base %p s %p co %p opaque %p"
+
 # hw/virtio-blk.c
 virtio_blk_req_complete(void *req, int status) "req %p status %d"
 virtio_blk_rw_complete(void *req, int ret) "req %p ret %d"