Patchwork [v2,11/12] mirror: support more than one in-flight AIO operation

login
register
mail settings
Submitter Paolo Bonzini
Date Jan. 16, 2013, 5:31 p.m.
Message ID <1358357479-7912-12-git-send-email-pbonzini@redhat.com>
Download mbox | patch
Permalink /patch/212889/
State New
Headers show

Comments

Paolo Bonzini - Jan. 16, 2013, 5:31 p.m.
With AIO support in place, we can start copying more than one chunk
in parallel.  This patch introduces the required infrastructure for
this: the buffer is split into multiple granularity-sized chunks,
and there is a free list to access them.

Because of copy-on-write, a single operation may already require
multiple chunks to be available on the free list.

In addition, two different iterations on the HBitmap may want to
copy the same cluster.  We avoid this by keeping a bitmap of in-flight
I/O operations, and blocking until the previous iteration completes.
This should be a pretty rare occurrence, though; as long as there is
no overlap the next iteration can start before the previous one finishes.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
        v1->v2: the in_flight_bitmap is now properly set and cleared [Stefan]

 block/mirror.c |  111 ++++++++++++++++++++++++++++++++++++++++++++++++++------
 trace-events   |    4 ++-
 2 files changed, 102 insertions(+), 13 deletions(-)
Kevin Wolf - Jan. 21, 2013, 12:35 p.m.
Am 16.01.2013 18:31, schrieb Paolo Bonzini:
> With AIO support in place, we can start copying more than one chunk
> in parallel.  This patch introduces the required infrastructure for
> this: the buffer is split into multiple granularity-sized chunks,
> and there is a free list to access them.
> 
> Because of copy-on-write, a single operation may already require
> multiple chunks to be available on the free list.
> 
> In addition, two different iterations on the HBitmap may want to
> copy the same cluster.  We avoid this by keeping a bitmap of in-flight
> I/O operations, and blocking until the previous iteration completes.
> This should be a pretty rare occurrence, though; as long as there is
> no overlap the next iteration can start before the previous one finishes.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

I'm wondering if a whole bitmap is really appropriate when you have at
most 16 parallel requests in flight. Other places in qemu (like
copy-on-read or qcow2 cluster allocation) use lists of in-flight
requests instead.

I'm not requesting a change here, just wondering what the reasons are
and whether this, or the other places, or none of both should be changed
long term.

> ---
>         v1->v2: the in_flight_bitmap is now properly set and cleared [Stefan]
> 
>  block/mirror.c |  111 ++++++++++++++++++++++++++++++++++++++++++++++++++------
>  trace-events   |    4 ++-
>  2 files changed, 102 insertions(+), 13 deletions(-)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index 77bb184..686d2b7 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -17,7 +17,15 @@
>  #include "qemu/ratelimit.h"
>  #include "qemu/bitmap.h"
>  
> -#define SLICE_TIME 100000000ULL /* ns */
> +#define SLICE_TIME    100000000ULL /* ns */
> +#define MAX_IN_FLIGHT 16
> +
> +/* The mirroring buffer is a list of granularity-sized chunks.
> + * Free chunks are organized in a list.
> + */
> +typedef struct MirrorBuffer {
> +    QSIMPLEQ_ENTRY(MirrorBuffer) next;
> +} MirrorBuffer;
>  
>  typedef struct MirrorBlockJob {
>      BlockJob common;
> @@ -33,7 +41,10 @@ typedef struct MirrorBlockJob {
>      unsigned long *cow_bitmap;
>      HBitmapIter hbi;
>      uint8_t *buf;
> +    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
> +    int buf_free_count;
>  
> +    unsigned long *in_flight_bitmap;
>      int in_flight;
>      int ret;
>  } MirrorBlockJob;
> @@ -41,7 +52,6 @@ typedef struct MirrorBlockJob {
>  typedef struct MirrorOp {
>      MirrorBlockJob *s;
>      QEMUIOVector qiov;
> -    struct iovec iov;
>      int64_t sector_num;
>      int nb_sectors;
>  } MirrorOp;
> @@ -62,8 +72,23 @@ static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
>  static void mirror_iteration_done(MirrorOp *op)
>  {
>      MirrorBlockJob *s = op->s;
> +    struct iovec *iov;
> +    int64_t cluster_num;
> +    int i, nb_chunks, nb_sectors_chunk;
>  
>      s->in_flight--;
> +    iov = op->qiov.iov;
> +    for (i = 0; i < op->qiov.niov; i++) {
> +        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
> +        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
> +        s->buf_free_count++;
> +    }
> +
> +    nb_sectors_chunk = s->granularity >> BDRV_SECTOR_BITS;
> +    cluster_num = op->sector_num / nb_sectors_chunk;
> +    nb_chunks = op->nb_sectors / nb_sectors_chunk;
> +    bitmap_clear(s->in_flight_bitmap, cluster_num, nb_chunks);
> +
>      trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors);
>      g_slice_free(MirrorOp, op);
>      qemu_coroutine_enter(s->common.co, NULL);
> @@ -110,8 +135,8 @@ static void mirror_read_complete(void *opaque, int ret)
>  static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
>  {
>      BlockDriverState *source = s->common.bs;
> -    int nb_sectors, nb_sectors_chunk;
> -    int64_t end, sector_num, cluster_num;
> +    int nb_sectors, nb_sectors_chunk, nb_chunks;
> +    int64_t end, sector_num, cluster_num, next_sector, hbitmap_next_sector;
>      MirrorOp *op;
>  
>      s->sector_num = hbitmap_iter_next(&s->hbi);
> @@ -122,6 +147,8 @@ static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
>          assert(s->sector_num >= 0);
>      }
>  
> +    hbitmap_next_sector = s->sector_num;

Is there even a reason why s->sector_num exists in the first place? If
I'm not mistaken, it's only used locally and could live on the stack as
hbitmap_next_sector from the beginning.

Kevin
Paolo Bonzini - Jan. 21, 2013, 12:55 p.m.
Il 21/01/2013 13:35, Kevin Wolf ha scritto:
> I'm wondering if a whole bitmap is really appropriate when you have at
> most 16 parallel requests in flight. Other places in qemu (like
> copy-on-read or qcow2 cluster allocation) use lists of in-flight
> requests instead.
> 
> I'm not requesting a change here, just wondering what the reasons are
> and whether this, or the other places, or none of both should be changed
> long term.

The reason is simply that the code is reasoning in bitmaps a lot
(cow_bitmap and of course the dirty bitmap), so it was a natural pick
and the memory usage is not important.  It is simpler and more efficient
than a linear scan.

I have the required information already in the MirrorOp struct indeed,
but I didn't need those in a list.

Paolo

Patch

diff --git a/block/mirror.c b/block/mirror.c
index 77bb184..686d2b7 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -17,7 +17,15 @@ 
 #include "qemu/ratelimit.h"
 #include "qemu/bitmap.h"
 
-#define SLICE_TIME 100000000ULL /* ns */
+#define SLICE_TIME    100000000ULL /* ns */
+#define MAX_IN_FLIGHT 16
+
+/* The mirroring buffer is a list of granularity-sized chunks.
+ * Free chunks are organized in a list.
+ */
+typedef struct MirrorBuffer {
+    QSIMPLEQ_ENTRY(MirrorBuffer) next;
+} MirrorBuffer;
 
 typedef struct MirrorBlockJob {
     BlockJob common;
@@ -33,7 +41,10 @@  typedef struct MirrorBlockJob {
     unsigned long *cow_bitmap;
     HBitmapIter hbi;
     uint8_t *buf;
+    QSIMPLEQ_HEAD(, MirrorBuffer) buf_free;
+    int buf_free_count;
 
+    unsigned long *in_flight_bitmap;
     int in_flight;
     int ret;
 } MirrorBlockJob;
@@ -41,7 +52,6 @@  typedef struct MirrorBlockJob {
 typedef struct MirrorOp {
     MirrorBlockJob *s;
     QEMUIOVector qiov;
-    struct iovec iov;
     int64_t sector_num;
     int nb_sectors;
 } MirrorOp;
@@ -62,8 +72,23 @@  static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read,
 static void mirror_iteration_done(MirrorOp *op)
 {
     MirrorBlockJob *s = op->s;
+    struct iovec *iov;
+    int64_t cluster_num;
+    int i, nb_chunks, nb_sectors_chunk;
 
     s->in_flight--;
+    iov = op->qiov.iov;
+    for (i = 0; i < op->qiov.niov; i++) {
+        MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base;
+        QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next);
+        s->buf_free_count++;
+    }
+
+    nb_sectors_chunk = s->granularity >> BDRV_SECTOR_BITS;
+    cluster_num = op->sector_num / nb_sectors_chunk;
+    nb_chunks = op->nb_sectors / nb_sectors_chunk;
+    bitmap_clear(s->in_flight_bitmap, cluster_num, nb_chunks);
+
     trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors);
     g_slice_free(MirrorOp, op);
     qemu_coroutine_enter(s->common.co, NULL);
@@ -110,8 +135,8 @@  static void mirror_read_complete(void *opaque, int ret)
 static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
 {
     BlockDriverState *source = s->common.bs;
-    int nb_sectors, nb_sectors_chunk;
-    int64_t end, sector_num, cluster_num;
+    int nb_sectors, nb_sectors_chunk, nb_chunks;
+    int64_t end, sector_num, cluster_num, next_sector, hbitmap_next_sector;
     MirrorOp *op;
 
     s->sector_num = hbitmap_iter_next(&s->hbi);
@@ -122,6 +147,8 @@  static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
         assert(s->sector_num >= 0);
     }
 
+    hbitmap_next_sector = s->sector_num;
+
     /* If we have no backing file yet in the destination, and the cluster size
      * is very large, we need to do COW ourselves.  The first time a cluster is
      * copied, copy it entirely.
@@ -137,21 +164,59 @@  static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
         bdrv_round_to_clusters(s->target,
                                sector_num, nb_sectors_chunk,
                                &sector_num, &nb_sectors);
-        bitmap_set(s->cow_bitmap, sector_num / nb_sectors_chunk,
-                   nb_sectors / nb_sectors_chunk);
+
+        /* The rounding may make us copy sectors before the
+         * first dirty one.
+         */
+        cluster_num = sector_num / nb_sectors_chunk;
+    }
+
+    /* Wait for I/O to this cluster (from a previous iteration) to be done.  */
+    while (test_bit(cluster_num, s->in_flight_bitmap)) {
+        trace_mirror_yield_in_flight(s, sector_num, s->in_flight);
+        qemu_coroutine_yield();
     }
 
     end = s->common.len >> BDRV_SECTOR_BITS;
     nb_sectors = MIN(nb_sectors, end - sector_num);
+    nb_chunks = (nb_sectors + nb_sectors_chunk - 1) / nb_sectors_chunk;
+    while (s->buf_free_count < nb_chunks) {
+        trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight);
+        qemu_coroutine_yield();
+    }
+
+    /* We have enough free space to copy these sectors.  */
+    bitmap_set(s->in_flight_bitmap, cluster_num, nb_chunks);
+    if (s->cow_bitmap) {
+        bitmap_set(s->cow_bitmap, cluster_num, nb_chunks);
+    }
 
     /* Allocate a MirrorOp that is used as an AIO callback.  */
     op = g_slice_new(MirrorOp);
     op->s = s;
-    op->iov.iov_base = s->buf;
-    op->iov.iov_len  = nb_sectors * 512;
     op->sector_num = sector_num;
     op->nb_sectors = nb_sectors;
-    qemu_iovec_init_external(&op->qiov, &op->iov, 1);
+
+    /* Now make a QEMUIOVector taking enough granularity-sized chunks
+     * from s->buf_free.
+     */
+    qemu_iovec_init(&op->qiov, nb_chunks);
+    next_sector = sector_num;
+    while (nb_chunks-- > 0) {
+        MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free);
+        QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next);
+        s->buf_free_count--;
+        qemu_iovec_add(&op->qiov, buf, s->granularity);
+
+        /* Advance the HBitmapIter in parallel, so that we do not examine
+         * the same sector twice.
+         */
+        if (next_sector > hbitmap_next_sector && bdrv_get_dirty(source, next_sector)) {
+            hbitmap_next_sector = hbitmap_iter_next(&s->hbi);
+        }
+
+        next_sector += nb_sectors_chunk;
+    }
 
     bdrv_reset_dirty(source, sector_num, nb_sectors);
 
@@ -162,6 +227,23 @@  static void coroutine_fn mirror_iteration(MirrorBlockJob *s)
                    mirror_read_complete, op);
 }
 
+static void mirror_free_init(MirrorBlockJob *s)
+{
+    int granularity = s->granularity;
+    size_t buf_size = s->buf_size;
+    uint8_t *buf = s->buf;
+
+    assert(s->buf_free_count == 0);
+    QSIMPLEQ_INIT(&s->buf_free);
+    while (buf_size != 0) {
+        MirrorBuffer *cur = (MirrorBuffer *)buf;
+        QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next);
+        s->buf_free_count++;
+        buf_size -= granularity;
+        buf += granularity;
+    }
+}
+
 static void mirror_drain(MirrorBlockJob *s)
 {
     while (s->in_flight > 0) {
@@ -190,6 +272,9 @@  static void coroutine_fn mirror_run(void *opaque)
         return;
     }
 
+    length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
+    s->in_flight_bitmap = bitmap_new(length);
+
     /* If we have no backing file yet in the destination, we cannot let
      * the destination do COW.  Instead, we copy sectors around the
      * dirty data if needed.  We need a bitmap to do that.
@@ -200,7 +285,6 @@  static void coroutine_fn mirror_run(void *opaque)
         bdrv_get_info(s->target, &bdi);
         if (s->buf_size < bdi.cluster_size) {
             s->buf_size = bdi.cluster_size;
-            length = (bdrv_getlength(bs) + s->granularity - 1) / s->granularity;
             s->cow_bitmap = bitmap_new(length);
         }
     }
@@ -208,6 +292,7 @@  static void coroutine_fn mirror_run(void *opaque)
     end = s->common.len >> BDRV_SECTOR_BITS;
     s->buf = qemu_blockalign(bs, s->buf_size);
     nb_sectors_chunk = s->granularity >> BDRV_SECTOR_BITS;
+    mirror_free_init(s);
 
     if (s->mode != MIRROR_SYNC_MODE_NONE) {
         /* First part, loop on the sectors and initialize the dirty bitmap.  */
@@ -253,8 +338,9 @@  static void coroutine_fn mirror_run(void *opaque)
          */
         if (qemu_get_clock_ns(rt_clock) - last_pause_ns < SLICE_TIME &&
             s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) {
-            if (s->in_flight > 0) {
-                trace_mirror_yield(s, s->in_flight, cnt);
+            if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 ||
+                (cnt == 0 && s->in_flight > 0)) {
+                trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt);
                 qemu_coroutine_yield();
                 continue;
             } else if (cnt != 0) {
@@ -346,6 +432,7 @@  immediate_exit:
     assert(s->in_flight == 0);
     qemu_vfree(s->buf);
     g_free(s->cow_bitmap);
+    g_free(s->in_flight_bitmap);
     bdrv_set_dirty_tracking(bs, 0);
     bdrv_iostatus_disable(s->target);
     if (s->should_complete && ret == 0) {
diff --git a/trace-events b/trace-events
index 4427d1f..1988ccd 100644
--- a/trace-events
+++ b/trace-events
@@ -86,7 +86,9 @@  mirror_before_sleep(void *s, int64_t cnt, int synced) "s %p dirty count %"PRId64
 mirror_one_iteration(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
 mirror_cow(void *s, int64_t sector_num) "s %p sector_num %"PRId64
 mirror_iteration_done(void *s, int64_t sector_num, int nb_sectors) "s %p sector_num %"PRId64" nb_sectors %d"
-mirror_yield(void *s, int64_t cnt, int in_flight) "s %p dirty count %"PRId64" in_flight %d"
+mirror_yield(void *s, int64_t cnt, int buf_free_count, int in_flight) "s %p dirty count %"PRId64" free buffers %d in_flight %d"
+mirror_yield_in_flight(void *s, int64_t sector_num, int in_flight) "s %p sector_num %"PRId64" in_flight %d"
+mirror_yield_buf_busy(void *s, int nb_chunks, int in_flight) "s %p requested chunks %d in_flight %d"
 
 # blockdev.c
 qmp_block_job_cancel(void *job) "job %p"