Patchwork [RFC,v2,17/23] qcow2: Move COW and L2 update into own coroutine

login
register
mail settings
Submitter Kevin Wolf
Date Feb. 13, 2013, 1:22 p.m.
Message ID <1360761733-25347-18-git-send-email-kwolf@redhat.com>
Download mbox | patch
Permalink /patch/220132/
State New
Headers show

Comments

Kevin Wolf - Feb. 13, 2013, 1:22 p.m.
This creates a separate coroutine for processing the COW and the L2
table update of allocating requests. The request itself can then
complete while the second part is still being processed.

We need a qemu_aio_flush() hook in order to ensure that these
coroutines for the second part aren't still running after bdrv_drain_all
(e.g. when the VM is stopped).

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block.c                   |    5 ++
 block/qcow2.c             |  131 ++++++++++++++++++++++++++++++++++++++-------
 block/qcow2.h             |    8 +++
 include/block/block_int.h |    3 +
 4 files changed, 127 insertions(+), 20 deletions(-)
Stefan Hajnoczi - Feb. 15, 2013, 1:01 p.m.
On Wed, Feb 13, 2013 at 02:22:07PM +0100, Kevin Wolf wrote:
> +static bool qcow2_drain(BlockDriverState *bs)

I don't like this function name.  This function is a bool query but
normal drain() means flush requests (i.e. do something that modified
state).

> +{
> +    BDRVQcowState *s = bs->opaque;
> +
> +    return !QLIST_EMPTY(&s->cluster_allocs);
> +}
> +
> +static inline coroutine_fn void stop_l2meta(BlockDriverState *bs)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +
> +    qemu_co_rwlock_wrlock(&s->l2meta_flush);
> +}
> +
> +static inline coroutine_fn void resume_l2meta(BlockDriverState *bs)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +
> +    qemu_co_rwlock_unlock(&s->l2meta_flush);
> +}

Maybe later patches will add more behavior here.  I don't like wrapper
locks - it just obfuscates the locking protocol.  If nothing else gets
added to these functions then it's cleaner to remove them and let code
directly lock/unlock.

Patch

diff --git a/block.c b/block.c
index 50dab8e..5ae80a0 100644
--- a/block.c
+++ b/block.c
@@ -1225,7 +1225,12 @@  void bdrv_drain_all(void)
                 qemu_co_queue_restart_all(&bs->throttled_reqs);
                 busy = true;
             }
+
+            if (bs->drv && bs->drv->bdrv_drain) {
+                busy |= bs->drv->bdrv_drain(bs);
+            }
         }
+
     } while (busy);
 
     /* If requests are still pending there is a bug somewhere */
diff --git a/block/qcow2.c b/block/qcow2.c
index 07f7493..3f169b8 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -483,6 +483,7 @@  static int qcow2_open(BlockDriverState *bs, int flags)
 
     /* Initialise locks */
     qemu_co_mutex_init(&s->lock);
+    qemu_co_rwlock_init(&s->l2meta_flush);
 
     /* Repair image if dirty */
     if (!(flags & BDRV_O_CHECK) && !bs->read_only &&
@@ -745,6 +746,70 @@  fail:
     return ret;
 }
 
+typedef struct ProcessL2Meta {
+    BlockDriverState *bs;
+    QCowL2Meta *m;
+} ProcessL2Meta;
+
+/**
+ * Processes the second part of a request that wrote to newly allocated
+ * clusters (most importantly, doing COW and updating the L2).
+ *
+ * Make sure that s->l2meta_flush is held as a reader when when entering the
+ * coroutine.
+ */
+static void coroutine_fn process_l2meta(void *opaque)
+{
+    ProcessL2Meta *p = opaque;
+    QCowL2Meta *m = p->m;
+    BlockDriverState *bs = p->bs;
+    BDRVQcowState *s = bs->opaque;
+    int ret;
+
+    assert(s->l2meta_flush.reader > 0);
+    qemu_co_mutex_lock(&s->lock);
+
+    ret = qcow2_alloc_cluster_link_l2(bs, m);
+    if (ret < 0) {
+        /* FIXME */
+    }
+
+    qemu_co_mutex_unlock(&s->lock);
+
+    /* Take the request off the list of running requests */
+    if (m->nb_clusters != 0) {
+        QLIST_REMOVE(m, next_in_flight);
+    }
+
+    /* Meanwhile some new dependencies could have accumulated */
+    qemu_co_queue_restart_all(&m->dependent_requests);
+
+    g_free(m);
+
+    qemu_co_rwlock_unlock(&s->l2meta_flush);
+}
+
+static bool qcow2_drain(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    return !QLIST_EMPTY(&s->cluster_allocs);
+}
+
+static inline coroutine_fn void stop_l2meta(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    qemu_co_rwlock_wrlock(&s->l2meta_flush);
+}
+
+static inline coroutine_fn void resume_l2meta(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    qemu_co_rwlock_unlock(&s->l2meta_flush);
+}
+
 static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                            int64_t sector_num,
                            int remaining_sectors,
@@ -824,26 +889,37 @@  static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
             goto fail;
         }
 
-        while (l2meta != NULL) {
-            QCowL2Meta *next;
+        if (l2meta != NULL) {
+            qemu_co_mutex_unlock(&s->lock);
 
-            l2meta->is_written = true;
+            while (l2meta != NULL) {
+                Coroutine *co;
+                QCowL2Meta *next;
 
-            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
-            if (ret < 0) {
-                goto fail;
-            }
+                ProcessL2Meta p = {
+                    .bs = bs,
+                    .m  = l2meta,
+                };
 
-            /* Take the request off the list of running requests */
-            if (l2meta->nb_clusters != 0) {
-                QLIST_REMOVE(l2meta, next_in_flight);
-            }
+                /*
+                 * Must take l2meta_flush already here instead of in the
+                 * coroutine; otherwise it would be possible that a concurrent
+                 * flush would claim that this request is written to the disk
+                 * when the metadata isn't written yet in fact.
+                 */
+                qemu_co_rwlock_rdlock(&s->l2meta_flush);
+                l2meta->is_written = true;
+
+                /* l2meta might already be freed after the coroutine has run */
+                next = l2meta->next;
 
-            qemu_co_queue_restart_all(&l2meta->dependent_requests);
+                co = qemu_coroutine_create(process_l2meta);
+                qemu_coroutine_enter(co, &p);
 
-            next = l2meta->next;
-            g_free(l2meta);
-            l2meta = next;
+                l2meta = next;
+            }
+
+            qemu_co_mutex_lock(&s->lock);
         }
 
         remaining_sectors -= cur_nr_sectors;
@@ -879,6 +955,11 @@  fail:
 static void qcow2_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
+
+    while (qcow2_drain(bs)) {
+        qemu_aio_wait();
+    }
+
     g_free(s->l1_table);
 
     qcow2_cache_flush(bs, s->l2_table_cache);
@@ -1417,10 +1498,12 @@  static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs,
     }
 
     /* Whatever is left can use real zero clusters */
+    stop_l2meta(bs);
     qemu_co_mutex_lock(&s->lock);
     ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS,
         nb_sectors);
     qemu_co_mutex_unlock(&s->lock);
+    resume_l2meta(bs);
 
     return ret;
 }
@@ -1431,10 +1514,13 @@  static coroutine_fn int qcow2_co_discard(BlockDriverState *bs,
     int ret;
     BDRVQcowState *s = bs->opaque;
 
+    stop_l2meta(bs);
     qemu_co_mutex_lock(&s->lock);
     ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS,
         nb_sectors);
     qemu_co_mutex_unlock(&s->lock);
+    resume_l2meta(bs);
+
     return ret;
 }
 
@@ -1560,23 +1646,27 @@  static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs)
     BDRVQcowState *s = bs->opaque;
     int ret;
 
+    stop_l2meta(bs);
     qemu_co_mutex_lock(&s->lock);
+
     ret = qcow2_cache_flush(bs, s->l2_table_cache);
     if (ret < 0) {
-        qemu_co_mutex_unlock(&s->lock);
-        return ret;
+        goto fail;
     }
 
     if (qcow2_need_accurate_refcounts(s)) {
         ret = qcow2_cache_flush(bs, s->refcount_block_cache);
         if (ret < 0) {
-            qemu_co_mutex_unlock(&s->lock);
-            return ret;
+            goto fail;
         }
     }
+
+    ret = 0;
+fail:
     qemu_co_mutex_unlock(&s->lock);
+    resume_l2meta(bs);
 
-    return 0;
+    return ret;
 }
 
 static int64_t qcow2_vm_state_offset(BDRVQcowState *s)
@@ -1703,6 +1793,7 @@  static BlockDriver bdrv_qcow2 = {
     .bdrv_co_readv          = qcow2_co_readv,
     .bdrv_co_writev         = qcow2_co_writev,
     .bdrv_co_flush_to_os    = qcow2_co_flush_to_os,
+    .bdrv_drain             = qcow2_drain,
 
     .bdrv_co_write_zeroes   = qcow2_co_write_zeroes,
     .bdrv_co_discard        = qcow2_co_discard,
diff --git a/block/qcow2.h b/block/qcow2.h
index 4c139d0..46ed112 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -162,6 +162,14 @@  typedef struct BDRVQcowState {
 
     CoMutex lock;
 
+    /*
+     * Only to be aquired while s->lock is not held.
+     *
+     * Readers: All l2meta coroutines that are in flight
+     * Writers: Anyone who requires l2meta to be flushed
+     */
+    CoRwlock l2meta_flush;
+
     uint32_t crypt_method; /* current crypt method, 0 if no key yet */
     uint32_t crypt_method_header;
     AES_KEY aes_encrypt_key;
diff --git a/include/block/block_int.h b/include/block/block_int.h
index eaad53e..6ee7536 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -141,6 +141,9 @@  struct BlockDriver {
      */
     int coroutine_fn (*bdrv_co_flush_to_os)(BlockDriverState *bs);
 
+    /** Returns true if the block device is still busy */
+    bool (*bdrv_drain)(BlockDriverState *bs);
+
     const char *protocol_name;
     int (*bdrv_truncate)(BlockDriverState *bs, int64_t offset);
     int64_t (*bdrv_getlength)(BlockDriverState *bs);