Patchwork [RFC,15/16] qcow2: Cancel COW when overwritten

login
register
mail settings
Submitter Kevin Wolf
Date Sept. 18, 2012, 11:40 a.m.
Message ID <1347968442-8860-16-git-send-email-kwolf@redhat.com>
Download mbox | patch
Permalink /patch/184705/
State New
Headers show

Comments

Kevin Wolf - Sept. 18, 2012, 11:40 a.m.
This is the first part of an optimisation to improve the performance of
sequential cluster allocations.

Typically, write requests aren't aligned to cluster boundaries, so
sequential allocation means that every other request has to wait for the
COW of the previous request to complete. We can do better: Just cancel
the COW instead of waiting for it and then overwriting the same area
with the second write request.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/qcow2-cluster.c |  127 +++++++++++++++++++++++++++++++++++++++++++------
 block/qcow2.c         |   21 ++++++++
 block/qcow2.h         |   47 ++++++++++++++++++
 3 files changed, 180 insertions(+), 15 deletions(-)
Paolo Bonzini - Sept. 18, 2012, 2:44 p.m.
Il 18/09/2012 13:40, Kevin Wolf ha scritto:
> +    qemu_co_mutex_unlock(&s->lock);
> +    qemu_co_rwlock_wrlock(&m->l2_writeback_lock);

Can anybody else take the lock as reader again at this point?  If not, I
wonder if this is more clear if you write it as a CoQueue.

Paolo

> +    has_wr_lock = true;
> +    qemu_co_mutex_lock(&s->lock);
Kevin Wolf - Sept. 18, 2012, 3:02 p.m.
Am 18.09.2012 16:44, schrieb Paolo Bonzini:
> Il 18/09/2012 13:40, Kevin Wolf ha scritto:
>> +    qemu_co_mutex_unlock(&s->lock);
>> +    qemu_co_rwlock_wrlock(&m->l2_writeback_lock);
> 
> Can anybody else take the lock as reader again at this point?  If not, I
> wonder if this is more clear if you write it as a CoQueue.

This isn't "let one writer complete, then wake up n readers" (which
would indeed be represented more naturally as CoQueue), but rather "let
n readers complete, then wake up one writer".

Kevin
Paolo Bonzini - Sept. 18, 2012, 3:05 p.m.
Il 18/09/2012 17:02, Kevin Wolf ha scritto:
>>> >> +    qemu_co_mutex_unlock(&s->lock);
>>> >> +    qemu_co_rwlock_wrlock(&m->l2_writeback_lock);
>> > 
>> > Can anybody else take the lock as reader again at this point?  If not, I
>> > wonder if this is more clear if you write it as a CoQueue.
> This isn't "let one writer complete, then wake up n readers" (which
> would indeed be represented more naturally as CoQueue), but rather "let
> n readers complete, then wake up one writer".

So would it be representable as a list of children (matching the parent
pointer) and then unlocking is

  remove self from children list
  if the children list is empty, wake parent->co

?

Paolo
Paolo Bonzini - Sept. 18, 2012, 3:08 p.m.
Il 18/09/2012 17:05, Paolo Bonzini ha scritto:
>>>>>> >>> >> +    qemu_co_mutex_unlock(&s->lock);
>>>>>> >>> >> +    qemu_co_rwlock_wrlock(&m->l2_writeback_lock);
>>>> >> > 
>>>> >> > Can anybody else take the lock as reader again at this point?  If not, I
>>>> >> > wonder if this is more clear if you write it as a CoQueue.
>> > This isn't "let one writer complete, then wake up n readers" (which
>> > would indeed be represented more naturally as CoQueue), but rather "let
>> > n readers complete, then wake up one writer".
> So would it be representable as a list of children (matching the parent
> pointer) and then unlocking is
> 
>   remove self from children list
>   if the children list is empty, wake parent->co
> 
> ?

Of course this only work if the list of children cannot become non-empty
anymore after the writer restarts.

Paolo

Patch

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 440fdbf..ff22992 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -659,6 +659,8 @@  static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r)
     BDRVQcowState *s = bs->opaque;
     int ret;
 
+    r->final = true;
+
     if (r->nb_sectors == 0) {
         return 0;
     }
@@ -689,6 +691,7 @@  int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     int i, j = 0, l2_index, ret;
     uint64_t *old_cluster, *l2_table;
     uint64_t cluster_offset = m->alloc_offset;
+    bool has_wr_lock = false;
 
     trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters);
     assert(m->nb_clusters > 0);
@@ -707,6 +710,16 @@  int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
     }
 
     /* Update L2 table. */
+    qemu_co_mutex_unlock(&s->lock);
+    qemu_co_rwlock_wrlock(&m->l2_writeback_lock);
+    has_wr_lock = true;
+    qemu_co_mutex_lock(&s->lock);
+
+    if (m->no_l2_update) {
+        ret = 0;
+        goto err;
+    }
+
     if (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS) {
         qcow2_mark_dirty(bs);
     }
@@ -753,6 +766,9 @@  int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m)
 
     ret = 0;
 err:
+    if (has_wr_lock) {
+        qemu_co_rwlock_unlock(&m->l2_writeback_lock);
+    }
     g_free(old_cluster);
     return ret;
  }
@@ -825,7 +841,8 @@  static void kick_l2meta(QCowL2Meta *m)
  * request has completed and updated the L2 table accordingly.
  */
 static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
-    uint64_t bytes, unsigned int *nb_clusters)
+    uint64_t *host_offset, uint64_t bytes, unsigned int *nb_clusters,
+    QCowL2Meta **m)
 {
     BDRVQcowState *s = bs->opaque;
     QCowL2Meta *old_alloc;
@@ -840,22 +857,96 @@  static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset,
         if (end <= old_start || start >= old_end) {
             /* No intersection */
         } else {
+            uint64_t new_bytes;
+            uint64_t old_cow_end;
+
+           /*
+            * Shorten the request to stop at the start of a running
+            * allocation.
+            */
             if (start < old_start) {
-                /* Stop at the start of a running allocation */
-                bytes = old_start - start;
+                new_bytes = old_start - start;
             } else {
-                bytes = 0;
+                new_bytes = 0;
+            }
+
+            if (new_bytes > 0) {
+                bytes = new_bytes;
+                continue;
+            }
+
+            /*
+             * Check if we're just overwriting some COW of the old allocation
+             * that is safe to be replaced by the data of this request.
+             */
+            old_cow_end = old_alloc->offset + old_alloc->cow_end.offset;
+
+            if ((old_end & (s->cluster_size - 1)) == 0
+                && start >= old_cow_end
+                && !old_alloc->cow_end.final)
+            {
+                uint64_t subcluster_offset;
+                int nb_sectors;
+
+                *nb_clusters = 1;
+                subcluster_offset = offset_into_cluster(s, guest_offset);
+                nb_sectors = (subcluster_offset + bytes) >> BDRV_SECTOR_BITS;
+
+                /* Move forward cluster by cluster when overwriting COW areas,
+                 * or we'd have to deal with multiple overlapping requests and
+                 * things would become complicated. */
+                nb_sectors = MIN(s->cluster_sectors, nb_sectors);
+
+                /* Shorten the COW area at the end of the old request */
+                old_alloc->cow_end.nb_sectors =
+                    (guest_offset - old_cow_end) >> BDRV_SECTOR_BITS;
+
+                /* The new data region starts in the same cluster where the COW
+                 * region at the end of the old request starts. */
+                *host_offset = start_of_cluster(s,
+                    old_alloc->alloc_offset + old_alloc->cow_end.offset);
+
+                /* Create new l2meta that doesn't actually allocate new L2
+                 * entries, but describes the new data area so that reads
+                 * access the right cluster */
+                *m = g_malloc0(sizeof(**m));
+                **m = (QCowL2Meta) {
+                    .parent         = old_alloc,
+                    .alloc_offset   = *host_offset,
+                    .offset         = guest_offset & ~(s->cluster_size - 1),
+                    .nb_clusters    = 1,
+                    .nb_available   = nb_sectors,
+                    .no_l2_update   = true,
+
+                    .cow_start = {
+                        .offset     = subcluster_offset,
+                        .nb_sectors = 0,
+                    },
+                    .cow_end = {
+                        .offset     = nb_sectors << BDRV_SECTOR_BITS,
+                        .nb_sectors = s->cluster_sectors - nb_sectors,
+                    },
+                };
+                qemu_co_queue_init(&(*m)->dependent_requests);
+                qemu_co_rwlock_init(&(*m)->l2_writeback_lock);
+                QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
+
+                /* Ensure that the L2 isn't written before COW has completed */
+                assert(!old_alloc->l2_writeback_lock.writer);
+                qemu_co_rwlock_rdlock(&old_alloc->l2_writeback_lock);
+
+                return 0;
             }
 
-            if (bytes == 0) {
-                /* Wait for the dependency to complete. We need to recheck
-                 * the free/allocated clusters when we continue. */
-                qemu_co_mutex_unlock(&s->lock);
+            /* Wait for the dependency to complete. We need to recheck
+             * the free/allocated clusters when we continue. */
+            qemu_co_mutex_unlock(&s->lock);
+            if (old_alloc->sleeping) {
                 kick_l2meta(old_alloc);
-                qemu_co_queue_wait(&old_alloc->dependent_requests);
-                qemu_co_mutex_lock(&s->lock);
-                return -EAGAIN;
             }
+            qemu_co_queue_wait(&old_alloc->dependent_requests);
+            qemu_co_mutex_lock(&s->lock);
+            return -EAGAIN;
         }
     }
 
@@ -942,7 +1033,7 @@  int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
     int l2_index, ret, sectors;
     uint64_t *l2_table;
     unsigned int nb_clusters, keep_clusters;
-    uint64_t cluster_offset;
+    uint64_t cluster_offset = 0;
 
     trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset,
                                       n_start, n_end);
@@ -969,7 +1060,7 @@  again:
      *         contiguous clusters (the situation could have changed while we
      *         were sleeping)
      *
-     *      c) TODO: Request starts in the same cluster as the in-flight
+     *      c) Request starts in the same cluster as the in-flight
      *         allocation ends. Shorten the COW of the in-fight allocation, set
      *         cluster_offset to write to the same cluster and set up the right
      *         synchronisation between the in-flight request and the new one.
@@ -980,13 +1071,17 @@  again:
      * 3. If the request still hasn't completed, allocate new clusters,
      *    considering any cluster_offset of steps 1c or 2.
      */
-    ret = handle_dependencies(bs, offset,
+    ret = handle_dependencies(bs, offset, &cluster_offset,
                               (n_end - n_start) * BDRV_SECTOR_SIZE,
-                              &nb_clusters);
+                              &nb_clusters, m);
     if (ret == -EAGAIN) {
         goto again;
     } else if (ret < 0) {
         return ret;
+    } else if (*m) {
+        keep_clusters = 1;
+        nb_clusters = 0;
+        goto done;
     }
 
     /* Find L2 entry for the first involved cluster */
@@ -1105,11 +1200,13 @@  again:
                 },
             };
             qemu_co_queue_init(&(*m)->dependent_requests);
+            qemu_co_rwlock_init(&(*m)->l2_writeback_lock);
             QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight);
         }
     }
 
     /* Some cleanup work */
+done:
     sectors = (keep_clusters + nb_clusters) << (s->cluster_bits - 9);
     if (sectors > n_end) {
         sectors = n_end;
diff --git a/block/qcow2.c b/block/qcow2.c
index 88a2020..abc3de3 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -779,6 +779,11 @@  again:
          * to be recoverable e.g. after the block device has been grown or the
          * network connection restored. Sleep until the next flush comes and
          * then retry.
+         *
+         * FIXME:
+         * - Need to cancel any other requests to the same cluster (or at least
+         *   prevent the L2 entry from being updated) (Really? Don't they wait
+         *   automatically?)
          */
         s->flush_error = ret;
 
@@ -795,6 +800,22 @@  again:
 
     qemu_co_mutex_unlock(&s->lock);
 
+    if (m->parent) {
+        /*
+         * Allow the parent to write the L2 table entry now that both guest
+         * data write and COW have completed. Don't remove the request from the
+         * queue yet until the L2 is updated, we still need it for overriding
+         * the cluster_offset of requests to the same area.
+         *
+         * The release of the l2_writeback_lock and the queuing must be
+         * performed atomically to avoid a race. This code implements this
+         * correctly because unlocking only schedules a BH to wake the parent,
+         * but doesn't yield yet.
+         */
+        qemu_co_rwlock_unlock(&m->parent->l2_writeback_lock);
+        qemu_co_queue_wait(&m->parent->dependent_requests);
+    }
+
     /* Take the request off the list of running requests */
     if (m->nb_clusters != 0) {
         QLIST_REMOVE(m, next_in_flight);
diff --git a/block/qcow2.h b/block/qcow2.h
index 06ca195..7ed082b 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -216,6 +216,12 @@  typedef struct Qcow2COWRegion {
 
     /** Number of sectors to copy */
     int         nb_sectors;
+
+    /**
+     * Becomes true when offset and nb_sectors cannot be modified any more
+     * because the COW is already being processed.
+     */
+    bool        final;
 } Qcow2COWRegion;
 
 /**
@@ -254,6 +260,12 @@  typedef struct QCowL2Meta
     bool sleeping;
     bool error;
 
+    /**
+     * true indicates that this is not an actual cluster allocation, but reuses
+     * the allocation of an earlier request, so don't update the L2 table.
+     */
+    bool no_l2_update;
+
     /** Coroutine that handles delayed COW and updates L2 entry */
     Coroutine *co;
 
@@ -275,6 +287,31 @@  typedef struct QCowL2Meta
      */
     Qcow2COWRegion cow_end;
 
+    /**
+     * The L2 table must only be updated when all COWs to the same sector have
+     * completed. A request that shortens part of the COW, takes a reader lock
+     * until its data and COW is written.
+     */
+    CoRwlock l2_writeback_lock;
+
+    /**
+     * QCowL2Metas can form a tree. The root of this tree is a request that
+     * actually allocated the host cluster in the image file. Children are
+     * requests that overwrite (part of) a COW region of their parents, so that
+     * the COW of their parent was cancelled.
+     *
+     * Important rules to bear in mind:
+     *
+     * - The parent must not write its L2 entry before all children have
+     *   written both their guest data and COW regions. If it did, the image on
+     *   disk would yet have incomplete COW and data would be corrupted.
+     *
+     * - The chilren must stay in s->cluster_allocs until the parent has
+     *   written the L2 entry, so that read requests correctly return the new
+     *   data even though the L2 table is not updated yet.
+     */
+    struct QCowL2Meta *parent;
+
     QLIST_ENTRY(QCowL2Meta) next_in_flight;
 } QCowL2Meta;
 
@@ -291,6 +328,16 @@  enum {
 
 #define REFT_OFFSET_MASK 0xffffffffffffff00ULL
 
+static inline int start_of_cluster(BDRVQcowState *s, int64_t offset)
+{
+    return offset & ~(s->cluster_size - 1);
+}
+
+static inline int offset_into_cluster(BDRVQcowState *s, int64_t offset)
+{
+    return offset & (s->cluster_size - 1);
+}
+
 static inline int size_to_clusters(BDRVQcowState *s, int64_t size)
 {
     return (size + (s->cluster_size - 1)) >> s->cluster_bits;