diff mbox

[v3,1/2] block: allow live commit of active image

Message ID 1376554447-28638-2-git-send-email-famz@redhat.com
State New
Headers show

Commit Message

Fam Zheng Aug. 15, 2013, 8:14 a.m. UTC
This patch eliminates the limitation that forbids committing the active device.

bdrv_drop_intermediate is reimplemented to take pointers to
(BlockDriverState *), so it can modify the caller's local pointers to
preserve their semantics, while updating the active BDS in-place by
calling bdrv_swap on active and base: we need the data in 'base' as it
is the only image remaining after the commit, but we can't delete
'active' as it is referenced everywhere in the program.

Guest writes to the active device during the commit are tracked by a
dirty bitmap and committed like block-mirror.

Signed-off-by: Fam Zheng <famz@redhat.com>
---
 block.c               | 102 +++++++-----------------
 block/commit.c        | 215 ++++++++++++++++++++++++++++++--------------------
 include/block/block.h |   5 +-
 3 files changed, 164 insertions(+), 158 deletions(-)

Comments

Stefan Hajnoczi Sept. 4, 2013, 12:35 p.m. UTC | #1
On Thu, Aug 15, 2013 at 04:14:06PM +0800, Fam Zheng wrote:
> diff --git a/block/commit.c b/block/commit.c
> index 2227fc2..b5e024b 100644
> --- a/block/commit.c
> +++ b/block/commit.c
> @@ -17,14 +17,13 @@
>  #include "block/blockjob.h"
>  #include "qemu/ratelimit.h"
>  
> -enum {
> -    /*
> -     * Size of data buffer for populating the image file.  This should be large
> -     * enough to process multiple clusters in a single call, so that populating
> -     * contiguous regions of the image is efficient.
> -     */
> -    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
> -};
> +/*
> + * Size of data buffer for populating the image file.  This should be large
> + * enough to process multiple clusters in a single call, so that populating
> + * contiguous regions of the image is efficient.
> + */
> +#define COMMIT_BUFFER_SECTORS 128
> +#define COMMIT_BUFFER_BYTES (COMMIT_BUFFER_SECTORS * BDRV_SECTOR_SIZE)

Changing from 512 KB to 64 KB can affect performance.  8 times as many
iops may be issued to copy data.

Also, the image's cluster size should really be taken into account.
Otherwise additional inefficiency will be suffered when we populate a
128 KB cluster with a COMMIT_BUFFER_SECTORS (64 KB) write only to
overwrite the remaining part in the next loop iteration.

This can be solved by setting dirty bitmap granularity to cluster size
or 64 KB minimum *and* finding continuous runs of dirty bits so larger
I/Os can be performed by the main loop (up to 512 KB in one request).

>  #define SLICE_TIME 100000000ULL /* ns */
>  
> @@ -34,11 +33,27 @@ typedef struct CommitBlockJob {
>      BlockDriverState *active;
>      BlockDriverState *top;
>      BlockDriverState *base;
> +    BlockDriverState *overlay;
>      BlockdevOnError on_error;
>      int base_flags;
>      int orig_overlay_flags;
> +    bool should_complete;
> +    bool ready;

Why introduce the ready state when the active layer is being committed?

There is no documentation update that mentions the job will not complete
by itself if the top image is active.

> +    for (;;) {
> +        int64_t cnt = bdrv_get_dirty_count(s->top);
> +        if (cnt == 0) {
> +            if (!s->overlay && !s->ready) {
> +                s->ready = true;
> +                block_job_ready(&s->common);
>              }
> -            ret = commit_populate(top, base, sector_num, n, buf);
> -            bytes_written += n * BDRV_SECTOR_SIZE;
> +            /* We can complete if user called complete job or the job is
> +             * committing non-active image */
> +            if (s->should_complete || s->overlay) {
> +                break;

This termination condition is not safe:

A write request only marks the dirty bitmap upon completion.  A guest
write request could still be in flight so we get cnt == 0 but we
actually have not copied all data into the base.

Completing safely is a little tricky because bdrv_drain_all() is
synchronous and we don't want to call that.  But waiting for
bs->tracked_requests to be empty is also bad because it's a busy wait.

Ideally we'd get woken up whenever a write request finishes.
Fam Zheng Sept. 18, 2013, 3:32 a.m. UTC | #2
On Wed, 09/04 14:35, Stefan Hajnoczi wrote:
> On Thu, Aug 15, 2013 at 04:14:06PM +0800, Fam Zheng wrote:
> > diff --git a/block/commit.c b/block/commit.c
> > index 2227fc2..b5e024b 100644
> > --- a/block/commit.c
> > +++ b/block/commit.c
> > @@ -17,14 +17,13 @@
> >  #include "block/blockjob.h"
> >  #include "qemu/ratelimit.h"
> >  
> > -enum {
> > -    /*
> > -     * Size of data buffer for populating the image file.  This should be large
> > -     * enough to process multiple clusters in a single call, so that populating
> > -     * contiguous regions of the image is efficient.
> > -     */
> > -    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
> > -};
> > +/*
> > + * Size of data buffer for populating the image file.  This should be large
> > + * enough to process multiple clusters in a single call, so that populating
> > + * contiguous regions of the image is efficient.
> > + */
> > +#define COMMIT_BUFFER_SECTORS 128
> > +#define COMMIT_BUFFER_BYTES (COMMIT_BUFFER_SECTORS * BDRV_SECTOR_SIZE)
> 
> Changing from 512 KB to 64 KB can affect performance.  8 times as many
> iops may be issued to copy data.
> 
> Also, the image's cluster size should really be taken into account.
> Otherwise additional inefficiency will be suffered when we populate a
> 128 KB cluster with a COMMIT_BUFFER_SECTORS (64 KB) write only to
> overwrite the remaining part in the next loop iteration.
> 
> This can be solved by setting dirty bitmap granularity to cluster size
> or 64 KB minimum *and* finding continuous runs of dirty bits so larger
> I/Os can be performed by the main loop (up to 512 KB in one request).
> 
> >  #define SLICE_TIME 100000000ULL /* ns */
> >  
> > @@ -34,11 +33,27 @@ typedef struct CommitBlockJob {
> >      BlockDriverState *active;
> >      BlockDriverState *top;
> >      BlockDriverState *base;
> > +    BlockDriverState *overlay;
> >      BlockdevOnError on_error;
> >      int base_flags;
> >      int orig_overlay_flags;
> > +    bool should_complete;
> > +    bool ready;
> 
> Why introduce the ready state when the active layer is being committed?
> 
> There is no documentation update that mentions the job will not complete
> by itself if the top image is active.
> 
> > +    for (;;) {
> > +        int64_t cnt = bdrv_get_dirty_count(s->top);
> > +        if (cnt == 0) {
> > +            if (!s->overlay && !s->ready) {
> > +                s->ready = true;
> > +                block_job_ready(&s->common);
> >              }
> > -            ret = commit_populate(top, base, sector_num, n, buf);
> > -            bytes_written += n * BDRV_SECTOR_SIZE;
> > +            /* We can complete if user called complete job or the job is
> > +             * committing non-active image */
> > +            if (s->should_complete || s->overlay) {
> > +                break;
> 
> This termination condition is not safe:
> 
> A write request only marks the dirty bitmap upon completion.  A guest
> write request could still be in flight so we get cnt == 0 but we
> actually have not copied all data into the base.

Can we mark the dirty bitmap immediately upon getting guest write request?

Fam

> Completing safely is a little tricky because bdrv_drain_all() is
> synchronous and we don't want to call that.  But waiting for
> bs->tracked_requests to be empty is also bad because it's a busy wait.
> 
> Ideally we'd get woken up whenever a write request finishes.
Stefan Hajnoczi Sept. 18, 2013, 9:36 a.m. UTC | #3
On Wed, Sep 18, 2013 at 11:32:31AM +0800, Fam Zheng wrote:
> On Wed, 09/04 14:35, Stefan Hajnoczi wrote:
> > On Thu, Aug 15, 2013 at 04:14:06PM +0800, Fam Zheng wrote:
> > > diff --git a/block/commit.c b/block/commit.c
> > > index 2227fc2..b5e024b 100644
> > > --- a/block/commit.c
> > > +++ b/block/commit.c
> > > @@ -17,14 +17,13 @@
> > >  #include "block/blockjob.h"
> > >  #include "qemu/ratelimit.h"
> > >  
> > > -enum {
> > > -    /*
> > > -     * Size of data buffer for populating the image file.  This should be large
> > > -     * enough to process multiple clusters in a single call, so that populating
> > > -     * contiguous regions of the image is efficient.
> > > -     */
> > > -    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
> > > -};
> > > +/*
> > > + * Size of data buffer for populating the image file.  This should be large
> > > + * enough to process multiple clusters in a single call, so that populating
> > > + * contiguous regions of the image is efficient.
> > > + */
> > > +#define COMMIT_BUFFER_SECTORS 128
> > > +#define COMMIT_BUFFER_BYTES (COMMIT_BUFFER_SECTORS * BDRV_SECTOR_SIZE)
> > 
> > Changing from 512 KB to 64 KB can affect performance.  8 times as many
> > iops may be issued to copy data.
> > 
> > Also, the image's cluster size should really be taken into account.
> > Otherwise additional inefficiency will be suffered when we populate a
> > 128 KB cluster with a COMMIT_BUFFER_SECTORS (64 KB) write only to
> > overwrite the remaining part in the next loop iteration.
> > 
> > This can be solved by setting dirty bitmap granularity to cluster size
> > or 64 KB minimum *and* finding continuous runs of dirty bits so larger
> > I/Os can be performed by the main loop (up to 512 KB in one request).
> > 
> > >  #define SLICE_TIME 100000000ULL /* ns */
> > >  
> > > @@ -34,11 +33,27 @@ typedef struct CommitBlockJob {
> > >      BlockDriverState *active;
> > >      BlockDriverState *top;
> > >      BlockDriverState *base;
> > > +    BlockDriverState *overlay;
> > >      BlockdevOnError on_error;
> > >      int base_flags;
> > >      int orig_overlay_flags;
> > > +    bool should_complete;
> > > +    bool ready;
> > 
> > Why introduce the ready state when the active layer is being committed?
> > 
> > There is no documentation update that mentions the job will not complete
> > by itself if the top image is active.
> > 
> > > +    for (;;) {
> > > +        int64_t cnt = bdrv_get_dirty_count(s->top);
> > > +        if (cnt == 0) {
> > > +            if (!s->overlay && !s->ready) {
> > > +                s->ready = true;
> > > +                block_job_ready(&s->common);
> > >              }
> > > -            ret = commit_populate(top, base, sector_num, n, buf);
> > > -            bytes_written += n * BDRV_SECTOR_SIZE;
> > > +            /* We can complete if user called complete job or the job is
> > > +             * committing non-active image */
> > > +            if (s->should_complete || s->overlay) {
> > > +                break;
> > 
> > This termination condition is not safe:
> > 
> > A write request only marks the dirty bitmap upon completion.  A guest
> > write request could still be in flight so we get cnt == 0 but we
> > actually have not copied all data into the base.
> 
> Can we mark the dirty bitmap immediately upon getting guest write request?

No, because then the bit might get cleared before the request completes.
The actual request might not hit the disk straight away - it could yield
on an image format coroutine mutex.

We could do the equivalent of drain asynchronously: get a callback when
there are no requests.  There is also a stricter form of this with a
guarantee that the guest cannot make us wait forever: "freeze" the block
device so new requests will yield immediately until the device is
unfrozen.  Now a guest cannot stop us from completing by continuously
submitting requests.

Note that freeze has the disadvantage that the guest might time out if
we don't unfreeze the device soon.

Stefan
Paolo Bonzini Sept. 18, 2013, 11:46 a.m. UTC | #4
Il 04/09/2013 14:35, Stefan Hajnoczi ha scritto:
> Changing from 512 KB to 64 KB can affect performance.  8 times as many
> iops may be issued to copy data.
> 
> Also, the image's cluster size should really be taken into account.
> Otherwise additional inefficiency will be suffered when we populate a
> 128 KB cluster with a COMMIT_BUFFER_SECTORS (64 KB) write only to
> overwrite the remaining part in the next loop iteration.
> 
> This can be solved by setting dirty bitmap granularity to cluster size
> or 64 KB minimum *and* finding continuous runs of dirty bits so larger
> I/Os can be performed by the main loop (up to 512 KB in one request).

At this point, you're basically reinventing the algorithms in
block/mirror.c (even more than this patch is already doing :)).

I wonder if commit-to-active should be internally a separate kind of
job, implemented in block/mirror.c instead of block/commit.c.

Paolo
diff mbox

Patch

diff --git a/block.c b/block.c
index 01b66d8..bc7249e 100644
--- a/block.c
+++ b/block.c
@@ -2024,18 +2024,11 @@  BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
     return overlay;
 }
 
-typedef struct BlkIntermediateStates {
-    BlockDriverState *bs;
-    QSIMPLEQ_ENTRY(BlkIntermediateStates) entry;
-} BlkIntermediateStates;
-
-
 /*
- * Drops images above 'base' up to and including 'top', and sets the image
- * above 'top' to have base as its backing file.
- *
- * Requires that the overlay to 'top' is opened r/w, so that the backing file
- * information in 'bs' can be properly updated.
+ * Drops images above '*base' up to and including '*top', and sets new '*base'
+ * as backing_hd of top_overlay (the image originally has 'top' as backing
+ * file). top_overlay may be NULL if '*top' is active, no such update needed.
+ * Requires that the top_overlay to 'top' is opened r/w.
  *
  * E.g., this will convert the following chain:
  * bottom <- base <- intermediate <- top <- active
@@ -2052,82 +2045,47 @@  typedef struct BlkIntermediateStates {
  *
  * base <- active
  *
- * Error conditions:
- *  if active == top, that is considered an error
+ * It also allows active==top, in which case it converts:
+ *
+ * base <- intermediate <- active (also top)
+ *
+ * to
+ *
+ * base == active == top, i.e. only base remains: *top == *base when return.
  *
  */
-int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
-                           BlockDriverState *base)
+int bdrv_drop_intermediate(BlockDriverState *top_overlay,
+                           BlockDriverState **top,
+                           BlockDriverState **base)
 {
-    BlockDriverState *intermediate;
+    BlockDriverState *pbs;
     BlockDriverState *base_bs = NULL;
-    BlockDriverState *new_top_bs = NULL;
-    BlkIntermediateStates *intermediate_state, *next;
     int ret = -EIO;
 
-    QSIMPLEQ_HEAD(states_to_delete, BlkIntermediateStates) states_to_delete;
-    QSIMPLEQ_INIT(&states_to_delete);
-
-    if (!top->drv || !base->drv) {
+    if (!(*top)->drv || !(*base)->drv) {
         goto exit;
     }
 
-    new_top_bs = bdrv_find_overlay(active, top);
-
-    if (new_top_bs == NULL) {
-        /* we could not find the image above 'top', this is an error */
-        goto exit;
+    for (pbs = (*top)->backing_hd; pbs != *base; pbs = base_bs) {
+        assert(pbs);
+        base_bs = pbs->backing_hd;
+        pbs->backing_hd = NULL;
+        bdrv_delete(pbs);
     }
 
-    /* special case of new_top_bs->backing_hd already pointing to base - nothing
-     * to do, no intermediate images */
-    if (new_top_bs->backing_hd == base) {
-        ret = 0;
-        goto exit;
-    }
+    bdrv_swap(*base, *top);
 
-    intermediate = top;
+    (*base)->backing_hd = NULL;
+    bdrv_delete(*base);
+    *base = *top;
 
-    /* now we will go down through the list, and add each BDS we find
-     * into our deletion queue, until we hit the 'base'
-     */
-    while (intermediate) {
-        intermediate_state = g_malloc0(sizeof(BlkIntermediateStates));
-        intermediate_state->bs = intermediate;
-        QSIMPLEQ_INSERT_TAIL(&states_to_delete, intermediate_state, entry);
-
-        if (intermediate->backing_hd == base) {
-            base_bs = intermediate->backing_hd;
-            break;
-        }
-        intermediate = intermediate->backing_hd;
-    }
-    if (base_bs == NULL) {
-        /* something went wrong, we did not end at the base. safely
-         * unravel everything, and exit with error */
-        goto exit;
-    }
-
-    /* success - we can delete the intermediate states, and link top->base */
-    ret = bdrv_change_backing_file(new_top_bs, base_bs->filename,
-                                   base_bs->drv ? base_bs->drv->format_name : "");
-    if (ret) {
-        goto exit;
-    }
-    new_top_bs->backing_hd = base_bs;
-
-
-    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
-        /* so that bdrv_close() does not recursively close the chain */
-        intermediate_state->bs->backing_hd = NULL;
-        bdrv_delete(intermediate_state->bs);
+    /* overlay exists when active != top, need to change backing file for it */
+    if (top_overlay) {
+        ret = bdrv_change_backing_file(top_overlay, (*base)->filename,
+                                       (*base)->drv ?
+                                            (*base)->drv->format_name : "");
     }
-    ret = 0;
-
 exit:
-    QSIMPLEQ_FOREACH_SAFE(intermediate_state, &states_to_delete, entry, next) {
-        g_free(intermediate_state);
-    }
     return ret;
 }
 
diff --git a/block/commit.c b/block/commit.c
index 2227fc2..b5e024b 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -17,14 +17,13 @@ 
 #include "block/blockjob.h"
 #include "qemu/ratelimit.h"
 
-enum {
-    /*
-     * Size of data buffer for populating the image file.  This should be large
-     * enough to process multiple clusters in a single call, so that populating
-     * contiguous regions of the image is efficient.
-     */
-    COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */
-};
+/*
+ * Size of data buffer for populating the image file.  This should be large
+ * enough to process multiple clusters in a single call, so that populating
+ * contiguous regions of the image is efficient.
+ */
+#define COMMIT_BUFFER_SECTORS 128
+#define COMMIT_BUFFER_BYTES (COMMIT_BUFFER_SECTORS * BDRV_SECTOR_SIZE)
 
 #define SLICE_TIME 100000000ULL /* ns */
 
@@ -34,11 +33,27 @@  typedef struct CommitBlockJob {
     BlockDriverState *active;
     BlockDriverState *top;
     BlockDriverState *base;
+    BlockDriverState *overlay;
     BlockdevOnError on_error;
     int base_flags;
     int orig_overlay_flags;
+    bool should_complete;
+    bool ready;
 } CommitBlockJob;
 
+static void commit_complete(BlockJob *job, Error **errp)
+{
+    CommitBlockJob *s = container_of(job, CommitBlockJob, common);
+
+    if (!s->ready) {
+        error_set(errp, QERR_BLOCK_JOB_NOT_READY, job->bs->device_name);
+        return;
+    }
+
+    s->should_complete = true;
+    block_job_resume(job);
+}
+
 static int coroutine_fn commit_populate(BlockDriverState *bs,
                                         BlockDriverState *base,
                                         int64_t sector_num, int nb_sectors,
@@ -65,100 +80,136 @@  static void coroutine_fn commit_run(void *opaque)
     BlockDriverState *active = s->active;
     BlockDriverState *top = s->top;
     BlockDriverState *base = s->base;
-    BlockDriverState *overlay_bs;
     int64_t sector_num, end;
     int ret = 0;
     int n = 0;
     void *buf;
-    int bytes_written = 0;
     int64_t base_len;
+    int64_t next_dirty;
+    HBitmapIter hbi;
+    uint64_t delay_ns = 0;
 
+    buf = qemu_blockalign(top, COMMIT_BUFFER_BYTES);
     ret = s->common.len = bdrv_getlength(top);
 
-
     if (s->common.len < 0) {
-        goto exit_restore_reopen;
+        goto exit;
     }
 
     ret = base_len = bdrv_getlength(base);
     if (base_len < 0) {
-        goto exit_restore_reopen;
+        goto exit;
     }
 
     if (base_len < s->common.len) {
         ret = bdrv_truncate(base, s->common.len);
         if (ret) {
-            goto exit_restore_reopen;
+            goto exit;
         }
     }
 
     end = s->common.len >> BDRV_SECTOR_BITS;
-    buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE);
 
     for (sector_num = 0; sector_num < end; sector_num += n) {
-        uint64_t delay_ns = 0;
-        bool copy;
-
-wait:
-        /* Note that even when no rate limit is applied we need to yield
-         * with no pending I/O here so that bdrv_drain_all() returns.
-         */
-        block_job_sleep_ns(&s->common, rt_clock, delay_ns);
         if (block_job_is_cancelled(&s->common)) {
-            break;
+            goto exit;
         }
         /* Copy if allocated above the base */
         ret = bdrv_co_is_allocated_above(top, base, sector_num,
-                                         COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE,
+                                         COMMIT_BUFFER_SECTORS,
                                          &n);
-        copy = (ret == 1);
-        trace_commit_one_iteration(s, sector_num, n, ret);
-        if (copy) {
-            if (s->common.speed) {
-                delay_ns = ratelimit_calculate_delay(&s->limit, n);
-                if (delay_ns > 0) {
-                    goto wait;
-                }
+        if (ret < 0) {
+            goto exit;
+        } else if (ret > 0) {
+            bdrv_set_dirty(top, sector_num, n);
+        }
+
+        /* TODO: do we need to sleep longer to speed down this phase */
+        block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+    }
+
+    for (;;) {
+        int64_t cnt = bdrv_get_dirty_count(s->top);
+        if (cnt == 0) {
+            if (!s->overlay && !s->ready) {
+                s->ready = true;
+                block_job_ready(&s->common);
             }
-            ret = commit_populate(top, base, sector_num, n, buf);
-            bytes_written += n * BDRV_SECTOR_SIZE;
+            /* We can complete if user called complete job or the job is
+             * committing non-active image */
+            if (s->should_complete || s->overlay) {
+                break;
+            }
+            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
         }
-        if (ret < 0) {
-            if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
-                s->on_error == BLOCKDEV_ON_ERROR_REPORT||
-                (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) {
-                goto exit_free_buf;
-            } else {
-                n = 0;
-                continue;
+
+        if (block_job_is_cancelled(&s->common)) {
+            goto exit;
+        }
+
+        bdrv_dirty_iter_init(s->top, &hbi);
+        for (next_dirty = hbitmap_iter_next(&hbi);
+                next_dirty >= 0;
+                next_dirty = hbitmap_iter_next(&hbi)) {
+            sector_num = next_dirty;
+            if (block_job_is_cancelled(&s->common)) {
+                goto exit;
+            }
+            delay_ns = ratelimit_calculate_delay(&s->limit,
+                                                 COMMIT_BUFFER_SECTORS);
+            /* Note that even when no rate limit is applied we need to yield
+             * with no pending I/O here so that bdrv_drain_all() returns.
+             */
+            block_job_sleep_ns(&s->common, rt_clock, delay_ns);
+            trace_commit_one_iteration(s, sector_num,
+                                       COMMIT_BUFFER_SECTORS, ret);
+            bdrv_reset_dirty(top, sector_num, COMMIT_BUFFER_SECTORS);
+            ret = commit_populate(top, base, sector_num,
+                                  COMMIT_BUFFER_SECTORS, buf);
+            if (ret < 0) {
+                if (s->on_error == BLOCKDEV_ON_ERROR_STOP ||
+                    s->on_error == BLOCKDEV_ON_ERROR_REPORT ||
+                    (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC &&
+                         ret == -ENOSPC)) {
+                    goto exit;
+                } else {
+                    continue;
+                }
             }
+            /* Publish progress */
+            s->common.offset += COMMIT_BUFFER_BYTES;
         }
-        /* Publish progress */
-        s->common.offset += n * BDRV_SECTOR_SIZE;
     }
+    s->common.offset = end;
 
-    ret = 0;
-
-    if (!block_job_is_cancelled(&s->common) && sector_num == end) {
-        /* success */
-        ret = bdrv_drop_intermediate(active, top, base);
+    bdrv_co_flush(base);
+    if (!block_job_is_cancelled(&s->common)) {
+        /* Drop intermediate: [top, base) */
+        ret = bdrv_drop_intermediate(s->overlay, &top, &base);
+        s->common.offset = s->common.len;
     }
 
-exit_free_buf:
-    qemu_vfree(buf);
+    ret = 0;
 
-exit_restore_reopen:
-    /* restore base open flags here if appropriate (e.g., change the base back
-     * to r/o). These reopens do not need to be atomic, since we won't abort
-     * even on failure here */
-    if (s->base_flags != bdrv_get_flags(base)) {
-        bdrv_reopen(base, s->base_flags, NULL);
-    }
-    overlay_bs = bdrv_find_overlay(active, top);
-    if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) {
-        bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL);
+exit:
+    bdrv_set_dirty_tracking(active, 0);
+
+    if (s->overlay) {
+        /* We have overlay when committing non-active image, restore base open
+         * flags too if appropriate (e.g., change the base back to r/o).
+         * Otherwise, as we commit the active image, base is already active
+         * now, no need to reopen to old flags.  These reopens do not need to
+         * be atomic, since we won't abort even on failure here */
+        if (s->base_flags != bdrv_get_flags(base)) {
+            bdrv_reopen(base, s->base_flags, NULL);
+        }
+
+        if (s->orig_overlay_flags != bdrv_get_flags(s->overlay)) {
+            bdrv_reopen(s->overlay, s->orig_overlay_flags, NULL);
+        }
     }
 
+    qemu_vfree(buf);
     block_job_completed(&s->common, ret);
 }
 
@@ -177,6 +228,7 @@  static const BlockJobType commit_job_type = {
     .instance_size = sizeof(CommitBlockJob),
     .job_type      = "commit",
     .set_speed     = commit_set_speed,
+    .complete      = commit_complete,
 };
 
 void commit_start(BlockDriverState *bs, BlockDriverState *base,
@@ -198,37 +250,27 @@  void commit_start(BlockDriverState *bs, BlockDriverState *base,
         return;
     }
 
-    /* Once we support top == active layer, remove this check */
-    if (top == bs) {
-        error_setg(errp,
-                   "Top image as the active layer is currently unsupported");
-        return;
-    }
-
     if (top == base) {
         error_setg(errp, "Invalid files for merge: top and base are the same");
         return;
     }
 
-    overlay_bs = bdrv_find_overlay(bs, top);
-
-    if (overlay_bs == NULL) {
-        error_setg(errp, "Could not find overlay image for %s:", top->filename);
-        return;
-    }
-
-    orig_base_flags    = bdrv_get_flags(base);
-    orig_overlay_flags = bdrv_get_flags(overlay_bs);
-
     /* convert base & overlay_bs to r/w, if necessary */
+    orig_base_flags    = bdrv_get_flags(base);
     if (!(orig_base_flags & BDRV_O_RDWR)) {
         reopen_queue = bdrv_reopen_queue(reopen_queue, base,
                                          orig_base_flags | BDRV_O_RDWR);
     }
-    if (!(orig_overlay_flags & BDRV_O_RDWR)) {
-        reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
-                                         orig_overlay_flags | BDRV_O_RDWR);
+
+    overlay_bs = bdrv_find_overlay(bs, top);
+    if (overlay_bs) {
+        orig_overlay_flags = bdrv_get_flags(overlay_bs);
+        if (!(orig_overlay_flags & BDRV_O_RDWR)) {
+            reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs,
+                    orig_overlay_flags | BDRV_O_RDWR);
+        }
     }
+
     if (reopen_queue) {
         bdrv_reopen_multiple(reopen_queue, &local_err);
         if (local_err != NULL) {
@@ -237,7 +279,6 @@  void commit_start(BlockDriverState *bs, BlockDriverState *base,
         }
     }
 
-
     s = block_job_create(&commit_job_type, bs, speed, cb, opaque, errp);
     if (!s) {
         return;
@@ -246,13 +287,19 @@  void commit_start(BlockDriverState *bs, BlockDriverState *base,
     s->base   = base;
     s->top    = top;
     s->active = bs;
+    s->overlay = overlay_bs;
 
     s->base_flags          = orig_base_flags;
-    s->orig_overlay_flags  = orig_overlay_flags;
+    if (overlay_bs) {
+        s->orig_overlay_flags  = orig_overlay_flags;
+    }
 
     s->on_error = on_error;
     s->common.co = qemu_coroutine_create(commit_run);
 
     trace_commit_start(bs, base, top, s, s->common.co, opaque);
+
+    bdrv_set_dirty_tracking(top, COMMIT_BUFFER_BYTES);
+
     qemu_coroutine_enter(s->common.co, s);
 }
diff --git a/include/block/block.h b/include/block/block.h
index 742fce5..a5d05ab 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -199,8 +199,9 @@  int bdrv_commit_all(void);
 int bdrv_change_backing_file(BlockDriverState *bs,
     const char *backing_file, const char *backing_fmt);
 void bdrv_register(BlockDriver *bdrv);
-int bdrv_drop_intermediate(BlockDriverState *active, BlockDriverState *top,
-                           BlockDriverState *base);
+int bdrv_drop_intermediate(BlockDriverState *top_overlay,
+                           BlockDriverState **top,
+                           BlockDriverState **base);
 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
                                     BlockDriverState *bs);
 BlockDriverState *bdrv_find_base(BlockDriverState *bs);