diff mbox

[v4,2/8] block: add basic backup support to block driver

Message ID 1368693379-8434-3-git-send-email-stefanha@redhat.com
State New
Headers show

Commit Message

Stefan Hajnoczi May 16, 2013, 8:36 a.m. UTC
From: Dietmar Maurer <dietmar@proxmox.com>

backup_start() creates a block job that copies a point-in-time snapshot
of a block device to a target block device.

We call backup_do_cow() for each write during backup. That function
reads the original data from the block device before it gets
overwritten.  The data is then written to the target device.

Currently backup cluster size is hardcoded to 65536 bytes.

[I made a number of changes to Dietmar's original patch and folded them
in to make code review easy.  Here is the full list:

 * Drop BackupDumpFunc interface in favor of a target block device
 * Detect zero clusters with buffer_is_zero()
 * Don't write zero clusters to the target
 * Use 0 delay instead of 1us, like other block jobs
 * Unify creation/start functions into backup_start()
 * Simplify cleanup, free bitmap in backup_run() instead of cb function
 * Use HBitmap to avoid duplicating bitmap code
 * Use bdrv_getlength() instead of accessing ->total_sectors directly
 * Delete the backup.h header file, it is no longer necessary
 * Move ./backup.c to block/backup.c
 * Remove #ifdefed out code
 * Coding style and whitespace cleanups
 * Use bdrv_add_before_write_notifier() instead of blockjob-specific hooks
 * Keep our own in-flight CowRequest list instead of using block.c
   tracked requests.  This means a little code duplication but is much
   simpler than trying to share the tracked requests list and use the
   backup block size.

-- stefanha]

Signed-off-by: Dietmar Maurer <dietmar@proxmox.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/Makefile.objs       |   1 +
 block/backup.c            | 283 ++++++++++++++++++++++++++++++++++++++++++++++
 include/block/block_int.h |  16 +++
 3 files changed, 300 insertions(+)
 create mode 100644 block/backup.c

Comments

Kevin Wolf May 22, 2013, 9:38 a.m. UTC | #1
Am 16.05.2013 um 10:36 hat Stefan Hajnoczi geschrieben:
> From: Dietmar Maurer <dietmar@proxmox.com>
> 
> backup_start() creates a block job that copies a point-in-time snapshot
> of a block device to a target block device.
> 
> We call backup_do_cow() for each write during backup. That function
> reads the original data from the block device before it gets
> overwritten.  The data is then written to the target device.
> 
> Currently backup cluster size is hardcoded to 65536 bytes.
> 
> [I made a number of changes to Dietmar's original patch and folded them
> in to make code review easy.  Here is the full list:
> 
>  * Drop BackupDumpFunc interface in favor of a target block device
>  * Detect zero clusters with buffer_is_zero()
>  * Don't write zero clusters to the target
>  * Use 0 delay instead of 1us, like other block jobs
>  * Unify creation/start functions into backup_start()
>  * Simplify cleanup, free bitmap in backup_run() instead of cb function
>  * Use HBitmap to avoid duplicating bitmap code
>  * Use bdrv_getlength() instead of accessing ->total_sectors directly
>  * Delete the backup.h header file, it is no longer necessary
>  * Move ./backup.c to block/backup.c
>  * Remove #ifdefed out code
>  * Coding style and whitespace cleanups
>  * Use bdrv_add_before_write_notifier() instead of blockjob-specific hooks
>  * Keep our own in-flight CowRequest list instead of using block.c
>    tracked requests.  This means a little code duplication but is much
>    simpler than trying to share the tracked requests list and use the
>    backup block size.
> 
> -- stefanha]
> 
> Signed-off-by: Dietmar Maurer <dietmar@proxmox.com>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  block/Makefile.objs       |   1 +
>  block/backup.c            | 283 ++++++++++++++++++++++++++++++++++++++++++++++
>  include/block/block_int.h |  16 +++
>  3 files changed, 300 insertions(+)
>  create mode 100644 block/backup.c
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 5f0358a..88bd101 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -20,5 +20,6 @@ endif
>  common-obj-y += stream.o
>  common-obj-y += commit.o
>  common-obj-y += mirror.o
> +common-obj-y += backup.o
>  
>  $(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS)
> diff --git a/block/backup.c b/block/backup.c
> new file mode 100644
> index 0000000..5438e26
> --- /dev/null
> +++ b/block/backup.c
> @@ -0,0 +1,283 @@
> +/*
> + * QEMU backup
> + *
> + * Copyright (C) 2013 Proxmox Server Solutions
> + *
> + * Authors:
> + *  Dietmar Maurer (dietmar@proxmox.com)
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include <stdio.h>
> +#include <errno.h>
> +#include <unistd.h>
> +
> +#include "block/block.h"
> +#include "block/block_int.h"
> +#include "block/blockjob.h"
> +#include "qemu/ratelimit.h"
> +
> +#define DEBUG_BACKUP 0
> +
> +#define DPRINTF(fmt, ...) \
> +    do { \
> +        if (DEBUG_BACKUP) { \
> +            fprintf(stderr, "backup: " fmt, ## __VA_ARGS__); \
> +        } \
> +    } while (0)
> +
> +#define BACKUP_CLUSTER_BITS 16
> +#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
> +#define BACKUP_BLOCKS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)

BACKUP_SECTORS_PER_CLUSTER for more consistent naming?

> +
> +#define SLICE_TIME 100000000ULL /* ns */
> +
> +typedef struct CowRequest {
> +    int64_t start;
> +    int64_t end;
> +    QLIST_ENTRY(CowRequest) list;
> +    CoQueue wait_queue; /* coroutines blocked on this request */
> +} CowRequest;
> +
> +typedef struct BackupBlockJob {
> +    BlockJob common;
> +    BlockDriverState *target;
> +    RateLimit limit;
> +    CoRwlock flush_rwlock;
> +    uint64_t sectors_read;
> +    HBitmap *bitmap;
> +    QLIST_HEAD(, CowRequest) inflight_reqs;
> +} BackupBlockJob;
> +
> +/* See if in-flight requests overlap and wait for them to complete */
> +static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
> +                                                       int64_t start,
> +                                                       int64_t end)
> +{
> +    CowRequest *req;
> +    bool retry;
> +
> +    do {
> +        retry = false;
> +        QLIST_FOREACH(req, &job->inflight_reqs, list) {
> +            if (end > req->start && start < req->end) {
> +                qemu_co_queue_wait(&req->wait_queue);
> +                retry = true;
> +                break;
> +            }
> +        }
> +    } while (retry);
> +}
> +
> +/* Keep track of an in-flight request */
> +static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
> +                                     int64_t start, int64_t end)
> +{
> +    req->start = start;
> +    req->end = end;
> +    qemu_co_queue_init(&req->wait_queue);
> +    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
> +}
> +
> +/* Forget about a completed request */
> +static void cow_request_end(CowRequest *req)
> +{
> +    QLIST_REMOVE(req, list);
> +    qemu_co_queue_restart_all(&req->wait_queue);
> +}
> +
> +static int coroutine_fn backup_do_cow(BlockDriverState *bs,
> +                                      int64_t sector_num, int nb_sectors)
> +{
> +    BackupBlockJob *job = (BackupBlockJob *)bs->job;
> +    CowRequest cow_request;
> +    struct iovec iov;
> +    QEMUIOVector bounce_qiov;
> +    void *bounce_buffer = NULL;
> +    int ret = 0;
> +    int64_t start, end;
> +
> +    qemu_co_rwlock_rdlock(&job->flush_rwlock);
> +
> +    start = sector_num / BACKUP_BLOCKS_PER_CLUSTER;
> +    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_BLOCKS_PER_CLUSTER);
> +
> +    DPRINTF("brdv_co_backup_cow enter %s C%" PRId64 " %" PRId64 " %d\n",
> +            bdrv_get_device_name(bs), start, sector_num, nb_sectors);
> +
> +    wait_for_overlapping_requests(job, start, end);
> +    cow_request_begin(&cow_request, job, start, end);
> +
> +    for (; start < end; start++) {
> +        if (hbitmap_get(job->bitmap, start)) {
> +            DPRINTF("brdv_co_backup_cow skip C%" PRId64 "\n", start);
> +            continue; /* already copied */
> +        }
> +
> +        /* immediately set bitmap (avoid coroutine race) */
> +        hbitmap_set(job->bitmap, start, 1);

Hm, what kind of race? Doesn't wait_for_overlapping_requests() already
serialise everything so that it doesn't matter where exactly the bit is
set, as long as it's between cow_request_begin/end?

> +
> +        DPRINTF("brdv_co_backup_cow C%" PRId64 "\n", start);
> +
> +        if (!bounce_buffer) {
> +            iov.iov_len = BACKUP_CLUSTER_SIZE;
> +            iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
> +            qemu_iovec_init_external(&bounce_qiov, &iov, 1);
> +        }
> +
> +        ret = bdrv_co_readv(bs, start * BACKUP_BLOCKS_PER_CLUSTER,
> +                            BACKUP_BLOCKS_PER_CLUSTER,
> +                            &bounce_qiov);
> +        if (ret < 0) {
> +            DPRINTF("brdv_co_backup_cow bdrv_read C%" PRId64 " failed\n",
> +                    start);
> +            goto out;
> +        }
> +
> +        job->sectors_read += BACKUP_BLOCKS_PER_CLUSTER;
> +
> +        if (!buffer_is_zero(bounce_buffer, BACKUP_CLUSTER_SIZE)) {
> +            ret = bdrv_co_writev(job->target, start * BACKUP_BLOCKS_PER_CLUSTER,
> +                                 BACKUP_BLOCKS_PER_CLUSTER,
> +                                 &bounce_qiov);
> +            if (ret < 0) {
> +                DPRINTF("brdv_co_backup_cow dump_cluster_cb C%" PRId64
> +                        " failed\n", start);
> +                goto out;
> +            }
> +        }

This series seems to only allow standalone target images without a
backing file, so this is okay. But we've been talking about use cases
like using the original image as a backing file and exposing the backup
via NBD in order to provide consistent inspection. Then we can't simply
ignore all-zero clusters.

Maybe we should use bdrv_co_write_zeroes() from the beginning?

> +
> +        DPRINTF("brdv_co_backup_cow done C%" PRId64 "\n", start);
> +    }
> +
> +out:
> +    if (bounce_buffer) {
> +        qemu_vfree(bounce_buffer);
> +    }
> +
> +    cow_request_end(&cow_request);
> +
> +    qemu_co_rwlock_unlock(&job->flush_rwlock);
> +
> +    return ret;
> +}
> +
> +static void coroutine_fn backup_before_write_notify(Notifier *notifier,
> +                                                    void *opaque)
> +{
> +    BdrvTrackedRequest *req = opaque;
> +    backup_do_cow(req->bs, req->sector_num, req->nb_sectors);
> +}

I don't think you can ignore errors here. Not sure if we can stop the VM
and resume later or something like that, but if we can't, the backup
will be invalid and we must fail the job.

Kevin
Paolo Bonzini May 22, 2013, 9:54 a.m. UTC | #2
Il 22/05/2013 11:38, Kevin Wolf ha scritto:
>> +
>> +        DPRINTF("brdv_co_backup_cow done C%" PRId64 "\n", start);
>> +    }
>> +
>> +out:
>> +    if (bounce_buffer) {
>> +        qemu_vfree(bounce_buffer);
>> +    }
>> +
>> +    cow_request_end(&cow_request);
>> +
>> +    qemu_co_rwlock_unlock(&job->flush_rwlock);
>> +
>> +    return ret;
>> +}
>> +
>> +static void coroutine_fn backup_before_write_notify(Notifier *notifier,
>> +                                                    void *opaque)
>> +{
>> +    BdrvTrackedRequest *req = opaque;
>> +    backup_do_cow(req->bs, req->sector_num, req->nb_sectors);
>> +}
> 
> I don't think you can ignore errors here. Not sure if we can stop the VM
> and resume later or something like that, but if we can't, the backup
> will be invalid and we must fail the job.

Yes, there is rerror/werror machinery for jobs that this patch is not using.

Paolo
Kevin Wolf May 22, 2013, 9:56 a.m. UTC | #3
Am 22.05.2013 um 11:54 hat Paolo Bonzini geschrieben:
> Il 22/05/2013 11:38, Kevin Wolf ha scritto:
> >> +
> >> +        DPRINTF("brdv_co_backup_cow done C%" PRId64 "\n", start);
> >> +    }
> >> +
> >> +out:
> >> +    if (bounce_buffer) {
> >> +        qemu_vfree(bounce_buffer);
> >> +    }
> >> +
> >> +    cow_request_end(&cow_request);
> >> +
> >> +    qemu_co_rwlock_unlock(&job->flush_rwlock);
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +static void coroutine_fn backup_before_write_notify(Notifier *notifier,
> >> +                                                    void *opaque)
> >> +{
> >> +    BdrvTrackedRequest *req = opaque;
> >> +    backup_do_cow(req->bs, req->sector_num, req->nb_sectors);
> >> +}
> > 
> > I don't think you can ignore errors here. Not sure if we can stop the VM
> > and resume later or something like that, but if we can't, the backup
> > will be invalid and we must fail the job.
> 
> Yes, there is rerror/werror machinery for jobs that this patch is not using.

This is not enough here. The guest write can't continue before the old
content is saved to the backup image.

Kevin
Stefan Hajnoczi May 22, 2013, 1:58 p.m. UTC | #4
On Wed, May 22, 2013 at 11:56:45AM +0200, Kevin Wolf wrote:
> Am 22.05.2013 um 11:54 hat Paolo Bonzini geschrieben:
> > Il 22/05/2013 11:38, Kevin Wolf ha scritto:
> > >> +
> > >> +        DPRINTF("brdv_co_backup_cow done C%" PRId64 "\n", start);
> > >> +    }
> > >> +
> > >> +out:
> > >> +    if (bounce_buffer) {
> > >> +        qemu_vfree(bounce_buffer);
> > >> +    }
> > >> +
> > >> +    cow_request_end(&cow_request);
> > >> +
> > >> +    qemu_co_rwlock_unlock(&job->flush_rwlock);
> > >> +
> > >> +    return ret;
> > >> +}
> > >> +
> > >> +static void coroutine_fn backup_before_write_notify(Notifier *notifier,
> > >> +                                                    void *opaque)
> > >> +{
> > >> +    BdrvTrackedRequest *req = opaque;
> > >> +    backup_do_cow(req->bs, req->sector_num, req->nb_sectors);
> > >> +}
> > > 
> > > I don't think you can ignore errors here. Not sure if we can stop the VM
> > > and resume later or something like that, but if we can't, the backup
> > > will be invalid and we must fail the job.
> > 
> > Yes, there is rerror/werror machinery for jobs that this patch is not using.
> 
> This is not enough here. The guest write can't continue before the old
> content is saved to the backup image.

Are you saying just the vm_stop(RUN_STATE_IO_ERROR) call is missing?

I think the reason it's not there is because there's an assumption that
block jobs are in the background and do not affect the guest.  The guest
continues running while the block job is in an error state.

But perhaps we can make the vm_stop() call optional so that drive-backup
can use it.

Stefan
Kevin Wolf May 22, 2013, 2:08 p.m. UTC | #5
Am 22.05.2013 um 15:58 hat Stefan Hajnoczi geschrieben:
> On Wed, May 22, 2013 at 11:56:45AM +0200, Kevin Wolf wrote:
> > Am 22.05.2013 um 11:54 hat Paolo Bonzini geschrieben:
> > > Il 22/05/2013 11:38, Kevin Wolf ha scritto:
> > > >> +
> > > >> +        DPRINTF("brdv_co_backup_cow done C%" PRId64 "\n", start);
> > > >> +    }
> > > >> +
> > > >> +out:
> > > >> +    if (bounce_buffer) {
> > > >> +        qemu_vfree(bounce_buffer);
> > > >> +    }
> > > >> +
> > > >> +    cow_request_end(&cow_request);
> > > >> +
> > > >> +    qemu_co_rwlock_unlock(&job->flush_rwlock);
> > > >> +
> > > >> +    return ret;
> > > >> +}
> > > >> +
> > > >> +static void coroutine_fn backup_before_write_notify(Notifier *notifier,
> > > >> +                                                    void *opaque)
> > > >> +{
> > > >> +    BdrvTrackedRequest *req = opaque;
> > > >> +    backup_do_cow(req->bs, req->sector_num, req->nb_sectors);
> > > >> +}
> > > > 
> > > > I don't think you can ignore errors here. Not sure if we can stop the VM
> > > > and resume later or something like that, but if we can't, the backup
> > > > will be invalid and we must fail the job.
> > > 
> > > Yes, there is rerror/werror machinery for jobs that this patch is not using.
> > 
> > This is not enough here. The guest write can't continue before the old
> > content is saved to the backup image.
> 
> Are you saying just the vm_stop(RUN_STATE_IO_ERROR) call is missing?

No. Stopping the VM and on 'cont' assuming that the request was
successful (which it wasn't) would be wrong as well. You need the full
failed request handling, with restarting the request from the device
emulation and everything.

> I think the reason it's not there is because there's an assumption that
> block jobs are in the background and do not affect the guest.  The guest
> continues running while the block job is in an error state.

But this is _not_ a background job. This is the write request hook, it
is active on the guest write path. If you can't backup a given sector,
you can't overwrite it and must either fail the write request or the
backup job. Failing the write request is probably nicer because you get
the usual werror handling then, but it means that your assumption
doesn't hold true.

(See, this is one of the reasons why I was for a BlockDriver instead of
notifiers into block jobs. Things would be clearer that way because the
control flow would be explicit in the filter code.)

> But perhaps we can make the vm_stop() call optional so that drive-backup
> can use it.

Not sure what you're trying to do here.

Kevin
diff mbox

Patch

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 5f0358a..88bd101 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -20,5 +20,6 @@  endif
 common-obj-y += stream.o
 common-obj-y += commit.o
 common-obj-y += mirror.o
+common-obj-y += backup.o
 
 $(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS)
diff --git a/block/backup.c b/block/backup.c
new file mode 100644
index 0000000..5438e26
--- /dev/null
+++ b/block/backup.c
@@ -0,0 +1,283 @@ 
+/*
+ * QEMU backup
+ *
+ * Copyright (C) 2013 Proxmox Server Solutions
+ *
+ * Authors:
+ *  Dietmar Maurer (dietmar@proxmox.com)
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "block/block.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+#include "qemu/ratelimit.h"
+
+#define DEBUG_BACKUP 0
+
+#define DPRINTF(fmt, ...) \
+    do { \
+        if (DEBUG_BACKUP) { \
+            fprintf(stderr, "backup: " fmt, ## __VA_ARGS__); \
+        } \
+    } while (0)
+
+#define BACKUP_CLUSTER_BITS 16
+#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
+#define BACKUP_BLOCKS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct CowRequest {
+    int64_t start;
+    int64_t end;
+    QLIST_ENTRY(CowRequest) list;
+    CoQueue wait_queue; /* coroutines blocked on this request */
+} CowRequest;
+
+typedef struct BackupBlockJob {
+    BlockJob common;
+    BlockDriverState *target;
+    RateLimit limit;
+    CoRwlock flush_rwlock;
+    uint64_t sectors_read;
+    HBitmap *bitmap;
+    QLIST_HEAD(, CowRequest) inflight_reqs;
+} BackupBlockJob;
+
+/* See if in-flight requests overlap and wait for them to complete */
+static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
+                                                       int64_t start,
+                                                       int64_t end)
+{
+    CowRequest *req;
+    bool retry;
+
+    do {
+        retry = false;
+        QLIST_FOREACH(req, &job->inflight_reqs, list) {
+            if (end > req->start && start < req->end) {
+                qemu_co_queue_wait(&req->wait_queue);
+                retry = true;
+                break;
+            }
+        }
+    } while (retry);
+}
+
+/* Keep track of an in-flight request */
+static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
+                                     int64_t start, int64_t end)
+{
+    req->start = start;
+    req->end = end;
+    qemu_co_queue_init(&req->wait_queue);
+    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
+}
+
+/* Forget about a completed request */
+static void cow_request_end(CowRequest *req)
+{
+    QLIST_REMOVE(req, list);
+    qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+static int coroutine_fn backup_do_cow(BlockDriverState *bs,
+                                      int64_t sector_num, int nb_sectors)
+{
+    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    CowRequest cow_request;
+    struct iovec iov;
+    QEMUIOVector bounce_qiov;
+    void *bounce_buffer = NULL;
+    int ret = 0;
+    int64_t start, end;
+
+    qemu_co_rwlock_rdlock(&job->flush_rwlock);
+
+    start = sector_num / BACKUP_BLOCKS_PER_CLUSTER;
+    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_BLOCKS_PER_CLUSTER);
+
+    DPRINTF("brdv_co_backup_cow enter %s C%" PRId64 " %" PRId64 " %d\n",
+            bdrv_get_device_name(bs), start, sector_num, nb_sectors);
+
+    wait_for_overlapping_requests(job, start, end);
+    cow_request_begin(&cow_request, job, start, end);
+
+    for (; start < end; start++) {
+        if (hbitmap_get(job->bitmap, start)) {
+            DPRINTF("brdv_co_backup_cow skip C%" PRId64 "\n", start);
+            continue; /* already copied */
+        }
+
+        /* immediately set bitmap (avoid coroutine race) */
+        hbitmap_set(job->bitmap, start, 1);
+
+        DPRINTF("brdv_co_backup_cow C%" PRId64 "\n", start);
+
+        if (!bounce_buffer) {
+            iov.iov_len = BACKUP_CLUSTER_SIZE;
+            iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
+            qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+        }
+
+        ret = bdrv_co_readv(bs, start * BACKUP_BLOCKS_PER_CLUSTER,
+                            BACKUP_BLOCKS_PER_CLUSTER,
+                            &bounce_qiov);
+        if (ret < 0) {
+            DPRINTF("brdv_co_backup_cow bdrv_read C%" PRId64 " failed\n",
+                    start);
+            goto out;
+        }
+
+        job->sectors_read += BACKUP_BLOCKS_PER_CLUSTER;
+
+        if (!buffer_is_zero(bounce_buffer, BACKUP_CLUSTER_SIZE)) {
+            ret = bdrv_co_writev(job->target, start * BACKUP_BLOCKS_PER_CLUSTER,
+                                 BACKUP_BLOCKS_PER_CLUSTER,
+                                 &bounce_qiov);
+            if (ret < 0) {
+                DPRINTF("brdv_co_backup_cow dump_cluster_cb C%" PRId64
+                        " failed\n", start);
+                goto out;
+            }
+        }
+
+        DPRINTF("brdv_co_backup_cow done C%" PRId64 "\n", start);
+    }
+
+out:
+    if (bounce_buffer) {
+        qemu_vfree(bounce_buffer);
+    }
+
+    cow_request_end(&cow_request);
+
+    qemu_co_rwlock_unlock(&job->flush_rwlock);
+
+    return ret;
+}
+
+static void coroutine_fn backup_before_write_notify(Notifier *notifier,
+                                                    void *opaque)
+{
+    BdrvTrackedRequest *req = opaque;
+    backup_do_cow(req->bs, req->sector_num, req->nb_sectors);
+}
+
+static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    if (speed < 0) {
+        error_set(errp, QERR_INVALID_PARAMETER, "speed");
+        return;
+    }
+    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+}
+
+static BlockJobType backup_job_type = {
+    .instance_size = sizeof(BackupBlockJob),
+    .job_type = "backup",
+    .set_speed = backup_set_speed,
+};
+
+static void coroutine_fn backup_run(void *opaque)
+{
+    BackupBlockJob *job = opaque;
+    BlockDriverState *bs = job->common.bs;
+    Notifier before_write = {
+        .notify = backup_before_write_notify,
+    };
+    int64_t start, end;
+    int ret = 0;
+
+    QLIST_INIT(&job->inflight_reqs);
+    qemu_co_rwlock_init(&job->flush_rwlock);
+
+    start = 0;
+    end = DIV_ROUND_UP(bdrv_getlength(bs) / BDRV_SECTOR_SIZE,
+                       BACKUP_BLOCKS_PER_CLUSTER);
+
+    job->bitmap = hbitmap_alloc(end, 0);
+
+    bdrv_add_before_write_notifier(bs, &before_write);
+
+    DPRINTF("backup_run start %s %" PRId64 " %" PRId64 "\n",
+            bdrv_get_device_name(bs), start, end);
+
+    for (; start < end; start++) {
+        if (block_job_is_cancelled(&job->common)) {
+            break;
+        }
+
+        /* we need to yield so that qemu_aio_flush() returns.
+         * (without, VM does not reboot)
+         */
+        if (job->common.speed) {
+            uint64_t delay_ns = ratelimit_calculate_delay(
+                &job->limit, job->sectors_read);
+            job->sectors_read = 0;
+            block_job_sleep_ns(&job->common, rt_clock, delay_ns);
+        } else {
+            block_job_sleep_ns(&job->common, rt_clock, 0);
+        }
+
+        if (block_job_is_cancelled(&job->common)) {
+            break;
+        }
+
+        DPRINTF("backup_run loop C%" PRId64 "\n", start);
+
+        ret = backup_do_cow(bs, start * BACKUP_BLOCKS_PER_CLUSTER, 1);
+        if (ret < 0) {
+            break;
+        }
+
+        /* Publish progress */
+        job->common.offset += BACKUP_CLUSTER_SIZE;
+    }
+
+    notifier_remove(&before_write);
+
+    /* wait until pending backup_do_cow() calls have completed */
+    qemu_co_rwlock_wrlock(&job->flush_rwlock);
+    qemu_co_rwlock_unlock(&job->flush_rwlock);
+
+    hbitmap_free(job->bitmap);
+
+    bdrv_delete(job->target);
+
+    DPRINTF("backup_run complete %d\n", ret);
+    block_job_completed(&job->common, ret);
+}
+
+void backup_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed,
+                  BlockDriverCompletionFunc *cb, void *opaque,
+                  Error **errp)
+{
+    assert(bs);
+    assert(target);
+    assert(cb);
+
+    DPRINTF("backup_start %s\n", bdrv_get_device_name(bs));
+
+    BackupBlockJob *job = block_job_create(&backup_job_type, bs, speed,
+                                           cb, opaque, errp);
+    if (!job) {
+        return;
+    }
+
+    job->target = target;
+    job->common.len = bdrv_getlength(bs);
+    job->common.co = qemu_coroutine_create(backup_run);
+    qemu_coroutine_enter(job->common.co, job);
+}
diff --git a/include/block/block_int.h b/include/block/block_int.h
index a498fb0..625ebcf 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -397,4 +397,20 @@  void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                   BlockDriverCompletionFunc *cb,
                   void *opaque, Error **errp);
 
+/*
+ * backup_start:
+ * @bs: Block device to operate on.
+ * @target: Block device to write to.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ *
+ * Start a backup operation on @bs.  Clusters in @bs are written to @target
+ * until the job is cancelled or manually completed.
+ */
+void backup_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed,
+                  BlockDriverCompletionFunc *cb, void *opaque,
+                  Error **errp);
+
 #endif /* BLOCK_INT_H */