diff mbox

[v3,2/8] block: add basic backup support to block driver

Message ID 1368628476-19622-3-git-send-email-stefanha@redhat.com
State New
Headers show

Commit Message

Stefan Hajnoczi May 15, 2013, 2:34 p.m. UTC
From: Dietmar Maurer <dietmar@proxmox.com>

backup_start() creates a block job that copies a point-in-time snapshot
of a block device to a target block device.

We call backup_do_cow() for each write during backup. That function
reads the original data from the block device before it gets
overwritten.  The data is then written to the target device.

Currently backup cluster size is hardcoded to 65536 bytes.

[I made a number of changes to Dietmar's original patch and folded them
in to make code review easy.  Here is the full list:

 * Drop BackupDumpFunc interface in favor of a target block device
 * Detect zero clusters with buffer_is_zero()
 * Don't write zero clusters to the target
 * Use 0 delay instead of 1us, like other block jobs
 * Unify creation/start functions into backup_start()
 * Simplify cleanup, free bitmap in backup_run() instead of cb function
 * Use HBitmap to avoid duplicating bitmap code
 * Use bdrv_getlength() instead of accessing ->total_sectors directly
 * Delete the backup.h header file, it is no longer necessary
 * Move ./backup.c to block/backup.c
 * Remove #ifdefed out code
 * Coding style and whitespace cleanups
 * Use bdrv_add_before_write_cb() instead of blockjob-specific hooks
 * Keep our own in-flight CowRequest list instead of using block.c
   tracked requests

-- stefanha]

Signed-off-by: Dietmar Maurer <dietmar@proxmox.com>
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 block/Makefile.objs       |   1 +
 block/backup.c            | 282 ++++++++++++++++++++++++++++++++++++++++++++++
 include/block/block_int.h |  16 +++
 3 files changed, 299 insertions(+)
 create mode 100644 block/backup.c

Comments

Wayne Xia May 16, 2013, 3:27 a.m. UTC | #1
> From: Dietmar Maurer <dietmar@proxmox.com>
> 
> backup_start() creates a block job that copies a point-in-time snapshot
> of a block device to a target block device.
> 
> We call backup_do_cow() for each write during backup. That function
> reads the original data from the block device before it gets
> overwritten.  The data is then written to the target device.
> 
> Currently backup cluster size is hardcoded to 65536 bytes.
> 
> [I made a number of changes to Dietmar's original patch and folded them
> in to make code review easy.  Here is the full list:
> 
>   * Drop BackupDumpFunc interface in favor of a target block device
>   * Detect zero clusters with buffer_is_zero()
>   * Don't write zero clusters to the target
>   * Use 0 delay instead of 1us, like other block jobs
>   * Unify creation/start functions into backup_start()
>   * Simplify cleanup, free bitmap in backup_run() instead of cb
>   * function
>   * Use HBitmap to avoid duplicating bitmap code
>   * Use bdrv_getlength() instead of accessing ->total_sectors
>   * directly
>   * Delete the backup.h header file, it is no longer necessary
>   * Move ./backup.c to block/backup.c
>   * Remove #ifdefed out code
>   * Coding style and whitespace cleanups
>   * Use bdrv_add_before_write_cb() instead of blockjob-specific hooks
>   * Keep our own in-flight CowRequest list instead of using block.c
>     tracked requests
> 
> -- stefanha]
> 
> Signed-off-by: Dietmar Maurer <dietmar@proxmox.com>
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>   block/Makefile.objs       |   1 +
>   block/backup.c            | 282 ++++++++++++++++++++++++++++++++++++++++++++++
>   include/block/block_int.h |  16 +++
>   3 files changed, 299 insertions(+)
>   create mode 100644 block/backup.c
> 
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 5f0358a..88bd101 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -20,5 +20,6 @@ endif
>   common-obj-y += stream.o
>   common-obj-y += commit.o
>   common-obj-y += mirror.o
> +common-obj-y += backup.o
> 
>   $(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS)
> diff --git a/block/backup.c b/block/backup.c
> new file mode 100644
> index 0000000..01d2fd3
> --- /dev/null
> +++ b/block/backup.c
> @@ -0,0 +1,282 @@
> +/*
> + * QEMU backup
> + *
> + * Copyright (C) 2013 Proxmox Server Solutions
> + *
> + * Authors:
> + *  Dietmar Maurer (dietmar@proxmox.com)
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include <stdio.h>
> +#include <errno.h>
> +#include <unistd.h>
> +
> +#include "block/block.h"
> +#include "block/block_int.h"
> +#include "block/blockjob.h"
> +#include "qemu/ratelimit.h"
> +
> +#define DEBUG_BACKUP 0
> +
> +#define DPRINTF(fmt, ...) \
> +    do { \
> +        if (DEBUG_BACKUP) { \
> +            fprintf(stderr, "backup: " fmt, ## __VA_ARGS__); \
> +        } \
> +    } while (0)
> +
> +#define BACKUP_CLUSTER_BITS 16
> +#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
> +#define BACKUP_BLOCKS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
> +
> +#define SLICE_TIME 100000000ULL /* ns */
> +
> +typedef struct CowRequest {
> +    int64_t start;
> +    int64_t end;
> +    QLIST_ENTRY(CowRequest) list;
> +    CoQueue wait_queue; /* coroutines blocked on this request */
> +} CowRequest;
> +
> +typedef struct BackupBlockJob {
> +    BlockJob common;
> +    BlockDriverState *target;
> +    RateLimit limit;
> +    CoRwlock flush_rwlock;
> +    uint64_t sectors_read;
> +    HBitmap *bitmap;
> +    QLIST_HEAD(, CowRequest) inflight_reqs;
> +} BackupBlockJob;
> +
> +/* See if in-flight requests overlap and wait for them to complete */
> +static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
> +                                                       int64_t start,
> +                                                       int64_t end)
> +{
> +    CowRequest *req;
> +    bool retry;
> +
> +    do {
> +        retry = false;
> +        QLIST_FOREACH(req, &job->inflight_reqs, list) {
> +            if (end > req->start && start < req->end) {
> +                qemu_co_queue_wait(&req->wait_queue);
> +                retry = true;
> +                break;
> +            }
> +        }
> +    } while (retry);
> +}
> +

  In my understanding, it is possible for two program routines to enter
here at the same time, since this holds a read lock instead of a write
lock, and they may also modify job->inflight_reqs. Could there be a race
condition here? I am not sure whether the background job will become a
thread.
Stefan Hajnoczi May 16, 2013, 7:30 a.m. UTC | #2
On Thu, May 16, 2013 at 11:27:38AM +0800, Wenchao Xia wrote:
> > +/* See if in-flight requests overlap and wait for them to complete */
> > +static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
> > +                                                       int64_t start,
> > +                                                       int64_t end)
> > +{
> > +    CowRequest *req;
> > +    bool retry;
> > +
> > +    do {
> > +        retry = false;
> > +        QLIST_FOREACH(req, &job->inflight_reqs, list) {
> > +            if (end > req->start && start < req->end) {
> > +                qemu_co_queue_wait(&req->wait_queue);
> > +                retry = true;
> > +                break;
> > +            }
> > +        }
> > +    } while (retry);
> > +}
> > +
> 
>   In my understanding, there will be possible two program routines entering
> here at same time since it holds read lock instead of write lock, and
> they may also modify job->inflight_reqs, is it possible a race
> condition here? I am not sure whether back-ground job will becomes a
> thread.

No, all operations on a BlockDriverState execute in the same event loop
thread.  Only coroutine synchronization is necessary, which is provided
in these patches.
Paolo Bonzini May 20, 2013, 11:30 a.m. UTC | #3
Il 15/05/2013 16:34, Stefan Hajnoczi ha scritto:
> +    wait_for_overlapping_requests(job, start, end);
> +    cow_request_begin(&cow_request, job, start, end);
> +
> +    for (; start < end; start++) {
> +        if (hbitmap_get(job->bitmap, start)) {
> +            DPRINTF("brdv_co_backup_cow skip C%" PRId64 "\n", start);
> +            continue; /* already copied */
> +        }
> +
> +        /* immediately set bitmap (avoid coroutine race) */
> +        hbitmap_set(job->bitmap, start, 1);
> +

HBitmap already has code for finding the next set bit, but you're not
using it.

If you reverse the direction of the bitmap, you can use an HBitmapIter here:

> 
> +    start = 0;
> +    end = DIV_ROUND_UP(bdrv_getlength(bs) / BDRV_SECTOR_SIZE,
> +                       BACKUP_BLOCKS_PER_CLUSTER);
> +
> +    job->bitmap = hbitmap_alloc(end, 0);
> +
> +    before_write = bdrv_add_before_write_cb(bs, backup_before_write);
> +
> +    DPRINTF("backup_run start %s %" PRId64 " %" PRId64 "\n",
> +            bdrv_get_device_name(bs), start, end);
> +
> +    for (; start < end; start++) {

... instead of iterating through each item manually.

Paolo

> +        if (block_job_is_cancelled(&job->common)) {
> +            break;
> +        }
> +
> +        /* we need to yield so that qemu_aio_flush() returns.
> +         * (without, VM does not reboot)
> +         */
> +        if (job->common.speed) {
> +            uint64_t delay_ns = ratelimit_calculate_delay(
> +                &job->limit, job->sectors_read);
> +            job->sectors_read = 0;
> +            block_job_sleep_ns(&job->common, rt_clock, delay_ns);
> +        } else {
> +            block_job_sleep_ns(&job->common, rt_clock, 0);
> +        }
> +
> +        if (block_job_is_cancelled(&job->common)) {
> +            break;
> +        }
> +
> +        DPRINTF("backup_run loop C%" PRId64 "\n", start);
> +
> +        ret = backup_do_cow(bs, start * BACKUP_BLOCKS_PER_CLUSTER, 1);
> +        if (ret < 0) {
> +            break;
> +        }
> +
> +        /* Publish progress */
> +        job->common.offset += BACKUP_CLUSTER_SIZE;
> +    }
Stefan Hajnoczi May 21, 2013, 1:34 p.m. UTC | #4
On Mon, May 20, 2013 at 01:30:37PM +0200, Paolo Bonzini wrote:
> Il 15/05/2013 16:34, Stefan Hajnoczi ha scritto:
> > +    wait_for_overlapping_requests(job, start, end);
> > +    cow_request_begin(&cow_request, job, start, end);
> > +
> > +    for (; start < end; start++) {
> > +        if (hbitmap_get(job->bitmap, start)) {
> > +            DPRINTF("brdv_co_backup_cow skip C%" PRId64 "\n", start);
> > +            continue; /* already copied */
> > +        }
> > +
> > +        /* immediately set bitmap (avoid coroutine race) */
> > +        hbitmap_set(job->bitmap, start, 1);
> > +
> 
> HBitmap already has code for finding the next set bit, but you're not
> using it.
> 
> If you reverse the direction of the bitmap, you can use an HBitmapIter here:
> 
> > 
> > +    start = 0;
> > +    end = DIV_ROUND_UP(bdrv_getlength(bs) / BDRV_SECTOR_SIZE,
> > +                       BACKUP_BLOCKS_PER_CLUSTER);
> > +
> > +    job->bitmap = hbitmap_alloc(end, 0);
> > +
> > +    before_write = bdrv_add_before_write_cb(bs, backup_before_write);
> > +
> > +    DPRINTF("backup_run start %s %" PRId64 " %" PRId64 "\n",
> > +            bdrv_get_device_name(bs), start, end);
> > +
> > +    for (; start < end; start++) {
> 
> ... instead of iterating through each item manually.

/**
 * hbitmap_iter_init:
[...]
 * position of the iterator is also okay.  However, concurrent resetting of
 * bits can lead to unexpected behavior if the iterator has not yet reached
 * those bits. 
 */ 
void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first);

This worries me.  We would initialize the bitmap to all 1s.  Backing up
a cluster resets the bit.  But the documentation says it is not safe to
reset bits while iterating?

Stefan
Paolo Bonzini May 21, 2013, 3:11 p.m. UTC | #5
Il 21/05/2013 15:34, Stefan Hajnoczi ha scritto:
> /**
>  * hbitmap_iter_init:
> [...]
>  * position of the iterator is also okay.  However, concurrent resetting of
>  * bits can lead to unexpected behavior if the iterator has not yet reached
>  * those bits. 
>  */ 
> void hbitmap_iter_init(HBitmapIter *hbi, const HBitmap *hb, uint64_t first);
> 
> This worries me.  We would initialize the bitmap to all 1s.  Backing up
> a cluster resets the bit.  But the documentation says it is not safe to
> reset bits while iterating?

Hmm, right.  But do we need the bitmap at all?  We can just use
bdrv_is_allocated like bdrv_co_do_readv does.

The code has a comment:

> +        /* immediately set bitmap (avoid coroutine race) */
> +        hbitmap_set(job->bitmap, start, 1);

but wouldn't this be avoided anyway, because of the mutual exclusion
between overlapping requests?

Paolo
Dietmar Maurer May 21, 2013, 3:25 p.m. UTC | #6
> Hmm, right.  But do we need the bitmap at all?  We can just use
> bdrv_is_allocated like bdrv_co_do_readv does.

If a write occurs, we read and back up that cluster immediately (out of order). So
I am quite sure we need the bitmap.
Paolo Bonzini May 21, 2013, 3:35 p.m. UTC | #7
Il 21/05/2013 17:25, Dietmar Maurer ha scritto:
>> Hmm, right.  But do we need the bitmap at all?  We can just use
>> > bdrv_is_allocated like bdrv_co_do_readv does.
> If a write occur, we read and backup that cluster immediately (out of order). So
> I am quite sure we need the bitmap.

This is the same as how copy-on-read happens during image streaming, and
it doesn't use a bitmap.

Paolo
Dietmar Maurer May 21, 2013, 3:54 p.m. UTC | #8
> >> Hmm, right.  But do we need the bitmap at all?  We can just use
> >> > bdrv_is_allocated like bdrv_co_do_readv does.
> > If a write occur, we read and backup that cluster immediately (out of
> > order). So I am quite sure we need the bitmap.
> 
> This is the same as how copy-on-read happens during image streaming, and it
> doesn't use a bitmap.

But a read does not modify the content.
Paolo Bonzini May 21, 2013, 4:03 p.m. UTC | #9
Il 21/05/2013 17:54, Dietmar Maurer ha scritto:
>>>> Hmm, right.  But do we need the bitmap at all?  We can just use
>>>>> bdrv_is_allocated like bdrv_co_do_readv does.
>>> If a write occur, we read and backup that cluster immediately (out of
>>> order). So I am quite sure we need the bitmap.
>>
>> This is the same as how copy-on-read happens during image streaming, and it
>> doesn't use a bitmap.
> 
> But a read does not modify the content.

Copy-on-read modifies the topmost image even without changing the
content, just like copy-before-write modifies the backup image.

But writes to the backup copy are serialized anyway via
wait_for_overlapping_requests, so there is no problem here.

Paolo
Dietmar Maurer May 21, 2013, 4:15 p.m. UTC | #10
> Copy-on-read modifies the topmost image even without changing the content,
> just like copy-before-write modifies the backup image.

Ah, got it. But this only works if you use a BS as target. This will not work with my
old backup driver patches (I am still not convinced about using a BS as backup target)
Dietmar Maurer May 21, 2013, 4:26 p.m. UTC | #11
> Hmm, right.  But do we need the bitmap at all?  We can just use
> bdrv_is_allocated like bdrv_co_do_readv does.

Does that work with an NBD driver? Or does that add another RPC call (slowdown)?
Paolo Bonzini May 21, 2013, 4:46 p.m. UTC | #12
Il 21/05/2013 18:26, Dietmar Maurer ha scritto:
>> Hmm, right.  But do we need the bitmap at all?  We can just use
>> > bdrv_is_allocated like bdrv_co_do_readv does.
> Does that works with a nbd driver?

Ah, right.  That's the answer.

> Or does that add another RPC call (slow down)?

It doesn't work at all.  Thus Stefan's patch is fine (except if you
don't use HBitmapIter, there's no advantage in using HBitmap; you can
use qemu/bitmap.h instead).

Paolo
Stefan Hajnoczi May 22, 2013, 1:37 p.m. UTC | #13
On Tue, May 21, 2013 at 06:46:39PM +0200, Paolo Bonzini wrote:
> Il 21/05/2013 18:26, Dietmar Maurer ha scritto:
> >> Hmm, right.  But do we need the bitmap at all?  We can just use
> >> > bdrv_is_allocated like bdrv_co_do_readv does.
> > Does that works with a nbd driver?
> 
> Ah, right.  That's the answer.

Yes, the target may not support allocation info.

> > Or does that add another RPC call (slow down)?
> 
> It doesn't work at all.  Thus Stefan's patch is fine (except if you
> don't use HBitmapIter, there's no advantage in using HBitmap; you can
> use qemu/bitmap.h instead).

I chose HBitmap because bitmap.h uses the int type instead of uint64_t,
which makes it risky to use in the block layer.

Stefan
diff mbox

Patch

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 5f0358a..88bd101 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -20,5 +20,6 @@  endif
 common-obj-y += stream.o
 common-obj-y += commit.o
 common-obj-y += mirror.o
+common-obj-y += backup.o
 
 $(obj)/curl.o: QEMU_CFLAGS+=$(CURL_CFLAGS)
diff --git a/block/backup.c b/block/backup.c
new file mode 100644
index 0000000..01d2fd3
--- /dev/null
+++ b/block/backup.c
@@ -0,0 +1,282 @@ 
+/*
+ * QEMU backup
+ *
+ * Copyright (C) 2013 Proxmox Server Solutions
+ *
+ * Authors:
+ *  Dietmar Maurer (dietmar@proxmox.com)
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "block/block.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+#include "qemu/ratelimit.h"
+
+#define DEBUG_BACKUP 0
+
+#define DPRINTF(fmt, ...) \
+    do { \
+        if (DEBUG_BACKUP) { \
+            fprintf(stderr, "backup: " fmt, ## __VA_ARGS__); \
+        } \
+    } while (0)
+
+#define BACKUP_CLUSTER_BITS 16
+#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS)
+#define BACKUP_BLOCKS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE)
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct CowRequest {
+    int64_t start;
+    int64_t end;
+    QLIST_ENTRY(CowRequest) list;
+    CoQueue wait_queue; /* coroutines blocked on this request */
+} CowRequest;
+
+typedef struct BackupBlockJob {
+    BlockJob common;
+    BlockDriverState *target;
+    RateLimit limit;
+    CoRwlock flush_rwlock;
+    uint64_t sectors_read;
+    HBitmap *bitmap;
+    QLIST_HEAD(, CowRequest) inflight_reqs;
+} BackupBlockJob;
+
+/* See if in-flight requests overlap and wait for them to complete */
+static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job,
+                                                       int64_t start,
+                                                       int64_t end)
+{
+    CowRequest *req;
+    bool retry;
+
+    do {
+        retry = false;
+        QLIST_FOREACH(req, &job->inflight_reqs, list) {
+            if (end > req->start && start < req->end) {
+                qemu_co_queue_wait(&req->wait_queue);
+                retry = true;
+                break;
+            }
+        }
+    } while (retry);
+}
+
+/* Keep track of an in-flight request */
+static void cow_request_begin(CowRequest *req, BackupBlockJob *job,
+                                     int64_t start, int64_t end)
+{
+    req->start = start;
+    req->end = end;
+    qemu_co_queue_init(&req->wait_queue);
+    QLIST_INSERT_HEAD(&job->inflight_reqs, req, list);
+}
+
+/* Forget about a completed request */
+static void cow_request_end(CowRequest *req)
+{
+    QLIST_REMOVE(req, list);
+    qemu_co_queue_restart_all(&req->wait_queue);
+}
+
+static int coroutine_fn backup_do_cow(BlockDriverState *bs,
+                                      int64_t sector_num, int nb_sectors)
+{
+    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    CowRequest cow_request;
+    struct iovec iov;
+    QEMUIOVector bounce_qiov;
+    void *bounce_buffer = NULL;
+    int ret = 0;
+    int64_t start, end;
+
+    qemu_co_rwlock_rdlock(&job->flush_rwlock);
+
+    start = sector_num / BACKUP_BLOCKS_PER_CLUSTER;
+    end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_BLOCKS_PER_CLUSTER);
+
+    DPRINTF("brdv_co_backup_cow enter %s C%" PRId64 " %" PRId64 " %d\n",
+            bdrv_get_device_name(bs), start, sector_num, nb_sectors);
+
+    wait_for_overlapping_requests(job, start, end);
+    cow_request_begin(&cow_request, job, start, end);
+
+    for (; start < end; start++) {
+        if (hbitmap_get(job->bitmap, start)) {
+            DPRINTF("brdv_co_backup_cow skip C%" PRId64 "\n", start);
+            continue; /* already copied */
+        }
+
+        /* immediately set bitmap (avoid coroutine race) */
+        hbitmap_set(job->bitmap, start, 1);
+
+        DPRINTF("brdv_co_backup_cow C%" PRId64 "\n", start);
+
+        if (!bounce_buffer) {
+            iov.iov_len = BACKUP_CLUSTER_SIZE;
+            iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
+            qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+        }
+
+        ret = bdrv_co_readv(bs, start * BACKUP_BLOCKS_PER_CLUSTER,
+                            BACKUP_BLOCKS_PER_CLUSTER,
+                            &bounce_qiov);
+        if (ret < 0) {
+            DPRINTF("brdv_co_backup_cow bdrv_read C%" PRId64 " failed\n",
+                    start);
+            goto out;
+        }
+
+        job->sectors_read += BACKUP_BLOCKS_PER_CLUSTER;
+
+        if (!buffer_is_zero(bounce_buffer, BACKUP_CLUSTER_SIZE)) {
+            ret = bdrv_co_writev(job->target, start * BACKUP_BLOCKS_PER_CLUSTER,
+                                 BACKUP_BLOCKS_PER_CLUSTER,
+                                 &bounce_qiov);
+            if (ret < 0) {
+                DPRINTF("brdv_co_backup_cow dump_cluster_cb C%" PRId64
+                        " failed\n", start);
+                goto out;
+            }
+        }
+
+        DPRINTF("brdv_co_backup_cow done C%" PRId64 "\n", start);
+    }
+
+out:
+    if (bounce_buffer) {
+        qemu_vfree(bounce_buffer);
+    }
+
+    cow_request_end(&cow_request);
+
+    qemu_co_rwlock_unlock(&job->flush_rwlock);
+
+    return ret;
+}
+
+static void coroutine_fn backup_before_write(BlockDriverState *bs,
+                                             int64_t sector_num,
+                                             int nb_sectors,
+                                             QEMUIOVector *qiov)
+{
+    backup_do_cow(bs, sector_num, nb_sectors);
+}
+
+static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    if (speed < 0) {
+        error_set(errp, QERR_INVALID_PARAMETER, "speed");
+        return;
+    }
+    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+}
+
+static BlockJobType backup_job_type = {
+    .instance_size = sizeof(BackupBlockJob),
+    .job_type = "backup",
+    .set_speed = backup_set_speed,
+};
+
+static void coroutine_fn backup_run(void *opaque)
+{
+    BackupBlockJob *job = opaque;
+    BlockDriverState *bs = job->common.bs;
+    BDRVBeforeWrite *before_write;
+    int64_t start, end;
+    int ret = 0;
+
+    QLIST_INIT(&job->inflight_reqs);
+    qemu_co_rwlock_init(&job->flush_rwlock);
+
+    start = 0;
+    end = DIV_ROUND_UP(bdrv_getlength(bs) / BDRV_SECTOR_SIZE,
+                       BACKUP_BLOCKS_PER_CLUSTER);
+
+    job->bitmap = hbitmap_alloc(end, 0);
+
+    before_write = bdrv_add_before_write_cb(bs, backup_before_write);
+
+    DPRINTF("backup_run start %s %" PRId64 " %" PRId64 "\n",
+            bdrv_get_device_name(bs), start, end);
+
+    for (; start < end; start++) {
+        if (block_job_is_cancelled(&job->common)) {
+            break;
+        }
+
+        /* we need to yield so that qemu_aio_flush() returns.
+         * (without, VM does not reboot)
+         */
+        if (job->common.speed) {
+            uint64_t delay_ns = ratelimit_calculate_delay(
+                &job->limit, job->sectors_read);
+            job->sectors_read = 0;
+            block_job_sleep_ns(&job->common, rt_clock, delay_ns);
+        } else {
+            block_job_sleep_ns(&job->common, rt_clock, 0);
+        }
+
+        if (block_job_is_cancelled(&job->common)) {
+            break;
+        }
+
+        DPRINTF("backup_run loop C%" PRId64 "\n", start);
+
+        ret = backup_do_cow(bs, start * BACKUP_BLOCKS_PER_CLUSTER, 1);
+        if (ret < 0) {
+            break;
+        }
+
+        /* Publish progress */
+        job->common.offset += BACKUP_CLUSTER_SIZE;
+    }
+
+    bdrv_remove_before_write_cb(bs, before_write);
+
+    /* wait until pending backup_do_cow() calls have completed */
+    qemu_co_rwlock_wrlock(&job->flush_rwlock);
+    qemu_co_rwlock_unlock(&job->flush_rwlock);
+
+    hbitmap_free(job->bitmap);
+
+    bdrv_delete(job->target);
+
+    DPRINTF("backup_run complete %d\n", ret);
+    block_job_completed(&job->common, ret);
+}
+
+void backup_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed,
+                  BlockDriverCompletionFunc *cb, void *opaque,
+                  Error **errp)
+{
+    assert(bs);
+    assert(target);
+    assert(cb);
+
+    DPRINTF("backup_start %s\n", bdrv_get_device_name(bs));
+
+    BackupBlockJob *job = block_job_create(&backup_job_type, bs, speed,
+                                           cb, opaque, errp);
+    if (!job) {
+        return;
+    }
+
+    job->target = target;
+    job->common.len = bdrv_getlength(bs);
+    job->common.co = qemu_coroutine_create(backup_run);
+    qemu_coroutine_enter(job->common.co, job);
+}
diff --git a/include/block/block_int.h b/include/block/block_int.h
index e2299df..e10ac50 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -409,4 +409,20 @@  void mirror_start(BlockDriverState *bs, BlockDriverState *target,
                   BlockDriverCompletionFunc *cb,
                   void *opaque, Error **errp);
 
+/*
+ * backup_start:
+ * @bs: Block device to operate on.
+ * @target: Block device to write to.
+ * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
+ * @cb: Completion function for the job.
+ * @opaque: Opaque pointer value passed to @cb.
+ *
+ * Start a backup operation on @bs.  Clusters in @bs are written to @target
+ * until the job is cancelled or manually completed.
+ */
+void backup_start(BlockDriverState *bs, BlockDriverState *target,
+                  int64_t speed,
+                  BlockDriverCompletionFunc *cb, void *opaque,
+                  Error **errp);
+
 #endif /* BLOCK_INT_H */