Patchwork [v4,2/6] add basic backup support to block driver

login
register
mail settings
Submitter Dietmar Maurer
Date Feb. 20, 2013, 9:31 a.m.
Message ID <1361352723-218544-3-git-send-email-dietmar@proxmox.com>
Download mbox | patch
Permalink /patch/222041/
State New
Headers show

Comments

Dietmar Maurer - Feb. 20, 2013, 9:31 a.m.
Function backup_job_create() creates a block job to backup a block device.
The coroutine is started with backup_job_start().

We call backup_do_cow() for each write during backup. That function
reads the original data and pass it to backup_dump_cb().

The tracked_request infrastructure is used to serialize access.

Currently backup cluster size is hardcoded to 65536 bytes.

Signed-off-by: Dietmar Maurer <dietmar@proxmox.com>
---
 Makefile.objs            |    1 +
 backup.c                 |  338 ++++++++++++++++++++++++++++++++++++++++++++++
 backup.h                 |   32 +++++
 block.c                  |   71 +++++++++-
 include/block/block.h    |    2 +
 include/block/blockjob.h |   10 ++
 6 files changed, 448 insertions(+), 6 deletions(-)
 create mode 100644 backup.c
 create mode 100644 backup.h
Stefan Hajnoczi - Feb. 20, 2013, 1:56 p.m.
On Wed, Feb 20, 2013 at 10:31:59AM +0100, Dietmar Maurer wrote:
> +typedef struct BackupBlockJob {
> +    BlockJob common;
> +    RateLimit limit;
> +    uint64_t sectors_read;
> +    unsigned long *bitmap;
> +    int bitmap_size;
> +    BackupDumpFunc *backup_dump_cb;
> +    BlockDriverCompletionFunc *backup_complete_cb;
> +    void *opaque;
> +} BackupBlockJob;
> +
> +static int backup_get_bitmap(BackupBlockJob *job, int64_t cluster_num)

bool return value

> +{
> +    assert(job);
> +    assert(job->bitmap);
> +
> +    unsigned long val, idx, bit;
> +
> +    idx = cluster_num / BITS_PER_LONG;
> +
> +    assert(job->bitmap_size > idx);
> +
> +    bit = cluster_num % BITS_PER_LONG;
> +    val = job->bitmap[idx];
> +
> +    return !!(val & (1UL << bit));
> +}
> +
> +static void backup_set_bitmap(BackupBlockJob *job, int64_t cluster_num,
> +                              int dirty)

s/int dirty/bool dirty/

> +{
> +    assert(job);
> +    assert(job->bitmap);
> +
> +    unsigned long val, idx, bit;
> +
> +    idx = cluster_num / BITS_PER_LONG;
> +
> +    assert(job->bitmap_size > idx);
> +
> +    bit = cluster_num % BITS_PER_LONG;
> +    val = job->bitmap[idx];
> +    if (dirty) {
> +        if (!(val & (1UL << bit))) {
> +            val |= 1UL << bit;
> +        }
> +    } else {
> +        if (val & (1UL << bit)) {
> +            val &= ~(1UL << bit);
> +        }
> +    }

The set and clear can be unconditional.  No need to check that the bit
is clear before setting it and vice versa.

> +    job->bitmap[idx] = val;
> +}
> +
> +static int backup_in_progress_count;
> +
> +static int coroutine_fn backup_do_cow(BlockDriverState *bs,
> +                                      int64_t sector_num, int nb_sectors)
> +{
> +    assert(bs);
> +    BackupBlockJob *job = (BackupBlockJob *)bs->job;
> +    assert(job);
> +
> +    BlockDriver *drv = bs->drv;
> +    struct iovec iov;
> +    QEMUIOVector bounce_qiov;
> +    void *bounce_buffer = NULL;
> +    int ret = 0;
> +
> +    backup_in_progress_count++;
> +
> +    int64_t start, end;
> +
> +    start = sector_num / BACKUP_BLOCKS_PER_CLUSTER;
> +    end = (sector_num + nb_sectors + BACKUP_BLOCKS_PER_CLUSTER - 1) /
> +        BACKUP_BLOCKS_PER_CLUSTER;
> +
> +    DPRINTF("brdv_co_backup_cow enter %s C%zd %zd %d\n",
> +            bdrv_get_device_name(bs), start, sector_num, nb_sectors);

Please use PRId64 for int64_t printf arguments in this function.  "z" is
for size_t, which will be wrong on 32-bit hosts.

> +static void coroutine_fn backup_run(void *opaque)
> +{
> +    BackupBlockJob *job = opaque;
> +    BlockDriverState *bs = job->common.bs;
> +    assert(bs);
> +
> +    int64_t start, end;
> +
> +    start = 0;
> +    end = (bs->total_sectors + BACKUP_BLOCKS_PER_CLUSTER - 1) /
> +        BACKUP_BLOCKS_PER_CLUSTER;
> +
> +    DPRINTF("backup_run start %s %zd %zd\n", bdrv_get_device_name(bs),
> +            start, end);
> +
> +    int ret = 0;
> +
> +    for (; start < end; start++) {
> +        if (block_job_is_cancelled(&job->common)) {
> +            ret = -1;
> +            break;
> +        }
> +
> +        /* we need to yield so that qemu_aio_flush() returns.
> +         * (without, VM does not reboot)
> +        * Note: use 1000 instead of 0 (0 prioritize this task too much)

indentation

What does "0 prioritize this task too much" mean?  If no rate limit has
been set the job should run at full speed.  We should not hardcode
arbitrary delays like 1000.

> +         */
> +        if (job->common.speed) {
> +            uint64_t delay_ns = ratelimit_calculate_delay(
> +                &job->limit, job->sectors_read);
> +            job->sectors_read = 0;
> +            block_job_sleep_ns(&job->common, rt_clock, delay_ns);
> +        } else {
> +            block_job_sleep_ns(&job->common, rt_clock, 1000);
> +        }
> +
> +        if (block_job_is_cancelled(&job->common)) {
> +            ret = -1;
> +            break;
> +        }
> +
> +        if (backup_get_bitmap(job, start)) {
> +            continue; /* already copied */
> +        }
> +
> +        DPRINTF("backup_run loop C%zd\n", start);
> +
> +        /**
> +         * This triggers a cluster copy
> +         * Note: avoid direct call to brdv_co_backup_cow, because
> +         * this does not call tracked_request_begin()
> +         */
> +        ret = bdrv_co_backup(bs, start*BACKUP_BLOCKS_PER_CLUSTER, 1);
> +        if (ret < 0) {
> +            break;
> +        }
> +        /* Publish progress */
> +        job->common.offset += BACKUP_CLUSTER_SIZE;
> +    }
> +
> +    while (backup_in_progress_count > 0) {
> +        DPRINTF("backup_run backup_in_progress_count != 0 (%d)",
> +                backup_in_progress_count);
> +        block_job_sleep_ns(&job->common, rt_clock, 10000);
> +
> +    }

Sleeping is bad.  You can use a coroutine rwlock: backup_do_cow() takes
a read lock and backup_run() takes a write lock to ensure all pending
backup_do_cow() calls have completed.

This also gets rid of the backup_in_progress_count global.

> diff --git a/backup.h b/backup.h
> new file mode 100644
> index 0000000..d9395bc
> --- /dev/null
> +++ b/backup.h
> @@ -0,0 +1,32 @@
> +/*
> + * QEMU backup related definitions
> + *
> + * Copyright (C) 2013 Proxmox Server Solutions
> + *
> + * Authors:
> + *  Dietmar Maurer (dietmar@proxmox.com)
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef QEMU_BACKUP_H
> +#define QEMU_BACKUP_H
> +
> +#include <uuid/uuid.h>

Not used in this patch.
Dietmar Maurer - Feb. 21, 2013, 5:23 a.m.
> > +    bit = cluster_num % BITS_PER_LONG;
> > +    val = job->bitmap[idx];
> > +    if (dirty) {
> > +        if (!(val & (1UL << bit))) {
> > +            val |= 1UL << bit;
> > +        }
> > +    } else {
> > +        if (val & (1UL << bit)) {
> > +            val &= ~(1UL << bit);
> > +        }
> > +    }
> 
> The set and clear can be unconditional.  No need to check that the bit is clear
> before setting it and vice versa.

Thanks, will change.
 
> > +    job->bitmap[idx] = val;
> > +}
> > +
> > +static int backup_in_progress_count;
> > +
> > +static int coroutine_fn backup_do_cow(BlockDriverState *bs,
> > +                                      int64_t sector_num, int
> > +nb_sectors) {
> > +    assert(bs);
> > +    BackupBlockJob *job = (BackupBlockJob *)bs->job;
> > +    assert(job);
> > +
> > +    BlockDriver *drv = bs->drv;
> > +    struct iovec iov;
> > +    QEMUIOVector bounce_qiov;
> > +    void *bounce_buffer = NULL;
> > +    int ret = 0;
> > +
> > +    backup_in_progress_count++;
> > +
> > +    int64_t start, end;
> > +
> > +    start = sector_num / BACKUP_BLOCKS_PER_CLUSTER;
> > +    end = (sector_num + nb_sectors + BACKUP_BLOCKS_PER_CLUSTER - 1) /
> > +        BACKUP_BLOCKS_PER_CLUSTER;
> > +
> > +    DPRINTF("brdv_co_backup_cow enter %s C%zd %zd %d\n",
> > +            bdrv_get_device_name(bs), start, sector_num, nb_sectors);
> 
> Please use PRId64 for int64_t printf arguments in this function.  "z" is for size_t,
> which will be wrong on 32-bit hosts.

OK

> > +static void coroutine_fn backup_run(void *opaque) {
> > +    BackupBlockJob *job = opaque;
> > +    BlockDriverState *bs = job->common.bs;
> > +    assert(bs);
> > +
> > +    int64_t start, end;
> > +
> > +    start = 0;
> > +    end = (bs->total_sectors + BACKUP_BLOCKS_PER_CLUSTER - 1) /
> > +        BACKUP_BLOCKS_PER_CLUSTER;
> > +
> > +    DPRINTF("backup_run start %s %zd %zd\n", bdrv_get_device_name(bs),
> > +            start, end);
> > +
> > +    int ret = 0;
> > +
> > +    for (; start < end; start++) {
> > +        if (block_job_is_cancelled(&job->common)) {
> > +            ret = -1;
> > +            break;
> > +        }
> > +
> > +        /* we need to yield so that qemu_aio_flush() returns.
> > +         * (without, VM does not reboot)
> > +        * Note: use 1000 instead of 0 (0 prioritize this task too
> > + much)
> 
> indentation
> 
> What does "0 prioritize this task too much" mean?  If no rate limit has been set
> the job should run at full speed.  We should not hardcode arbitrary delays like
> 1000.

The VM itself gets somehow slower during backup - do not know why. As workaround sleep 1000 works.

> > +         */
> > +        if (job->common.speed) {
> > +            uint64_t delay_ns = ratelimit_calculate_delay(
> > +                &job->limit, job->sectors_read);
> > +            job->sectors_read = 0;
> > +            block_job_sleep_ns(&job->common, rt_clock, delay_ns);
> > +        } else {
> > +            block_job_sleep_ns(&job->common, rt_clock, 1000);
> > +        }
> > +
> > +        if (block_job_is_cancelled(&job->common)) {
> > +            ret = -1;
> > +            break;
> > +        }
> > +
> > +        if (backup_get_bitmap(job, start)) {
> > +            continue; /* already copied */
> > +        }
> > +
> > +        DPRINTF("backup_run loop C%zd\n", start);
> > +
> > +        /**
> > +         * This triggers a cluster copy
> > +         * Note: avoid direct call to brdv_co_backup_cow, because
> > +         * this does not call tracked_request_begin()
> > +         */
> > +        ret = bdrv_co_backup(bs, start*BACKUP_BLOCKS_PER_CLUSTER, 1);
> > +        if (ret < 0) {
> > +            break;
> > +        }
> > +        /* Publish progress */
> > +        job->common.offset += BACKUP_CLUSTER_SIZE;
> > +    }
> > +
> > +    while (backup_in_progress_count > 0) {
> > +        DPRINTF("backup_run backup_in_progress_count != 0 (%d)",
> > +                backup_in_progress_count);
> > +        block_job_sleep_ns(&job->common, rt_clock, 10000);
> > +
> > +    }
> 
> Sleeping is bad.  
> You can use a coroutine rwlock: backup_do_cow() takes a read
> lock and backup_run() takes a write lock to ensure all pending
> backup_do_cow() calls have completed.
> 
> This also gets rid of the backup_in_progress_count global.

OK, will do.
Stefan Hajnoczi - Feb. 21, 2013, 12:46 p.m.
On Thu, Feb 21, 2013 at 05:23:30AM +0000, Dietmar Maurer wrote:
> > > +static void coroutine_fn backup_run(void *opaque) {
> > > +    BackupBlockJob *job = opaque;
> > > +    BlockDriverState *bs = job->common.bs;
> > > +    assert(bs);
> > > +
> > > +    int64_t start, end;
> > > +
> > > +    start = 0;
> > > +    end = (bs->total_sectors + BACKUP_BLOCKS_PER_CLUSTER - 1) /
> > > +        BACKUP_BLOCKS_PER_CLUSTER;
> > > +
> > > +    DPRINTF("backup_run start %s %zd %zd\n", bdrv_get_device_name(bs),
> > > +            start, end);
> > > +
> > > +    int ret = 0;
> > > +
> > > +    for (; start < end; start++) {
> > > +        if (block_job_is_cancelled(&job->common)) {
> > > +            ret = -1;
> > > +            break;
> > > +        }
> > > +
> > > +        /* we need to yield so that qemu_aio_flush() returns.
> > > +         * (without, VM does not reboot)
> > > +        * Note: use 1000 instead of 0 (0 prioritize this task too
> > > + much)
> > 
> > indentation
> > 
> > What does "0 prioritize this task too much" mean?  If no rate limit has been set
> > the job should run at full speed.  We should not hardcode arbitrary delays like
> > 1000.
> 
> The VM itself gets somehow slower during backup - do not know why. As workaround sleep 1000 works.

Please find out why, it's a bug that an arbitrary sleep hides but
doesn't fix (plus the sleep makes backup less efficient).

If the VM becomes slow this loop is probably "spinning" without doing
blocking I/O and only doing sleep 0.  I guess that can happen when you
loop over blocks that have already been backed up (bit has been set)?

Stefan
Dietmar Maurer - Feb. 25, 2013, 6:48 a.m.
> > > > +    for (; start < end; start++) {
> > > > +        if (block_job_is_cancelled(&job->common)) {
> > > > +            ret = -1;
> > > > +            break;
> > > > +        }
> > > > +
> > > > +        /* we need to yield so that qemu_aio_flush() returns.
> > > > +         * (without, VM does not reboot)
> > > > +        * Note: use 1000 instead of 0 (0 prioritize this task too
> > > > + much)
> > >
> > > indentation
> > >
> > > What does "0 prioritize this task too much" mean?  If no rate limit
> > > has been set the job should run at full speed.  We should not
> > > hardcode arbitrary delays like 1000.
> >
> > The VM itself gets somehow slower during backup - do not know why. As
> workaround sleep 1000 works.
> 
> Please find out why, it's a bug that an arbitrary sleep hides but doesn't fix (plus
> the sleep makes backup less efficient).
> 
> If the VM becomes slow this loop is probably "spinning" without doing blocking
> I/O and only doing sleep 0.  I guess that can happen when you loop over blocks
> that have already been backed up (bit has been set)?

Well, 'slow' is the wrong term. The VM just gets a bit unresponsive - its just a feeling.

I think this is because the backup job runs at same priority as normal guest IO.

We previously used LVM and run backup with 'idle' IO priority (CFQ) to avoid such behavior.

But qemu does not provide an IO queue where we can set scheduling priorities?
Stefan Hajnoczi - Feb. 25, 2013, 1:27 p.m.
On Mon, Feb 25, 2013 at 06:48:39AM +0000, Dietmar Maurer wrote:
> > > > > +    for (; start < end; start++) {
> > > > > +        if (block_job_is_cancelled(&job->common)) {
> > > > > +            ret = -1;
> > > > > +            break;
> > > > > +        }
> > > > > +
> > > > > +        /* we need to yield so that qemu_aio_flush() returns.
> > > > > +         * (without, VM does not reboot)
> > > > > +        * Note: use 1000 instead of 0 (0 prioritize this task too
> > > > > + much)
> > > >
> > > > indentation
> > > >
> > > > What does "0 prioritize this task too much" mean?  If no rate limit
> > > > has been set the job should run at full speed.  We should not
> > > > hardcode arbitrary delays like 1000.
> > >
> > > The VM itself gets somehow slower during backup - do not know why. As
> > workaround sleep 1000 works.
> > 
> > Please find out why, it's a bug that an arbitrary sleep hides but doesn't fix (plus
> > the sleep makes backup less efficient).
> > 
> > If the VM becomes slow this loop is probably "spinning" without doing blocking
> > I/O and only doing sleep 0.  I guess that can happen when you loop over blocks
> > that have already been backed up (bit has been set)?
> 
> Well, 'slow' is the wrong term. The VM just gets a bit unresponsive - its just a feeling.
> 
> I think this is because the backup job runs at same priority as normal guest IO.
> 
> We previously used LVM and run backup with 'idle' IO priority (CFQ) to avoid such behavior.
> 
> But qemu does not provide an IO queue where we can set scheduling priorities?

QEMU block jobs support rate-limiting.  Set it to 10-20% of the disk's
throughput and the slowness should go away but the backup takes longer.

Stefan
Dietmar Maurer - Feb. 25, 2013, 3:49 p.m.
> > We previously used LVM and run backup with 'idle' IO priority (CFQ) to avoid
> such behavior.
> >
> > But qemu does not provide an IO queue where we can set scheduling
> priorities?
> 
> QEMU block jobs support rate-limiting.  Set it to 10-20% of the disk's throughput
> and the slowness should go away but the backup takes longer.

Sorry, but I don't want to set a rate limit. I want to set an IO priority - this is something different.
My solution does not delay the backup.
Stefan Hajnoczi - Feb. 26, 2013, 4:49 p.m.
On Mon, Feb 25, 2013 at 03:49:11PM +0000, Dietmar Maurer wrote:
> > > We previously used LVM and run backup with 'idle' IO priority (CFQ) to avoid
> > such behavior.
> > >
> > > But qemu does not provide an IO queue where we can set scheduling
> > priorities?
> > 
> > QEMU block jobs support rate-limiting.  Set it to 10-20% of the disk's throughput
> > and the slowness should go away but the backup takes longer.
> 
> Sorry, but I don't want to set a rate limit. I want to set an IO priority - this is something different.
> My solution does not delay the backup.

QEMU is not in a position to implement global scheduling (priorities).
The QEMU process only knows about one guest.  If you have multiple
guests on the host an I/O "priority" inside QEMU process A will not take
into account QEMU process B.

Your solution *does* delay backup by 1 ms each iteration :).

Stefan
Stefan Hajnoczi - Feb. 26, 2013, 4:52 p.m.
On Mon, Feb 25, 2013 at 03:49:11PM +0000, Dietmar Maurer wrote:
> > > We previously used LVM and run backup with 'idle' IO priority (CFQ) to avoid
> > such behavior.
> > >
> > > But qemu does not provide an IO queue where we can set scheduling
> > priorities?
> > 
> > QEMU block jobs support rate-limiting.  Set it to 10-20% of the disk's throughput
> > and the slowness should go away but the backup takes longer.
> 
> Sorry, but I don't want to set a rate limit. I want to set an IO priority - this is something different.
> My solution does not delay the backup.

I just reread the code and noticed the delay is 1 microsecond, not 1
millisecond.

The problem is that this is a magic value.  It worked on your machine
with your workload, but there's no guarantee it works on anyone else's
machine or workload.  We can't depend on magic values like this.

If you want to implement an alternative to rate-limiting, please do it
in a separate patch series and make it work for all block job types.

Stefan
Dietmar Maurer - Feb. 26, 2013, 4:57 p.m.
> QEMU is not in a position to implement global scheduling (priorities).
> The QEMU process only knows about one guest.  If you have multiple guests on
> the host an I/O "priority" inside QEMU process A will not take into account
> QEMU process B.

I only want priorities inside the guest. The backup job should have lower priority
that the guest itself. Global priorities are something else.
Dietmar Maurer - Feb. 26, 2013, 5:02 p.m.
> I just reread the code and noticed the delay is 1 microsecond, not 1 millisecond.

Yes, It does not really add a delay.

> The problem is that this is a magic value.  It worked on your machine with your
> workload, but there's no guarantee it works on anyone else's machine or
> workload.  We can't depend on magic values like this.
> 
> If you want to implement an alternative to rate-limiting, please do it in a
> separate patch series and make it work for all block job types.

Well, I hoped you can see how to fix that. I guess the same applies for other types of block
jobs, and I simply found no other solution.

Adding and IO queue and implement a scheduler is likely not what we want to do.
Stefan Hajnoczi - Feb. 27, 2013, 1:26 p.m.
On Tue, Feb 26, 2013 at 05:02:19PM +0000, Dietmar Maurer wrote:
> > If you want to implement an alternative to rate-limiting, please do it in a
> > separate patch series and make it work for all block job types.
> 
> Well, I hoped you can see how to fix that. I guess the same applies for other types of block
> jobs, and I simply found no other solution.
> 
> Adding and IO queue and implement a scheduler is likely not what we want to do. 

The starting point for something smarter than just no policy or
rate-limiting is BlockDriverState->tracked_requests.  block.c keeps
track of active requests using this list.

The simplest policy using bs->tracked_requests is to only submit block
job I/O requests when the list is empty.

Want to try this approach?  (The drawback is that the guest can starve
the block job by submitting I/O all the time.)

Stefan
Dietmar Maurer - Feb. 27, 2013, 3:38 p.m.
> > Adding and IO queue and implement a scheduler is likely not what we want to
> do.
> 
> The starting point for something smarter than just no policy or rate-limiting is
> BlockDriverState->tracked_requests.  block.c keeps track of active requests
> using this list.
> 
> The simplest policy using bs->tracked_requests is to only submit block job I/O
> requests when the list is empty.
> 
> Want to try this approach?  (The drawback is that the guest can starve the block
> job by submitting I/O all the time.)

Already tried that, but the 1000ns delay behaves better (again, just a feeling, because
I found no way to measure that behaviour).
Kevin Wolf - Feb. 28, 2013, 11:54 a.m.
Am 27.02.2013 um 16:38 hat Dietmar Maurer geschrieben:
> > > Adding and IO queue and implement a scheduler is likely not what we want to
> > do.
> > 
> > The starting point for something smarter than just no policy or rate-limiting is
> > BlockDriverState->tracked_requests.  block.c keeps track of active requests
> > using this list.
> > 
> > The simplest policy using bs->tracked_requests is to only submit block job I/O
> > requests when the list is empty.
> > 
> > Want to try this approach?  (The drawback is that the guest can starve the block
> > job by submitting I/O all the time.)
> 
> Already tried that, but the 1000ns delay behaves better (again, just a feeling, because
> I found no way to measure that behaviour).

Maybe mix the approaches? If there is no guest I/O going on, run the
backup at full speed, but if there is, then rate-limit it?

Kevin

Patch

diff --git a/Makefile.objs b/Makefile.objs
index a68cdac..df64f70 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -13,6 +13,7 @@  block-obj-$(CONFIG_POSIX) += aio-posix.o
 block-obj-$(CONFIG_WIN32) += aio-win32.o
 block-obj-y += block/
 block-obj-y += qapi-types.o qapi-visit.o
+block-obj-y += backup.o
 
 block-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
 block-obj-y += qemu-coroutine-sleep.o
diff --git a/backup.c b/backup.c
new file mode 100644
index 0000000..c9576d5
--- /dev/null
+++ b/backup.c
@@ -0,0 +1,338 @@ 
+/*
+ * QEMU backup
+ *
+ * Copyright (C) 2013 Proxmox Server Solutions
+ *
+ * Authors:
+ *  Dietmar Maurer (dietmar@proxmox.com)
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <stdio.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "block/block.h"
+#include "block/block_int.h"
+#include "block/blockjob.h"
+#include "qemu/ratelimit.h"
+#include "backup.h"
+
+#define DEBUG_BACKUP 0
+
+#define DPRINTF(fmt, ...) \
+    do { if (DEBUG_BACKUP) { printf("backup: " fmt, ## __VA_ARGS__); } } \
+    while (0)
+
+
+#define SLICE_TIME 100000000ULL /* ns */
+
+typedef struct BackupBlockJob {
+    BlockJob common;
+    RateLimit limit;
+    uint64_t sectors_read;
+    unsigned long *bitmap;
+    int bitmap_size;
+    BackupDumpFunc *backup_dump_cb;
+    BlockDriverCompletionFunc *backup_complete_cb;
+    void *opaque;
+} BackupBlockJob;
+
+static int backup_get_bitmap(BackupBlockJob *job, int64_t cluster_num)
+{
+    assert(job);
+    assert(job->bitmap);
+
+    unsigned long val, idx, bit;
+
+    idx = cluster_num / BITS_PER_LONG;
+
+    assert(job->bitmap_size > idx);
+
+    bit = cluster_num % BITS_PER_LONG;
+    val = job->bitmap[idx];
+
+    return !!(val & (1UL << bit));
+}
+
+static void backup_set_bitmap(BackupBlockJob *job, int64_t cluster_num,
+                              int dirty)
+{
+    assert(job);
+    assert(job->bitmap);
+
+    unsigned long val, idx, bit;
+
+    idx = cluster_num / BITS_PER_LONG;
+
+    assert(job->bitmap_size > idx);
+
+    bit = cluster_num % BITS_PER_LONG;
+    val = job->bitmap[idx];
+    if (dirty) {
+        if (!(val & (1UL << bit))) {
+            val |= 1UL << bit;
+        }
+    } else {
+        if (val & (1UL << bit)) {
+            val &= ~(1UL << bit);
+        }
+    }
+    job->bitmap[idx] = val;
+}
+
+static int backup_in_progress_count;
+
+static int coroutine_fn backup_do_cow(BlockDriverState *bs,
+                                      int64_t sector_num, int nb_sectors)
+{
+    assert(bs);
+    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    assert(job);
+
+    BlockDriver *drv = bs->drv;
+    struct iovec iov;
+    QEMUIOVector bounce_qiov;
+    void *bounce_buffer = NULL;
+    int ret = 0;
+
+    backup_in_progress_count++;
+
+    int64_t start, end;
+
+    start = sector_num / BACKUP_BLOCKS_PER_CLUSTER;
+    end = (sector_num + nb_sectors + BACKUP_BLOCKS_PER_CLUSTER - 1) /
+        BACKUP_BLOCKS_PER_CLUSTER;
+
+    DPRINTF("brdv_co_backup_cow enter %s C%zd %zd %d\n",
+            bdrv_get_device_name(bs), start, sector_num, nb_sectors);
+
+    for (; start < end; start++) {
+        if (backup_get_bitmap(job, start)) {
+            DPRINTF("brdv_co_backup_cow skip C%zd\n", start);
+            continue; /* already copied */
+        }
+
+        /* immediately set bitmap (avoid coroutine race) */
+        backup_set_bitmap(job, start, 1);
+
+        DPRINTF("brdv_co_backup_cow C%zd\n", start);
+
+        if (!bounce_buffer) {
+            iov.iov_len = BACKUP_CLUSTER_SIZE;
+            iov.iov_base = bounce_buffer = qemu_blockalign(bs, iov.iov_len);
+            qemu_iovec_init_external(&bounce_qiov, &iov, 1);
+        }
+
+        ret = drv->bdrv_co_readv(bs, start * BACKUP_BLOCKS_PER_CLUSTER,
+                                 BACKUP_BLOCKS_PER_CLUSTER,
+                                 &bounce_qiov);
+
+        job->sectors_read += BACKUP_BLOCKS_PER_CLUSTER;
+
+        if (ret < 0) {
+            DPRINTF("brdv_co_backup_cow bdrv_read C%zd failed\n", start);
+            goto out;
+        }
+
+        ret = job->backup_dump_cb(job->opaque, bs, start, bounce_buffer);
+        if (ret < 0) {
+            DPRINTF("brdv_co_backup_cow dump_cluster_cb C%zd failed\n", start);
+            goto out;
+        }
+
+        DPRINTF("brdv_co_backup_cow done C%zd\n", start);
+    }
+
+out:
+    if (bounce_buffer) {
+        qemu_vfree(bounce_buffer);
+    }
+
+    backup_in_progress_count--;
+
+    return ret;
+}
+
+static int coroutine_fn backup_before_read(BlockDriverState *bs,
+                                           int64_t sector_num,
+                                           int nb_sectors, QEMUIOVector *qiov)
+{
+    return backup_do_cow(bs, sector_num, nb_sectors);
+}
+
+static int coroutine_fn backup_before_write(BlockDriverState *bs,
+                                            int64_t sector_num,
+                                            int nb_sectors, QEMUIOVector *qiov)
+{
+    return backup_do_cow(bs, sector_num, nb_sectors);
+}
+
+static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+    if (speed < 0) {
+        error_set(errp, QERR_INVALID_PARAMETER, "speed");
+        return;
+    }
+    ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME);
+}
+
+static BlockJobType backup_job_type = {
+    .instance_size = sizeof(BackupBlockJob),
+    .before_read = backup_before_read,
+    .before_write = backup_before_write,
+    .job_type = "backup",
+    .set_speed = backup_set_speed,
+};
+
+static void coroutine_fn backup_run(void *opaque)
+{
+    BackupBlockJob *job = opaque;
+    BlockDriverState *bs = job->common.bs;
+    assert(bs);
+
+    int64_t start, end;
+
+    start = 0;
+    end = (bs->total_sectors + BACKUP_BLOCKS_PER_CLUSTER - 1) /
+        BACKUP_BLOCKS_PER_CLUSTER;
+
+    DPRINTF("backup_run start %s %zd %zd\n", bdrv_get_device_name(bs),
+            start, end);
+
+    int ret = 0;
+
+    for (; start < end; start++) {
+        if (block_job_is_cancelled(&job->common)) {
+            ret = -1;
+            break;
+        }
+
+        /* we need to yield so that qemu_aio_flush() returns.
+         * (without, VM does not reboot)
+        * Note: use 1000 instead of 0 (0 prioritize this task too much)
+         */
+        if (job->common.speed) {
+            uint64_t delay_ns = ratelimit_calculate_delay(
+                &job->limit, job->sectors_read);
+            job->sectors_read = 0;
+            block_job_sleep_ns(&job->common, rt_clock, delay_ns);
+        } else {
+            block_job_sleep_ns(&job->common, rt_clock, 1000);
+        }
+
+        if (block_job_is_cancelled(&job->common)) {
+            ret = -1;
+            break;
+        }
+
+        if (backup_get_bitmap(job, start)) {
+            continue; /* already copied */
+        }
+
+        DPRINTF("backup_run loop C%zd\n", start);
+
+        /**
+         * This triggers a cluster copy
+         * Note: avoid direct call to brdv_co_backup_cow, because
+         * this does not call tracked_request_begin()
+         */
+        ret = bdrv_co_backup(bs, start*BACKUP_BLOCKS_PER_CLUSTER, 1);
+        if (ret < 0) {
+            break;
+        }
+        /* Publish progress */
+        job->common.offset += BACKUP_CLUSTER_SIZE;
+    }
+
+    while (backup_in_progress_count > 0) {
+        DPRINTF("backup_run backup_in_progress_count != 0 (%d)",
+                backup_in_progress_count);
+        block_job_sleep_ns(&job->common, rt_clock, 10000);
+
+    }
+
+    DPRINTF("backup_run complete %d\n", ret);
+    block_job_completed(&job->common, ret);
+}
+
+static void backup_job_cleanup_cb(void *opaque, int ret)
+{
+    BlockDriverState *bs = opaque;
+    assert(bs);
+    BackupBlockJob *job = (BackupBlockJob *)bs->job;
+    assert(job);
+
+    DPRINTF("backup_job_cleanup_cb start %d\n", ret);
+
+    job->backup_complete_cb(job->opaque, ret);
+
+    DPRINTF("backup_job_cleanup_cb end\n");
+
+    g_free(job->bitmap);
+}
+
+void
+backup_job_start(BlockDriverState *bs, bool cancel)
+{
+    assert(bs);
+    assert(bs->job);
+    assert(bs->job->co == NULL);
+
+    if (cancel) {
+        block_job_cancel(bs->job); /* set cancel flag */
+    }
+
+    bs->job->co = qemu_coroutine_create(backup_run);
+    qemu_coroutine_enter(bs->job->co, bs->job);
+}
+
+int
+backup_job_create(BlockDriverState *bs, BackupDumpFunc *backup_dump_cb,
+                  BlockDriverCompletionFunc *backup_complete_cb,
+                  void *opaque, int64_t speed)
+{
+    assert(bs);
+    assert(backup_dump_cb);
+    assert(backup_complete_cb);
+
+    if (bs->job) {
+        DPRINTF("bdrv_backup_init failed - running job on %s\n",
+                bdrv_get_device_name(bs));
+        return -1;
+    }
+
+    int64_t bitmap_size;
+    const char *devname = bdrv_get_device_name(bs);
+
+    if (!devname || !devname[0]) {
+        return -1;
+    }
+
+    DPRINTF("bdrv_backup_init %s\n", bdrv_get_device_name(bs));
+
+    Error *errp;
+    BackupBlockJob *job = block_job_create(&backup_job_type, bs, speed,
+                                           backup_job_cleanup_cb, bs, &errp);
+
+    job->common.cluster_size = BACKUP_CLUSTER_SIZE;
+
+    bitmap_size = bs->total_sectors +
+        BACKUP_BLOCKS_PER_CLUSTER * BITS_PER_LONG - 1;
+    bitmap_size /= BACKUP_BLOCKS_PER_CLUSTER * BITS_PER_LONG;
+
+    job->backup_dump_cb = backup_dump_cb;
+    job->backup_complete_cb = backup_complete_cb;
+    job->opaque = opaque;
+    job->bitmap_size = bitmap_size;
+    job->bitmap = g_new0(unsigned long, bitmap_size);
+
+    job->common.len = bs->total_sectors*BDRV_SECTOR_SIZE;
+
+    return 0;
+}
diff --git a/backup.h b/backup.h
new file mode 100644
index 0000000..d9395bc
--- /dev/null
+++ b/backup.h
@@ -0,0 +1,32 @@ 
+/*
+ * QEMU backup related definitions
+ *
+ * Copyright (C) 2013 Proxmox Server Solutions
+ *
+ * Authors:
+ *  Dietmar Maurer (dietmar@proxmox.com)
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_BACKUP_H
+#define QEMU_BACKUP_H
+
+#include <uuid/uuid.h>
+
+#define BACKUP_CLUSTER_BITS 16
+#define BACKUP_CLUSTER_SIZE (1<<BACKUP_CLUSTER_BITS)
+#define BACKUP_BLOCKS_PER_CLUSTER (BACKUP_CLUSTER_SIZE/BDRV_SECTOR_SIZE)
+
+typedef int BackupDumpFunc(void *opaque, BlockDriverState *bs,
+                           int64_t cluster_num, unsigned char *buf);
+
+void backup_job_start(BlockDriverState *bs, bool cancel);
+
+int backup_job_create(BlockDriverState *bs, BackupDumpFunc *backup_dump_cb,
+                      BlockDriverCompletionFunc *backup_complete_cb,
+                      void *opaque, int64_t speed);
+
+#endif /* QEMU_BACKUP_H */
diff --git a/block.c b/block.c
index 50dab8e..6e6d08f 100644
--- a/block.c
+++ b/block.c
@@ -54,6 +54,7 @@ 
 typedef enum {
     BDRV_REQ_COPY_ON_READ = 0x1,
     BDRV_REQ_ZERO_WRITE   = 0x2,
+    BDRV_REQ_BACKUP_ONLY  = 0x4,
 } BdrvRequestFlags;
 
 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
@@ -1554,7 +1555,7 @@  int bdrv_commit(BlockDriverState *bs)
 
     if (!drv)
         return -ENOMEDIUM;
-    
+
     if (!bs->backing_hd) {
         return -ENOTSUP;
     }
@@ -1691,6 +1692,22 @@  void bdrv_round_to_clusters(BlockDriverState *bs,
     }
 }
 
+/**
+ * Round a region to job cluster boundaries
+ */
+static void round_to_job_clusters(BlockDriverState *bs,
+                                  int64_t sector_num, int nb_sectors,
+                                  int job_cluster_size,
+                                  int64_t *cluster_sector_num,
+                                  int *cluster_nb_sectors)
+{
+    int64_t c = job_cluster_size/BDRV_SECTOR_SIZE;
+
+    *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
+    *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num +
+                                        nb_sectors, c);
+}
+
 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
                                      int64_t sector_num, int nb_sectors) {
     /*        aaaa   bbbb */
@@ -1705,7 +1722,9 @@  static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 }
 
 static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
-        int64_t sector_num, int nb_sectors)
+                                                       int64_t sector_num,
+                                                       int nb_sectors,
+                                                       int job_cluster_size)
 {
     BdrvTrackedRequest *req;
     int64_t cluster_sector_num;
@@ -1721,6 +1740,11 @@  static void coroutine_fn wait_for_overlapping_requests(BlockDriverState *bs,
     bdrv_round_to_clusters(bs, sector_num, nb_sectors,
                            &cluster_sector_num, &cluster_nb_sectors);
 
+    if (job_cluster_size) {
+        round_to_job_clusters(bs, sector_num, nb_sectors, job_cluster_size,
+                              &cluster_sector_num, &cluster_nb_sectors);
+    }
+
     do {
         retry = false;
         QLIST_FOREACH(req, &bs->tracked_requests, list) {
@@ -2260,12 +2284,24 @@  static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs,
         bs->copy_on_read_in_flight++;
     }
 
-    if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+    int job_cluster_size = bs->job && bs->job->cluster_size ?
+        bs->job->cluster_size : 0;
+
+    if (bs->copy_on_read_in_flight || job_cluster_size) {
+        wait_for_overlapping_requests(bs, sector_num, nb_sectors,
+                                      job_cluster_size);
     }
 
     tracked_request_begin(&req, bs, sector_num, nb_sectors, false);
 
+    if (bs->job && bs->job->job_type->before_read) {
+        ret = bs->job->job_type->before_read(bs, sector_num, nb_sectors, qiov);
+        if ((ret < 0) || (flags & BDRV_REQ_BACKUP_ONLY)) {
+            /* Note: We do not return any data to the caller */
+            goto out;
+        }
+    }
+
     if (flags & BDRV_REQ_COPY_ON_READ) {
         int pnum;
 
@@ -2309,6 +2345,17 @@  int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
                             BDRV_REQ_COPY_ON_READ);
 }
 
+int coroutine_fn bdrv_co_backup(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors)
+{
+    if (!bs->job) {
+        return -ENOTSUP;
+    }
+
+    return bdrv_co_do_readv(bs, sector_num, nb_sectors, NULL,
+                            BDRV_REQ_BACKUP_ONLY);
+}
+
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors)
 {
@@ -2366,12 +2413,23 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         bdrv_io_limits_intercept(bs, true, nb_sectors);
     }
 
-    if (bs->copy_on_read_in_flight) {
-        wait_for_overlapping_requests(bs, sector_num, nb_sectors);
+    int job_cluster_size = bs->job && bs->job->cluster_size ?
+        bs->job->cluster_size : 0;
+
+    if (bs->copy_on_read_in_flight || job_cluster_size) {
+        wait_for_overlapping_requests(bs, sector_num, nb_sectors,
+                                      job_cluster_size);
     }
 
     tracked_request_begin(&req, bs, sector_num, nb_sectors, true);
 
+    if (bs->job && bs->job->job_type->before_write) {
+        ret = bs->job->job_type->before_write(bs, sector_num, nb_sectors, qiov);
+        if (ret < 0) {
+            goto out;
+        }
+    }
+
     if (flags & BDRV_REQ_ZERO_WRITE) {
         ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
     } else {
@@ -2390,6 +2448,7 @@  static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs,
         bs->wr_highest_sector = sector_num + nb_sectors - 1;
     }
 
+out:
     tracked_request_end(&req);
 
     return ret;
diff --git a/include/block/block.h b/include/block/block.h
index 5c3b911..b6144be 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -172,6 +172,8 @@  int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs,
     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov);
+int coroutine_fn bdrv_co_backup(BlockDriverState *bs,
+    int64_t sector_num, int nb_sectors);
 int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num,
     int nb_sectors, QEMUIOVector *qiov);
 /*
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index c290d07..6f42495 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -50,6 +50,13 @@  typedef struct BlockJobType {
      * manually.
      */
     void (*complete)(BlockJob *job, Error **errp);
+
+    /** tracked requests */
+    int coroutine_fn (*before_read)(BlockDriverState *bs, int64_t sector_num,
+                                    int nb_sectors, QEMUIOVector *qiov);
+    int coroutine_fn (*before_write)(BlockDriverState *bs, int64_t sector_num,
+                                     int nb_sectors, QEMUIOVector *qiov);
+
 } BlockJobType;
 
 /**
@@ -103,6 +110,9 @@  struct BlockJob {
     /** Speed that was set with @block_job_set_speed.  */
     int64_t speed;
 
+    /** tracked requests */
+    int cluster_size;
+
     /** The completion function that will be called when the job completes.  */
     BlockDriverCompletionFunc *cb;