Patchwork [2/3] live block copy

login
register
mail settings
Submitter Marcelo Tosatti
Date Dec. 16, 2010, 5:44 p.m.
Message ID <20101216174810.695380191@redhat.com>
Download mbox | patch
Permalink /patch/75793/
State New
Headers show

Comments

Marcelo Tosatti - Dec. 16, 2010, 5:44 p.m.
Add support for live block copy.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Stefan Hajnoczi - Jan. 14, 2011, 10:46 a.m.
On Thu, Dec 16, 2010 at 5:44 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> Add support for live block copy.
>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Are you still pushing this, are you looking for reviews?

Stefan
Marcelo Tosatti - Jan. 14, 2011, 12:20 p.m.
On Fri, Jan 14, 2011 at 10:46:53AM +0000, Stefan Hajnoczi wrote:
> On Thu, Dec 16, 2010 at 5:44 PM, Marcelo Tosatti <mtosatti@redhat.com> wrote:
> > Add support for live block copy.
> >
> > Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
> 
> Are you still pushing this, are you looking for reviews?
> 
> Stefan

Reviews are welcome. The known issues, which will be fixed in the next
submission, are:

- Interaction with device hotplug.
- Use a context different than timer to issue AIOs, if possible.

Patch

Index: qemu-kvm/block-copy.c
===================================================================
--- /dev/null
+++ qemu-kvm/block-copy.c
@@ -0,0 +1,728 @@ 
+/*
+ * QEMU live block copy
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "block_int.h"
+#include "qemu-queue.h"
+#include "qemu-timer.h"
+#include "monitor.h"
+#include "block-copy.h"
+#include "migration.h"
+#include "sysemu.h"
+#include "qjson.h"
+#include <assert.h>
+
+#define BLOCK_SIZE (BDRV_SECTORS_PER_DIRTY_CHUNK << BDRV_SECTOR_BITS)
+#define MAX_IS_ALLOCATED_SEARCH 65536
+
+/*
+ * Stages:
+ *
+ * STAGE_BULK: bulk reads/writes in progress
+ * STAGE_BULK_FINISHED: bulk reads finished, bulk writes in progress
+ * STAGE_DIRTY: bulk writes finished, dirty reads/writes in progress
+ * STAGE_SWITCH_FINISHED: switched to new image.
+ */
+
+/*
+ * Progress of one block copy job.  The numeric order is significant:
+ * code in this file compares stages with < and >.
+ */
+enum BdrvCopyStage {
+    STAGE_BULK,             /* bulk reads/writes in progress */
+    STAGE_BULK_FINISHED,    /* bulk reads finished, bulk writes in progress */
+    STAGE_DIRTY,            /* dirty reads/writes in progress */
+    STAGE_SWITCH_FINISHED,  /* switched to new image */
+};
+
+/* State of one live block copy job (one entry on block_copy_list). */
+typedef struct BdrvCopyState {
+    BlockDriverState *src;      /* device being copied */
+    BlockDriverState *dst;      /* destination image */
+    bool shared_base;           /* bulk stage skips unallocated ranges */
+
+    int64_t curr_sector;        /* scan cursor of the bulk/dirty passes */
+    int64_t completed_sectors;  /* bulk progress, used for "percentage" */
+    int64_t nr_sectors;         /* source length in sectors */
+
+    enum BdrvCopyStage stage;
+    int inflight_reads;         /* reads issued and not yet completed */
+    int error;                  /* non-zero once an I/O request failed */
+    int failed;                 /* set once the job has been given up */
+    int cancelled;              /* set by do_bdrv_copy_cancel() */
+    QLIST_HEAD(, BdrvCopyBlock) io_list;    /* outstanding read/write blocks */
+    unsigned long *aio_bitmap;  /* one bit per dirty chunk with I/O in flight */
+    QEMUTimer *aio_timer;       /* runs aio_timer() to drive the job */
+    QLIST_ENTRY(BdrvCopyState) list;        /* link on block_copy_list */
+
+    /* completed writes and their summed latency (qemu_get_clock_ns deltas) */
+    int64_t blocks;
+    int64_t total_time;
+
+    char src_device_name[32];
+    char dst_filename[1024];
+    int commit_fd;              /* commit file descriptor; 0 means none */
+} BdrvCopyState;
+
+/* One outstanding read or write request of a copy job. */
+typedef struct BdrvCopyBlock {
+    BdrvCopyState *state;       /* owning job */
+    uint8_t *buf;               /* data buffer; handed from read to write */
+    int64_t sector;             /* first sector of the request */
+    int64_t nr_sectors;         /* request length in sectors */
+    struct iovec iov;           /* single element describing buf */
+    QEMUIOVector qiov;          /* wraps iov for the AIO call */
+    BlockDriverAIOCB *aiocb;
+    int64_t time;               /* qemu_get_clock_ns() when the read was issued */
+    QLIST_ENTRY(BdrvCopyBlock) list;    /* link on state->io_list */
+} BdrvCopyBlock;
+
+/* All block copy jobs, including failed and completed ones not yet freed. */
+static QLIST_HEAD(, BdrvCopyState) block_copy_list =
+    QLIST_HEAD_INITIALIZER(block_copy_list);
+
+/*
+ * Allocate the zeroed in-flight bitmap for @s: one bit per dirty chunk of
+ * the source image.
+ *
+ * The bitmap is read and written as an array of unsigned long, so the
+ * allocation is rounded up to a whole number of words; sizing it in bytes
+ * would let accesses to the last word run past the end of the buffer.
+ * At least one word is allocated so an empty image does not request a
+ * zero-sized allocation.
+ */
+static void alloc_aio_bitmap(BdrvCopyState *s)
+{
+    const int64_t bits_per_word = sizeof(unsigned long) * 8;
+    int64_t total_sectors = bdrv_getlength(s->src) >> BDRV_SECTOR_BITS;
+    int64_t chunks, words;
+
+    if (total_sectors < 0) {
+        total_sectors = 0;
+    }
+
+    chunks = (total_sectors + BDRV_SECTORS_PER_DIRTY_CHUNK - 1) /
+             BDRV_SECTORS_PER_DIRTY_CHUNK;
+    words = (chunks + bits_per_word - 1) / bits_per_word;
+    if (words < 1) {
+        words = 1;
+    }
+
+    s->aio_bitmap = qemu_mallocz(words * sizeof(unsigned long));
+}
+
+/*
+ * Tell whether the dirty chunk containing @sector has its in-flight bit
+ * set.  Returns false when no bitmap has been allocated or when @sector
+ * is at or beyond the end of the source image.
+ */
+static bool aio_inflight(BdrvCopyState *s, int64_t sector)
+{
+    const size_t bits_per_word = sizeof(unsigned long) * 8;
+    int64_t chunk_nr = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    if (!s->aio_bitmap) {
+        return 0;
+    }
+    if ((sector << BDRV_SECTOR_BITS) >= bdrv_getlength(s->src)) {
+        return 0;
+    }
+
+    return !!(s->aio_bitmap[chunk_nr / bits_per_word] &
+              (1UL << (chunk_nr % bits_per_word)));
+}
+
+/*
+ * Set (@set != 0) or clear (@set == 0) the in-flight bit of every dirty
+ * chunk touched by the sector range [@sector_num, @sector_num + @nb_sectors).
+ */
+static void set_aio_inflight(BdrvCopyState *s, int64_t sector_num,
+                             int nb_sectors, int set)
+{
+    int64_t chunk = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK;
+    int64_t last = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK;
+
+    while (chunk <= last) {
+        unsigned long word = chunk / (sizeof(unsigned long) * 8);
+        unsigned long shift = chunk % (sizeof(unsigned long) * 8);
+        unsigned long mask = 1UL << shift;
+
+        if (set) {
+            s->aio_bitmap[word] |= mask;
+        } else {
+            s->aio_bitmap[word] &= ~mask;
+        }
+        chunk++;
+    }
+}
+
+/*
+ * Record @stage in @s and raise the matching blkdebug event on the
+ * destination's file.  An unknown stage value is stored without an event.
+ */
+static void blkcopy_set_stage(BdrvCopyState *s, enum BdrvCopyStage stage)
+{
+    BlkDebugEvent event;
+
+    s->stage = stage;
+
+    switch (stage) {
+    case STAGE_BULK:
+        event = BLKDBG_BLKCOPY_STAGE_BULK;
+        break;
+    case STAGE_BULK_FINISHED:
+        event = BLKDBG_BLKCOPY_STAGE_BULK_FINISHED;
+        break;
+    case STAGE_DIRTY:
+        event = BLKDBG_BLKCOPY_STAGE_DIRTY;
+        break;
+    case STAGE_SWITCH_FINISHED:
+        event = BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED;
+        break;
+    default:
+        return;
+    }
+
+    BLKDBG_EVENT(s->dst->file, event);
+}
+
+/*
+ * Called from an AIO completion that failed: remember the error code and
+ * make the job timer fire right away so aio_timer() sees it.
+ */
+static void blk_copy_handle_cb_error(BdrvCopyState *s, int ret)
+{
+    QEMUTimer *timer = s->aio_timer;
+
+    s->error = ret;
+    qemu_mod_timer(timer, qemu_get_clock(rt_clock));
+}
+
+/* Account one more completed block whose transfer took @time. */
+static inline void add_avg_transfer_time(BdrvCopyState *s, int64_t time)
+{
+    s->total_time += time;
+    s->blocks += 1;
+}
+
+/*
+ * Completion callback for a write to the destination image.
+ *
+ * The block is always unlinked from the job's io_list and released.  On
+ * success the transfer time is accounted, the timer is kicked while the
+ * job waits for the last bulk write, and in later stages the chunk's
+ * in-flight bits are cleared.  On failure the error is recorded.
+ */
+static void blk_copy_write_cb(void *opaque, int ret)
+{
+    BdrvCopyBlock *wr = opaque;
+    BdrvCopyState *s = wr->state;
+
+    QLIST_REMOVE(wr, list);
+
+    if (ret >= 0) {
+        add_avg_transfer_time(s, qemu_get_clock_ns(rt_clock) - wr->time);
+
+        /* schedule switch to STAGE_DIRTY on last bulk write completion */
+        if (s->stage == STAGE_BULK_FINISHED) {
+            qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
+        }
+
+        if (s->stage > STAGE_BULK_FINISHED) {
+            set_aio_inflight(s, wr->sector, wr->nr_sectors, 0);
+        }
+    }
+
+    qemu_free(wr->buf);
+    qemu_free(wr);
+
+    if (ret < 0) {
+        blk_copy_handle_cb_error(s, ret);
+    }
+}
+
+/*
+ * Submit the data of the finished read @read_blk as an asynchronous
+ * write to the destination image.
+ *
+ * A new block takes over @read_blk's buffer; @read_blk itself is left for
+ * the caller to release.  If the write cannot be submitted, s->error is
+ * set and both the new block and the buffer are freed here.
+ */
+static void blk_copy_issue_write(BdrvCopyState *s, BdrvCopyBlock *read_blk)
+{
+    BdrvCopyBlock *wr = qemu_mallocz(sizeof(BdrvCopyBlock));
+
+    wr->state = s;
+    wr->sector = read_blk->sector;
+    wr->nr_sectors = read_blk->nr_sectors;
+    wr->time = read_blk->time;
+    wr->buf = read_blk->buf;
+    QLIST_INSERT_HEAD(&s->io_list, wr, list);
+
+    wr->iov.iov_base = read_blk->buf;
+    wr->iov.iov_len = read_blk->iov.iov_len;
+    qemu_iovec_init_external(&wr->qiov, &wr->iov, 1);
+
+    BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_AIO_WRITE);
+    wr->aiocb = bdrv_aio_writev(s->dst, wr->sector, &wr->qiov,
+                                wr->iov.iov_len / BDRV_SECTOR_SIZE,
+                                blk_copy_write_cb, wr);
+    if (wr->aiocb) {
+        return;
+    }
+
+    /* submission failed: undo the bookkeeping and drop the data */
+    s->error = 1;
+    QLIST_REMOVE(wr, list);
+    qemu_free(read_blk->buf);
+    qemu_free(wr);
+}
+
+/*
+ * Completion callback for a read from the source image.
+ *
+ * On success the data is passed on as a write, the read block is
+ * released (its buffer now belongs to the write) and the timer is kicked
+ * so more reads can be issued.  On failure block and buffer are freed and
+ * the error is recorded.
+ */
+static void blk_copy_read_cb(void *opaque, int ret)
+{
+    BdrvCopyBlock *rd = opaque;
+    BdrvCopyState *s = rd->state;
+
+    s->inflight_reads -= 1;
+
+    if (ret >= 0) {
+        blk_copy_issue_write(s, rd);
+        QLIST_REMOVE(rd, list);
+        qemu_free(rd);
+        qemu_mod_timer(s->aio_timer, qemu_get_clock(rt_clock));
+        return;
+    }
+
+    QLIST_REMOVE(rd, list);
+    qemu_free(rd->buf);
+    qemu_free(rd);
+    blk_copy_handle_cb_error(s, ret);
+}
+
+/*
+ * Submit an asynchronous read of @nr_sectors sectors starting at @sector
+ * from the source image into a freshly allocated BLOCK_SIZE buffer.
+ *
+ * The request is linked on the job's io_list and counted in
+ * inflight_reads.  If submission fails, s->error is set and all of that
+ * is undone again.
+ */
+static void blk_copy_issue_read(BdrvCopyState *s, int64_t sector,
+                                int nr_sectors)
+{
+    BdrvCopyBlock *rd = qemu_mallocz(sizeof(BdrvCopyBlock));
+
+    rd->buf = qemu_mallocz(BLOCK_SIZE);
+    rd->state = s;
+    rd->sector = sector;
+    rd->nr_sectors = nr_sectors;
+    QLIST_INSERT_HEAD(&s->io_list, rd, list);
+
+    rd->iov.iov_base = rd->buf;
+    rd->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE;
+    qemu_iovec_init_external(&rd->qiov, &rd->iov, 1);
+
+    s->inflight_reads++;
+    rd->time = qemu_get_clock_ns(rt_clock);
+    rd->aiocb = bdrv_aio_readv(s->src, sector, &rd->qiov, nr_sectors,
+                               blk_copy_read_cb, rd);
+    if (rd->aiocb) {
+        return;
+    }
+
+    /* submission failed: roll back the accounting and free the request */
+    s->error = 1;
+    s->inflight_reads--;
+    QLIST_REMOVE(rd, list);
+    qemu_free(rd->buf);
+    qemu_free(rd);
+}
+
+/*
+ * Decide whether the job may stop the guest and switch images now.
+ *
+ * True when nothing is dirty, when no transfer has been timed yet, or
+ * when the dirty count multiplied by the average transfer time does not
+ * exceed migrate_max_downtime().
+ */
+static bool blkcopy_can_switch(BdrvCopyState *s)
+{
+    int64_t dirty = bdrv_get_dirty_count(s->src);
+    int64_t avg_time;
+
+    if (dirty == 0 || s->blocks == 0) {
+        return true;
+    }
+
+    avg_time = s->total_time / s->blocks;
+    return (dirty * avg_time) <= migrate_max_downtime();
+}
+
+/*
+ * Scan forward from s->curr_sector for a chunk that is dirty and has no
+ * copy in flight, and issue a single read for it: the chunk's dirty bit
+ * is reset and its in-flight bit set.  When the scan reaches the end of
+ * the device the cursor wraps to sector 0.  Always returns 0.
+ */
+static int blk_issue_reads_dirty(BdrvCopyState *s)
+{
+    int64_t sector = s->curr_sector;
+
+    while (sector < s->nr_sectors) {
+        if (bdrv_get_dirty(s->src, sector) && !aio_inflight(s, sector)) {
+            int nr_sectors = MIN(s->nr_sectors - s->curr_sector,
+                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+            blk_copy_issue_read(s, sector, nr_sectors);
+            bdrv_reset_dirty(s->src, sector, nr_sectors);
+            set_aio_inflight(s, sector, nr_sectors, 1);
+            break;
+        }
+
+        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+        s->curr_sector = sector;
+    }
+
+    if (sector >= s->nr_sectors) {
+        s->curr_sector = 0;
+    }
+    return 0;
+}
+
+/*
+ * Issue the next read of the bulk pass.
+ *
+ * With shared_base set, ranges the source reports as unallocated are
+ * skipped first.  The resulting position is rounded down to a dirty-chunk
+ * boundary and one chunk is read from there; the read is shortened at the
+ * end of the device so it never extends past the image.
+ *
+ * Returns 1 when the whole device has been covered (the cursor is reset
+ * to 0), otherwise 0 after issuing one read.
+ */
+static int blk_issue_reads_bulk(BdrvCopyState *s)
+{
+    int nr_sectors;
+    int64_t curr_sector = s->curr_sector;
+
+    if (s->shared_base) {
+        while (curr_sector < s->nr_sectors &&
+                !bdrv_is_allocated(s->src, curr_sector,
+                                   MAX_IS_ALLOCATED_SEARCH, &nr_sectors)) {
+                curr_sector += nr_sectors;
+        }
+    }
+
+    if (curr_sector >= s->nr_sectors) {
+        s->curr_sector = 0;
+        return 1;
+    }
+
+    curr_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1);
+    /* the last chunk may be partial when the size is not chunk aligned */
+    nr_sectors = MIN(s->nr_sectors - curr_sector,
+                     BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+    /*
+     * Read at the position found above, not at the stale s->curr_sector,
+     * so that skipped unallocated ranges are really skipped.
+     */
+    blk_copy_issue_read(s, curr_sector, nr_sectors);
+    s->curr_sector = curr_sector + nr_sectors;
+    s->completed_sectors = curr_sector;
+    return 0;
+}
+
+/*
+ * Synchronously copy every chunk that is still dirty from source to
+ * destination, clearing its dirty bit afterwards.  The scan stops as soon
+ * as the dirty count reaches zero.  A failed read or write aborts the
+ * scan and sets s->error to 1.
+ */
+static void blkcopy_finish(BdrvCopyState *s)
+{
+    uint8_t *data = qemu_malloc(BLOCK_SIZE);
+    int64_t sector = 0;
+    bool io_failed = false;
+
+    /* FIXME: speed up loop, get_next_dirty_block? */
+    while (sector < s->nr_sectors) {
+        if (bdrv_get_dirty(s->src, sector)) {
+            int nr_sectors = MIN(s->nr_sectors - sector,
+                                 BDRV_SECTORS_PER_DIRTY_CHUNK);
+
+            memset(data, 0, BLOCK_SIZE);
+            if (bdrv_read(s->src, sector, data, nr_sectors) < 0 ||
+                bdrv_write(s->dst, sector, data, nr_sectors) < 0) {
+                io_failed = true;
+                break;
+            }
+            bdrv_reset_dirty(s->src, sector, nr_sectors);
+        }
+
+        if (bdrv_get_dirty_count(s->src) == 0) {
+            break;
+        }
+        sector += BDRV_SECTORS_PER_DIRTY_CHUNK;
+    }
+
+    qemu_free(data);
+    if (io_failed) {
+        s->error = 1;
+    }
+}
+
+/*
+ * Write the one-line commit record for @s to s->commit_fd and fsync it.
+ *
+ * Returns 0 on success or a negative errno value on failure.  A write()
+ * that makes no progress is reported as -EIO, because errno is not
+ * meaningful in that case and could otherwise turn into a 0 "success".
+ */
+static int write_commit_file(BdrvCopyState *s)
+{
+    char commit_msg[1400];
+    const char *buf = commit_msg;
+    int len;
+    ssize_t ret;
+
+    len = snprintf(commit_msg, sizeof(commit_msg),
+                   "commit QEMU block_copy %s -> %s\n", s->src_device_name,
+                   s->dst_filename);
+    if (len < 0 || (size_t)len >= sizeof(commit_msg)) {
+        return -EINVAL;
+    }
+
+    while (len > 0) {
+        ret = write(s->commit_fd, buf, len);
+        if (ret == -1) {
+            if (errno == EINTR) {
+                continue;
+            }
+            return -errno;
+        }
+        if (ret == 0) {
+            return -EIO;
+        }
+        buf += ret;
+        len -= ret;
+    }
+
+    if (fsync(s->commit_fd) == -1) {
+        return -errno;
+    }
+
+    return 0;
+}
+
+/*
+ * Release the runtime resources of a job that has no I/O outstanding:
+ * turn off dirty tracking on the source, free the in-flight bitmap,
+ * stop the timer and close the commit file.
+ *
+ * Safe to call more than once: the bitmap pointer and the descriptor are
+ * reset after release, so a repeated call (e.g. cancelling a job that
+ * already failed) neither double-frees nor closes an unrelated fd.
+ */
+static void blkcopy_cleanup(BdrvCopyState *s)
+{
+    assert(s->inflight_reads == 0);
+    assert(QLIST_EMPTY(&s->io_list));
+    bdrv_set_dirty_tracking(s->src, 0);
+
+    /* still NULL (state is zero-allocated) if STAGE_DIRTY was never reached */
+    qemu_free(s->aio_bitmap);
+    s->aio_bitmap = NULL;
+
+    qemu_del_timer(s->aio_timer);
+
+    /* 0 means "no commit file" throughout this file */
+    if (s->commit_fd) {
+        close(s->commit_fd);
+        s->commit_fd = 0;
+    }
+}
+
+/* Unlink the job @s from block_copy_list and release its memory. */
+static void blkcopy_free(BdrvCopyState *s)
+{
+    BdrvCopyState *job = s;
+
+    QLIST_REMOVE(job, list);
+    qemu_free(job);
+}
+
+/*
+ * React to a recorded I/O error.  Nothing happens while requests are
+ * still outstanding; once the io_list has drained the job is marked
+ * failed and cleaned up.
+ */
+static void handle_error(BdrvCopyState *s)
+{
+    if (QLIST_EMPTY(&s->io_list)) {
+        s->failed = 1;
+        blkcopy_cleanup(s);
+    }
+}
+
+/*
+ * Perform the final switch of the device to the destination image.
+ *
+ * The guest is stopped and all AIO flushed; the remaining dirty chunks
+ * are copied synchronously; both images are closed and the source
+ * BlockDriverState is reopened on the destination file.  If a commit
+ * file was requested, the commit record is written afterwards.  The
+ * guest is restarted on every path.
+ *
+ * If reopening or the commit write fails, the job is marked failed and
+ * the original source image is reopened; failure to do that aborts QEMU.
+ */
+static void blkcopy_switch(BdrvCopyState *s)
+{
+    char src_filename[1024];
+    int open_flags;
+
+    /*
+     * Save what is needed to fall back to the source image.  snprintf
+     * always NUL-terminates, unlike strncpy with a full-size bound.
+     */
+    snprintf(src_filename, sizeof(src_filename), "%s", s->src->filename);
+    open_flags = s->src->open_flags;
+
+    assert(s->stage == STAGE_DIRTY);
+
+    vm_stop(0);
+    /* flush any guest writes, dirty bitmap uptodate after this.
+     * copy AIO also finished.
+     */
+    qemu_aio_flush();
+    assert(QLIST_EMPTY(&s->io_list));
+    if (s->error) {
+        handle_error(s);
+        goto vm_start;
+    }
+    blkcopy_finish(s);
+    if (s->error) {
+        handle_error(s);
+        goto vm_start;
+    }
+    assert(bdrv_get_dirty_count(s->src) == 0);
+    bdrv_flush_all();
+    bdrv_close(s->src);
+    bdrv_close(s->dst);
+    /* use the flags saved before the close, same as the fallback path */
+    if (bdrv_open(s->src, s->dst->filename, open_flags, NULL) < 0) {
+        s->failed = 1;
+        goto err;
+    }
+    if (s->commit_fd && write_commit_file(s)) {
+        s->failed = 1;
+        bdrv_close(s->src);
+        goto err;
+    }
+
+    blkcopy_set_stage(s, STAGE_SWITCH_FINISHED);
+    blkcopy_cleanup(s);
+vm_start:
+    vm_start();
+    return;
+
+err:
+    if (bdrv_open(s->src, src_filename, open_flags, NULL) < 0) {
+        error_report("%s: %s: cannot fallback to source image\n", __func__,
+                     s->dst_filename);
+        abort();
+    }
+    blkcopy_cleanup(s);
+    goto vm_start;
+}
+
+#define BLKCOPY_INFLIGHT 2
+
+/*
+ * Timer callback that drives a copy job (@opaque is its BdrvCopyState).
+ *
+ * A recorded error is handled first.  Otherwise the job is advanced
+ * according to its stage: bulk reads are issued until BLKCOPY_INFLIGHT
+ * reads are outstanding or the device is covered; once all bulk I/O has
+ * drained the in-flight bitmap is allocated and the dirty stage begins;
+ * in the dirty stage reads are issued until the job may switch images.
+ *
+ * NOTE(review): neither loop re-checks s->error after issuing a read, so
+ * a read that fails at submission is only noticed on a later invocation
+ * or in blkcopy_switch() - confirm this is intended.
+ */
+static void aio_timer(void *opaque)
+{
+    BdrvCopyState *s = opaque;
+
+    assert(s->cancelled == 0);
+
+    if (s->error) {
+        handle_error(s);
+        return;
+    }
+
+    while (s->stage == STAGE_BULK) {
+        /* throttle: wait for a read completion to re-arm the timer */
+        if (s->inflight_reads >= BLKCOPY_INFLIGHT) {
+            break;
+        }
+        if (blk_issue_reads_bulk(s)) {
+            blkcopy_set_stage(s, STAGE_BULK_FINISHED);
+        }
+    }
+
+    if (s->stage == STAGE_BULK_FINISHED) {
+        /* enter the dirty stage only after every bulk write has completed */
+        if (QLIST_EMPTY(&s->io_list)) {
+            blkcopy_set_stage(s, STAGE_DIRTY);
+            alloc_aio_bitmap(s);
+        }
+    }
+
+    while (s->stage == STAGE_DIRTY) {
+        if (s->inflight_reads >= BLKCOPY_INFLIGHT) {
+            break;
+        }
+        blk_issue_reads_dirty(s);
+        /*
+         * NOTE(review): if no read could be issued and the switch is not
+         * yet allowed, this loop iterates again immediately - verify it
+         * cannot spin.
+         */
+        if (blkcopy_can_switch(s)) {
+            BLKDBG_EVENT(s->dst->file, BLKDBG_BLKCOPY_SWITCH_START);
+            blkcopy_switch(s);
+            return;
+        }
+    }
+}
+
+/*
+ * Start a live copy of @src to the already opened image @dst.
+ *
+ * Finished or failed jobs for the same device are freed first; a job
+ * that is still running makes the call fail.  Source and destination
+ * must have the same length in sectors.  If @commit_file is given, it is
+ * opened (created if needed) so the commit record can be written after
+ * the switch.  @shared_base makes the bulk pass skip unallocated ranges.
+ *
+ * @mon and @device are not used by this function.
+ *
+ * Returns 0 when the job has been started, -1 after reporting an error.
+ */
+static int bdrv_copy(Monitor *mon, const char * device, BlockDriverState *src,
+                     BlockDriverState *dst, const char *commit_file,
+                     bool shared_base)
+{
+    int64_t sectors;
+    BdrvCopyState *blkcopy, *safe;
+    /*
+     * 0 means "no commit file" (see blkcopy_cleanup and blkcopy_switch).
+     * It must be initialised: without a commit file nothing else assigns
+     * it before it is stored in the job state.
+     */
+    int f = 0;
+
+    QLIST_FOREACH_SAFE(blkcopy, &block_copy_list, list, safe) {
+        if (!strcmp(blkcopy->src_device_name, src->device_name)) {
+            if (blkcopy->stage == STAGE_SWITCH_FINISHED || blkcopy->failed) {
+                blkcopy_free(blkcopy);
+            } else {
+                qerror_report(QERR_BLOCKCOPY_IN_PROGRESS, src->device_name);
+                return -1;
+            }
+        }
+    }
+
+    sectors = bdrv_getlength(src) >> BDRV_SECTOR_BITS;
+    if (sectors != bdrv_getlength(dst) >> BDRV_SECTOR_BITS) {
+        qerror_report(QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS);
+        return -1;
+    }
+
+    if (commit_file) {
+        f = open(commit_file, O_CREAT|O_WRONLY, S_IRUSR);
+        if (f == -1) {
+            qerror_report(QERR_OPEN_FILE_FAILED, commit_file);
+            return -1;
+        }
+    }
+
+    blkcopy = qemu_mallocz(sizeof(BdrvCopyState));
+    blkcopy->src = src;
+    blkcopy->dst = dst;
+    blkcopy->curr_sector = 0;
+    blkcopy->nr_sectors = sectors;
+    blkcopy_set_stage(blkcopy, STAGE_BULK);
+    blkcopy->aio_timer = qemu_new_timer(rt_clock, aio_timer, blkcopy);
+    blkcopy->shared_base = shared_base;
+    blkcopy->commit_fd = f;
+    /* the state is zero-allocated, so size - 1 keeps both NUL-terminated */
+    strncpy(blkcopy->src_device_name, blkcopy->src->device_name,
+            sizeof(blkcopy->src_device_name) - 1);
+    strncpy(blkcopy->dst_filename, blkcopy->dst->filename,
+            sizeof(blkcopy->dst_filename) - 1);
+
+    bdrv_set_dirty_tracking(src, 1);
+    qemu_mod_timer(blkcopy->aio_timer, qemu_get_clock(rt_clock));
+
+    QLIST_INSERT_HEAD(&block_copy_list, blkcopy, list);
+    return 0;
+}
+
+/*
+ * Monitor handler for "block_copy".
+ *
+ * Arguments in @qdict: "device" and "filename" (required),
+ * "commit_filename" and "inc" (optional; "inc" defaults to false).
+ * The destination is opened with the source device's open flags.
+ *
+ * Refused while a migration is active.  Returns 0 when the copy was
+ * started, -1 after reporting an error; the destination
+ * BlockDriverState is deleted again on every failure path.
+ */
+int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    const char *device = qdict_get_str(qdict, "device");
+    const char *filename = qdict_get_str(qdict, "filename");
+    const char *commit_file = qdict_get_try_str(qdict, "commit_filename");
+    bool shared_base = qdict_get_try_bool(qdict, "inc", 0);
+    BlockDriverState *new_bs, *bs;
+    int ret;
+
+    if (migration_active()) {
+        qerror_report(QERR_MIGRATION_IN_PROGRESS);
+        return -1;
+    }
+
+    bs = bdrv_find(device);
+    if (!bs) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    new_bs = bdrv_new("");
+    if (bdrv_open(new_bs, filename, bs->open_flags, NULL) < 0) {
+        bdrv_delete(new_bs);
+        qerror_report(QERR_OPEN_FILE_FAILED, filename);
+        return -1;
+    }
+
+    ret = bdrv_copy(mon, device, bs, new_bs, commit_file, shared_base);
+    if (ret < 0) {
+        /* no job took ownership of the destination; do not leak it */
+        bdrv_delete(new_bs);
+    }
+    return ret;
+}
+
+/*
+ * Monitor handler for "block_copy_cancel".
+ *
+ * Looks up the job for "device", waits until all of its I/O has
+ * drained, releases its resources and removes it from the job list.
+ * Returns 0 on success, -1 if no job exists for the device.
+ */
+int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data)
+{
+    BdrvCopyState *blkcopy, *s = NULL;
+    const char *device = qdict_get_str(qdict, "device");
+
+    QLIST_FOREACH(blkcopy, &block_copy_list, list) {
+        if (!strcmp(blkcopy->src_device_name, device)) {
+            s = blkcopy;
+            break;
+        }
+    }
+
+    if (!s) {
+        qerror_report(QERR_DEVICE_NOT_FOUND, device);
+        return -1;
+    }
+
+    s->cancelled = 1;
+    do {
+        qemu_aio_flush();
+    } while (!QLIST_EMPTY(&s->io_list));
+
+    /*
+     * A job that already failed or completed has been cleaned up on that
+     * path; cleaning up again would free the bitmap and close the commit
+     * descriptor a second time.
+     */
+    if (!s->failed && s->stage != STAGE_SWITCH_FINISHED) {
+        blkcopy_cleanup(s);
+    }
+    blkcopy_free(s);
+
+    return 0;
+}
+
+/*
+ * qlist_iter() callback: print one block copy status entry @obj on the
+ * monitor passed in @opaque, as "<device>: status=<status>" followed by
+ * the percentage when an "info" member is present.
+ */
+static void blockcopy_print_dict(QObject *obj, void *opaque)
+{
+    QDict *c_dict;
+    Monitor *mon = opaque;
+
+    c_dict = qobject_to_qdict(obj);
+
+    monitor_printf(mon, "%s: status=%s ",
+                        qdict_get_str(c_dict, "device"),
+                        qdict_get_str(c_dict, "status"));
+
+    if (qdict_haskey(c_dict, "info")) {
+        QDict *qdict = qobject_to_qdict(qdict_get(c_dict, "info"));
+
+        /* qdict_get_int() yields int64_t, which is not long on every host */
+        monitor_printf(mon, "percentage=%" PRId64 " %%",
+                       qdict_get_int(qdict, "percentage"));
+    }
+
+    monitor_printf(mon, "\n");
+}
+
+/* Print every entry of the block copy status list @data on @mon. */
+void do_info_blockcopy_print(Monitor *mon, const QObject *data)
+{
+    QList *jobs = qobject_to_qlist(data);
+
+    qlist_iter(jobs, blockcopy_print_dict, mon);
+}
+
+/*
+ * Build the block copy status list and store it in *@ret_data.
+ *
+ * One dictionary per known job with "device" and "status" ("failed",
+ * "active" or "completed").  Active jobs additionally carry an "info"
+ * dictionary with the bulk-pass "percentage".
+ */
+void do_info_blockcopy(Monitor *mon, QObject **ret_data)
+{
+    QList *c_list;
+    BdrvCopyState *s;
+
+    c_list = qlist_new();
+
+    QLIST_FOREACH(s, &block_copy_list, list) {
+        QObject *c_obj;
+        static const char *status[] = { "failed", "active", "completed" };
+        int i;
+
+        if (s->failed) {
+            i = 0;
+        } else if (s->stage < STAGE_SWITCH_FINISHED) {
+            i = 1;
+        } else {
+            i = 2;
+        }
+
+        c_obj = qobject_from_jsonf("{ 'device': %s, 'status': %s }",
+                                    s->src_device_name, status[i]);
+
+        if (i == 1) {
+            QDict *dict = qobject_to_qdict(c_obj);
+            QObject *obj;
+            int64_t percentage = 0;
+
+            /* a device without sectors would divide by zero; report 0 */
+            if (s->nr_sectors > 0) {
+                percentage = s->completed_sectors * 100 / s->nr_sectors;
+            }
+
+            /* FIXME: add dirty stage progress? */
+            obj = qobject_from_jsonf("{ 'percentage': %" PRId64 "}",
+                                     percentage);
+            qdict_put_obj(dict, "info", obj);
+        }
+        qlist_append_obj(c_list, c_obj);
+    }
+
+    *ret_data = QOBJECT(c_list);
+}
+
+/*
+ * Report whether any block copy job is still running, i.e. has neither
+ * failed nor reached STAGE_SWITCH_FINISHED.
+ */
+bool block_copy_active(void)
+{
+    BdrvCopyState *job;
+
+    QLIST_FOREACH(job, &block_copy_list, list) {
+        if (!job->failed && job->stage < STAGE_SWITCH_FINISHED) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
Index: qemu-kvm/block-copy.h
===================================================================
--- /dev/null
+++ qemu-kvm/block-copy.h
@@ -0,0 +1,25 @@ 
+/*
+ * QEMU live block copy
+ *
+ * Copyright (C) 2010 Red Hat Inc.
+ *
+ * Authors: Marcelo Tosatti <mtosatti@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef BLOCK_COPY_H
+#define BLOCK_COPY_H
+
+/* Monitor handlers for the "block_copy" and "block_copy_cancel" commands. */
+int do_bdrv_copy(Monitor *mon, const QDict *qdict, QObject **ret_data);
+int do_bdrv_copy_cancel(Monitor *mon, const QDict *qdict, QObject **ret_data);
+
+/* "info block-copy": build the status list and print it on the monitor. */
+void do_info_blockcopy_print(Monitor *mon, const QObject *data);
+void do_info_blockcopy(Monitor *mon, QObject **ret_data);
+
+/* True while any block copy job has neither failed nor completed. */
+bool block_copy_active(void);
+
+#endif /* BLOCK_COPY_H */
+
Index: qemu-kvm/hmp-commands.hx
===================================================================
--- qemu-kvm.orig/hmp-commands.hx
+++ qemu-kvm/hmp-commands.hx
@@ -769,6 +769,43 @@  Set maximum speed to @var{value} (in byt
 ETEXI
 
     {
+        .name       = "block_copy",
+        .args_type  = "device:s,filename:s,commit_filename:s?,inc:-i",
+        .params     = "device filename [commit_filename] [-i]",
+        .help       = "live block copy device to image"
+                      "\n\t\t\t optional commit filename "
+                      "\n\t\t\t -i for incremental copy "
+                      "(base image shared between src and destination)",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy,
+    },
+
+STEXI
+@item block_copy @var{device} @var{filename} [@var{commit_filename}] [-i]
+@findex block_copy
+Live copy block device @var{device} to image @var{filename}.
+        -i for incremental copy (base image is shared)
+
+Optionally a commit message is written to @var{commit_filename}
+once the switch to the new image is performed.
+ETEXI
+
+    {
+        .name       = "block_copy_cancel",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "cancel live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_cancel,
+    },
+
+STEXI
+@item block_copy_cancel @var{device}
+@findex block_copy_cancel
+Cancel live block copy on @var{device}.
+ETEXI
+
+    {
         .name       = "migrate_set_downtime",
         .args_type  = "value:T",
         .params     = "value",
@@ -1213,6 +1250,8 @@  show device tree
 show qdev device model list
 @item info roms
 show roms
+@item info block-copy
+show block copy status
 @end table
 ETEXI
 
Index: qemu-kvm/monitor.c
===================================================================
--- qemu-kvm.orig/monitor.c
+++ qemu-kvm/monitor.c
@@ -44,6 +44,7 @@ 
 #include "balloon.h"
 #include "qemu-timer.h"
 #include "migration.h"
+#include "block-copy.h"
 #include "kvm.h"
 #include "acl.h"
 #include "qint.h"
@@ -2651,6 +2652,14 @@  static const mon_cmd_t info_cmds[] = {
     },
 #endif
     {
+        .name       = "block-copy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "show block copy status",
+        .user_print = do_info_blockcopy_print,
+        .mhandler.info_new = do_info_blockcopy,
+    },
+    {
         .name       = NULL,
     },
 };
@@ -2782,6 +2791,14 @@  static const mon_cmd_t qmp_query_cmds[] 
         .mhandler.info_async = do_info_balloon,
         .flags      = MONITOR_CMD_ASYNC,
     },
+    {
+        .name       = "block-copy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "show block copy status",
+        .user_print = do_info_blockcopy_print,
+        .mhandler.info_new = do_info_blockcopy,
+    },
     { /* NULL */ },
 };
 
Index: qemu-kvm/block.h
===================================================================
--- qemu-kvm.orig/block.h
+++ qemu-kvm/block.h
@@ -281,6 +281,13 @@  typedef enum {
     BLKDBG_CLUSTER_ALLOC_BYTES,
     BLKDBG_CLUSTER_FREE,
 
+    BLKDBG_BLKCOPY_STAGE_BULK,
+    BLKDBG_BLKCOPY_STAGE_BULK_FINISHED,
+    BLKDBG_BLKCOPY_STAGE_DIRTY,
+    BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED,
+    BLKDBG_BLKCOPY_SWITCH_START,
+    BLKDBG_BLKCOPY_AIO_WRITE,
+
     BLKDBG_EVENT_MAX,
 } BlkDebugEvent;
 
Index: qemu-kvm/block/blkdebug.c
===================================================================
--- qemu-kvm.orig/block/blkdebug.c
+++ qemu-kvm/block/blkdebug.c
@@ -178,6 +178,14 @@  static const char *event_names[BLKDBG_EV
     [BLKDBG_CLUSTER_ALLOC]                  = "cluster_alloc",
     [BLKDBG_CLUSTER_ALLOC_BYTES]            = "cluster_alloc_bytes",
     [BLKDBG_CLUSTER_FREE]                   = "cluster_free",
+
+
+    [BLKDBG_BLKCOPY_STAGE_BULK]             = "blkcopy_stage_bulk",
+    [BLKDBG_BLKCOPY_STAGE_BULK_FINISHED]    = "blkcopy_stage_bulk_finished",
+    [BLKDBG_BLKCOPY_STAGE_DIRTY]            = "blkcopy_stage_dirty",
+    [BLKDBG_BLKCOPY_STAGE_SWITCH_FINISHED]  = "blkcopy_stage_switch_finished",
+    [BLKDBG_BLKCOPY_SWITCH_START]           = "blkcopy_switch_start",
+    [BLKDBG_BLKCOPY_AIO_WRITE]              = "blkcopy_aio_write",
 };
 
 static int get_event_by_name(const char *name, BlkDebugEvent *event)
Index: qemu-kvm/qerror.c
===================================================================
--- qemu-kvm.orig/qerror.c
+++ qemu-kvm/qerror.c
@@ -200,6 +200,18 @@  static const QErrorStringTable qerror_ta
         .error_fmt = QERR_VNC_SERVER_FAILED,
         .desc      = "Could not start VNC server on %(target)",
     },
+    {
+        .error_fmt = QERR_BLOCKCOPY_IN_PROGRESS,
+        .desc      = "Block copy for %(device) in progress",
+    },
+    {
+        .error_fmt = QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS,
+        .desc      = "Length of destination image differs from source image",
+    },
+    {
+        .error_fmt = QERR_MIGRATION_IN_PROGRESS,
+        .desc      = "Migration in progress",
+    },
     {}
 };
 
Index: qemu-kvm/qerror.h
===================================================================
--- qemu-kvm.orig/qerror.h
+++ qemu-kvm/qerror.h
@@ -165,4 +165,13 @@  QError *qobject_to_qerror(const QObject 
 #define QERR_VNC_SERVER_FAILED \
     "{ 'class': 'VNCServerFailed', 'data': { 'target': %s } }"
 
+#define QERR_BLOCKCOPY_IN_PROGRESS \
+    "{ 'class': 'BlockCopyInProgress', 'data': { 'device': %s } }"
+
+#define QERR_BLOCKCOPY_IMAGE_SIZE_DIFFERS \
+    "{ 'class': 'BlockCopyImageSizeDiffers', 'data': {} }"
+
+#define QERR_MIGRATION_IN_PROGRESS \
+    "{ 'class': 'MigrationInProgress', 'data': {} }"
+
 #endif /* QERROR_H */
Index: qemu-kvm/qmp-commands.hx
===================================================================
--- qemu-kvm.orig/qmp-commands.hx
+++ qemu-kvm/qmp-commands.hx
@@ -546,6 +546,75 @@  Example:
 EQMP
 
     {
+        .name       = "block_copy",
+        .args_type  = "device:s,filename:s,commit_filename:s?,inc:-i",
+        .params     = "device filename [commit_filename] [-i]",
+        .help       = "live block copy device to image"
+                      "\n\t\t\t optional commit filename "
+                      "\n\t\t\t -i for incremental copy "
+                      "(base image shared between src and destination)",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy,
+    },
+
+SQMP
+block_copy
+----------
+
+Live block copy.
+
+Arguments:
+
+- "device": device name (json-string)
+- "filename": target image filename (json-string)
+- "commit_filename": target commit filename (json-string, optional)
+- "inc": incremental disk copy (json-bool, optional)
+
+Example:
+
+-> { "execute": "block_copy",
+                            "arguments": { "device": "ide0-hd1",
+                               "filename": "/mnt/new-disk.img",
+                               "commit_filename": "/mnt/commit-new-disk.img"
+                             } }
+
+<- { "return": {} }
+
+Notes:
+
+(1) The 'query-block-copy' command should be used to check block copy progress
+    and final result (this information is provided by the 'status' member)
+(2) Boolean argument "inc" defaults to false
+
+EQMP
+
+    {
+        .name       = "block_copy_cancel",
+        .args_type  = "device:s",
+        .params     = "device",
+        .help       = "cancel live block copy",
+        .user_print = monitor_user_noop,
+        .mhandler.cmd_new = do_bdrv_copy_cancel,
+    },
+
+SQMP
+block_copy_cancel
+--------------
+
+Cancel live block copy.
+
+Arguments:
+
+- device: device name (json-string)
+
+Example:
+
+-> { "execute": "block_copy_cancel", "arguments": { "device": "ide0-hd1" } }
+<- { "return": {} }
+
+EQMP
+
+    {
         .name       = "netdev_add",
         .args_type  = "netdev:O",
         .params     = "[user|tap|socket],id=str[,prop=value][,...]",
@@ -1505,6 +1574,44 @@  Examples:
 EQMP
 
 SQMP
+query-block-copy
+-------------
+
+Live block copy status.
+
+Each block copy instance information is stored in a json-object and the returned
+value is a json-array of all instances.
+
+Each json-object contains the following:
+
+- "device": device name (json-string)
+- "status": block copy status (json-string)
+    - Possible values: "active", "failed", "completed"
+- "info": A json-object with the statistics information, if status is "active":
+    - "percentage": percentage completed (json-int)
+
+Example:
+
+Block copy for "ide1-hd0" active and block copy for "ide1-hd1" failed:
+
+-> { "execute": "query-block-copy" }
+<- {
+      "return":[
+        {"device":"ide1-hd0",
+            "status":"active",
+            "info":{
+               "percentage":23
+            }
+        },
+        {"device":"ide1-hd1",
+         "status":"failed"
+        }
+      ]
+   }
+
+EQMP
+
+SQMP
 query-balloon
 -------------
 
Index: qemu-kvm/Makefile.objs
===================================================================
--- qemu-kvm.orig/Makefile.objs
+++ qemu-kvm/Makefile.objs
@@ -91,7 +91,7 @@  common-obj-y += buffered_file.o migratio
 common-obj-y += qemu-char.o savevm.o #aio.o
 common-obj-y += msmouse.o ps2.o
 common-obj-y += qdev.o qdev-properties.o
-common-obj-y += block-migration.o
+common-obj-y += block-migration.o block-copy.o
 common-obj-y += pflib.o
 
 common-obj-$(CONFIG_BRLAPI) += baum.o