diff mbox

[12/17] migration: add postcopy migration of dirty bitmaps

Message ID 1486979689-230770-13-git-send-email-vsementsov@virtuozzo.com
State New
Headers show

Commit Message

Vladimir Sementsov-Ogievskiy Feb. 13, 2017, 9:54 a.m. UTC
Postcopy migration of dirty bitmaps. Only named dirty bitmaps,
associated with root nodes and non-root named nodes are migrated.

If destination qemu is already containing a dirty bitmap with the same name
as a migrated bitmap (for the same node), than, if their granularities are
the same the migration will be done, otherwise the error will be generated.

If destination qemu doesn't contain such bitmap it will be created.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 include/migration/block.h      |   1 +
 include/migration/migration.h  |   4 +
 migration/Makefile.objs        |   2 +-
 migration/block-dirty-bitmap.c | 679 +++++++++++++++++++++++++++++++++++++++++
 migration/migration.c          |   3 +
 migration/savevm.c             |   2 +
 migration/trace-events         |  14 +
 vl.c                           |   1 +
 8 files changed, 705 insertions(+), 1 deletion(-)
 create mode 100644 migration/block-dirty-bitmap.c

Comments

Fam Zheng Feb. 16, 2017, 1:04 p.m. UTC | #1
On Mon, 02/13 12:54, Vladimir Sementsov-Ogievskiy wrote:
> Postcopy migration of dirty bitmaps. Only named dirty bitmaps,
> associated with root nodes and non-root named nodes are migrated.
> 
> If destination qemu is already containing a dirty bitmap with the same name
> as a migrated bitmap (for the same node), than, if their granularities are

s/than/then/

> the same the migration will be done, otherwise the error will be generated.
> 
> If destination qemu doesn't contain such bitmap it will be created.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> +#include "qemu/osdep.h"
> +#include "block/block.h"
> +#include "block/block_int.h"
> +#include "sysemu/block-backend.h"
> +#include "qemu/main-loop.h"
> +#include "qemu/error-report.h"
> +#include "migration/block.h"
> +#include "migration/migration.h"
> +#include "qemu/hbitmap.h"
> +#include "sysemu/sysemu.h"
> +#include "qemu/cutils.h"
> +#include "qapi/error.h"
> +#include "trace.h"
> +#include <assert.h>

Please drop this line since "assert.h" is already included in osdep.h. (And
system headers, if any, ought to be included before local headers.)

> +
> +#define CHUNK_SIZE     (1 << 10)
> +
> +/* Flags occupy from one to four bytes. In all but one the 7-th (EXTRA_FLAGS)
> + * bit should be set. */
> +#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
> +#define DIRTY_BITMAP_MIG_FLAG_ZEROES        0x02
> +#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x04
> +#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x08
> +#define DIRTY_BITMAP_MIG_FLAG_START         0x10
> +#define DIRTY_BITMAP_MIG_FLAG_COMPLETE      0x20
> +#define DIRTY_BITMAP_MIG_FLAG_BITS          0x40
> +
> +#define DIRTY_BITMAP_MIG_EXTRA_FLAGS        0x80
> +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_16      0x8000

This flag means two bytes, right? But your above comment says "7-th bit should
be set". This doesn't make sense. Should this be "0x80" too?

> +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_32      0x8080
> +
> +#define DEBUG_DIRTY_BITMAP_MIGRATION 0

This macro is unused, and you have done well in adding trace point, should it be
removed?

> +
> +typedef struct DirtyBitmapMigBitmapState {
> +    /* Written during setup phase. */
> +    BlockDriverState *bs;
> +    const char *node_name;
> +    BdrvDirtyBitmap *bitmap;
> +    uint64_t total_sectors;
> +    uint64_t sectors_per_chunk;
> +    QSIMPLEQ_ENTRY(DirtyBitmapMigBitmapState) entry;
> +
> +    /* For bulk phase. */
> +    bool bulk_completed;
> +    uint64_t cur_sector;
> +} DirtyBitmapMigBitmapState;
> +
> +typedef struct DirtyBitmapMigState {
> +    QSIMPLEQ_HEAD(dbms_list, DirtyBitmapMigBitmapState) dbms_list;
> +
> +    bool bulk_completed;
> +
> +    /* for send_bitmap_bits() */
> +    BlockDriverState *prev_bs;
> +    BdrvDirtyBitmap *prev_bitmap;
> +} DirtyBitmapMigState;
> +
> +typedef struct DirtyBitmapLoadState {
> +    uint32_t flags;
> +    char node_name[256];
> +    char bitmap_name[256];
> +    BlockDriverState *bs;
> +    BdrvDirtyBitmap *bitmap;
> +} DirtyBitmapLoadState;
> +
> +static DirtyBitmapMigState dirty_bitmap_mig_state;
> +
> +typedef struct DirtyBitmapLoadBitmapState {
> +    BlockDriverState *bs;
> +    BdrvDirtyBitmap *bitmap;
> +    bool migrated;
> +} DirtyBitmapLoadBitmapState;
> +static GSList *enabled_bitmaps;
> +QemuMutex finish_lock;
> +
> +void init_dirty_bitmap_incoming_migration(void)
> +{
> +    qemu_mutex_init(&finish_lock);
> +}
> +
> +static uint32_t qemu_get_bitmap_flags(QEMUFile *f)
> +{
> +    uint8_t flags = qemu_get_byte(f);
> +    if (flags & DIRTY_BITMAP_MIG_EXTRA_FLAGS) {
> +        flags = flags << 8 | qemu_get_byte(f);
> +        if (flags & DIRTY_BITMAP_MIG_EXTRA_FLAGS) {
> +            flags = flags << 16 | qemu_get_be16(f);
> +        }
> +    }
> +
> +    return flags;
> +}
> +
> +static void qemu_put_bitmap_flags(QEMUFile *f, uint32_t flags)
> +{
> +    if (!(flags & 0xffffff00)) {
> +        qemu_put_byte(f, flags);

Maybe "assert(!(flags & 0x80))",

> +        return;
> +    }
> +
> +    if (!(flags & 0xffff0000)) {
> +        qemu_put_be16(f, flags | DIRTY_BITMAP_MIG_FLAGS_SIZE_16);
> +        return;
> +    }
> +
> +    qemu_put_be32(f, flags | DIRTY_BITMAP_MIG_FLAGS_SIZE_32);
> +}
> +
> +static void send_bitmap_header(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
> +                               uint32_t additional_flags)
> +{
> +    BlockDriverState *bs = dbms->bs;
> +    BdrvDirtyBitmap *bitmap = dbms->bitmap;
> +    uint32_t flags = additional_flags;
> +    trace_send_bitmap_header_enter();
> +
> +    if (bs != dirty_bitmap_mig_state.prev_bs) {
> +        dirty_bitmap_mig_state.prev_bs = bs;
> +        flags |= DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME;
> +    }
> +
> +    if (bitmap != dirty_bitmap_mig_state.prev_bitmap) {
> +        dirty_bitmap_mig_state.prev_bitmap = bitmap;
> +        flags |= DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME;
> +    }
> +
> +    qemu_put_bitmap_flags(f, flags);
> +
> +    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
> +        qemu_put_counted_string(f, dbms->node_name);
> +    }
> +
> +    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
> +        qemu_put_counted_string(f, bdrv_dirty_bitmap_name(bitmap));
> +    }
> +}
> +
> +static void send_bitmap_start(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
> +{
> +    send_bitmap_header(f, dbms, DIRTY_BITMAP_MIG_FLAG_START);
> +    qemu_put_be32(f, bdrv_dirty_bitmap_granularity(dbms->bitmap));
> +    qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
> +}
> +
> +static void send_bitmap_complete(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
> +{
> +    send_bitmap_header(f, dbms, DIRTY_BITMAP_MIG_FLAG_COMPLETE);
> +}
> +
> +static void send_bitmap_bits(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
> +                             uint64_t start_sector, uint32_t nr_sectors)
> +{
> +    /* align for buffer_is_zero() */
> +    uint64_t align = 4 * sizeof(long);
> +    uint64_t unaligned_size =
> +        bdrv_dirty_bitmap_serialization_size(dbms->bitmap,
> +                                             start_sector, nr_sectors);
> +    uint64_t buf_size = (unaligned_size + align - 1) & ~(align - 1);
> +    uint8_t *buf = g_malloc0(buf_size);
> +    uint32_t flags = DIRTY_BITMAP_MIG_FLAG_BITS;
> +
> +    bdrv_dirty_bitmap_serialize_part(dbms->bitmap, buf,
> +                                     start_sector, nr_sectors);

While these bdrv_dirty_bitmap_* calls here seem fine, BdrvDirtyBitmap API is not
in general thread-safe, while this function is called without any lock. This
feels dangerous, as noted below, I'm most concerned about use-after-free.

> +
> +    if (buffer_is_zero(buf, buf_size)) {
> +        g_free(buf);
> +        buf = NULL;
> +        flags |= DIRTY_BITMAP_MIG_FLAG_ZEROES;
> +    }
> +
> +    trace_send_bitmap_bits(flags, start_sector, nr_sectors, buf_size);
> +
> +    send_bitmap_header(f, dbms, flags);
> +
> +    qemu_put_be64(f, start_sector);
> +    qemu_put_be32(f, nr_sectors);
> +
> +    /* if a block is zero we need to flush here since the network
> +     * bandwidth is now a lot higher than the storage device bandwidth.
> +     * thus if we queue zero blocks we slow down the migration. */
> +    if (flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
> +        qemu_fflush(f);
> +    } else {
> +        qemu_put_be64(f, buf_size);
> +        qemu_put_buffer(f, buf, buf_size);
> +    }
> +
> +    g_free(buf);
> +}
> +
> +
> +/* Called with iothread lock taken.  */
> +
> +static void init_dirty_bitmap_migration(void)
> +{
> +    BlockDriverState *bs;
> +    BdrvDirtyBitmap *bitmap;
> +    DirtyBitmapMigBitmapState *dbms;
> +    BdrvNextIterator it;
> +    uint64_t total_bytes = 0;
> +
> +    dirty_bitmap_mig_state.bulk_completed = false;
> +    dirty_bitmap_mig_state.prev_bs = NULL;
> +    dirty_bitmap_mig_state.prev_bitmap = NULL;
> +
> +    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> +        for (bitmap = bdrv_next_dirty_bitmap(bs, NULL); bitmap;
> +             bitmap = bdrv_next_dirty_bitmap(bs, bitmap)) {
> +            if (!bdrv_dirty_bitmap_name(bitmap)) {
> +                continue;
> +            }
> +
> +            if (!bdrv_get_device_or_node_name(bs)) {
> +                /* not named non-root node */
> +                continue;
> +            }

Moving this to the outter loop is better, no need to check dirty bitmap 

> +
> +            dbms = g_new0(DirtyBitmapMigBitmapState, 1);
> +            dbms->bs = bs;

Should we do bdrv_ref? migration/block.c references its BDSes indirectly through
BlockBackends it owns.

> +            dbms->node_name = bdrv_get_node_name(bs);
> +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
> +                dbms->node_name = bdrv_get_device_name(bs);
> +            }
> +            dbms->bitmap = bitmap;

What protects the case that the bitmap is released before migration completes?

> +            dbms->total_sectors = bdrv_nb_sectors(bs);
> +            dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
> +                bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS;
> +
> +            total_bytes +=
> +                bdrv_dirty_bitmap_serialization_size(bitmap,
> +                                                     0, dbms->total_sectors);

Please drop total_bytes as it is assigned but not used.

> +
> +            QSIMPLEQ_INSERT_TAIL(&dirty_bitmap_mig_state.dbms_list,
> +                                 dbms, entry);
> +        }
> +    }
> +}
> +
> +/* Called with no lock taken.  */
> +static void bulk_phase_send_chunk(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
> +{
> +    uint32_t nr_sectors = MIN(dbms->total_sectors - dbms->cur_sector,
> +                             dbms->sectors_per_chunk);
> +
> +    send_bitmap_bits(f, dbms, dbms->cur_sector, nr_sectors);
> +
> +    dbms->cur_sector += nr_sectors;
> +    if (dbms->cur_sector >= dbms->total_sectors) {
> +        dbms->bulk_completed = true;
> +    }
> +}
> +
> +/* Called with no lock taken.  */
> +static void bulk_phase(QEMUFile *f, bool limit)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        while (!dbms->bulk_completed) {
> +            bulk_phase_send_chunk(f, dbms);
> +            if (limit && qemu_file_rate_limit(f)) {
> +                return;
> +            }
> +        }
> +    }
> +
> +    dirty_bitmap_mig_state.bulk_completed = true;
> +}
> +
> +/* Called with iothread lock taken.  */
> +static void dirty_bitmap_mig_cleanup(void)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +
> +    while ((dbms = QSIMPLEQ_FIRST(&dirty_bitmap_mig_state.dbms_list)) != NULL) {
> +        QSIMPLEQ_REMOVE_HEAD(&dirty_bitmap_mig_state.dbms_list, entry);
> +        g_free(dbms);
> +    }
> +}
> +
> +/* for SaveVMHandlers */
> +static void dirty_bitmap_migration_cleanup(void *opaque)
> +{
> +    dirty_bitmap_mig_cleanup();
> +}
> +
> +static int dirty_bitmap_save_iterate(QEMUFile *f, void *opaque)
> +{
> +    trace_dirty_bitmap_save_iterate(
> +            migration_in_postcopy(migrate_get_current()));
> +
> +    if (migration_in_postcopy(migrate_get_current()) &&
> +        !dirty_bitmap_mig_state.bulk_completed) {
> +        bulk_phase(f, true);
> +    }
> +
> +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> +
> +    return dirty_bitmap_mig_state.bulk_completed;
> +}
> +
> +/* Called with iothread lock taken.  */
> +
> +static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +    trace_dirty_bitmap_save_complete_enter();
> +
> +    if (!dirty_bitmap_mig_state.bulk_completed) {
> +        bulk_phase(f, false);
> +    }
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        send_bitmap_complete(f, dbms);
> +    }
> +
> +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> +
> +    trace_dirty_bitmap_save_complete_finish();
> +
> +    dirty_bitmap_mig_cleanup();
> +    return 0;
> +}
> +
> +static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
> +                                      uint64_t max_size,
> +                                      uint64_t *res_precopy_only,
> +                                      uint64_t *res_compatible,
> +                                      uint64_t *res_postcopy_only)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +    uint64_t pending = 0;
> +
> +    qemu_mutex_lock_iothread();

Why do you need the BQL here but not in bulk_phase()?

> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        uint64_t gran = bdrv_dirty_bitmap_granularity(dbms->bitmap);
> +        uint64_t sectors = dbms->bulk_completed ? 0 :
> +                           dbms->total_sectors - dbms->cur_sector;
> +
> +        pending += (sectors * BDRV_SECTOR_SIZE + gran - 1) / gran;
> +    }
> +
> +    qemu_mutex_unlock_iothread();
> +
> +    trace_dirty_bitmap_save_pending(pending, max_size);
> +
> +    *res_postcopy_only += pending;
> +}
> +
> +/* First occurrence of this bitmap. It should be created if doesn't exist */
> +static int dirty_bitmap_load_start(QEMUFile *f, DirtyBitmapLoadState *s)
> +{
> +    Error *local_err = NULL;
> +    uint32_t granularity = qemu_get_be32(f);
> +    bool enabled = qemu_get_byte(f);
> +
> +    if (!s->bitmap) {
> +        s->bitmap = bdrv_create_dirty_bitmap(s->bs, granularity,
> +                                             s->bitmap_name, &local_err);
> +        if (!s->bitmap) {
> +            error_report_err(local_err);
> +            return -EINVAL;
> +        }
> +    } else {
> +        uint32_t dest_granularity =
> +            bdrv_dirty_bitmap_granularity(s->bitmap);
> +        if (dest_granularity != granularity) {
> +            fprintf(stderr,

error_report please, to be consistent with the error_report_err above. Applies
to some other places in this patch, too.

> +                    "Error: "
> +                    "Migrated bitmap granularity (%" PRIu32 ") "
> +                    "doesn't match the destination bitmap '%s' "
> +                    "granularity (%" PRIu32 ")\n",
> +                    granularity,
> +                    bdrv_dirty_bitmap_name(s->bitmap),
> +                    dest_granularity);
> +            return -EINVAL;
> +        }
> +    }
> +
> +    bdrv_disable_dirty_bitmap(s->bitmap);
> +    if (enabled) {
> +        DirtyBitmapLoadBitmapState *b;
> +
> +        bdrv_dirty_bitmap_create_successor(s->bs, s->bitmap, &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            return -EINVAL;
> +        }
> +
> +        b = g_new(DirtyBitmapLoadBitmapState, 1);
> +        b->bs = s->bs;
> +        b->bitmap = s->bitmap;
> +        b->migrated = false;
> +        enabled_bitmaps = g_slist_prepend(enabled_bitmaps, b);
> +    }
> +
> +    return 0;
> +}
> +
> +void dirty_bitmap_mig_before_vm_start(void)
> +{
> +    GSList *item;
> +
> +    qemu_mutex_lock(&finish_lock);
> +
> +    for (item = enabled_bitmaps; item; item = g_slist_next(item)) {
> +        DirtyBitmapLoadBitmapState *b = item->data;
> +
> +        if (b->migrated) {
> +            bdrv_enable_dirty_bitmap(b->bitmap);
> +        } else {
> +            bdrv_dirty_bitmap_enable_successor(b->bitmap);
> +        }
> +
> +        g_free(b);
> +    }
> +
> +    g_slist_free(enabled_bitmaps);
> +    enabled_bitmaps = NULL;
> +
> +    qemu_mutex_unlock(&finish_lock);
> +}
> +
> +static void dirty_bitmap_load_complete(QEMUFile *f, DirtyBitmapLoadState *s)
> +{
> +    GSList *item;
> +    trace_dirty_bitmap_load_complete();
> +    bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
> +
> +    qemu_mutex_lock(&finish_lock);
> +
> +    for (item = enabled_bitmaps; item; item = g_slist_next(item)) {
> +        DirtyBitmapLoadBitmapState *b = item->data;
> +
> +        if (b->bitmap == s->bitmap) {
> +            b->migrated = true;
> +        }
> +    }
> +
> +    if (bdrv_dirty_bitmap_frozen(s->bitmap)) {
> +        if (enabled_bitmaps == NULL) {
> +            /* in postcopy */
> +            AioContext *aio_context = bdrv_get_aio_context(s->bs);
> +            aio_context_acquire(aio_context);
> +
> +            bdrv_reclaim_dirty_bitmap(s->bs, s->bitmap, &error_abort);
> +            bdrv_enable_dirty_bitmap(s->bitmap);
> +
> +            aio_context_release(aio_context);
> +        } else {
> +            /* target not started, successor is empty */
> +            bdrv_dirty_bitmap_release_successor(s->bs, s->bitmap);
> +        }
> +    }
> +
> +    qemu_mutex_unlock(&finish_lock);
> +}
> +
> +static int dirty_bitmap_load_bits(QEMUFile *f, DirtyBitmapLoadState *s)
> +{
> +    uint64_t first_sector = qemu_get_be64(f);
> +    uint32_t nr_sectors = qemu_get_be32(f);
> +    trace_dirty_bitmap_load_bits_enter(first_sector, nr_sectors);
> +
> +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
> +        trace_dirty_bitmap_load_bits_zeroes();
> +        bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_sector,
> +                                             nr_sectors, false);
> +    } else {
> +        uint8_t *buf;
> +        uint64_t buf_size = qemu_get_be64(f);
> +        uint64_t needed_size =
> +            bdrv_dirty_bitmap_serialization_size(s->bitmap,
> +                                                 first_sector, nr_sectors);
> +
> +        if (needed_size > buf_size) {
> +            fprintf(stderr,
> +                    "Error: Migrated bitmap granularity doesn't "
> +                    "match the destination bitmap '%s' granularity\n",
> +                    bdrv_dirty_bitmap_name(s->bitmap));
> +            return -EINVAL;
> +        }
> +
> +        buf = g_malloc(buf_size);
> +        qemu_get_buffer(f, buf, buf_size);
> +        bdrv_dirty_bitmap_deserialize_part(s->bitmap, buf,
> +                                           first_sector,
> +                                           nr_sectors, false);
> +        g_free(buf);
> +    }
> +
> +    return 0;
> +}
> +
> +static int dirty_bitmap_load_header(QEMUFile *f, DirtyBitmapLoadState *s)
> +{
> +    Error *local_err = NULL;
> +    s->flags = qemu_get_bitmap_flags(f);
> +    trace_dirty_bitmap_load_header(s->flags);
> +
> +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
> +        if (!qemu_get_counted_string(f, s->node_name)) {
> +            fprintf(stderr, "Unable to read node name string\n");
> +            return -EINVAL;
> +        }
> +        s->bs = bdrv_lookup_bs(s->node_name, s->node_name, &local_err);
> +        if (!s->bs) {
> +            error_report("%s", error_get_pretty(local_err));
> +            error_free(local_err);
> +            return -EINVAL;
> +        }
> +    } else if (!s->bs) {
> +        fprintf(stderr, "Error: block device name is not set\n");
> +        return -EINVAL;
> +    }
> +
> +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
> +        if (!qemu_get_counted_string(f, s->bitmap_name)) {
> +            fprintf(stderr, "Unable to read node name string\n");
> +            return -EINVAL;
> +        }
> +        s->bitmap = bdrv_find_dirty_bitmap(s->bs, s->bitmap_name);
> +
> +        /* bitmap may be NULL here, it wouldn't be an error if it is the
> +         * first occurrence of the bitmap */
> +        if (!s->bitmap && !(s->flags & DIRTY_BITMAP_MIG_FLAG_START)) {
> +            fprintf(stderr, "Error: unknown dirty bitmap "
> +                    "'%s' for block device '%s'\n",
> +                    s->bitmap_name, s->node_name);
> +            return -EINVAL;
> +        }
> +    } else if (!s->bitmap) {
> +        fprintf(stderr, "Error: block device name is not set\n");
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +static int dirty_bitmap_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    static DirtyBitmapLoadState s;
> +
> +    int ret = 0;
> +
> +    trace_dirty_bitmap_load_enter();
> +

Should version_id be checked? We should detect a version bump of a future
release and fail if not supported.

> +    do {
> +        dirty_bitmap_load_header(f, &s);
> +
> +        if (s.flags & DIRTY_BITMAP_MIG_FLAG_START) {
> +            ret = dirty_bitmap_load_start(f, &s);
> +        } else if (s.flags & DIRTY_BITMAP_MIG_FLAG_COMPLETE) {
> +            dirty_bitmap_load_complete(f, &s);
> +        } else if (s.flags & DIRTY_BITMAP_MIG_FLAG_BITS) {
> +            ret = dirty_bitmap_load_bits(f, &s);
> +        }
> +
> +        if (!ret) {
> +            ret = qemu_file_get_error(f);
> +        }
> +
> +        if (ret) {
> +            return ret;
> +        }
> +    } while (!(s.flags & DIRTY_BITMAP_MIG_FLAG_EOS));
> +
> +    trace_dirty_bitmap_load_success();
> +    return 0;
> +}
> +
> +static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
> +{
> +    DirtyBitmapMigBitmapState *dbms = NULL;
> +    init_dirty_bitmap_migration();
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        send_bitmap_start(f, dbms);
> +    }
> +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> +
> +    return 0;
> +}
> +
> +static bool dirty_bitmap_is_active(void *opaque)
> +{
> +    return migrate_dirty_bitmaps();
> +}
> +
> +static bool dirty_bitmap_is_active_iterate(void *opaque)
> +{
> +    return dirty_bitmap_is_active(opaque) && !runstate_is_running();
> +}
> +

Fam
Dr. David Alan Gilbert Feb. 24, 2017, 1:26 p.m. UTC | #2
* Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> Postcopy migration of dirty bitmaps. Only named dirty bitmaps,
> associated with root nodes and non-root named nodes are migrated.
> 
> If destination qemu is already containing a dirty bitmap with the same name
> as a migrated bitmap (for the same node), than, if their granularities are
> the same the migration will be done, otherwise the error will be generated.
> 
> If destination qemu doesn't contain such bitmap it will be created.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>  include/migration/block.h      |   1 +
>  include/migration/migration.h  |   4 +
>  migration/Makefile.objs        |   2 +-
>  migration/block-dirty-bitmap.c | 679 +++++++++++++++++++++++++++++++++++++++++
>  migration/migration.c          |   3 +
>  migration/savevm.c             |   2 +
>  migration/trace-events         |  14 +
>  vl.c                           |   1 +
>  8 files changed, 705 insertions(+), 1 deletion(-)
>  create mode 100644 migration/block-dirty-bitmap.c
> 
> diff --git a/include/migration/block.h b/include/migration/block.h
> index 41a1ac8..8333c43 100644
> --- a/include/migration/block.h
> +++ b/include/migration/block.h
> @@ -14,6 +14,7 @@
>  #ifndef MIGRATION_BLOCK_H
>  #define MIGRATION_BLOCK_H
>  
> +void dirty_bitmap_mig_init(void);
>  void blk_mig_init(void);
>  int blk_mig_active(void);
>  uint64_t blk_mig_bytes_transferred(void);
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index 46645f4..03a4993 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -371,4 +371,8 @@ int ram_save_queue_pages(MigrationState *ms, const char *rbname,
>  PostcopyState postcopy_state_get(void);
>  /* Set the state and return the old state */
>  PostcopyState postcopy_state_set(PostcopyState new_state);
> +
> +void dirty_bitmap_mig_before_vm_start(void);
> +void init_dirty_bitmap_incoming_migration(void);
> +
>  #endif
> diff --git a/migration/Makefile.objs b/migration/Makefile.objs
> index 480dd49..fa3bf6a 100644
> --- a/migration/Makefile.objs
> +++ b/migration/Makefile.objs
> @@ -9,5 +9,5 @@ common-obj-y += qjson.o
>  
>  common-obj-$(CONFIG_RDMA) += rdma.o
>  
> -common-obj-y += block.o
> +common-obj-y += block.o block-dirty-bitmap.o
>  
> diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
> new file mode 100644
> index 0000000..28e3732
> --- /dev/null
> +++ b/migration/block-dirty-bitmap.c
> @@ -0,0 +1,679 @@
> +/*
> + * Block dirty bitmap postcopy migration
> + *
> + * Copyright IBM, Corp. 2009
> + * Copyright (C) 2016 Parallels IP Holdings GmbH. All rights reserved.
> + *
> + * Authors:
> + *  Liran Schour   <lirans@il.ibm.com>
> + *  Vladimir Sementsov-Ogievskiy <vsementsov@parallels.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + * This file is derived from migration/block.c, so it's author and IBM copyright
> + * are here, although content is quite different.
> + *
> + * Contributions after 2012-01-13 are licensed under the terms of the
> + * GNU GPL, version 2 or (at your option) any later version.
> + *
> + *                                ***
> + *
> + * Here postcopy migration of dirty bitmaps is realized. Only named dirty
> + * bitmaps, associated with root nodes and non-root named nodes are migrated.
> + *
> + * If destination qemu is already containing a dirty bitmap with the same name
> + * as a migrated bitmap (for the same node), then, if their granularities are
> + * the same the migration will be done, otherwise the error will be generated.
> + *
> + * If destination qemu doesn't contain such bitmap it will be created.
> + *
> + * format of migration:
> + *
> + * # Header (shared for different chunk types)
> + * 1, 2 or 4 bytes: flags (see qemu_{put,put}_flags)
> + * [ 1 byte: node name size ] \  flags & DEVICE_NAME
> + * [ n bytes: node name     ] /
> + * [ 1 byte: bitmap name size ] \  flags & BITMAP_NAME
> + * [ n bytes: bitmap name     ] /
> + *
> + * # Start of bitmap migration (flags & START)
> + * header
> + * be64: granularity
> + * 1 byte: bitmap enabled flag
> + *
> + * # Complete of bitmap migration (flags & COMPLETE)
> + * header
> + *
> + * # Data chunk of bitmap migration
> + * header
> + * be64: start sector
> + * be32: number of sectors
> + * [ be64: buffer size  ] \ ! (flags & ZEROES)
> + * [ n bytes: buffer    ] /
> + *
> + * The last chunk in stream should contain flags & EOS. The chunk may skip
> + * device and/or bitmap names, assuming them to be the same with the previous
> + * chunk.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "block/block.h"
> +#include "block/block_int.h"
> +#include "sysemu/block-backend.h"
> +#include "qemu/main-loop.h"
> +#include "qemu/error-report.h"
> +#include "migration/block.h"
> +#include "migration/migration.h"
> +#include "qemu/hbitmap.h"
> +#include "sysemu/sysemu.h"
> +#include "qemu/cutils.h"
> +#include "qapi/error.h"
> +#include "trace.h"
> +#include <assert.h>
> +
> +#define CHUNK_SIZE     (1 << 10)
> +
> +/* Flags occupy from one to four bytes. In all but one the 7-th (EXTRA_FLAGS)
> + * bit should be set. */
> +#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
> +#define DIRTY_BITMAP_MIG_FLAG_ZEROES        0x02
> +#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x04
> +#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x08
> +#define DIRTY_BITMAP_MIG_FLAG_START         0x10
> +#define DIRTY_BITMAP_MIG_FLAG_COMPLETE      0x20
> +#define DIRTY_BITMAP_MIG_FLAG_BITS          0x40
> +
> +#define DIRTY_BITMAP_MIG_EXTRA_FLAGS        0x80
> +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_16      0x8000
> +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_32      0x8080
> +
> +#define DEBUG_DIRTY_BITMAP_MIGRATION 0
> +
> +typedef struct DirtyBitmapMigBitmapState {
> +    /* Written during setup phase. */
> +    BlockDriverState *bs;
> +    const char *node_name;
> +    BdrvDirtyBitmap *bitmap;
> +    uint64_t total_sectors;
> +    uint64_t sectors_per_chunk;
> +    QSIMPLEQ_ENTRY(DirtyBitmapMigBitmapState) entry;
> +
> +    /* For bulk phase. */
> +    bool bulk_completed;
> +    uint64_t cur_sector;
> +} DirtyBitmapMigBitmapState;
> +
> +typedef struct DirtyBitmapMigState {
> +    QSIMPLEQ_HEAD(dbms_list, DirtyBitmapMigBitmapState) dbms_list;
> +
> +    bool bulk_completed;
> +
> +    /* for send_bitmap_bits() */
> +    BlockDriverState *prev_bs;
> +    BdrvDirtyBitmap *prev_bitmap;
> +} DirtyBitmapMigState;
> +
> +typedef struct DirtyBitmapLoadState {
> +    uint32_t flags;
> +    char node_name[256];
> +    char bitmap_name[256];
> +    BlockDriverState *bs;
> +    BdrvDirtyBitmap *bitmap;
> +} DirtyBitmapLoadState;
> +
> +static DirtyBitmapMigState dirty_bitmap_mig_state;
> +
> +typedef struct DirtyBitmapLoadBitmapState {
> +    BlockDriverState *bs;
> +    BdrvDirtyBitmap *bitmap;
> +    bool migrated;
> +} DirtyBitmapLoadBitmapState;
> +static GSList *enabled_bitmaps;
> +QemuMutex finish_lock;
> +
> +void init_dirty_bitmap_incoming_migration(void)
> +{
> +    qemu_mutex_init(&finish_lock);
> +}
> +
> +static uint32_t qemu_get_bitmap_flags(QEMUFile *f)
> +{
> +    uint8_t flags = qemu_get_byte(f);
> +    if (flags & DIRTY_BITMAP_MIG_EXTRA_FLAGS) {
> +        flags = flags << 8 | qemu_get_byte(f);
> +        if (flags & DIRTY_BITMAP_MIG_EXTRA_FLAGS) {
> +            flags = flags << 16 | qemu_get_be16(f);
> +        }
> +    }
> +
> +    return flags;
> +}
> +
> +static void qemu_put_bitmap_flags(QEMUFile *f, uint32_t flags)
> +{
> +    if (!(flags & 0xffffff00)) {
> +        qemu_put_byte(f, flags);
> +        return;
> +    }
> +
> +    if (!(flags & 0xffff0000)) {
> +        qemu_put_be16(f, flags | DIRTY_BITMAP_MIG_FLAGS_SIZE_16);
> +        return;
> +    }
> +
> +    qemu_put_be32(f, flags | DIRTY_BITMAP_MIG_FLAGS_SIZE_32);

Is this stuff worth the hastle? It would seem easier just to send
the 32bits each time;  the overhead doesn't seem to be much except
in the case where you send minimum size chunks a lot.

Dave

> +}
> +
> +static void send_bitmap_header(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
> +                               uint32_t additional_flags)
> +{
> +    BlockDriverState *bs = dbms->bs;
> +    BdrvDirtyBitmap *bitmap = dbms->bitmap;
> +    uint32_t flags = additional_flags;
> +    trace_send_bitmap_header_enter();
> +
> +    if (bs != dirty_bitmap_mig_state.prev_bs) {
> +        dirty_bitmap_mig_state.prev_bs = bs;
> +        flags |= DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME;
> +    }
> +
> +    if (bitmap != dirty_bitmap_mig_state.prev_bitmap) {
> +        dirty_bitmap_mig_state.prev_bitmap = bitmap;
> +        flags |= DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME;
> +    }
> +
> +    qemu_put_bitmap_flags(f, flags);
> +
> +    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
> +        qemu_put_counted_string(f, dbms->node_name);
> +    }
> +
> +    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
> +        qemu_put_counted_string(f, bdrv_dirty_bitmap_name(bitmap));
> +    }
> +}
> +
> +static void send_bitmap_start(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
> +{
> +    send_bitmap_header(f, dbms, DIRTY_BITMAP_MIG_FLAG_START);
> +    qemu_put_be32(f, bdrv_dirty_bitmap_granularity(dbms->bitmap));
> +    qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
> +}
> +
> +static void send_bitmap_complete(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
> +{
> +    send_bitmap_header(f, dbms, DIRTY_BITMAP_MIG_FLAG_COMPLETE);
> +}
> +
> +static void send_bitmap_bits(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
> +                             uint64_t start_sector, uint32_t nr_sectors)
> +{
> +    /* align for buffer_is_zero() */
> +    uint64_t align = 4 * sizeof(long);
> +    uint64_t unaligned_size =
> +        bdrv_dirty_bitmap_serialization_size(dbms->bitmap,
> +                                             start_sector, nr_sectors);
> +    uint64_t buf_size = (unaligned_size + align - 1) & ~(align - 1);
> +    uint8_t *buf = g_malloc0(buf_size);
> +    uint32_t flags = DIRTY_BITMAP_MIG_FLAG_BITS;
> +
> +    bdrv_dirty_bitmap_serialize_part(dbms->bitmap, buf,
> +                                     start_sector, nr_sectors);
> +
> +    if (buffer_is_zero(buf, buf_size)) {
> +        g_free(buf);
> +        buf = NULL;
> +        flags |= DIRTY_BITMAP_MIG_FLAG_ZEROES;
> +    }
> +
> +    trace_send_bitmap_bits(flags, start_sector, nr_sectors, buf_size);
> +
> +    send_bitmap_header(f, dbms, flags);
> +
> +    qemu_put_be64(f, start_sector);
> +    qemu_put_be32(f, nr_sectors);
> +
> +    /* if a block is zero we need to flush here since the network
> +     * bandwidth is now a lot higher than the storage device bandwidth.
> +     * thus if we queue zero blocks we slow down the migration. */
> +    if (flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
> +        qemu_fflush(f);
> +    } else {
> +        qemu_put_be64(f, buf_size);
> +        qemu_put_buffer(f, buf, buf_size);
> +    }
> +
> +    g_free(buf);
> +}
> +
> +
> +/* Called with iothread lock taken.  */
> +
> +static void init_dirty_bitmap_migration(void)
> +{
> +    BlockDriverState *bs;
> +    BdrvDirtyBitmap *bitmap;
> +    DirtyBitmapMigBitmapState *dbms;
> +    BdrvNextIterator it;
> +    uint64_t total_bytes = 0;
> +
> +    dirty_bitmap_mig_state.bulk_completed = false;
> +    dirty_bitmap_mig_state.prev_bs = NULL;
> +    dirty_bitmap_mig_state.prev_bitmap = NULL;
> +
> +    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> +        for (bitmap = bdrv_next_dirty_bitmap(bs, NULL); bitmap;
> +             bitmap = bdrv_next_dirty_bitmap(bs, bitmap)) {
> +            if (!bdrv_dirty_bitmap_name(bitmap)) {
> +                continue;
> +            }
> +
> +            if (!bdrv_get_device_or_node_name(bs)) {
> +                /* not named non-root node */
> +                continue;
> +            }
> +
> +            dbms = g_new0(DirtyBitmapMigBitmapState, 1);
> +            dbms->bs = bs;
> +            dbms->node_name = bdrv_get_node_name(bs);
> +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
> +                dbms->node_name = bdrv_get_device_name(bs);
> +            }
> +            dbms->bitmap = bitmap;
> +            dbms->total_sectors = bdrv_nb_sectors(bs);
> +            dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
> +                bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS;
> +
> +            total_bytes +=
> +                bdrv_dirty_bitmap_serialization_size(bitmap,
> +                                                     0, dbms->total_sectors);
> +
> +            QSIMPLEQ_INSERT_TAIL(&dirty_bitmap_mig_state.dbms_list,
> +                                 dbms, entry);
> +        }
> +    }
> +}
> +
> +/* Called with no lock taken.  */
> +static void bulk_phase_send_chunk(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
> +{
> +    uint32_t nr_sectors = MIN(dbms->total_sectors - dbms->cur_sector,
> +                             dbms->sectors_per_chunk);
> +
> +    send_bitmap_bits(f, dbms, dbms->cur_sector, nr_sectors);
> +
> +    dbms->cur_sector += nr_sectors;
> +    if (dbms->cur_sector >= dbms->total_sectors) {
> +        dbms->bulk_completed = true;
> +    }
> +}
> +
> +/* Called with no lock taken.  */
> +static void bulk_phase(QEMUFile *f, bool limit)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        while (!dbms->bulk_completed) {
> +            bulk_phase_send_chunk(f, dbms);
> +            if (limit && qemu_file_rate_limit(f)) {
> +                return;
> +            }
> +        }
> +    }
> +
> +    dirty_bitmap_mig_state.bulk_completed = true;
> +}
> +
> +/* Called with iothread lock taken.  */
> +static void dirty_bitmap_mig_cleanup(void)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +
> +    while ((dbms = QSIMPLEQ_FIRST(&dirty_bitmap_mig_state.dbms_list)) != NULL) {
> +        QSIMPLEQ_REMOVE_HEAD(&dirty_bitmap_mig_state.dbms_list, entry);
> +        g_free(dbms);
> +    }
> +}
> +
> +/* for SaveVMHandlers */
> +static void dirty_bitmap_migration_cleanup(void *opaque)
> +{
> +    dirty_bitmap_mig_cleanup();
> +}
> +
> +static int dirty_bitmap_save_iterate(QEMUFile *f, void *opaque)
> +{
> +    trace_dirty_bitmap_save_iterate(
> +            migration_in_postcopy(migrate_get_current()));
> +
> +    if (migration_in_postcopy(migrate_get_current()) &&
> +        !dirty_bitmap_mig_state.bulk_completed) {
> +        bulk_phase(f, true);
> +    }
> +
> +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> +
> +    return dirty_bitmap_mig_state.bulk_completed;
> +}
> +
> +/* Called with iothread lock taken.  */
> +
> +static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +    trace_dirty_bitmap_save_complete_enter();
> +
> +    if (!dirty_bitmap_mig_state.bulk_completed) {
> +        bulk_phase(f, false);
> +    }
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        send_bitmap_complete(f, dbms);
> +    }
> +
> +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> +
> +    trace_dirty_bitmap_save_complete_finish();
> +
> +    dirty_bitmap_mig_cleanup();
> +    return 0;
> +}
> +
> +static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
> +                                      uint64_t max_size,
> +                                      uint64_t *res_precopy_only,
> +                                      uint64_t *res_compatible,
> +                                      uint64_t *res_postcopy_only)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +    uint64_t pending = 0;
> +
> +    qemu_mutex_lock_iothread();
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        uint64_t gran = bdrv_dirty_bitmap_granularity(dbms->bitmap);
> +        uint64_t sectors = dbms->bulk_completed ? 0 :
> +                           dbms->total_sectors - dbms->cur_sector;
> +
> +        pending += (sectors * BDRV_SECTOR_SIZE + gran - 1) / gran;
> +    }
> +
> +    qemu_mutex_unlock_iothread();
> +
> +    trace_dirty_bitmap_save_pending(pending, max_size);
> +
> +    *res_postcopy_only += pending;
> +}
> +
> +/* First occurrence of this bitmap. It should be created if doesn't exist */
> +static int dirty_bitmap_load_start(QEMUFile *f, DirtyBitmapLoadState *s)
> +{
> +    Error *local_err = NULL;
> +    uint32_t granularity = qemu_get_be32(f);
> +    bool enabled = qemu_get_byte(f);
> +
> +    if (!s->bitmap) {
> +        s->bitmap = bdrv_create_dirty_bitmap(s->bs, granularity,
> +                                             s->bitmap_name, &local_err);
> +        if (!s->bitmap) {
> +            error_report_err(local_err);
> +            return -EINVAL;
> +        }
> +    } else {
> +        uint32_t dest_granularity =
> +            bdrv_dirty_bitmap_granularity(s->bitmap);
> +        if (dest_granularity != granularity) {
> +            fprintf(stderr,
> +                    "Error: "
> +                    "Migrated bitmap granularity (%" PRIu32 ") "
> +                    "doesn't match the destination bitmap '%s' "
> +                    "granularity (%" PRIu32 ")\n",
> +                    granularity,
> +                    bdrv_dirty_bitmap_name(s->bitmap),
> +                    dest_granularity);
> +            return -EINVAL;
> +        }
> +    }
> +
> +    bdrv_disable_dirty_bitmap(s->bitmap);
> +    if (enabled) {
> +        DirtyBitmapLoadBitmapState *b;
> +
> +        bdrv_dirty_bitmap_create_successor(s->bs, s->bitmap, &local_err);
> +        if (local_err) {
> +            error_report_err(local_err);
> +            return -EINVAL;
> +        }
> +
> +        b = g_new(DirtyBitmapLoadBitmapState, 1);
> +        b->bs = s->bs;
> +        b->bitmap = s->bitmap;
> +        b->migrated = false;
> +        enabled_bitmaps = g_slist_prepend(enabled_bitmaps, b);
> +    }
> +
> +    return 0;
> +}
> +
> +void dirty_bitmap_mig_before_vm_start(void)
> +{
> +    GSList *item;
> +
> +    qemu_mutex_lock(&finish_lock);
> +
> +    for (item = enabled_bitmaps; item; item = g_slist_next(item)) {
> +        DirtyBitmapLoadBitmapState *b = item->data;
> +
> +        if (b->migrated) {
> +            bdrv_enable_dirty_bitmap(b->bitmap);
> +        } else {
> +            bdrv_dirty_bitmap_enable_successor(b->bitmap);
> +        }
> +
> +        g_free(b);
> +    }
> +
> +    g_slist_free(enabled_bitmaps);
> +    enabled_bitmaps = NULL;
> +
> +    qemu_mutex_unlock(&finish_lock);
> +}
> +
> +static void dirty_bitmap_load_complete(QEMUFile *f, DirtyBitmapLoadState *s)
> +{
> +    GSList *item;
> +    trace_dirty_bitmap_load_complete();
> +    bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
> +
> +    qemu_mutex_lock(&finish_lock);
> +
> +    for (item = enabled_bitmaps; item; item = g_slist_next(item)) {
> +        DirtyBitmapLoadBitmapState *b = item->data;
> +
> +        if (b->bitmap == s->bitmap) {
> +            b->migrated = true;
> +        }
> +    }
> +
> +    if (bdrv_dirty_bitmap_frozen(s->bitmap)) {
> +        if (enabled_bitmaps == NULL) {
> +            /* in postcopy */
> +            AioContext *aio_context = bdrv_get_aio_context(s->bs);
> +            aio_context_acquire(aio_context);
> +
> +            bdrv_reclaim_dirty_bitmap(s->bs, s->bitmap, &error_abort);
> +            bdrv_enable_dirty_bitmap(s->bitmap);
> +
> +            aio_context_release(aio_context);
> +        } else {
> +            /* target not started, successor is empty */
> +            bdrv_dirty_bitmap_release_successor(s->bs, s->bitmap);
> +        }
> +    }
> +
> +    qemu_mutex_unlock(&finish_lock);
> +}
> +
> +static int dirty_bitmap_load_bits(QEMUFile *f, DirtyBitmapLoadState *s)
> +{
> +    uint64_t first_sector = qemu_get_be64(f);
> +    uint32_t nr_sectors = qemu_get_be32(f);
> +    trace_dirty_bitmap_load_bits_enter(first_sector, nr_sectors);
> +
> +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
> +        trace_dirty_bitmap_load_bits_zeroes();
> +        bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_sector,
> +                                             nr_sectors, false);
> +    } else {
> +        uint8_t *buf;
> +        uint64_t buf_size = qemu_get_be64(f);
> +        uint64_t needed_size =
> +            bdrv_dirty_bitmap_serialization_size(s->bitmap,
> +                                                 first_sector, nr_sectors);
> +
> +        if (needed_size > buf_size) {
> +            fprintf(stderr,
> +                    "Error: Migrated bitmap granularity doesn't "
> +                    "match the destination bitmap '%s' granularity\n",
> +                    bdrv_dirty_bitmap_name(s->bitmap));
> +            return -EINVAL;
> +        }
> +
> +        buf = g_malloc(buf_size);
> +        qemu_get_buffer(f, buf, buf_size);
> +        bdrv_dirty_bitmap_deserialize_part(s->bitmap, buf,
> +                                           first_sector,
> +                                           nr_sectors, false);
> +        g_free(buf);
> +    }
> +
> +    return 0;
> +}
> +
> +static int dirty_bitmap_load_header(QEMUFile *f, DirtyBitmapLoadState *s)
> +{
> +    Error *local_err = NULL;
> +    s->flags = qemu_get_bitmap_flags(f);
> +    trace_dirty_bitmap_load_header(s->flags);
> +
> +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
> +        if (!qemu_get_counted_string(f, s->node_name)) {
> +            fprintf(stderr, "Unable to read node name string\n");
> +            return -EINVAL;
> +        }
> +        s->bs = bdrv_lookup_bs(s->node_name, s->node_name, &local_err);
> +        if (!s->bs) {
> +            error_report("%s", error_get_pretty(local_err));
> +            error_free(local_err);
> +            return -EINVAL;
> +        }
> +    } else if (!s->bs) {
> +        fprintf(stderr, "Error: block device name is not set\n");
> +        return -EINVAL;
> +    }
> +
> +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
> +        if (!qemu_get_counted_string(f, s->bitmap_name)) {
> +            fprintf(stderr, "Unable to read node name string\n");
> +            return -EINVAL;
> +        }
> +        s->bitmap = bdrv_find_dirty_bitmap(s->bs, s->bitmap_name);
> +
> +        /* bitmap may be NULL here, it wouldn't be an error if it is the
> +         * first occurrence of the bitmap */
> +        if (!s->bitmap && !(s->flags & DIRTY_BITMAP_MIG_FLAG_START)) {
> +            fprintf(stderr, "Error: unknown dirty bitmap "
> +                    "'%s' for block device '%s'\n",
> +                    s->bitmap_name, s->node_name);
> +            return -EINVAL;
> +        }
> +    } else if (!s->bitmap) {
> +        fprintf(stderr, "Error: block device name is not set\n");
> +        return -EINVAL;
> +    }
> +
> +    return 0;
> +}
> +
> +static int dirty_bitmap_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    static DirtyBitmapLoadState s;
> +
> +    int ret = 0;
> +
> +    trace_dirty_bitmap_load_enter();
> +
> +    do {
> +        dirty_bitmap_load_header(f, &s);
> +
> +        if (s.flags & DIRTY_BITMAP_MIG_FLAG_START) {
> +            ret = dirty_bitmap_load_start(f, &s);
> +        } else if (s.flags & DIRTY_BITMAP_MIG_FLAG_COMPLETE) {
> +            dirty_bitmap_load_complete(f, &s);
> +        } else if (s.flags & DIRTY_BITMAP_MIG_FLAG_BITS) {
> +            ret = dirty_bitmap_load_bits(f, &s);
> +        }
> +
> +        if (!ret) {
> +            ret = qemu_file_get_error(f);
> +        }
> +
> +        if (ret) {
> +            return ret;
> +        }
> +    } while (!(s.flags & DIRTY_BITMAP_MIG_FLAG_EOS));
> +
> +    trace_dirty_bitmap_load_success();
> +    return 0;
> +}
> +
> +static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
> +{
> +    DirtyBitmapMigBitmapState *dbms = NULL;
> +    init_dirty_bitmap_migration();
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        send_bitmap_start(f, dbms);
> +    }
> +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> +
> +    return 0;
> +}
> +
> +static bool dirty_bitmap_is_active(void *opaque)
> +{
> +    return migrate_dirty_bitmaps();
> +}
> +
> +static bool dirty_bitmap_is_active_iterate(void *opaque)
> +{
> +    return dirty_bitmap_is_active(opaque) && !runstate_is_running();
> +}
> +
> +static bool dirty_bitmap_has_postcopy(void *opaque)
> +{
> +    return true;
> +}
> +
> +static SaveVMHandlers savevm_dirty_bitmap_handlers = {
> +    .save_live_setup = dirty_bitmap_save_setup,
> +    .save_live_complete_postcopy = dirty_bitmap_save_complete,
> +    .save_live_complete_precopy = dirty_bitmap_save_complete,
> +    .has_postcopy = dirty_bitmap_has_postcopy,
> +    .save_live_pending = dirty_bitmap_save_pending,
> +    .save_live_iterate = dirty_bitmap_save_iterate,
> +    .is_active_iterate = dirty_bitmap_is_active_iterate,
> +    .load_state = dirty_bitmap_load,
> +    .cleanup = dirty_bitmap_migration_cleanup,
> +    .is_active = dirty_bitmap_is_active,
> +};
> +
> +void dirty_bitmap_mig_init(void)
> +{
> +    QSIMPLEQ_INIT(&dirty_bitmap_mig_state.dbms_list);
> +
> +    register_savevm_live(NULL, "dirty-bitmap", 0, 1,
> +                         &savevm_dirty_bitmap_handlers,
> +                         &dirty_bitmap_mig_state);
> +}
> diff --git a/migration/migration.c b/migration/migration.c
> index 179bd04..edf5623 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -122,6 +122,9 @@ MigrationIncomingState *migration_incoming_get_current(void)
>          QLIST_INIT(&mis_current.loadvm_handlers);
>          qemu_mutex_init(&mis_current.rp_mutex);
>          qemu_event_init(&mis_current.main_thread_load_event, false);
> +
> +        init_dirty_bitmap_incoming_migration();
> +
>          once = true;
>      }
>      return &mis_current;
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 54572ef..927a680 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1651,6 +1651,8 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)
>  
>      trace_loadvm_postcopy_handle_run_vmstart();
>  
> +    dirty_bitmap_mig_before_vm_start();
> +
>      if (autostart) {
>          /* Hold onto your hats, starting the CPU */
>          vm_start();
> diff --git a/migration/trace-events b/migration/trace-events
> index 0212929..38fca41 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -222,3 +222,17 @@ colo_vm_state_change(const char *old, const char *new) "Change '%s' => '%s'"
>  colo_send_message(const char *msg) "Send '%s' message"
>  colo_receive_message(const char *msg) "Receive '%s' message"
>  colo_failover_set_state(const char *new_state) "new state %s"
> +
> +# migration/block-dirty-bitmap.c
> +send_bitmap_header_enter(void) ""
> +send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uint64_t data_size) "\n   flags:        %x\n   start_sector: %" PRIu64 "\n   nr_sectors:   %" PRIu32 "\n   data_size:    %" PRIu64 "\n"
> +dirty_bitmap_save_iterate(int in_postcopy) "in postcopy: %d"
> +dirty_bitmap_save_complete_enter(void) ""
> +dirty_bitmap_save_complete_finish(void) ""
> +dirty_bitmap_save_pending(uint64_t pending, uint64_t max_size) "pending %" PRIu64 " max: %" PRIu64
> +dirty_bitmap_load_complete(void) ""
> +dirty_bitmap_load_bits_enter(uint64_t first_sector, uint32_t nr_sectors) "chunk: %" PRIu64 " %" PRIu32
> +dirty_bitmap_load_bits_zeroes(void) ""
> +dirty_bitmap_load_header(uint32_t flags) "flags %x"
> +dirty_bitmap_load_enter(void) ""
> +dirty_bitmap_load_success(void) ""
> diff --git a/vl.c b/vl.c
> index b4eaf03..f1ee9ff 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -4405,6 +4405,7 @@ int main(int argc, char **argv, char **envp)
>  
>      blk_mig_init();
>      ram_mig_init();
> +    dirty_bitmap_mig_init();
>  
>      /* If the currently selected machine wishes to override the units-per-bus
>       * property of its default HBA interface type, do so now. */
> -- 
> 1.8.3.1
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Vladimir Sementsov-Ogievskiy Feb. 25, 2017, 5:25 p.m. UTC | #3
24.02.2017 16:26, Dr. David Alan Gilbert wrote:
> * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
>> Postcopy migration of dirty bitmaps. Only named dirty bitmaps,
>> associated with root nodes and non-root named nodes are migrated.
>>
>> If destination qemu is already containing a dirty bitmap with the same name
>> as a migrated bitmap (for the same node), than, if their granularities are
>> the same the migration will be done, otherwise the error will be generated.
>>
>> If destination qemu doesn't contain such bitmap it will be created.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>   include/migration/block.h      |   1 +
>>   include/migration/migration.h  |   4 +
>>   migration/Makefile.objs        |   2 +-
>>   migration/block-dirty-bitmap.c | 679 +++++++++++++++++++++++++++++++++++++++++
>>   migration/migration.c          |   3 +
>>   migration/savevm.c             |   2 +
>>   migration/trace-events         |  14 +
>>   vl.c                           |   1 +
>>   8 files changed, 705 insertions(+), 1 deletion(-)
>>   create mode 100644 migration/block-dirty-bitmap.c
>>
>> diff --git a/include/migration/block.h b/include/migration/block.h
>> index 41a1ac8..8333c43 100644
>> --- a/include/migration/block.h
>> +++ b/include/migration/block.h
>> @@ -14,6 +14,7 @@
>>   #ifndef MIGRATION_BLOCK_H
>>   #define MIGRATION_BLOCK_H
>>   
>> +void dirty_bitmap_mig_init(void);
>>   void blk_mig_init(void);
>>   int blk_mig_active(void);
>>   uint64_t blk_mig_bytes_transferred(void);
>> diff --git a/include/migration/migration.h b/include/migration/migration.h
>> index 46645f4..03a4993 100644
>> --- a/include/migration/migration.h
>> +++ b/include/migration/migration.h
>> @@ -371,4 +371,8 @@ int ram_save_queue_pages(MigrationState *ms, const char *rbname,
>>   PostcopyState postcopy_state_get(void);
>>   /* Set the state and return the old state */
>>   PostcopyState postcopy_state_set(PostcopyState new_state);
>> +
>> +void dirty_bitmap_mig_before_vm_start(void);
>> +void init_dirty_bitmap_incoming_migration(void);
>> +
>>   #endif
>> diff --git a/migration/Makefile.objs b/migration/Makefile.objs
>> index 480dd49..fa3bf6a 100644
>> --- a/migration/Makefile.objs
>> +++ b/migration/Makefile.objs
>> @@ -9,5 +9,5 @@ common-obj-y += qjson.o
>>   
>>   common-obj-$(CONFIG_RDMA) += rdma.o
>>   
>> -common-obj-y += block.o
>> +common-obj-y += block.o block-dirty-bitmap.o
>>   
>> diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
>> new file mode 100644
>> index 0000000..28e3732
>> --- /dev/null
>> +++ b/migration/block-dirty-bitmap.c
>> @@ -0,0 +1,679 @@
>> +/*
>> + * Block dirty bitmap postcopy migration
>> + *
>> + * Copyright IBM, Corp. 2009
>> + * Copyright (C) 2016 Parallels IP Holdings GmbH. All rights reserved.
>> + *
>> + * Authors:
>> + *  Liran Schour   <lirans@il.ibm.com>
>> + *  Vladimir Sementsov-Ogievskiy <vsementsov@parallels.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>> + * the COPYING file in the top-level directory.
>> + * This file is derived from migration/block.c, so it's author and IBM copyright
>> + * are here, although content is quite different.
>> + *
>> + * Contributions after 2012-01-13 are licensed under the terms of the
>> + * GNU GPL, version 2 or (at your option) any later version.
>> + *
>> + *                                ***
>> + *
>> + * Here postcopy migration of dirty bitmaps is realized. Only named dirty
>> + * bitmaps, associated with root nodes and non-root named nodes are migrated.
>> + *
>> + * If destination qemu is already containing a dirty bitmap with the same name
>> + * as a migrated bitmap (for the same node), then, if their granularities are
>> + * the same the migration will be done, otherwise the error will be generated.
>> + *
>> + * If destination qemu doesn't contain such bitmap it will be created.
>> + *
>> + * format of migration:
>> + *
>> + * # Header (shared for different chunk types)
>> + * 1, 2 or 4 bytes: flags (see qemu_{put,put}_flags)
>> + * [ 1 byte: node name size ] \  flags & DEVICE_NAME
>> + * [ n bytes: node name     ] /
>> + * [ 1 byte: bitmap name size ] \  flags & BITMAP_NAME
>> + * [ n bytes: bitmap name     ] /
>> + *
>> + * # Start of bitmap migration (flags & START)
>> + * header
>> + * be64: granularity
>> + * 1 byte: bitmap enabled flag
>> + *
>> + * # Complete of bitmap migration (flags & COMPLETE)
>> + * header
>> + *
>> + * # Data chunk of bitmap migration
>> + * header
>> + * be64: start sector
>> + * be32: number of sectors
>> + * [ be64: buffer size  ] \ ! (flags & ZEROES)
>> + * [ n bytes: buffer    ] /
>> + *
>> + * The last chunk in stream should contain flags & EOS. The chunk may skip
>> + * device and/or bitmap names, assuming them to be the same with the previous
>> + * chunk.
>> + */
>> +
>> +#include "qemu/osdep.h"
>> +#include "block/block.h"
>> +#include "block/block_int.h"
>> +#include "sysemu/block-backend.h"
>> +#include "qemu/main-loop.h"
>> +#include "qemu/error-report.h"
>> +#include "migration/block.h"
>> +#include "migration/migration.h"
>> +#include "qemu/hbitmap.h"
>> +#include "sysemu/sysemu.h"
>> +#include "qemu/cutils.h"
>> +#include "qapi/error.h"
>> +#include "trace.h"
>> +#include <assert.h>
>> +
>> +#define CHUNK_SIZE     (1 << 10)
>> +
>> +/* Flags occupy from one to four bytes. In all but one the 7-th (EXTRA_FLAGS)
>> + * bit should be set. */
>> +#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
>> +#define DIRTY_BITMAP_MIG_FLAG_ZEROES        0x02
>> +#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x04
>> +#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x08
>> +#define DIRTY_BITMAP_MIG_FLAG_START         0x10
>> +#define DIRTY_BITMAP_MIG_FLAG_COMPLETE      0x20
>> +#define DIRTY_BITMAP_MIG_FLAG_BITS          0x40
>> +
>> +#define DIRTY_BITMAP_MIG_EXTRA_FLAGS        0x80
>> +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_16      0x8000
>> +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_32      0x8080
>> +
>> +#define DEBUG_DIRTY_BITMAP_MIGRATION 0
>> +
>> +typedef struct DirtyBitmapMigBitmapState {
>> +    /* Written during setup phase. */
>> +    BlockDriverState *bs;
>> +    const char *node_name;
>> +    BdrvDirtyBitmap *bitmap;
>> +    uint64_t total_sectors;
>> +    uint64_t sectors_per_chunk;
>> +    QSIMPLEQ_ENTRY(DirtyBitmapMigBitmapState) entry;
>> +
>> +    /* For bulk phase. */
>> +    bool bulk_completed;
>> +    uint64_t cur_sector;
>> +} DirtyBitmapMigBitmapState;
>> +
>> +typedef struct DirtyBitmapMigState {
>> +    QSIMPLEQ_HEAD(dbms_list, DirtyBitmapMigBitmapState) dbms_list;
>> +
>> +    bool bulk_completed;
>> +
>> +    /* for send_bitmap_bits() */
>> +    BlockDriverState *prev_bs;
>> +    BdrvDirtyBitmap *prev_bitmap;
>> +} DirtyBitmapMigState;
>> +
>> +typedef struct DirtyBitmapLoadState {
>> +    uint32_t flags;
>> +    char node_name[256];
>> +    char bitmap_name[256];
>> +    BlockDriverState *bs;
>> +    BdrvDirtyBitmap *bitmap;
>> +} DirtyBitmapLoadState;
>> +
>> +static DirtyBitmapMigState dirty_bitmap_mig_state;
>> +
>> +typedef struct DirtyBitmapLoadBitmapState {
>> +    BlockDriverState *bs;
>> +    BdrvDirtyBitmap *bitmap;
>> +    bool migrated;
>> +} DirtyBitmapLoadBitmapState;
>> +static GSList *enabled_bitmaps;
>> +QemuMutex finish_lock;
>> +
>> +void init_dirty_bitmap_incoming_migration(void)
>> +{
>> +    qemu_mutex_init(&finish_lock);
>> +}
>> +
>> +static uint32_t qemu_get_bitmap_flags(QEMUFile *f)
>> +{
>> +    uint8_t flags = qemu_get_byte(f);
>> +    if (flags & DIRTY_BITMAP_MIG_EXTRA_FLAGS) {
>> +        flags = flags << 8 | qemu_get_byte(f);
>> +        if (flags & DIRTY_BITMAP_MIG_EXTRA_FLAGS) {
>> +            flags = flags << 16 | qemu_get_be16(f);
>> +        }
>> +    }
>> +
>> +    return flags;
>> +}
>> +
>> +static void qemu_put_bitmap_flags(QEMUFile *f, uint32_t flags)
>> +{
>> +    if (!(flags & 0xffffff00)) {
>> +        qemu_put_byte(f, flags);
>> +        return;
>> +    }
>> +
>> +    if (!(flags & 0xffff0000)) {
>> +        qemu_put_be16(f, flags | DIRTY_BITMAP_MIG_FLAGS_SIZE_16);
>> +        return;
>> +    }
>> +
>> +    qemu_put_be32(f, flags | DIRTY_BITMAP_MIG_FLAGS_SIZE_32);
> Is this stuff worth the hastle? It would seem easier just to send
> the 32bits each time;  the overhead doesn't seem to be much except
> in the case where you send minimum size chunks a lot.

I've started with one byte of flags (I don't need more) and I've 
implemented this after proposal by John Snow:
> I might recommend reserving the last bit of the second byte to be a 
> flag such as DIRTY_BITMAP_EXTRA_FLAGS that indicates the presence of 
> additional byte(s) of flags, to be determined later, if we ever need 
> them, but two bytes for now should be sufficient.

And I think it is not bad. Code is a bit more complex, yes, but why 
should we send 4 bytes with every chunk when (very likely) we will never 
use more than one?

>
> Dave
>
>> +}
>> +
>> +static void send_bitmap_header(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
>> +                               uint32_t additional_flags)
>> +{
>> +    BlockDriverState *bs = dbms->bs;
>> +    BdrvDirtyBitmap *bitmap = dbms->bitmap;
>> +    uint32_t flags = additional_flags;
>> +    trace_send_bitmap_header_enter();
>> +
>> +    if (bs != dirty_bitmap_mig_state.prev_bs) {
>> +        dirty_bitmap_mig_state.prev_bs = bs;
>> +        flags |= DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME;
>> +    }
>> +
>> +    if (bitmap != dirty_bitmap_mig_state.prev_bitmap) {
>> +        dirty_bitmap_mig_state.prev_bitmap = bitmap;
>> +        flags |= DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME;
>> +    }
>> +
>> +    qemu_put_bitmap_flags(f, flags);
>> +
>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
>> +        qemu_put_counted_string(f, dbms->node_name);
>> +    }
>> +
>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
>> +        qemu_put_counted_string(f, bdrv_dirty_bitmap_name(bitmap));
>> +    }
>> +}
>> +
>> +static void send_bitmap_start(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
>> +{
>> +    send_bitmap_header(f, dbms, DIRTY_BITMAP_MIG_FLAG_START);
>> +    qemu_put_be32(f, bdrv_dirty_bitmap_granularity(dbms->bitmap));
>> +    qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
>> +}
>> +
>> +static void send_bitmap_complete(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
>> +{
>> +    send_bitmap_header(f, dbms, DIRTY_BITMAP_MIG_FLAG_COMPLETE);
>> +}
>> +
>> +static void send_bitmap_bits(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
>> +                             uint64_t start_sector, uint32_t nr_sectors)
>> +{
>> +    /* align for buffer_is_zero() */
>> +    uint64_t align = 4 * sizeof(long);
>> +    uint64_t unaligned_size =
>> +        bdrv_dirty_bitmap_serialization_size(dbms->bitmap,
>> +                                             start_sector, nr_sectors);
>> +    uint64_t buf_size = (unaligned_size + align - 1) & ~(align - 1);
>> +    uint8_t *buf = g_malloc0(buf_size);
>> +    uint32_t flags = DIRTY_BITMAP_MIG_FLAG_BITS;
>> +
>> +    bdrv_dirty_bitmap_serialize_part(dbms->bitmap, buf,
>> +                                     start_sector, nr_sectors);
>> +
>> +    if (buffer_is_zero(buf, buf_size)) {
>> +        g_free(buf);
>> +        buf = NULL;
>> +        flags |= DIRTY_BITMAP_MIG_FLAG_ZEROES;
>> +    }
>> +
>> +    trace_send_bitmap_bits(flags, start_sector, nr_sectors, buf_size);
>> +
>> +    send_bitmap_header(f, dbms, flags);
>> +
>> +    qemu_put_be64(f, start_sector);
>> +    qemu_put_be32(f, nr_sectors);
>> +
>> +    /* if a block is zero we need to flush here since the network
>> +     * bandwidth is now a lot higher than the storage device bandwidth.
>> +     * thus if we queue zero blocks we slow down the migration. */
>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
>> +        qemu_fflush(f);
>> +    } else {
>> +        qemu_put_be64(f, buf_size);
>> +        qemu_put_buffer(f, buf, buf_size);
>> +    }
>> +
>> +    g_free(buf);
>> +}
>> +
>> +
>> +/* Called with iothread lock taken.  */
>> +
>> +static void init_dirty_bitmap_migration(void)
>> +{
>> +    BlockDriverState *bs;
>> +    BdrvDirtyBitmap *bitmap;
>> +    DirtyBitmapMigBitmapState *dbms;
>> +    BdrvNextIterator it;
>> +    uint64_t total_bytes = 0;
>> +
>> +    dirty_bitmap_mig_state.bulk_completed = false;
>> +    dirty_bitmap_mig_state.prev_bs = NULL;
>> +    dirty_bitmap_mig_state.prev_bitmap = NULL;
>> +
>> +    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
>> +        for (bitmap = bdrv_next_dirty_bitmap(bs, NULL); bitmap;
>> +             bitmap = bdrv_next_dirty_bitmap(bs, bitmap)) {
>> +            if (!bdrv_dirty_bitmap_name(bitmap)) {
>> +                continue;
>> +            }
>> +
>> +            if (!bdrv_get_device_or_node_name(bs)) {
>> +                /* not named non-root node */
>> +                continue;
>> +            }
>> +
>> +            dbms = g_new0(DirtyBitmapMigBitmapState, 1);
>> +            dbms->bs = bs;
>> +            dbms->node_name = bdrv_get_node_name(bs);
>> +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
>> +                dbms->node_name = bdrv_get_device_name(bs);
>> +            }
>> +            dbms->bitmap = bitmap;
>> +            dbms->total_sectors = bdrv_nb_sectors(bs);
>> +            dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
>> +                bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS;
>> +
>> +            total_bytes +=
>> +                bdrv_dirty_bitmap_serialization_size(bitmap,
>> +                                                     0, dbms->total_sectors);
>> +
>> +            QSIMPLEQ_INSERT_TAIL(&dirty_bitmap_mig_state.dbms_list,
>> +                                 dbms, entry);
>> +        }
>> +    }
>> +}
>> +
>> +/* Called with no lock taken.  */
>> +static void bulk_phase_send_chunk(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
>> +{
>> +    uint32_t nr_sectors = MIN(dbms->total_sectors - dbms->cur_sector,
>> +                             dbms->sectors_per_chunk);
>> +
>> +    send_bitmap_bits(f, dbms, dbms->cur_sector, nr_sectors);
>> +
>> +    dbms->cur_sector += nr_sectors;
>> +    if (dbms->cur_sector >= dbms->total_sectors) {
>> +        dbms->bulk_completed = true;
>> +    }
>> +}
>> +
>> +/* Called with no lock taken.  */
>> +static void bulk_phase(QEMUFile *f, bool limit)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +
>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>> +        while (!dbms->bulk_completed) {
>> +            bulk_phase_send_chunk(f, dbms);
>> +            if (limit && qemu_file_rate_limit(f)) {
>> +                return;
>> +            }
>> +        }
>> +    }
>> +
>> +    dirty_bitmap_mig_state.bulk_completed = true;
>> +}
>> +
>> +/* Called with iothread lock taken.  */
>> +static void dirty_bitmap_mig_cleanup(void)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +
>> +    while ((dbms = QSIMPLEQ_FIRST(&dirty_bitmap_mig_state.dbms_list)) != NULL) {
>> +        QSIMPLEQ_REMOVE_HEAD(&dirty_bitmap_mig_state.dbms_list, entry);
>> +        g_free(dbms);
>> +    }
>> +}
>> +
>> +/* for SaveVMHandlers */
>> +static void dirty_bitmap_migration_cleanup(void *opaque)
>> +{
>> +    dirty_bitmap_mig_cleanup();
>> +}
>> +
>> +static int dirty_bitmap_save_iterate(QEMUFile *f, void *opaque)
>> +{
>> +    trace_dirty_bitmap_save_iterate(
>> +            migration_in_postcopy(migrate_get_current()));
>> +
>> +    if (migration_in_postcopy(migrate_get_current()) &&
>> +        !dirty_bitmap_mig_state.bulk_completed) {
>> +        bulk_phase(f, true);
>> +    }
>> +
>> +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>> +
>> +    return dirty_bitmap_mig_state.bulk_completed;
>> +}
>> +
>> +/* Called with iothread lock taken.  */
>> +
>> +static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +    trace_dirty_bitmap_save_complete_enter();
>> +
>> +    if (!dirty_bitmap_mig_state.bulk_completed) {
>> +        bulk_phase(f, false);
>> +    }
>> +
>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>> +        send_bitmap_complete(f, dbms);
>> +    }
>> +
>> +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>> +
>> +    trace_dirty_bitmap_save_complete_finish();
>> +
>> +    dirty_bitmap_mig_cleanup();
>> +    return 0;
>> +}
>> +
>> +static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
>> +                                      uint64_t max_size,
>> +                                      uint64_t *res_precopy_only,
>> +                                      uint64_t *res_compatible,
>> +                                      uint64_t *res_postcopy_only)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +    uint64_t pending = 0;
>> +
>> +    qemu_mutex_lock_iothread();
>> +
>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>> +        uint64_t gran = bdrv_dirty_bitmap_granularity(dbms->bitmap);
>> +        uint64_t sectors = dbms->bulk_completed ? 0 :
>> +                           dbms->total_sectors - dbms->cur_sector;
>> +
>> +        pending += (sectors * BDRV_SECTOR_SIZE + gran - 1) / gran;
>> +    }
>> +
>> +    qemu_mutex_unlock_iothread();
>> +
>> +    trace_dirty_bitmap_save_pending(pending, max_size);
>> +
>> +    *res_postcopy_only += pending;
>> +}
>> +
>> +/* First occurrence of this bitmap. It should be created if doesn't exist */
>> +static int dirty_bitmap_load_start(QEMUFile *f, DirtyBitmapLoadState *s)
>> +{
>> +    Error *local_err = NULL;
>> +    uint32_t granularity = qemu_get_be32(f);
>> +    bool enabled = qemu_get_byte(f);
>> +
>> +    if (!s->bitmap) {
>> +        s->bitmap = bdrv_create_dirty_bitmap(s->bs, granularity,
>> +                                             s->bitmap_name, &local_err);
>> +        if (!s->bitmap) {
>> +            error_report_err(local_err);
>> +            return -EINVAL;
>> +        }
>> +    } else {
>> +        uint32_t dest_granularity =
>> +            bdrv_dirty_bitmap_granularity(s->bitmap);
>> +        if (dest_granularity != granularity) {
>> +            fprintf(stderr,
>> +                    "Error: "
>> +                    "Migrated bitmap granularity (%" PRIu32 ") "
>> +                    "doesn't match the destination bitmap '%s' "
>> +                    "granularity (%" PRIu32 ")\n",
>> +                    granularity,
>> +                    bdrv_dirty_bitmap_name(s->bitmap),
>> +                    dest_granularity);
>> +            return -EINVAL;
>> +        }
>> +    }
>> +
>> +    bdrv_disable_dirty_bitmap(s->bitmap);
>> +    if (enabled) {
>> +        DirtyBitmapLoadBitmapState *b;
>> +
>> +        bdrv_dirty_bitmap_create_successor(s->bs, s->bitmap, &local_err);
>> +        if (local_err) {
>> +            error_report_err(local_err);
>> +            return -EINVAL;
>> +        }
>> +
>> +        b = g_new(DirtyBitmapLoadBitmapState, 1);
>> +        b->bs = s->bs;
>> +        b->bitmap = s->bitmap;
>> +        b->migrated = false;
>> +        enabled_bitmaps = g_slist_prepend(enabled_bitmaps, b);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +void dirty_bitmap_mig_before_vm_start(void)
>> +{
>> +    GSList *item;
>> +
>> +    qemu_mutex_lock(&finish_lock);
>> +
>> +    for (item = enabled_bitmaps; item; item = g_slist_next(item)) {
>> +        DirtyBitmapLoadBitmapState *b = item->data;
>> +
>> +        if (b->migrated) {
>> +            bdrv_enable_dirty_bitmap(b->bitmap);
>> +        } else {
>> +            bdrv_dirty_bitmap_enable_successor(b->bitmap);
>> +        }
>> +
>> +        g_free(b);
>> +    }
>> +
>> +    g_slist_free(enabled_bitmaps);
>> +    enabled_bitmaps = NULL;
>> +
>> +    qemu_mutex_unlock(&finish_lock);
>> +}
>> +
>> +static void dirty_bitmap_load_complete(QEMUFile *f, DirtyBitmapLoadState *s)
>> +{
>> +    GSList *item;
>> +    trace_dirty_bitmap_load_complete();
>> +    bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
>> +
>> +    qemu_mutex_lock(&finish_lock);
>> +
>> +    for (item = enabled_bitmaps; item; item = g_slist_next(item)) {
>> +        DirtyBitmapLoadBitmapState *b = item->data;
>> +
>> +        if (b->bitmap == s->bitmap) {
>> +            b->migrated = true;
>> +        }
>> +    }
>> +
>> +    if (bdrv_dirty_bitmap_frozen(s->bitmap)) {
>> +        if (enabled_bitmaps == NULL) {
>> +            /* in postcopy */
>> +            AioContext *aio_context = bdrv_get_aio_context(s->bs);
>> +            aio_context_acquire(aio_context);
>> +
>> +            bdrv_reclaim_dirty_bitmap(s->bs, s->bitmap, &error_abort);
>> +            bdrv_enable_dirty_bitmap(s->bitmap);
>> +
>> +            aio_context_release(aio_context);
>> +        } else {
>> +            /* target not started, successor is empty */
>> +            bdrv_dirty_bitmap_release_successor(s->bs, s->bitmap);
>> +        }
>> +    }
>> +
>> +    qemu_mutex_unlock(&finish_lock);
>> +}
>> +
>> +static int dirty_bitmap_load_bits(QEMUFile *f, DirtyBitmapLoadState *s)
>> +{
>> +    uint64_t first_sector = qemu_get_be64(f);
>> +    uint32_t nr_sectors = qemu_get_be32(f);
>> +    trace_dirty_bitmap_load_bits_enter(first_sector, nr_sectors);
>> +
>> +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
>> +        trace_dirty_bitmap_load_bits_zeroes();
>> +        bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_sector,
>> +                                             nr_sectors, false);
>> +    } else {
>> +        uint8_t *buf;
>> +        uint64_t buf_size = qemu_get_be64(f);
>> +        uint64_t needed_size =
>> +            bdrv_dirty_bitmap_serialization_size(s->bitmap,
>> +                                                 first_sector, nr_sectors);
>> +
>> +        if (needed_size > buf_size) {
>> +            fprintf(stderr,
>> +                    "Error: Migrated bitmap granularity doesn't "
>> +                    "match the destination bitmap '%s' granularity\n",
>> +                    bdrv_dirty_bitmap_name(s->bitmap));
>> +            return -EINVAL;
>> +        }
>> +
>> +        buf = g_malloc(buf_size);
>> +        qemu_get_buffer(f, buf, buf_size);
>> +        bdrv_dirty_bitmap_deserialize_part(s->bitmap, buf,
>> +                                           first_sector,
>> +                                           nr_sectors, false);
>> +        g_free(buf);
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int dirty_bitmap_load_header(QEMUFile *f, DirtyBitmapLoadState *s)
>> +{
>> +    Error *local_err = NULL;
>> +    s->flags = qemu_get_bitmap_flags(f);
>> +    trace_dirty_bitmap_load_header(s->flags);
>> +
>> +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
>> +        if (!qemu_get_counted_string(f, s->node_name)) {
>> +            fprintf(stderr, "Unable to read node name string\n");
>> +            return -EINVAL;
>> +        }
>> +        s->bs = bdrv_lookup_bs(s->node_name, s->node_name, &local_err);
>> +        if (!s->bs) {
>> +            error_report("%s", error_get_pretty(local_err));
>> +            error_free(local_err);
>> +            return -EINVAL;
>> +        }
>> +    } else if (!s->bs) {
>> +        fprintf(stderr, "Error: block device name is not set\n");
>> +        return -EINVAL;
>> +    }
>> +
>> +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
>> +        if (!qemu_get_counted_string(f, s->bitmap_name)) {
>> +            fprintf(stderr, "Unable to read node name string\n");
>> +            return -EINVAL;
>> +        }
>> +        s->bitmap = bdrv_find_dirty_bitmap(s->bs, s->bitmap_name);
>> +
>> +        /* bitmap may be NULL here, it wouldn't be an error if it is the
>> +         * first occurrence of the bitmap */
>> +        if (!s->bitmap && !(s->flags & DIRTY_BITMAP_MIG_FLAG_START)) {
>> +            fprintf(stderr, "Error: unknown dirty bitmap "
>> +                    "'%s' for block device '%s'\n",
>> +                    s->bitmap_name, s->node_name);
>> +            return -EINVAL;
>> +        }
>> +    } else if (!s->bitmap) {
>> +        fprintf(stderr, "Error: block device name is not set\n");
>> +        return -EINVAL;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>> +static int dirty_bitmap_load(QEMUFile *f, void *opaque, int version_id)
>> +{
>> +    static DirtyBitmapLoadState s;
>> +
>> +    int ret = 0;
>> +
>> +    trace_dirty_bitmap_load_enter();
>> +
>> +    do {
>> +        dirty_bitmap_load_header(f, &s);
>> +
>> +        if (s.flags & DIRTY_BITMAP_MIG_FLAG_START) {
>> +            ret = dirty_bitmap_load_start(f, &s);
>> +        } else if (s.flags & DIRTY_BITMAP_MIG_FLAG_COMPLETE) {
>> +            dirty_bitmap_load_complete(f, &s);
>> +        } else if (s.flags & DIRTY_BITMAP_MIG_FLAG_BITS) {
>> +            ret = dirty_bitmap_load_bits(f, &s);
>> +        }
>> +
>> +        if (!ret) {
>> +            ret = qemu_file_get_error(f);
>> +        }
>> +
>> +        if (ret) {
>> +            return ret;
>> +        }
>> +    } while (!(s.flags & DIRTY_BITMAP_MIG_FLAG_EOS));
>> +
>> +    trace_dirty_bitmap_load_success();
>> +    return 0;
>> +}
>> +
>> +static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms = NULL;
>> +    init_dirty_bitmap_migration();
>> +
>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>> +        send_bitmap_start(f, dbms);
>> +    }
>> +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>> +
>> +    return 0;
>> +}
>> +
>> +static bool dirty_bitmap_is_active(void *opaque)
>> +{
>> +    return migrate_dirty_bitmaps();
>> +}
>> +
>> +static bool dirty_bitmap_is_active_iterate(void *opaque)
>> +{
>> +    return dirty_bitmap_is_active(opaque) && !runstate_is_running();
>> +}
>> +
>> +static bool dirty_bitmap_has_postcopy(void *opaque)
>> +{
>> +    return true;
>> +}
>> +
>> +static SaveVMHandlers savevm_dirty_bitmap_handlers = {
>> +    .save_live_setup = dirty_bitmap_save_setup,
>> +    .save_live_complete_postcopy = dirty_bitmap_save_complete,
>> +    .save_live_complete_precopy = dirty_bitmap_save_complete,
>> +    .has_postcopy = dirty_bitmap_has_postcopy,
>> +    .save_live_pending = dirty_bitmap_save_pending,
>> +    .save_live_iterate = dirty_bitmap_save_iterate,
>> +    .is_active_iterate = dirty_bitmap_is_active_iterate,
>> +    .load_state = dirty_bitmap_load,
>> +    .cleanup = dirty_bitmap_migration_cleanup,
>> +    .is_active = dirty_bitmap_is_active,
>> +};
>> +
>> +void dirty_bitmap_mig_init(void)
>> +{
>> +    QSIMPLEQ_INIT(&dirty_bitmap_mig_state.dbms_list);
>> +
>> +    register_savevm_live(NULL, "dirty-bitmap", 0, 1,
>> +                         &savevm_dirty_bitmap_handlers,
>> +                         &dirty_bitmap_mig_state);
>> +}
>> diff --git a/migration/migration.c b/migration/migration.c
>> index 179bd04..edf5623 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -122,6 +122,9 @@ MigrationIncomingState *migration_incoming_get_current(void)
>>           QLIST_INIT(&mis_current.loadvm_handlers);
>>           qemu_mutex_init(&mis_current.rp_mutex);
>>           qemu_event_init(&mis_current.main_thread_load_event, false);
>> +
>> +        init_dirty_bitmap_incoming_migration();
>> +
>>           once = true;
>>       }
>>       return &mis_current;
>> diff --git a/migration/savevm.c b/migration/savevm.c
>> index 54572ef..927a680 100644
>> --- a/migration/savevm.c
>> +++ b/migration/savevm.c
>> @@ -1651,6 +1651,8 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)
>>   
>>       trace_loadvm_postcopy_handle_run_vmstart();
>>   
>> +    dirty_bitmap_mig_before_vm_start();
>> +
>>       if (autostart) {
>>           /* Hold onto your hats, starting the CPU */
>>           vm_start();
>> diff --git a/migration/trace-events b/migration/trace-events
>> index 0212929..38fca41 100644
>> --- a/migration/trace-events
>> +++ b/migration/trace-events
>> @@ -222,3 +222,17 @@ colo_vm_state_change(const char *old, const char *new) "Change '%s' => '%s'"
>>   colo_send_message(const char *msg) "Send '%s' message"
>>   colo_receive_message(const char *msg) "Receive '%s' message"
>>   colo_failover_set_state(const char *new_state) "new state %s"
>> +
>> +# migration/block-dirty-bitmap.c
>> +send_bitmap_header_enter(void) ""
>> +send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uint64_t data_size) "\n   flags:        %x\n   start_sector: %" PRIu64 "\n   nr_sectors:   %" PRIu32 "\n   data_size:    %" PRIu64 "\n"
>> +dirty_bitmap_save_iterate(int in_postcopy) "in postcopy: %d"
>> +dirty_bitmap_save_complete_enter(void) ""
>> +dirty_bitmap_save_complete_finish(void) ""
>> +dirty_bitmap_save_pending(uint64_t pending, uint64_t max_size) "pending %" PRIu64 " max: %" PRIu64
>> +dirty_bitmap_load_complete(void) ""
>> +dirty_bitmap_load_bits_enter(uint64_t first_sector, uint32_t nr_sectors) "chunk: %" PRIu64 " %" PRIu32
>> +dirty_bitmap_load_bits_zeroes(void) ""
>> +dirty_bitmap_load_header(uint32_t flags) "flags %x"
>> +dirty_bitmap_load_enter(void) ""
>> +dirty_bitmap_load_success(void) ""
>> diff --git a/vl.c b/vl.c
>> index b4eaf03..f1ee9ff 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -4405,6 +4405,7 @@ int main(int argc, char **argv, char **envp)
>>   
>>       blk_mig_init();
>>       ram_mig_init();
>> +    dirty_bitmap_mig_init();
>>   
>>       /* If the currently selected machine wishes to override the units-per-bus
>>        * property of its default HBA interface type, do so now. */
>> -- 
>> 1.8.3.1
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Vladimir Sementsov-Ogievskiy Feb. 25, 2017, 5:56 p.m. UTC | #4
16.02.2017 16:04, Fam Zheng wrote:
> On Mon, 02/13 12:54, Vladimir Sementsov-Ogievskiy wrote:
>> Postcopy migration of dirty bitmaps. Only named dirty bitmaps,
>> associated with root nodes and non-root named nodes are migrated.
>>
>> If destination qemu is already containing a dirty bitmap with the same name
>> as a migrated bitmap (for the same node), than, if their granularities are

[...]

>> +
>> +#define CHUNK_SIZE     (1 << 10)
>> +
>> +/* Flags occupy from one to four bytes. In all but one the 7-th (EXTRA_FLAGS)
>> + * bit should be set. */
>> +#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
>> +#define DIRTY_BITMAP_MIG_FLAG_ZEROES        0x02
>> +#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x04
>> +#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x08
>> +#define DIRTY_BITMAP_MIG_FLAG_START         0x10
>> +#define DIRTY_BITMAP_MIG_FLAG_COMPLETE      0x20
>> +#define DIRTY_BITMAP_MIG_FLAG_BITS          0x40
>> +
>> +#define DIRTY_BITMAP_MIG_EXTRA_FLAGS        0x80
>> +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_16      0x8000
> This flag means two bytes, right? But your above comment says "7-th bit should
> be set". This doesn't make sense. Should this be "0x80" too?

Hmm, good caught, you are right. Also, the comment should be fixed so 
that there are may be 1,2 or 4 bytes, and of course EXTRA_FLAGS bit may 
be only in first and second bytes (the code do so).

>
>> +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_32      0x8080
>> +
>> +#define DEBUG_DIRTY_BITMAP_MIGRATION 0

[...]

> +
> +static void send_bitmap_bits(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
> +                             uint64_t start_sector, uint32_t nr_sectors)
> +{
> +    /* align for buffer_is_zero() */
> +    uint64_t align = 4 * sizeof(long);
> +    uint64_t unaligned_size =
> +        bdrv_dirty_bitmap_serialization_size(dbms->bitmap,
> +                                             start_sector, nr_sectors);
> +    uint64_t buf_size = (unaligned_size + align - 1) & ~(align - 1);
> +    uint8_t *buf = g_malloc0(buf_size);
> +    uint32_t flags = DIRTY_BITMAP_MIG_FLAG_BITS;
> +
> +    bdrv_dirty_bitmap_serialize_part(dbms->bitmap, buf,
> +                                     start_sector, nr_sectors);
> While these bdrv_dirty_bitmap_* calls here seem fine, BdrvDirtyBitmap API is not
> in general thread-safe, while this function is called without any lock. This
> feels dangerous, as noted below, I'm most concerned about use-after-free.

This should be safe as it is a postcopy migration - source should be 
already inactive.

>
>> +
>> +    if (buffer_is_zero(buf, buf_size)) {
>> +        g_free(buf);
>> +        buf = NULL;
>> +        flags |= DIRTY_BITMAP_MIG_FLAG_ZEROES;
>> +    }
>> +
>> +    trace_send_bitmap_bits(flags, start_sector, nr_sectors, buf_size);
>> +
>> +    send_bitmap_header(f, dbms, flags);
>> +
>> +    qemu_put_be64(f, start_sector);
>> +    qemu_put_be32(f, nr_sectors);
>> +
>> +    /* if a block is zero we need to flush here since the network
>> +     * bandwidth is now a lot higher than the storage device bandwidth.
>> +     * thus if we queue zero blocks we slow down the migration. */
>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
>> +        qemu_fflush(f);
>> +    } else {
>> +        qemu_put_be64(f, buf_size);
>> +        qemu_put_buffer(f, buf, buf_size);
>> +    }
>> +
>> +    g_free(buf);
>> +}
>> +
>> +

[...]

>> +
>> +static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
>> +                                      uint64_t max_size,
>> +                                      uint64_t *res_precopy_only,
>> +                                      uint64_t *res_compatible,
>> +                                      uint64_t *res_postcopy_only)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +    uint64_t pending = 0;
>> +
>> +    qemu_mutex_lock_iothread();
> Why do you need the BQL here but not in bulk_phase()?

bulk_phase is in postcopy, source is inactive
Dr. David Alan Gilbert Feb. 27, 2017, 8:12 p.m. UTC | #5
* Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> 24.02.2017 16:26, Dr. David Alan Gilbert wrote:
> > * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> > > Postcopy migration of dirty bitmaps. Only named dirty bitmaps,
> > > associated with root nodes and non-root named nodes are migrated.
> > > 
> > > If destination qemu is already containing a dirty bitmap with the same name
> > > as a migrated bitmap (for the same node), than, if their granularities are
> > > the same the migration will be done, otherwise the error will be generated.
> > > 
> > > If destination qemu doesn't contain such bitmap it will be created.
> > > 
> > > Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> > > ---
> > >   include/migration/block.h      |   1 +
> > >   include/migration/migration.h  |   4 +
> > >   migration/Makefile.objs        |   2 +-
> > >   migration/block-dirty-bitmap.c | 679 +++++++++++++++++++++++++++++++++++++++++
> > >   migration/migration.c          |   3 +
> > >   migration/savevm.c             |   2 +
> > >   migration/trace-events         |  14 +
> > >   vl.c                           |   1 +
> > >   8 files changed, 705 insertions(+), 1 deletion(-)
> > >   create mode 100644 migration/block-dirty-bitmap.c
> > > 
> > > diff --git a/include/migration/block.h b/include/migration/block.h
> > > index 41a1ac8..8333c43 100644
> > > --- a/include/migration/block.h
> > > +++ b/include/migration/block.h
> > > @@ -14,6 +14,7 @@
> > >   #ifndef MIGRATION_BLOCK_H
> > >   #define MIGRATION_BLOCK_H
> > > +void dirty_bitmap_mig_init(void);
> > >   void blk_mig_init(void);
> > >   int blk_mig_active(void);
> > >   uint64_t blk_mig_bytes_transferred(void);
> > > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > > index 46645f4..03a4993 100644
> > > --- a/include/migration/migration.h
> > > +++ b/include/migration/migration.h
> > > @@ -371,4 +371,8 @@ int ram_save_queue_pages(MigrationState *ms, const char *rbname,
> > >   PostcopyState postcopy_state_get(void);
> > >   /* Set the state and return the old state */
> > >   PostcopyState postcopy_state_set(PostcopyState new_state);
> > > +
> > > +void dirty_bitmap_mig_before_vm_start(void);
> > > +void init_dirty_bitmap_incoming_migration(void);
> > > +
> > >   #endif
> > > diff --git a/migration/Makefile.objs b/migration/Makefile.objs
> > > index 480dd49..fa3bf6a 100644
> > > --- a/migration/Makefile.objs
> > > +++ b/migration/Makefile.objs
> > > @@ -9,5 +9,5 @@ common-obj-y += qjson.o
> > >   common-obj-$(CONFIG_RDMA) += rdma.o
> > > -common-obj-y += block.o
> > > +common-obj-y += block.o block-dirty-bitmap.o
> > > diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
> > > new file mode 100644
> > > index 0000000..28e3732
> > > --- /dev/null
> > > +++ b/migration/block-dirty-bitmap.c
> > > @@ -0,0 +1,679 @@
> > > +/*
> > > + * Block dirty bitmap postcopy migration
> > > + *
> > > + * Copyright IBM, Corp. 2009
> > > + * Copyright (C) 2016 Parallels IP Holdings GmbH. All rights reserved.
> > > + *
> > > + * Authors:
> > > + *  Liran Schour   <lirans@il.ibm.com>
> > > + *  Vladimir Sementsov-Ogievskiy <vsementsov@parallels.com>
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2.  See
> > > + * the COPYING file in the top-level directory.
> > > + * This file is derived from migration/block.c, so it's author and IBM copyright
> > > + * are here, although content is quite different.
> > > + *
> > > + * Contributions after 2012-01-13 are licensed under the terms of the
> > > + * GNU GPL, version 2 or (at your option) any later version.
> > > + *
> > > + *                                ***
> > > + *
> > > + * Here postcopy migration of dirty bitmaps is realized. Only named dirty
> > > + * bitmaps, associated with root nodes and non-root named nodes are migrated.
> > > + *
> > > + * If destination qemu is already containing a dirty bitmap with the same name
> > > + * as a migrated bitmap (for the same node), then, if their granularities are
> > > + * the same the migration will be done, otherwise the error will be generated.
> > > + *
> > > + * If destination qemu doesn't contain such bitmap it will be created.
> > > + *
> > > + * format of migration:
> > > + *
> > > + * # Header (shared for different chunk types)
> > > + * 1, 2 or 4 bytes: flags (see qemu_{put,put}_flags)
> > > + * [ 1 byte: node name size ] \  flags & DEVICE_NAME
> > > + * [ n bytes: node name     ] /
> > > + * [ 1 byte: bitmap name size ] \  flags & BITMAP_NAME
> > > + * [ n bytes: bitmap name     ] /
> > > + *
> > > + * # Start of bitmap migration (flags & START)
> > > + * header
> > > + * be64: granularity
> > > + * 1 byte: bitmap enabled flag
> > > + *
> > > + * # Complete of bitmap migration (flags & COMPLETE)
> > > + * header
> > > + *
> > > + * # Data chunk of bitmap migration
> > > + * header
> > > + * be64: start sector
> > > + * be32: number of sectors
> > > + * [ be64: buffer size  ] \ ! (flags & ZEROES)
> > > + * [ n bytes: buffer    ] /
> > > + *
> > > + * The last chunk in stream should contain flags & EOS. The chunk may skip
> > > + * device and/or bitmap names, assuming them to be the same with the previous
> > > + * chunk.
> > > + */
> > > +
> > > +#include "qemu/osdep.h"
> > > +#include "block/block.h"
> > > +#include "block/block_int.h"
> > > +#include "sysemu/block-backend.h"
> > > +#include "qemu/main-loop.h"
> > > +#include "qemu/error-report.h"
> > > +#include "migration/block.h"
> > > +#include "migration/migration.h"
> > > +#include "qemu/hbitmap.h"
> > > +#include "sysemu/sysemu.h"
> > > +#include "qemu/cutils.h"
> > > +#include "qapi/error.h"
> > > +#include "trace.h"
> > > +#include <assert.h>
> > > +
> > > +#define CHUNK_SIZE     (1 << 10)
> > > +
> > > +/* Flags occupy from one to four bytes. In all but one the 7-th (EXTRA_FLAGS)
> > > + * bit should be set. */
> > > +#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
> > > +#define DIRTY_BITMAP_MIG_FLAG_ZEROES        0x02
> > > +#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x04
> > > +#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x08
> > > +#define DIRTY_BITMAP_MIG_FLAG_START         0x10
> > > +#define DIRTY_BITMAP_MIG_FLAG_COMPLETE      0x20
> > > +#define DIRTY_BITMAP_MIG_FLAG_BITS          0x40
> > > +
> > > +#define DIRTY_BITMAP_MIG_EXTRA_FLAGS        0x80
> > > +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_16      0x8000
> > > +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_32      0x8080
> > > +
> > > +#define DEBUG_DIRTY_BITMAP_MIGRATION 0
> > > +
> > > +typedef struct DirtyBitmapMigBitmapState {
> > > +    /* Written during setup phase. */
> > > +    BlockDriverState *bs;
> > > +    const char *node_name;
> > > +    BdrvDirtyBitmap *bitmap;
> > > +    uint64_t total_sectors;
> > > +    uint64_t sectors_per_chunk;
> > > +    QSIMPLEQ_ENTRY(DirtyBitmapMigBitmapState) entry;
> > > +
> > > +    /* For bulk phase. */
> > > +    bool bulk_completed;
> > > +    uint64_t cur_sector;
> > > +} DirtyBitmapMigBitmapState;
> > > +
> > > +typedef struct DirtyBitmapMigState {
> > > +    QSIMPLEQ_HEAD(dbms_list, DirtyBitmapMigBitmapState) dbms_list;
> > > +
> > > +    bool bulk_completed;
> > > +
> > > +    /* for send_bitmap_bits() */
> > > +    BlockDriverState *prev_bs;
> > > +    BdrvDirtyBitmap *prev_bitmap;
> > > +} DirtyBitmapMigState;
> > > +
> > > +typedef struct DirtyBitmapLoadState {
> > > +    uint32_t flags;
> > > +    char node_name[256];
> > > +    char bitmap_name[256];
> > > +    BlockDriverState *bs;
> > > +    BdrvDirtyBitmap *bitmap;
> > > +} DirtyBitmapLoadState;
> > > +
> > > +static DirtyBitmapMigState dirty_bitmap_mig_state;
> > > +
> > > +typedef struct DirtyBitmapLoadBitmapState {
> > > +    BlockDriverState *bs;
> > > +    BdrvDirtyBitmap *bitmap;
> > > +    bool migrated;
> > > +} DirtyBitmapLoadBitmapState;
> > > +static GSList *enabled_bitmaps;
> > > +QemuMutex finish_lock;
> > > +
> > > +void init_dirty_bitmap_incoming_migration(void)
> > > +{
> > > +    qemu_mutex_init(&finish_lock);
> > > +}
> > > +
> > > +static uint32_t qemu_get_bitmap_flags(QEMUFile *f)
> > > +{
> > > +    uint8_t flags = qemu_get_byte(f);
> > > +    if (flags & DIRTY_BITMAP_MIG_EXTRA_FLAGS) {
> > > +        flags = flags << 8 | qemu_get_byte(f);
> > > +        if (flags & DIRTY_BITMAP_MIG_EXTRA_FLAGS) {
> > > +            flags = flags << 16 | qemu_get_be16(f);
> > > +        }
> > > +    }
> > > +
> > > +    return flags;
> > > +}
> > > +
> > > +static void qemu_put_bitmap_flags(QEMUFile *f, uint32_t flags)
> > > +{
> > > +    if (!(flags & 0xffffff00)) {
> > > +        qemu_put_byte(f, flags);
> > > +        return;
> > > +    }
> > > +
> > > +    if (!(flags & 0xffff0000)) {
> > > +        qemu_put_be16(f, flags | DIRTY_BITMAP_MIG_FLAGS_SIZE_16);
> > > +        return;
> > > +    }
> > > +
> > > +    qemu_put_be32(f, flags | DIRTY_BITMAP_MIG_FLAGS_SIZE_32);
> > Is this stuff worth the hastle? It would seem easier just to send
> > the 32bits each time;  the overhead doesn't seem to be much except
> > in the case where you send minimum size chunks a lot.
> 
> I've started with one byte of flags (I don't need more) and I've implemented
> this after proposal by John Snow:
> > I might recommend reserving the last bit of the second byte to be a flag
> > such as DIRTY_BITMAP_EXTRA_FLAGS that indicates the presence of
> > additional byte(s) of flags, to be determined later, if we ever need
> > them, but two bytes for now should be sufficient.
> 
> And I think it is not bad. Code is a bit more complex, yes, but why should
> we send 4 bytes with every chunk when (very likely) we will never use more
> than one?

Yes if it's very rare and a useful % of the size then that makes sense; just
don't worry about squeezing every byte out of it if it's a tiny %.
And yes, keeping one flag set aside for 'and there's more' is good.

Dave

> > 
> > Dave
> > 
> > > +}
> > > +
> > > +static void send_bitmap_header(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
> > > +                               uint32_t additional_flags)
> > > +{
> > > +    BlockDriverState *bs = dbms->bs;
> > > +    BdrvDirtyBitmap *bitmap = dbms->bitmap;
> > > +    uint32_t flags = additional_flags;
> > > +    trace_send_bitmap_header_enter();
> > > +
> > > +    if (bs != dirty_bitmap_mig_state.prev_bs) {
> > > +        dirty_bitmap_mig_state.prev_bs = bs;
> > > +        flags |= DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME;
> > > +    }
> > > +
> > > +    if (bitmap != dirty_bitmap_mig_state.prev_bitmap) {
> > > +        dirty_bitmap_mig_state.prev_bitmap = bitmap;
> > > +        flags |= DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME;
> > > +    }
> > > +
> > > +    qemu_put_bitmap_flags(f, flags);
> > > +
> > > +    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
> > > +        qemu_put_counted_string(f, dbms->node_name);
> > > +    }
> > > +
> > > +    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
> > > +        qemu_put_counted_string(f, bdrv_dirty_bitmap_name(bitmap));
> > > +    }
> > > +}
> > > +
> > > +static void send_bitmap_start(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
> > > +{
> > > +    send_bitmap_header(f, dbms, DIRTY_BITMAP_MIG_FLAG_START);
> > > +    qemu_put_be32(f, bdrv_dirty_bitmap_granularity(dbms->bitmap));
> > > +    qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
> > > +}
> > > +
> > > +static void send_bitmap_complete(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
> > > +{
> > > +    send_bitmap_header(f, dbms, DIRTY_BITMAP_MIG_FLAG_COMPLETE);
> > > +}
> > > +
> > > +static void send_bitmap_bits(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
> > > +                             uint64_t start_sector, uint32_t nr_sectors)
> > > +{
> > > +    /* align for buffer_is_zero() */
> > > +    uint64_t align = 4 * sizeof(long);
> > > +    uint64_t unaligned_size =
> > > +        bdrv_dirty_bitmap_serialization_size(dbms->bitmap,
> > > +                                             start_sector, nr_sectors);
> > > +    uint64_t buf_size = (unaligned_size + align - 1) & ~(align - 1);
> > > +    uint8_t *buf = g_malloc0(buf_size);
> > > +    uint32_t flags = DIRTY_BITMAP_MIG_FLAG_BITS;
> > > +
> > > +    bdrv_dirty_bitmap_serialize_part(dbms->bitmap, buf,
> > > +                                     start_sector, nr_sectors);
> > > +
> > > +    if (buffer_is_zero(buf, buf_size)) {
> > > +        g_free(buf);
> > > +        buf = NULL;
> > > +        flags |= DIRTY_BITMAP_MIG_FLAG_ZEROES;
> > > +    }
> > > +
> > > +    trace_send_bitmap_bits(flags, start_sector, nr_sectors, buf_size);
> > > +
> > > +    send_bitmap_header(f, dbms, flags);
> > > +
> > > +    qemu_put_be64(f, start_sector);
> > > +    qemu_put_be32(f, nr_sectors);
> > > +
> > > +    /* if a block is zero we need to flush here since the network
> > > +     * bandwidth is now a lot higher than the storage device bandwidth.
> > > +     * thus if we queue zero blocks we slow down the migration. */
> > > +    if (flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
> > > +        qemu_fflush(f);
> > > +    } else {
> > > +        qemu_put_be64(f, buf_size);
> > > +        qemu_put_buffer(f, buf, buf_size);
> > > +    }
> > > +
> > > +    g_free(buf);
> > > +}
> > > +
> > > +
> > > +/* Called with iothread lock taken.  */
> > > +
> > > +static void init_dirty_bitmap_migration(void)
> > > +{
> > > +    BlockDriverState *bs;
> > > +    BdrvDirtyBitmap *bitmap;
> > > +    DirtyBitmapMigBitmapState *dbms;
> > > +    BdrvNextIterator it;
> > > +    uint64_t total_bytes = 0;
> > > +
> > > +    dirty_bitmap_mig_state.bulk_completed = false;
> > > +    dirty_bitmap_mig_state.prev_bs = NULL;
> > > +    dirty_bitmap_mig_state.prev_bitmap = NULL;
> > > +
> > > +    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
> > > +        for (bitmap = bdrv_next_dirty_bitmap(bs, NULL); bitmap;
> > > +             bitmap = bdrv_next_dirty_bitmap(bs, bitmap)) {
> > > +            if (!bdrv_dirty_bitmap_name(bitmap)) {
> > > +                continue;
> > > +            }
> > > +
> > > +            if (!bdrv_get_device_or_node_name(bs)) {
> > > +                /* not named non-root node */
> > > +                continue;
> > > +            }
> > > +
> > > +            dbms = g_new0(DirtyBitmapMigBitmapState, 1);
> > > +            dbms->bs = bs;
> > > +            dbms->node_name = bdrv_get_node_name(bs);
> > > +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
> > > +                dbms->node_name = bdrv_get_device_name(bs);
> > > +            }
> > > +            dbms->bitmap = bitmap;
> > > +            dbms->total_sectors = bdrv_nb_sectors(bs);
> > > +            dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
> > > +                bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS;
> > > +
> > > +            total_bytes +=
> > > +                bdrv_dirty_bitmap_serialization_size(bitmap,
> > > +                                                     0, dbms->total_sectors);
> > > +
> > > +            QSIMPLEQ_INSERT_TAIL(&dirty_bitmap_mig_state.dbms_list,
> > > +                                 dbms, entry);
> > > +        }
> > > +    }
> > > +}
> > > +
> > > +/* Called with no lock taken.  */
> > > +static void bulk_phase_send_chunk(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
> > > +{
> > > +    uint32_t nr_sectors = MIN(dbms->total_sectors - dbms->cur_sector,
> > > +                             dbms->sectors_per_chunk);
> > > +
> > > +    send_bitmap_bits(f, dbms, dbms->cur_sector, nr_sectors);
> > > +
> > > +    dbms->cur_sector += nr_sectors;
> > > +    if (dbms->cur_sector >= dbms->total_sectors) {
> > > +        dbms->bulk_completed = true;
> > > +    }
> > > +}
> > > +
> > > +/* Called with no lock taken.  */
> > > +static void bulk_phase(QEMUFile *f, bool limit)
> > > +{
> > > +    DirtyBitmapMigBitmapState *dbms;
> > > +
> > > +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> > > +        while (!dbms->bulk_completed) {
> > > +            bulk_phase_send_chunk(f, dbms);
> > > +            if (limit && qemu_file_rate_limit(f)) {
> > > +                return;
> > > +            }
> > > +        }
> > > +    }
> > > +
> > > +    dirty_bitmap_mig_state.bulk_completed = true;
> > > +}
> > > +
> > > +/* Called with iothread lock taken.  */
> > > +static void dirty_bitmap_mig_cleanup(void)
> > > +{
> > > +    DirtyBitmapMigBitmapState *dbms;
> > > +
> > > +    while ((dbms = QSIMPLEQ_FIRST(&dirty_bitmap_mig_state.dbms_list)) != NULL) {
> > > +        QSIMPLEQ_REMOVE_HEAD(&dirty_bitmap_mig_state.dbms_list, entry);
> > > +        g_free(dbms);
> > > +    }
> > > +}
> > > +
> > > +/* for SaveVMHandlers */
> > > +static void dirty_bitmap_migration_cleanup(void *opaque)
> > > +{
> > > +    dirty_bitmap_mig_cleanup();
> > > +}
> > > +
> > > +static int dirty_bitmap_save_iterate(QEMUFile *f, void *opaque)
> > > +{
> > > +    trace_dirty_bitmap_save_iterate(
> > > +            migration_in_postcopy(migrate_get_current()));
> > > +
> > > +    if (migration_in_postcopy(migrate_get_current()) &&
> > > +        !dirty_bitmap_mig_state.bulk_completed) {
> > > +        bulk_phase(f, true);
> > > +    }
> > > +
> > > +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> > > +
> > > +    return dirty_bitmap_mig_state.bulk_completed;
> > > +}
> > > +
> > > +/* Called with iothread lock taken.  */
> > > +
> > > +static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
> > > +{
> > > +    DirtyBitmapMigBitmapState *dbms;
> > > +    trace_dirty_bitmap_save_complete_enter();
> > > +
> > > +    if (!dirty_bitmap_mig_state.bulk_completed) {
> > > +        bulk_phase(f, false);
> > > +    }
> > > +
> > > +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> > > +        send_bitmap_complete(f, dbms);
> > > +    }
> > > +
> > > +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> > > +
> > > +    trace_dirty_bitmap_save_complete_finish();
> > > +
> > > +    dirty_bitmap_mig_cleanup();
> > > +    return 0;
> > > +}
> > > +
> > > +static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
> > > +                                      uint64_t max_size,
> > > +                                      uint64_t *res_precopy_only,
> > > +                                      uint64_t *res_compatible,
> > > +                                      uint64_t *res_postcopy_only)
> > > +{
> > > +    DirtyBitmapMigBitmapState *dbms;
> > > +    uint64_t pending = 0;
> > > +
> > > +    qemu_mutex_lock_iothread();
> > > +
> > > +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> > > +        uint64_t gran = bdrv_dirty_bitmap_granularity(dbms->bitmap);
> > > +        uint64_t sectors = dbms->bulk_completed ? 0 :
> > > +                           dbms->total_sectors - dbms->cur_sector;
> > > +
> > > +        pending += (sectors * BDRV_SECTOR_SIZE + gran - 1) / gran;
> > > +    }
> > > +
> > > +    qemu_mutex_unlock_iothread();
> > > +
> > > +    trace_dirty_bitmap_save_pending(pending, max_size);
> > > +
> > > +    *res_postcopy_only += pending;
> > > +}
> > > +
> > > +/* First occurrence of this bitmap. It should be created if doesn't exist */
> > > +static int dirty_bitmap_load_start(QEMUFile *f, DirtyBitmapLoadState *s)
> > > +{
> > > +    Error *local_err = NULL;
> > > +    uint32_t granularity = qemu_get_be32(f);
> > > +    bool enabled = qemu_get_byte(f);
> > > +
> > > +    if (!s->bitmap) {
> > > +        s->bitmap = bdrv_create_dirty_bitmap(s->bs, granularity,
> > > +                                             s->bitmap_name, &local_err);
> > > +        if (!s->bitmap) {
> > > +            error_report_err(local_err);
> > > +            return -EINVAL;
> > > +        }
> > > +    } else {
> > > +        uint32_t dest_granularity =
> > > +            bdrv_dirty_bitmap_granularity(s->bitmap);
> > > +        if (dest_granularity != granularity) {
> > > +            fprintf(stderr,
> > > +                    "Error: "
> > > +                    "Migrated bitmap granularity (%" PRIu32 ") "
> > > +                    "doesn't match the destination bitmap '%s' "
> > > +                    "granularity (%" PRIu32 ")\n",
> > > +                    granularity,
> > > +                    bdrv_dirty_bitmap_name(s->bitmap),
> > > +                    dest_granularity);
> > > +            return -EINVAL;
> > > +        }
> > > +    }
> > > +
> > > +    bdrv_disable_dirty_bitmap(s->bitmap);
> > > +    if (enabled) {
> > > +        DirtyBitmapLoadBitmapState *b;
> > > +
> > > +        bdrv_dirty_bitmap_create_successor(s->bs, s->bitmap, &local_err);
> > > +        if (local_err) {
> > > +            error_report_err(local_err);
> > > +            return -EINVAL;
> > > +        }
> > > +
> > > +        b = g_new(DirtyBitmapLoadBitmapState, 1);
> > > +        b->bs = s->bs;
> > > +        b->bitmap = s->bitmap;
> > > +        b->migrated = false;
> > > +        enabled_bitmaps = g_slist_prepend(enabled_bitmaps, b);
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +void dirty_bitmap_mig_before_vm_start(void)
> > > +{
> > > +    GSList *item;
> > > +
> > > +    qemu_mutex_lock(&finish_lock);
> > > +
> > > +    for (item = enabled_bitmaps; item; item = g_slist_next(item)) {
> > > +        DirtyBitmapLoadBitmapState *b = item->data;
> > > +
> > > +        if (b->migrated) {
> > > +            bdrv_enable_dirty_bitmap(b->bitmap);
> > > +        } else {
> > > +            bdrv_dirty_bitmap_enable_successor(b->bitmap);
> > > +        }
> > > +
> > > +        g_free(b);
> > > +    }
> > > +
> > > +    g_slist_free(enabled_bitmaps);
> > > +    enabled_bitmaps = NULL;
> > > +
> > > +    qemu_mutex_unlock(&finish_lock);
> > > +}
> > > +
> > > +static void dirty_bitmap_load_complete(QEMUFile *f, DirtyBitmapLoadState *s)
> > > +{
> > > +    GSList *item;
> > > +    trace_dirty_bitmap_load_complete();
> > > +    bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
> > > +
> > > +    qemu_mutex_lock(&finish_lock);
> > > +
> > > +    for (item = enabled_bitmaps; item; item = g_slist_next(item)) {
> > > +        DirtyBitmapLoadBitmapState *b = item->data;
> > > +
> > > +        if (b->bitmap == s->bitmap) {
> > > +            b->migrated = true;
> > > +        }
> > > +    }
> > > +
> > > +    if (bdrv_dirty_bitmap_frozen(s->bitmap)) {
> > > +        if (enabled_bitmaps == NULL) {
> > > +            /* in postcopy */
> > > +            AioContext *aio_context = bdrv_get_aio_context(s->bs);
> > > +            aio_context_acquire(aio_context);
> > > +
> > > +            bdrv_reclaim_dirty_bitmap(s->bs, s->bitmap, &error_abort);
> > > +            bdrv_enable_dirty_bitmap(s->bitmap);
> > > +
> > > +            aio_context_release(aio_context);
> > > +        } else {
> > > +            /* target not started, successor is empty */
> > > +            bdrv_dirty_bitmap_release_successor(s->bs, s->bitmap);
> > > +        }
> > > +    }
> > > +
> > > +    qemu_mutex_unlock(&finish_lock);
> > > +}
> > > +
> > > +static int dirty_bitmap_load_bits(QEMUFile *f, DirtyBitmapLoadState *s)
> > > +{
> > > +    uint64_t first_sector = qemu_get_be64(f);
> > > +    uint32_t nr_sectors = qemu_get_be32(f);
> > > +    trace_dirty_bitmap_load_bits_enter(first_sector, nr_sectors);
> > > +
> > > +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
> > > +        trace_dirty_bitmap_load_bits_zeroes();
> > > +        bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_sector,
> > > +                                             nr_sectors, false);
> > > +    } else {
> > > +        uint8_t *buf;
> > > +        uint64_t buf_size = qemu_get_be64(f);
> > > +        uint64_t needed_size =
> > > +            bdrv_dirty_bitmap_serialization_size(s->bitmap,
> > > +                                                 first_sector, nr_sectors);
> > > +
> > > +        if (needed_size > buf_size) {
> > > +            fprintf(stderr,
> > > +                    "Error: Migrated bitmap granularity doesn't "
> > > +                    "match the destination bitmap '%s' granularity\n",
> > > +                    bdrv_dirty_bitmap_name(s->bitmap));
> > > +            return -EINVAL;
> > > +        }
> > > +
> > > +        buf = g_malloc(buf_size);
> > > +        qemu_get_buffer(f, buf, buf_size);
> > > +        bdrv_dirty_bitmap_deserialize_part(s->bitmap, buf,
> > > +                                           first_sector,
> > > +                                           nr_sectors, false);
> > > +        g_free(buf);
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int dirty_bitmap_load_header(QEMUFile *f, DirtyBitmapLoadState *s)
> > > +{
> > > +    Error *local_err = NULL;
> > > +    s->flags = qemu_get_bitmap_flags(f);
> > > +    trace_dirty_bitmap_load_header(s->flags);
> > > +
> > > +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
> > > +        if (!qemu_get_counted_string(f, s->node_name)) {
> > > +            fprintf(stderr, "Unable to read node name string\n");
> > > +            return -EINVAL;
> > > +        }
> > > +        s->bs = bdrv_lookup_bs(s->node_name, s->node_name, &local_err);
> > > +        if (!s->bs) {
> > > +            error_report("%s", error_get_pretty(local_err));
> > > +            error_free(local_err);
> > > +            return -EINVAL;
> > > +        }
> > > +    } else if (!s->bs) {
> > > +        fprintf(stderr, "Error: block device name is not set\n");
> > > +        return -EINVAL;
> > > +    }
> > > +
> > > +    if (s->flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
> > > +        if (!qemu_get_counted_string(f, s->bitmap_name)) {
> > > +            fprintf(stderr, "Unable to read node name string\n");
> > > +            return -EINVAL;
> > > +        }
> > > +        s->bitmap = bdrv_find_dirty_bitmap(s->bs, s->bitmap_name);
> > > +
> > > +        /* bitmap may be NULL here, it wouldn't be an error if it is the
> > > +         * first occurrence of the bitmap */
> > > +        if (!s->bitmap && !(s->flags & DIRTY_BITMAP_MIG_FLAG_START)) {
> > > +            fprintf(stderr, "Error: unknown dirty bitmap "
> > > +                    "'%s' for block device '%s'\n",
> > > +                    s->bitmap_name, s->node_name);
> > > +            return -EINVAL;
> > > +        }
> > > +    } else if (!s->bitmap) {
> > > +        fprintf(stderr, "Error: block device name is not set\n");
> > > +        return -EINVAL;
> > > +    }
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static int dirty_bitmap_load(QEMUFile *f, void *opaque, int version_id)
> > > +{
> > > +    static DirtyBitmapLoadState s;
> > > +
> > > +    int ret = 0;
> > > +
> > > +    trace_dirty_bitmap_load_enter();
> > > +
> > > +    do {
> > > +        dirty_bitmap_load_header(f, &s);
> > > +
> > > +        if (s.flags & DIRTY_BITMAP_MIG_FLAG_START) {
> > > +            ret = dirty_bitmap_load_start(f, &s);
> > > +        } else if (s.flags & DIRTY_BITMAP_MIG_FLAG_COMPLETE) {
> > > +            dirty_bitmap_load_complete(f, &s);
> > > +        } else if (s.flags & DIRTY_BITMAP_MIG_FLAG_BITS) {
> > > +            ret = dirty_bitmap_load_bits(f, &s);
> > > +        }
> > > +
> > > +        if (!ret) {
> > > +            ret = qemu_file_get_error(f);
> > > +        }
> > > +
> > > +        if (ret) {
> > > +            return ret;
> > > +        }
> > > +    } while (!(s.flags & DIRTY_BITMAP_MIG_FLAG_EOS));
> > > +
> > > +    trace_dirty_bitmap_load_success();
> > > +    return 0;
> > > +}
> > > +
> > > +static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
> > > +{
> > > +    DirtyBitmapMigBitmapState *dbms = NULL;
> > > +    init_dirty_bitmap_migration();
> > > +
> > > +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> > > +        send_bitmap_start(f, dbms);
> > > +    }
> > > +    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> > > +
> > > +    return 0;
> > > +}
> > > +
> > > +static bool dirty_bitmap_is_active(void *opaque)
> > > +{
> > > +    return migrate_dirty_bitmaps();
> > > +}
> > > +
> > > +static bool dirty_bitmap_is_active_iterate(void *opaque)
> > > +{
> > > +    return dirty_bitmap_is_active(opaque) && !runstate_is_running();
> > > +}
> > > +
> > > +static bool dirty_bitmap_has_postcopy(void *opaque)
> > > +{
> > > +    return true;
> > > +}
> > > +
> > > +static SaveVMHandlers savevm_dirty_bitmap_handlers = {
> > > +    .save_live_setup = dirty_bitmap_save_setup,
> > > +    .save_live_complete_postcopy = dirty_bitmap_save_complete,
> > > +    .save_live_complete_precopy = dirty_bitmap_save_complete,
> > > +    .has_postcopy = dirty_bitmap_has_postcopy,
> > > +    .save_live_pending = dirty_bitmap_save_pending,
> > > +    .save_live_iterate = dirty_bitmap_save_iterate,
> > > +    .is_active_iterate = dirty_bitmap_is_active_iterate,
> > > +    .load_state = dirty_bitmap_load,
> > > +    .cleanup = dirty_bitmap_migration_cleanup,
> > > +    .is_active = dirty_bitmap_is_active,
> > > +};
> > > +
> > > +void dirty_bitmap_mig_init(void)
> > > +{
> > > +    QSIMPLEQ_INIT(&dirty_bitmap_mig_state.dbms_list);
> > > +
> > > +    register_savevm_live(NULL, "dirty-bitmap", 0, 1,
> > > +                         &savevm_dirty_bitmap_handlers,
> > > +                         &dirty_bitmap_mig_state);
> > > +}
> > > diff --git a/migration/migration.c b/migration/migration.c
> > > index 179bd04..edf5623 100644
> > > --- a/migration/migration.c
> > > +++ b/migration/migration.c
> > > @@ -122,6 +122,9 @@ MigrationIncomingState *migration_incoming_get_current(void)
> > >           QLIST_INIT(&mis_current.loadvm_handlers);
> > >           qemu_mutex_init(&mis_current.rp_mutex);
> > >           qemu_event_init(&mis_current.main_thread_load_event, false);
> > > +
> > > +        init_dirty_bitmap_incoming_migration();
> > > +
> > >           once = true;
> > >       }
> > >       return &mis_current;
> > > diff --git a/migration/savevm.c b/migration/savevm.c
> > > index 54572ef..927a680 100644
> > > --- a/migration/savevm.c
> > > +++ b/migration/savevm.c
> > > @@ -1651,6 +1651,8 @@ static void loadvm_postcopy_handle_run_bh(void *opaque)
> > >       trace_loadvm_postcopy_handle_run_vmstart();
> > > +    dirty_bitmap_mig_before_vm_start();
> > > +
> > >       if (autostart) {
> > >           /* Hold onto your hats, starting the CPU */
> > >           vm_start();
> > > diff --git a/migration/trace-events b/migration/trace-events
> > > index 0212929..38fca41 100644
> > > --- a/migration/trace-events
> > > +++ b/migration/trace-events
> > > @@ -222,3 +222,17 @@ colo_vm_state_change(const char *old, const char *new) "Change '%s' => '%s'"
> > >   colo_send_message(const char *msg) "Send '%s' message"
> > >   colo_receive_message(const char *msg) "Receive '%s' message"
> > >   colo_failover_set_state(const char *new_state) "new state %s"
> > > +
> > > +# migration/block-dirty-bitmap.c
> > > +send_bitmap_header_enter(void) ""
> > > +send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uint64_t data_size) "\n   flags:        %x\n   start_sector: %" PRIu64 "\n   nr_sectors:   %" PRIu32 "\n   data_size:    %" PRIu64 "\n"
> > > +dirty_bitmap_save_iterate(int in_postcopy) "in postcopy: %d"
> > > +dirty_bitmap_save_complete_enter(void) ""
> > > +dirty_bitmap_save_complete_finish(void) ""
> > > +dirty_bitmap_save_pending(uint64_t pending, uint64_t max_size) "pending %" PRIu64 " max: %" PRIu64
> > > +dirty_bitmap_load_complete(void) ""
> > > +dirty_bitmap_load_bits_enter(uint64_t first_sector, uint32_t nr_sectors) "chunk: %" PRIu64 " %" PRIu32
> > > +dirty_bitmap_load_bits_zeroes(void) ""
> > > +dirty_bitmap_load_header(uint32_t flags) "flags %x"
> > > +dirty_bitmap_load_enter(void) ""
> > > +dirty_bitmap_load_success(void) ""
> > > diff --git a/vl.c b/vl.c
> > > index b4eaf03..f1ee9ff 100644
> > > --- a/vl.c
> > > +++ b/vl.c
> > > @@ -4405,6 +4405,7 @@ int main(int argc, char **argv, char **envp)
> > >       blk_mig_init();
> > >       ram_mig_init();
> > > +    dirty_bitmap_mig_init();
> > >       /* If the currently selected machine wishes to override the units-per-bus
> > >        * property of its default HBA interface type, do so now. */
> > > -- 
> > > 1.8.3.1
> > > 
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> 
> 
> -- 
> Best regards,
> Vladimir
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Vladimir Sementsov-Ogievskiy April 26, 2017, 1:11 p.m. UTC | #6
25.02.2017 20:56, Vladimir Sementsov-Ogievskiy wrote:
> 16.02.2017 16:04, Fam Zheng wrote:
>> On Mon, 02/13 12:54, Vladimir Sementsov-Ogievskiy wrote:
>>> Postcopy migration of dirty bitmaps. Only named dirty bitmaps,
>>> associated with root nodes and non-root named nodes are migrated.
>>>
>>> If destination qemu is already containing a dirty bitmap with the 
>>> same name
>>> as a migrated bitmap (for the same node), than, if their 
>>> granularities are
>
> [...]
>
>>> +
>>> +#define CHUNK_SIZE     (1 << 10)
>>> +
>>> +/* Flags occupy from one to four bytes. In all but one the 7-th 
>>> (EXTRA_FLAGS)
>>> + * bit should be set. */
>>> +#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
>>> +#define DIRTY_BITMAP_MIG_FLAG_ZEROES        0x02
>>> +#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x04
>>> +#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x08
>>> +#define DIRTY_BITMAP_MIG_FLAG_START         0x10
>>> +#define DIRTY_BITMAP_MIG_FLAG_COMPLETE      0x20
>>> +#define DIRTY_BITMAP_MIG_FLAG_BITS          0x40
>>> +
>>> +#define DIRTY_BITMAP_MIG_EXTRA_FLAGS        0x80
>>> +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_16      0x8000
>> This flag means two bytes, right? But your above comment says "7-th 
>> bit should
>> be set". This doesn't make sense. Should this be "0x80" too?
>
> Hmm, good caught, you are right. Also, the comment should be fixed so 
> that there are may be 1,2 or 4 bytes, and of course EXTRA_FLAGS bit 
> may be only in first and second bytes (the code do so).

Aha, now I understand that 0x8000 is ok for big endian

>
>>
>>> +#define DIRTY_BITMAP_MIG_FLAGS_SIZE_32      0x8080

and this is not ok)

I think, I'll just drop this. Anyway it is a dead code, we do not send 
flags more than 1 byte in the code. On the other hand, receive flags 
path is absolutely ok, and it is for backward compatibility - we just 
ignore unknown flags.

>>> +
>>> +#define DEBUG_DIRTY_BITMAP_MIGRATION 0
>
> [...]
>
>> +
>> +static void send_bitmap_bits(QEMUFile *f, DirtyBitmapMigBitmapState 
>> *dbms,
>> +                             uint64_t start_sector, uint32_t 
>> nr_sectors)
>> +{
>> +    /* align for buffer_is_zero() */
>> +    uint64_t align = 4 * sizeof(long);
>> +    uint64_t unaligned_size =
>> +        bdrv_dirty_bitmap_serialization_size(dbms->bitmap,
>> +                                             start_sector, nr_sectors);
>> +    uint64_t buf_size = (unaligned_size + align - 1) & ~(align - 1);
>> +    uint8_t *buf = g_malloc0(buf_size);
>> +    uint32_t flags = DIRTY_BITMAP_MIG_FLAG_BITS;
>> +
>> +    bdrv_dirty_bitmap_serialize_part(dbms->bitmap, buf,
>> +                                     start_sector, nr_sectors);
>> While these bdrv_dirty_bitmap_* calls here seem fine, BdrvDirtyBitmap 
>> API is not
>> in general thread-safe, while this function is called without any 
>> lock. This
>> feels dangerous, as noted below, I'm most concerned about 
>> use-after-free.
>
> This should be safe as it is a postcopy migration - source should be 
> already inactive.
>
>>
>>> +
>>> +    if (buffer_is_zero(buf, buf_size)) {
>>> +        g_free(buf);
>>> +        buf = NULL;
>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_ZEROES;
>>> +    }
>>> +
>>> +    trace_send_bitmap_bits(flags, start_sector, nr_sectors, buf_size);
>>> +
>>> +    send_bitmap_header(f, dbms, flags);
>>> +
>>> +    qemu_put_be64(f, start_sector);
>>> +    qemu_put_be32(f, nr_sectors);
>>> +
>>> +    /* if a block is zero we need to flush here since the network
>>> +     * bandwidth is now a lot higher than the storage device 
>>> bandwidth.
>>> +     * thus if we queue zero blocks we slow down the migration. */
>>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
>>> +        qemu_fflush(f);
>>> +    } else {
>>> +        qemu_put_be64(f, buf_size);
>>> +        qemu_put_buffer(f, buf, buf_size);
>>> +    }
>>> +
>>> +    g_free(buf);
>>> +}
>>> +
>>> +
>
> [...]
>
>>> +
>>> +static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
>>> +                                      uint64_t max_size,
>>> +                                      uint64_t *res_precopy_only,
>>> +                                      uint64_t *res_compatible,
>>> +                                      uint64_t *res_postcopy_only)
>>> +{
>>> +    DirtyBitmapMigBitmapState *dbms;
>>> +    uint64_t pending = 0;
>>> +
>>> +    qemu_mutex_lock_iothread();
>> Why do you need the BQL here but not in bulk_phase()?
>
> bulk_phase is in postcopy, source is inactive
>
>
Vladimir Sementsov-Ogievskiy July 5, 2017, 9:24 a.m. UTC | #7
16.02.2017 16:04, Fam Zheng wrote:
>> +            dbms->node_name = bdrv_get_node_name(bs);
>> +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
>> +                dbms->node_name = bdrv_get_device_name(bs);
>> +            }
>> +            dbms->bitmap = bitmap;
> What protects the case that the bitmap is released before migration completes?
>
What is the source of such deletion? qmp command? Theoretically possible.

I see the following variants:

1. additional variable BdrvDirtyBItmap.migration, which forbids bitmap 
deletion

2. make bitmap anonymous (bdrv_dirty_bitmap_make_anon) - it will not be 
available through qmp

what do you think?
John Snow July 5, 2017, 9:46 p.m. UTC | #8
On 07/05/2017 05:24 AM, Vladimir Sementsov-Ogievskiy wrote:
> 16.02.2017 16:04, Fam Zheng wrote:
>>> +            dbms->node_name = bdrv_get_node_name(bs);
>>> +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
>>> +                dbms->node_name = bdrv_get_device_name(bs);
>>> +            }
>>> +            dbms->bitmap = bitmap;
>> What protects the case that the bitmap is released before migration
>> completes?
>>
> What is the source of such deletion? qmp command? Theoretically possible.
> 
> I see the following variants:
> 
> 1. additional variable BdrvDirtyBItmap.migration, which forbids bitmap
> deletion
> 
> 2. make bitmap anonymous (bdrv_dirty_bitmap_make_anon) - it will not be
> available through qmp
> 

Making the bitmap anonymous would forbid us to query the bitmap, which
there is no general reason to do, excepting the idea that a third party
attempting to use the bitmap during a migration is probably a bad idea.
I don't really like the idea of "hiding" information from the user,
though, because then we'd have to worry about name collisions when we
de-anonymized the bitmap again. That's not so palatable.

> what do you think?
> 

The modes for bitmaps are getting messy.

As a reminder, the officially exposed "modes" of a bitmap are currently:

FROZEN: Cannot be reset/deleted. Implication is that the bitmap is
otherwise "ACTIVE."
DISABLED: Not recording any writes (by choice.)
ACTIVE: Actively recording writes.

These are documented in the public API as possibilities for
DirtyBitmapStatus in block-core.json. We didn't add a new condition for
"readonly" either, which I think is actually required:

READONLY: Not recording any writes (by necessity.)


Your new use case here sounds like Frozen to me, but it simply does not
have an anonymous successor to force it to be recognized as "frozen." We
can add a `bool protected` or `bool frozen` field to force recognition
of this status and adjust the json documentation accordingly.

I think then we'd have four recognized states:

FROZEN: Cannot be reset/deleted. Bitmap is in-use by a block job or
other internal process. Bitmap is otherwise ACTIVE.
DISABLED: Not recording any writes (by choice.)
READONLY: Not able to record any writes (by necessity.)
ACTIVE: Normal bitmap status.

Sound right?
Vladimir Sementsov-Ogievskiy July 6, 2017, 8:05 a.m. UTC | #9
06.07.2017 00:46, John Snow wrote:
>
> On 07/05/2017 05:24 AM, Vladimir Sementsov-Ogievskiy wrote:
>> 16.02.2017 16:04, Fam Zheng wrote:
>>>> +            dbms->node_name = bdrv_get_node_name(bs);
>>>> +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
>>>> +                dbms->node_name = bdrv_get_device_name(bs);
>>>> +            }
>>>> +            dbms->bitmap = bitmap;
>>> What protects the case that the bitmap is released before migration
>>> completes?
>>>
>> What is the source of such deletion? qmp command? Theoretically possible.
>>
>> I see the following variants:
>>
>> 1. additional variable BdrvDirtyBItmap.migration, which forbids bitmap
>> deletion
>>
>> 2. make bitmap anonymous (bdrv_dirty_bitmap_make_anon) - it will not be
>> available through qmp
>>
> Making the bitmap anonymous would forbid us to query the bitmap, which
> there is no general reason to do, excepting the idea that a third party
> attempting to use the bitmap during a migration is probably a bad idea.
> I don't really like the idea of "hiding" information from the user,
> though, because then we'd have to worry about name collisions when we
> de-anonymized the bitmap again. That's not so palatable.
>
>> what do you think?
>>
> The modes for bitmaps are getting messy.
>
> As a reminder, the officially exposed "modes" of a bitmap are currently:
>
> FROZEN: Cannot be reset/deleted. Implication is that the bitmap is
> otherwise "ACTIVE."
> DISABLED: Not recording any writes (by choice.)
> ACTIVE: Actively recording writes.
>
> These are documented in the public API as possibilities for
> DirtyBitmapStatus in block-core.json. We didn't add a new condition for
> "readonly" either, which I think is actually required:
>
> READONLY: Not recording any writes (by necessity.)
>
>
> Your new use case here sounds like Frozen to me, but it simply does not
> have an anonymous successor to force it to be recognized as "frozen." We
> can add a `bool protected` or `bool frozen` field to force recognition
> of this status and adjust the json documentation accordingly.

Bitmaps are selected for migration when source is running, so we should 
protect them (from deletion (or frozing or disabling), not from chaning 
bits) before source stop, so that is not like frozen. Bitmaps may be 
changed in this state.
It is more like ACTIVE.

We can move bitmap selection on the point after precopy migration, after 
source stop, but I'm not sure that it would be good.

>
> I think then we'd have four recognized states:
>
> FROZEN: Cannot be reset/deleted. Bitmap is in-use by a block job or
> other internal process. Bitmap is otherwise ACTIVE.

? Frozen means that all writes goes to the successor and frozen bitmap 
itself is unchanged, no?

> DISABLED: Not recording any writes (by choice.)
> READONLY: Not able to record any writes (by necessity.)
> ACTIVE: Normal bitmap status.
>
> Sound right?
John Snow July 6, 2017, 5:53 p.m. UTC | #10
On 07/06/2017 04:05 AM, Vladimir Sementsov-Ogievskiy wrote:
> 06.07.2017 00:46, John Snow wrote:
>>
>> On 07/05/2017 05:24 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> 16.02.2017 16:04, Fam Zheng wrote:
>>>>> +            dbms->node_name = bdrv_get_node_name(bs);
>>>>> +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
>>>>> +                dbms->node_name = bdrv_get_device_name(bs);
>>>>> +            }
>>>>> +            dbms->bitmap = bitmap;
>>>> What protects the case that the bitmap is released before migration
>>>> completes?
>>>>
>>> What is the source of such deletion? qmp command? Theoretically
>>> possible.
>>>
>>> I see the following variants:
>>>
>>> 1. additional variable BdrvDirtyBItmap.migration, which forbids bitmap
>>> deletion
>>>
>>> 2. make bitmap anonymous (bdrv_dirty_bitmap_make_anon) - it will not be
>>> available through qmp
>>>
>> Making the bitmap anonymous would forbid us to query the bitmap, which
>> there is no general reason to do, excepting the idea that a third party
>> attempting to use the bitmap during a migration is probably a bad idea.
>> I don't really like the idea of "hiding" information from the user,
>> though, because then we'd have to worry about name collisions when we
>> de-anonymized the bitmap again. That's not so palatable.
>>
>>> what do you think?
>>>
>> The modes for bitmaps are getting messy.
>>
>> As a reminder, the officially exposed "modes" of a bitmap are currently:
>>
>> FROZEN: Cannot be reset/deleted. Implication is that the bitmap is
>> otherwise "ACTIVE."
>> DISABLED: Not recording any writes (by choice.)
>> ACTIVE: Actively recording writes.
>>
>> These are documented in the public API as possibilities for
>> DirtyBitmapStatus in block-core.json. We didn't add a new condition for
>> "readonly" either, which I think is actually required:
>>
>> READONLY: Not recording any writes (by necessity.)
>>
>>
>> Your new use case here sounds like Frozen to me, but it simply does not
>> have an anonymous successor to force it to be recognized as "frozen." We
>> can add a `bool protected` or `bool frozen` field to force recognition
>> of this status and adjust the json documentation accordingly.
> 
> Bitmaps are selected for migration when source is running, so we should
> protect them (from deletion (or frozing or disabling), not from chaning
> bits) before source stop, so that is not like frozen. Bitmaps may be
> changed in this state.
> It is more like ACTIVE.
> 

Right, it's not exactly like frozen's _implementation_ today, but...

> We can move bitmap selection on the point after precopy migration, after
> source stop, but I'm not sure that it would be good.
> 
>>
>> I think then we'd have four recognized states:
>>
>> FROZEN: Cannot be reset/deleted. Bitmap is in-use by a block job or
>> other internal process. Bitmap is otherwise ACTIVE.
> 
> ? Frozen means that all writes goes to the successor and frozen bitmap
> itself is unchanged, no?
> 

I was thinking from the point of view of the API. Of course internally,
you're correct; a "frozen bitmap" is one that is actually disabled and
has an anonymous successor, and that successor records IO.

From the point of view of the API, a frozen bitmap is just "one bitmap"
that is still recording reads/writes, but is protected from being edited
from QMP.

It depends on if you're looking at bitmaps as opaque API objects or if
you're looking at the implementation.

From an API point of view, protecting an Active bitmap from being
renamed/cleared/deleted is functionally identical to the existing case
of protecting a bitmap-and-successor pair during a backup job.

>> DISABLED: Not recording any writes (by choice.)
>> READONLY: Not able to record any writes (by necessity.)
>> ACTIVE: Normal bitmap status.
>>
>> Sound right?
> 
>
Denis V. Lunev July 7, 2017, 9:04 a.m. UTC | #11
On 07/06/2017 08:53 PM, John Snow wrote:
>
> On 07/06/2017 04:05 AM, Vladimir Sementsov-Ogievskiy wrote:
>> 06.07.2017 00:46, John Snow wrote:
>>> On 07/05/2017 05:24 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>> 16.02.2017 16:04, Fam Zheng wrote:
>>>>>> +            dbms->node_name = bdrv_get_node_name(bs);
>>>>>> +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
>>>>>> +                dbms->node_name = bdrv_get_device_name(bs);
>>>>>> +            }
>>>>>> +            dbms->bitmap = bitmap;
>>>>> What protects the case that the bitmap is released before migration
>>>>> completes?
>>>>>
>>>> What is the source of such deletion? qmp command? Theoretically
>>>> possible.
>>>>
>>>> I see the following variants:
>>>>
>>>> 1. additional variable BdrvDirtyBItmap.migration, which forbids bitmap
>>>> deletion
>>>>
>>>> 2. make bitmap anonymous (bdrv_dirty_bitmap_make_anon) - it will not be
>>>> available through qmp
>>>>
>>> Making the bitmap anonymous would forbid us to query the bitmap, which
>>> there is no general reason to do, excepting the idea that a third party
>>> attempting to use the bitmap during a migration is probably a bad idea.
>>> I don't really like the idea of "hiding" information from the user,
>>> though, because then we'd have to worry about name collisions when we
>>> de-anonymized the bitmap again. That's not so palatable.
>>>
>>>> what do you think?
>>>>
>>> The modes for bitmaps are getting messy.
>>>
>>> As a reminder, the officially exposed "modes" of a bitmap are currently:
>>>
>>> FROZEN: Cannot be reset/deleted. Implication is that the bitmap is
>>> otherwise "ACTIVE."
>>> DISABLED: Not recording any writes (by choice.)
>>> ACTIVE: Actively recording writes.
>>>
>>> These are documented in the public API as possibilities for
>>> DirtyBitmapStatus in block-core.json. We didn't add a new condition for
>>> "readonly" either, which I think is actually required:
>>>
>>> READONLY: Not recording any writes (by necessity.)
>>>
>>>
>>> Your new use case here sounds like Frozen to me, but it simply does not
>>> have an anonymous successor to force it to be recognized as "frozen." We
>>> can add a `bool protected` or `bool frozen` field to force recognition
>>> of this status and adjust the json documentation accordingly.
>> Bitmaps are selected for migration when source is running, so we should
>> protect them (from deletion (or frozing or disabling), not from chaning
>> bits) before source stop, so that is not like frozen. Bitmaps may be
>> changed in this state.
>> It is more like ACTIVE.
>>
> Right, it's not exactly like frozen's _implementation_ today, but...
>
>> We can move bitmap selection on the point after precopy migration, after
>> source stop, but I'm not sure that it would be good.
>>
>>> I think then we'd have four recognized states:
>>>
>>> FROZEN: Cannot be reset/deleted. Bitmap is in-use by a block job or
>>> other internal process. Bitmap is otherwise ACTIVE.
>> ? Frozen means that all writes goes to the successor and frozen bitmap
>> itself is unchanged, no?
>>
> I was thinking from the point of view of the API. Of course internally,
> you're correct; a "frozen bitmap" is one that is actually disabled and
> has an anonymous successor, and that successor records IO.
>
> From the point of view of the API, a frozen bitmap is just "one bitmap"
> that is still recording reads/writes, but is protected from being edited
> from QMP.
>
> It depends on if you're looking at bitmaps as opaque API objects or if
> you're looking at the implementation.
>
> From an API point of view, protecting an Active bitmap from being
> renamed/cleared/deleted is functionally identical to the existing case
> of protecting a bitmap-and-successor pair during a backup job.

I think that libvirt properly guards QMP avoid commands changing the
device tree etc at the moment. Thus we should be fine here.

Den

>>> DISABLED: Not recording any writes (by choice.)
>>> READONLY: Not able to record any writes (by necessity.)
>>> ACTIVE: Normal bitmap status.
>>>
>>> Sound right?
>>
Vladimir Sementsov-Ogievskiy July 7, 2017, 9:13 a.m. UTC | #12
06.07.2017 20:53, John Snow wrote:
>
> On 07/06/2017 04:05 AM, Vladimir Sementsov-Ogievskiy wrote:
>> 06.07.2017 00:46, John Snow wrote:
>>> On 07/05/2017 05:24 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>> 16.02.2017 16:04, Fam Zheng wrote:
>>>>>> +            dbms->node_name = bdrv_get_node_name(bs);
>>>>>> +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
>>>>>> +                dbms->node_name = bdrv_get_device_name(bs);
>>>>>> +            }
>>>>>> +            dbms->bitmap = bitmap;
>>>>> What protects the case that the bitmap is released before migration
>>>>> completes?
>>>>>
>>>> What is the source of such deletion? qmp command? Theoretically
>>>> possible.
>>>>
>>>> I see the following variants:
>>>>
>>>> 1. additional variable BdrvDirtyBItmap.migration, which forbids bitmap
>>>> deletion
>>>>
>>>> 2. make bitmap anonymous (bdrv_dirty_bitmap_make_anon) - it will not be
>>>> available through qmp
>>>>
>>> Making the bitmap anonymous would forbid us to query the bitmap, which
>>> there is no general reason to do, excepting the idea that a third party
>>> attempting to use the bitmap during a migration is probably a bad idea.
>>> I don't really like the idea of "hiding" information from the user,
>>> though, because then we'd have to worry about name collisions when we
>>> de-anonymized the bitmap again. That's not so palatable.
>>>
>>>> what do you think?
>>>>
>>> The modes for bitmaps are getting messy.
>>>
>>> As a reminder, the officially exposed "modes" of a bitmap are currently:
>>>
>>> FROZEN: Cannot be reset/deleted. Implication is that the bitmap is
>>> otherwise "ACTIVE."
>>> DISABLED: Not recording any writes (by choice.)
>>> ACTIVE: Actively recording writes.
>>>
>>> These are documented in the public API as possibilities for
>>> DirtyBitmapStatus in block-core.json. We didn't add a new condition for
>>> "readonly" either, which I think is actually required:
>>>
>>> READONLY: Not recording any writes (by necessity.)
>>>
>>>
>>> Your new use case here sounds like Frozen to me, but it simply does not
>>> have an anonymous successor to force it to be recognized as "frozen." We
>>> can add a `bool protected` or `bool frozen` field to force recognition
>>> of this status and adjust the json documentation accordingly.
>> Bitmaps are selected for migration when source is running, so we should
>> protect them (from deletion (or frozing or disabling), not from chaning
>> bits) before source stop, so that is not like frozen. Bitmaps may be
>> changed in this state.
>> It is more like ACTIVE.
>>
> Right, it's not exactly like frozen's _implementation_ today, but...
>
>> We can move bitmap selection on the point after precopy migration, after
>> source stop, but I'm not sure that it would be good.
>>
>>> I think then we'd have four recognized states:
>>>
>>> FROZEN: Cannot be reset/deleted. Bitmap is in-use by a block job or
>>> other internal process. Bitmap is otherwise ACTIVE.
>> ? Frozen means that all writes goes to the successor and frozen bitmap
>> itself is unchanged, no?
>>
> I was thinking from the point of view of the API. Of course internally,
> you're correct; a "frozen bitmap" is one that is actually disabled and
> has an anonymous successor, and that successor records IO.
>
>  From the point of view of the API, a frozen bitmap is just "one bitmap"
> that is still recording reads/writes, but is protected from being edited
> from QMP.
>
> It depends on if you're looking at bitmaps as opaque API objects or if
> you're looking at the implementation.
>
>  From an API point of view, protecting an Active bitmap from being
> renamed/cleared/deleted is functionally identical to the existing case
> of protecting a bitmap-and-successor pair during a backup job.

then, this should be described in API ref..

for now I see here:
# @frozen: The bitmap is currently in-use by a backup operation or block 
job,
#          and is immutable.

Which looks more like the bitmap is unchanged at all.

---

Do not we want in future allow user to create successors through qmp?

We have the following case: exteranal backup.

Backup should be done through image fleecing (temporary image, online 
drive is backing for temp, start backup with sync=none from online drive 
to temp, export temp through nbd).
To make this backup incremental here is BLOCK_STATUS nbd extension, 
which allows dirty bitmap export.
But we need also frozen-bitmap mechanism like with normal incremental 
backup.. Should we create qmp_create_bitmap_successor and friends for it?
Or we will add separate mechanism like qmp_start_external_backup, 
qmp_finish_external_backup(success = true/false) ?



>
>>> DISABLED: Not recording any writes (by choice.)
>>> READONLY: Not able to record any writes (by necessity.)
>>> ACTIVE: Normal bitmap status.
>>>
>>> Sound right?
>>
John Snow July 7, 2017, 11:32 p.m. UTC | #13
On 07/07/2017 05:13 AM, Vladimir Sementsov-Ogievskiy wrote:
> 06.07.2017 20:53, John Snow wrote:
>>
>> On 07/06/2017 04:05 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> 06.07.2017 00:46, John Snow wrote:
>>>> On 07/05/2017 05:24 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>>> 16.02.2017 16:04, Fam Zheng wrote:
>>>>>>> +            dbms->node_name = bdrv_get_node_name(bs);
>>>>>>> +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
>>>>>>> +                dbms->node_name = bdrv_get_device_name(bs);
>>>>>>> +            }
>>>>>>> +            dbms->bitmap = bitmap;
>>>>>> What protects the case that the bitmap is released before migration
>>>>>> completes?
>>>>>>
>>>>> What is the source of such deletion? qmp command? Theoretically
>>>>> possible.
>>>>>
>>>>> I see the following variants:
>>>>>
>>>>> 1. additional variable BdrvDirtyBItmap.migration, which forbids bitmap
>>>>> deletion
>>>>>
>>>>> 2. make bitmap anonymous (bdrv_dirty_bitmap_make_anon) - it will
>>>>> not be
>>>>> available through qmp
>>>>>
>>>> Making the bitmap anonymous would forbid us to query the bitmap, which
>>>> there is no general reason to do, excepting the idea that a third party
>>>> attempting to use the bitmap during a migration is probably a bad idea.
>>>> I don't really like the idea of "hiding" information from the user,
>>>> though, because then we'd have to worry about name collisions when we
>>>> de-anonymized the bitmap again. That's not so palatable.
>>>>
>>>>> what do you think?
>>>>>
>>>> The modes for bitmaps are getting messy.
>>>>
>>>> As a reminder, the officially exposed "modes" of a bitmap are
>>>> currently:
>>>>
>>>> FROZEN: Cannot be reset/deleted. Implication is that the bitmap is
>>>> otherwise "ACTIVE."
>>>> DISABLED: Not recording any writes (by choice.)
>>>> ACTIVE: Actively recording writes.
>>>>
>>>> These are documented in the public API as possibilities for
>>>> DirtyBitmapStatus in block-core.json. We didn't add a new condition for
>>>> "readonly" either, which I think is actually required:
>>>>
>>>> READONLY: Not recording any writes (by necessity.)
>>>>
>>>>
>>>> Your new use case here sounds like Frozen to me, but it simply does not
>>>> have an anonymous successor to force it to be recognized as
>>>> "frozen." We
>>>> can add a `bool protected` or `bool frozen` field to force recognition
>>>> of this status and adjust the json documentation accordingly.
>>> Bitmaps are selected for migration when source is running, so we should
>>> protect them (from deletion (or frozing or disabling), not from chaning
>>> bits) before source stop, so that is not like frozen. Bitmaps may be
>>> changed in this state.
>>> It is more like ACTIVE.
>>>
>> Right, it's not exactly like frozen's _implementation_ today, but...
>>
>>> We can move bitmap selection on the point after precopy migration, after
>>> source stop, but I'm not sure that it would be good.
>>>
>>>> I think then we'd have four recognized states:
>>>>
>>>> FROZEN: Cannot be reset/deleted. Bitmap is in-use by a block job or
>>>> other internal process. Bitmap is otherwise ACTIVE.
>>> ? Frozen means that all writes goes to the successor and frozen bitmap
>>> itself is unchanged, no?
>>>
>> I was thinking from the point of view of the API. Of course internally,
>> you're correct; a "frozen bitmap" is one that is actually disabled and
>> has an anonymous successor, and that successor records IO.
>>
>>  From the point of view of the API, a frozen bitmap is just "one bitmap"
>> that is still recording reads/writes, but is protected from being edited
>> from QMP.
>>
>> It depends on if you're looking at bitmaps as opaque API objects or if
>> you're looking at the implementation.
>>
>>  From an API point of view, protecting an Active bitmap from being
>> renamed/cleared/deleted is functionally identical to the existing case
>> of protecting a bitmap-and-successor pair during a backup job.
> 
> then, this should be described in API ref..
> 
> for now I see here:
> # @frozen: The bitmap is currently in-use by a backup operation or block
> job,
> #          and is immutable.
> 
> Which looks more like the bitmap is unchanged at all.
> 

You're right, this is a bad wording. It's technically true of course,
but in truth the bitmap is effectively recording writes if you consider
the bitmap and the anonymous successor as "one object."

What's really immutable here from the API POV is any user-modifiable
properties.

> ---
> 
> Do not we want in future allow user to create successors through qmp?
> 

I am not sure if I want to expose this functionality *DIRECTLY* but yes,
this use case will necessitate the creation of successors. I just don't
really want to give the user direct control of them via QMP. They're an
implementation detail, nothing more.

> We have the following case: exteranal backup.
> 
> Backup should be done through image fleecing (temporary image, online
> drive is backing for temp, start backup with sync=none from online drive
> to temp, export temp through nbd).
> To make this backup incremental here is BLOCK_STATUS nbd extension,
> which allows dirty bitmap export.
> But we need also frozen-bitmap mechanism like with normal incremental
> backup.. Should we create qmp_create_bitmap_successor and friends for it?
> Or we will add separate mechanism like qmp_start_external_backup,
> qmp_finish_external_backup(success = true/false) ?
> 

That was my thought at least, yes. Some kind of job mechanism that
allowed us to have start/stop/cancel mechanics that made it clear to
QEMU if it should roll back the bitmap or promote the successor.

> 
> 
>>
>>>> DISABLED: Not recording any writes (by choice.)
>>>> READONLY: Not able to record any writes (by necessity.)
>>>> ACTIVE: Normal bitmap status.
>>>>
>>>> Sound right?
>>>
>
Vladimir Sementsov-Ogievskiy July 10, 2017, 9:17 a.m. UTC | #14
08.07.2017 02:32, John Snow wrote:
>
> On 07/07/2017 05:13 AM, Vladimir Sementsov-Ogievskiy wrote:
>> 06.07.2017 20:53, John Snow wrote:
>>> On 07/06/2017 04:05 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>> 06.07.2017 00:46, John Snow wrote:
>>>>> On 07/05/2017 05:24 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>>>> 16.02.2017 16:04, Fam Zheng wrote:
>>>>>>>> +            dbms->node_name = bdrv_get_node_name(bs);
>>>>>>>> +            if (!dbms->node_name || dbms->node_name[0] == '\0') {
>>>>>>>> +                dbms->node_name = bdrv_get_device_name(bs);
>>>>>>>> +            }
>>>>>>>> +            dbms->bitmap = bitmap;
>>>>>>> What protects the case that the bitmap is released before migration
>>>>>>> completes?
>>>>>>>
>>>>>> What is the source of such deletion? qmp command? Theoretically
>>>>>> possible.
>>>>>>
>>>>>> I see the following variants:
>>>>>>
>>>>>> 1. additional variable BdrvDirtyBItmap.migration, which forbids bitmap
>>>>>> deletion
>>>>>>
>>>>>> 2. make bitmap anonymous (bdrv_dirty_bitmap_make_anon) - it will
>>>>>> not be
>>>>>> available through qmp
>>>>>>
>>>>> Making the bitmap anonymous would forbid us to query the bitmap, which
>>>>> there is no general reason to do, excepting the idea that a third party
>>>>> attempting to use the bitmap during a migration is probably a bad idea.
>>>>> I don't really like the idea of "hiding" information from the user,
>>>>> though, because then we'd have to worry about name collisions when we
>>>>> de-anonymized the bitmap again. That's not so palatable.
>>>>>
>>>>>> what do you think?
>>>>>>
>>>>> The modes for bitmaps are getting messy.
>>>>>
>>>>> As a reminder, the officially exposed "modes" of a bitmap are
>>>>> currently:
>>>>>
>>>>> FROZEN: Cannot be reset/deleted. Implication is that the bitmap is
>>>>> otherwise "ACTIVE."
>>>>> DISABLED: Not recording any writes (by choice.)
>>>>> ACTIVE: Actively recording writes.
>>>>>
>>>>> These are documented in the public API as possibilities for
>>>>> DirtyBitmapStatus in block-core.json. We didn't add a new condition for
>>>>> "readonly" either, which I think is actually required:
>>>>>
>>>>> READONLY: Not recording any writes (by necessity.)
>>>>>
>>>>>
>>>>> Your new use case here sounds like Frozen to me, but it simply does not
>>>>> have an anonymous successor to force it to be recognized as
>>>>> "frozen." We
>>>>> can add a `bool protected` or `bool frozen` field to force recognition
>>>>> of this status and adjust the json documentation accordingly.
>>>> Bitmaps are selected for migration when source is running, so we should
>>>> protect them (from deletion (or frozing or disabling), not from chaning
>>>> bits) before source stop, so that is not like frozen. Bitmaps may be
>>>> changed in this state.
>>>> It is more like ACTIVE.
>>>>
>>> Right, it's not exactly like frozen's _implementation_ today, but...
>>>
>>>> We can move bitmap selection on the point after precopy migration, after
>>>> source stop, but I'm not sure that it would be good.
>>>>
>>>>> I think then we'd have four recognized states:
>>>>>
>>>>> FROZEN: Cannot be reset/deleted. Bitmap is in-use by a block job or
>>>>> other internal process. Bitmap is otherwise ACTIVE.
>>>> ? Frozen means that all writes goes to the successor and frozen bitmap
>>>> itself is unchanged, no?
>>>>
>>> I was thinking from the point of view of the API. Of course internally,
>>> you're correct; a "frozen bitmap" is one that is actually disabled and
>>> has an anonymous successor, and that successor records IO.
>>>
>>>   From the point of view of the API, a frozen bitmap is just "one bitmap"
>>> that is still recording reads/writes, but is protected from being edited
>>> from QMP.
>>>
>>> It depends on if you're looking at bitmaps as opaque API objects or if
>>> you're looking at the implementation.
>>>
>>>   From an API point of view, protecting an Active bitmap from being
>>> renamed/cleared/deleted is functionally identical to the existing case
>>> of protecting a bitmap-and-successor pair during a backup job.
>> then, this should be described in API ref..
>>
>> for now I see here:
>> # @frozen: The bitmap is currently in-use by a backup operation or block
>> job,
>> #          and is immutable.
>>
>> Which looks more like the bitmap is unchanged at all.
>>
> You're right, this is a bad wording. It's technically true of course,
> but in truth the bitmap is effectively recording writes if you consider
> the bitmap and the anonymous successor as "one object."
>
> What's really immutable here from the API POV is any user-modifiable
> properties.

Can we document this somehow?

In this terminology, what happens with frozen bitmap (which in fact is 
like "active" for the user) on successful backup finish? It turns into a 
difference between current-state and what-was-backed-up? It is not very 
transparent.. May be bitmap part, related to the backup is substituted 
from the bitmap?

// it's hard for me to believe that 'frozen' doesn't mean 'can not 
move', but if it is already established idea then ok.

>
>> ---
>>
>> Do not we want in future allow user to create successors through qmp?
>>
> I am not sure if I want to expose this functionality *DIRECTLY* but yes,
> this use case will necessitate the creation of successors. I just don't
> really want to give the user direct control of them via QMP. They're an
> implementation detail, nothing more.
>
>> We have the following case: exteranal backup.
>>
>> Backup should be done through image fleecing (temporary image, online
>> drive is backing for temp, start backup with sync=none from online drive
>> to temp, export temp through nbd).
>> To make this backup incremental here is BLOCK_STATUS nbd extension,
>> which allows dirty bitmap export.
>> But we need also frozen-bitmap mechanism like with normal incremental
>> backup.. Should we create qmp_create_bitmap_successor and friends for it?
>> Or we will add separate mechanism like qmp_start_external_backup,
>> qmp_finish_external_backup(success = true/false) ?
>>
> That was my thought at least, yes. Some kind of job mechanism that
> allowed us to have start/stop/cancel mechanics that made it clear to
> QEMU if it should roll back the bitmap or promote the successor.
>
>>
>>>>> DISABLED: Not recording any writes (by choice.)
>>>>> READONLY: Not able to record any writes (by necessity.)
>>>>> ACTIVE: Normal bitmap status.
>>>>>
>>>>> Sound right?
John Snow July 10, 2017, 11:27 p.m. UTC | #15
On 07/10/2017 05:17 AM, Vladimir Sementsov-Ogievskiy wrote:
> 
> Can we document this somehow?
> 
> In this terminology, what happens with frozen bitmap (which in fact is 
> like "active" for the user) on successful backup finish? It turns into a 
> difference between current-state and what-was-backed-up? It is not very 
> transparent.. May be bitmap part, related to the backup is substituted 
> from the bitmap?
> 
> // it's hard for me to believe that 'frozen' doesn't mean 'can not 
> move', but if it is already established idea then ok.

Admittedly a mistake on my part, mixing up implementation detail and 
exposed interface. It's not necessarily an established idea...

To the end-user, a bitmap that is "frozen" is still indeed recording new 
writes from a certain point of view, because once the backup completes, 
the bitmap (secretly: the promoted successor) will contain writes that 
occurred during that frozen period of time. How could this happen unless 
our "frozen" bitmap was recording writes?

Oops, that's not very frozen at all, is it.

That's an unfortunate bit of design that I didn't take into account when 
I named the "Frozen" property and exposed it via the JSON API. I was 
only thinking about it from the implementation point of view, like you 
are -- the bitmap _is_ well and truly frozen: it cannot be modified and 
does not record any writes. (Only the successor does, as you know.)

So as it stands we have a discrepancy in the code between internal and 
external usage over what exactly "frozen" implies. It's a poor name.

I should have named the QAPI-facing portion of it "locked" or 
"protected" or something that helped imply that it could continue to 
change, but it was off-limits.
diff mbox

Patch

diff --git a/include/migration/block.h b/include/migration/block.h
index 41a1ac8..8333c43 100644
--- a/include/migration/block.h
+++ b/include/migration/block.h
@@ -14,6 +14,7 @@ 
 #ifndef MIGRATION_BLOCK_H
 #define MIGRATION_BLOCK_H
 
+void dirty_bitmap_mig_init(void);
 void blk_mig_init(void);
 int blk_mig_active(void);
 uint64_t blk_mig_bytes_transferred(void);
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 46645f4..03a4993 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -371,4 +371,8 @@  int ram_save_queue_pages(MigrationState *ms, const char *rbname,
 PostcopyState postcopy_state_get(void);
 /* Set the state and return the old state */
 PostcopyState postcopy_state_set(PostcopyState new_state);
+
+void dirty_bitmap_mig_before_vm_start(void);
+void init_dirty_bitmap_incoming_migration(void);
+
 #endif
diff --git a/migration/Makefile.objs b/migration/Makefile.objs
index 480dd49..fa3bf6a 100644
--- a/migration/Makefile.objs
+++ b/migration/Makefile.objs
@@ -9,5 +9,5 @@  common-obj-y += qjson.o
 
 common-obj-$(CONFIG_RDMA) += rdma.o
 
-common-obj-y += block.o
+common-obj-y += block.o block-dirty-bitmap.o
 
diff --git a/migration/block-dirty-bitmap.c b/migration/block-dirty-bitmap.c
new file mode 100644
index 0000000..28e3732
--- /dev/null
+++ b/migration/block-dirty-bitmap.c
@@ -0,0 +1,679 @@ 
+/*
+ * Block dirty bitmap postcopy migration
+ *
+ * Copyright IBM, Corp. 2009
+ * Copyright (C) 2016 Parallels IP Holdings GmbH. All rights reserved.
+ *
+ * Authors:
+ *  Liran Schour   <lirans@il.ibm.com>
+ *  Vladimir Sementsov-Ogievskiy <vsementsov@parallels.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ * This file is derived from migration/block.c, so it's author and IBM copyright
+ * are here, although content is quite different.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ *
+ *                                ***
+ *
+ * Here postcopy migration of dirty bitmaps is realized. Only named dirty
+ * bitmaps, associated with root nodes and non-root named nodes are migrated.
+ *
+ * If destination qemu is already containing a dirty bitmap with the same name
+ * as a migrated bitmap (for the same node), then, if their granularities are
+ * the same the migration will be done, otherwise the error will be generated.
+ *
+ * If destination qemu doesn't contain such bitmap it will be created.
+ *
+ * format of migration:
+ *
+ * # Header (shared for different chunk types)
+ * 1, 2 or 4 bytes: flags (see qemu_{put,put}_flags)
+ * [ 1 byte: node name size ] \  flags & DEVICE_NAME
+ * [ n bytes: node name     ] /
+ * [ 1 byte: bitmap name size ] \  flags & BITMAP_NAME
+ * [ n bytes: bitmap name     ] /
+ *
+ * # Start of bitmap migration (flags & START)
+ * header
+ * be64: granularity
+ * 1 byte: bitmap enabled flag
+ *
+ * # Complete of bitmap migration (flags & COMPLETE)
+ * header
+ *
+ * # Data chunk of bitmap migration
+ * header
+ * be64: start sector
+ * be32: number of sectors
+ * [ be64: buffer size  ] \ ! (flags & ZEROES)
+ * [ n bytes: buffer    ] /
+ *
+ * The last chunk in stream should contain flags & EOS. The chunk may skip
+ * device and/or bitmap names, assuming them to be the same with the previous
+ * chunk.
+ */
+
+#include "qemu/osdep.h"
+#include "block/block.h"
+#include "block/block_int.h"
+#include "sysemu/block-backend.h"
+#include "qemu/main-loop.h"
+#include "qemu/error-report.h"
+#include "migration/block.h"
+#include "migration/migration.h"
+#include "qemu/hbitmap.h"
+#include "sysemu/sysemu.h"
+#include "qemu/cutils.h"
+#include "qapi/error.h"
+#include "trace.h"
+#include <assert.h>
+
+#define CHUNK_SIZE     (1 << 10)
+
+/* Flags occupy from one to four bytes. In all but one the 7-th (EXTRA_FLAGS)
+ * bit should be set. */
+#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
+#define DIRTY_BITMAP_MIG_FLAG_ZEROES        0x02
+#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x04
+#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x08
+#define DIRTY_BITMAP_MIG_FLAG_START         0x10
+#define DIRTY_BITMAP_MIG_FLAG_COMPLETE      0x20
+#define DIRTY_BITMAP_MIG_FLAG_BITS          0x40
+
+#define DIRTY_BITMAP_MIG_EXTRA_FLAGS        0x80
+#define DIRTY_BITMAP_MIG_FLAGS_SIZE_16      0x8000
+#define DIRTY_BITMAP_MIG_FLAGS_SIZE_32      0x8080
+
+#define DEBUG_DIRTY_BITMAP_MIGRATION 0
+
+typedef struct DirtyBitmapMigBitmapState {
+    /* Written during setup phase. */
+    BlockDriverState *bs;
+    const char *node_name;
+    BdrvDirtyBitmap *bitmap;
+    uint64_t total_sectors;
+    uint64_t sectors_per_chunk;
+    QSIMPLEQ_ENTRY(DirtyBitmapMigBitmapState) entry;
+
+    /* For bulk phase. */
+    bool bulk_completed;
+    uint64_t cur_sector;
+} DirtyBitmapMigBitmapState;
+
+typedef struct DirtyBitmapMigState {
+    QSIMPLEQ_HEAD(dbms_list, DirtyBitmapMigBitmapState) dbms_list;
+
+    bool bulk_completed;
+
+    /* for send_bitmap_bits() */
+    BlockDriverState *prev_bs;
+    BdrvDirtyBitmap *prev_bitmap;
+} DirtyBitmapMigState;
+
+typedef struct DirtyBitmapLoadState {
+    uint32_t flags;
+    char node_name[256];
+    char bitmap_name[256];
+    BlockDriverState *bs;
+    BdrvDirtyBitmap *bitmap;
+} DirtyBitmapLoadState;
+
+static DirtyBitmapMigState dirty_bitmap_mig_state;
+
+typedef struct DirtyBitmapLoadBitmapState {
+    BlockDriverState *bs;
+    BdrvDirtyBitmap *bitmap;
+    bool migrated;
+} DirtyBitmapLoadBitmapState;
+static GSList *enabled_bitmaps;
+QemuMutex finish_lock;
+
+void init_dirty_bitmap_incoming_migration(void)
+{
+    qemu_mutex_init(&finish_lock);
+}
+
+static uint32_t qemu_get_bitmap_flags(QEMUFile *f)
+{
+    uint8_t flags = qemu_get_byte(f);
+    if (flags & DIRTY_BITMAP_MIG_EXTRA_FLAGS) {
+        flags = flags << 8 | qemu_get_byte(f);
+        if (flags & DIRTY_BITMAP_MIG_EXTRA_FLAGS) {
+            flags = flags << 16 | qemu_get_be16(f);
+        }
+    }
+
+    return flags;
+}
+
+static void qemu_put_bitmap_flags(QEMUFile *f, uint32_t flags)
+{
+    if (!(flags & 0xffffff00)) {
+        qemu_put_byte(f, flags);
+        return;
+    }
+
+    if (!(flags & 0xffff0000)) {
+        qemu_put_be16(f, flags | DIRTY_BITMAP_MIG_FLAGS_SIZE_16);
+        return;
+    }
+
+    qemu_put_be32(f, flags | DIRTY_BITMAP_MIG_FLAGS_SIZE_32);
+}
+
+static void send_bitmap_header(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
+                               uint32_t additional_flags)
+{
+    BlockDriverState *bs = dbms->bs;
+    BdrvDirtyBitmap *bitmap = dbms->bitmap;
+    uint32_t flags = additional_flags;
+    trace_send_bitmap_header_enter();
+
+    if (bs != dirty_bitmap_mig_state.prev_bs) {
+        dirty_bitmap_mig_state.prev_bs = bs;
+        flags |= DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME;
+    }
+
+    if (bitmap != dirty_bitmap_mig_state.prev_bitmap) {
+        dirty_bitmap_mig_state.prev_bitmap = bitmap;
+        flags |= DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME;
+    }
+
+    qemu_put_bitmap_flags(f, flags);
+
+    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
+        qemu_put_counted_string(f, dbms->node_name);
+    }
+
+    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
+        qemu_put_counted_string(f, bdrv_dirty_bitmap_name(bitmap));
+    }
+}
+
+static void send_bitmap_start(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
+{
+    send_bitmap_header(f, dbms, DIRTY_BITMAP_MIG_FLAG_START);
+    qemu_put_be32(f, bdrv_dirty_bitmap_granularity(dbms->bitmap));
+    qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
+}
+
+static void send_bitmap_complete(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
+{
+    send_bitmap_header(f, dbms, DIRTY_BITMAP_MIG_FLAG_COMPLETE);
+}
+
+static void send_bitmap_bits(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
+                             uint64_t start_sector, uint32_t nr_sectors)
+{
+    /* align for buffer_is_zero() */
+    uint64_t align = 4 * sizeof(long);
+    uint64_t unaligned_size =
+        bdrv_dirty_bitmap_serialization_size(dbms->bitmap,
+                                             start_sector, nr_sectors);
+    uint64_t buf_size = (unaligned_size + align - 1) & ~(align - 1);
+    uint8_t *buf = g_malloc0(buf_size);
+    uint32_t flags = DIRTY_BITMAP_MIG_FLAG_BITS;
+
+    bdrv_dirty_bitmap_serialize_part(dbms->bitmap, buf,
+                                     start_sector, nr_sectors);
+
+    if (buffer_is_zero(buf, buf_size)) {
+        g_free(buf);
+        buf = NULL;
+        flags |= DIRTY_BITMAP_MIG_FLAG_ZEROES;
+    }
+
+    trace_send_bitmap_bits(flags, start_sector, nr_sectors, buf_size);
+
+    send_bitmap_header(f, dbms, flags);
+
+    qemu_put_be64(f, start_sector);
+    qemu_put_be32(f, nr_sectors);
+
+    /* if a block is zero we need to flush here since the network
+     * bandwidth is now a lot higher than the storage device bandwidth.
+     * thus if we queue zero blocks we slow down the migration. */
+    if (flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
+        qemu_fflush(f);
+    } else {
+        qemu_put_be64(f, buf_size);
+        qemu_put_buffer(f, buf, buf_size);
+    }
+
+    g_free(buf);
+}
+
+
+/* Called with iothread lock taken.  */
+
+static void init_dirty_bitmap_migration(void)
+{
+    BlockDriverState *bs;
+    BdrvDirtyBitmap *bitmap;
+    DirtyBitmapMigBitmapState *dbms;
+    BdrvNextIterator it;
+    uint64_t total_bytes = 0;
+
+    dirty_bitmap_mig_state.bulk_completed = false;
+    dirty_bitmap_mig_state.prev_bs = NULL;
+    dirty_bitmap_mig_state.prev_bitmap = NULL;
+
+    for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
+        for (bitmap = bdrv_next_dirty_bitmap(bs, NULL); bitmap;
+             bitmap = bdrv_next_dirty_bitmap(bs, bitmap)) {
+            if (!bdrv_dirty_bitmap_name(bitmap)) {
+                continue;
+            }
+
+            if (!bdrv_get_device_or_node_name(bs)) {
+                /* not named non-root node */
+                continue;
+            }
+
+            dbms = g_new0(DirtyBitmapMigBitmapState, 1);
+            dbms->bs = bs;
+            dbms->node_name = bdrv_get_node_name(bs);
+            if (!dbms->node_name || dbms->node_name[0] == '\0') {
+                dbms->node_name = bdrv_get_device_name(bs);
+            }
+            dbms->bitmap = bitmap;
+            dbms->total_sectors = bdrv_nb_sectors(bs);
+            dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
+                bdrv_dirty_bitmap_granularity(bitmap) >> BDRV_SECTOR_BITS;
+
+            total_bytes +=
+                bdrv_dirty_bitmap_serialization_size(bitmap,
+                                                     0, dbms->total_sectors);
+
+            QSIMPLEQ_INSERT_TAIL(&dirty_bitmap_mig_state.dbms_list,
+                                 dbms, entry);
+        }
+    }
+}
+
+/* Called with no lock taken.  */
+static void bulk_phase_send_chunk(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
+{
+    uint32_t nr_sectors = MIN(dbms->total_sectors - dbms->cur_sector,
+                             dbms->sectors_per_chunk);
+
+    send_bitmap_bits(f, dbms, dbms->cur_sector, nr_sectors);
+
+    dbms->cur_sector += nr_sectors;
+    if (dbms->cur_sector >= dbms->total_sectors) {
+        dbms->bulk_completed = true;
+    }
+}
+
+/* Called with no lock taken.  */
+static void bulk_phase(QEMUFile *f, bool limit)
+{
+    DirtyBitmapMigBitmapState *dbms;
+
+    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
+        while (!dbms->bulk_completed) {
+            bulk_phase_send_chunk(f, dbms);
+            if (limit && qemu_file_rate_limit(f)) {
+                return;
+            }
+        }
+    }
+
+    dirty_bitmap_mig_state.bulk_completed = true;
+}
+
+/* Called with iothread lock taken.  */
+static void dirty_bitmap_mig_cleanup(void)
+{
+    DirtyBitmapMigBitmapState *dbms;
+
+    while ((dbms = QSIMPLEQ_FIRST(&dirty_bitmap_mig_state.dbms_list)) != NULL) {
+        QSIMPLEQ_REMOVE_HEAD(&dirty_bitmap_mig_state.dbms_list, entry);
+        g_free(dbms);
+    }
+}
+
+/* for SaveVMHandlers */
+static void dirty_bitmap_migration_cleanup(void *opaque)
+{
+    dirty_bitmap_mig_cleanup();
+}
+
+static int dirty_bitmap_save_iterate(QEMUFile *f, void *opaque)
+{
+    trace_dirty_bitmap_save_iterate(
+            migration_in_postcopy(migrate_get_current()));
+
+    if (migration_in_postcopy(migrate_get_current()) &&
+        !dirty_bitmap_mig_state.bulk_completed) {
+        bulk_phase(f, true);
+    }
+
+    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
+
+    return dirty_bitmap_mig_state.bulk_completed;
+}
+
+/* Called with iothread lock taken.  */
+
+static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
+{
+    DirtyBitmapMigBitmapState *dbms;
+    trace_dirty_bitmap_save_complete_enter();
+
+    if (!dirty_bitmap_mig_state.bulk_completed) {
+        bulk_phase(f, false);
+    }
+
+    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
+        send_bitmap_complete(f, dbms);
+    }
+
+    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
+
+    trace_dirty_bitmap_save_complete_finish();
+
+    dirty_bitmap_mig_cleanup();
+    return 0;
+}
+
+static void dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
+                                      uint64_t max_size,
+                                      uint64_t *res_precopy_only,
+                                      uint64_t *res_compatible,
+                                      uint64_t *res_postcopy_only)
+{
+    DirtyBitmapMigBitmapState *dbms;
+    uint64_t pending = 0;
+
+    qemu_mutex_lock_iothread();
+
+    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
+        uint64_t gran = bdrv_dirty_bitmap_granularity(dbms->bitmap);
+        uint64_t sectors = dbms->bulk_completed ? 0 :
+                           dbms->total_sectors - dbms->cur_sector;
+
+        pending += (sectors * BDRV_SECTOR_SIZE + gran - 1) / gran;
+    }
+
+    qemu_mutex_unlock_iothread();
+
+    trace_dirty_bitmap_save_pending(pending, max_size);
+
+    *res_postcopy_only += pending;
+}
+
+/* First occurrence of this bitmap. It should be created if doesn't exist */
+static int dirty_bitmap_load_start(QEMUFile *f, DirtyBitmapLoadState *s)
+{
+    Error *local_err = NULL;
+    uint32_t granularity = qemu_get_be32(f);
+    bool enabled = qemu_get_byte(f);
+
+    if (!s->bitmap) {
+        s->bitmap = bdrv_create_dirty_bitmap(s->bs, granularity,
+                                             s->bitmap_name, &local_err);
+        if (!s->bitmap) {
+            error_report_err(local_err);
+            return -EINVAL;
+        }
+    } else {
+        uint32_t dest_granularity =
+            bdrv_dirty_bitmap_granularity(s->bitmap);
+        if (dest_granularity != granularity) {
+            fprintf(stderr,
+                    "Error: "
+                    "Migrated bitmap granularity (%" PRIu32 ") "
+                    "doesn't match the destination bitmap '%s' "
+                    "granularity (%" PRIu32 ")\n",
+                    granularity,
+                    bdrv_dirty_bitmap_name(s->bitmap),
+                    dest_granularity);
+            return -EINVAL;
+        }
+    }
+
+    bdrv_disable_dirty_bitmap(s->bitmap);
+    if (enabled) {
+        DirtyBitmapLoadBitmapState *b;
+
+        bdrv_dirty_bitmap_create_successor(s->bs, s->bitmap, &local_err);
+        if (local_err) {
+            error_report_err(local_err);
+            return -EINVAL;
+        }
+
+        b = g_new(DirtyBitmapLoadBitmapState, 1);
+        b->bs = s->bs;
+        b->bitmap = s->bitmap;
+        b->migrated = false;
+        enabled_bitmaps = g_slist_prepend(enabled_bitmaps, b);
+    }
+
+    return 0;
+}
+
+void dirty_bitmap_mig_before_vm_start(void)
+{
+    GSList *item;
+
+    qemu_mutex_lock(&finish_lock);
+
+    for (item = enabled_bitmaps; item; item = g_slist_next(item)) {
+        DirtyBitmapLoadBitmapState *b = item->data;
+
+        if (b->migrated) {
+            bdrv_enable_dirty_bitmap(b->bitmap);
+        } else {
+            bdrv_dirty_bitmap_enable_successor(b->bitmap);
+        }
+
+        g_free(b);
+    }
+
+    g_slist_free(enabled_bitmaps);
+    enabled_bitmaps = NULL;
+
+    qemu_mutex_unlock(&finish_lock);
+}
+
+static void dirty_bitmap_load_complete(QEMUFile *f, DirtyBitmapLoadState *s)
+{
+    GSList *item;
+    trace_dirty_bitmap_load_complete();
+    bdrv_dirty_bitmap_deserialize_finish(s->bitmap);
+
+    qemu_mutex_lock(&finish_lock);
+
+    for (item = enabled_bitmaps; item; item = g_slist_next(item)) {
+        DirtyBitmapLoadBitmapState *b = item->data;
+
+        if (b->bitmap == s->bitmap) {
+            b->migrated = true;
+        }
+    }
+
+    if (bdrv_dirty_bitmap_frozen(s->bitmap)) {
+        if (enabled_bitmaps == NULL) {
+            /* in postcopy */
+            AioContext *aio_context = bdrv_get_aio_context(s->bs);
+            aio_context_acquire(aio_context);
+
+            bdrv_reclaim_dirty_bitmap(s->bs, s->bitmap, &error_abort);
+            bdrv_enable_dirty_bitmap(s->bitmap);
+
+            aio_context_release(aio_context);
+        } else {
+            /* target not started, successor is empty */
+            bdrv_dirty_bitmap_release_successor(s->bs, s->bitmap);
+        }
+    }
+
+    qemu_mutex_unlock(&finish_lock);
+}
+
+static int dirty_bitmap_load_bits(QEMUFile *f, DirtyBitmapLoadState *s)
+{
+    uint64_t first_sector = qemu_get_be64(f);
+    uint32_t nr_sectors = qemu_get_be32(f);
+    trace_dirty_bitmap_load_bits_enter(first_sector, nr_sectors);
+
+    if (s->flags & DIRTY_BITMAP_MIG_FLAG_ZEROES) {
+        trace_dirty_bitmap_load_bits_zeroes();
+        bdrv_dirty_bitmap_deserialize_zeroes(s->bitmap, first_sector,
+                                             nr_sectors, false);
+    } else {
+        uint8_t *buf;
+        uint64_t buf_size = qemu_get_be64(f);
+        uint64_t needed_size =
+            bdrv_dirty_bitmap_serialization_size(s->bitmap,
+                                                 first_sector, nr_sectors);
+
+        if (needed_size > buf_size) {
+            fprintf(stderr,
+                    "Error: Migrated bitmap granularity doesn't "
+                    "match the destination bitmap '%s' granularity\n",
+                    bdrv_dirty_bitmap_name(s->bitmap));
+            return -EINVAL;
+        }
+
+        buf = g_malloc(buf_size);
+        qemu_get_buffer(f, buf, buf_size);
+        bdrv_dirty_bitmap_deserialize_part(s->bitmap, buf,
+                                           first_sector,
+                                           nr_sectors, false);
+        g_free(buf);
+    }
+
+    return 0;
+}
+
+static int dirty_bitmap_load_header(QEMUFile *f, DirtyBitmapLoadState *s)
+{
+    Error *local_err = NULL;
+    s->flags = qemu_get_bitmap_flags(f);
+    trace_dirty_bitmap_load_header(s->flags);
+
+    if (s->flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
+        if (!qemu_get_counted_string(f, s->node_name)) {
+            fprintf(stderr, "Unable to read node name string\n");
+            return -EINVAL;
+        }
+        s->bs = bdrv_lookup_bs(s->node_name, s->node_name, &local_err);
+        if (!s->bs) {
+            error_report("%s", error_get_pretty(local_err));
+            error_free(local_err);
+            return -EINVAL;
+        }
+    } else if (!s->bs) {
+        fprintf(stderr, "Error: block device name is not set\n");
+        return -EINVAL;
+    }
+
+    if (s->flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
+        if (!qemu_get_counted_string(f, s->bitmap_name)) {
+            fprintf(stderr, "Unable to read node name string\n");
+            return -EINVAL;
+        }
+        s->bitmap = bdrv_find_dirty_bitmap(s->bs, s->bitmap_name);
+
+        /* bitmap may be NULL here, it wouldn't be an error if it is the
+         * first occurrence of the bitmap */
+        if (!s->bitmap && !(s->flags & DIRTY_BITMAP_MIG_FLAG_START)) {
+            fprintf(stderr, "Error: unknown dirty bitmap "
+                    "'%s' for block device '%s'\n",
+                    s->bitmap_name, s->node_name);
+            return -EINVAL;
+        }
+    } else if (!s->bitmap) {
+        fprintf(stderr, "Error: block device name is not set\n");
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
+static int dirty_bitmap_load(QEMUFile *f, void *opaque, int version_id)
+{
+    static DirtyBitmapLoadState s;
+
+    int ret = 0;
+
+    trace_dirty_bitmap_load_enter();
+
+    do {
+        dirty_bitmap_load_header(f, &s);
+
+        if (s.flags & DIRTY_BITMAP_MIG_FLAG_START) {
+            ret = dirty_bitmap_load_start(f, &s);
+        } else if (s.flags & DIRTY_BITMAP_MIG_FLAG_COMPLETE) {
+            dirty_bitmap_load_complete(f, &s);
+        } else if (s.flags & DIRTY_BITMAP_MIG_FLAG_BITS) {
+            ret = dirty_bitmap_load_bits(f, &s);
+        }
+
+        if (!ret) {
+            ret = qemu_file_get_error(f);
+        }
+
+        if (ret) {
+            return ret;
+        }
+    } while (!(s.flags & DIRTY_BITMAP_MIG_FLAG_EOS));
+
+    trace_dirty_bitmap_load_success();
+    return 0;
+}
+
+static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
+{
+    DirtyBitmapMigBitmapState *dbms = NULL;
+    init_dirty_bitmap_migration();
+
+    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
+        send_bitmap_start(f, dbms);
+    }
+    qemu_put_bitmap_flags(f, DIRTY_BITMAP_MIG_FLAG_EOS);
+
+    return 0;
+}
+
+static bool dirty_bitmap_is_active(void *opaque)
+{
+    return migrate_dirty_bitmaps();
+}
+
+static bool dirty_bitmap_is_active_iterate(void *opaque)
+{
+    return dirty_bitmap_is_active(opaque) && !runstate_is_running();
+}
+
+static bool dirty_bitmap_has_postcopy(void *opaque)
+{
+    return true;
+}
+
+static SaveVMHandlers savevm_dirty_bitmap_handlers = {
+    .save_live_setup = dirty_bitmap_save_setup,
+    .save_live_complete_postcopy = dirty_bitmap_save_complete,
+    .save_live_complete_precopy = dirty_bitmap_save_complete,
+    .has_postcopy = dirty_bitmap_has_postcopy,
+    .save_live_pending = dirty_bitmap_save_pending,
+    .save_live_iterate = dirty_bitmap_save_iterate,
+    .is_active_iterate = dirty_bitmap_is_active_iterate,
+    .load_state = dirty_bitmap_load,
+    .cleanup = dirty_bitmap_migration_cleanup,
+    .is_active = dirty_bitmap_is_active,
+};
+
+void dirty_bitmap_mig_init(void)
+{
+    QSIMPLEQ_INIT(&dirty_bitmap_mig_state.dbms_list);
+
+    register_savevm_live(NULL, "dirty-bitmap", 0, 1,
+                         &savevm_dirty_bitmap_handlers,
+                         &dirty_bitmap_mig_state);
+}
diff --git a/migration/migration.c b/migration/migration.c
index 179bd04..edf5623 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -122,6 +122,9 @@  MigrationIncomingState *migration_incoming_get_current(void)
         QLIST_INIT(&mis_current.loadvm_handlers);
         qemu_mutex_init(&mis_current.rp_mutex);
         qemu_event_init(&mis_current.main_thread_load_event, false);
+
+        init_dirty_bitmap_incoming_migration();
+
         once = true;
     }
     return &mis_current;
diff --git a/migration/savevm.c b/migration/savevm.c
index 54572ef..927a680 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1651,6 +1651,8 @@  static void loadvm_postcopy_handle_run_bh(void *opaque)
 
     trace_loadvm_postcopy_handle_run_vmstart();
 
+    dirty_bitmap_mig_before_vm_start();
+
     if (autostart) {
         /* Hold onto your hats, starting the CPU */
         vm_start();
diff --git a/migration/trace-events b/migration/trace-events
index 0212929..38fca41 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -222,3 +222,17 @@  colo_vm_state_change(const char *old, const char *new) "Change '%s' => '%s'"
 colo_send_message(const char *msg) "Send '%s' message"
 colo_receive_message(const char *msg) "Receive '%s' message"
 colo_failover_set_state(const char *new_state) "new state %s"
+
+# migration/block-dirty-bitmap.c
+send_bitmap_header_enter(void) ""
+send_bitmap_bits(uint32_t flags, uint64_t start_sector, uint32_t nr_sectors, uint64_t data_size) "\n   flags:        %x\n   start_sector: %" PRIu64 "\n   nr_sectors:   %" PRIu32 "\n   data_size:    %" PRIu64 "\n"
+dirty_bitmap_save_iterate(int in_postcopy) "in postcopy: %d"
+dirty_bitmap_save_complete_enter(void) ""
+dirty_bitmap_save_complete_finish(void) ""
+dirty_bitmap_save_pending(uint64_t pending, uint64_t max_size) "pending %" PRIu64 " max: %" PRIu64
+dirty_bitmap_load_complete(void) ""
+dirty_bitmap_load_bits_enter(uint64_t first_sector, uint32_t nr_sectors) "chunk: %" PRIu64 " %" PRIu32
+dirty_bitmap_load_bits_zeroes(void) ""
+dirty_bitmap_load_header(uint32_t flags) "flags %x"
+dirty_bitmap_load_enter(void) ""
+dirty_bitmap_load_success(void) ""
diff --git a/vl.c b/vl.c
index b4eaf03..f1ee9ff 100644
--- a/vl.c
+++ b/vl.c
@@ -4405,6 +4405,7 @@  int main(int argc, char **argv, char **envp)
 
     blk_mig_init();
     ram_mig_init();
+    dirty_bitmap_mig_init();
 
     /* If the currently selected machine wishes to override the units-per-bus
      * property of its default HBA interface type, do so now. */