diff mbox

[RFC,v2,8/8] migration: add migration/dirty-bitmap.c

Message ID 1422356197-5285-9-git-send-email-vsementsov@parallels.com
State New
Headers show

Commit Message

Vladimir Sementsov-Ogievskiy Jan. 27, 2015, 10:56 a.m. UTC
Live migration of dirty bitmaps. Only named dirty bitmaps are migrated.
If destination qemu is already containing a dirty bitmap with the same
name as a migrated bitmap, then their granularities should be the same,
otherwise the error will be generated. If destination qemu doesn't
contain such bitmap it will be created.

format:

1 byte: flags

[ 1 byte: node name size ] \  flags & DEVICE_NAME
[ n bytes: node name     ] /

[ 1 byte: bitmap name size ]       \
[ n bytes: bitmap name     ]       | flags & BITMAP_NAME
[ [ be64: granularity    ] ]  flags & GRANULARITY

[ 1 byte: bitmap enabled bit ] flags & ENABLED

[ be64: start sector      ] \ flags & (NORMAL_CHUNK | ZERO_CHUNK)
[ be32: number of sectors ] /

[ be64: buffer size ] \ flags & NORMAL_CHUNK
[ n bytes: buffer   ] /

The last chunk should contain flags & EOS. The chunk may skip device
and/or bitmap names, assuming them to be the same with the previous
chunk. GRANULARITY is sent with the first chunk for the bitmap. ENABLED
bit is sent in the end of "complete" stage of migration. So when
destination gets ENABLED flag it should deserialize_finish the bitmap
and set its enabled bit to corresponding value.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@parallels.com>
---
 include/migration/block.h |   1 +
 migration/Makefile.objs   |   2 +-
 migration/dirty-bitmap.c  | 606 ++++++++++++++++++++++++++++++++++++++++++++++
 vl.c                      |   1 +
 4 files changed, 609 insertions(+), 1 deletion(-)
 create mode 100644 migration/dirty-bitmap.c

Comments

John Snow Feb. 10, 2015, 9:33 p.m. UTC | #1
Peter Maydell: What's the right way to license a file as copied from a 
previous version? See below, please;

Max, Markus: ctrl+f "bdrv_get_device_name" and let me know what you 
think, if you would.

Juan, Amit, David: Copying migration maintainers.

On 01/27/2015 05:56 AM, Vladimir Sementsov-Ogievskiy wrote:
> Live migration of dirty bitmaps. Only named dirty bitmaps are migrated.
> If destination qemu is already containing a dirty bitmap with the same
> name as a migrated bitmap, then their granularities should be the same,
> otherwise the error will be generated. If destination qemu doesn't
> contain such bitmap it will be created.
>
> format:
>
> 1 byte: flags
>
> [ 1 byte: node name size ] \  flags & DEVICE_NAME
> [ n bytes: node name     ] /
>
> [ 1 byte: bitmap name size ]       \
> [ n bytes: bitmap name     ]       | flags & BITMAP_NAME
> [ [ be64: granularity    ] ]  flags & GRANULARITY
>
> [ 1 byte: bitmap enabled bit ] flags & ENABLED
>
> [ be64: start sector      ] \ flags & (NORMAL_CHUNK | ZERO_CHUNK)
> [ be32: number of sectors ] /
>
> [ be64: buffer size ] \ flags & NORMAL_CHUNK
> [ n bytes: buffer   ] /
>
> The last chunk should contain flags & EOS. The chunk may skip device
> and/or bitmap names, assuming them to be the same with the previous
> chunk. GRANULARITY is sent with the first chunk for the bitmap. ENABLED
> bit is sent in the end of "complete" stage of migration. So when
> destination gets ENABLED flag it should deserialize_finish the bitmap
> and set its enabled bit to corresponding value.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@parallels.com>
> ---
>   include/migration/block.h |   1 +
>   migration/Makefile.objs   |   2 +-
>   migration/dirty-bitmap.c  | 606 ++++++++++++++++++++++++++++++++++++++++++++++
>   vl.c                      |   1 +
>   4 files changed, 609 insertions(+), 1 deletion(-)
>   create mode 100644 migration/dirty-bitmap.c
>
> diff --git a/include/migration/block.h b/include/migration/block.h
> index ffa8ac0..566bb9f 100644
> --- a/include/migration/block.h
> +++ b/include/migration/block.h
> @@ -14,6 +14,7 @@
>   #ifndef BLOCK_MIGRATION_H
>   #define BLOCK_MIGRATION_H
>
> +void dirty_bitmap_mig_init(void);
>   void blk_mig_init(void);
>   int blk_mig_active(void);
>   uint64_t blk_mig_bytes_transferred(void);

OK.

> diff --git a/migration/Makefile.objs b/migration/Makefile.objs
> index d929e96..9adfda9 100644
> --- a/migration/Makefile.objs
> +++ b/migration/Makefile.objs
> @@ -6,5 +6,5 @@ common-obj-y += xbzrle.o
>   common-obj-$(CONFIG_RDMA) += rdma.o
>   common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o
>
> -common-obj-y += block.o
> +common-obj-y += block.o dirty-bitmap.o
>

OK.

> diff --git a/migration/dirty-bitmap.c b/migration/dirty-bitmap.c
> new file mode 100644
> index 0000000..8621218
> --- /dev/null
> +++ b/migration/dirty-bitmap.c
> @@ -0,0 +1,606 @@
> +/*
> + * QEMU dirty bitmap migration
> + *
> + * derived from migration/block.c
> + *
> + * Author:
> + * Sementsov-Ogievskiy Vladimir <vsementsov@parallels.com>
> + *
> + * original copyright message:
> + * =====================================================================
> + * Copyright IBM, Corp. 2009
> + *
> + * Authors:
> + *  Liran Schour   <lirans@il.ibm.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Contributions after 2012-01-13 are licensed under the terms of the
> + * GNU GPL, version 2 or (at your option) any later version.
> + * =====================================================================
> + */
> +

Not super familiar with the right way to do licensing here; it's 
possible you may not need to copy the original here, but I'm not sure. 
You will want to make it clear what license applies to /your/ work, I 
think. Maybe Peter Maydell can clue us in.

> +#include "block/block.h"
> +#include "qemu/main-loop.h"
> +#include "qemu/error-report.h"
> +#include "migration/block.h"
> +#include "migration/migration.h"
> +#include "qemu/hbitmap.h"
> +#include <assert.h>
> +
> +#define CHUNK_SIZE                       (1 << 20)
> +
> +#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
> +#define DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK  0x02
> +#define DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK    0x04
> +#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x08
> +#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x10
> +#define DIRTY_BITMAP_MIG_FLAG_GRANULARITY   0x20
> +#define DIRTY_BITMAP_MIG_FLAG_ENABLED       0x40
> +/* flags should be <= 0xff */
> +

We should give ourselves a little breathing room with the flags, since 
we've only got room for one more.

> +/* #define DEBUG_DIRTY_BITMAP_MIGRATION */
> +
> +#ifdef DEBUG_DIRTY_BITMAP_MIGRATION
> +#define DPRINTF(fmt, ...) \
> +    do { printf("dirty_migration: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> +    do { } while (0)
> +#endif
> +
> +typedef struct DirtyBitmapMigBitmapState {
> +    /* Written during setup phase. */
> +    BlockDriverState *bs;
> +    BdrvDirtyBitmap *bitmap;
> +    HBitmap *dirty_bitmap;

For my own sanity, I'd really prefer "bitmap" and "meta_bitmap" here; 
"dirty_bitmap" is often used as a synonym (outside of this file) to 
refer to the BdrvDirtyBitmap in general, so it's usage here can be 
somewhat confusing.

I'd appreciate "dirty_dirty_bitmap" as in your previous patch for 
consistency, or "meta_bitmap" as I recommend.

> +    int64_t total_sectors;
> +    uint64_t sectors_per_chunk;
> +    QSIMPLEQ_ENTRY(DirtyBitmapMigBitmapState) entry;
> +
> +    /* For bulk phase. */
> +    bool bulk_completed;
> +    int64_t cur_sector;
> +    bool granularity_sent;
> +
> +    /* For dirty phase. */
> +    int64_t cur_dirty;
> +} DirtyBitmapMigBitmapState;
> +
> +typedef struct DirtyBitmapMigState {
> +    int migration_enable;
> +    QSIMPLEQ_HEAD(dbms_list, DirtyBitmapMigBitmapState) dbms_list;
> +
> +    bool bulk_completed;
> +
> +    /* for send_bitmap() */
> +    BlockDriverState *prev_bs;
> +    BdrvDirtyBitmap *prev_bitmap;
> +} DirtyBitmapMigState;
> +
> +static DirtyBitmapMigState dirty_bitmap_mig_state;
> +
> +/* read name from qemu file:
> + * format:
> + * 1 byte : len = name length (<256)
> + * len bytes : name without last zero byte
> + *
> + * name should point to the buffer >= 256 bytes length
> + */
> +static char *qemu_get_name(QEMUFile *f, char *name)
> +{
> +    int len = qemu_get_byte(f);
> +    qemu_get_buffer(f, (uint8_t *)name, len);
> +    name[len] = '\0';
> +
> +    DPRINTF("get name: %d %s\n", len, name);
> +
> +    return name;
> +}
> +

OK. Maybe these could be "qemu_put_string" or "qemu_get_string" and 
added to qemu-file.c so others can use them.

> +/* write name to qemu file:
> + * format:
> + * same as for qemu_get_name
> + *
> + * maximum name length is 255
> + */
> +static void qemu_put_name(QEMUFile *f, const char *name)
> +{
> +    int len = strlen(name);
> +
> +    DPRINTF("put name: %d %s\n", len, name);
> +
> +    assert(len < 256);
> +    qemu_put_byte(f, len);
> +    qemu_put_buffer(f, (const uint8_t *)name, len);
> +}
> +

OK.

> +static void send_bitmap(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
> +                        uint64_t start_sector, uint32_t nr_sectors)
> +{
> +    BlockDriverState *bs = dbms->bs;
> +    BdrvDirtyBitmap *bitmap = dbms->bitmap;
> +    uint8_t flags = 0;
> +    /* align for buffer_is_zero() */
> +    uint64_t align = 4 * sizeof(long);
> +    uint64_t buf_size =
> +        (bdrv_dirty_bitmap_data_size(bitmap, nr_sectors) + align - 1) &
> +        ~(align - 1);
> +    uint8_t *buf = g_malloc0(buf_size);
> +
> +    bdrv_dirty_bitmap_serialize_part(bitmap, buf, start_sector, nr_sectors);
> +
> +    if (buffer_is_zero(buf, buf_size)) {
> +        g_free(buf);
> +        buf = NULL;
> +        flags |= DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK;
> +    } else {
> +        flags |= DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK;
> +    }
> +
> +    if (bs != dirty_bitmap_mig_state.prev_bs) {
> +        dirty_bitmap_mig_state.prev_bs = bs;
> +        flags |= DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME;
> +    }
> +
> +    if (bitmap != dirty_bitmap_mig_state.prev_bitmap) {
> +        dirty_bitmap_mig_state.prev_bitmap = bitmap;
> +        flags |= DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME;
> +    }

OK, so we use the current bs/bitmap under consideration to detect if we 
have switched context, and put the names in the stream when it happens. OK.

> +
> +    if (dbms->granularity_sent == 0) {
> +        dbms->granularity_sent = 1;
> +        flags |= DIRTY_BITMAP_MIG_FLAG_GRANULARITY;
> +    }
> +
> +    DPRINTF("Enter send_bitmap"
> +            "\n   flags:        %x"
> +            "\n   start_sector: %" PRIu64
> +            "\n   nr_sectors:   %" PRIu32
> +            "\n   data_size:    %" PRIu64 "\n",
> +            flags, start_sector, nr_sectors, buf_size);
> +
> +    qemu_put_byte(f, flags);
> +
> +    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
> +        qemu_put_name(f, bdrv_get_device_name(bs));
> +    }

I am still not fully clear on this myself, but I think we are phasing 
out bdrv_get_device_name. In the context of bitmaps, we are mostly 
likely using them one-per-tree, but they /can/ be attached one-per-node, 
so we shouldn't be trying to get the device name here, but rather, the 
node-name.

I /think/ we may want to be using bdrv_get_node_name, but I think it is 
not currently true that all nodes WILL be named ... I am CC'ing Markus 
Armbruster and Max Reitz for (A) A refresher course and (B) Opinions on 
what function call might make sense here, given that we wish to migrate 
bitmaps attached to arbitrary nodes.

> +
> +    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
> +        qemu_put_name(f, bdrv_dirty_bitmap_name(bitmap));
> +
> +        if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
> +            qemu_put_be64(f, bdrv_dirty_bitmap_granularity(bs, bitmap));
> +        }
> +    } else {
> +        assert(!(flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY));
> +    }
> +

I thought we were only migrating bitmaps with names?
I suppose the conditional can't hurt, but I am not clear on when we 
won't have a bitmap name here.

> +    qemu_put_be64(f, start_sector);
> +    qemu_put_be32(f, nr_sectors);
> +
> +    /* if a block is zero we need to flush here since the network
> +     * bandwidth is now a lot higher than the storage device bandwidth.
> +     * thus if we queue zero blocks we slow down the migration.
> +     * also, skip writing block when migrate only dirty bitmaps. */
> +    if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
> +        qemu_fflush(f);
> +        return;
> +    }
> +
> +    qemu_put_be64(f, buf_size);
> +    qemu_put_buffer(f, buf, buf_size);
> +    g_free(buf);
> +}
> +
> +
> +/* Called with iothread lock taken.  */
> +
> +static void set_dirty_tracking(void)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        dbms->dirty_bitmap =
> +            bdrv_create_dirty_dirty_bitmap(dbms->bitmap, CHUNK_SIZE);
> +    }
> +}
> +

OK: so we only have these dirty-dirty bitmaps when migration is 
starting, which makes sense.

> +static void unset_dirty_tracking(void)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        bdrv_release_dirty_dirty_bitmap(dbms->bitmap);
> +    }
> +}
> +

OK.

> +static void init_dirty_bitmap_migration(QEMUFile *f)
> +{
> +    BlockDriverState *bs;
> +    BdrvDirtyBitmap *bitmap;
> +    DirtyBitmapMigBitmapState *dbms;
> +
> +    dirty_bitmap_mig_state.bulk_completed = false;
> +    dirty_bitmap_mig_state.prev_bs = NULL;
> +    dirty_bitmap_mig_state.prev_bitmap = NULL;
> +
> +    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
> +        for (bitmap = bdrv_next_dirty_bitmap(bs, NULL); bitmap;
> +             bitmap = bdrv_next_dirty_bitmap(bs, bitmap)) {
> +            if (!bdrv_dirty_bitmap_name(bitmap)) {
> +                continue;
> +            }
> +
> +            dbms = g_new0(DirtyBitmapMigBitmapState, 1);
> +            dbms->bs = bs;
> +            dbms->bitmap = bitmap;
> +            dbms->total_sectors = bdrv_nb_sectors(bs);
> +            dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
> +                bdrv_dirty_bitmap_granularity(dbms->bs, dbms->bitmap)
> +                >> BDRV_SECTOR_BITS;
> +
> +            QSIMPLEQ_INSERT_TAIL(&dirty_bitmap_mig_state.dbms_list,
> +                                 dbms, entry);
> +        }
> +    }
> +}
> +

OK, but see the note below for dirty_bitmap_mig_init.

> +/* Called with no lock taken.  */
> +static void bulk_phase_send_chunk(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
> +{
> +    uint32_t nr_sectors = MIN(dbms->total_sectors - dbms->cur_sector,
> +                             dbms->sectors_per_chunk);

What about cases where nr_sectors will put us past the end of the 
bitmap? The bitmap serialization implementation might need a touchup 
with this in mind.

> +
> +    send_bitmap(f, dbms, dbms->cur_sector, nr_sectors);
> +
> +    dbms->cur_sector += nr_sectors;
> +    if (dbms->cur_sector >= dbms->total_sectors) {
> +        dbms->bulk_completed = true;
> +    }
> +}
> +
> +/* Called with no lock taken.  */
> +static void bulk_phase(QEMUFile *f, bool limit)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        while (!dbms->bulk_completed) {
> +            bulk_phase_send_chunk(f, dbms);
> +            if (limit && qemu_file_rate_limit(f)) {
> +                return;
> +            }
> +        }
> +    }
> +
> +    dirty_bitmap_mig_state.bulk_completed = true;
> +}

OK.

> +
> +static void blk_mig_reset_dirty_cursor(void)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        dbms->cur_dirty = 0;
> +    }
> +}
> +

OK.

> +/* Called with iothread lock taken.  */
> +static void dirty_phase_send_chunk(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
> +{
> +    uint32_t nr_sectors;
> +
> +    while (dbms->cur_dirty < dbms->total_sectors &&
> +           !hbitmap_get(dbms->dirty_bitmap, dbms->cur_dirty)) {
> +        dbms->cur_dirty += dbms->sectors_per_chunk;
> +    }

OK, so we fast forward the dirty cursor while the meta-bitmap is empty. 
Is it not worth using the HBitmapIterator here? You can reset them 
everywhere you reset the dirty cursor, and then just fast-seek to the 
first dirty sector.

> +
> +    if (dbms->cur_dirty >= dbms->total_sectors) {
> +        return;
> +    }
> +
> +    nr_sectors = MIN(dbms->total_sectors - dbms->cur_dirty,
> +                     dbms->sectors_per_chunk);

What happens if nr_sectors goes past the end?

> +    send_bitmap(f, dbms, dbms->cur_dirty, nr_sectors);
> +    hbitmap_reset(dbms->dirty_bitmap, dbms->cur_dirty, dbms->sectors_per_chunk);
> +    dbms->cur_dirty += nr_sectors;
> +}
> +
> +/* Called with iothread lock taken.
> + *
> + * return value:
> + * 0: too much data for max_downtime
> + * 1: few enough data for max_downtime
> +*/

dirty_phase below doesn't have a return value.

> +static void dirty_phase(QEMUFile *f, bool limit)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        while (dbms->cur_dirty < dbms->total_sectors) {
> +            dirty_phase_send_chunk(f, dbms);
> +            if (limit && qemu_file_rate_limit(f)) {
> +                return;
> +            }
> +        }
> +    }
> +}
> +

OK.

> +
> +/* Called with iothread lock taken.  */
> +static void dirty_bitmap_mig_cleanup(void)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +
> +    unset_dirty_tracking();
> +
> +    while ((dbms = QSIMPLEQ_FIRST(&dirty_bitmap_mig_state.dbms_list)) != NULL) {
> +        QSIMPLEQ_REMOVE_HEAD(&dirty_bitmap_mig_state.dbms_list, entry);
> +        g_free(dbms);
> +    }
> +}
> +

OK.

> +static void dirty_bitmap_migration_cancel(void *opaque)
> +{
> +    dirty_bitmap_mig_cleanup();
> +}
> +

OK.

> +static int dirty_bitmap_save_iterate(QEMUFile *f, void *opaque)
> +{
> +    DPRINTF("Enter save live iterate\n");
> +
> +    blk_mig_reset_dirty_cursor();

I suppose this is because it's easier to check if we are finished by 
starting from sector 0 every time.

A harder, but faster method may be: Use HBitmapIterators, but don't 
reset them every iteration: just iterate until the end, and check that 
the bitmap is empty. If the meta bitmap is empty, the dirty phase is 
complete. If the meta bitmap is NOT empty, reset the HBI and continue 
allowing iterations over the dirty phase.

> +
> +    if (dirty_bitmap_mig_state.bulk_completed) {
> +        qemu_mutex_lock_iothread();
> +        dirty_phase(f, true);
> +        qemu_mutex_unlock_iothread();
> +    } else {
> +        bulk_phase(f, true);
> +    }
> +
> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> +
> +    return dirty_bitmap_mig_state.bulk_completed;
> +}
> +
> +/* Called with iothread lock taken.  */
> +
> +static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +    DPRINTF("Enter save live complete\n");
> +
> +    if (!dirty_bitmap_mig_state.bulk_completed) {
> +        bulk_phase(f, false);
> +    }

[Not expertly familiar with savevm:] Under what conditions can this happen?

> +
> +    blk_mig_reset_dirty_cursor();
> +    dirty_phase(f, false);
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        uint8_t flags = DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME |
> +                        DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME |
> +                        DIRTY_BITMAP_MIG_FLAG_ENABLED;
> +
> +        qemu_put_byte(f, flags);
> +        qemu_put_name(f, bdrv_get_device_name(dbms->bs));
> +        qemu_put_name(f, bdrv_dirty_bitmap_name(dbms->bitmap));
> +        qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
> +    }
> +
> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> +
> +    DPRINTF("Dirty bitmaps migration completed\n");
> +
> +    dirty_bitmap_mig_cleanup();
> +    return 0;
> +}
> +

I suppose we don't need a flag that distinctly SAYS this is the end 
section, since we can tell by omission of 
DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK or ZERO_CHUNK.

> +static uint64_t dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
> +                                          uint64_t max_size)
> +{
> +    DirtyBitmapMigBitmapState *dbms;
> +    uint64_t pending = 0;
> +
> +    qemu_mutex_lock_iothread();
> +
> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
> +        uint64_t sectors = hbitmap_count(dbms->dirty_bitmap);
> +        if (!dbms->bulk_completed) {
> +            sectors += dbms->total_sectors - dbms->cur_sector;
> +        }
> +        pending += bdrv_dirty_bitmap_data_size(dbms->bitmap, sectors);
> +    }
> +
> +    qemu_mutex_unlock_iothread();
> +
> +    DPRINTF("Enter save live pending %" PRIu64 ", max: %" PRIu64 "\n",
> +            pending, max_size);
> +    return pending;
> +}
> +

OK.

> +static int dirty_bitmap_load(QEMUFile *f, void *opaque, int version_id)
> +{
> +    int flags;
> +
> +    static char device_name[256], bitmap_name[256];
> +    static BlockDriverState *bs;
> +    static BdrvDirtyBitmap *bitmap;
> +
> +    uint8_t *buf;
> +    uint64_t first_sector;
> +    uint32_t  nr_sectors;
> +    int ret;
> +
> +    DPRINTF("load start\n");
> +
> +    do {
> +        flags = qemu_get_byte(f);
> +        DPRINTF("flags: %x\n", flags);
> +
> +        if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
> +            qemu_get_name(f, device_name);
> +            bs = bdrv_find(device_name);

Similar to the above confusion, you may want bdrv_lookup_bs or similar, 
since we're going to be looking for BDS nodes instead of "devices."

> +            if (!bs) {
> +                fprintf(stderr, "Error: unknown block device '%s'\n",
> +                        device_name);
> +                return -EINVAL;
> +            }
> +        }
> +
> +        if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
> +            if (!bs) {
> +                fprintf(stderr, "Error: block device name is not set\n");
> +                return -EINVAL;
> +            }
> +
> +            qemu_get_name(f, bitmap_name);
> +            bitmap = bdrv_find_dirty_bitmap(bs, bitmap_name);
> +            if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
> +                /* First chunk from this bitmap */
> +                uint64_t granularity = qemu_get_be64(f);
> +                if (!bitmap) {
> +                    Error *local_err = NULL;
> +                    bitmap = bdrv_create_dirty_bitmap(bs, granularity,
> +                                                      bitmap_name,
> +                                                      &local_err);
> +                    if (!bitmap) {
> +                        error_report("%s", error_get_pretty(local_err));
> +                        error_free(local_err);
> +                        return -EINVAL;
> +                    }
> +                } else {
> +                    uint64_t dest_granularity =
> +                        bdrv_dirty_bitmap_granularity(bs, bitmap);
> +                    if (dest_granularity != granularity) {
> +                        fprintf(stderr,
> +                                "Error: "
> +                                "Migrated bitmap granularity (%" PRIu64 ") "
> +                                "is not match with destination bitmap '%s' "
> +                                "granularity (%" PRIu64 ")\n",
> +                                granularity,
> +                                bitmap_name,
> +                                dest_granularity);
> +                        return -EINVAL;
> +                    }
> +                }
> +                bdrv_disable_dirty_bitmap(bitmap);
> +            }
> +            if (!bitmap) {
> +                fprintf(stderr, "Error: unknown dirty bitmap "
> +                        "'%s' for block device '%s'\n",
> +                        bitmap_name, device_name);
> +                return -EINVAL;
> +            }
> +        }
> +
> +        if (flags & DIRTY_BITMAP_MIG_FLAG_ENABLED) {
> +            bool enabled;
> +            if (!bitmap) {
> +                fprintf(stderr, "Error: dirty bitmap name is not set\n");
> +                return -EINVAL;
> +            }
> +            bdrv_dirty_bitmap_deserialize_finish(bitmap);
> +            /* complete migration */
> +            enabled = qemu_get_byte(f);
> +            if (enabled) {
> +                bdrv_enable_dirty_bitmap(bitmap);
> +            }
> +        }

Oh, so you use the ENABLED flag to show that migration is over. If we 
are going to commit to a stream format for bitmaps, though, maybe it's 
best to actually create a "COMPLETION BLOCK" flag and then split this 
function into two pieces:

(1) The part that receives regular / zero blocks, and
(2) The part that receives completion data.

That way, if we change the properties that bitmaps have down the line, 
we aren't reliant on literally the "enabled" flag to decide what to do.

Also, it might help make this fairly long function a little smaller and 
more readable.

> +
> +        if (flags & (DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK |
> +                     DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK)) {
> +            if (!bs) {
> +                fprintf(stderr, "Error: block device name is not set\n");
> +                return -EINVAL;
> +            }
> +            if (!bitmap) {
> +                fprintf(stderr, "Error: dirty bitmap name is not set\n");
> +                return -EINVAL;
> +            }
> +
> +            first_sector = qemu_get_be64(f);
> +            nr_sectors = qemu_get_be32(f);
> +            DPRINTF("chunk: %lu %u\n", first_sector, nr_sectors);
> +
> +
> +            if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
> +                bdrv_dirty_bitmap_deserialize_part0(bitmap, first_sector,
> +                                                    nr_sectors);
> +            } else {
> +                uint64_t buf_size = qemu_get_be64(f);
> +                uint64_t needed_size =
> +                    bdrv_dirty_bitmap_data_size(bitmap, nr_sectors);
> +
> +                if (needed_size > buf_size) {
> +                    fprintf(stderr,
> +                            "Error: Migrated bitmap granularity is not "
> +                            "match with destination bitmap granularity\n");
> +                    return -EINVAL;
> +                }
> +

"Migrated bitmap granularity doesn't match the destination bitmap 
granularity" perhaps.

> +                buf = g_malloc(buf_size);
> +                qemu_get_buffer(f, buf, buf_size);
> +                bdrv_dirty_bitmap_deserialize_part(bitmap, buf,
> +                                                   first_sector,
> +                                                   nr_sectors);
> +                g_free(buf);
> +            }
> +        }
> +
> +        ret = qemu_file_get_error(f);
> +        if (ret != 0) {
> +            return ret;
> +        }
> +    } while (!(flags & DIRTY_BITMAP_MIG_FLAG_EOS));
> +
> +    DPRINTF("load finish\n");
> +    return 0;
> +}
> +
> +static void dirty_bitmap_set_params(const MigrationParams *params, void *opaque)
> +{
> +    dirty_bitmap_mig_state.migration_enable = params->dirty;
> +}
> +

OK; though I am not immediately aware of what changes need to happen to 
accommodate Eric's suggestions.

> +static bool dirty_bitmap_is_active(void *opaque)
> +{
> +    return dirty_bitmap_mig_state.migration_enable == 1;
> +}
> +

OK.

> +static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
> +{
> +    init_dirty_bitmap_migration(f);
> +
> +    qemu_mutex_lock_iothread();
> +    /* start track dirtyness of dirty bitmaps */
> +    set_dirty_tracking();
> +    qemu_mutex_unlock_iothread();
> +
> +    blk_mig_reset_dirty_cursor();
> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
> +
> +    return 0;
> +}
> +

OK; see dirty_bitmap_mig_init below, though.

> +static SaveVMHandlers savevm_block_handlers = {
> +    .set_params = dirty_bitmap_set_params,
> +    .save_live_setup = dirty_bitmap_save_setup,
> +    .save_live_iterate = dirty_bitmap_save_iterate,
> +    .save_live_complete = dirty_bitmap_save_complete,
> +    .save_live_pending = dirty_bitmap_save_pending,
> +    .load_state = dirty_bitmap_load,
> +    .cancel = dirty_bitmap_migration_cancel,
> +    .is_active = dirty_bitmap_is_active,
> +};
> +
> +void dirty_bitmap_mig_init(void)
> +{
> +    QSIMPLEQ_INIT(&dirty_bitmap_mig_state.dbms_list);

Maybe I haven't looked thoroughly enough yet, but it's weird that part 
of the dirty_bitmap_mig_state is initialized here, and the rest of it in 
init_dirty_bitmap_migration. I'd prefer to keep it all together, if 
possible.

> +
> +    register_savevm_live(NULL, "dirty-bitmap", 0, 1, &savevm_block_handlers,
> +                         &dirty_bitmap_mig_state);
> +}

OK.

> diff --git a/vl.c b/vl.c
> index a824a7d..dee7220 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -4184,6 +4184,7 @@ int main(int argc, char **argv, char **envp)
>
>       blk_mig_init();
>       ram_mig_init();
> +    dirty_bitmap_mig_init();
>
>       /* If the currently selected machine wishes to override the units-per-bus
>        * property of its default HBA interface type, do so now. */
>

Hm, since dirty bitmaps are a sub-component of the block layer, would it 
not make sense to put this hook under blk_mig_init, perhaps?


Overall this looks very clean compared to the intermingled format in V1, 
and the code is organized pretty well. Just a few minor comments, and 
I'd like to get the opinion of the migration maintainers, but I am 
happy. Sorry it took me so long to review, please feel free to let me 
know if you disagree with any of my opinions :)

Thank you,
--John
Vladimir Sementsov-Ogievskiy Feb. 13, 2015, 8:19 a.m. UTC | #2
On 11.02.2015 00:33, John Snow wrote:
> Peter Maydell: What's the right way to license a file as copied from a 
> previous version? See below, please;
>
> Max, Markus: ctrl+f "bdrv_get_device_name" and let me know what you 
> think, if you would.
>
> Juan, Amit, David: Copying migration maintainers.
>
> On 01/27/2015 05:56 AM, Vladimir Sementsov-Ogievskiy wrote:
>> Live migration of dirty bitmaps. Only named dirty bitmaps are migrated.
>> If destination qemu is already containing a dirty bitmap with the same
>> name as a migrated bitmap, then their granularities should be the same,
>> otherwise the error will be generated. If destination qemu doesn't
>> contain such bitmap it will be created.
>>
>> format:
>>
>> 1 byte: flags
>>
>> [ 1 byte: node name size ] \  flags & DEVICE_NAME
>> [ n bytes: node name     ] /
>>
>> [ 1 byte: bitmap name size ]       \
>> [ n bytes: bitmap name     ]       | flags & BITMAP_NAME
>> [ [ be64: granularity    ] ]  flags & GRANULARITY
>>
>> [ 1 byte: bitmap enabled bit ] flags & ENABLED
>>
>> [ be64: start sector      ] \ flags & (NORMAL_CHUNK | ZERO_CHUNK)
>> [ be32: number of sectors ] /
>>
>> [ be64: buffer size ] \ flags & NORMAL_CHUNK
>> [ n bytes: buffer   ] /
>>
>> The last chunk should contain flags & EOS. The chunk may skip device
>> and/or bitmap names, assuming them to be the same with the previous
>> chunk. GRANULARITY is sent with the first chunk for the bitmap. ENABLED
>> bit is sent in the end of "complete" stage of migration. So when
>> destination gets ENABLED flag it should deserialize_finish the bitmap
>> and set its enabled bit to corresponding value.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@parallels.com>
>> ---
>>   include/migration/block.h |   1 +
>>   migration/Makefile.objs   |   2 +-
>>   migration/dirty-bitmap.c  | 606 
>> ++++++++++++++++++++++++++++++++++++++++++++++
>>   vl.c                      |   1 +
>>   4 files changed, 609 insertions(+), 1 deletion(-)
>>   create mode 100644 migration/dirty-bitmap.c
>>
>> diff --git a/include/migration/block.h b/include/migration/block.h
>> index ffa8ac0..566bb9f 100644
>> --- a/include/migration/block.h
>> +++ b/include/migration/block.h
>> @@ -14,6 +14,7 @@
>>   #ifndef BLOCK_MIGRATION_H
>>   #define BLOCK_MIGRATION_H
>>
>> +void dirty_bitmap_mig_init(void);
>>   void blk_mig_init(void);
>>   int blk_mig_active(void);
>>   uint64_t blk_mig_bytes_transferred(void);
>
> OK.
>
>> diff --git a/migration/Makefile.objs b/migration/Makefile.objs
>> index d929e96..9adfda9 100644
>> --- a/migration/Makefile.objs
>> +++ b/migration/Makefile.objs
>> @@ -6,5 +6,5 @@ common-obj-y += xbzrle.o
>>   common-obj-$(CONFIG_RDMA) += rdma.o
>>   common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o
>>
>> -common-obj-y += block.o
>> +common-obj-y += block.o dirty-bitmap.o
>>
>
> OK.
>
>> diff --git a/migration/dirty-bitmap.c b/migration/dirty-bitmap.c
>> new file mode 100644
>> index 0000000..8621218
>> --- /dev/null
>> +++ b/migration/dirty-bitmap.c
>> @@ -0,0 +1,606 @@
>> +/*
>> + * QEMU dirty bitmap migration
>> + *
>> + * derived from migration/block.c
>> + *
>> + * Author:
>> + * Sementsov-Ogievskiy Vladimir <vsementsov@parallels.com>
>> + *
>> + * original copyright message:
>> + * 
>> =====================================================================
>> + * Copyright IBM, Corp. 2009
>> + *
>> + * Authors:
>> + *  Liran Schour <lirans@il.ibm.com>
>> + *
>> + * This work is licensed under the terms of the GNU GPL, version 2.  
>> See
>> + * the COPYING file in the top-level directory.
>> + *
>> + * Contributions after 2012-01-13 are licensed under the terms of the
>> + * GNU GPL, version 2 or (at your option) any later version.
>> + * 
>> =====================================================================
>> + */
>> +
>
> Not super familiar with the right way to do licensing here; it's 
> possible you may not need to copy the original here, but I'm not sure. 
> You will want to make it clear what license applies to /your/ work, I 
> think. Maybe Peter Maydell can clue us in.
>
>> +#include "block/block.h"
>> +#include "qemu/main-loop.h"
>> +#include "qemu/error-report.h"
>> +#include "migration/block.h"
>> +#include "migration/migration.h"
>> +#include "qemu/hbitmap.h"
>> +#include <assert.h>
>> +
>> +#define CHUNK_SIZE                       (1 << 20)
>> +
>> +#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
>> +#define DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK  0x02
>> +#define DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK    0x04
>> +#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x08
>> +#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x10
>> +#define DIRTY_BITMAP_MIG_FLAG_GRANULARITY   0x20
>> +#define DIRTY_BITMAP_MIG_FLAG_ENABLED       0x40
>> +/* flags should be <= 0xff */
>> +
>
> We should give ourselves a little breathing room with the flags, since 
> we've only got room for one more.
Ok. Will one more byte be enough?
>
>> +/* #define DEBUG_DIRTY_BITMAP_MIGRATION */
>> +
>> +#ifdef DEBUG_DIRTY_BITMAP_MIGRATION
>> +#define DPRINTF(fmt, ...) \
>> +    do { printf("dirty_migration: " fmt, ## __VA_ARGS__); } while (0)
>> +#else
>> +#define DPRINTF(fmt, ...) \
>> +    do { } while (0)
>> +#endif
>> +
>> +typedef struct DirtyBitmapMigBitmapState {
>> +    /* Written during setup phase. */
>> +    BlockDriverState *bs;
>> +    BdrvDirtyBitmap *bitmap;
>> +    HBitmap *dirty_bitmap;
>
> For my own sanity, I'd really prefer "bitmap" and "meta_bitmap" here; 
> "dirty_bitmap" is often used as a synonym (outside of this file) to 
> refer to the BdrvDirtyBitmap in general, so it's usage here can be 
> somewhat confusing.
>
> I'd appreciate "dirty_dirty_bitmap" as in your previous patch for 
> consistency, or "meta_bitmap" as I recommend.
>
Ok
>> +    int64_t total_sectors;
>> +    uint64_t sectors_per_chunk;
>> +    QSIMPLEQ_ENTRY(DirtyBitmapMigBitmapState) entry;
>> +
>> +    /* For bulk phase. */
>> +    bool bulk_completed;
>> +    int64_t cur_sector;
>> +    bool granularity_sent;
>> +
>> +    /* For dirty phase. */
>> +    int64_t cur_dirty;
>> +} DirtyBitmapMigBitmapState;
>> +
>> +typedef struct DirtyBitmapMigState {
>> +    int migration_enable;
>> +    QSIMPLEQ_HEAD(dbms_list, DirtyBitmapMigBitmapState) dbms_list;
>> +
>> +    bool bulk_completed;
>> +
>> +    /* for send_bitmap() */
>> +    BlockDriverState *prev_bs;
>> +    BdrvDirtyBitmap *prev_bitmap;
>> +} DirtyBitmapMigState;
>> +
>> +static DirtyBitmapMigState dirty_bitmap_mig_state;
>> +
>> +/* read name from qemu file:
>> + * format:
>> + * 1 byte : len = name length (<256)
>> + * len bytes : name without last zero byte
>> + *
>> + * name should point to the buffer >= 256 bytes length
>> + */
>> +static char *qemu_get_name(QEMUFile *f, char *name)
>> +{
>> +    int len = qemu_get_byte(f);
>> +    qemu_get_buffer(f, (uint8_t *)name, len);
>> +    name[len] = '\0';
>> +
>> +    DPRINTF("get name: %d %s\n", len, name);
>> +
>> +    return name;
>> +}
>> +
>
> OK. Maybe these could be "qemu_put_string" or "qemu_get_string" and 
> added to qemu-file.c so others can use them.
If no objections for sharing this format, I'll do it.
>
>> +/* write name to qemu file:
>> + * format:
>> + * same as for qemu_get_name
>> + *
>> + * maximum name length is 255
>> + */
>> +static void qemu_put_name(QEMUFile *f, const char *name)
>> +{
>> +    int len = strlen(name);
>> +
>> +    DPRINTF("put name: %d %s\n", len, name);
>> +
>> +    assert(len < 256);
>> +    qemu_put_byte(f, len);
>> +    qemu_put_buffer(f, (const uint8_t *)name, len);
>> +}
>> +
>
> OK.
>
>> +static void send_bitmap(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
>> +                        uint64_t start_sector, uint32_t nr_sectors)
>> +{
>> +    BlockDriverState *bs = dbms->bs;
>> +    BdrvDirtyBitmap *bitmap = dbms->bitmap;
>> +    uint8_t flags = 0;
>> +    /* align for buffer_is_zero() */
>> +    uint64_t align = 4 * sizeof(long);
>> +    uint64_t buf_size =
>> +        (bdrv_dirty_bitmap_data_size(bitmap, nr_sectors) + align - 1) &
>> +        ~(align - 1);
>> +    uint8_t *buf = g_malloc0(buf_size);
>> +
>> +    bdrv_dirty_bitmap_serialize_part(bitmap, buf, start_sector, 
>> nr_sectors);
>> +
>> +    if (buffer_is_zero(buf, buf_size)) {
>> +        g_free(buf);
>> +        buf = NULL;
>> +        flags |= DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK;
>> +    } else {
>> +        flags |= DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK;
>> +    }
>> +
>> +    if (bs != dirty_bitmap_mig_state.prev_bs) {
>> +        dirty_bitmap_mig_state.prev_bs = bs;
>> +        flags |= DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME;
>> +    }
>> +
>> +    if (bitmap != dirty_bitmap_mig_state.prev_bitmap) {
>> +        dirty_bitmap_mig_state.prev_bitmap = bitmap;
>> +        flags |= DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME;
>> +    }
>
> OK, so we use the current bs/bitmap under consideration to detect if 
> we have switched context, and put the names in the stream when it 
> happens. OK.
>
>> +
>> +    if (dbms->granularity_sent == 0) {
>> +        dbms->granularity_sent = 1;
>> +        flags |= DIRTY_BITMAP_MIG_FLAG_GRANULARITY;
>> +    }
>> +
>> +    DPRINTF("Enter send_bitmap"
>> +            "\n   flags:        %x"
>> +            "\n   start_sector: %" PRIu64
>> +            "\n   nr_sectors:   %" PRIu32
>> +            "\n   data_size:    %" PRIu64 "\n",
>> +            flags, start_sector, nr_sectors, buf_size);
>> +
>> +    qemu_put_byte(f, flags);
>> +
>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
>> +        qemu_put_name(f, bdrv_get_device_name(bs));
>> +    }
>
> I am still not fully clear on this myself, but I think we are phasing 
> out bdrv_get_device_name. In the context of bitmaps, we are mostly 
> likely using them one-per-tree, but they /can/ be attached 
> one-per-node, so we shouldn't be trying to get the device name here, 
> but rather, the node-name.
>
> I /think/ we may want to be using bdrv_get_node_name, but I think it 
> is not currently true that all nodes WILL be named ... I am CC'ing 
> Markus Armbruster and Max Reitz for (A) A refresher course and (B) 
> Opinions on what function call might make sense here, given that we 
> wish to migrate bitmaps attached to arbitrary nodes.
Hmm.. I'm not familiar with hierarchy of nodes and devices. As I 
understand, both command_line- and qmp- created drives are created 
through blockdev_init, which creates both blk(device) and bs(node) 
through blk_new_with_bs.. Am I right? Also, bdrv_get_device_name is used 
in migration/block.c.
>
>> +
>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
>> +        qemu_put_name(f, bdrv_dirty_bitmap_name(bitmap));
>> +
>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
>> +            qemu_put_be64(f, bdrv_dirty_bitmap_granularity(bs, 
>> bitmap));
>> +        }
>> +    } else {
>> +        assert(!(flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY));
>> +    }
>> +
>
> I thought we were only migrating bitmaps with names?
> I suppose the conditional can't hurt, but I am not clear on when we 
> won't have a bitmap name here.
You are right, 'else' case is not possible.. Hmm. I've added it to be 
sure that format is not corrupted, when I decided to put granularity 
only with name. Wi won't have a bitmap name only when we send the same 
bitmap as on the previous send_bitmap() call. May be it will be better 
to use two separate if's without else and assert.
>
>> +    qemu_put_be64(f, start_sector);
>> +    qemu_put_be32(f, nr_sectors);
>> +
>> +    /* if a block is zero we need to flush here since the network
>> +     * bandwidth is now a lot higher than the storage device bandwidth.
>> +     * thus if we queue zero blocks we slow down the migration.
>> +     * also, skip writing block when migrate only dirty bitmaps. */
>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
>> +        qemu_fflush(f);
>> +        return;
>> +    }
>> +
>> +    qemu_put_be64(f, buf_size);
>> +    qemu_put_buffer(f, buf, buf_size);
>> +    g_free(buf);
>> +}
>> +
>> +
>> +/* Called with iothread lock taken.  */
>> +
>> +static void set_dirty_tracking(void)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +
>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>> +        dbms->dirty_bitmap =
>> +            bdrv_create_dirty_dirty_bitmap(dbms->bitmap, CHUNK_SIZE);
>> +    }
>> +}
>> +
>
> OK: so we only have these dirty-dirty bitmaps when migration is 
> starting, which makes sense.
>
>> +static void unset_dirty_tracking(void)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +
>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>> +        bdrv_release_dirty_dirty_bitmap(dbms->bitmap);
>> +    }
>> +}
>> +
>
> OK.
>
>> +static void init_dirty_bitmap_migration(QEMUFile *f)
>> +{
>> +    BlockDriverState *bs;
>> +    BdrvDirtyBitmap *bitmap;
>> +    DirtyBitmapMigBitmapState *dbms;
>> +
>> +    dirty_bitmap_mig_state.bulk_completed = false;
>> +    dirty_bitmap_mig_state.prev_bs = NULL;
>> +    dirty_bitmap_mig_state.prev_bitmap = NULL;
>> +
>> +    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
>> +        for (bitmap = bdrv_next_dirty_bitmap(bs, NULL); bitmap;
>> +             bitmap = bdrv_next_dirty_bitmap(bs, bitmap)) {
>> +            if (!bdrv_dirty_bitmap_name(bitmap)) {
>> +                continue;
>> +            }
>> +
>> +            dbms = g_new0(DirtyBitmapMigBitmapState, 1);
>> +            dbms->bs = bs;
>> +            dbms->bitmap = bitmap;
>> +            dbms->total_sectors = bdrv_nb_sectors(bs);
>> +            dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
>> +                bdrv_dirty_bitmap_granularity(dbms->bs, dbms->bitmap)
>> +                >> BDRV_SECTOR_BITS;
>> +
>> + QSIMPLEQ_INSERT_TAIL(&dirty_bitmap_mig_state.dbms_list,
>> +                                 dbms, entry);
>> +        }
>> +    }
>> +}
>> +
>
> OK, but see the note below for dirty_bitmap_mig_init.
actually it is not 'init' but 'reinit' - called on every migration 
start.. Hmm. dbms_list should be cleared here before fill it again.
>
>> +/* Called with no lock taken.  */
>> +static void bulk_phase_send_chunk(QEMUFile *f, 
>> DirtyBitmapMigBitmapState *dbms)
>> +{
>> +    uint32_t nr_sectors = MIN(dbms->total_sectors - dbms->cur_sector,
>> +                             dbms->sectors_per_chunk);
>
> What about cases where nr_sectors will put us past the end of the 
> bitmap? The bitmap serialization implementation might need a touchup 
> with this in mind.
I don't understand.. nr_sectors <=  dbms->total_sectors - 
dbms->cur_sector and it can't put us past the end...
>
>> +
>> +    send_bitmap(f, dbms, dbms->cur_sector, nr_sectors);
>> +
>> +    dbms->cur_sector += nr_sectors;
>> +    if (dbms->cur_sector >= dbms->total_sectors) {
>> +        dbms->bulk_completed = true;
>> +    }
>> +}
>> +
>> +/* Called with no lock taken.  */
>> +static void bulk_phase(QEMUFile *f, bool limit)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +
>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>> +        while (!dbms->bulk_completed) {
>> +            bulk_phase_send_chunk(f, dbms);
>> +            if (limit && qemu_file_rate_limit(f)) {
>> +                return;
>> +            }
>> +        }
>> +    }
>> +
>> +    dirty_bitmap_mig_state.bulk_completed = true;
>> +}
>
> OK.
>
>> +
>> +static void blk_mig_reset_dirty_cursor(void)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +
>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>> +        dbms->cur_dirty = 0;
>> +    }
>> +}
>> +
>
> OK.
>
>> +/* Called with iothread lock taken.  */
>> +static void dirty_phase_send_chunk(QEMUFile *f, 
>> DirtyBitmapMigBitmapState *dbms)
>> +{
>> +    uint32_t nr_sectors;
>> +
>> +    while (dbms->cur_dirty < dbms->total_sectors &&
>> +           !hbitmap_get(dbms->dirty_bitmap, dbms->cur_dirty)) {
>> +        dbms->cur_dirty += dbms->sectors_per_chunk;
>> +    }
>
> OK, so we fast forward the dirty cursor while the meta-bitmap is 
> empty. Is it not worth using the HBitmapIterator here? You can reset 
> them everywhere you reset the dirty cursor, and then just fast-seek to 
> the first dirty sector.
Yes, I've thought about it, just used simpler way (copied from 
migration/block.c) for an early version of the patch set. I will do it.
>
>> +
>> +    if (dbms->cur_dirty >= dbms->total_sectors) {
>> +        return;
>> +    }
>> +
>> +    nr_sectors = MIN(dbms->total_sectors - dbms->cur_dirty,
>> +                     dbms->sectors_per_chunk);
>
> What happens if nr_sectors goes past the end?
>
>> +    send_bitmap(f, dbms, dbms->cur_dirty, nr_sectors);
>> +    hbitmap_reset(dbms->dirty_bitmap, dbms->cur_dirty, 
>> dbms->sectors_per_chunk);
>> +    dbms->cur_dirty += nr_sectors;
>> +}
>> +
>> +/* Called with iothread lock taken.
>> + *
>> + * return value:
>> + * 0: too much data for max_downtime
>> + * 1: few enough data for max_downtime
>> +*/
>
> dirty_phase below doesn't have a return value.
rudimentary comment.. thanks.
>
>> +static void dirty_phase(QEMUFile *f, bool limit)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +
>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>> +        while (dbms->cur_dirty < dbms->total_sectors) {
>> +            dirty_phase_send_chunk(f, dbms);
>> +            if (limit && qemu_file_rate_limit(f)) {
>> +                return;
>> +            }
>> +        }
>> +    }
>> +}
>> +
>
> OK.
>
>> +
>> +/* Called with iothread lock taken.  */
>> +static void dirty_bitmap_mig_cleanup(void)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +
>> +    unset_dirty_tracking();
>> +
>> +    while ((dbms = 
>> QSIMPLEQ_FIRST(&dirty_bitmap_mig_state.dbms_list)) != NULL) {
>> + QSIMPLEQ_REMOVE_HEAD(&dirty_bitmap_mig_state.dbms_list, entry);
>> +        g_free(dbms);
>> +    }
>> +}
>> +
>
> OK.
>
>> +static void dirty_bitmap_migration_cancel(void *opaque)
>> +{
>> +    dirty_bitmap_mig_cleanup();
>> +}
>> +
>
> OK.
>
>> +static int dirty_bitmap_save_iterate(QEMUFile *f, void *opaque)
>> +{
>> +    DPRINTF("Enter save live iterate\n");
>> +
>> +    blk_mig_reset_dirty_cursor();
>
> I suppose this is because it's easier to check if we are finished by 
> starting from sector 0 every time.
>
> A harder, but faster method may be: Use HBitmapIterators, but don't 
> reset them every iteration: just iterate until the end, and check that 
> the bitmap is empty. If the meta bitmap is empty, the dirty phase is 
> complete. If the meta bitmap is NOT empty, reset the HBI and continue 
> allowing iterations over the dirty phase.
Ok, will do.
>
>> +
>> +    if (dirty_bitmap_mig_state.bulk_completed) {
>> +        qemu_mutex_lock_iothread();
>> +        dirty_phase(f, true);
>> +        qemu_mutex_unlock_iothread();
>> +    } else {
>> +        bulk_phase(f, true);
>> +    }
>> +
>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>> +
>> +    return dirty_bitmap_mig_state.bulk_completed;
>> +}
>> +
>> +/* Called with iothread lock taken.  */
>> +
>> +static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +    DPRINTF("Enter save live complete\n");
>> +
>> +    if (!dirty_bitmap_mig_state.bulk_completed) {
>> +        bulk_phase(f, false);
>> +    }
>
> [Not expertly familiar with savevm:] Under what conditions can this 
> happen?
This can happen. save_complete will happen when savevm decide that 
pending data size to send is small enough. It was the case for my bugfix 
for migration/block.c about pending. To prevent save_complete when bulk 
phase isn't completed, save_pending returns (in my bugfix for 
migration/block.c) big value. Here I decided to make more honest 
save_pending, so I need to complete (if it doesn't) bulk phase in 
save_complete.
>
>> +
>> +    blk_mig_reset_dirty_cursor();
>> +    dirty_phase(f, false);
>> +
>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>> +        uint8_t flags = DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME |
>> +                        DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME |
>> +                        DIRTY_BITMAP_MIG_FLAG_ENABLED;
>> +
>> +        qemu_put_byte(f, flags);
>> +        qemu_put_name(f, bdrv_get_device_name(dbms->bs));
>> +        qemu_put_name(f, bdrv_dirty_bitmap_name(dbms->bitmap));
>> +        qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
>> +    }
>> +
>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>> +
>> +    DPRINTF("Dirty bitmaps migration completed\n");
>> +
>> +    dirty_bitmap_mig_cleanup();
>> +    return 0;
>> +}
>> +
>
> I suppose we don't need a flag that distinctly SAYS this is the end 
> section, since we can tell by omission of 
> DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK or ZERO_CHUNK.
Hmm. I think it simplifies the logic (to use EOS after each section). 
And the same approach is in migration/block.c.. It's a question about 
which format is better:  "Each section for dirty_bitmap_load ends with 
EOS" or "Each section for dirty_bitmap_load ends with EOS except the 
last one. The last one may be recognized by absent NORMAL_CHUNK and 
ZERO_CHUNK"
>
>> +static uint64_t dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
>> +                                          uint64_t max_size)
>> +{
>> +    DirtyBitmapMigBitmapState *dbms;
>> +    uint64_t pending = 0;
>> +
>> +    qemu_mutex_lock_iothread();
>> +
>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>> +        uint64_t sectors = hbitmap_count(dbms->dirty_bitmap);
>> +        if (!dbms->bulk_completed) {
>> +            sectors += dbms->total_sectors - dbms->cur_sector;
>> +        }
>> +        pending += bdrv_dirty_bitmap_data_size(dbms->bitmap, sectors);
>> +    }
>> +
>> +    qemu_mutex_unlock_iothread();
>> +
>> +    DPRINTF("Enter save live pending %" PRIu64 ", max: %" PRIu64 "\n",
>> +            pending, max_size);
>> +    return pending;
>> +}
>> +
>
> OK.
>
>> +static int dirty_bitmap_load(QEMUFile *f, void *opaque, int version_id)
>> +{
>> +    int flags;
>> +
>> +    static char device_name[256], bitmap_name[256];
>> +    static BlockDriverState *bs;
>> +    static BdrvDirtyBitmap *bitmap;
>> +
>> +    uint8_t *buf;
>> +    uint64_t first_sector;
>> +    uint32_t  nr_sectors;
>> +    int ret;
>> +
>> +    DPRINTF("load start\n");
>> +
>> +    do {
>> +        flags = qemu_get_byte(f);
>> +        DPRINTF("flags: %x\n", flags);
>> +
>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
>> +            qemu_get_name(f, device_name);
>> +            bs = bdrv_find(device_name);
>
> Similar to the above confusion, you may want bdrv_lookup_bs or 
> similar, since we're going to be looking for BDS nodes instead of 
> "devices."
In this case, should it be changed in migration/block.c too?
>
>> +            if (!bs) {
>> +                fprintf(stderr, "Error: unknown block device '%s'\n",
>> +                        device_name);
>> +                return -EINVAL;
>> +            }
>> +        }
>> +
>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
>> +            if (!bs) {
>> +                fprintf(stderr, "Error: block device name is not 
>> set\n");
>> +                return -EINVAL;
>> +            }
>> +
>> +            qemu_get_name(f, bitmap_name);
>> +            bitmap = bdrv_find_dirty_bitmap(bs, bitmap_name);
>> +            if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
>> +                /* First chunk from this bitmap */
>> +                uint64_t granularity = qemu_get_be64(f);
>> +                if (!bitmap) {
>> +                    Error *local_err = NULL;
>> +                    bitmap = bdrv_create_dirty_bitmap(bs, granularity,
>> + bitmap_name,
>> + &local_err);
>> +                    if (!bitmap) {
>> +                        error_report("%s", 
>> error_get_pretty(local_err));
>> +                        error_free(local_err);
>> +                        return -EINVAL;
>> +                    }
>> +                } else {
>> +                    uint64_t dest_granularity =
>> +                        bdrv_dirty_bitmap_granularity(bs, bitmap);
>> +                    if (dest_granularity != granularity) {
>> +                        fprintf(stderr,
>> +                                "Error: "
>> +                                "Migrated bitmap granularity (%" 
>> PRIu64 ") "
>> +                                "is not match with destination 
>> bitmap '%s' "
>> +                                "granularity (%" PRIu64 ")\n",
>> +                                granularity,
>> +                                bitmap_name,
>> +                                dest_granularity);
>> +                        return -EINVAL;
>> +                    }
>> +                }
>> +                bdrv_disable_dirty_bitmap(bitmap);
>> +            }
>> +            if (!bitmap) {
>> +                fprintf(stderr, "Error: unknown dirty bitmap "
>> +                        "'%s' for block device '%s'\n",
>> +                        bitmap_name, device_name);
>> +                return -EINVAL;
>> +            }
>> +        }
>> +
>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_ENABLED) {
>> +            bool enabled;
>> +            if (!bitmap) {
>> +                fprintf(stderr, "Error: dirty bitmap name is not 
>> set\n");
>> +                return -EINVAL;
>> +            }
>> +            bdrv_dirty_bitmap_deserialize_finish(bitmap);
>> +            /* complete migration */
>> +            enabled = qemu_get_byte(f);
>> +            if (enabled) {
>> +                bdrv_enable_dirty_bitmap(bitmap);
>> +            }
>> +        }
>
> Oh, so you use the ENABLED flag to show that migration is over.
Yes, it was bad idea..
> If we are going to commit to a stream format for bitmaps, though, 
> maybe it's best to actually create a "COMPLETION BLOCK" flag and then 
> split this function into two pieces:
>
> (1) The part that receives regular / zero blocks, and
> (2) The part that receives completion data.
>
> That way, if we change the properties that bitmaps have down the line, 
> we aren't reliant on literally the "enabled" flag to decide what to do.
>
> Also, it might help make this fairly long function a little smaller 
> and more readable.
Ok.
>
>> +
>> +        if (flags & (DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK |
>> +                     DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK)) {
>> +            if (!bs) {
>> +                fprintf(stderr, "Error: block device name is not 
>> set\n");
>> +                return -EINVAL;
>> +            }
>> +            if (!bitmap) {
>> +                fprintf(stderr, "Error: dirty bitmap name is not 
>> set\n");
>> +                return -EINVAL;
>> +            }
>> +
>> +            first_sector = qemu_get_be64(f);
>> +            nr_sectors = qemu_get_be32(f);
>> +            DPRINTF("chunk: %lu %u\n", first_sector, nr_sectors);
>> +
>> +
>> +            if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
>> +                bdrv_dirty_bitmap_deserialize_part0(bitmap, 
>> first_sector,
>> + nr_sectors);
>> +            } else {
>> +                uint64_t buf_size = qemu_get_be64(f);
>> +                uint64_t needed_size =
>> +                    bdrv_dirty_bitmap_data_size(bitmap, nr_sectors);
>> +
>> +                if (needed_size > buf_size) {
>> +                    fprintf(stderr,
>> +                            "Error: Migrated bitmap granularity is 
>> not "
>> +                            "match with destination bitmap 
>> granularity\n");
>> +                    return -EINVAL;
>> +                }
>> +
>
> "Migrated bitmap granularity doesn't match the destination bitmap 
> granularity" perhaps.
>
>> +                buf = g_malloc(buf_size);
>> +                qemu_get_buffer(f, buf, buf_size);
>> +                bdrv_dirty_bitmap_deserialize_part(bitmap, buf,
>> + first_sector,
>> +                                                   nr_sectors);
>> +                g_free(buf);
>> +            }
>> +        }
>> +
>> +        ret = qemu_file_get_error(f);
>> +        if (ret != 0) {
>> +            return ret;
>> +        }
>> +    } while (!(flags & DIRTY_BITMAP_MIG_FLAG_EOS));
>> +
>> +    DPRINTF("load finish\n");
>> +    return 0;
>> +}
>> +
>> +static void dirty_bitmap_set_params(const MigrationParams *params, 
>> void *opaque)
>> +{
>> +    dirty_bitmap_mig_state.migration_enable = params->dirty;
>> +}
>> +
>
> OK; though I am not immediately aware of what changes need to happen 
> to accommodate Eric's suggestions.
This function will be dropped in v3.
>
>> +static bool dirty_bitmap_is_active(void *opaque)
>> +{
>> +    return dirty_bitmap_mig_state.migration_enable == 1;
>> +}
>> +
>
> OK.
>
>> +static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
>> +{
>> +    init_dirty_bitmap_migration(f);
>> +
>> +    qemu_mutex_lock_iothread();
>> +    /* start track dirtyness of dirty bitmaps */
>> +    set_dirty_tracking();
>> +    qemu_mutex_unlock_iothread();
>> +
>> +    blk_mig_reset_dirty_cursor();
>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>> +
>> +    return 0;
>> +}
>> +
>
> OK; see dirty_bitmap_mig_init below, though.
>
>> +static SaveVMHandlers savevm_block_handlers = {
>> +    .set_params = dirty_bitmap_set_params,
>> +    .save_live_setup = dirty_bitmap_save_setup,
>> +    .save_live_iterate = dirty_bitmap_save_iterate,
>> +    .save_live_complete = dirty_bitmap_save_complete,
>> +    .save_live_pending = dirty_bitmap_save_pending,
>> +    .load_state = dirty_bitmap_load,
>> +    .cancel = dirty_bitmap_migration_cancel,
>> +    .is_active = dirty_bitmap_is_active,
>> +};
>> +
>> +void dirty_bitmap_mig_init(void)
>> +{
>> +    QSIMPLEQ_INIT(&dirty_bitmap_mig_state.dbms_list);
>
> Maybe I haven't looked thoroughly enough yet, but it's weird that part 
> of the dirty_bitmap_mig_state is initialized here, and the rest of it 
> in init_dirty_bitmap_migration. I'd prefer to keep it all together, if 
> possible.
dirty_bitmap_mig_init is called one time when qemu starts. QSIMPLEQ_INIT 
should be called once. dirty_bitmap_save_setup is called on every 
migration start, it's like 'reinitialize'.
>
>> +
>> +    register_savevm_live(NULL, "dirty-bitmap", 0, 1, 
>> &savevm_block_handlers,
>> +                         &dirty_bitmap_mig_state);
>> +}
>
> OK.
>
>> diff --git a/vl.c b/vl.c
>> index a824a7d..dee7220 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -4184,6 +4184,7 @@ int main(int argc, char **argv, char **envp)
>>
>>       blk_mig_init();
>>       ram_mig_init();
>> +    dirty_bitmap_mig_init();
>>
>>       /* If the currently selected machine wishes to override the 
>> units-per-bus
>>        * property of its default HBA interface type, do so now. */
>>
>
> Hm, since dirty bitmaps are a sub-component of the block layer, would 
> it not make sense to put this hook under blk_mig_init, perhaps?
IMHO the reason to put it here is to keep all 
register_savevm_live-entities in one place.
>
>
> Overall this looks very clean compared to the intermingled format in 
> V1, and the code is organized pretty well. Just a few minor comments, 
> and I'd like to get the opinion of the migration maintainers, but I am 
> happy. Sorry it took me so long to review, please feel free to let me 
> know if you disagree with any of my opinions :)
>
> Thank you,
> --John

Thank you for reviewing my series)
Vladimir Sementsov-Ogievskiy Feb. 13, 2015, 9:06 a.m. UTC | #3
>>
>>> +
>>> +    blk_mig_reset_dirty_cursor();
>>> +    dirty_phase(f, false);
>>> +
>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>>> +        uint8_t flags = DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME |
>>> +                        DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME |
>>> +                        DIRTY_BITMAP_MIG_FLAG_ENABLED;
>>> +
>>> +        qemu_put_byte(f, flags);
>>> +        qemu_put_name(f, bdrv_get_device_name(dbms->bs));
>>> +        qemu_put_name(f, bdrv_dirty_bitmap_name(dbms->bitmap));
>>> +        qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
>>> +    }
>>> +
>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>> +
>>> +    DPRINTF("Dirty bitmaps migration completed\n");
>>> +
>>> +    dirty_bitmap_mig_cleanup();
>>> +    return 0;
>>> +}
>>> +
>>
>> I suppose we don't need a flag that distinctly SAYS this is the end 
>> section, since we can tell by omission of 
>> DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK or ZERO_CHUNK.
> Hmm. I think it simplifies the logic (to use EOS after each section). 
> And the same approach is in migration/block.c.. It's a question about 
> which format is better:  "Each section for dirty_bitmap_load ends with 
> EOS" or "Each section for dirty_bitmap_load ends with EOS except the 
> last one. The last one may be recognized by absent NORMAL_CHUNK and 
> ZERO_CHUNK"

Oh, sorry, no, it's important EOS. There are several blocks with no 
*_CHUNK! Several bitmaps. And loop in dirty_bitmap_load will read them 
iteratively, and it will finish when find EOS.
John Snow Feb. 13, 2015, 5:32 p.m. UTC | #4
On 02/13/2015 04:06 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>
>>>> +
>>>> +    blk_mig_reset_dirty_cursor();
>>>> +    dirty_phase(f, false);
>>>> +
>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>>>> +        uint8_t flags = DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME |
>>>> +                        DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME |
>>>> +                        DIRTY_BITMAP_MIG_FLAG_ENABLED;
>>>> +
>>>> +        qemu_put_byte(f, flags);
>>>> +        qemu_put_name(f, bdrv_get_device_name(dbms->bs));
>>>> +        qemu_put_name(f, bdrv_dirty_bitmap_name(dbms->bitmap));
>>>> +        qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
>>>> +    }
>>>> +
>>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>>> +
>>>> +    DPRINTF("Dirty bitmaps migration completed\n");
>>>> +
>>>> +    dirty_bitmap_mig_cleanup();
>>>> +    return 0;
>>>> +}
>>>> +
>>>
>>> I suppose we don't need a flag that distinctly SAYS this is the end
>>> section, since we can tell by omission of
>>> DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK or ZERO_CHUNK.
>> Hmm. I think it simplifies the logic (to use EOS after each section).
>> And the same approach is in migration/block.c.. It's a question about
>> which format is better:  "Each section for dirty_bitmap_load ends with
>> EOS" or "Each section for dirty_bitmap_load ends with EOS except the
>> last one. The last one may be recognized by absent NORMAL_CHUNK and
>> ZERO_CHUNK"
>
> Oh, sorry, no, it's important EOS. There are several blocks with no
> *_CHUNK! Several bitmaps. And loop in dirty_bitmap_load will read them
> iteratively, and it will finish when find EOS.
>
>

Sorry, I worded that poorly. I was wondering why you didn't have an 
explicit "end of bitmap" flag, and I realized that you are doing this 
check essentially by the absence of the NORMAL_CHUNK/ZERO_CHUNK flags.

This is really just a comment on my part; I was expecting a more 
distinct "It is now safe to rebuild the bitmap" flag and was just 
commenting on why we didn't necessarily need one.

I think in another comment I point out that an "end of bitmap" flag 
might make the loading function simpler, and I still think that might be 
nice.
Vladimir Sementsov-Ogievskiy Feb. 13, 2015, 5:41 p.m. UTC | #5
On 13.02.2015 20:32, John Snow wrote:
>
>
> On 02/13/2015 04:06 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>>
>>>>> +
>>>>> +    blk_mig_reset_dirty_cursor();
>>>>> +    dirty_phase(f, false);
>>>>> +
>>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, 
>>>>> entry) {
>>>>> +        uint8_t flags = DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME |
>>>>> +                        DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME |
>>>>> +                        DIRTY_BITMAP_MIG_FLAG_ENABLED;
>>>>> +
>>>>> +        qemu_put_byte(f, flags);
>>>>> +        qemu_put_name(f, bdrv_get_device_name(dbms->bs));
>>>>> +        qemu_put_name(f, bdrv_dirty_bitmap_name(dbms->bitmap));
>>>>> +        qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
>>>>> +    }
>>>>> +
>>>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>>>> +
>>>>> +    DPRINTF("Dirty bitmaps migration completed\n");
>>>>> +
>>>>> +    dirty_bitmap_mig_cleanup();
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>
>>>> I suppose we don't need a flag that distinctly SAYS this is the end
>>>> section, since we can tell by omission of
>>>> DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK or ZERO_CHUNK.
>>> Hmm. I think it simplifies the logic (to use EOS after each section).
>>> And the same approach is in migration/block.c.. It's a question about
>>> which format is better:  "Each section for dirty_bitmap_load ends with
>>> EOS" or "Each section for dirty_bitmap_load ends with EOS except the
>>> last one. The last one may be recognized by absent NORMAL_CHUNK and
>>> ZERO_CHUNK"
>>
>> Oh, sorry, no, it's important EOS. There are several blocks with no
>> *_CHUNK! Several bitmaps. And loop in dirty_bitmap_load will read them
>> iteratively, and it will finish when find EOS.
>>
>>
>
> Sorry, I worded that poorly. I was wondering why you didn't have an 
> explicit "end of bitmap" flag, and I realized that you are doing this 
> check essentially by the absence of the NORMAL_CHUNK/ZERO_CHUNK flags.
>
> This is really just a comment on my part; I was expecting a more 
> distinct "It is now safe to rebuild the bitmap" flag and was just 
> commenting on why we didn't necessarily need one.
>
> I think in another comment I point out that an "end of bitmap" flag 
> might make the loading function simpler, and I still think that might 
> be nice.
Ok. Today I've refactored these things, there would be separate start 
and complete frames of bitmap, the first with additional granularity 
field (without any dirty bitmap data) and the second with additional 
enabled field (also without data).
John Snow Feb. 13, 2015, 8:22 p.m. UTC | #6
On 02/13/2015 03:19 AM, Vladimir Sementsov-Ogievskiy wrote:
> On 11.02.2015 00:33, John Snow wrote:
>> Peter Maydell: What's the right way to license a file as copied from a
>> previous version? See below, please;
>>
>> Max, Markus: ctrl+f "bdrv_get_device_name" and let me know what you
>> think, if you would.
>>
>> Juan, Amit, David: Copying migration maintainers.
>>
>> On 01/27/2015 05:56 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> Live migration of dirty bitmaps. Only named dirty bitmaps are migrated.
>>> If destination qemu is already containing a dirty bitmap with the same
>>> name as a migrated bitmap, then their granularities should be the same,
>>> otherwise the error will be generated. If destination qemu doesn't
>>> contain such bitmap it will be created.
>>>
>>> format:
>>>
>>> 1 byte: flags
>>>
>>> [ 1 byte: node name size ] \  flags & DEVICE_NAME
>>> [ n bytes: node name     ] /
>>>
>>> [ 1 byte: bitmap name size ]       \
>>> [ n bytes: bitmap name     ]       | flags & BITMAP_NAME
>>> [ [ be64: granularity    ] ]  flags & GRANULARITY
>>>
>>> [ 1 byte: bitmap enabled bit ] flags & ENABLED
>>>
>>> [ be64: start sector      ] \ flags & (NORMAL_CHUNK | ZERO_CHUNK)
>>> [ be32: number of sectors ] /
>>>
>>> [ be64: buffer size ] \ flags & NORMAL_CHUNK
>>> [ n bytes: buffer   ] /
>>>
>>> The last chunk should contain flags & EOS. The chunk may skip device
>>> and/or bitmap names, assuming them to be the same with the previous
>>> chunk. GRANULARITY is sent with the first chunk for the bitmap. ENABLED
>>> bit is sent in the end of "complete" stage of migration. So when
>>> destination gets ENABLED flag it should deserialize_finish the bitmap
>>> and set its enabled bit to corresponding value.
>>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@parallels.com>
>>> ---
>>>   include/migration/block.h |   1 +
>>>   migration/Makefile.objs   |   2 +-
>>>   migration/dirty-bitmap.c  | 606
>>> ++++++++++++++++++++++++++++++++++++++++++++++
>>>   vl.c                      |   1 +
>>>   4 files changed, 609 insertions(+), 1 deletion(-)
>>>   create mode 100644 migration/dirty-bitmap.c
>>>
>>> diff --git a/include/migration/block.h b/include/migration/block.h
>>> index ffa8ac0..566bb9f 100644
>>> --- a/include/migration/block.h
>>> +++ b/include/migration/block.h
>>> @@ -14,6 +14,7 @@
>>>   #ifndef BLOCK_MIGRATION_H
>>>   #define BLOCK_MIGRATION_H
>>>
>>> +void dirty_bitmap_mig_init(void);
>>>   void blk_mig_init(void);
>>>   int blk_mig_active(void);
>>>   uint64_t blk_mig_bytes_transferred(void);
>>
>> OK.
>>
>>> diff --git a/migration/Makefile.objs b/migration/Makefile.objs
>>> index d929e96..9adfda9 100644
>>> --- a/migration/Makefile.objs
>>> +++ b/migration/Makefile.objs
>>> @@ -6,5 +6,5 @@ common-obj-y += xbzrle.o
>>>   common-obj-$(CONFIG_RDMA) += rdma.o
>>>   common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o
>>>
>>> -common-obj-y += block.o
>>> +common-obj-y += block.o dirty-bitmap.o
>>>
>>
>> OK.
>>
>>> diff --git a/migration/dirty-bitmap.c b/migration/dirty-bitmap.c
>>> new file mode 100644
>>> index 0000000..8621218
>>> --- /dev/null
>>> +++ b/migration/dirty-bitmap.c
>>> @@ -0,0 +1,606 @@
>>> +/*
>>> + * QEMU dirty bitmap migration
>>> + *
>>> + * derived from migration/block.c
>>> + *
>>> + * Author:
>>> + * Sementsov-Ogievskiy Vladimir <vsementsov@parallels.com>
>>> + *
>>> + * original copyright message:
>>> + *
>>> =====================================================================
>>> + * Copyright IBM, Corp. 2009
>>> + *
>>> + * Authors:
>>> + *  Liran Schour <lirans@il.ibm.com>
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2. See
>>> + * the COPYING file in the top-level directory.
>>> + *
>>> + * Contributions after 2012-01-13 are licensed under the terms of the
>>> + * GNU GPL, version 2 or (at your option) any later version.
>>> + *
>>> =====================================================================
>>> + */
>>> +
>>
>> Not super familiar with the right way to do licensing here; it's
>> possible you may not need to copy the original here, but I'm not sure.
>> You will want to make it clear what license applies to /your/ work, I
>> think. Maybe Peter Maydell can clue us in.
>>
>>> +#include "block/block.h"
>>> +#include "qemu/main-loop.h"
>>> +#include "qemu/error-report.h"
>>> +#include "migration/block.h"
>>> +#include "migration/migration.h"
>>> +#include "qemu/hbitmap.h"
>>> +#include <assert.h>
>>> +
>>> +#define CHUNK_SIZE                       (1 << 20)
>>> +
>>> +#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
>>> +#define DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK  0x02
>>> +#define DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK    0x04
>>> +#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x08
>>> +#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x10
>>> +#define DIRTY_BITMAP_MIG_FLAG_GRANULARITY   0x20
>>> +#define DIRTY_BITMAP_MIG_FLAG_ENABLED       0x40
>>> +/* flags should be <= 0xff */
>>> +
>>
>> We should give ourselves a little breathing room with the flags, since
>> we've only got room for one more.
> Ok. Will one more byte be enough?

I should hope so. If you do add a completion chunk and flag, that fills 
up the first byte completely, so having a second byte is a good idea.

I might recommend reserving the last bit of the second byte to be a flag 
such as DIRTY_BITMAP_EXTRA_FLAGS that indicates the presence of 
additional byte(s) of flags, to be determined later, if we ever need 
them, but two bytes for now should be sufficient.

>>
>>> +/* #define DEBUG_DIRTY_BITMAP_MIGRATION */
>>> +
>>> +#ifdef DEBUG_DIRTY_BITMAP_MIGRATION
>>> +#define DPRINTF(fmt, ...) \
>>> +    do { printf("dirty_migration: " fmt, ## __VA_ARGS__); } while (0)
>>> +#else
>>> +#define DPRINTF(fmt, ...) \
>>> +    do { } while (0)
>>> +#endif
>>> +
>>> +typedef struct DirtyBitmapMigBitmapState {
>>> +    /* Written during setup phase. */
>>> +    BlockDriverState *bs;
>>> +    BdrvDirtyBitmap *bitmap;
>>> +    HBitmap *dirty_bitmap;
>>
>> For my own sanity, I'd really prefer "bitmap" and "meta_bitmap" here;
>> "dirty_bitmap" is often used as a synonym (outside of this file) to
>> refer to the BdrvDirtyBitmap in general, so it's usage here can be
>> somewhat confusing.
>>
>> I'd appreciate "dirty_dirty_bitmap" as in your previous patch for
>> consistency, or "meta_bitmap" as I recommend.
>>
> Ok
>>> +    int64_t total_sectors;
>>> +    uint64_t sectors_per_chunk;
>>> +    QSIMPLEQ_ENTRY(DirtyBitmapMigBitmapState) entry;
>>> +
>>> +    /* For bulk phase. */
>>> +    bool bulk_completed;
>>> +    int64_t cur_sector;
>>> +    bool granularity_sent;
>>> +
>>> +    /* For dirty phase. */
>>> +    int64_t cur_dirty;
>>> +} DirtyBitmapMigBitmapState;
>>> +
>>> +typedef struct DirtyBitmapMigState {
>>> +    int migration_enable;
>>> +    QSIMPLEQ_HEAD(dbms_list, DirtyBitmapMigBitmapState) dbms_list;
>>> +
>>> +    bool bulk_completed;
>>> +
>>> +    /* for send_bitmap() */
>>> +    BlockDriverState *prev_bs;
>>> +    BdrvDirtyBitmap *prev_bitmap;
>>> +} DirtyBitmapMigState;
>>> +
>>> +static DirtyBitmapMigState dirty_bitmap_mig_state;
>>> +
>>> +/* read name from qemu file:
>>> + * format:
>>> + * 1 byte : len = name length (<256)
>>> + * len bytes : name without last zero byte
>>> + *
>>> + * name should point to the buffer >= 256 bytes length
>>> + */
>>> +static char *qemu_get_name(QEMUFile *f, char *name)
>>> +{
>>> +    int len = qemu_get_byte(f);
>>> +    qemu_get_buffer(f, (uint8_t *)name, len);
>>> +    name[len] = '\0';
>>> +
>>> +    DPRINTF("get name: %d %s\n", len, name);
>>> +
>>> +    return name;
>>> +}
>>> +
>>
>> OK. Maybe these could be "qemu_put_string" or "qemu_get_string" and
>> added to qemu-file.c so others can use them.
> If no objections for sharing this format, I'll do it.
>>
>>> +/* write name to qemu file:
>>> + * format:
>>> + * same as for qemu_get_name
>>> + *
>>> + * maximum name length is 255
>>> + */
>>> +static void qemu_put_name(QEMUFile *f, const char *name)
>>> +{
>>> +    int len = strlen(name);
>>> +
>>> +    DPRINTF("put name: %d %s\n", len, name);
>>> +
>>> +    assert(len < 256);
>>> +    qemu_put_byte(f, len);
>>> +    qemu_put_buffer(f, (const uint8_t *)name, len);
>>> +}
>>> +
>>
>> OK.
>>
>>> +static void send_bitmap(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
>>> +                        uint64_t start_sector, uint32_t nr_sectors)
>>> +{
>>> +    BlockDriverState *bs = dbms->bs;
>>> +    BdrvDirtyBitmap *bitmap = dbms->bitmap;
>>> +    uint8_t flags = 0;
>>> +    /* align for buffer_is_zero() */
>>> +    uint64_t align = 4 * sizeof(long);
>>> +    uint64_t buf_size =
>>> +        (bdrv_dirty_bitmap_data_size(bitmap, nr_sectors) + align - 1) &
>>> +        ~(align - 1);
>>> +    uint8_t *buf = g_malloc0(buf_size);
>>> +
>>> +    bdrv_dirty_bitmap_serialize_part(bitmap, buf, start_sector,
>>> nr_sectors);
>>> +
>>> +    if (buffer_is_zero(buf, buf_size)) {
>>> +        g_free(buf);
>>> +        buf = NULL;
>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK;
>>> +    } else {
>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK;
>>> +    }
>>> +
>>> +    if (bs != dirty_bitmap_mig_state.prev_bs) {
>>> +        dirty_bitmap_mig_state.prev_bs = bs;
>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME;
>>> +    }
>>> +
>>> +    if (bitmap != dirty_bitmap_mig_state.prev_bitmap) {
>>> +        dirty_bitmap_mig_state.prev_bitmap = bitmap;
>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME;
>>> +    }
>>
>> OK, so we use the current bs/bitmap under consideration to detect if
>> we have switched context, and put the names in the stream when it
>> happens. OK.
>>
>>> +
>>> +    if (dbms->granularity_sent == 0) {
>>> +        dbms->granularity_sent = 1;
>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_GRANULARITY;
>>> +    }
>>> +
>>> +    DPRINTF("Enter send_bitmap"
>>> +            "\n   flags:        %x"
>>> +            "\n   start_sector: %" PRIu64
>>> +            "\n   nr_sectors:   %" PRIu32
>>> +            "\n   data_size:    %" PRIu64 "\n",
>>> +            flags, start_sector, nr_sectors, buf_size);
>>> +
>>> +    qemu_put_byte(f, flags);
>>> +
>>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
>>> +        qemu_put_name(f, bdrv_get_device_name(bs));
>>> +    }
>>
>> I am still not fully clear on this myself, but I think we are phasing
>> out bdrv_get_device_name. In the context of bitmaps, we are mostly
>> likely using them one-per-tree, but they /can/ be attached
>> one-per-node, so we shouldn't be trying to get the device name here,
>> but rather, the node-name.
>>
>> I /think/ we may want to be using bdrv_get_node_name, but I think it
>> is not currently true that all nodes WILL be named ... I am CC'ing
>> Markus Armbruster and Max Reitz for (A) A refresher course and (B)
>> Opinions on what function call might make sense here, given that we
>> wish to migrate bitmaps attached to arbitrary nodes.
> Hmm.. I'm not familiar with hierarchy of nodes and devices. As I
> understand, both command_line- and qmp- created drives are created
> through blockdev_init, which creates both blk(device) and bs(node)
> through blk_new_with_bs.. Am I right? Also, bdrv_get_device_name is used
> in migration/block.c.

Now that I'm more awake, here's a better rundown of what's going on:

It's something that is a little bit in flux right now, unfortunately. 
We're trying to transition to a format where we have arbitrarily complex 
Block trees, where the root of the tree is always a BlockBackend (See 
the big series by Max Reitz) and the configuration of the tree may 
become arbitrarily complex.

Simple trees may consist of just one BlockBackend and one 
BlockDriverState node, where I think we can refer to this BDS as the 
"root node," not to be confused with the BlockBackend "root." The 
BlockBackend is a relatively new invention, so it isn't actually used 
consistently everywhere yet.

In the future, we may have commands that make distinctions based on if 
you want to work on the BlockBackend, the root node under the 
blockbackend associated with a BDS, only the explicit node/BDS you 
identify, or some combination of the above semantics.

As of right now, bitmaps can be *attached* to any arbitrary node, though 
they are currently only *useful* when attached to the first child of the 
BlockBackend, the root node. It's only useful currently in cases where 
it is attached to the root because I've only proposed  patches for 
adding bitmap support to produce incrementals for Drive Backup, which 
operates only on drives/devices (the root node of a tree.)

However, in the context of migrating, it could be that we want to 
migrate any bitmaps attached to /any/ nodes, so we should be careful 
about what names we are pulling - we don't necessarily want the name of 
the root node or BlockBackend, we may want the BDS and accompanying name 
of strictly the node the bitmap is attached to.

I know other areas of the code don't provide a good example for this 
distinction, yet, but the block layer people are actively working on 
fixing that. (See also the back-and-forth reviews for what to name my 
QMP parameters in the incremental backup patches for some overview of 
this semantic transition.)

That said, We should think carefully about *which* name we want to put 
in the stream and what implications it has for migration.


(1) bdrv_get_node_name and bdrv_find_node

This would migrate bitmaps as attached to their specific BDS. This would 
mean that the node layout on the destination is either identical, or 
similar enough such that no named bitmaps are attached to a node not 
present on the destination.

This gives us precision: bitmaps may be attached lower in the tree and 
can provide more fine-grained detail for which layers have been changed 
or modified during runtime.

This also gives us fragility: In cases where we transfer, say, a complex 
tree of nodes and collapse it to a single destination drive, we'd be 
unable to migrate bitmaps not attached to the root along with it, 
because they'd have nowhere meaningful to attach.

It is perhaps somewhat unneccessary at this exact moment in time, as 
well, because bitmaps are currently only useful on root nodes.

(2) bdrv_get_device_name and bdrv_lookup_bs(device_name, NULL, errp)

This would migrate any bitmaps in a tree and attach them to the entire 
drive on the destination.

This is simpler: You just need to make sure that the root nodes have the 
same names, which is a lot easier to manage.

This matches how drive migration currently appears to work: The entire 
tree appears to be generally squashed into a single node and transferred 
cluster-by-cluster, without general consideration as to the layout of 
the local block tree. As we both know by now, none of the metadata is 
transferred, just the data.

It prevents migration of just bitmaps where you WANT the extra 
complexity: If a bitmap is attached lower in the tree, re-affixing it to 
the root of a destination tree might invalidate the semantics of what 
that bitmap was meant to track, and it may become useless.


So in summary:
using device names is probably fine for now, as it matches the current 
use case of bitmaps as well as drive migration; but using node names may 
give us more power and precision later.

I talked to Max about it, and he is leaning towards using device names 
for now and switching to node names if we decide we want that power.

(...I wonder if we could use a flag, for now, that says we're including 
DEVICE names. Later, we could add a flag that says we're using NODE 
names and add an option to toggle as the usage case sees fit.)


Are you confused yet? :D


>>
>>> +
>>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
>>> +        qemu_put_name(f, bdrv_dirty_bitmap_name(bitmap));
>>> +
>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
>>> +            qemu_put_be64(f, bdrv_dirty_bitmap_granularity(bs,
>>> bitmap));
>>> +        }
>>> +    } else {
>>> +        assert(!(flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY));
>>> +    }
>>> +
>>
>> I thought we were only migrating bitmaps with names?
>> I suppose the conditional can't hurt, but I am not clear on when we
>> won't have a bitmap name here.
> You are right, 'else' case is not possible.. Hmm. I've added it to be
> sure that format is not corrupted, when I decided to put granularity
> only with name. Wi won't have a bitmap name only when we send the same
> bitmap as on the previous send_bitmap() call. May be it will be better
> to use two separate if's without else and assert.

It's okay if it is just "paranoia," but I was just checking. It would 
make a decent assert().

>>
>>> +    qemu_put_be64(f, start_sector);
>>> +    qemu_put_be32(f, nr_sectors);
>>> +
>>> +    /* if a block is zero we need to flush here since the network
>>> +     * bandwidth is now a lot higher than the storage device bandwidth.
>>> +     * thus if we queue zero blocks we slow down the migration.
>>> +     * also, skip writing block when migrate only dirty bitmaps. */
>>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
>>> +        qemu_fflush(f);
>>> +        return;
>>> +    }
>>> +
>>> +    qemu_put_be64(f, buf_size);
>>> +    qemu_put_buffer(f, buf, buf_size);
>>> +    g_free(buf);
>>> +}
>>> +
>>> +
>>> +/* Called with iothread lock taken.  */
>>> +
>>> +static void set_dirty_tracking(void)
>>> +{
>>> +    DirtyBitmapMigBitmapState *dbms;
>>> +
>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>>> +        dbms->dirty_bitmap =
>>> +            bdrv_create_dirty_dirty_bitmap(dbms->bitmap, CHUNK_SIZE);
>>> +    }
>>> +}
>>> +
>>
>> OK: so we only have these dirty-dirty bitmaps when migration is
>> starting, which makes sense.
>>
>>> +static void unset_dirty_tracking(void)
>>> +{
>>> +    DirtyBitmapMigBitmapState *dbms;
>>> +
>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>>> +        bdrv_release_dirty_dirty_bitmap(dbms->bitmap);
>>> +    }
>>> +}
>>> +
>>
>> OK.
>>
>>> +static void init_dirty_bitmap_migration(QEMUFile *f)
>>> +{
>>> +    BlockDriverState *bs;
>>> +    BdrvDirtyBitmap *bitmap;
>>> +    DirtyBitmapMigBitmapState *dbms;
>>> +
>>> +    dirty_bitmap_mig_state.bulk_completed = false;
>>> +    dirty_bitmap_mig_state.prev_bs = NULL;
>>> +    dirty_bitmap_mig_state.prev_bitmap = NULL;
>>> +
>>> +    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
>>> +        for (bitmap = bdrv_next_dirty_bitmap(bs, NULL); bitmap;
>>> +             bitmap = bdrv_next_dirty_bitmap(bs, bitmap)) {
>>> +            if (!bdrv_dirty_bitmap_name(bitmap)) {
>>> +                continue;
>>> +            }
>>> +
>>> +            dbms = g_new0(DirtyBitmapMigBitmapState, 1);
>>> +            dbms->bs = bs;
>>> +            dbms->bitmap = bitmap;
>>> +            dbms->total_sectors = bdrv_nb_sectors(bs);
>>> +            dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
>>> +                bdrv_dirty_bitmap_granularity(dbms->bs, dbms->bitmap)
>>> +                >> BDRV_SECTOR_BITS;
>>> +
>>> + QSIMPLEQ_INSERT_TAIL(&dirty_bitmap_mig_state.dbms_list,
>>> +                                 dbms, entry);
>>> +        }
>>> +    }
>>> +}
>>> +
>>
>> OK, but see the note below for dirty_bitmap_mig_init.
> actually it is not 'init' but 'reinit' - called on every migration
> start.. Hmm. dbms_list should be cleared here before fill it again.
>>
>>> +/* Called with no lock taken.  */
>>> +static void bulk_phase_send_chunk(QEMUFile *f,
>>> DirtyBitmapMigBitmapState *dbms)
>>> +{
>>> +    uint32_t nr_sectors = MIN(dbms->total_sectors - dbms->cur_sector,
>>> +                             dbms->sectors_per_chunk);
>>
>> What about cases where nr_sectors will put us past the end of the
>> bitmap? The bitmap serialization implementation might need a touchup
>> with this in mind.
> I don't understand.. nr_sectors <=  dbms->total_sectors -
> dbms->cur_sector and it can't put us past the end...

Oh, because you take the minimum, so we don't have to worry about 
sectors_per_chunk eclipsing what we have.

Nevermind, I can't read... :(

>>
>>> +
>>> +    send_bitmap(f, dbms, dbms->cur_sector, nr_sectors);
>>> +
>>> +    dbms->cur_sector += nr_sectors;
>>> +    if (dbms->cur_sector >= dbms->total_sectors) {
>>> +        dbms->bulk_completed = true;
>>> +    }
>>> +}
>>> +
>>> +/* Called with no lock taken.  */
>>> +static void bulk_phase(QEMUFile *f, bool limit)
>>> +{
>>> +    DirtyBitmapMigBitmapState *dbms;
>>> +
>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>>> +        while (!dbms->bulk_completed) {
>>> +            bulk_phase_send_chunk(f, dbms);
>>> +            if (limit && qemu_file_rate_limit(f)) {
>>> +                return;
>>> +            }
>>> +        }
>>> +    }
>>> +
>>> +    dirty_bitmap_mig_state.bulk_completed = true;
>>> +}
>>
>> OK.
>>
>>> +
>>> +static void blk_mig_reset_dirty_cursor(void)
>>> +{
>>> +    DirtyBitmapMigBitmapState *dbms;
>>> +
>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>>> +        dbms->cur_dirty = 0;
>>> +    }
>>> +}
>>> +
>>
>> OK.
>>
>>> +/* Called with iothread lock taken.  */
>>> +static void dirty_phase_send_chunk(QEMUFile *f,
>>> DirtyBitmapMigBitmapState *dbms)
>>> +{
>>> +    uint32_t nr_sectors;
>>> +
>>> +    while (dbms->cur_dirty < dbms->total_sectors &&
>>> +           !hbitmap_get(dbms->dirty_bitmap, dbms->cur_dirty)) {
>>> +        dbms->cur_dirty += dbms->sectors_per_chunk;
>>> +    }
>>
>> OK, so we fast forward the dirty cursor while the meta-bitmap is
>> empty. Is it not worth using the HBitmapIterator here? You can reset
>> them everywhere you reset the dirty cursor, and then just fast-seek to
>> the first dirty sector.
> Yes, I've thought about it, just used simpler way (copied from
> migration/block.c) for an early version of the patch set. I will do it.

Only if it doesn't make things more complicated to look at.

>>
>>> +
>>> +    if (dbms->cur_dirty >= dbms->total_sectors) {
>>> +        return;
>>> +    }
>>> +
>>> +    nr_sectors = MIN(dbms->total_sectors - dbms->cur_dirty,
>>> +                     dbms->sectors_per_chunk);
>>
>> What happens if nr_sectors goes past the end?

Again, I misread.

>>
>>> +    send_bitmap(f, dbms, dbms->cur_dirty, nr_sectors);
>>> +    hbitmap_reset(dbms->dirty_bitmap, dbms->cur_dirty,
>>> dbms->sectors_per_chunk);
>>> +    dbms->cur_dirty += nr_sectors;
>>> +}
>>> +
>>> +/* Called with iothread lock taken.
>>> + *
>>> + * return value:
>>> + * 0: too much data for max_downtime
>>> + * 1: few enough data for max_downtime
>>> +*/
>>
>> dirty_phase below doesn't have a return value.
> rudimentary comment.. thanks.
>>
>>> +static void dirty_phase(QEMUFile *f, bool limit)
>>> +{
>>> +    DirtyBitmapMigBitmapState *dbms;
>>> +
>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>>> +        while (dbms->cur_dirty < dbms->total_sectors) {
>>> +            dirty_phase_send_chunk(f, dbms);
>>> +            if (limit && qemu_file_rate_limit(f)) {
>>> +                return;
>>> +            }
>>> +        }
>>> +    }
>>> +}
>>> +
>>
>> OK.
>>
>>> +
>>> +/* Called with iothread lock taken.  */
>>> +static void dirty_bitmap_mig_cleanup(void)
>>> +{
>>> +    DirtyBitmapMigBitmapState *dbms;
>>> +
>>> +    unset_dirty_tracking();
>>> +
>>> +    while ((dbms =
>>> QSIMPLEQ_FIRST(&dirty_bitmap_mig_state.dbms_list)) != NULL) {
>>> + QSIMPLEQ_REMOVE_HEAD(&dirty_bitmap_mig_state.dbms_list, entry);
>>> +        g_free(dbms);
>>> +    }
>>> +}
>>> +
>>
>> OK.
>>
>>> +static void dirty_bitmap_migration_cancel(void *opaque)
>>> +{
>>> +    dirty_bitmap_mig_cleanup();
>>> +}
>>> +
>>
>> OK.
>>
>>> +static int dirty_bitmap_save_iterate(QEMUFile *f, void *opaque)
>>> +{
>>> +    DPRINTF("Enter save live iterate\n");
>>> +
>>> +    blk_mig_reset_dirty_cursor();
>>
>> I suppose this is because it's easier to check if we are finished by
>> starting from sector 0 every time.
>>
>> A harder, but faster method may be: Use HBitmapIterators, but don't
>> reset them every iteration: just iterate until the end, and check that
>> the bitmap is empty. If the meta bitmap is empty, the dirty phase is
>> complete. If the meta bitmap is NOT empty, reset the HBI and continue
>> allowing iterations over the dirty phase.
> Ok, will do.
>>
>>> +
>>> +    if (dirty_bitmap_mig_state.bulk_completed) {
>>> +        qemu_mutex_lock_iothread();
>>> +        dirty_phase(f, true);
>>> +        qemu_mutex_unlock_iothread();
>>> +    } else {
>>> +        bulk_phase(f, true);
>>> +    }
>>> +
>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>> +
>>> +    return dirty_bitmap_mig_state.bulk_completed;
>>> +}
>>> +
>>> +/* Called with iothread lock taken.  */
>>> +
>>> +static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
>>> +{
>>> +    DirtyBitmapMigBitmapState *dbms;
>>> +    DPRINTF("Enter save live complete\n");
>>> +
>>> +    if (!dirty_bitmap_mig_state.bulk_completed) {
>>> +        bulk_phase(f, false);
>>> +    }
>>
>> [Not expertly familiar with savevm:] Under what conditions can this
>> happen?
> This can happen. save_complete will happen when savevm decide that
> pending data size to send is small enough. It was the case for my bugfix
> for migration/block.c about pending. To prevent save_complete when bulk
> phase isn't completed, save_pending returns (in my bugfix for
> migration/block.c) big value. Here I decided to make more honest
> save_pending, so I need to complete (if it doesn't) bulk phase in
> save_complete.

OK, Gotcha.

>>
>>> +
>>> +    blk_mig_reset_dirty_cursor();
>>> +    dirty_phase(f, false);
>>> +
>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>>> +        uint8_t flags = DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME |
>>> +                        DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME |
>>> +                        DIRTY_BITMAP_MIG_FLAG_ENABLED;
>>> +
>>> +        qemu_put_byte(f, flags);
>>> +        qemu_put_name(f, bdrv_get_device_name(dbms->bs));
>>> +        qemu_put_name(f, bdrv_dirty_bitmap_name(dbms->bitmap));
>>> +        qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
>>> +    }
>>> +
>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>> +
>>> +    DPRINTF("Dirty bitmaps migration completed\n");
>>> +
>>> +    dirty_bitmap_mig_cleanup();
>>> +    return 0;
>>> +}
>>> +
>>
>> I suppose we don't need a flag that distinctly SAYS this is the end
>> section, since we can tell by omission of
>> DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK or ZERO_CHUNK.
> Hmm. I think it simplifies the logic (to use EOS after each section).
> And the same approach is in migration/block.c.. It's a question about
> which format is better:  "Each section for dirty_bitmap_load ends with
> EOS" or "Each section for dirty_bitmap_load ends with EOS except the
> last one. The last one may be recognized by absent NORMAL_CHUNK and
> ZERO_CHUNK"
>>
>>> +static uint64_t dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
>>> +                                          uint64_t max_size)
>>> +{
>>> +    DirtyBitmapMigBitmapState *dbms;
>>> +    uint64_t pending = 0;
>>> +
>>> +    qemu_mutex_lock_iothread();
>>> +
>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
>>> +        uint64_t sectors = hbitmap_count(dbms->dirty_bitmap);
>>> +        if (!dbms->bulk_completed) {
>>> +            sectors += dbms->total_sectors - dbms->cur_sector;
>>> +        }
>>> +        pending += bdrv_dirty_bitmap_data_size(dbms->bitmap, sectors);
>>> +    }
>>> +
>>> +    qemu_mutex_unlock_iothread();
>>> +
>>> +    DPRINTF("Enter save live pending %" PRIu64 ", max: %" PRIu64 "\n",
>>> +            pending, max_size);
>>> +    return pending;
>>> +}
>>> +
>>
>> OK.
>>
>>> +static int dirty_bitmap_load(QEMUFile *f, void *opaque, int version_id)
>>> +{
>>> +    int flags;
>>> +
>>> +    static char device_name[256], bitmap_name[256];
>>> +    static BlockDriverState *bs;
>>> +    static BdrvDirtyBitmap *bitmap;
>>> +
>>> +    uint8_t *buf;
>>> +    uint64_t first_sector;
>>> +    uint32_t  nr_sectors;
>>> +    int ret;
>>> +
>>> +    DPRINTF("load start\n");
>>> +
>>> +    do {
>>> +        flags = qemu_get_byte(f);
>>> +        DPRINTF("flags: %x\n", flags);
>>> +
>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
>>> +            qemu_get_name(f, device_name);
>>> +            bs = bdrv_find(device_name);
>>
>> Similar to the above confusion, you may want bdrv_lookup_bs or
>> similar, since we're going to be looking for BDS nodes instead of
>> "devices."
> In this case, should it be changed in migration/block.c too?

[See discussion above!]

>>
>>> +            if (!bs) {
>>> +                fprintf(stderr, "Error: unknown block device '%s'\n",
>>> +                        device_name);
>>> +                return -EINVAL;
>>> +            }
>>> +        }
>>> +
>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
>>> +            if (!bs) {
>>> +                fprintf(stderr, "Error: block device name is not
>>> set\n");
>>> +                return -EINVAL;
>>> +            }
>>> +
>>> +            qemu_get_name(f, bitmap_name);
>>> +            bitmap = bdrv_find_dirty_bitmap(bs, bitmap_name);
>>> +            if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
>>> +                /* First chunk from this bitmap */
>>> +                uint64_t granularity = qemu_get_be64(f);
>>> +                if (!bitmap) {
>>> +                    Error *local_err = NULL;
>>> +                    bitmap = bdrv_create_dirty_bitmap(bs, granularity,
>>> + bitmap_name,
>>> + &local_err);
>>> +                    if (!bitmap) {
>>> +                        error_report("%s",
>>> error_get_pretty(local_err));
>>> +                        error_free(local_err);
>>> +                        return -EINVAL;
>>> +                    }
>>> +                } else {
>>> +                    uint64_t dest_granularity =
>>> +                        bdrv_dirty_bitmap_granularity(bs, bitmap);
>>> +                    if (dest_granularity != granularity) {
>>> +                        fprintf(stderr,
>>> +                                "Error: "
>>> +                                "Migrated bitmap granularity (%"
>>> PRIu64 ") "
>>> +                                "is not match with destination
>>> bitmap '%s' "
>>> +                                "granularity (%" PRIu64 ")\n",
>>> +                                granularity,
>>> +                                bitmap_name,
>>> +                                dest_granularity);
>>> +                        return -EINVAL;
>>> +                    }
>>> +                }
>>> +                bdrv_disable_dirty_bitmap(bitmap);
>>> +            }
>>> +            if (!bitmap) {
>>> +                fprintf(stderr, "Error: unknown dirty bitmap "
>>> +                        "'%s' for block device '%s'\n",
>>> +                        bitmap_name, device_name);
>>> +                return -EINVAL;
>>> +            }
>>> +        }
>>> +
>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_ENABLED) {
>>> +            bool enabled;
>>> +            if (!bitmap) {
>>> +                fprintf(stderr, "Error: dirty bitmap name is not
>>> set\n");
>>> +                return -EINVAL;
>>> +            }
>>> +            bdrv_dirty_bitmap_deserialize_finish(bitmap);
>>> +            /* complete migration */
>>> +            enabled = qemu_get_byte(f);
>>> +            if (enabled) {
>>> +                bdrv_enable_dirty_bitmap(bitmap);
>>> +            }
>>> +        }
>>
>> Oh, so you use the ENABLED flag to show that migration is over.
> Yes, it was bad idea..
>> If we are going to commit to a stream format for bitmaps, though,
>> maybe it's best to actually create a "COMPLETION BLOCK" flag and then
>> split this function into two pieces:
>>
>> (1) The part that receives regular / zero blocks, and
>> (2) The part that receives completion data.
>>
>> That way, if we change the properties that bitmaps have down the line,
>> we aren't reliant on literally the "enabled" flag to decide what to do.
>>
>> Also, it might help make this fairly long function a little smaller
>> and more readable.
> Ok.
>>
>>> +
>>> +        if (flags & (DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK |
>>> +                     DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK)) {
>>> +            if (!bs) {
>>> +                fprintf(stderr, "Error: block device name is not
>>> set\n");
>>> +                return -EINVAL;
>>> +            }
>>> +            if (!bitmap) {
>>> +                fprintf(stderr, "Error: dirty bitmap name is not
>>> set\n");
>>> +                return -EINVAL;
>>> +            }
>>> +
>>> +            first_sector = qemu_get_be64(f);
>>> +            nr_sectors = qemu_get_be32(f);
>>> +            DPRINTF("chunk: %lu %u\n", first_sector, nr_sectors);
>>> +
>>> +
>>> +            if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
>>> +                bdrv_dirty_bitmap_deserialize_part0(bitmap,
>>> first_sector,
>>> + nr_sectors);
>>> +            } else {
>>> +                uint64_t buf_size = qemu_get_be64(f);
>>> +                uint64_t needed_size =
>>> +                    bdrv_dirty_bitmap_data_size(bitmap, nr_sectors);
>>> +
>>> +                if (needed_size > buf_size) {
>>> +                    fprintf(stderr,
>>> +                            "Error: Migrated bitmap granularity is
>>> not "
>>> +                            "match with destination bitmap
>>> granularity\n");
>>> +                    return -EINVAL;
>>> +                }
>>> +
>>
>> "Migrated bitmap granularity doesn't match the destination bitmap
>> granularity" perhaps.
>>
>>> +                buf = g_malloc(buf_size);
>>> +                qemu_get_buffer(f, buf, buf_size);
>>> +                bdrv_dirty_bitmap_deserialize_part(bitmap, buf,
>>> + first_sector,
>>> +                                                   nr_sectors);
>>> +                g_free(buf);
>>> +            }
>>> +        }
>>> +
>>> +        ret = qemu_file_get_error(f);
>>> +        if (ret != 0) {
>>> +            return ret;
>>> +        }
>>> +    } while (!(flags & DIRTY_BITMAP_MIG_FLAG_EOS));
>>> +
>>> +    DPRINTF("load finish\n");
>>> +    return 0;
>>> +}
>>> +
>>> +static void dirty_bitmap_set_params(const MigrationParams *params,
>>> void *opaque)
>>> +{
>>> +    dirty_bitmap_mig_state.migration_enable = params->dirty;
>>> +}
>>> +
>>
>> OK; though I am not immediately aware of what changes need to happen
>> to accommodate Eric's suggestions.
> This function will be dropped in v3.
>>
>>> +static bool dirty_bitmap_is_active(void *opaque)
>>> +{
>>> +    return dirty_bitmap_mig_state.migration_enable == 1;
>>> +}
>>> +
>>
>> OK.
>>
>>> +static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
>>> +{
>>> +    init_dirty_bitmap_migration(f);
>>> +
>>> +    qemu_mutex_lock_iothread();
>>> +    /* start track dirtyness of dirty bitmaps */
>>> +    set_dirty_tracking();
>>> +    qemu_mutex_unlock_iothread();
>>> +
>>> +    blk_mig_reset_dirty_cursor();
>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>
>> OK; see dirty_bitmap_mig_init below, though.
>>
>>> +static SaveVMHandlers savevm_block_handlers = {
>>> +    .set_params = dirty_bitmap_set_params,
>>> +    .save_live_setup = dirty_bitmap_save_setup,
>>> +    .save_live_iterate = dirty_bitmap_save_iterate,
>>> +    .save_live_complete = dirty_bitmap_save_complete,
>>> +    .save_live_pending = dirty_bitmap_save_pending,
>>> +    .load_state = dirty_bitmap_load,
>>> +    .cancel = dirty_bitmap_migration_cancel,
>>> +    .is_active = dirty_bitmap_is_active,
>>> +};
>>> +
>>> +void dirty_bitmap_mig_init(void)
>>> +{
>>> +    QSIMPLEQ_INIT(&dirty_bitmap_mig_state.dbms_list);
>>
>> Maybe I haven't looked thoroughly enough yet, but it's weird that part
>> of the dirty_bitmap_mig_state is initialized here, and the rest of it
>> in init_dirty_bitmap_migration. I'd prefer to keep it all together, if
>> possible.
> dirty_bitmap_mig_init is called one time when qemu starts. QSIMPLEQ_INIT
> should be called once. dirty_bitmap_save_setup is called on every
> migration start, it's like 'reinitialize'.
>>
>>> +
>>> +    register_savevm_live(NULL, "dirty-bitmap", 0, 1,
>>> &savevm_block_handlers,
>>> +                         &dirty_bitmap_mig_state);
>>> +}
>>
>> OK.
>>
>>> diff --git a/vl.c b/vl.c
>>> index a824a7d..dee7220 100644
>>> --- a/vl.c
>>> +++ b/vl.c
>>> @@ -4184,6 +4184,7 @@ int main(int argc, char **argv, char **envp)
>>>
>>>       blk_mig_init();
>>>       ram_mig_init();
>>> +    dirty_bitmap_mig_init();
>>>
>>>       /* If the currently selected machine wishes to override the
>>> units-per-bus
>>>        * property of its default HBA interface type, do so now. */
>>>
>>
>> Hm, since dirty bitmaps are a sub-component of the block layer, would
>> it not make sense to put this hook under blk_mig_init, perhaps?
> IMHO the reason to put it here is to keep all
> register_savevm_live-entities in one place.

If you still feel that way I won't withhold my R-b, but there are 
already other cases such as ppc_spapr_init which are not in this general 
area of vl.c.

Plus the dozens of devices that use register_savevm as a wrapper to 
register_savevm_live, so maybe consolidating calls to this function 
isn't that important.

>>
>>
>> Overall this looks very clean compared to the intermingled format in
>> V1, and the code is organized pretty well. Just a few minor comments,
>> and I'd like to get the opinion of the migration maintainers, but I am
>> happy. Sorry it took me so long to review, please feel free to let me
>> know if you disagree with any of my opinions :)
>>
>> Thank you,
>> --John
>
> Thank you for reviewing my series)
>

Yup. Hopefully I didn't miss too much that will irritate the Migration 
overlords.

Once you respin on top of v12, I can run some thorough migration tests 
on it (perhaps over a weekend) and verify that it survives a couple 
hundred migrations without any kind of integrity loss.

This is what makes sense to me right now, anyway.

Do you think you'll be including the bitmap checksum in the 
BlockDirtyInfo command? That'd be convenient for iotests.

Thank you,
--John.
Vladimir Sementsov-Ogievskiy Feb. 16, 2015, 12:06 p.m. UTC | #7
On 13.02.2015 23:22, John Snow wrote:
>
>
> On 02/13/2015 03:19 AM, Vladimir Sementsov-Ogievskiy wrote:
>> On 11.02.2015 00:33, John Snow wrote:
>>> Peter Maydell: What's the right way to license a file as copied from a
>>> previous version? See below, please;
>>>
>>> Max, Markus: ctrl+f "bdrv_get_device_name" and let me know what you
>>> think, if you would.
>>>
>>> Juan, Amit, David: Copying migration maintainers.
>>>
>>> On 01/27/2015 05:56 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>> Live migration of dirty bitmaps. Only named dirty bitmaps are 
>>>> migrated.
>>>> If destination qemu is already containing a dirty bitmap with the same
>>>> name as a migrated bitmap, then their granularities should be the 
>>>> same,
>>>> otherwise the error will be generated. If destination qemu doesn't
>>>> contain such bitmap it will be created.
>>>>
>>>> format:
>>>>
>>>> 1 byte: flags
>>>>
>>>> [ 1 byte: node name size ] \  flags & DEVICE_NAME
>>>> [ n bytes: node name     ] /
>>>>
>>>> [ 1 byte: bitmap name size ]       \
>>>> [ n bytes: bitmap name     ]       | flags & BITMAP_NAME
>>>> [ [ be64: granularity    ] ]  flags & GRANULARITY
>>>>
>>>> [ 1 byte: bitmap enabled bit ] flags & ENABLED
>>>>
>>>> [ be64: start sector      ] \ flags & (NORMAL_CHUNK | ZERO_CHUNK)
>>>> [ be32: number of sectors ] /
>>>>
>>>> [ be64: buffer size ] \ flags & NORMAL_CHUNK
>>>> [ n bytes: buffer   ] /
>>>>
>>>> The last chunk should contain flags & EOS. The chunk may skip device
>>>> and/or bitmap names, assuming them to be the same with the previous
>>>> chunk. GRANULARITY is sent with the first chunk for the bitmap. 
>>>> ENABLED
>>>> bit is sent in the end of "complete" stage of migration. So when
>>>> destination gets ENABLED flag it should deserialize_finish the bitmap
>>>> and set its enabled bit to corresponding value.
>>>>
>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@parallels.com>
>>>> ---
>>>>   include/migration/block.h |   1 +
>>>>   migration/Makefile.objs   |   2 +-
>>>>   migration/dirty-bitmap.c  | 606
>>>> ++++++++++++++++++++++++++++++++++++++++++++++
>>>>   vl.c                      |   1 +
>>>>   4 files changed, 609 insertions(+), 1 deletion(-)
>>>>   create mode 100644 migration/dirty-bitmap.c
>>>>
>>>> diff --git a/include/migration/block.h b/include/migration/block.h
>>>> index ffa8ac0..566bb9f 100644
>>>> --- a/include/migration/block.h
>>>> +++ b/include/migration/block.h
>>>> @@ -14,6 +14,7 @@
>>>>   #ifndef BLOCK_MIGRATION_H
>>>>   #define BLOCK_MIGRATION_H
>>>>
>>>> +void dirty_bitmap_mig_init(void);
>>>>   void blk_mig_init(void);
>>>>   int blk_mig_active(void);
>>>>   uint64_t blk_mig_bytes_transferred(void);
>>>
>>> OK.
>>>
>>>> diff --git a/migration/Makefile.objs b/migration/Makefile.objs
>>>> index d929e96..9adfda9 100644
>>>> --- a/migration/Makefile.objs
>>>> +++ b/migration/Makefile.objs
>>>> @@ -6,5 +6,5 @@ common-obj-y += xbzrle.o
>>>>   common-obj-$(CONFIG_RDMA) += rdma.o
>>>>   common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o
>>>>
>>>> -common-obj-y += block.o
>>>> +common-obj-y += block.o dirty-bitmap.o
>>>>
>>>
>>> OK.
>>>
>>>> diff --git a/migration/dirty-bitmap.c b/migration/dirty-bitmap.c
>>>> new file mode 100644
>>>> index 0000000..8621218
>>>> --- /dev/null
>>>> +++ b/migration/dirty-bitmap.c
>>>> @@ -0,0 +1,606 @@
>>>> +/*
>>>> + * QEMU dirty bitmap migration
>>>> + *
>>>> + * derived from migration/block.c
>>>> + *
>>>> + * Author:
>>>> + * Sementsov-Ogievskiy Vladimir <vsementsov@parallels.com>
>>>> + *
>>>> + * original copyright message:
>>>> + *
>>>> =====================================================================
>>>> + * Copyright IBM, Corp. 2009
>>>> + *
>>>> + * Authors:
>>>> + *  Liran Schour <lirans@il.ibm.com>
>>>> + *
>>>> + * This work is licensed under the terms of the GNU GPL, version 
>>>> 2. See
>>>> + * the COPYING file in the top-level directory.
>>>> + *
>>>> + * Contributions after 2012-01-13 are licensed under the terms of the
>>>> + * GNU GPL, version 2 or (at your option) any later version.
>>>> + *
>>>> =====================================================================
>>>> + */
>>>> +
>>>
>>> Not super familiar with the right way to do licensing here; it's
>>> possible you may not need to copy the original here, but I'm not sure.
>>> You will want to make it clear what license applies to /your/ work, I
>>> think. Maybe Peter Maydell can clue us in.
>>>
>>>> +#include "block/block.h"
>>>> +#include "qemu/main-loop.h"
>>>> +#include "qemu/error-report.h"
>>>> +#include "migration/block.h"
>>>> +#include "migration/migration.h"
>>>> +#include "qemu/hbitmap.h"
>>>> +#include <assert.h>
>>>> +
>>>> +#define CHUNK_SIZE                       (1 << 20)
>>>> +
>>>> +#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
>>>> +#define DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK  0x02
>>>> +#define DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK    0x04
>>>> +#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x08
>>>> +#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x10
>>>> +#define DIRTY_BITMAP_MIG_FLAG_GRANULARITY   0x20
>>>> +#define DIRTY_BITMAP_MIG_FLAG_ENABLED       0x40
>>>> +/* flags should be <= 0xff */
>>>> +
>>>
>>> We should give ourselves a little breathing room with the flags, since
>>> we've only got room for one more.
>> Ok. Will one more byte be enough?
>
> I should hope so. If you do add a completion chunk and flag, that 
> fills up the first byte completely, so having a second byte is a good 
> idea.
>
> I might recommend reserving the last bit of the second byte to be a 
> flag such as DIRTY_BITMAP_EXTRA_FLAGS that indicates the presence of 
> additional byte(s) of flags, to be determined later, if we ever need 
> them, but two bytes for now should be sufficient.
Ok.
>
>>>
>>>> +/* #define DEBUG_DIRTY_BITMAP_MIGRATION */
>>>> +
>>>> +#ifdef DEBUG_DIRTY_BITMAP_MIGRATION
>>>> +#define DPRINTF(fmt, ...) \
>>>> +    do { printf("dirty_migration: " fmt, ## __VA_ARGS__); } while (0)
>>>> +#else
>>>> +#define DPRINTF(fmt, ...) \
>>>> +    do { } while (0)
>>>> +#endif
>>>> +
>>>> +typedef struct DirtyBitmapMigBitmapState {
>>>> +    /* Written during setup phase. */
>>>> +    BlockDriverState *bs;
>>>> +    BdrvDirtyBitmap *bitmap;
>>>> +    HBitmap *dirty_bitmap;
>>>
>>> For my own sanity, I'd really prefer "bitmap" and "meta_bitmap" here;
>>> "dirty_bitmap" is often used as a synonym (outside of this file) to
>>> refer to the BdrvDirtyBitmap in general, so it's usage here can be
>>> somewhat confusing.
>>>
>>> I'd appreciate "dirty_dirty_bitmap" as in your previous patch for
>>> consistency, or "meta_bitmap" as I recommend.
>>>
>> Ok
>>>> +    int64_t total_sectors;
>>>> +    uint64_t sectors_per_chunk;
>>>> +    QSIMPLEQ_ENTRY(DirtyBitmapMigBitmapState) entry;
>>>> +
>>>> +    /* For bulk phase. */
>>>> +    bool bulk_completed;
>>>> +    int64_t cur_sector;
>>>> +    bool granularity_sent;
>>>> +
>>>> +    /* For dirty phase. */
>>>> +    int64_t cur_dirty;
>>>> +} DirtyBitmapMigBitmapState;
>>>> +
>>>> +typedef struct DirtyBitmapMigState {
>>>> +    int migration_enable;
>>>> +    QSIMPLEQ_HEAD(dbms_list, DirtyBitmapMigBitmapState) dbms_list;
>>>> +
>>>> +    bool bulk_completed;
>>>> +
>>>> +    /* for send_bitmap() */
>>>> +    BlockDriverState *prev_bs;
>>>> +    BdrvDirtyBitmap *prev_bitmap;
>>>> +} DirtyBitmapMigState;
>>>> +
>>>> +static DirtyBitmapMigState dirty_bitmap_mig_state;
>>>> +
>>>> +/* read name from qemu file:
>>>> + * format:
>>>> + * 1 byte : len = name length (<256)
>>>> + * len bytes : name without last zero byte
>>>> + *
>>>> + * name should point to the buffer >= 256 bytes length
>>>> + */
>>>> +static char *qemu_get_name(QEMUFile *f, char *name)
>>>> +{
>>>> +    int len = qemu_get_byte(f);
>>>> +    qemu_get_buffer(f, (uint8_t *)name, len);
>>>> +    name[len] = '\0';
>>>> +
>>>> +    DPRINTF("get name: %d %s\n", len, name);
>>>> +
>>>> +    return name;
>>>> +}
>>>> +
>>>
>>> OK. Maybe these could be "qemu_put_string" or "qemu_get_string" and
>>> added to qemu-file.c so others can use them.
>> If no objections for sharing this format, I'll do it.
>>>
>>>> +/* write name to qemu file:
>>>> + * format:
>>>> + * same as for qemu_get_name
>>>> + *
>>>> + * maximum name length is 255
>>>> + */
>>>> +static void qemu_put_name(QEMUFile *f, const char *name)
>>>> +{
>>>> +    int len = strlen(name);
>>>> +
>>>> +    DPRINTF("put name: %d %s\n", len, name);
>>>> +
>>>> +    assert(len < 256);
>>>> +    qemu_put_byte(f, len);
>>>> +    qemu_put_buffer(f, (const uint8_t *)name, len);
>>>> +}
>>>> +
>>>
>>> OK.
>>>
>>>> +static void send_bitmap(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
>>>> +                        uint64_t start_sector, uint32_t nr_sectors)
>>>> +{
>>>> +    BlockDriverState *bs = dbms->bs;
>>>> +    BdrvDirtyBitmap *bitmap = dbms->bitmap;
>>>> +    uint8_t flags = 0;
>>>> +    /* align for buffer_is_zero() */
>>>> +    uint64_t align = 4 * sizeof(long);
>>>> +    uint64_t buf_size =
>>>> +        (bdrv_dirty_bitmap_data_size(bitmap, nr_sectors) + align - 
>>>> 1) &
>>>> +        ~(align - 1);
>>>> +    uint8_t *buf = g_malloc0(buf_size);
>>>> +
>>>> +    bdrv_dirty_bitmap_serialize_part(bitmap, buf, start_sector,
>>>> nr_sectors);
>>>> +
>>>> +    if (buffer_is_zero(buf, buf_size)) {
>>>> +        g_free(buf);
>>>> +        buf = NULL;
>>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK;
>>>> +    } else {
>>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK;
>>>> +    }
>>>> +
>>>> +    if (bs != dirty_bitmap_mig_state.prev_bs) {
>>>> +        dirty_bitmap_mig_state.prev_bs = bs;
>>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME;
>>>> +    }
>>>> +
>>>> +    if (bitmap != dirty_bitmap_mig_state.prev_bitmap) {
>>>> +        dirty_bitmap_mig_state.prev_bitmap = bitmap;
>>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME;
>>>> +    }
>>>
>>> OK, so we use the current bs/bitmap under consideration to detect if
>>> we have switched context, and put the names in the stream when it
>>> happens. OK.
>>>
>>>> +
>>>> +    if (dbms->granularity_sent == 0) {
>>>> +        dbms->granularity_sent = 1;
>>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_GRANULARITY;
>>>> +    }
>>>> +
>>>> +    DPRINTF("Enter send_bitmap"
>>>> +            "\n   flags:        %x"
>>>> +            "\n   start_sector: %" PRIu64
>>>> +            "\n   nr_sectors:   %" PRIu32
>>>> +            "\n   data_size:    %" PRIu64 "\n",
>>>> +            flags, start_sector, nr_sectors, buf_size);
>>>> +
>>>> +    qemu_put_byte(f, flags);
>>>> +
>>>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
>>>> +        qemu_put_name(f, bdrv_get_device_name(bs));
>>>> +    }
>>>
>>> I am still not fully clear on this myself, but I think we are phasing
>>> out bdrv_get_device_name. In the context of bitmaps, we are mostly
>>> likely using them one-per-tree, but they /can/ be attached
>>> one-per-node, so we shouldn't be trying to get the device name here,
>>> but rather, the node-name.
>>>
>>> I /think/ we may want to be using bdrv_get_node_name, but I think it
>>> is not currently true that all nodes WILL be named ... I am CC'ing
>>> Markus Armbruster and Max Reitz for (A) A refresher course and (B)
>>> Opinions on what function call might make sense here, given that we
>>> wish to migrate bitmaps attached to arbitrary nodes.
>> Hmm.. I'm not familiar with hierarchy of nodes and devices. As I
>> understand, both command_line- and qmp- created drives are created
>> through blockdev_init, which creates both blk(device) and bs(node)
>> through blk_new_with_bs.. Am I right? Also, bdrv_get_device_name is used
>> in migration/block.c.
>
> Now that I'm more awake, here's a better rundown of what's going on:
>
> It's something that is a little bit in flux right now, unfortunately. 
> We're trying to transition to a format where we have arbitrarily 
> complex Block trees, where the root of the tree is always a 
> BlockBackend (See the big series by Max Reitz) and the configuration 
> of the tree may become arbitrarily complex.
>
> Simple trees may consist of just one BlockBackend and one 
> BlockDriverState node, where I think we can refer to this BDS as the 
> "root node," not to be confused with the BlockBackend "root." The 
> BlockBackend is a relatively new invention, so it isn't actually used 
> consistently everywhere yet.
>
> In the future, we may have commands that make distinctions based on if 
> you want to work on the BlockBackend, the root node under the 
> blockbackend associated with a BDS, only the explicit node/BDS you 
> identify, or some combination of the above semantics.
>
> As of right now, bitmaps can be *attached* to any arbitrary node, 
> though they are currently only *useful* when attached to the first 
> child of the BlockBackend, the root node. It's only useful currently 
> in cases where it is attached to the root because I've only proposed  
> patches for adding bitmap support to produce incrementals for Drive 
> Backup, which operates only on drives/devices (the root node of a tree.)
>
> However, in the context of migrating, it could be that we want to 
> migrate any bitmaps attached to /any/ nodes, so we should be careful 
> about what names we are pulling - we don't necessarily want the name 
> of the root node or BlockBackend, we may want the BDS and accompanying 
> name of strictly the node the bitmap is attached to.
>
> I know other areas of the code don't provide a good example for this 
> distinction, yet, but the block layer people are actively working on 
> fixing that. (See also the back-and-forth reviews for what to name my 
> QMP parameters in the incremental backup patches for some overview of 
> this semantic transition.)
>
> That said, We should think carefully about *which* name we want to put 
> in the stream and what implications it has for migration.
>
>
> (1) bdrv_get_node_name and bdrv_find_node
>
> This would migrate bitmaps as attached to their specific BDS. This 
> would mean that the node layout on the destination is either 
> identical, or similar enough such that no named bitmaps are attached 
> to a node not present on the destination.
>
> This gives us precision: bitmaps may be attached lower in the tree and 
> can provide more fine-grained detail for which layers have been 
> changed or modified during runtime.
>
> This also gives us fragility: In cases where we transfer, say, a 
> complex tree of nodes and collapse it to a single destination drive, 
> we'd be unable to migrate bitmaps not attached to the root along with 
> it, because they'd have nowhere meaningful to attach.
>
> It is perhaps somewhat unneccessary at this exact moment in time, as 
> well, because bitmaps are currently only useful on root nodes.
>
> (2) bdrv_get_device_name and bdrv_lookup_bs(device_name, NULL, errp)
>
> This would migrate any bitmaps in a tree and attach them to the entire 
> drive on the destination.
>
> This is simpler: You just need to make sure that the root nodes have 
> the same names, which is a lot easier to manage.
>
> This matches how drive migration currently appears to work: The entire 
> tree appears to be generally squashed into a single node and 
> transferred cluster-by-cluster, without general consideration as to 
> the layout of the local block tree. As we both know by now, none of 
> the metadata is transferred, just the data.
>
> It prevents migration of just bitmaps where you WANT the extra 
> complexity: If a bitmap is attached lower in the tree, re-affixing it 
> to the root of a destination tree might invalidate the semantics of 
> what that bitmap was meant to track, and it may become useless.
>
>
> So in summary:
> using device names is probably fine for now, as it matches the current 
> use case of bitmaps as well as drive migration; but using node names 
> may give us more power and precision later.
>
> I talked to Max about it, and he is leaning towards using device names 
> for now and switching to node names if we decide we want that power.
>
> (...I wonder if we could use a flag, for now, that says we're 
> including DEVICE names. Later, we could add a flag that says we're 
> using NODE names and add an option to toggle as the usage case sees fit.)
>
>
> Are you confused yet? :D
O, thanks for the explanation). Are we really need this flag? As Markus 
wrote, nodes and devices are sharing namespaces.. We can use 
bdrv_lookup_bs(name, name, errp)..

Also, we can, for example, send bitmaps as follows:

if node has name - send bitmap with this name
if node is root, but hasn't name - send it with blk name
otherwise - don't send the bitmap
>
>
>>>
>>>> +
>>>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
>>>> +        qemu_put_name(f, bdrv_dirty_bitmap_name(bitmap));
>>>> +
>>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
>>>> +            qemu_put_be64(f, bdrv_dirty_bitmap_granularity(bs,
>>>> bitmap));
>>>> +        }
>>>> +    } else {
>>>> +        assert(!(flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY));
>>>> +    }
>>>> +
>>>
>>> I thought we were only migrating bitmaps with names?
>>> I suppose the conditional can't hurt, but I am not clear on when we
>>> won't have a bitmap name here.
>> You are right, 'else' case is not possible.. Hmm. I've added it to be
>> sure that format is not corrupted, when I decided to put granularity
>> only with name. Wi won't have a bitmap name only when we send the same
>> bitmap as on the previous send_bitmap() call. May be it will be better
>> to use two separate if's without else and assert.
>
> It's okay if it is just "paranoia," but I was just checking. It would 
> make a decent assert().
>
>>>
>>>> +    qemu_put_be64(f, start_sector);
>>>> +    qemu_put_be32(f, nr_sectors);
>>>> +
>>>> +    /* if a block is zero we need to flush here since the network
>>>> +     * bandwidth is now a lot higher than the storage device 
>>>> bandwidth.
>>>> +     * thus if we queue zero blocks we slow down the migration.
>>>> +     * also, skip writing block when migrate only dirty bitmaps. */
>>>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
>>>> +        qemu_fflush(f);
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    qemu_put_be64(f, buf_size);
>>>> +    qemu_put_buffer(f, buf, buf_size);
>>>> +    g_free(buf);
>>>> +}
>>>> +
>>>> +
>>>> +/* Called with iothread lock taken.  */
>>>> +
>>>> +static void set_dirty_tracking(void)
>>>> +{
>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>> +
>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, 
>>>> entry) {
>>>> +        dbms->dirty_bitmap =
>>>> +            bdrv_create_dirty_dirty_bitmap(dbms->bitmap, CHUNK_SIZE);
>>>> +    }
>>>> +}
>>>> +
>>>
>>> OK: so we only have these dirty-dirty bitmaps when migration is
>>> starting, which makes sense.
>>>
>>>> +static void unset_dirty_tracking(void)
>>>> +{
>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>> +
>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, 
>>>> entry) {
>>>> +        bdrv_release_dirty_dirty_bitmap(dbms->bitmap);
>>>> +    }
>>>> +}
>>>> +
>>>
>>> OK.
>>>
>>>> +static void init_dirty_bitmap_migration(QEMUFile *f)
>>>> +{
>>>> +    BlockDriverState *bs;
>>>> +    BdrvDirtyBitmap *bitmap;
>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>> +
>>>> +    dirty_bitmap_mig_state.bulk_completed = false;
>>>> +    dirty_bitmap_mig_state.prev_bs = NULL;
>>>> +    dirty_bitmap_mig_state.prev_bitmap = NULL;
>>>> +
>>>> +    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
>>>> +        for (bitmap = bdrv_next_dirty_bitmap(bs, NULL); bitmap;
>>>> +             bitmap = bdrv_next_dirty_bitmap(bs, bitmap)) {
>>>> +            if (!bdrv_dirty_bitmap_name(bitmap)) {
>>>> +                continue;
>>>> +            }
>>>> +
>>>> +            dbms = g_new0(DirtyBitmapMigBitmapState, 1);
>>>> +            dbms->bs = bs;
>>>> +            dbms->bitmap = bitmap;
>>>> +            dbms->total_sectors = bdrv_nb_sectors(bs);
>>>> +            dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
>>>> +                bdrv_dirty_bitmap_granularity(dbms->bs, dbms->bitmap)
>>>> +                >> BDRV_SECTOR_BITS;
>>>> +
>>>> + QSIMPLEQ_INSERT_TAIL(&dirty_bitmap_mig_state.dbms_list,
>>>> +                                 dbms, entry);
>>>> +        }
>>>> +    }
>>>> +}
>>>> +
>>>
>>> OK, but see the note below for dirty_bitmap_mig_init.
>> actually it is not 'init' but 'reinit' - called on every migration
>> start.. Hmm. dbms_list should be cleared here before fill it again.
>>>
>>>> +/* Called with no lock taken.  */
>>>> +static void bulk_phase_send_chunk(QEMUFile *f,
>>>> DirtyBitmapMigBitmapState *dbms)
>>>> +{
>>>> +    uint32_t nr_sectors = MIN(dbms->total_sectors - dbms->cur_sector,
>>>> +                             dbms->sectors_per_chunk);
>>>
>>> What about cases where nr_sectors will put us past the end of the
>>> bitmap? The bitmap serialization implementation might need a touchup
>>> with this in mind.
>> I don't understand.. nr_sectors <=  dbms->total_sectors -
>> dbms->cur_sector and it can't put us past the end...
>
> Oh, because you take the minimum, so we don't have to worry about 
> sectors_per_chunk eclipsing what we have.
>
> Nevermind, I can't read... :(
>
>>>
>>>> +
>>>> +    send_bitmap(f, dbms, dbms->cur_sector, nr_sectors);
>>>> +
>>>> +    dbms->cur_sector += nr_sectors;
>>>> +    if (dbms->cur_sector >= dbms->total_sectors) {
>>>> +        dbms->bulk_completed = true;
>>>> +    }
>>>> +}
>>>> +
>>>> +/* Called with no lock taken.  */
>>>> +static void bulk_phase(QEMUFile *f, bool limit)
>>>> +{
>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>> +
>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, 
>>>> entry) {
>>>> +        while (!dbms->bulk_completed) {
>>>> +            bulk_phase_send_chunk(f, dbms);
>>>> +            if (limit && qemu_file_rate_limit(f)) {
>>>> +                return;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +
>>>> +    dirty_bitmap_mig_state.bulk_completed = true;
>>>> +}
>>>
>>> OK.
>>>
>>>> +
>>>> +static void blk_mig_reset_dirty_cursor(void)
>>>> +{
>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>> +
>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, 
>>>> entry) {
>>>> +        dbms->cur_dirty = 0;
>>>> +    }
>>>> +}
>>>> +
>>>
>>> OK.
>>>
>>>> +/* Called with iothread lock taken. */
>>>> +static void dirty_phase_send_chunk(QEMUFile *f,
>>>> DirtyBitmapMigBitmapState *dbms)
>>>> +{
>>>> +    uint32_t nr_sectors;
>>>> +
>>>> +    while (dbms->cur_dirty < dbms->total_sectors &&
>>>> +           !hbitmap_get(dbms->dirty_bitmap, dbms->cur_dirty)) {
>>>> +        dbms->cur_dirty += dbms->sectors_per_chunk;
>>>> +    }
>>>
>>> OK, so we fast forward the dirty cursor while the meta-bitmap is
>>> empty. Is it not worth using the HBitmapIterator here? You can reset
>>> them everywhere you reset the dirty cursor, and then just fast-seek to
>>> the first dirty sector.
>> Yes, I've thought about it, just used simpler way (copied from
>> migration/block.c) for an early version of the patch set. I will do it.
>
> Only if it doesn't make things more complicated to look at.
>
>>>
>>>> +
>>>> +    if (dbms->cur_dirty >= dbms->total_sectors) {
>>>> +        return;
>>>> +    }
>>>> +
>>>> +    nr_sectors = MIN(dbms->total_sectors - dbms->cur_dirty,
>>>> +                     dbms->sectors_per_chunk);
>>>
>>> What happens if nr_sectors goes past the end?
>
> Again, I misread.
>
>>>
>>>> +    send_bitmap(f, dbms, dbms->cur_dirty, nr_sectors);
>>>> +    hbitmap_reset(dbms->dirty_bitmap, dbms->cur_dirty,
>>>> dbms->sectors_per_chunk);
>>>> +    dbms->cur_dirty += nr_sectors;
>>>> +}
>>>> +
>>>> +/* Called with iothread lock taken.
>>>> + *
>>>> + * return value:
>>>> + * 0: too much data for max_downtime
>>>> + * 1: few enough data for max_downtime
>>>> +*/
>>>
>>> dirty_phase below doesn't have a return value.
>> rudimentary comment.. thanks.
>>>
>>>> +static void dirty_phase(QEMUFile *f, bool limit)
>>>> +{
>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>> +
>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, 
>>>> entry) {
>>>> +        while (dbms->cur_dirty < dbms->total_sectors) {
>>>> +            dirty_phase_send_chunk(f, dbms);
>>>> +            if (limit && qemu_file_rate_limit(f)) {
>>>> +                return;
>>>> +            }
>>>> +        }
>>>> +    }
>>>> +}
>>>> +
>>>
>>> OK.
>>>
>>>> +
>>>> +/* Called with iothread lock taken.  */
>>>> +static void dirty_bitmap_mig_cleanup(void)
>>>> +{
>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>> +
>>>> +    unset_dirty_tracking();
>>>> +
>>>> +    while ((dbms =
>>>> QSIMPLEQ_FIRST(&dirty_bitmap_mig_state.dbms_list)) != NULL) {
>>>> + QSIMPLEQ_REMOVE_HEAD(&dirty_bitmap_mig_state.dbms_list, entry);
>>>> +        g_free(dbms);
>>>> +    }
>>>> +}
>>>> +
>>>
>>> OK.
>>>
>>>> +static void dirty_bitmap_migration_cancel(void *opaque)
>>>> +{
>>>> +    dirty_bitmap_mig_cleanup();
>>>> +}
>>>> +
>>>
>>> OK.
>>>
>>>> +static int dirty_bitmap_save_iterate(QEMUFile *f, void *opaque)
>>>> +{
>>>> +    DPRINTF("Enter save live iterate\n");
>>>> +
>>>> +    blk_mig_reset_dirty_cursor();
>>>
>>> I suppose this is because it's easier to check if we are finished by
>>> starting from sector 0 every time.
>>>
>>> A harder, but faster method may be: Use HBitmapIterators, but don't
>>> reset them every iteration: just iterate until the end, and check that
>>> the bitmap is empty. If the meta bitmap is empty, the dirty phase is
>>> complete. If the meta bitmap is NOT empty, reset the HBI and continue
>>> allowing iterations over the dirty phase.
>> Ok, will do.
>>>
>>>> +
>>>> +    if (dirty_bitmap_mig_state.bulk_completed) {
>>>> +        qemu_mutex_lock_iothread();
>>>> +        dirty_phase(f, true);
>>>> +        qemu_mutex_unlock_iothread();
>>>> +    } else {
>>>> +        bulk_phase(f, true);
>>>> +    }
>>>> +
>>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>>> +
>>>> +    return dirty_bitmap_mig_state.bulk_completed;
>>>> +}
>>>> +
>>>> +/* Called with iothread lock taken.  */
>>>> +
>>>> +static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
>>>> +{
>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>> +    DPRINTF("Enter save live complete\n");
>>>> +
>>>> +    if (!dirty_bitmap_mig_state.bulk_completed) {
>>>> +        bulk_phase(f, false);
>>>> +    }
>>>
>>> [Not expertly familiar with savevm:] Under what conditions can this
>>> happen?
>> This can happen. save_complete will happen when savevm decide that
>> pending data size to send is small enough. It was the case for my bugfix
>> for migration/block.c about pending. To prevent save_complete when bulk
>> phase isn't completed, save_pending returns (in my bugfix for
>> migration/block.c) big value. Here I decided to make more honest
>> save_pending, so I need to complete (if it doesn't) bulk phase in
>> save_complete.
>
> OK, Gotcha.
>
>>>
>>>> +
>>>> +    blk_mig_reset_dirty_cursor();
>>>> +    dirty_phase(f, false);
>>>> +
>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, 
>>>> entry) {
>>>> +        uint8_t flags = DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME |
>>>> +                        DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME |
>>>> +                        DIRTY_BITMAP_MIG_FLAG_ENABLED;
>>>> +
>>>> +        qemu_put_byte(f, flags);
>>>> +        qemu_put_name(f, bdrv_get_device_name(dbms->bs));
>>>> +        qemu_put_name(f, bdrv_dirty_bitmap_name(dbms->bitmap));
>>>> +        qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
>>>> +    }
>>>> +
>>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>>> +
>>>> +    DPRINTF("Dirty bitmaps migration completed\n");
>>>> +
>>>> +    dirty_bitmap_mig_cleanup();
>>>> +    return 0;
>>>> +}
>>>> +
>>>
>>> I suppose we don't need a flag that distinctly SAYS this is the end
>>> section, since we can tell by omission of
>>> DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK or ZERO_CHUNK.
>> Hmm. I think it simplifies the logic (to use EOS after each section).
>> And the same approach is in migration/block.c.. It's a question about
>> which format is better:  "Each section for dirty_bitmap_load ends with
>> EOS" or "Each section for dirty_bitmap_load ends with EOS except the
>> last one. The last one may be recognized by absent NORMAL_CHUNK and
>> ZERO_CHUNK"
>>>
>>>> +static uint64_t dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
>>>> +                                          uint64_t max_size)
>>>> +{
>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>> +    uint64_t pending = 0;
>>>> +
>>>> +    qemu_mutex_lock_iothread();
>>>> +
>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, 
>>>> entry) {
>>>> +        uint64_t sectors = hbitmap_count(dbms->dirty_bitmap);
>>>> +        if (!dbms->bulk_completed) {
>>>> +            sectors += dbms->total_sectors - dbms->cur_sector;
>>>> +        }
>>>> +        pending += bdrv_dirty_bitmap_data_size(dbms->bitmap, 
>>>> sectors);
>>>> +    }
>>>> +
>>>> +    qemu_mutex_unlock_iothread();
>>>> +
>>>> +    DPRINTF("Enter save live pending %" PRIu64 ", max: %" PRIu64 
>>>> "\n",
>>>> +            pending, max_size);
>>>> +    return pending;
>>>> +}
>>>> +
>>>
>>> OK.
>>>
>>>> +static int dirty_bitmap_load(QEMUFile *f, void *opaque, int 
>>>> version_id)
>>>> +{
>>>> +    int flags;
>>>> +
>>>> +    static char device_name[256], bitmap_name[256];
>>>> +    static BlockDriverState *bs;
>>>> +    static BdrvDirtyBitmap *bitmap;
>>>> +
>>>> +    uint8_t *buf;
>>>> +    uint64_t first_sector;
>>>> +    uint32_t  nr_sectors;
>>>> +    int ret;
>>>> +
>>>> +    DPRINTF("load start\n");
>>>> +
>>>> +    do {
>>>> +        flags = qemu_get_byte(f);
>>>> +        DPRINTF("flags: %x\n", flags);
>>>> +
>>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
>>>> +            qemu_get_name(f, device_name);
>>>> +            bs = bdrv_find(device_name);
>>>
>>> Similar to the above confusion, you may want bdrv_lookup_bs or
>>> similar, since we're going to be looking for BDS nodes instead of
>>> "devices."
>> In this case, should it be changed in migration/block.c too?
>
> [See discussion above!]
>
>>>
>>>> +            if (!bs) {
>>>> +                fprintf(stderr, "Error: unknown block device '%s'\n",
>>>> +                        device_name);
>>>> +                return -EINVAL;
>>>> +            }
>>>> +        }
>>>> +
>>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
>>>> +            if (!bs) {
>>>> +                fprintf(stderr, "Error: block device name is not
>>>> set\n");
>>>> +                return -EINVAL;
>>>> +            }
>>>> +
>>>> +            qemu_get_name(f, bitmap_name);
>>>> +            bitmap = bdrv_find_dirty_bitmap(bs, bitmap_name);
>>>> +            if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
>>>> +                /* First chunk from this bitmap */
>>>> +                uint64_t granularity = qemu_get_be64(f);
>>>> +                if (!bitmap) {
>>>> +                    Error *local_err = NULL;
>>>> +                    bitmap = bdrv_create_dirty_bitmap(bs, 
>>>> granularity,
>>>> + bitmap_name,
>>>> + &local_err);
>>>> +                    if (!bitmap) {
>>>> +                        error_report("%s",
>>>> error_get_pretty(local_err));
>>>> +                        error_free(local_err);
>>>> +                        return -EINVAL;
>>>> +                    }
>>>> +                } else {
>>>> +                    uint64_t dest_granularity =
>>>> +                        bdrv_dirty_bitmap_granularity(bs, bitmap);
>>>> +                    if (dest_granularity != granularity) {
>>>> +                        fprintf(stderr,
>>>> +                                "Error: "
>>>> +                                "Migrated bitmap granularity (%"
>>>> PRIu64 ") "
>>>> +                                "is not match with destination
>>>> bitmap '%s' "
>>>> +                                "granularity (%" PRIu64 ")\n",
>>>> +                                granularity,
>>>> +                                bitmap_name,
>>>> +                                dest_granularity);
>>>> +                        return -EINVAL;
>>>> +                    }
>>>> +                }
>>>> +                bdrv_disable_dirty_bitmap(bitmap);
>>>> +            }
>>>> +            if (!bitmap) {
>>>> +                fprintf(stderr, "Error: unknown dirty bitmap "
>>>> +                        "'%s' for block device '%s'\n",
>>>> +                        bitmap_name, device_name);
>>>> +                return -EINVAL;
>>>> +            }
>>>> +        }
>>>> +
>>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_ENABLED) {
>>>> +            bool enabled;
>>>> +            if (!bitmap) {
>>>> +                fprintf(stderr, "Error: dirty bitmap name is not
>>>> set\n");
>>>> +                return -EINVAL;
>>>> +            }
>>>> +            bdrv_dirty_bitmap_deserialize_finish(bitmap);
>>>> +            /* complete migration */
>>>> +            enabled = qemu_get_byte(f);
>>>> +            if (enabled) {
>>>> +                bdrv_enable_dirty_bitmap(bitmap);
>>>> +            }
>>>> +        }
>>>
>>> Oh, so you use the ENABLED flag to show that migration is over.
>> Yes, it was bad idea..
>>> If we are going to commit to a stream format for bitmaps, though,
>>> maybe it's best to actually create a "COMPLETION BLOCK" flag and then
>>> split this function into two pieces:
>>>
>>> (1) The part that receives regular / zero blocks, and
>>> (2) The part that receives completion data.
>>>
>>> That way, if we change the properties that bitmaps have down the line,
>>> we aren't reliant on literally the "enabled" flag to decide what to do.
>>>
>>> Also, it might help make this fairly long function a little smaller
>>> and more readable.
>> Ok.
>>>
>>>> +
>>>> +        if (flags & (DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK |
>>>> +                     DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK)) {
>>>> +            if (!bs) {
>>>> +                fprintf(stderr, "Error: block device name is not
>>>> set\n");
>>>> +                return -EINVAL;
>>>> +            }
>>>> +            if (!bitmap) {
>>>> +                fprintf(stderr, "Error: dirty bitmap name is not
>>>> set\n");
>>>> +                return -EINVAL;
>>>> +            }
>>>> +
>>>> +            first_sector = qemu_get_be64(f);
>>>> +            nr_sectors = qemu_get_be32(f);
>>>> +            DPRINTF("chunk: %lu %u\n", first_sector, nr_sectors);
>>>> +
>>>> +
>>>> +            if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
>>>> +                bdrv_dirty_bitmap_deserialize_part0(bitmap,
>>>> first_sector,
>>>> + nr_sectors);
>>>> +            } else {
>>>> +                uint64_t buf_size = qemu_get_be64(f);
>>>> +                uint64_t needed_size =
>>>> +                    bdrv_dirty_bitmap_data_size(bitmap, nr_sectors);
>>>> +
>>>> +                if (needed_size > buf_size) {
>>>> +                    fprintf(stderr,
>>>> +                            "Error: Migrated bitmap granularity is
>>>> not "
>>>> +                            "match with destination bitmap
>>>> granularity\n");
>>>> +                    return -EINVAL;
>>>> +                }
>>>> +
>>>
>>> "Migrated bitmap granularity doesn't match the destination bitmap
>>> granularity" perhaps.
>>>
>>>> +                buf = g_malloc(buf_size);
>>>> +                qemu_get_buffer(f, buf, buf_size);
>>>> +                bdrv_dirty_bitmap_deserialize_part(bitmap, buf,
>>>> + first_sector,
>>>> + nr_sectors);
>>>> +                g_free(buf);
>>>> +            }
>>>> +        }
>>>> +
>>>> +        ret = qemu_file_get_error(f);
>>>> +        if (ret != 0) {
>>>> +            return ret;
>>>> +        }
>>>> +    } while (!(flags & DIRTY_BITMAP_MIG_FLAG_EOS));
>>>> +
>>>> +    DPRINTF("load finish\n");
>>>> +    return 0;
>>>> +}
>>>> +
>>>> +static void dirty_bitmap_set_params(const MigrationParams *params,
>>>> void *opaque)
>>>> +{
>>>> +    dirty_bitmap_mig_state.migration_enable = params->dirty;
>>>> +}
>>>> +
>>>
>>> OK; though I am not immediately aware of what changes need to happen
>>> to accommodate Eric's suggestions.
>> This function will be dropped in v3.
>>>
>>>> +static bool dirty_bitmap_is_active(void *opaque)
>>>> +{
>>>> +    return dirty_bitmap_mig_state.migration_enable == 1;
>>>> +}
>>>> +
>>>
>>> OK.
>>>
>>>> +static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
>>>> +{
>>>> +    init_dirty_bitmap_migration(f);
>>>> +
>>>> +    qemu_mutex_lock_iothread();
>>>> +    /* start track dirtyness of dirty bitmaps */
>>>> +    set_dirty_tracking();
>>>> +    qemu_mutex_unlock_iothread();
>>>> +
>>>> +    blk_mig_reset_dirty_cursor();
>>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>
>>> OK; see dirty_bitmap_mig_init below, though.
>>>
>>>> +static SaveVMHandlers savevm_block_handlers = {
>>>> +    .set_params = dirty_bitmap_set_params,
>>>> +    .save_live_setup = dirty_bitmap_save_setup,
>>>> +    .save_live_iterate = dirty_bitmap_save_iterate,
>>>> +    .save_live_complete = dirty_bitmap_save_complete,
>>>> +    .save_live_pending = dirty_bitmap_save_pending,
>>>> +    .load_state = dirty_bitmap_load,
>>>> +    .cancel = dirty_bitmap_migration_cancel,
>>>> +    .is_active = dirty_bitmap_is_active,
>>>> +};
>>>> +
>>>> +void dirty_bitmap_mig_init(void)
>>>> +{
>>>> +    QSIMPLEQ_INIT(&dirty_bitmap_mig_state.dbms_list);
>>>
>>> Maybe I haven't looked thoroughly enough yet, but it's weird that part
>>> of the dirty_bitmap_mig_state is initialized here, and the rest of it
>>> in init_dirty_bitmap_migration. I'd prefer to keep it all together, if
>>> possible.
>> dirty_bitmap_mig_init is called one time when qemu starts. QSIMPLEQ_INIT
>> should be called once. dirty_bitmap_save_setup is called on every
>> migration start, it's like 'reinitialize'.
>>>
>>>> +
>>>> +    register_savevm_live(NULL, "dirty-bitmap", 0, 1,
>>>> &savevm_block_handlers,
>>>> +                         &dirty_bitmap_mig_state);
>>>> +}
>>>
>>> OK.
>>>
>>>> diff --git a/vl.c b/vl.c
>>>> index a824a7d..dee7220 100644
>>>> --- a/vl.c
>>>> +++ b/vl.c
>>>> @@ -4184,6 +4184,7 @@ int main(int argc, char **argv, char **envp)
>>>>
>>>>       blk_mig_init();
>>>>       ram_mig_init();
>>>> +    dirty_bitmap_mig_init();
>>>>
>>>>       /* If the currently selected machine wishes to override the
>>>> units-per-bus
>>>>        * property of its default HBA interface type, do so now. */
>>>>
>>>
>>> Hm, since dirty bitmaps are a sub-component of the block layer, would
>>> it not make sense to put this hook under blk_mig_init, perhaps?
>> IMHO the reason to put it here is to keep all
>> register_savevm_live-entities in one place.
>
> If you still feel that way I won't withhold my R-b, but there are 
> already other cases such as ppc_spapr_init which are not in this 
> general area of vl.c.
>
> Plus the dozens of devices that use register_savevm as a wrapper to 
> register_savevm_live, so maybe consolidating calls to this function 
> isn't that important.
Hm, I've missed it, ppc_spapr_init is not here, yes.. Another thing 
here: dirty bitmaps migration are separate from blk migration. And it 
may be used without blk migration (nbd+mirrow migration may be used).. 
Is it ok to connect dirty bitmaps migration to blk_mig_init, which is 
located in migration/block.c, which may not be used at all when we 
bitmaps are migrated using migration/dirty-bitmap.c?
In other words, yes, dirty bitmaps are a sub-component of the block 
layer, but dirty bitmap migration is not a sub-component of blk migration.
>
>>>
>>>
>>> Overall this looks very clean compared to the intermingled format in
>>> V1, and the code is organized pretty well. Just a few minor comments,
>>> and I'd like to get the opinion of the migration maintainers, but I am
>>> happy. Sorry it took me so long to review, please feel free to let me
>>> know if you disagree with any of my opinions :)
>>>
>>> Thank you,
>>> --John
>>
>> Thank you for reviewing my series)
>>
>
> Yup. Hopefully I didn't miss too much that will irritate the Migration 
> overlords.
>
> Once you respin on top of v12, I can run some thorough migration tests 
> on it (perhaps over a weekend) and verify that it survives a couple 
> hundred migrations without any kind of integrity loss.
I hope I'll do it with all other things in about two days.
>
> This is what makes sense to me right now, anyway.
>
> Do you think you'll be including the bitmap checksum in the 
> BlockDirtyInfo command? That'd be convenient for iotests.
Ok, will do. Good idea. Only two points:
1) Is it ok to include debug info into BlockDirtyInfo? Will users be 
happy with it?
2) When I was debugging my code, the information about dirty-regions was 
very useful. Now, all is working and checksums are enough for regression 
control.
>
> Thank you,
> --John.
John Snow Feb. 16, 2015, 6:18 p.m. UTC | #8
On 02/16/2015 07:06 AM, Vladimir Sementsov-Ogievskiy wrote:
> On 13.02.2015 23:22, John Snow wrote:
>>
>>
>> On 02/13/2015 03:19 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> On 11.02.2015 00:33, John Snow wrote:
>>>> Peter Maydell: What's the right way to license a file as copied from a
>>>> previous version? See below, please;
>>>>
>>>> Max, Markus: ctrl+f "bdrv_get_device_name" and let me know what you
>>>> think, if you would.
>>>>
>>>> Juan, Amit, David: Copying migration maintainers.
>>>>
>>>> On 01/27/2015 05:56 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>>> Live migration of dirty bitmaps. Only named dirty bitmaps are
>>>>> migrated.
>>>>> If destination qemu is already containing a dirty bitmap with the same
>>>>> name as a migrated bitmap, then their granularities should be the
>>>>> same,
>>>>> otherwise the error will be generated. If destination qemu doesn't
>>>>> contain such bitmap it will be created.
>>>>>
>>>>> format:
>>>>>
>>>>> 1 byte: flags
>>>>>
>>>>> [ 1 byte: node name size ] \  flags & DEVICE_NAME
>>>>> [ n bytes: node name     ] /
>>>>>
>>>>> [ 1 byte: bitmap name size ]       \
>>>>> [ n bytes: bitmap name     ]       | flags & BITMAP_NAME
>>>>> [ [ be64: granularity    ] ]  flags & GRANULARITY
>>>>>
>>>>> [ 1 byte: bitmap enabled bit ] flags & ENABLED
>>>>>
>>>>> [ be64: start sector      ] \ flags & (NORMAL_CHUNK | ZERO_CHUNK)
>>>>> [ be32: number of sectors ] /
>>>>>
>>>>> [ be64: buffer size ] \ flags & NORMAL_CHUNK
>>>>> [ n bytes: buffer   ] /
>>>>>
>>>>> The last chunk should contain flags & EOS. The chunk may skip device
>>>>> and/or bitmap names, assuming them to be the same with the previous
>>>>> chunk. GRANULARITY is sent with the first chunk for the bitmap.
>>>>> ENABLED
>>>>> bit is sent in the end of "complete" stage of migration. So when
>>>>> destination gets ENABLED flag it should deserialize_finish the bitmap
>>>>> and set its enabled bit to corresponding value.
>>>>>
>>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@parallels.com>
>>>>> ---
>>>>>   include/migration/block.h |   1 +
>>>>>   migration/Makefile.objs   |   2 +-
>>>>>   migration/dirty-bitmap.c  | 606
>>>>> ++++++++++++++++++++++++++++++++++++++++++++++
>>>>>   vl.c                      |   1 +
>>>>>   4 files changed, 609 insertions(+), 1 deletion(-)
>>>>>   create mode 100644 migration/dirty-bitmap.c
>>>>>
>>>>> diff --git a/include/migration/block.h b/include/migration/block.h
>>>>> index ffa8ac0..566bb9f 100644
>>>>> --- a/include/migration/block.h
>>>>> +++ b/include/migration/block.h
>>>>> @@ -14,6 +14,7 @@
>>>>>   #ifndef BLOCK_MIGRATION_H
>>>>>   #define BLOCK_MIGRATION_H
>>>>>
>>>>> +void dirty_bitmap_mig_init(void);
>>>>>   void blk_mig_init(void);
>>>>>   int blk_mig_active(void);
>>>>>   uint64_t blk_mig_bytes_transferred(void);
>>>>
>>>> OK.
>>>>
>>>>> diff --git a/migration/Makefile.objs b/migration/Makefile.objs
>>>>> index d929e96..9adfda9 100644
>>>>> --- a/migration/Makefile.objs
>>>>> +++ b/migration/Makefile.objs
>>>>> @@ -6,5 +6,5 @@ common-obj-y += xbzrle.o
>>>>>   common-obj-$(CONFIG_RDMA) += rdma.o
>>>>>   common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o
>>>>>
>>>>> -common-obj-y += block.o
>>>>> +common-obj-y += block.o dirty-bitmap.o
>>>>>
>>>>
>>>> OK.
>>>>
>>>>> diff --git a/migration/dirty-bitmap.c b/migration/dirty-bitmap.c
>>>>> new file mode 100644
>>>>> index 0000000..8621218
>>>>> --- /dev/null
>>>>> +++ b/migration/dirty-bitmap.c
>>>>> @@ -0,0 +1,606 @@
>>>>> +/*
>>>>> + * QEMU dirty bitmap migration
>>>>> + *
>>>>> + * derived from migration/block.c
>>>>> + *
>>>>> + * Author:
>>>>> + * Sementsov-Ogievskiy Vladimir <vsementsov@parallels.com>
>>>>> + *
>>>>> + * original copyright message:
>>>>> + *
>>>>> =====================================================================
>>>>> + * Copyright IBM, Corp. 2009
>>>>> + *
>>>>> + * Authors:
>>>>> + *  Liran Schour <lirans@il.ibm.com>
>>>>> + *
>>>>> + * This work is licensed under the terms of the GNU GPL, version
>>>>> 2. See
>>>>> + * the COPYING file in the top-level directory.
>>>>> + *
>>>>> + * Contributions after 2012-01-13 are licensed under the terms of the
>>>>> + * GNU GPL, version 2 or (at your option) any later version.
>>>>> + *
>>>>> =====================================================================
>>>>> + */
>>>>> +
>>>>
>>>> Not super familiar with the right way to do licensing here; it's
>>>> possible you may not need to copy the original here, but I'm not sure.
>>>> You will want to make it clear what license applies to /your/ work, I
>>>> think. Maybe Peter Maydell can clue us in.
>>>>
>>>>> +#include "block/block.h"
>>>>> +#include "qemu/main-loop.h"
>>>>> +#include "qemu/error-report.h"
>>>>> +#include "migration/block.h"
>>>>> +#include "migration/migration.h"
>>>>> +#include "qemu/hbitmap.h"
>>>>> +#include <assert.h>
>>>>> +
>>>>> +#define CHUNK_SIZE                       (1 << 20)
>>>>> +
>>>>> +#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
>>>>> +#define DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK  0x02
>>>>> +#define DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK    0x04
>>>>> +#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x08
>>>>> +#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x10
>>>>> +#define DIRTY_BITMAP_MIG_FLAG_GRANULARITY   0x20
>>>>> +#define DIRTY_BITMAP_MIG_FLAG_ENABLED       0x40
>>>>> +/* flags should be <= 0xff */
>>>>> +
>>>>
>>>> We should give ourselves a little breathing room with the flags, since
>>>> we've only got room for one more.
>>> Ok. Will one more byte be enough?
>>
>> I should hope so. If you do add a completion chunk and flag, that
>> fills up the first byte completely, so having a second byte is a good
>> idea.
>>
>> I might recommend reserving the last bit of the second byte to be a
>> flag such as DIRTY_BITMAP_EXTRA_FLAGS that indicates the presence of
>> additional byte(s) of flags, to be determined later, if we ever need
>> them, but two bytes for now should be sufficient.
> Ok.
>>
>>>>
>>>>> +/* #define DEBUG_DIRTY_BITMAP_MIGRATION */
>>>>> +
>>>>> +#ifdef DEBUG_DIRTY_BITMAP_MIGRATION
>>>>> +#define DPRINTF(fmt, ...) \
>>>>> +    do { printf("dirty_migration: " fmt, ## __VA_ARGS__); } while (0)
>>>>> +#else
>>>>> +#define DPRINTF(fmt, ...) \
>>>>> +    do { } while (0)
>>>>> +#endif
>>>>> +
>>>>> +typedef struct DirtyBitmapMigBitmapState {
>>>>> +    /* Written during setup phase. */
>>>>> +    BlockDriverState *bs;
>>>>> +    BdrvDirtyBitmap *bitmap;
>>>>> +    HBitmap *dirty_bitmap;
>>>>
>>>> For my own sanity, I'd really prefer "bitmap" and "meta_bitmap" here;
>>>> "dirty_bitmap" is often used as a synonym (outside of this file) to
>>>> refer to the BdrvDirtyBitmap in general, so it's usage here can be
>>>> somewhat confusing.
>>>>
>>>> I'd appreciate "dirty_dirty_bitmap" as in your previous patch for
>>>> consistency, or "meta_bitmap" as I recommend.
>>>>
>>> Ok
>>>>> +    int64_t total_sectors;
>>>>> +    uint64_t sectors_per_chunk;
>>>>> +    QSIMPLEQ_ENTRY(DirtyBitmapMigBitmapState) entry;
>>>>> +
>>>>> +    /* For bulk phase. */
>>>>> +    bool bulk_completed;
>>>>> +    int64_t cur_sector;
>>>>> +    bool granularity_sent;
>>>>> +
>>>>> +    /* For dirty phase. */
>>>>> +    int64_t cur_dirty;
>>>>> +} DirtyBitmapMigBitmapState;
>>>>> +
>>>>> +typedef struct DirtyBitmapMigState {
>>>>> +    int migration_enable;
>>>>> +    QSIMPLEQ_HEAD(dbms_list, DirtyBitmapMigBitmapState) dbms_list;
>>>>> +
>>>>> +    bool bulk_completed;
>>>>> +
>>>>> +    /* for send_bitmap() */
>>>>> +    BlockDriverState *prev_bs;
>>>>> +    BdrvDirtyBitmap *prev_bitmap;
>>>>> +} DirtyBitmapMigState;
>>>>> +
>>>>> +static DirtyBitmapMigState dirty_bitmap_mig_state;
>>>>> +
>>>>> +/* read name from qemu file:
>>>>> + * format:
>>>>> + * 1 byte : len = name length (<256)
>>>>> + * len bytes : name without last zero byte
>>>>> + *
>>>>> + * name should point to the buffer >= 256 bytes length
>>>>> + */
>>>>> +static char *qemu_get_name(QEMUFile *f, char *name)
>>>>> +{
>>>>> +    int len = qemu_get_byte(f);
>>>>> +    qemu_get_buffer(f, (uint8_t *)name, len);
>>>>> +    name[len] = '\0';
>>>>> +
>>>>> +    DPRINTF("get name: %d %s\n", len, name);
>>>>> +
>>>>> +    return name;
>>>>> +}
>>>>> +
>>>>
>>>> OK. Maybe these could be "qemu_put_string" or "qemu_get_string" and
>>>> added to qemu-file.c so others can use them.
>>> If no objections for sharing this format, I'll do it.
>>>>
>>>>> +/* write name to qemu file:
>>>>> + * format:
>>>>> + * same as for qemu_get_name
>>>>> + *
>>>>> + * maximum name length is 255
>>>>> + */
>>>>> +static void qemu_put_name(QEMUFile *f, const char *name)
>>>>> +{
>>>>> +    int len = strlen(name);
>>>>> +
>>>>> +    DPRINTF("put name: %d %s\n", len, name);
>>>>> +
>>>>> +    assert(len < 256);
>>>>> +    qemu_put_byte(f, len);
>>>>> +    qemu_put_buffer(f, (const uint8_t *)name, len);
>>>>> +}
>>>>> +
>>>>
>>>> OK.
>>>>
>>>>> +static void send_bitmap(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
>>>>> +                        uint64_t start_sector, uint32_t nr_sectors)
>>>>> +{
>>>>> +    BlockDriverState *bs = dbms->bs;
>>>>> +    BdrvDirtyBitmap *bitmap = dbms->bitmap;
>>>>> +    uint8_t flags = 0;
>>>>> +    /* align for buffer_is_zero() */
>>>>> +    uint64_t align = 4 * sizeof(long);
>>>>> +    uint64_t buf_size =
>>>>> +        (bdrv_dirty_bitmap_data_size(bitmap, nr_sectors) + align -
>>>>> 1) &
>>>>> +        ~(align - 1);
>>>>> +    uint8_t *buf = g_malloc0(buf_size);
>>>>> +
>>>>> +    bdrv_dirty_bitmap_serialize_part(bitmap, buf, start_sector,
>>>>> nr_sectors);
>>>>> +
>>>>> +    if (buffer_is_zero(buf, buf_size)) {
>>>>> +        g_free(buf);
>>>>> +        buf = NULL;
>>>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK;
>>>>> +    } else {
>>>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK;
>>>>> +    }
>>>>> +
>>>>> +    if (bs != dirty_bitmap_mig_state.prev_bs) {
>>>>> +        dirty_bitmap_mig_state.prev_bs = bs;
>>>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME;
>>>>> +    }
>>>>> +
>>>>> +    if (bitmap != dirty_bitmap_mig_state.prev_bitmap) {
>>>>> +        dirty_bitmap_mig_state.prev_bitmap = bitmap;
>>>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME;
>>>>> +    }
>>>>
>>>> OK, so we use the current bs/bitmap under consideration to detect if
>>>> we have switched context, and put the names in the stream when it
>>>> happens. OK.
>>>>
>>>>> +
>>>>> +    if (dbms->granularity_sent == 0) {
>>>>> +        dbms->granularity_sent = 1;
>>>>> +        flags |= DIRTY_BITMAP_MIG_FLAG_GRANULARITY;
>>>>> +    }
>>>>> +
>>>>> +    DPRINTF("Enter send_bitmap"
>>>>> +            "\n   flags:        %x"
>>>>> +            "\n   start_sector: %" PRIu64
>>>>> +            "\n   nr_sectors:   %" PRIu32
>>>>> +            "\n   data_size:    %" PRIu64 "\n",
>>>>> +            flags, start_sector, nr_sectors, buf_size);
>>>>> +
>>>>> +    qemu_put_byte(f, flags);
>>>>> +
>>>>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
>>>>> +        qemu_put_name(f, bdrv_get_device_name(bs));
>>>>> +    }
>>>>
>>>> I am still not fully clear on this myself, but I think we are phasing
>>>> out bdrv_get_device_name. In the context of bitmaps, we are mostly
>>>> likely using them one-per-tree, but they /can/ be attached
>>>> one-per-node, so we shouldn't be trying to get the device name here,
>>>> but rather, the node-name.
>>>>
>>>> I /think/ we may want to be using bdrv_get_node_name, but I think it
>>>> is not currently true that all nodes WILL be named ... I am CC'ing
>>>> Markus Armbruster and Max Reitz for (A) A refresher course and (B)
>>>> Opinions on what function call might make sense here, given that we
>>>> wish to migrate bitmaps attached to arbitrary nodes.
>>> Hmm.. I'm not familiar with hierarchy of nodes and devices. As I
>>> understand, both command_line- and qmp- created drives are created
>>> through blockdev_init, which creates both blk(device) and bs(node)
>>> through blk_new_with_bs.. Am I right? Also, bdrv_get_device_name is used
>>> in migration/block.c.
>>
>> Now that I'm more awake, here's a better rundown of what's going on:
>>
>> It's something that is a little bit in flux right now, unfortunately.
>> We're trying to transition to a format where we have arbitrarily
>> complex Block trees, where the root of the tree is always a
>> BlockBackend (See the big series by Max Reitz) and the configuration
>> of the tree may become arbitrarily complex.
>>
>> Simple trees may consist of just one BlockBackend and one
>> BlockDriverState node, where I think we can refer to this BDS as the
>> "root node," not to be confused with the BlockBackend "root." The
>> BlockBackend is a relatively new invention, so it isn't actually used
>> consistently everywhere yet.
>>
>> In the future, we may have commands that make distinctions based on if
>> you want to work on the BlockBackend, the root node under the
>> blockbackend associated with a BDS, only the explicit node/BDS you
>> identify, or some combination of the above semantics.
>>
>> As of right now, bitmaps can be *attached* to any arbitrary node,
>> though they are currently only *useful* when attached to the first
>> child of the BlockBackend, the root node. It's only useful currently
>> in cases where it is attached to the root because I've only proposed
>> patches for adding bitmap support to produce incrementals for Drive
>> Backup, which operates only on drives/devices (the root node of a tree.)
>>
>> However, in the context of migrating, it could be that we want to
>> migrate any bitmaps attached to /any/ nodes, so we should be careful
>> about what names we are pulling - we don't necessarily want the name
>> of the root node or BlockBackend, we may want the BDS and accompanying
>> name of strictly the node the bitmap is attached to.
>>
>> I know other areas of the code don't provide a good example for this
>> distinction, yet, but the block layer people are actively working on
>> fixing that. (See also the back-and-forth reviews for what to name my
>> QMP parameters in the incremental backup patches for some overview of
>> this semantic transition.)
>>
>> That said, We should think carefully about *which* name we want to put
>> in the stream and what implications it has for migration.
>>
>>
>> (1) bdrv_get_node_name and bdrv_find_node
>>
>> This would migrate bitmaps as attached to their specific BDS. This
>> would mean that the node layout on the destination is either
>> identical, or similar enough such that no named bitmaps are attached
>> to a node not present on the destination.
>>
>> This gives us precision: bitmaps may be attached lower in the tree and
>> can provide more fine-grained detail for which layers have been
>> changed or modified during runtime.
>>
>> This also gives us fragility: In cases where we transfer, say, a
>> complex tree of nodes and collapse it to a single destination drive,
>> we'd be unable to migrate bitmaps not attached to the root along with
>> it, because they'd have nowhere meaningful to attach.
>>
>> It is perhaps somewhat unneccessary at this exact moment in time, as
>> well, because bitmaps are currently only useful on root nodes.
>>
>> (2) bdrv_get_device_name and bdrv_lookup_bs(device_name, NULL, errp)
>>
>> This would migrate any bitmaps in a tree and attach them to the entire
>> drive on the destination.
>>
>> This is simpler: You just need to make sure that the root nodes have
>> the same names, which is a lot easier to manage.
>>
>> This matches how drive migration currently appears to work: The entire
>> tree appears to be generally squashed into a single node and
>> transferred cluster-by-cluster, without general consideration as to
>> the layout of the local block tree. As we both know by now, none of
>> the metadata is transferred, just the data.
>>
>> It prevents migration of just bitmaps where you WANT the extra
>> complexity: If a bitmap is attached lower in the tree, re-affixing it
>> to the root of a destination tree might invalidate the semantics of
>> what that bitmap was meant to track, and it may become useless.
>>
>>
>> So in summary:
>> using device names is probably fine for now, as it matches the current
>> use case of bitmaps as well as drive migration; but using node names
>> may give us more power and precision later.
>>
>> I talked to Max about it, and he is leaning towards using device names
>> for now and switching to node names if we decide we want that power.
>>
>> (...I wonder if we could use a flag, for now, that says we're
>> including DEVICE names. Later, we could add a flag that says we're
>> using NODE names and add an option to toggle as the usage case sees fit.)
>>
>>
>> Are you confused yet? :D
> O, thanks for the explanation). Are we really need this flag? As Markus
> wrote, nodes and devices are sharing namespaces.. We can use
> bdrv_lookup_bs(name, name, errp)..

what 'name' are you using here, though? It looked to me like in your 
backup routine we got a list of BDS entries and get the name *from* the 
BDS, so we still have to think about how we want to /get/ the name.

>
> Also, we can, for example, send bitmaps as follows:
>
> if node has name - send bitmap with this name
> if node is root, but hasn't name - send it with blk name
> otherwise - don't send the bitmap

The node a bitmap is attached to should always have a name -- it would 
not be possible via the existing interface to attach it to a node 
without a name.

I *think* the root node should always have a name, but I am actually 
less sure of that.

>>
>>
>>>>
>>>>> +
>>>>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
>>>>> +        qemu_put_name(f, bdrv_dirty_bitmap_name(bitmap));
>>>>> +
>>>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
>>>>> +            qemu_put_be64(f, bdrv_dirty_bitmap_granularity(bs,
>>>>> bitmap));
>>>>> +        }
>>>>> +    } else {
>>>>> +        assert(!(flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY));
>>>>> +    }
>>>>> +
>>>>
>>>> I thought we were only migrating bitmaps with names?
>>>> I suppose the conditional can't hurt, but I am not clear on when we
>>>> won't have a bitmap name here.
>>> You are right, 'else' case is not possible.. Hmm. I've added it to be
>>> sure that format is not corrupted, when I decided to put granularity
>>> only with name. Wi won't have a bitmap name only when we send the same
>>> bitmap as on the previous send_bitmap() call. May be it will be better
>>> to use two separate if's without else and assert.
>>
>> It's okay if it is just "paranoia," but I was just checking. It would
>> make a decent assert().
>>
>>>>
>>>>> +    qemu_put_be64(f, start_sector);
>>>>> +    qemu_put_be32(f, nr_sectors);
>>>>> +
>>>>> +    /* if a block is zero we need to flush here since the network
>>>>> +     * bandwidth is now a lot higher than the storage device
>>>>> bandwidth.
>>>>> +     * thus if we queue zero blocks we slow down the migration.
>>>>> +     * also, skip writing block when migrate only dirty bitmaps. */
>>>>> +    if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
>>>>> +        qemu_fflush(f);
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    qemu_put_be64(f, buf_size);
>>>>> +    qemu_put_buffer(f, buf, buf_size);
>>>>> +    g_free(buf);
>>>>> +}
>>>>> +
>>>>> +
>>>>> +/* Called with iothread lock taken.  */
>>>>> +
>>>>> +static void set_dirty_tracking(void)
>>>>> +{
>>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>>> +
>>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list,
>>>>> entry) {
>>>>> +        dbms->dirty_bitmap =
>>>>> +            bdrv_create_dirty_dirty_bitmap(dbms->bitmap, CHUNK_SIZE);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>
>>>> OK: so we only have these dirty-dirty bitmaps when migration is
>>>> starting, which makes sense.
>>>>
>>>>> +static void unset_dirty_tracking(void)
>>>>> +{
>>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>>> +
>>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list,
>>>>> entry) {
>>>>> +        bdrv_release_dirty_dirty_bitmap(dbms->bitmap);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>
>>>> OK.
>>>>
>>>>> +static void init_dirty_bitmap_migration(QEMUFile *f)
>>>>> +{
>>>>> +    BlockDriverState *bs;
>>>>> +    BdrvDirtyBitmap *bitmap;
>>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>>> +
>>>>> +    dirty_bitmap_mig_state.bulk_completed = false;
>>>>> +    dirty_bitmap_mig_state.prev_bs = NULL;
>>>>> +    dirty_bitmap_mig_state.prev_bitmap = NULL;
>>>>> +
>>>>> +    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
>>>>> +        for (bitmap = bdrv_next_dirty_bitmap(bs, NULL); bitmap;
>>>>> +             bitmap = bdrv_next_dirty_bitmap(bs, bitmap)) {
>>>>> +            if (!bdrv_dirty_bitmap_name(bitmap)) {
>>>>> +                continue;
>>>>> +            }
>>>>> +
>>>>> +            dbms = g_new0(DirtyBitmapMigBitmapState, 1);
>>>>> +            dbms->bs = bs;
>>>>> +            dbms->bitmap = bitmap;
>>>>> +            dbms->total_sectors = bdrv_nb_sectors(bs);
>>>>> +            dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
>>>>> +                bdrv_dirty_bitmap_granularity(dbms->bs, dbms->bitmap)
>>>>> +                >> BDRV_SECTOR_BITS;
>>>>> +
>>>>> + QSIMPLEQ_INSERT_TAIL(&dirty_bitmap_mig_state.dbms_list,
>>>>> +                                 dbms, entry);
>>>>> +        }
>>>>> +    }
>>>>> +}
>>>>> +
>>>>
>>>> OK, but see the note below for dirty_bitmap_mig_init.
>>> actually it is not 'init' but 'reinit' - called on every migration
>>> start.. Hmm. dbms_list should be cleared here before fill it again.
>>>>
>>>>> +/* Called with no lock taken.  */
>>>>> +static void bulk_phase_send_chunk(QEMUFile *f,
>>>>> DirtyBitmapMigBitmapState *dbms)
>>>>> +{
>>>>> +    uint32_t nr_sectors = MIN(dbms->total_sectors - dbms->cur_sector,
>>>>> +                             dbms->sectors_per_chunk);
>>>>
>>>> What about cases where nr_sectors will put us past the end of the
>>>> bitmap? The bitmap serialization implementation might need a touchup
>>>> with this in mind.
>>> I don't understand.. nr_sectors <=  dbms->total_sectors -
>>> dbms->cur_sector and it can't put us past the end...
>>
>> Oh, because you take the minimum, so we don't have to worry about
>> sectors_per_chunk eclipsing what we have.
>>
>> Nevermind, I can't read... :(
>>
>>>>
>>>>> +
>>>>> +    send_bitmap(f, dbms, dbms->cur_sector, nr_sectors);
>>>>> +
>>>>> +    dbms->cur_sector += nr_sectors;
>>>>> +    if (dbms->cur_sector >= dbms->total_sectors) {
>>>>> +        dbms->bulk_completed = true;
>>>>> +    }
>>>>> +}
>>>>> +
>>>>> +/* Called with no lock taken.  */
>>>>> +static void bulk_phase(QEMUFile *f, bool limit)
>>>>> +{
>>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>>> +
>>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list,
>>>>> entry) {
>>>>> +        while (!dbms->bulk_completed) {
>>>>> +            bulk_phase_send_chunk(f, dbms);
>>>>> +            if (limit && qemu_file_rate_limit(f)) {
>>>>> +                return;
>>>>> +            }
>>>>> +        }
>>>>> +    }
>>>>> +
>>>>> +    dirty_bitmap_mig_state.bulk_completed = true;
>>>>> +}
>>>>
>>>> OK.
>>>>
>>>>> +
>>>>> +static void blk_mig_reset_dirty_cursor(void)
>>>>> +{
>>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>>> +
>>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list,
>>>>> entry) {
>>>>> +        dbms->cur_dirty = 0;
>>>>> +    }
>>>>> +}
>>>>> +
>>>>
>>>> OK.
>>>>
>>>>> +/* Called with iothread lock taken. */
>>>>> +static void dirty_phase_send_chunk(QEMUFile *f,
>>>>> DirtyBitmapMigBitmapState *dbms)
>>>>> +{
>>>>> +    uint32_t nr_sectors;
>>>>> +
>>>>> +    while (dbms->cur_dirty < dbms->total_sectors &&
>>>>> +           !hbitmap_get(dbms->dirty_bitmap, dbms->cur_dirty)) {
>>>>> +        dbms->cur_dirty += dbms->sectors_per_chunk;
>>>>> +    }
>>>>
>>>> OK, so we fast forward the dirty cursor while the meta-bitmap is
>>>> empty. Is it not worth using the HBitmapIterator here? You can reset
>>>> them everywhere you reset the dirty cursor, and then just fast-seek to
>>>> the first dirty sector.
>>> Yes, I've thought about it, just used simpler way (copied from
>>> migration/block.c) for an early version of the patch set. I will do it.
>>
>> Only if it doesn't make things more complicated to look at.
>>
>>>>
>>>>> +
>>>>> +    if (dbms->cur_dirty >= dbms->total_sectors) {
>>>>> +        return;
>>>>> +    }
>>>>> +
>>>>> +    nr_sectors = MIN(dbms->total_sectors - dbms->cur_dirty,
>>>>> +                     dbms->sectors_per_chunk);
>>>>
>>>> What happens if nr_sectors goes past the end?
>>
>> Again, I misread.
>>
>>>>
>>>>> +    send_bitmap(f, dbms, dbms->cur_dirty, nr_sectors);
>>>>> +    hbitmap_reset(dbms->dirty_bitmap, dbms->cur_dirty,
>>>>> dbms->sectors_per_chunk);
>>>>> +    dbms->cur_dirty += nr_sectors;
>>>>> +}
>>>>> +
>>>>> +/* Called with iothread lock taken.
>>>>> + *
>>>>> + * return value:
>>>>> + * 0: too much data for max_downtime
>>>>> + * 1: few enough data for max_downtime
>>>>> +*/
>>>>
>>>> dirty_phase below doesn't have a return value.
>>> rudimentary comment.. thanks.
>>>>
>>>>> +static void dirty_phase(QEMUFile *f, bool limit)
>>>>> +{
>>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>>> +
>>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list,
>>>>> entry) {
>>>>> +        while (dbms->cur_dirty < dbms->total_sectors) {
>>>>> +            dirty_phase_send_chunk(f, dbms);
>>>>> +            if (limit && qemu_file_rate_limit(f)) {
>>>>> +                return;
>>>>> +            }
>>>>> +        }
>>>>> +    }
>>>>> +}
>>>>> +
>>>>
>>>> OK.
>>>>
>>>>> +
>>>>> +/* Called with iothread lock taken.  */
>>>>> +static void dirty_bitmap_mig_cleanup(void)
>>>>> +{
>>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>>> +
>>>>> +    unset_dirty_tracking();
>>>>> +
>>>>> +    while ((dbms =
>>>>> QSIMPLEQ_FIRST(&dirty_bitmap_mig_state.dbms_list)) != NULL) {
>>>>> + QSIMPLEQ_REMOVE_HEAD(&dirty_bitmap_mig_state.dbms_list, entry);
>>>>> +        g_free(dbms);
>>>>> +    }
>>>>> +}
>>>>> +
>>>>
>>>> OK.
>>>>
>>>>> +static void dirty_bitmap_migration_cancel(void *opaque)
>>>>> +{
>>>>> +    dirty_bitmap_mig_cleanup();
>>>>> +}
>>>>> +
>>>>
>>>> OK.
>>>>
>>>>> +static int dirty_bitmap_save_iterate(QEMUFile *f, void *opaque)
>>>>> +{
>>>>> +    DPRINTF("Enter save live iterate\n");
>>>>> +
>>>>> +    blk_mig_reset_dirty_cursor();
>>>>
>>>> I suppose this is because it's easier to check if we are finished by
>>>> starting from sector 0 every time.
>>>>
>>>> A harder, but faster method may be: Use HBitmapIterators, but don't
>>>> reset them every iteration: just iterate until the end, and check that
>>>> the bitmap is empty. If the meta bitmap is empty, the dirty phase is
>>>> complete. If the meta bitmap is NOT empty, reset the HBI and continue
>>>> allowing iterations over the dirty phase.
>>> Ok, will do.
>>>>
>>>>> +
>>>>> +    if (dirty_bitmap_mig_state.bulk_completed) {
>>>>> +        qemu_mutex_lock_iothread();
>>>>> +        dirty_phase(f, true);
>>>>> +        qemu_mutex_unlock_iothread();
>>>>> +    } else {
>>>>> +        bulk_phase(f, true);
>>>>> +    }
>>>>> +
>>>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>>>> +
>>>>> +    return dirty_bitmap_mig_state.bulk_completed;
>>>>> +}
>>>>> +
>>>>> +/* Called with iothread lock taken.  */
>>>>> +
>>>>> +static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
>>>>> +{
>>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>>> +    DPRINTF("Enter save live complete\n");
>>>>> +
>>>>> +    if (!dirty_bitmap_mig_state.bulk_completed) {
>>>>> +        bulk_phase(f, false);
>>>>> +    }
>>>>
>>>> [Not expertly familiar with savevm:] Under what conditions can this
>>>> happen?
>>> This can happen. save_complete will happen when savevm decide that
>>> pending data size to send is small enough. It was the case for my bugfix
>>> for migration/block.c about pending. To prevent save_complete when bulk
>>> phase isn't completed, save_pending returns (in my bugfix for
>>> migration/block.c) big value. Here I decided to make more honest
>>> save_pending, so I need to complete (if it doesn't) bulk phase in
>>> save_complete.
>>
>> OK, Gotcha.
>>
>>>>
>>>>> +
>>>>> +    blk_mig_reset_dirty_cursor();
>>>>> +    dirty_phase(f, false);
>>>>> +
>>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list,
>>>>> entry) {
>>>>> +        uint8_t flags = DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME |
>>>>> +                        DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME |
>>>>> +                        DIRTY_BITMAP_MIG_FLAG_ENABLED;
>>>>> +
>>>>> +        qemu_put_byte(f, flags);
>>>>> +        qemu_put_name(f, bdrv_get_device_name(dbms->bs));
>>>>> +        qemu_put_name(f, bdrv_dirty_bitmap_name(dbms->bitmap));
>>>>> +        qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
>>>>> +    }
>>>>> +
>>>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>>>> +
>>>>> +    DPRINTF("Dirty bitmaps migration completed\n");
>>>>> +
>>>>> +    dirty_bitmap_mig_cleanup();
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>
>>>> I suppose we don't need a flag that distinctly SAYS this is the end
>>>> section, since we can tell by omission of
>>>> DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK or ZERO_CHUNK.
>>> Hmm. I think it simplifies the logic (to use EOS after each section).
>>> And the same approach is in migration/block.c.. It's a question about
>>> which format is better:  "Each section for dirty_bitmap_load ends with
>>> EOS" or "Each section for dirty_bitmap_load ends with EOS except the
>>> last one. The last one may be recognized by absent NORMAL_CHUNK and
>>> ZERO_CHUNK"
>>>>
>>>>> +static uint64_t dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
>>>>> +                                          uint64_t max_size)
>>>>> +{
>>>>> +    DirtyBitmapMigBitmapState *dbms;
>>>>> +    uint64_t pending = 0;
>>>>> +
>>>>> +    qemu_mutex_lock_iothread();
>>>>> +
>>>>> +    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list,
>>>>> entry) {
>>>>> +        uint64_t sectors = hbitmap_count(dbms->dirty_bitmap);
>>>>> +        if (!dbms->bulk_completed) {
>>>>> +            sectors += dbms->total_sectors - dbms->cur_sector;
>>>>> +        }
>>>>> +        pending += bdrv_dirty_bitmap_data_size(dbms->bitmap,
>>>>> sectors);
>>>>> +    }
>>>>> +
>>>>> +    qemu_mutex_unlock_iothread();
>>>>> +
>>>>> +    DPRINTF("Enter save live pending %" PRIu64 ", max: %" PRIu64
>>>>> "\n",
>>>>> +            pending, max_size);
>>>>> +    return pending;
>>>>> +}
>>>>> +
>>>>
>>>> OK.
>>>>
>>>>> +static int dirty_bitmap_load(QEMUFile *f, void *opaque, int
>>>>> version_id)
>>>>> +{
>>>>> +    int flags;
>>>>> +
>>>>> +    static char device_name[256], bitmap_name[256];
>>>>> +    static BlockDriverState *bs;
>>>>> +    static BdrvDirtyBitmap *bitmap;
>>>>> +
>>>>> +    uint8_t *buf;
>>>>> +    uint64_t first_sector;
>>>>> +    uint32_t  nr_sectors;
>>>>> +    int ret;
>>>>> +
>>>>> +    DPRINTF("load start\n");
>>>>> +
>>>>> +    do {
>>>>> +        flags = qemu_get_byte(f);
>>>>> +        DPRINTF("flags: %x\n", flags);
>>>>> +
>>>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
>>>>> +            qemu_get_name(f, device_name);
>>>>> +            bs = bdrv_find(device_name);
>>>>
>>>> Similar to the above confusion, you may want bdrv_lookup_bs or
>>>> similar, since we're going to be looking for BDS nodes instead of
>>>> "devices."
>>> In this case, should it be changed in migration/block.c too?
>>
>> [See discussion above!]
>>
>>>>
>>>>> +            if (!bs) {
>>>>> +                fprintf(stderr, "Error: unknown block device '%s'\n",
>>>>> +                        device_name);
>>>>> +                return -EINVAL;
>>>>> +            }
>>>>> +        }
>>>>> +
>>>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
>>>>> +            if (!bs) {
>>>>> +                fprintf(stderr, "Error: block device name is not
>>>>> set\n");
>>>>> +                return -EINVAL;
>>>>> +            }
>>>>> +
>>>>> +            qemu_get_name(f, bitmap_name);
>>>>> +            bitmap = bdrv_find_dirty_bitmap(bs, bitmap_name);
>>>>> +            if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
>>>>> +                /* First chunk from this bitmap */
>>>>> +                uint64_t granularity = qemu_get_be64(f);
>>>>> +                if (!bitmap) {
>>>>> +                    Error *local_err = NULL;
>>>>> +                    bitmap = bdrv_create_dirty_bitmap(bs,
>>>>> granularity,
>>>>> + bitmap_name,
>>>>> + &local_err);
>>>>> +                    if (!bitmap) {
>>>>> +                        error_report("%s",
>>>>> error_get_pretty(local_err));
>>>>> +                        error_free(local_err);
>>>>> +                        return -EINVAL;
>>>>> +                    }
>>>>> +                } else {
>>>>> +                    uint64_t dest_granularity =
>>>>> +                        bdrv_dirty_bitmap_granularity(bs, bitmap);
>>>>> +                    if (dest_granularity != granularity) {
>>>>> +                        fprintf(stderr,
>>>>> +                                "Error: "
>>>>> +                                "Migrated bitmap granularity (%"
>>>>> PRIu64 ") "
>>>>> +                                "is not match with destination
>>>>> bitmap '%s' "
>>>>> +                                "granularity (%" PRIu64 ")\n",
>>>>> +                                granularity,
>>>>> +                                bitmap_name,
>>>>> +                                dest_granularity);
>>>>> +                        return -EINVAL;
>>>>> +                    }
>>>>> +                }
>>>>> +                bdrv_disable_dirty_bitmap(bitmap);
>>>>> +            }
>>>>> +            if (!bitmap) {
>>>>> +                fprintf(stderr, "Error: unknown dirty bitmap "
>>>>> +                        "'%s' for block device '%s'\n",
>>>>> +                        bitmap_name, device_name);
>>>>> +                return -EINVAL;
>>>>> +            }
>>>>> +        }
>>>>> +
>>>>> +        if (flags & DIRTY_BITMAP_MIG_FLAG_ENABLED) {
>>>>> +            bool enabled;
>>>>> +            if (!bitmap) {
>>>>> +                fprintf(stderr, "Error: dirty bitmap name is not
>>>>> set\n");
>>>>> +                return -EINVAL;
>>>>> +            }
>>>>> +            bdrv_dirty_bitmap_deserialize_finish(bitmap);
>>>>> +            /* complete migration */
>>>>> +            enabled = qemu_get_byte(f);
>>>>> +            if (enabled) {
>>>>> +                bdrv_enable_dirty_bitmap(bitmap);
>>>>> +            }
>>>>> +        }
>>>>
>>>> Oh, so you use the ENABLED flag to show that migration is over.
>>> Yes, it was bad idea..
>>>> If we are going to commit to a stream format for bitmaps, though,
>>>> maybe it's best to actually create a "COMPLETION BLOCK" flag and then
>>>> split this function into two pieces:
>>>>
>>>> (1) The part that receives regular / zero blocks, and
>>>> (2) The part that receives completion data.
>>>>
>>>> That way, if we change the properties that bitmaps have down the line,
>>>> we aren't reliant on literally the "enabled" flag to decide what to do.
>>>>
>>>> Also, it might help make this fairly long function a little smaller
>>>> and more readable.
>>> Ok.
>>>>
>>>>> +
>>>>> +        if (flags & (DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK |
>>>>> +                     DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK)) {
>>>>> +            if (!bs) {
>>>>> +                fprintf(stderr, "Error: block device name is not
>>>>> set\n");
>>>>> +                return -EINVAL;
>>>>> +            }
>>>>> +            if (!bitmap) {
>>>>> +                fprintf(stderr, "Error: dirty bitmap name is not
>>>>> set\n");
>>>>> +                return -EINVAL;
>>>>> +            }
>>>>> +
>>>>> +            first_sector = qemu_get_be64(f);
>>>>> +            nr_sectors = qemu_get_be32(f);
>>>>> +            DPRINTF("chunk: %lu %u\n", first_sector, nr_sectors);
>>>>> +
>>>>> +
>>>>> +            if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
>>>>> +                bdrv_dirty_bitmap_deserialize_part0(bitmap,
>>>>> first_sector,
>>>>> + nr_sectors);
>>>>> +            } else {
>>>>> +                uint64_t buf_size = qemu_get_be64(f);
>>>>> +                uint64_t needed_size =
>>>>> +                    bdrv_dirty_bitmap_data_size(bitmap, nr_sectors);
>>>>> +
>>>>> +                if (needed_size > buf_size) {
>>>>> +                    fprintf(stderr,
>>>>> +                            "Error: Migrated bitmap granularity is
>>>>> not "
>>>>> +                            "match with destination bitmap
>>>>> granularity\n");
>>>>> +                    return -EINVAL;
>>>>> +                }
>>>>> +
>>>>
>>>> "Migrated bitmap granularity doesn't match the destination bitmap
>>>> granularity" perhaps.
>>>>
>>>>> +                buf = g_malloc(buf_size);
>>>>> +                qemu_get_buffer(f, buf, buf_size);
>>>>> +                bdrv_dirty_bitmap_deserialize_part(bitmap, buf,
>>>>> + first_sector,
>>>>> + nr_sectors);
>>>>> +                g_free(buf);
>>>>> +            }
>>>>> +        }
>>>>> +
>>>>> +        ret = qemu_file_get_error(f);
>>>>> +        if (ret != 0) {
>>>>> +            return ret;
>>>>> +        }
>>>>> +    } while (!(flags & DIRTY_BITMAP_MIG_FLAG_EOS));
>>>>> +
>>>>> +    DPRINTF("load finish\n");
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>> +static void dirty_bitmap_set_params(const MigrationParams *params,
>>>>> void *opaque)
>>>>> +{
>>>>> +    dirty_bitmap_mig_state.migration_enable = params->dirty;
>>>>> +}
>>>>> +
>>>>
>>>> OK; though I am not immediately aware of what changes need to happen
>>>> to accommodate Eric's suggestions.
>>> This function will be dropped in v3.
>>>>
>>>>> +static bool dirty_bitmap_is_active(void *opaque)
>>>>> +{
>>>>> +    return dirty_bitmap_mig_state.migration_enable == 1;
>>>>> +}
>>>>> +
>>>>
>>>> OK.
>>>>
>>>>> +static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
>>>>> +{
>>>>> +    init_dirty_bitmap_migration(f);
>>>>> +
>>>>> +    qemu_mutex_lock_iothread();
>>>>> +    /* start track dirtyness of dirty bitmaps */
>>>>> +    set_dirty_tracking();
>>>>> +    qemu_mutex_unlock_iothread();
>>>>> +
>>>>> +    blk_mig_reset_dirty_cursor();
>>>>> +    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
>>>>> +
>>>>> +    return 0;
>>>>> +}
>>>>> +
>>>>
>>>> OK; see dirty_bitmap_mig_init below, though.
>>>>
>>>>> +static SaveVMHandlers savevm_block_handlers = {
>>>>> +    .set_params = dirty_bitmap_set_params,
>>>>> +    .save_live_setup = dirty_bitmap_save_setup,
>>>>> +    .save_live_iterate = dirty_bitmap_save_iterate,
>>>>> +    .save_live_complete = dirty_bitmap_save_complete,
>>>>> +    .save_live_pending = dirty_bitmap_save_pending,
>>>>> +    .load_state = dirty_bitmap_load,
>>>>> +    .cancel = dirty_bitmap_migration_cancel,
>>>>> +    .is_active = dirty_bitmap_is_active,
>>>>> +};
>>>>> +
>>>>> +void dirty_bitmap_mig_init(void)
>>>>> +{
>>>>> +    QSIMPLEQ_INIT(&dirty_bitmap_mig_state.dbms_list);
>>>>
>>>> Maybe I haven't looked thoroughly enough yet, but it's weird that part
>>>> of the dirty_bitmap_mig_state is initialized here, and the rest of it
>>>> in init_dirty_bitmap_migration. I'd prefer to keep it all together, if
>>>> possible.
>>> dirty_bitmap_mig_init is called one time when qemu starts. QSIMPLEQ_INIT
>>> should be called once. dirty_bitmap_save_setup is called on every
>>> migration start, it's like 'reinitialize'.
>>>>
>>>>> +
>>>>> +    register_savevm_live(NULL, "dirty-bitmap", 0, 1,
>>>>> &savevm_block_handlers,
>>>>> +                         &dirty_bitmap_mig_state);
>>>>> +}
>>>>
>>>> OK.
>>>>
>>>>> diff --git a/vl.c b/vl.c
>>>>> index a824a7d..dee7220 100644
>>>>> --- a/vl.c
>>>>> +++ b/vl.c
>>>>> @@ -4184,6 +4184,7 @@ int main(int argc, char **argv, char **envp)
>>>>>
>>>>>       blk_mig_init();
>>>>>       ram_mig_init();
>>>>> +    dirty_bitmap_mig_init();
>>>>>
>>>>>       /* If the currently selected machine wishes to override the
>>>>> units-per-bus
>>>>>        * property of its default HBA interface type, do so now. */
>>>>>
>>>>
>>>> Hm, since dirty bitmaps are a sub-component of the block layer, would
>>>> it not make sense to put this hook under blk_mig_init, perhaps?
>>> IMHO the reason to put it here is to keep all
>>> register_savevm_live-entities in one place.
>>
>> If you still feel that way I won't withhold my R-b, but there are
>> already other cases such as ppc_spapr_init which are not in this
>> general area of vl.c.
>>
>> Plus the dozens of devices that use register_savevm as a wrapper to
>> register_savevm_live, so maybe consolidating calls to this function
>> isn't that important.
> Hm, I've missed it, ppc_spapr_init is not here, yes.. Another thing
> here: dirty bitmaps migration are separate from blk migration. And it
> may be used without blk migration (nbd+mirrow migration may be used)..
> Is it ok to connect dirty bitmaps migration to blk_mig_init, which is
> located in migration/block.c, which may not be used at all when we
> bitmaps are migrated using migration/dirty-bitmap.c?
> In other words, yes, dirty bitmaps are a sub-component of the block
> layer, but dirty bitmap migration is not a sub-component of blk migration.
>>
>>>>
>>>>
>>>> Overall this looks very clean compared to the intermingled format in
>>>> V1, and the code is organized pretty well. Just a few minor comments,
>>>> and I'd like to get the opinion of the migration maintainers, but I am
>>>> happy. Sorry it took me so long to review, please feel free to let me
>>>> know if you disagree with any of my opinions :)
>>>>
>>>> Thank you,
>>>> --John
>>>
>>> Thank you for reviewing my series)
>>>
>>
>> Yup. Hopefully I didn't miss too much that will irritate the Migration
>> overlords.
>>
>> Once you respin on top of v12, I can run some thorough migration tests
>> on it (perhaps over a weekend) and verify that it survives a couple
>> hundred migrations without any kind of integrity loss.
> I hope I'll do it with all other things in about two days.
>>
>> This is what makes sense to me right now, anyway.
>>
>> Do you think you'll be including the bitmap checksum in the
>> BlockDirtyInfo command? That'd be convenient for iotests.
> Ok, will do. Good idea. Only two points:
> 1) Is it ok to include debug info into BlockDirtyInfo? Will users be
> happy with it?
> 2) When I was debugging my code, the information about dirty-regions was
> very useful. Now, all is working and checksums are enough for regression
> control.

I think leaving some tactical debug prints in the code, disabled, is 
perfectly fine from my personal standpoint. We're not really done 
implementing all of these features yet and they may yet be useful.

I'd vote for leaving in any non-QMP/QAPI debug information you wish, for 
now. We just have to be careful about interfaces -- no QMP/HMP commands.

As long as it looks reasonably tidy, I don't think it will be a problem.
Dr. David Alan Gilbert Feb. 16, 2015, 6:22 p.m. UTC | #9
A small request on this patch;  please make it migration/block-dirty-bitmap.c;
we've got enough bitmaps floating around the RAM code in migration that I
wouldn't want to confuse things.

Dave
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Vladimir Sementsov-Ogievskiy Feb. 17, 2015, 8:54 a.m. UTC | #10
On 16.02.2015 21:18, John Snow wrote:
>
>
> On 02/16/2015 07:06 AM, Vladimir Sementsov-Ogievskiy wrote:
>> On 13.02.2015 23:22, John Snow wrote:
>>>
>>>
>>> On 02/13/2015 03:19 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>> On 11.02.2015 00:33, John Snow wrote:

>>>> So in summary:
>>>> using device names is probably fine for now, as it matches the current
>>>> use case of bitmaps as well as drive migration; but using node names
>>>> may give us more power and precision later.
>>>>
>>>> I talked to Max about it, and he is leaning towards using device names
>>>> for now and switching to node names if we decide we want that power.
>>>>
>>>> (...I wonder if we could use a flag, for now, that says we're
>>>> including DEVICE names. Later, we could add a flag that says we're
>>>> using NODE names and add an option to toggle as the usage case sees 
>>>> fit.)
>>>>
>>>>
>>>> Are you confused yet? :D
>> O, thanks for the explanation). Are we really need this flag? As Markus
>> wrote, nodes and devices are sharing namespaces.. We can use
>> bdrv_lookup_bs(name, name, errp)..
>
> what 'name' are you using here, though? It looked to me like in your 
> backup routine we got a list of BDS entries and get the name *from* 
> the BDS, so we still have to think about how we want to /get/ the name.
>
>>
>> Also, we can, for example, send bitmaps as follows:
>>
>> if node has name - send bitmap with this name
>> if node is root, but hasn't name - send it with blk name
>> otherwise - don't send the bitmap
>
> The node a bitmap is attached to should always have a name -- it would 
> not be possible via the existing interface to attach it to a node 
> without a name.
>
> I *think* the root node should always have a name, but I am actually 
> less sure of that.
>
Hmm.. No? bitmap is attached using bdrv_lookup_bs(name, name, errp), 
which can find device with this name. qemu option -drive 
file=...,id=disk creates blk named 'disk' and attached node with no name.
John Snow Feb. 17, 2015, 6:45 p.m. UTC | #11
On 02/17/2015 03:54 AM, Vladimir Sementsov-Ogievskiy wrote:
> On 16.02.2015 21:18, John Snow wrote:
>>
>>
>> On 02/16/2015 07:06 AM, Vladimir Sementsov-Ogievskiy wrote:
>>> On 13.02.2015 23:22, John Snow wrote:
>>>>
>>>>
>>>> On 02/13/2015 03:19 AM, Vladimir Sementsov-Ogievskiy wrote:
>>>>> On 11.02.2015 00:33, John Snow wrote:
>
>>>>> So in summary:
>>>>> using device names is probably fine for now, as it matches the current
>>>>> use case of bitmaps as well as drive migration; but using node names
>>>>> may give us more power and precision later.
>>>>>
>>>>> I talked to Max about it, and he is leaning towards using device names
>>>>> for now and switching to node names if we decide we want that power.
>>>>>
>>>>> (...I wonder if we could use a flag, for now, that says we're
>>>>> including DEVICE names. Later, we could add a flag that says we're
>>>>> using NODE names and add an option to toggle as the usage case sees
>>>>> fit.)
>>>>>
>>>>>
>>>>> Are you confused yet? :D
>>> O, thanks for the explanation). Are we really need this flag? As Markus
>>> wrote, nodes and devices are sharing namespaces.. We can use
>>> bdrv_lookup_bs(name, name, errp)..
>>
>> what 'name' are you using here, though? It looked to me like in your
>> backup routine we got a list of BDS entries and get the name *from*
>> the BDS, so we still have to think about how we want to /get/ the name.
>>
>>>
>>> Also, we can, for example, send bitmaps as follows:
>>>
>>> if node has name - send bitmap with this name
>>> if node is root, but hasn't name - send it with blk name
>>> otherwise - don't send the bitmap
>>
>> The node a bitmap is attached to should always have a name -- it would
>> not be possible via the existing interface to attach it to a node
>> without a name.
>>
>> I *think* the root node should always have a name, but I am actually
>> less sure of that.
>>
> Hmm.. No? bitmap is attached using bdrv_lookup_bs(name, name, errp),
> which can find device with this name. qemu option -drive
> file=...,id=disk creates blk named 'disk' and attached node with no name.
>
>

Very good point -- We use the device name as a convenience shortcut to 
the first node. So the node a bitmap is attached to might in fact not 
have a name.

I see what you mean.

Hmm. So you propose, on the sending side:

(1) Use this node's name, if available
(2) Fall back to the backend/"device" name, if present,
(3) Do not send the bitmap if neither names are present (THIS scenario 
should be impossible to achieve and should trigger an error.)

What about on the receiving end? To which node(s) do we attach the 
bitmap? I assume:

(1) Try to find a node with this name and attach it there,
(2) Fall back to attaching it to the root node of a device/BB with this name
(3) If neither are present, abort.

Is that right? If so, it sounds serviceable to me, but keep in mind that 
bdrv_lookup_bs() on the receiving end will default to #2 first before #1.

Overall, this makes the migration very "fuzzy" in terms of which bitmaps 
will go where, and the onus is really on the user (or management tool) 
to keep trees compatibly similar for the purposes of bitmap migration.

I still wonder if making the exportation of names more explicit in terms 
of a flag ("this name is a node-name", "this name is a backend-name") 
might still be of use for providing more rigid and knowable migration 
semantics.

Might as well go ahead with your suggestion for now, and we'll see if 
anyone from migration or libvirt groups has a reason not to do it that 
way. I don't have any concrete reasons.

--js
Eric Blake Feb. 17, 2015, 7:12 p.m. UTC | #12
On 02/17/2015 11:45 AM, John Snow wrote:

>>>
>> Hmm.. No? bitmap is attached using bdrv_lookup_bs(name, name, errp),
>> which can find device with this name. qemu option -drive
>> file=...,id=disk creates blk named 'disk' and attached node with no name.
>>
>>
> 
> Very good point -- We use the device name as a convenience shortcut to
> the first node. So the node a bitmap is attached to might in fact not
> have a name.

Or, if we revisit Jeff's proposal to always auto-name every node, then
you'd never have a node without a name.  Ultimately, libvirt should be
using named nodes, but we're not there yet.

> 
> I see what you mean.
> 
> Hmm. So you propose, on the sending side:
> 
> (1) Use this node's name, if available
> (2) Fall back to the backend/"device" name, if present,
> (3) Do not send the bitmap if neither names are present (THIS scenario
> should be impossible to achieve and should trigger an error.)

Seems okay to me.

> 
> What about on the receiving end? To which node(s) do we attach the
> bitmap? I assume:
> 
> (1) Try to find a node with this name and attach it there,
> (2) Fall back to attaching it to the root node of a device/BB with this
> name
> (3) If neither are present, abort.
> 

That would also work. Since node names and device names share the same
namespace, it should always resolve to the correct node.

> Is that right? If so, it sounds serviceable to me, but keep in mind that
> bdrv_lookup_bs() on the receiving end will default to #2 first before #1.

Except that if a device is named "foo", then no node has that name, so
it is unambiguous that you meant the node attached to device "foo".

> 
> Overall, this makes the migration very "fuzzy" in terms of which bitmaps
> will go where, and the onus is really on the user (or management tool)
> to keep trees compatibly similar for the purposes of bitmap migration.
> 
> I still wonder if making the exportation of names more explicit in terms
> of a flag ("this name is a node-name", "this name is a backend-name")
> might still be of use for providing more rigid and knowable migration
> semantics.
> 
> Might as well go ahead with your suggestion for now, and we'll see if
> anyone from migration or libvirt groups has a reason not to do it that
> way. I don't have any concrete reasons.
> 
> --js
> 
> 
>
diff mbox

Patch

diff --git a/include/migration/block.h b/include/migration/block.h
index ffa8ac0..566bb9f 100644
--- a/include/migration/block.h
+++ b/include/migration/block.h
@@ -14,6 +14,7 @@ 
 #ifndef BLOCK_MIGRATION_H
 #define BLOCK_MIGRATION_H
 
+void dirty_bitmap_mig_init(void);
 void blk_mig_init(void);
 int blk_mig_active(void);
 uint64_t blk_mig_bytes_transferred(void);
diff --git a/migration/Makefile.objs b/migration/Makefile.objs
index d929e96..9adfda9 100644
--- a/migration/Makefile.objs
+++ b/migration/Makefile.objs
@@ -6,5 +6,5 @@  common-obj-y += xbzrle.o
 common-obj-$(CONFIG_RDMA) += rdma.o
 common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o
 
-common-obj-y += block.o
+common-obj-y += block.o dirty-bitmap.o
 
diff --git a/migration/dirty-bitmap.c b/migration/dirty-bitmap.c
new file mode 100644
index 0000000..8621218
--- /dev/null
+++ b/migration/dirty-bitmap.c
@@ -0,0 +1,606 @@ 
+/*
+ * QEMU dirty bitmap migration
+ *
+ * derived from migration/block.c
+ *
+ * Author:
+ * Sementsov-Ogievskiy Vladimir <vsementsov@parallels.com>
+ *
+ * original copyright message:
+ * =====================================================================
+ * Copyright IBM, Corp. 2009
+ *
+ * Authors:
+ *  Liran Schour   <lirans@il.ibm.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ *
+ * Contributions after 2012-01-13 are licensed under the terms of the
+ * GNU GPL, version 2 or (at your option) any later version.
+ * =====================================================================
+ */
+
+#include "block/block.h"
+#include "qemu/main-loop.h"
+#include "qemu/error-report.h"
+#include "migration/block.h"
+#include "migration/migration.h"
+#include "qemu/hbitmap.h"
+#include <assert.h>
+
+#define CHUNK_SIZE                       (1 << 20)
+
+#define DIRTY_BITMAP_MIG_FLAG_EOS           0x01
+#define DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK  0x02
+#define DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK    0x04
+#define DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME   0x08
+#define DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME   0x10
+#define DIRTY_BITMAP_MIG_FLAG_GRANULARITY   0x20
+#define DIRTY_BITMAP_MIG_FLAG_ENABLED       0x40
+/* flags should be <= 0xff */
+
+/* #define DEBUG_DIRTY_BITMAP_MIGRATION */
+
+#ifdef DEBUG_DIRTY_BITMAP_MIGRATION
+#define DPRINTF(fmt, ...) \
+    do { printf("dirty_migration: " fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+    do { } while (0)
+#endif
+
+typedef struct DirtyBitmapMigBitmapState {
+    /* Written during setup phase. */
+    BlockDriverState *bs;
+    BdrvDirtyBitmap *bitmap;
+    HBitmap *dirty_bitmap;
+    int64_t total_sectors;
+    uint64_t sectors_per_chunk;
+    QSIMPLEQ_ENTRY(DirtyBitmapMigBitmapState) entry;
+
+    /* For bulk phase. */
+    bool bulk_completed;
+    int64_t cur_sector;
+    bool granularity_sent;
+
+    /* For dirty phase. */
+    int64_t cur_dirty;
+} DirtyBitmapMigBitmapState;
+
+typedef struct DirtyBitmapMigState {
+    int migration_enable;
+    QSIMPLEQ_HEAD(dbms_list, DirtyBitmapMigBitmapState) dbms_list;
+
+    bool bulk_completed;
+
+    /* for send_bitmap() */
+    BlockDriverState *prev_bs;
+    BdrvDirtyBitmap *prev_bitmap;
+} DirtyBitmapMigState;
+
+static DirtyBitmapMigState dirty_bitmap_mig_state;
+
+/* read name from qemu file:
+ * format:
+ * 1 byte : len = name length (<256)
+ * len bytes : name without last zero byte
+ *
+ * name should point to the buffer >= 256 bytes length
+ */
+static char *qemu_get_name(QEMUFile *f, char *name)
+{
+    int len = qemu_get_byte(f);
+    qemu_get_buffer(f, (uint8_t *)name, len);
+    name[len] = '\0';
+
+    DPRINTF("get name: %d %s\n", len, name);
+
+    return name;
+}
+
+/* write name to qemu file:
+ * format:
+ * same as for qemu_get_name
+ *
+ * maximum name length is 255
+ */
+static void qemu_put_name(QEMUFile *f, const char *name)
+{
+    int len = strlen(name);
+
+    DPRINTF("put name: %d %s\n", len, name);
+
+    assert(len < 256);
+    qemu_put_byte(f, len);
+    qemu_put_buffer(f, (const uint8_t *)name, len);
+}
+
+static void send_bitmap(QEMUFile *f, DirtyBitmapMigBitmapState *dbms,
+                        uint64_t start_sector, uint32_t nr_sectors)
+{
+    BlockDriverState *bs = dbms->bs;
+    BdrvDirtyBitmap *bitmap = dbms->bitmap;
+    uint8_t flags = 0;
+    /* align for buffer_is_zero() */
+    uint64_t align = 4 * sizeof(long);
+    uint64_t buf_size =
+        (bdrv_dirty_bitmap_data_size(bitmap, nr_sectors) + align - 1) &
+        ~(align - 1);
+    uint8_t *buf = g_malloc0(buf_size);
+
+    bdrv_dirty_bitmap_serialize_part(bitmap, buf, start_sector, nr_sectors);
+
+    if (buffer_is_zero(buf, buf_size)) {
+        g_free(buf);
+        buf = NULL;
+        flags |= DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK;
+    } else {
+        flags |= DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK;
+    }
+
+    if (bs != dirty_bitmap_mig_state.prev_bs) {
+        dirty_bitmap_mig_state.prev_bs = bs;
+        flags |= DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME;
+    }
+
+    if (bitmap != dirty_bitmap_mig_state.prev_bitmap) {
+        dirty_bitmap_mig_state.prev_bitmap = bitmap;
+        flags |= DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME;
+    }
+
+    if (dbms->granularity_sent == 0) {
+        dbms->granularity_sent = 1;
+        flags |= DIRTY_BITMAP_MIG_FLAG_GRANULARITY;
+    }
+
+    DPRINTF("Enter send_bitmap"
+            "\n   flags:        %x"
+            "\n   start_sector: %" PRIu64
+            "\n   nr_sectors:   %" PRIu32
+            "\n   data_size:    %" PRIu64 "\n",
+            flags, start_sector, nr_sectors, buf_size);
+
+    qemu_put_byte(f, flags);
+
+    if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
+        qemu_put_name(f, bdrv_get_device_name(bs));
+    }
+
+    if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
+        qemu_put_name(f, bdrv_dirty_bitmap_name(bitmap));
+
+        if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
+            qemu_put_be64(f, bdrv_dirty_bitmap_granularity(bs, bitmap));
+        }
+    } else {
+        assert(!(flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY));
+    }
+
+    qemu_put_be64(f, start_sector);
+    qemu_put_be32(f, nr_sectors);
+
+    /* if a block is zero we need to flush here since the network
+     * bandwidth is now a lot higher than the storage device bandwidth.
+     * thus if we queue zero blocks we slow down the migration.
+     * also, skip writing block when migrate only dirty bitmaps. */
+    if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
+        qemu_fflush(f);
+        return;
+    }
+
+    qemu_put_be64(f, buf_size);
+    qemu_put_buffer(f, buf, buf_size);
+    g_free(buf);
+}
+
+
+/* Called with iothread lock taken.  */
+
+static void set_dirty_tracking(void)
+{
+    DirtyBitmapMigBitmapState *dbms;
+
+    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
+        dbms->dirty_bitmap =
+            bdrv_create_dirty_dirty_bitmap(dbms->bitmap, CHUNK_SIZE);
+    }
+}
+
+static void unset_dirty_tracking(void)
+{
+    DirtyBitmapMigBitmapState *dbms;
+
+    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
+        bdrv_release_dirty_dirty_bitmap(dbms->bitmap);
+    }
+}
+
+static void init_dirty_bitmap_migration(QEMUFile *f)
+{
+    BlockDriverState *bs;
+    BdrvDirtyBitmap *bitmap;
+    DirtyBitmapMigBitmapState *dbms;
+
+    dirty_bitmap_mig_state.bulk_completed = false;
+    dirty_bitmap_mig_state.prev_bs = NULL;
+    dirty_bitmap_mig_state.prev_bitmap = NULL;
+
+    for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) {
+        for (bitmap = bdrv_next_dirty_bitmap(bs, NULL); bitmap;
+             bitmap = bdrv_next_dirty_bitmap(bs, bitmap)) {
+            if (!bdrv_dirty_bitmap_name(bitmap)) {
+                continue;
+            }
+
+            dbms = g_new0(DirtyBitmapMigBitmapState, 1);
+            dbms->bs = bs;
+            dbms->bitmap = bitmap;
+            dbms->total_sectors = bdrv_nb_sectors(bs);
+            dbms->sectors_per_chunk = CHUNK_SIZE * 8 *
+                bdrv_dirty_bitmap_granularity(dbms->bs, dbms->bitmap)
+                >> BDRV_SECTOR_BITS;
+
+            QSIMPLEQ_INSERT_TAIL(&dirty_bitmap_mig_state.dbms_list,
+                                 dbms, entry);
+        }
+    }
+}
+
+/* Called with no lock taken.  */
+static void bulk_phase_send_chunk(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
+{
+    uint32_t nr_sectors = MIN(dbms->total_sectors - dbms->cur_sector,
+                             dbms->sectors_per_chunk);
+
+    send_bitmap(f, dbms, dbms->cur_sector, nr_sectors);
+
+    dbms->cur_sector += nr_sectors;
+    if (dbms->cur_sector >= dbms->total_sectors) {
+        dbms->bulk_completed = true;
+    }
+}
+
+/* Called with no lock taken.  */
+static void bulk_phase(QEMUFile *f, bool limit)
+{
+    DirtyBitmapMigBitmapState *dbms;
+
+    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
+        while (!dbms->bulk_completed) {
+            bulk_phase_send_chunk(f, dbms);
+            if (limit && qemu_file_rate_limit(f)) {
+                return;
+            }
+        }
+    }
+
+    dirty_bitmap_mig_state.bulk_completed = true;
+}
+
+static void blk_mig_reset_dirty_cursor(void)
+{
+    DirtyBitmapMigBitmapState *dbms;
+
+    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
+        dbms->cur_dirty = 0;
+    }
+}
+
+/* Called with iothread lock taken.  */
+static void dirty_phase_send_chunk(QEMUFile *f, DirtyBitmapMigBitmapState *dbms)
+{
+    uint32_t nr_sectors;
+
+    while (dbms->cur_dirty < dbms->total_sectors &&
+           !hbitmap_get(dbms->dirty_bitmap, dbms->cur_dirty)) {
+        dbms->cur_dirty += dbms->sectors_per_chunk;
+    }
+
+    if (dbms->cur_dirty >= dbms->total_sectors) {
+        return;
+    }
+
+    nr_sectors = MIN(dbms->total_sectors - dbms->cur_dirty,
+                     dbms->sectors_per_chunk);
+    send_bitmap(f, dbms, dbms->cur_dirty, nr_sectors);
+    hbitmap_reset(dbms->dirty_bitmap, dbms->cur_dirty, dbms->sectors_per_chunk);
+    dbms->cur_dirty += nr_sectors;
+}
+
+/* Called with iothread lock taken.
+ *
+ * return value:
+ * 0: too much data for max_downtime
+ * 1: few enough data for max_downtime
+*/
+static void dirty_phase(QEMUFile *f, bool limit)
+{
+    DirtyBitmapMigBitmapState *dbms;
+
+    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
+        while (dbms->cur_dirty < dbms->total_sectors) {
+            dirty_phase_send_chunk(f, dbms);
+            if (limit && qemu_file_rate_limit(f)) {
+                return;
+            }
+        }
+    }
+}
+
+
+/* Called with iothread lock taken.  */
+static void dirty_bitmap_mig_cleanup(void)
+{
+    DirtyBitmapMigBitmapState *dbms;
+
+    unset_dirty_tracking();
+
+    while ((dbms = QSIMPLEQ_FIRST(&dirty_bitmap_mig_state.dbms_list)) != NULL) {
+        QSIMPLEQ_REMOVE_HEAD(&dirty_bitmap_mig_state.dbms_list, entry);
+        g_free(dbms);
+    }
+}
+
+static void dirty_bitmap_migration_cancel(void *opaque)
+{
+    dirty_bitmap_mig_cleanup();
+}
+
+static int dirty_bitmap_save_iterate(QEMUFile *f, void *opaque)
+{
+    DPRINTF("Enter save live iterate\n");
+
+    blk_mig_reset_dirty_cursor();
+
+    if (dirty_bitmap_mig_state.bulk_completed) {
+        qemu_mutex_lock_iothread();
+        dirty_phase(f, true);
+        qemu_mutex_unlock_iothread();
+    } else {
+        bulk_phase(f, true);
+    }
+
+    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
+
+    return dirty_bitmap_mig_state.bulk_completed;
+}
+
+/* Called with iothread lock taken.  */
+
+static int dirty_bitmap_save_complete(QEMUFile *f, void *opaque)
+{
+    DirtyBitmapMigBitmapState *dbms;
+    DPRINTF("Enter save live complete\n");
+
+    if (!dirty_bitmap_mig_state.bulk_completed) {
+        bulk_phase(f, false);
+    }
+
+    blk_mig_reset_dirty_cursor();
+    dirty_phase(f, false);
+
+    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
+        uint8_t flags = DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME |
+                        DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME |
+                        DIRTY_BITMAP_MIG_FLAG_ENABLED;
+
+        qemu_put_byte(f, flags);
+        qemu_put_name(f, bdrv_get_device_name(dbms->bs));
+        qemu_put_name(f, bdrv_dirty_bitmap_name(dbms->bitmap));
+        qemu_put_byte(f, bdrv_dirty_bitmap_enabled(dbms->bitmap));
+    }
+
+    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
+
+    DPRINTF("Dirty bitmaps migration completed\n");
+
+    dirty_bitmap_mig_cleanup();
+    return 0;
+}
+
+static uint64_t dirty_bitmap_save_pending(QEMUFile *f, void *opaque,
+                                          uint64_t max_size)
+{
+    DirtyBitmapMigBitmapState *dbms;
+    uint64_t pending = 0;
+
+    qemu_mutex_lock_iothread();
+
+    QSIMPLEQ_FOREACH(dbms, &dirty_bitmap_mig_state.dbms_list, entry) {
+        uint64_t sectors = hbitmap_count(dbms->dirty_bitmap);
+        if (!dbms->bulk_completed) {
+            sectors += dbms->total_sectors - dbms->cur_sector;
+        }
+        pending += bdrv_dirty_bitmap_data_size(dbms->bitmap, sectors);
+    }
+
+    qemu_mutex_unlock_iothread();
+
+    DPRINTF("Enter save live pending %" PRIu64 ", max: %" PRIu64 "\n",
+            pending, max_size);
+    return pending;
+}
+
+static int dirty_bitmap_load(QEMUFile *f, void *opaque, int version_id)
+{
+    int flags;
+
+    static char device_name[256], bitmap_name[256];
+    static BlockDriverState *bs;
+    static BdrvDirtyBitmap *bitmap;
+
+    uint8_t *buf;
+    uint64_t first_sector;
+    uint32_t  nr_sectors;
+    int ret;
+
+    DPRINTF("load start\n");
+
+    do {
+        flags = qemu_get_byte(f);
+        DPRINTF("flags: %x\n", flags);
+
+        if (flags & DIRTY_BITMAP_MIG_FLAG_DEVICE_NAME) {
+            qemu_get_name(f, device_name);
+            bs = bdrv_find(device_name);
+            if (!bs) {
+                fprintf(stderr, "Error: unknown block device '%s'\n",
+                        device_name);
+                return -EINVAL;
+            }
+        }
+
+        if (flags & DIRTY_BITMAP_MIG_FLAG_BITMAP_NAME) {
+            if (!bs) {
+                fprintf(stderr, "Error: block device name is not set\n");
+                return -EINVAL;
+            }
+
+            qemu_get_name(f, bitmap_name);
+            bitmap = bdrv_find_dirty_bitmap(bs, bitmap_name);
+            if (flags & DIRTY_BITMAP_MIG_FLAG_GRANULARITY) {
+                /* First chunk from this bitmap */
+                uint64_t granularity = qemu_get_be64(f);
+                if (!bitmap) {
+                    Error *local_err = NULL;
+                    bitmap = bdrv_create_dirty_bitmap(bs, granularity,
+                                                      bitmap_name,
+                                                      &local_err);
+                    if (!bitmap) {
+                        error_report("%s", error_get_pretty(local_err));
+                        error_free(local_err);
+                        return -EINVAL;
+                    }
+                } else {
+                    uint64_t dest_granularity =
+                        bdrv_dirty_bitmap_granularity(bs, bitmap);
+                    if (dest_granularity != granularity) {
+                        fprintf(stderr,
+                                "Error: "
+                                "Migrated bitmap granularity (%" PRIu64 ") "
+                                "is not match with destination bitmap '%s' "
+                                "granularity (%" PRIu64 ")\n",
+                                granularity,
+                                bitmap_name,
+                                dest_granularity);
+                        return -EINVAL;
+                    }
+                }
+                bdrv_disable_dirty_bitmap(bitmap);
+            }
+            if (!bitmap) {
+                fprintf(stderr, "Error: unknown dirty bitmap "
+                        "'%s' for block device '%s'\n",
+                        bitmap_name, device_name);
+                return -EINVAL;
+            }
+        }
+
+        if (flags & DIRTY_BITMAP_MIG_FLAG_ENABLED) {
+            bool enabled;
+            if (!bitmap) {
+                fprintf(stderr, "Error: dirty bitmap name is not set\n");
+                return -EINVAL;
+            }
+            bdrv_dirty_bitmap_deserialize_finish(bitmap);
+            /* complete migration */
+            enabled = qemu_get_byte(f);
+            if (enabled) {
+                bdrv_enable_dirty_bitmap(bitmap);
+            }
+        }
+
+        if (flags & (DIRTY_BITMAP_MIG_FLAG_NORMAL_CHUNK |
+                     DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK)) {
+            if (!bs) {
+                fprintf(stderr, "Error: block device name is not set\n");
+                return -EINVAL;
+            }
+            if (!bitmap) {
+                fprintf(stderr, "Error: dirty bitmap name is not set\n");
+                return -EINVAL;
+            }
+
+            first_sector = qemu_get_be64(f);
+            nr_sectors = qemu_get_be32(f);
+            DPRINTF("chunk: %lu %u\n", first_sector, nr_sectors);
+
+
+            if (flags & DIRTY_BITMAP_MIG_FLAG_ZERO_CHUNK) {
+                bdrv_dirty_bitmap_deserialize_part0(bitmap, first_sector,
+                                                    nr_sectors);
+            } else {
+                uint64_t buf_size = qemu_get_be64(f);
+                uint64_t needed_size =
+                    bdrv_dirty_bitmap_data_size(bitmap, nr_sectors);
+
+                if (needed_size > buf_size) {
+                    fprintf(stderr,
+                            "Error: Migrated bitmap granularity is not "
+                            "match with destination bitmap granularity\n");
+                    return -EINVAL;
+                }
+
+                buf = g_malloc(buf_size);
+                qemu_get_buffer(f, buf, buf_size);
+                bdrv_dirty_bitmap_deserialize_part(bitmap, buf,
+                                                   first_sector,
+                                                   nr_sectors);
+                g_free(buf);
+            }
+        }
+
+        ret = qemu_file_get_error(f);
+        if (ret != 0) {
+            return ret;
+        }
+    } while (!(flags & DIRTY_BITMAP_MIG_FLAG_EOS));
+
+    DPRINTF("load finish\n");
+    return 0;
+}
+
+static void dirty_bitmap_set_params(const MigrationParams *params, void *opaque)
+{
+    dirty_bitmap_mig_state.migration_enable = params->dirty;
+}
+
+static bool dirty_bitmap_is_active(void *opaque)
+{
+    return dirty_bitmap_mig_state.migration_enable == 1;
+}
+
+static int dirty_bitmap_save_setup(QEMUFile *f, void *opaque)
+{
+    init_dirty_bitmap_migration(f);
+
+    qemu_mutex_lock_iothread();
+    /* start track dirtyness of dirty bitmaps */
+    set_dirty_tracking();
+    qemu_mutex_unlock_iothread();
+
+    blk_mig_reset_dirty_cursor();
+    qemu_put_byte(f, DIRTY_BITMAP_MIG_FLAG_EOS);
+
+    return 0;
+}
+
+static SaveVMHandlers savevm_block_handlers = {
+    .set_params = dirty_bitmap_set_params,
+    .save_live_setup = dirty_bitmap_save_setup,
+    .save_live_iterate = dirty_bitmap_save_iterate,
+    .save_live_complete = dirty_bitmap_save_complete,
+    .save_live_pending = dirty_bitmap_save_pending,
+    .load_state = dirty_bitmap_load,
+    .cancel = dirty_bitmap_migration_cancel,
+    .is_active = dirty_bitmap_is_active,
+};
+
+void dirty_bitmap_mig_init(void)
+{
+    QSIMPLEQ_INIT(&dirty_bitmap_mig_state.dbms_list);
+
+    register_savevm_live(NULL, "dirty-bitmap", 0, 1, &savevm_block_handlers,
+                         &dirty_bitmap_mig_state);
+}
diff --git a/vl.c b/vl.c
index a824a7d..dee7220 100644
--- a/vl.c
+++ b/vl.c
@@ -4184,6 +4184,7 @@  int main(int argc, char **argv, char **envp)
 
     blk_mig_init();
     ram_mig_init();
+    dirty_bitmap_mig_init();
 
     /* If the currently selected machine wishes to override the units-per-bus
      * property of its default HBA interface type, do so now. */