diff mbox

[v5,27/45] Postcopy: Maintain sentmap and calculate discard

Message ID 1424883128-9841-28-git-send-email-dgilbert@redhat.com
State New
Headers show

Commit Message

Dr. David Alan Gilbert Feb. 25, 2015, 4:51 p.m. UTC
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

Where postcopy is preceded by a period of precopy, the destination will
have received pages that may have been dirtied on the source after the
page was sent.  The destination must throw these pages away before
starting its CPUs.

Maintain a 'sentmap' of pages that have already been sent.
Calculate list of sent & dirty pages
Provide helpers on the destination side to discard these.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 arch_init.c                      | 275 ++++++++++++++++++++++++++++++++++++++-
 include/migration/migration.h    |  12 ++
 include/migration/postcopy-ram.h |  34 +++++
 include/qemu/typedefs.h          |   1 +
 migration/migration.c            |   1 +
 migration/postcopy-ram.c         | 111 ++++++++++++++++
 savevm.c                         |   3 -
 trace-events                     |   4 +
 8 files changed, 435 insertions(+), 6 deletions(-)

Comments

David Gibson March 23, 2015, 3:30 a.m. UTC | #1
On Wed, Feb 25, 2015 at 04:51:50PM +0000, Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> 
> Where postcopy is preceeded by a period of precopy, the destination will
> have received pages that may have been dirtied on the source after the
> page was sent.  The destination must throw these pages away before
> starting it's CPUs.
> 
> Maintain a 'sentmap' of pages that have already been sent.
> Calculate list of sent & dirty pages
> Provide helpers on the destination side to discard these.
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  arch_init.c                      | 275 ++++++++++++++++++++++++++++++++++++++-
>  include/migration/migration.h    |  12 ++
>  include/migration/postcopy-ram.h |  34 +++++
>  include/qemu/typedefs.h          |   1 +
>  migration/migration.c            |   1 +
>  migration/postcopy-ram.c         | 111 ++++++++++++++++
>  savevm.c                         |   3 -
>  trace-events                     |   4 +
>  8 files changed, 435 insertions(+), 6 deletions(-)
> 
> diff --git a/arch_init.c b/arch_init.c
> index 7bc5fa6..21e7ebe 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -40,6 +40,7 @@
>  #include "hw/audio/audio.h"
>  #include "sysemu/kvm.h"
>  #include "migration/migration.h"
> +#include "migration/postcopy-ram.h"
>  #include "hw/i386/smbios.h"
>  #include "exec/address-spaces.h"
>  #include "hw/audio/pcspk.h"
> @@ -414,9 +415,17 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
>      return bytes_sent;
>  }
>  
> +/* mr: The region to search for dirty pages in
> + * start: Start address (typically so we can continue from previous page)
> + * ram_addr_abs: Pointer into which to store the address of the dirty page
> + *               within the global ram_addr space
> + *
> + * Returns: byte offset within memory region of the start of a dirty page
> + */
>  static inline
>  ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
> -                                                 ram_addr_t start)
> +                                                 ram_addr_t start,
> +                                                 ram_addr_t *ram_addr_abs)
>  {
>      unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS;
>      unsigned long nr = base + (start >> TARGET_PAGE_BITS);
> @@ -435,6 +444,7 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
>          clear_bit(next, migration_bitmap);
>          migration_dirty_pages--;
>      }
> +    *ram_addr_abs = next << TARGET_PAGE_BITS;
>      return (next - base) << TARGET_PAGE_BITS;
>  }
>  
> @@ -571,6 +581,19 @@ static void migration_bitmap_sync(void)
>      }
>  }
>  
> +static RAMBlock *ram_find_block(const char *id)
> +{
> +    RAMBlock *block;
> +
> +    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +        if (!strcmp(id, block->idstr)) {
> +            return block;
> +        }
> +    }
> +
> +    return NULL;
> +}
> +
>  /*
>   * ram_save_page: Send the given page to the stream
>   *
> @@ -659,13 +682,16 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
>      bool complete_round = false;
>      int bytes_sent = 0;
>      MemoryRegion *mr;
> +    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
> +                                 ram_addr_t space */
>  
>      if (!block)
>          block = QTAILQ_FIRST(&ram_list.blocks);
>  
>      while (true) {
>          mr = block->mr;
> -        offset = migration_bitmap_find_and_reset_dirty(mr, offset);
> +        offset = migration_bitmap_find_and_reset_dirty(mr, offset,
> +                                                       &dirty_ram_abs);
>          if (complete_round && block == last_seen_block &&
>              offset >= last_offset) {
>              break;
> @@ -683,6 +709,11 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
>  
>              /* if page is unmodified, continue to the next */
>              if (bytes_sent > 0) {
> +                MigrationState *ms = migrate_get_current();
> +                if (ms->sentmap) {
> +                    set_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap);
> +                }
> +
>                  last_sent_block = block;
>                  break;
>              }
> @@ -742,12 +773,19 @@ void free_xbzrle_decoded_buf(void)
>  
>  static void migration_end(void)
>  {
> +    MigrationState *s = migrate_get_current();
> +
>      if (migration_bitmap) {
>          memory_global_dirty_log_stop();
>          g_free(migration_bitmap);
>          migration_bitmap = NULL;
>      }
>  
> +    if (s->sentmap) {
> +        g_free(s->sentmap);
> +        s->sentmap = NULL;
> +    }
> +
>      XBZRLE_cache_lock();
>      if (XBZRLE.cache) {
>          cache_fini(XBZRLE.cache);
> @@ -815,6 +853,232 @@ void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
>      }
>  }
>  
> +/* **** functions for postcopy ***** */
> +
> +/*
> + * A helper to get 32 bits from a bit map; trivial for HOST_LONG_BITS=32
> + * messier for 64; the bitmaps are actually long's that are 32 or 64bit
> + */
> +static uint32_t get_32bits_map(unsigned long *map, int64_t start)
> +{
> +#if HOST_LONG_BITS == 64
> +    uint64_t tmp64;
> +
> +    tmp64 = map[start / 64];
> +    return (start & 32) ? (tmp64 >> 32) : (tmp64 & 0xffffffffu);
> +#elif HOST_LONG_BITS == 32
> +    /*
> +     * Irrespective of host endianness, sentmap[n] is for pages earlier
> +     * than sentmap[n+1] so we can't just cast up
> +     */
> +    return map[start / 32];
> +#else
> +#error "Host long other than 64/32 not supported"
> +#endif
> +}
> +
> +/*
> + * A helper to put 32 bits into a bit map; trivial for HOST_LONG_BITS=32
> + * messier for 64; the bitmaps are actually long's that are 32 or 64bit
> + */
> +__attribute__ (( unused )) /* Until later in patch series */
> +static void put_32bits_map(unsigned long *map, int64_t start,
> +                           uint32_t v)
> +{
> +#if HOST_LONG_BITS == 64
> +    uint64_t tmp64 = v;
> +    uint64_t mask = 0xffffffffu;
> +
> +    if (start & 32) {
> +        tmp64 = tmp64 << 32;
> +        mask =  mask << 32;
> +    }
> +
> +    map[start / 64] = (map[start / 64] & ~mask) | tmp64;
> +#elif HOST_LONG_BITS == 32
> +    /*
> +     * Irrespective of host endianness, sentmap[n] is for pages earlier
> +     * than sentmap[n+1] so we can't just cast up
> +     */
> +    map[start / 32] = v;
> +#else
> +#error "Host long other than 64/32 not supported"
> +#endif
> +}
> +
> +/*
> + * When working on 32bit chunks of a bitmap where the only valid section
> + * is between start..end (inclusive), generate a mask with only those
> + * valid bits set for the current 32bit word within that bitmask.
> + */
> +static int make_32bit_mask(unsigned long start, unsigned long end,
> +                           unsigned long cur32)
> +{
> +    unsigned long first32, last32;
> +    uint32_t mask = ~(uint32_t)0;
> +    first32 = start / 32;
> +    last32 = end / 32;
> +
> +    if ((cur32 == first32) && (start & 31)) {
> +        /* e.g. (start & 31) = 3
> +         *         1 << .    -> 2^3
> +         *         . - 1     -> 2^3 - 1 i.e. mask 2..0
> +         *         ~.        -> mask 31..3
> +         */
> +        mask &= ~((((uint32_t)1) << (start & 31)) - 1);
> +    }
> +
> +    if ((cur32 == last32) && ((end & 31) != 31)) {
> +        /* e.g. (end & 31) = 3
> +         *            .   +1 -> 4
> +         *         1 << .    -> 2^4
> +         *         . -1      -> 2^4 - 1
> +         *                   = mask set 3..0
> +         */
> +        mask &= (((uint32_t)1) << ((end & 31) + 1)) - 1;
> +    }
> +
> +    return mask;
> +}

Urgh.. looks correct, but it continues to make me question whether this is a
sensible wire encoding of the discard map.

> +
> +/*
> + * Callback from ram_postcopy_each_ram_discard for each RAMBlock
> + * start,end: Indexes into the bitmap for the first and last bit
> + *            representing the named block
> + */
> +static int pc_send_discard_bm_ram(MigrationState *ms,
> +                                  PostcopyDiscardState *pds,
> +                                  unsigned long start, unsigned long end)

I know it's a long name, but I'd prefer to see this called
"postcopy_".  I have to keep reminding myself that "pc" means
"postcopy" here, not that it's a PC machine type specific function.

> +{
> +    /*
> +     * There is no guarantee that start, end are on convenient 32bit multiples
> +     * (We always send 32bit chunks over the wire, irrespective of long size)
> +     */
> +    unsigned long first32, last32, cur32;
> +    first32 = start / 32;
> +    last32 = end / 32;
> +
> +    for (cur32 = first32; cur32 <= last32; cur32++) {
> +        /* Deal with start/end not on alignment */
> +        uint32_t mask = make_32bit_mask(start, end, cur32);
> +
> +        uint32_t data = get_32bits_map(ms->sentmap, cur32 * 32);
> +        data &= mask;
> +
> +        if (data) {
> +            postcopy_discard_send_chunk(ms, pds, (cur32-first32) * 32, data);
> +        }
> +    }
> +
> +    return 0;
> +}


> +/*
> + * Utility for the outgoing postcopy code.
> + *   Calls postcopy_send_discard_bm_ram for each RAMBlock
> + *   passing it bitmap indexes and name.
> + * Returns: 0 on success
> + * (qemu_ram_foreach_block ends up passing unscaled lengths
> + *  which would mean postcopy code would have to deal with target page)
> + */
> +static int pc_each_ram_discard(MigrationState *ms)

s/discard/send_discard/

> +{
> +    struct RAMBlock *block;
> +    int ret;
> +
> +    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +        unsigned long first = block->offset >> TARGET_PAGE_BITS;
> +        unsigned long last = (block->offset + (block->max_length-1))
> +                                >> TARGET_PAGE_BITS;
> +        PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
> +                                                               first & 31,
> +                                                               block->idstr);
> +
> +        /*
> +         * Postcopy sends chunks of bitmap over the wire, but it
> +         * just needs indexes at this point, avoids it having
> +         * target page specific code.
> +         */
> +        ret = pc_send_discard_bm_ram(ms, pds, first, last);
> +        postcopy_discard_send_finish(ms, pds);
> +        if (ret) {
> +            return ret;
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * Transmit the set of pages to be discarded after precopy to the target
> + * these are pages that have been sent previously but have been dirtied
> + * Hopefully this is pretty sparse
> + */
> +int ram_postcopy_send_discard_bitmap(MigrationState *ms)
> +{
> +    /* This should be our last sync, the src is now paused */
> +    migration_bitmap_sync();
> +
> +    /*
> +     * Update the sentmap to be  sentmap&=dirty
> +     */
> +    bitmap_and(ms->sentmap, ms->sentmap, migration_bitmap,
> +               last_ram_offset() >> TARGET_PAGE_BITS);
> +
> +
> +    trace_ram_postcopy_send_discard_bitmap();
> +#ifdef DEBUG_POSTCOPY
> +    ram_debug_dump_bitmap(ms->sentmap, false);
> +#endif
> +
> +    return pc_each_ram_discard(ms);
> +}
> +
> +/*
> + * At the start of the postcopy phase of migration, any now-dirty
> + * precopied pages are discarded.
> + *
> + * start..end is an inclusive range of bits indexed in the source
> + *    VMs bitmap for this RAMBlock, source_target_page_bits tells
> + *    us what one of those bits represents.
> + *
> + * start/end are offsets from the start of the bitmap for RAMBlock 'block_name'
> + *
> + * Returns 0 on success.
> + */
> +int ram_discard_range(MigrationIncomingState *mis,
> +                      const char *block_name,
> +                      uint64_t start, uint64_t end)
> +{
> +    assert(end >= start);
> +
> +    RAMBlock *rb = ram_find_block(block_name);
> +
> +    if (!rb) {
> +        error_report("ram_discard_range: Failed to find block '%s'",
> +                     block_name);
> +        return -1;
> +    }
> +
> +    uint64_t index_offset = rb->offset >> TARGET_PAGE_BITS;
> +    postcopy_pmi_discard_range(mis, start + index_offset, (end - start) + 1);
> +
> +    /* +1 gives the byte after the end of the last page to be discarded */
> +    ram_addr_t end_offset = (end+1) << TARGET_PAGE_BITS;
> +    uint8_t *host_startaddr = rb->host + (start << TARGET_PAGE_BITS);
> +    uint8_t *host_endaddr;
> +
> +    if (end_offset <= rb->used_length) {
> +        host_endaddr   = rb->host + (end_offset-1);
> +        return postcopy_ram_discard_range(mis, host_startaddr, host_endaddr);
> +    } else {
> +        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
> +                     "/%" PRIu64 "/%zu)",
> +                     block_name, start, end, rb->used_length);
> +        return -1;
> +    }
> +}
> +
>  static int ram_save_setup(QEMUFile *f, void *opaque)
>  {
>      RAMBlock *block;
> @@ -854,7 +1118,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>  
>          acct_clear();
>      }
> -
>      qemu_mutex_lock_iothread();
>      qemu_mutex_lock_ramlist();
>      bytes_transferred = 0;
> @@ -864,6 +1127,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>      migration_bitmap = bitmap_new(ram_bitmap_pages);
>      bitmap_set(migration_bitmap, 0, ram_bitmap_pages);
>  
> +    if (migrate_postcopy_ram()) {
> +        MigrationState *s = migrate_get_current();
> +        s->sentmap = bitmap_new(ram_bitmap_pages);
> +        bitmap_clear(s->sentmap, 0, ram_bitmap_pages);
> +    }
> +
>      /*
>       * Count the total number of pages used by ram blocks not including any
>       * gaps due to alignment or unplugs.
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index 86200b9..e749f4c 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -125,6 +125,13 @@ struct MigrationState
>  
>      /* Flag set once the migration has been asked to enter postcopy */
>      bool start_postcopy;
> +
> +    /* bitmap of pages that have been sent at least once
> +     * only maintained and used in postcopy at the moment
> +     * where it's used to send the dirtymap at the start
> +     * of the postcopy phase
> +     */
> +    unsigned long *sentmap;
>  };
>  
>  void process_incoming_migration(QEMUFile *f);
> @@ -194,6 +201,11 @@ double xbzrle_mig_cache_miss_rate(void);
>  
>  void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
>  void ram_debug_dump_bitmap(unsigned long *todump, bool expected);
> +/* For outgoing discard bitmap */
> +int ram_postcopy_send_discard_bitmap(MigrationState *ms);
> +/* For incoming postcopy discard */
> +int ram_discard_range(MigrationIncomingState *mis, const char *block_name,
> +                      uint64_t start, uint64_t end);
>  
>  /**
>   * @migrate_add_blocker - prevent migration from proceeding
> diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h
> index e93ee8a..1fec1c1 100644
> --- a/include/migration/postcopy-ram.h
> +++ b/include/migration/postcopy-ram.h
> @@ -28,4 +28,38 @@ void postcopy_pmi_destroy(MigrationIncomingState *mis);
>  void postcopy_pmi_discard_range(MigrationIncomingState *mis,
>                                  size_t start, size_t npages);
>  void postcopy_pmi_dump(MigrationIncomingState *mis);
> +
> +/*
> + * Discard the contents of memory start..end inclusive.
> + * We can assume that if we've been called postcopy_ram_hosttest returned true
> + */
> +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
> +                               uint8_t *end);
> +
> +
> +/*
> + * Called at the start of each RAMBlock by the bitmap code
> + * offset is the bit within the first 32bit chunk of mask
> + * that represents the first page of the RAM Block
> + * Returns a new PDS
> + */
> +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
> +                                                 uint8_t offset,
> +                                                 const char *name);
> +
> +/*
> + * Called by the bitmap code for each chunk to discard
> + * May send a discard message, may just leave it queued to
> + * be sent later
> + */
> +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds,
> +                                unsigned long pos, uint32_t bitmap);
> +
> +/*
> + * Called at the end of each RAMBlock by the bitmap code
> + * Sends any outstanding discard messages, frees the PDS
> + */
> +void postcopy_discard_send_finish(MigrationState *ms,
> +                                  PostcopyDiscardState *pds);
> +
>  #endif
> diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
> index 924eeb6..0651275 100644
> --- a/include/qemu/typedefs.h
> +++ b/include/qemu/typedefs.h
> @@ -61,6 +61,7 @@ typedef struct PCIExpressHost PCIExpressHost;
>  typedef struct PCIHostState PCIHostState;
>  typedef struct PCMCIACardState PCMCIACardState;
>  typedef struct PixelFormat PixelFormat;
> +typedef struct PostcopyDiscardState PostcopyDiscardState;
>  typedef struct PostcopyPMI PostcopyPMI;
>  typedef struct PropertyInfo PropertyInfo;
>  typedef struct Property Property;
> diff --git a/migration/migration.c b/migration/migration.c
> index 6b20b56..850fe1a 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -22,6 +22,7 @@
>  #include "block/block.h"
>  #include "qemu/sockets.h"
>  #include "migration/block.h"
> +#include "migration/postcopy-ram.h"
>  #include "qemu/thread.h"
>  #include "qmp-commands.h"
>  #include "trace.h"
> diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
> index 4f29055..391e9c6 100644
> --- a/migration/postcopy-ram.c
> +++ b/migration/postcopy-ram.c
> @@ -28,6 +28,19 @@
>  #include "qemu/error-report.h"
>  #include "trace.h"
>  
> +#define MAX_DISCARDS_PER_COMMAND 12
> +
> +struct PostcopyDiscardState {
> +    const char *name;
> +    uint16_t cur_entry;
> +    uint64_t addrlist[MAX_DISCARDS_PER_COMMAND];
> +    uint32_t masklist[MAX_DISCARDS_PER_COMMAND];
> +    uint8_t  offset;  /* Offset within 32bit mask at addr0 representing 1st
> +                         page of block */
> +    unsigned int nsentwords;
> +    unsigned int nsentcmds;
> +};
> +
>  /* Postcopy needs to detect accesses to pages that haven't yet been copied
>   * across, and efficiently map new pages in, the techniques for doing this
>   * are target OS specific.
> @@ -364,6 +377,21 @@ out:
>      return ret;
>  }
>  
> +/*
> + * Discard the contents of memory start..end inclusive.
> + * We can assume that if we've been called postcopy_ram_hosttest returned true
> + */
> +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
> +                               uint8_t *end)
> +{
> +    if (madvise(start, (end-start)+1, MADV_DONTNEED)) {
> +        perror("postcopy_ram_discard_range MADV_DONTNEED");
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
>  #else
>  /* No target OS support, stubs just fail */
>  
> @@ -380,5 +408,88 @@ void postcopy_hook_early_receive(MigrationIncomingState *mis,
>      /* We don't support postcopy so don't care */
>  }
>  
> +void postcopy_pmi_destroy(MigrationIncomingState *mis)
> +{
> +    /* Called in normal cleanup path - so it's OK */
> +}
> +
> +void postcopy_pmi_discard_range(MigrationIncomingState *mis,
> +                                size_t start, size_t npages)
> +{
> +    assert(0);
> +}
> +
> +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
> +                               uint8_t *end)
> +{
> +    assert(0);
> +}
>  #endif
>  
> +/* ------------------------------------------------------------------------- */
> +
> +/*
> + * Called at the start of each RAMBlock by the bitmap code
> + * offset is the bit within the first 64bit chunk of mask
> + * that represents the first page of the RAM Block
> + * Returns a new PDS
> + */
> +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
> +                                                 uint8_t offset,
> +                                                 const char *name)
> +{
> +    PostcopyDiscardState *res = g_try_malloc(sizeof(PostcopyDiscardState));
> +
> +    if (res) {
> +        res->name = name;
> +        res->cur_entry = 0;
> +        res->nsentwords = 0;
> +        res->nsentcmds = 0;
> +        res->offset = offset;
> +    }
> +
> +    return res;
> +}
> +
> +/*
> + * Called by the bitmap code for each chunk to discard
> + * May send a discard message, may just leave it queued to
> + * be sent later
> + */
> +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds,
> +                                unsigned long pos, uint32_t bitmap)
> +{
> +    pds->addrlist[pds->cur_entry] = pos;
> +    pds->masklist[pds->cur_entry] = bitmap;
> +    pds->cur_entry++;
> +    pds->nsentwords++;
> +
> +    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
> +        /* Full set, ship it! */
> +        qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name,
> +                                              pds->cur_entry, pds->offset,
> +                                              pds->addrlist, pds->masklist);
> +        pds->nsentcmds++;
> +        pds->cur_entry = 0;
> +    }
> +}
> +
> +/*
> + * Called at the end of each RAMBlock by the bitmap code
> + * Sends any outstanding discard messages, frees the PDS
> + */
> +void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
> +{
> +    /* Anything unsent? */
> +    if (pds->cur_entry) {
> +        qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name,
> +                                              pds->cur_entry, pds->offset,
> +                                              pds->addrlist, pds->masklist);
> +        pds->nsentcmds++;
> +    }
> +
> +    trace_postcopy_discard_send_finish(pds->name, pds->nsentwords,
> +                                       pds->nsentcmds);
> +
> +    g_free(pds);
> +}
> diff --git a/savevm.c b/savevm.c
> index 1e8d289..2589b8c 100644
> --- a/savevm.c
> +++ b/savevm.c
> @@ -1282,15 +1282,12 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
>               * we know there must be at least 1 bit set due to the loop entry
>               * If there is no 0 firstzero will be 32
>               */
> -            /* TODO - ram_discard_range gets added in a later patch
>              int ret = ram_discard_range(mis, ramid,
>                                  startaddr + firstset - first_bit_offset,
>                                  startaddr + (firstzero - 1) - first_bit_offset);
> -            ret = -1;
>              if (ret) {
>                  return ret;
>              }
> -            */
>  
>              /* mask= .?0000000000 */
>              /*         ^fz ^fs    */
> diff --git a/trace-events b/trace-events
> index a555b56..f985117 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1217,6 +1217,7 @@ qemu_file_fclose(void) ""
>  migration_bitmap_sync_start(void) ""
>  migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
>  migration_throttle(void) ""
> +ram_postcopy_send_discard_bitmap(void) ""
>  
>  # hw/display/qxl.c
>  disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
> @@ -1478,6 +1479,9 @@ rdma_start_incoming_migration_after_rdma_listen(void) ""
>  rdma_start_outgoing_migration_after_rdma_connect(void) ""
>  rdma_start_outgoing_migration_after_rdma_source_init(void) ""
>  
> +# migration/postcopy-ram.c
> +postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands"
> +
>  # kvm-all.c
>  kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"
>  kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"
Dr. David Alan Gilbert March 23, 2015, 2:36 p.m. UTC | #2
* David Gibson (david@gibson.dropbear.id.au) wrote:
> On Wed, Feb 25, 2015 at 04:51:50PM +0000, Dr. David Alan Gilbert (git) wrote:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > 
> > Where postcopy is preceeded by a period of precopy, the destination will
> > have received pages that may have been dirtied on the source after the
> > page was sent.  The destination must throw these pages away before
> > starting it's CPUs.
> > 
> > Maintain a 'sentmap' of pages that have already been sent.
> > Calculate list of sent & dirty pages
> > Provide helpers on the destination side to discard these.
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  arch_init.c                      | 275 ++++++++++++++++++++++++++++++++++++++-
> >  include/migration/migration.h    |  12 ++
> >  include/migration/postcopy-ram.h |  34 +++++
> >  include/qemu/typedefs.h          |   1 +
> >  migration/migration.c            |   1 +
> >  migration/postcopy-ram.c         | 111 ++++++++++++++++
> >  savevm.c                         |   3 -
> >  trace-events                     |   4 +
> >  8 files changed, 435 insertions(+), 6 deletions(-)
> > 
> > diff --git a/arch_init.c b/arch_init.c
> > index 7bc5fa6..21e7ebe 100644
> > --- a/arch_init.c
> > +++ b/arch_init.c
> > @@ -40,6 +40,7 @@
> >  #include "hw/audio/audio.h"
> >  #include "sysemu/kvm.h"
> >  #include "migration/migration.h"
> > +#include "migration/postcopy-ram.h"
> >  #include "hw/i386/smbios.h"
> >  #include "exec/address-spaces.h"
> >  #include "hw/audio/pcspk.h"
> > @@ -414,9 +415,17 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
> >      return bytes_sent;
> >  }
> >  
> > +/* mr: The region to search for dirty pages in
> > + * start: Start address (typically so we can continue from previous page)
> > + * ram_addr_abs: Pointer into which to store the address of the dirty page
> > + *               within the global ram_addr space
> > + *
> > + * Returns: byte offset within memory region of the start of a dirty page
> > + */
> >  static inline
> >  ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
> > -                                                 ram_addr_t start)
> > +                                                 ram_addr_t start,
> > +                                                 ram_addr_t *ram_addr_abs)
> >  {
> >      unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS;
> >      unsigned long nr = base + (start >> TARGET_PAGE_BITS);
> > @@ -435,6 +444,7 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
> >          clear_bit(next, migration_bitmap);
> >          migration_dirty_pages--;
> >      }
> > +    *ram_addr_abs = next << TARGET_PAGE_BITS;
> >      return (next - base) << TARGET_PAGE_BITS;
> >  }
> >  
> > @@ -571,6 +581,19 @@ static void migration_bitmap_sync(void)
> >      }
> >  }
> >  
> > +static RAMBlock *ram_find_block(const char *id)
> > +{
> > +    RAMBlock *block;
> > +
> > +    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> > +        if (!strcmp(id, block->idstr)) {
> > +            return block;
> > +        }
> > +    }
> > +
> > +    return NULL;
> > +}
> > +
> >  /*
> >   * ram_save_page: Send the given page to the stream
> >   *
> > @@ -659,13 +682,16 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
> >      bool complete_round = false;
> >      int bytes_sent = 0;
> >      MemoryRegion *mr;
> > +    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
> > +                                 ram_addr_t space */
> >  
> >      if (!block)
> >          block = QTAILQ_FIRST(&ram_list.blocks);
> >  
> >      while (true) {
> >          mr = block->mr;
> > -        offset = migration_bitmap_find_and_reset_dirty(mr, offset);
> > +        offset = migration_bitmap_find_and_reset_dirty(mr, offset,
> > +                                                       &dirty_ram_abs);
> >          if (complete_round && block == last_seen_block &&
> >              offset >= last_offset) {
> >              break;
> > @@ -683,6 +709,11 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
> >  
> >              /* if page is unmodified, continue to the next */
> >              if (bytes_sent > 0) {
> > +                MigrationState *ms = migrate_get_current();
> > +                if (ms->sentmap) {
> > +                    set_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap);
> > +                }
> > +
> >                  last_sent_block = block;
> >                  break;
> >              }
> > @@ -742,12 +773,19 @@ void free_xbzrle_decoded_buf(void)
> >  
> >  static void migration_end(void)
> >  {
> > +    MigrationState *s = migrate_get_current();
> > +
> >      if (migration_bitmap) {
> >          memory_global_dirty_log_stop();
> >          g_free(migration_bitmap);
> >          migration_bitmap = NULL;
> >      }
> >  
> > +    if (s->sentmap) {
> > +        g_free(s->sentmap);
> > +        s->sentmap = NULL;
> > +    }
> > +
> >      XBZRLE_cache_lock();
> >      if (XBZRLE.cache) {
> >          cache_fini(XBZRLE.cache);
> > @@ -815,6 +853,232 @@ void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
> >      }
> >  }
> >  
> > +/* **** functions for postcopy ***** */
> > +
> > +/*
> > + * A helper to get 32 bits from a bit map; trivial for HOST_LONG_BITS=32
> > + * messier for 64; the bitmaps are actually long's that are 32 or 64bit
> > + */
> > +static uint32_t get_32bits_map(unsigned long *map, int64_t start)
> > +{
> > +#if HOST_LONG_BITS == 64
> > +    uint64_t tmp64;
> > +
> > +    tmp64 = map[start / 64];
> > +    return (start & 32) ? (tmp64 >> 32) : (tmp64 & 0xffffffffu);
> > +#elif HOST_LONG_BITS == 32
> > +    /*
> > +     * Irrespective of host endianness, sentmap[n] is for pages earlier
> > +     * than sentmap[n+1] so we can't just cast up
> > +     */
> > +    return map[start / 32];
> > +#else
> > +#error "Host long other than 64/32 not supported"
> > +#endif
> > +}
> > +
> > +/*
> > + * A helper to put 32 bits into a bit map; trivial for HOST_LONG_BITS=32
> > + * messier for 64; the bitmaps are actually long's that are 32 or 64bit
> > + */
> > +__attribute__ (( unused )) /* Until later in patch series */
> > +static void put_32bits_map(unsigned long *map, int64_t start,
> > +                           uint32_t v)
> > +{
> > +#if HOST_LONG_BITS == 64
> > +    uint64_t tmp64 = v;
> > +    uint64_t mask = 0xffffffffu;
> > +
> > +    if (start & 32) {
> > +        tmp64 = tmp64 << 32;
> > +        mask =  mask << 32;
> > +    }
> > +
> > +    map[start / 64] = (map[start / 64] & ~mask) | tmp64;
> > +#elif HOST_LONG_BITS == 32
> > +    /*
> > +     * Irrespective of host endianness, sentmap[n] is for pages earlier
> > +     * than sentmap[n+1] so we can't just cast up
> > +     */
> > +    map[start / 32] = v;
> > +#else
> > +#error "Host long other than 64/32 not supported"
> > +#endif
> > +}
> > +
> > +/*
> > + * When working on 32bit chunks of a bitmap where the only valid section
> > + * is between start..end (inclusive), generate a mask with only those
> > + * valid bits set for the current 32bit word within that bitmask.
> > + */
> > +static int make_32bit_mask(unsigned long start, unsigned long end,
> > +                           unsigned long cur32)
> > +{
> > +    unsigned long first32, last32;
> > +    uint32_t mask = ~(uint32_t)0;
> > +    first32 = start / 32;
> > +    last32 = end / 32;
> > +
> > +    if ((cur32 == first32) && (start & 31)) {
> > +        /* e.g. (start & 31) = 3
> > +         *         1 << .    -> 2^3
> > +         *         . - 1     -> 2^3 - 1 i.e. mask 2..0
> > +         *         ~.        -> mask 31..3
> > +         */
> > +        mask &= ~((((uint32_t)1) << (start & 31)) - 1);
> > +    }
> > +
> > +    if ((cur32 == last32) && ((end & 31) != 31)) {
> > +        /* e.g. (end & 31) = 3
> > +         *            .   +1 -> 4
> > +         *         1 << .    -> 2^4
> > +         *         . -1      -> 2^4 - 1
> > +         *                   = mask set 3..0
> > +         */
> > +        mask &= (((uint32_t)1) << ((end & 31) + 1)) - 1;
> > +    }
> > +
> > +    return mask;
> > +}
> 
> Urgh.. looks correct, but continue to makes me question if this is a
> sensible wire encoding of the discard map.
> 
> > +
> > +/*
> > + * Callback from ram_postcopy_each_ram_discard for each RAMBlock
> > + * start,end: Indexes into the bitmap for the first and last bit
> > + *            representing the named block
> > + */
> > +static int pc_send_discard_bm_ram(MigrationState *ms,
> > +                                  PostcopyDiscardState *pds,
> > +                                  unsigned long start, unsigned long end)
> 
> I know it's a long name, but I'd prefer to see this called
> "postcopy_".  I have to keep reminding myself that "pc" means
> "postcopy" here, not that it's a PC machine type specific function.

OK, yeh I see that's a clash; done.

> 
> > +{
> > +    /*
> > +     * There is no guarantee that start, end are on convenient 32bit multiples
> > +     * (We always send 32bit chunks over the wire, irrespective of long size)
> > +     */
> > +    unsigned long first32, last32, cur32;
> > +    first32 = start / 32;
> > +    last32 = end / 32;
> > +
> > +    for (cur32 = first32; cur32 <= last32; cur32++) {
> > +        /* Deal with start/end not on alignment */
> > +        uint32_t mask = make_32bit_mask(start, end, cur32);
> > +
> > +        uint32_t data = get_32bits_map(ms->sentmap, cur32 * 32);
> > +        data &= mask;
> > +
> > +        if (data) {
> > +            postcopy_discard_send_chunk(ms, pds, (cur32-first32) * 32, data);
> > +        }
> > +    }
> > +
> > +    return 0;
> > +}
> 
> 
> > +/*
> > + * Utility for the outgoing postcopy code.
> > + *   Calls postcopy_send_discard_bm_ram for each RAMBlock
> > + *   passing it bitmap indexes and name.
> > + * Returns: 0 on success
> > + * (qemu_ram_foreach_block ends up passing unscaled lengths
> > + *  which would mean postcopy code would have to deal with target page)
> > + */
> > +static int pc_each_ram_discard(MigrationState *ms)
> 
> s/discard/send_discard/

Done.

> 
> > +{
> > +    struct RAMBlock *block;
> > +    int ret;
> > +
> > +    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> > +        unsigned long first = block->offset >> TARGET_PAGE_BITS;
> > +        unsigned long last = (block->offset + (block->max_length-1))
> > +                                >> TARGET_PAGE_BITS;
> > +        PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
> > +                                                               first & 31,
> > +                                                               block->idstr);
> > +
> > +        /*
> > +         * Postcopy sends chunks of bitmap over the wire, but it
> > +         * just needs indexes at this point, avoids it having
> > +         * target page specific code.
> > +         */
> > +        ret = pc_send_discard_bm_ram(ms, pds, first, last);
> > +        postcopy_discard_send_finish(ms, pds);
> > +        if (ret) {
> > +            return ret;
> > +        }
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> > +/*
> > + * Transmit the set of pages to be discarded after precopy to the target
> > + * these are pages that have been sent previously but have been dirtied
> > + * Hopefully this is pretty sparse
> > + */
> > +int ram_postcopy_send_discard_bitmap(MigrationState *ms)
> > +{
> > +    /* This should be our last sync, the src is now paused */
> > +    migration_bitmap_sync();
> > +
> > +    /*
> > +     * Update the sentmap to be  sentmap&=dirty
> > +     */
> > +    bitmap_and(ms->sentmap, ms->sentmap, migration_bitmap,
> > +               last_ram_offset() >> TARGET_PAGE_BITS);
> > +
> > +
> > +    trace_ram_postcopy_send_discard_bitmap();
> > +#ifdef DEBUG_POSTCOPY
> > +    ram_debug_dump_bitmap(ms->sentmap, false);
> > +#endif
> > +
> > +    return pc_each_ram_discard(ms);
> > +}
> > +
> > +/*
> > + * At the start of the postcopy phase of migration, any now-dirty
> > + * precopied pages are discarded.
> > + *
> > + * start..end is an inclusive range of bits indexed in the source
> > + *    VMs bitmap for this RAMBlock, source_target_page_bits tells
> > + *    us what one of those bits represents.
> > + *
> > + * start/end are offsets from the start of the bitmap for RAMBlock 'block_name'
> > + *
> > + * Returns 0 on success.
> > + */
> > +int ram_discard_range(MigrationIncomingState *mis,
> > +                      const char *block_name,
> > +                      uint64_t start, uint64_t end)
> > +{
> > +    assert(end >= start);
> > +
> > +    RAMBlock *rb = ram_find_block(block_name);
> > +
> > +    if (!rb) {
> > +        error_report("ram_discard_range: Failed to find block '%s'",
> > +                     block_name);
> > +        return -1;
> > +    }
> > +
> > +    uint64_t index_offset = rb->offset >> TARGET_PAGE_BITS;
> > +    postcopy_pmi_discard_range(mis, start + index_offset, (end - start) + 1);
> > +
> > +    /* +1 gives the byte after the end of the last page to be discarded */
> > +    ram_addr_t end_offset = (end+1) << TARGET_PAGE_BITS;
> > +    uint8_t *host_startaddr = rb->host + (start << TARGET_PAGE_BITS);
> > +    uint8_t *host_endaddr;
> > +
> > +    if (end_offset <= rb->used_length) {
> > +        host_endaddr   = rb->host + (end_offset-1);
> > +        return postcopy_ram_discard_range(mis, host_startaddr, host_endaddr);
> > +    } else {
> > +        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
> > +                     "/%" PRIu64 "/%zu)",
> > +                     block_name, start, end, rb->used_length);
> > +        return -1;
> > +    }
> > +}
> > +
> >  static int ram_save_setup(QEMUFile *f, void *opaque)
> >  {
> >      RAMBlock *block;
> > @@ -854,7 +1118,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
> >  
> >          acct_clear();
> >      }
> > -
> >      qemu_mutex_lock_iothread();
> >      qemu_mutex_lock_ramlist();
> >      bytes_transferred = 0;
> > @@ -864,6 +1127,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
> >      migration_bitmap = bitmap_new(ram_bitmap_pages);
> >      bitmap_set(migration_bitmap, 0, ram_bitmap_pages);
> >  
> > +    if (migrate_postcopy_ram()) {
> > +        MigrationState *s = migrate_get_current();
> > +        s->sentmap = bitmap_new(ram_bitmap_pages);
> > +        bitmap_clear(s->sentmap, 0, ram_bitmap_pages);
> > +    }
> > +
> >      /*
> >       * Count the total number of pages used by ram blocks not including any
> >       * gaps due to alignment or unplugs.
> > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > index 86200b9..e749f4c 100644
> > --- a/include/migration/migration.h
> > +++ b/include/migration/migration.h
> > @@ -125,6 +125,13 @@ struct MigrationState
> >  
> >      /* Flag set once the migration has been asked to enter postcopy */
> >      bool start_postcopy;
> > +
> > +    /* bitmap of pages that have been sent at least once
> > +     * only maintained and used in postcopy at the moment
> > +     * where it's used to send the dirtymap at the start
> > +     * of the postcopy phase
> > +     */
> > +    unsigned long *sentmap;
> >  };
> >  
> >  void process_incoming_migration(QEMUFile *f);
> > @@ -194,6 +201,11 @@ double xbzrle_mig_cache_miss_rate(void);
> >  
> >  void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
> >  void ram_debug_dump_bitmap(unsigned long *todump, bool expected);
> > +/* For outgoing discard bitmap */
> > +int ram_postcopy_send_discard_bitmap(MigrationState *ms);
> > +/* For incoming postcopy discard */
> > +int ram_discard_range(MigrationIncomingState *mis, const char *block_name,
> > +                      uint64_t start, uint64_t end);
> >  
> >  /**
> >   * @migrate_add_blocker - prevent migration from proceeding
> > diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h
> > index e93ee8a..1fec1c1 100644
> > --- a/include/migration/postcopy-ram.h
> > +++ b/include/migration/postcopy-ram.h
> > @@ -28,4 +28,38 @@ void postcopy_pmi_destroy(MigrationIncomingState *mis);
> >  void postcopy_pmi_discard_range(MigrationIncomingState *mis,
> >                                  size_t start, size_t npages);
> >  void postcopy_pmi_dump(MigrationIncomingState *mis);
> > +
> > +/*
> > + * Discard the contents of memory start..end inclusive.
> > + * We can assume that if we've been called postcopy_ram_hosttest returned true
> > + */
> > +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
> > +                               uint8_t *end);
> > +
> > +
> > +/*
> > + * Called at the start of each RAMBlock by the bitmap code
> > + * offset is the bit within the first 32bit chunk of mask
> > + * that represents the first page of the RAM Block
> > + * Returns a new PDS
> > + */
> > +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
> > +                                                 uint8_t offset,
> > +                                                 const char *name);
> > +
> > +/*
> > + * Called by the bitmap code for each chunk to discard
> > + * May send a discard message, may just leave it queued to
> > + * be sent later
> > + */
> > +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds,
> > +                                unsigned long pos, uint32_t bitmap);
> > +
> > +/*
> > + * Called at the end of each RAMBlock by the bitmap code
> > + * Sends any outstanding discard messages, frees the PDS
> > + */
> > +void postcopy_discard_send_finish(MigrationState *ms,
> > +                                  PostcopyDiscardState *pds);
> > +
> >  #endif
> > diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
> > index 924eeb6..0651275 100644
> > --- a/include/qemu/typedefs.h
> > +++ b/include/qemu/typedefs.h
> > @@ -61,6 +61,7 @@ typedef struct PCIExpressHost PCIExpressHost;
> >  typedef struct PCIHostState PCIHostState;
> >  typedef struct PCMCIACardState PCMCIACardState;
> >  typedef struct PixelFormat PixelFormat;
> > +typedef struct PostcopyDiscardState PostcopyDiscardState;
> >  typedef struct PostcopyPMI PostcopyPMI;
> >  typedef struct PropertyInfo PropertyInfo;
> >  typedef struct Property Property;
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 6b20b56..850fe1a 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -22,6 +22,7 @@
> >  #include "block/block.h"
> >  #include "qemu/sockets.h"
> >  #include "migration/block.h"
> > +#include "migration/postcopy-ram.h"
> >  #include "qemu/thread.h"
> >  #include "qmp-commands.h"
> >  #include "trace.h"
> > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
> > index 4f29055..391e9c6 100644
> > --- a/migration/postcopy-ram.c
> > +++ b/migration/postcopy-ram.c
> > @@ -28,6 +28,19 @@
> >  #include "qemu/error-report.h"
> >  #include "trace.h"
> >  
> > +#define MAX_DISCARDS_PER_COMMAND 12
> > +
> > +struct PostcopyDiscardState {
> > +    const char *name;
> > +    uint16_t cur_entry;
> > +    uint64_t addrlist[MAX_DISCARDS_PER_COMMAND];
> > +    uint32_t masklist[MAX_DISCARDS_PER_COMMAND];
> > +    uint8_t  offset;  /* Offset within 32bit mask at addr0 representing 1st
> > +                         page of block */
> > +    unsigned int nsentwords;
> > +    unsigned int nsentcmds;
> > +};
> > +
> >  /* Postcopy needs to detect accesses to pages that haven't yet been copied
> >   * across, and efficiently map new pages in, the techniques for doing this
> >   * are target OS specific.
> > @@ -364,6 +377,21 @@ out:
> >      return ret;
> >  }
> >  
> > +/*
> > + * Discard the contents of memory start..end inclusive.
> > + * We can assume that if we've been called postcopy_ram_hosttest returned true
> > + */
> > +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
> > +                               uint8_t *end)
> > +{
> > +    if (madvise(start, (end-start)+1, MADV_DONTNEED)) {
> > +        perror("postcopy_ram_discard_range MADV_DONTNEED");
> > +        return -1;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >  #else
> >  /* No target OS support, stubs just fail */
> >  
> > @@ -380,5 +408,88 @@ void postcopy_hook_early_receive(MigrationIncomingState *mis,
> >      /* We don't support postcopy so don't care */
> >  }
> >  
> > +void postcopy_pmi_destroy(MigrationIncomingState *mis)
> > +{
> > +    /* Called in normal cleanup path - so it's OK */
> > +}
> > +
> > +void postcopy_pmi_discard_range(MigrationIncomingState *mis,
> > +                                size_t start, size_t npages)
> > +{
> > +    assert(0);
> > +}
> > +
> > +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
> > +                               uint8_t *end)
> > +{
> > +    assert(0);
> > +}
> >  #endif
> >  
> > +/* ------------------------------------------------------------------------- */
> > +
> > +/*
> > + * Called at the start of each RAMBlock by the bitmap code
> > + * offset is the bit within the first 64bit chunk of mask
> > + * that represents the first page of the RAM Block
> > + * Returns a new PDS
> > + */
> > +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
> > +                                                 uint8_t offset,
> > +                                                 const char *name)
> > +{
> > +    PostcopyDiscardState *res = g_try_malloc(sizeof(PostcopyDiscardState));
> > +
> > +    if (res) {
> > +        res->name = name;
> > +        res->cur_entry = 0;
> > +        res->nsentwords = 0;
> > +        res->nsentcmds = 0;
> > +        res->offset = offset;
> > +    }
> > +
> > +    return res;
> > +}
> > +
> > +/*
> > + * Called by the bitmap code for each chunk to discard
> > + * May send a discard message, may just leave it queued to
> > + * be sent later
> > + */
> > +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds,
> > +                                unsigned long pos, uint32_t bitmap)
> > +{
> > +    pds->addrlist[pds->cur_entry] = pos;
> > +    pds->masklist[pds->cur_entry] = bitmap;
> > +    pds->cur_entry++;
> > +    pds->nsentwords++;
> > +
> > +    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
> > +        /* Full set, ship it! */
> > +        qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name,
> > +                                              pds->cur_entry, pds->offset,
> > +                                              pds->addrlist, pds->masklist);
> > +        pds->nsentcmds++;
> > +        pds->cur_entry = 0;
> > +    }
> > +}
> > +
> > +/*
> > + * Called at the end of each RAMBlock by the bitmap code
> > + * Sends any outstanding discard messages, frees the PDS
> > + */
> > +void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
> > +{
> > +    /* Anything unsent? */
> > +    if (pds->cur_entry) {
> > +        qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name,
> > +                                              pds->cur_entry, pds->offset,
> > +                                              pds->addrlist, pds->masklist);
> > +        pds->nsentcmds++;
> > +    }
> > +
> > +    trace_postcopy_discard_send_finish(pds->name, pds->nsentwords,
> > +                                       pds->nsentcmds);
> > +
> > +    g_free(pds);
> > +}
> > diff --git a/savevm.c b/savevm.c
> > index 1e8d289..2589b8c 100644
> > --- a/savevm.c
> > +++ b/savevm.c
> > @@ -1282,15 +1282,12 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
> >               * we know there must be at least 1 bit set due to the loop entry
> >               * If there is no 0 firstzero will be 32
> >               */
> > -            /* TODO - ram_discard_range gets added in a later patch
> >              int ret = ram_discard_range(mis, ramid,
> >                                  startaddr + firstset - first_bit_offset,
> >                                  startaddr + (firstzero - 1) - first_bit_offset);
> > -            ret = -1;
> >              if (ret) {
> >                  return ret;
> >              }
> > -            */
> >  
> >              /* mask= .?0000000000 */
> >              /*         ^fz ^fs    */
> > diff --git a/trace-events b/trace-events
> > index a555b56..f985117 100644
> > --- a/trace-events
> > +++ b/trace-events
> > @@ -1217,6 +1217,7 @@ qemu_file_fclose(void) ""
> >  migration_bitmap_sync_start(void) ""
> >  migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
> >  migration_throttle(void) ""
> > +ram_postcopy_send_discard_bitmap(void) ""
> >  
> >  # hw/display/qxl.c
> >  disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
> > @@ -1478,6 +1479,9 @@ rdma_start_incoming_migration_after_rdma_listen(void) ""
> >  rdma_start_outgoing_migration_after_rdma_connect(void) ""
> >  rdma_start_outgoing_migration_after_rdma_source_init(void) ""
> >  
> > +# migration/postcopy-ram.c
> > +postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands"
> > +
> >  # kvm-all.c
> >  kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"
> >  kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"
> 
> -- 
> David Gibson			| I'll have my music baroque, and my code
> david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
> 				| _way_ _around_!
> http://www.ozlabs.org/~dgibson


--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox

Patch

diff --git a/arch_init.c b/arch_init.c
index 7bc5fa6..21e7ebe 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -40,6 +40,7 @@ 
 #include "hw/audio/audio.h"
 #include "sysemu/kvm.h"
 #include "migration/migration.h"
+#include "migration/postcopy-ram.h"
 #include "hw/i386/smbios.h"
 #include "exec/address-spaces.h"
 #include "hw/audio/pcspk.h"
@@ -414,9 +415,17 @@  static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
     return bytes_sent;
 }
 
+/* mr: The region to search for dirty pages in
+ * start: Start address (typically so we can continue from previous page)
+ * ram_addr_abs: Pointer into which to store the address of the dirty page
+ *               within the global ram_addr space
+ *
+ * Returns: byte offset within memory region of the start of a dirty page
+ */
 static inline
 ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
-                                                 ram_addr_t start)
+                                                 ram_addr_t start,
+                                                 ram_addr_t *ram_addr_abs)
 {
     unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS;
     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
@@ -435,6 +444,7 @@  ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
         clear_bit(next, migration_bitmap);
         migration_dirty_pages--;
     }
+    *ram_addr_abs = next << TARGET_PAGE_BITS;
     return (next - base) << TARGET_PAGE_BITS;
 }
 
@@ -571,6 +581,19 @@  static void migration_bitmap_sync(void)
     }
 }
 
+static RAMBlock *ram_find_block(const char *id)
+{
+    RAMBlock *block;
+
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+        if (!strcmp(id, block->idstr)) {
+            return block;
+        }
+    }
+
+    return NULL;
+}
+
 /*
  * ram_save_page: Send the given page to the stream
  *
@@ -659,13 +682,16 @@  static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
     bool complete_round = false;
     int bytes_sent = 0;
     MemoryRegion *mr;
+    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
+                                 ram_addr_t space */
 
     if (!block)
         block = QTAILQ_FIRST(&ram_list.blocks);
 
     while (true) {
         mr = block->mr;
-        offset = migration_bitmap_find_and_reset_dirty(mr, offset);
+        offset = migration_bitmap_find_and_reset_dirty(mr, offset,
+                                                       &dirty_ram_abs);
         if (complete_round && block == last_seen_block &&
             offset >= last_offset) {
             break;
@@ -683,6 +709,11 @@  static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
 
             /* if page is unmodified, continue to the next */
             if (bytes_sent > 0) {
+                MigrationState *ms = migrate_get_current();
+                if (ms->sentmap) {
+                    set_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap);
+                }
+
                 last_sent_block = block;
                 break;
             }
@@ -742,12 +773,19 @@  void free_xbzrle_decoded_buf(void)
 
 static void migration_end(void)
 {
+    MigrationState *s = migrate_get_current();
+
     if (migration_bitmap) {
         memory_global_dirty_log_stop();
         g_free(migration_bitmap);
         migration_bitmap = NULL;
     }
 
+    if (s->sentmap) {
+        g_free(s->sentmap);
+        s->sentmap = NULL;
+    }
+
     XBZRLE_cache_lock();
     if (XBZRLE.cache) {
         cache_fini(XBZRLE.cache);
@@ -815,6 +853,232 @@  void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
     }
 }
 
+/* **** functions for postcopy ***** */
+
+/*
+ * A helper to get 32 bits from a bit map; trivial for HOST_LONG_BITS=32
+ * messier for 64; the bitmaps are actually long's that are 32 or 64bit
+ */
+static uint32_t get_32bits_map(unsigned long *map, int64_t start)
+{
+#if HOST_LONG_BITS == 64
+    uint64_t tmp64;
+
+    tmp64 = map[start / 64];
+    return (start & 32) ? (tmp64 >> 32) : (tmp64 & 0xffffffffu);
+#elif HOST_LONG_BITS == 32
+    /*
+     * Irrespective of host endianness, sentmap[n] is for pages earlier
+     * than sentmap[n+1] so we can't just cast up
+     */
+    return map[start / 32];
+#else
+#error "Host long other than 64/32 not supported"
+#endif
+}
+
+/*
+ * A helper to put 32 bits into a bit map; trivial for HOST_LONG_BITS=32
+ * messier for 64; the bitmaps are actually long's that are 32 or 64bit
+ */
+__attribute__ (( unused )) /* Until later in patch series */
+static void put_32bits_map(unsigned long *map, int64_t start,
+                           uint32_t v)
+{
+#if HOST_LONG_BITS == 64
+    uint64_t tmp64 = v;
+    uint64_t mask = 0xffffffffu;
+
+    if (start & 32) {
+        tmp64 = tmp64 << 32;
+        mask =  mask << 32;
+    }
+
+    map[start / 64] = (map[start / 64] & ~mask) | tmp64;
+#elif HOST_LONG_BITS == 32
+    /*
+     * Irrespective of host endianness, sentmap[n] is for pages earlier
+     * than sentmap[n+1] so we can't just cast up
+     */
+    map[start / 32] = v;
+#else
+#error "Host long other than 64/32 not supported"
+#endif
+}
+
+/*
+ * When working on 32bit chunks of a bitmap where the only valid section
+ * is between start..end (inclusive), generate a mask with only those
+ * valid bits set for the current 32bit word within that bitmask.
+ */
+static int make_32bit_mask(unsigned long start, unsigned long end,
+                           unsigned long cur32)
+{
+    unsigned long first32, last32;
+    uint32_t mask = ~(uint32_t)0;
+    first32 = start / 32;
+    last32 = end / 32;
+
+    if ((cur32 == first32) && (start & 31)) {
+        /* e.g. (start & 31) = 3
+         *         1 << .    -> 2^3
+         *         . - 1     -> 2^3 - 1 i.e. mask 2..0
+         *         ~.        -> mask 31..3
+         */
+        mask &= ~((((uint32_t)1) << (start & 31)) - 1);
+    }
+
+    if ((cur32 == last32) && ((end & 31) != 31)) {
+        /* e.g. (end & 31) = 3
+         *            .   +1 -> 4
+         *         1 << .    -> 2^4
+         *         . -1      -> 2^4 - 1
+         *                   = mask set 3..0
+         */
+        mask &= (((uint32_t)1) << ((end & 31) + 1)) - 1;
+    }
+
+    return mask;
+}
+
+/*
+ * Callback from pc_each_ram_discard for each RAMBlock
+ * start,end: Indexes into the bitmap for the first and last bit
+ *            representing the named block
+ */
+static int pc_send_discard_bm_ram(MigrationState *ms,
+                                  PostcopyDiscardState *pds,
+                                  unsigned long start, unsigned long end)
+{
+    /*
+     * There is no guarantee that start, end are on convenient 32bit multiples
+     * (We always send 32bit chunks over the wire, irrespective of long size)
+     */
+    unsigned long first32, last32, cur32;
+    first32 = start / 32;
+    last32 = end / 32;
+
+    for (cur32 = first32; cur32 <= last32; cur32++) {
+        /* Deal with start/end not on alignment */
+        uint32_t mask = make_32bit_mask(start, end, cur32);
+
+        uint32_t data = get_32bits_map(ms->sentmap, cur32 * 32);
+        data &= mask;
+
+        if (data) {
+            postcopy_discard_send_chunk(ms, pds, (cur32-first32) * 32, data);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Utility for the outgoing postcopy code.
+ *   Calls pc_send_discard_bm_ram for each RAMBlock
+ *   passing it bitmap indexes and name.
+ * Returns: 0 on success
+ * (qemu_ram_foreach_block ends up passing unscaled lengths
+ *  which would mean postcopy code would have to deal with target page)
+ */
+static int pc_each_ram_discard(MigrationState *ms)
+{
+    struct RAMBlock *block;
+    int ret;
+
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+        unsigned long first = block->offset >> TARGET_PAGE_BITS;
+        unsigned long last = (block->offset + (block->max_length-1))
+                                >> TARGET_PAGE_BITS;
+        PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
+                                                               first & 31,
+                                                               block->idstr);
+
+        /*
+         * Postcopy sends chunks of bitmap over the wire, but it
+         * just needs indexes at this point, avoids it having
+         * target page specific code.
+         */
+        ret = pc_send_discard_bm_ram(ms, pds, first, last);
+        postcopy_discard_send_finish(ms, pds);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Transmit the set of pages to be discarded after precopy to the target;
+ * these are pages that have been sent previously but have been dirtied.
+ * Hopefully this is pretty sparse.
+ */
+int ram_postcopy_send_discard_bitmap(MigrationState *ms)
+{
+    /* This should be our last sync, the src is now paused */
+    migration_bitmap_sync();
+
+    /*
+     * Update the sentmap to be  sentmap&=dirty
+     */
+    bitmap_and(ms->sentmap, ms->sentmap, migration_bitmap,
+               last_ram_offset() >> TARGET_PAGE_BITS);
+
+
+    trace_ram_postcopy_send_discard_bitmap();
+#ifdef DEBUG_POSTCOPY
+    ram_debug_dump_bitmap(ms->sentmap, false);
+#endif
+
+    return pc_each_ram_discard(ms);
+}
+
+/*
+ * At the start of the postcopy phase of migration, any now-dirty
+ * precopied pages are discarded.
+ *
+ * start..end is an inclusive range of bits indexed in the source
+ *    VMs bitmap for this RAMBlock, source_target_page_bits tells
+ *    us what one of those bits represents.
+ *
+ * start/end are offsets from the start of the bitmap for RAMBlock 'block_name'
+ *
+ * Returns 0 on success.
+ */
+int ram_discard_range(MigrationIncomingState *mis,
+                      const char *block_name,
+                      uint64_t start, uint64_t end)
+{
+    assert(end >= start);
+
+    RAMBlock *rb = ram_find_block(block_name);
+
+    if (!rb) {
+        error_report("ram_discard_range: Failed to find block '%s'",
+                     block_name);
+        return -1;
+    }
+
+    uint64_t index_offset = rb->offset >> TARGET_PAGE_BITS;
+    postcopy_pmi_discard_range(mis, start + index_offset, (end - start) + 1);
+
+    /* +1 gives the byte after the end of the last page to be discarded */
+    ram_addr_t end_offset = (end+1) << TARGET_PAGE_BITS;
+    uint8_t *host_startaddr = rb->host + (start << TARGET_PAGE_BITS);
+    uint8_t *host_endaddr;
+
+    if (end_offset <= rb->used_length) {
+        host_endaddr   = rb->host + (end_offset-1);
+        return postcopy_ram_discard_range(mis, host_startaddr, host_endaddr);
+    } else {
+        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
+                     "/%" PRIu64 "/%zu)",
+                     block_name, start, end, rb->used_length);
+        return -1;
+    }
+}
+
 static int ram_save_setup(QEMUFile *f, void *opaque)
 {
     RAMBlock *block;
@@ -854,7 +1118,6 @@  static int ram_save_setup(QEMUFile *f, void *opaque)
 
         acct_clear();
     }
-
     qemu_mutex_lock_iothread();
     qemu_mutex_lock_ramlist();
     bytes_transferred = 0;
@@ -864,6 +1127,12 @@  static int ram_save_setup(QEMUFile *f, void *opaque)
     migration_bitmap = bitmap_new(ram_bitmap_pages);
     bitmap_set(migration_bitmap, 0, ram_bitmap_pages);
 
+    if (migrate_postcopy_ram()) {
+        MigrationState *s = migrate_get_current();
+        s->sentmap = bitmap_new(ram_bitmap_pages);
+        bitmap_clear(s->sentmap, 0, ram_bitmap_pages);
+    }
+
     /*
      * Count the total number of pages used by ram blocks not including any
      * gaps due to alignment or unplugs.
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 86200b9..e749f4c 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -125,6 +125,13 @@  struct MigrationState
 
     /* Flag set once the migration has been asked to enter postcopy */
     bool start_postcopy;
+
+    /* bitmap of pages that have been sent at least once
+     * only maintained and used in postcopy at the moment
+     * where it's used to send the dirtymap at the start
+     * of the postcopy phase
+     */
+    unsigned long *sentmap;
 };
 
 void process_incoming_migration(QEMUFile *f);
@@ -194,6 +201,11 @@  double xbzrle_mig_cache_miss_rate(void);
 
 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
 void ram_debug_dump_bitmap(unsigned long *todump, bool expected);
+/* For outgoing discard bitmap */
+int ram_postcopy_send_discard_bitmap(MigrationState *ms);
+/* For incoming postcopy discard */
+int ram_discard_range(MigrationIncomingState *mis, const char *block_name,
+                      uint64_t start, uint64_t end);
 
 /**
  * @migrate_add_blocker - prevent migration from proceeding
diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h
index e93ee8a..1fec1c1 100644
--- a/include/migration/postcopy-ram.h
+++ b/include/migration/postcopy-ram.h
@@ -28,4 +28,38 @@  void postcopy_pmi_destroy(MigrationIncomingState *mis);
 void postcopy_pmi_discard_range(MigrationIncomingState *mis,
                                 size_t start, size_t npages);
 void postcopy_pmi_dump(MigrationIncomingState *mis);
+
+/*
+ * Discard the contents of memory start..end inclusive.
+ * We can assume that, since we've been called, postcopy_ram_hosttest returned true
+ */
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               uint8_t *end);
+
+
+/*
+ * Called at the start of each RAMBlock by the bitmap code
+ * offset is the bit within the first 32bit chunk of mask
+ * that represents the first page of the RAM Block
+ * Returns a new PDS
+ */
+PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
+                                                 uint8_t offset,
+                                                 const char *name);
+
+/*
+ * Called by the bitmap code for each chunk to discard
+ * May send a discard message, may just leave it queued to
+ * be sent later
+ */
+void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds,
+                                unsigned long pos, uint32_t bitmap);
+
+/*
+ * Called at the end of each RAMBlock by the bitmap code
+ * Sends any outstanding discard messages, frees the PDS
+ */
+void postcopy_discard_send_finish(MigrationState *ms,
+                                  PostcopyDiscardState *pds);
+
 #endif
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index 924eeb6..0651275 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -61,6 +61,7 @@  typedef struct PCIExpressHost PCIExpressHost;
 typedef struct PCIHostState PCIHostState;
 typedef struct PCMCIACardState PCMCIACardState;
 typedef struct PixelFormat PixelFormat;
+typedef struct PostcopyDiscardState PostcopyDiscardState;
 typedef struct PostcopyPMI PostcopyPMI;
 typedef struct PropertyInfo PropertyInfo;
 typedef struct Property Property;
diff --git a/migration/migration.c b/migration/migration.c
index 6b20b56..850fe1a 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -22,6 +22,7 @@ 
 #include "block/block.h"
 #include "qemu/sockets.h"
 #include "migration/block.h"
+#include "migration/postcopy-ram.h"
 #include "qemu/thread.h"
 #include "qmp-commands.h"
 #include "trace.h"
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 4f29055..391e9c6 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -28,6 +28,19 @@ 
 #include "qemu/error-report.h"
 #include "trace.h"
 
+#define MAX_DISCARDS_PER_COMMAND 12
+
+struct PostcopyDiscardState {
+    const char *name;
+    uint16_t cur_entry;
+    uint64_t addrlist[MAX_DISCARDS_PER_COMMAND];
+    uint32_t masklist[MAX_DISCARDS_PER_COMMAND];
+    uint8_t  offset;  /* Offset within 32bit mask at addr0 representing 1st
+                         page of block */
+    unsigned int nsentwords;
+    unsigned int nsentcmds;
+};
+
 /* Postcopy needs to detect accesses to pages that haven't yet been copied
  * across, and efficiently map new pages in, the techniques for doing this
  * are target OS specific.
@@ -364,6 +377,21 @@  out:
     return ret;
 }
 
+/*
+ * Discard the contents of memory start..end inclusive.
+ * We can assume that, since we've been called, postcopy_ram_hosttest returned true
+ */
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               uint8_t *end)
+{
+    if (madvise(start, (end-start)+1, MADV_DONTNEED)) {
+        perror("postcopy_ram_discard_range MADV_DONTNEED");
+        return -1;
+    }
+
+    return 0;
+}
+
 #else
 /* No target OS support, stubs just fail */
 
@@ -380,5 +408,88 @@  void postcopy_hook_early_receive(MigrationIncomingState *mis,
     /* We don't support postcopy so don't care */
 }
 
+void postcopy_pmi_destroy(MigrationIncomingState *mis)
+{
+    /* Called in normal cleanup path - so it's OK */
+}
+
+void postcopy_pmi_discard_range(MigrationIncomingState *mis,
+                                size_t start, size_t npages)
+{
+    assert(0);
+}
+
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               uint8_t *end)
+{
+    assert(0);
+}
 #endif
 
+/* ------------------------------------------------------------------------- */
+
+/*
+ * Called at the start of each RAMBlock by the bitmap code
+ * offset is the bit within the first 32bit chunk of mask
+ * that represents the first page of the RAM Block
+ * Returns a new PDS
+ */
+PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
+                                                 uint8_t offset,
+                                                 const char *name)
+{
+    PostcopyDiscardState *res = g_try_malloc(sizeof(PostcopyDiscardState));
+
+    if (res) {
+        res->name = name;
+        res->cur_entry = 0;
+        res->nsentwords = 0;
+        res->nsentcmds = 0;
+        res->offset = offset;
+    }
+
+    return res;
+}
+
+/*
+ * Called by the bitmap code for each chunk to discard
+ * May send a discard message, may just leave it queued to
+ * be sent later
+ */
+void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds,
+                                unsigned long pos, uint32_t bitmap)
+{
+    pds->addrlist[pds->cur_entry] = pos;
+    pds->masklist[pds->cur_entry] = bitmap;
+    pds->cur_entry++;
+    pds->nsentwords++;
+
+    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
+        /* Full set, ship it! */
+        qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name,
+                                              pds->cur_entry, pds->offset,
+                                              pds->addrlist, pds->masklist);
+        pds->nsentcmds++;
+        pds->cur_entry = 0;
+    }
+}
+
+/*
+ * Called at the end of each RAMBlock by the bitmap code
+ * Sends any outstanding discard messages, frees the PDS
+ */
+void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
+{
+    /* Anything unsent? */
+    if (pds->cur_entry) {
+        qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name,
+                                              pds->cur_entry, pds->offset,
+                                              pds->addrlist, pds->masklist);
+        pds->nsentcmds++;
+    }
+
+    trace_postcopy_discard_send_finish(pds->name, pds->nsentwords,
+                                       pds->nsentcmds);
+
+    g_free(pds);
+}
diff --git a/savevm.c b/savevm.c
index 1e8d289..2589b8c 100644
--- a/savevm.c
+++ b/savevm.c
@@ -1282,15 +1282,12 @@  static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
              * we know there must be at least 1 bit set due to the loop entry
              * If there is no 0 firstzero will be 32
              */
-            /* TODO - ram_discard_range gets added in a later patch
             int ret = ram_discard_range(mis, ramid,
                                 startaddr + firstset - first_bit_offset,
                                 startaddr + (firstzero - 1) - first_bit_offset);
-            ret = -1;
             if (ret) {
                 return ret;
             }
-            */
 
             /* mask= .?0000000000 */
             /*         ^fz ^fs    */
diff --git a/trace-events b/trace-events
index a555b56..f985117 100644
--- a/trace-events
+++ b/trace-events
@@ -1217,6 +1217,7 @@  qemu_file_fclose(void) ""
 migration_bitmap_sync_start(void) ""
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
 migration_throttle(void) ""
+ram_postcopy_send_discard_bitmap(void) ""
 
 # hw/display/qxl.c
 disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
@@ -1478,6 +1479,9 @@  rdma_start_incoming_migration_after_rdma_listen(void) ""
 rdma_start_outgoing_migration_after_rdma_connect(void) ""
 rdma_start_outgoing_migration_after_rdma_source_init(void) ""
 
+# migration/postcopy-ram.c
+postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands"
+
 # kvm-all.c
 kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"
 kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"