Message ID | 1424883128-9841-28-git-send-email-dgilbert@redhat.com |
---|---|
State | New |
Headers | show |
On Wed, Feb 25, 2015 at 04:51:50PM +0000, Dr. David Alan Gilbert (git) wrote: > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com> > > Where postcopy is preceeded by a period of precopy, the destination will > have received pages that may have been dirtied on the source after the > page was sent. The destination must throw these pages away before > starting it's CPUs. > > Maintain a 'sentmap' of pages that have already been sent. > Calculate list of sent & dirty pages > Provide helpers on the destination side to discard these. > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > --- > arch_init.c | 275 ++++++++++++++++++++++++++++++++++++++- > include/migration/migration.h | 12 ++ > include/migration/postcopy-ram.h | 34 +++++ > include/qemu/typedefs.h | 1 + > migration/migration.c | 1 + > migration/postcopy-ram.c | 111 ++++++++++++++++ > savevm.c | 3 - > trace-events | 4 + > 8 files changed, 435 insertions(+), 6 deletions(-) > > diff --git a/arch_init.c b/arch_init.c > index 7bc5fa6..21e7ebe 100644 > --- a/arch_init.c > +++ b/arch_init.c > @@ -40,6 +40,7 @@ > #include "hw/audio/audio.h" > #include "sysemu/kvm.h" > #include "migration/migration.h" > +#include "migration/postcopy-ram.h" > #include "hw/i386/smbios.h" > #include "exec/address-spaces.h" > #include "hw/audio/pcspk.h" > @@ -414,9 +415,17 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, > return bytes_sent; > } > > +/* mr: The region to search for dirty pages in > + * start: Start address (typically so we can continue from previous page) > + * ram_addr_abs: Pointer into which to store the address of the dirty page > + * within the global ram_addr space > + * > + * Returns: byte offset within memory region of the start of a dirty page > + */ > static inline > ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, > - ram_addr_t start) > + ram_addr_t start, > + ram_addr_t *ram_addr_abs) > { > unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS; > unsigned long 
nr = base + (start >> TARGET_PAGE_BITS); > @@ -435,6 +444,7 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, > clear_bit(next, migration_bitmap); > migration_dirty_pages--; > } > + *ram_addr_abs = next << TARGET_PAGE_BITS; > return (next - base) << TARGET_PAGE_BITS; > } > > @@ -571,6 +581,19 @@ static void migration_bitmap_sync(void) > } > } > > +static RAMBlock *ram_find_block(const char *id) > +{ > + RAMBlock *block; > + > + QTAILQ_FOREACH(block, &ram_list.blocks, next) { > + if (!strcmp(id, block->idstr)) { > + return block; > + } > + } > + > + return NULL; > +} > + > /* > * ram_save_page: Send the given page to the stream > * > @@ -659,13 +682,16 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage) > bool complete_round = false; > int bytes_sent = 0; > MemoryRegion *mr; > + ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in > + ram_addr_t space */ > > if (!block) > block = QTAILQ_FIRST(&ram_list.blocks); > > while (true) { > mr = block->mr; > - offset = migration_bitmap_find_and_reset_dirty(mr, offset); > + offset = migration_bitmap_find_and_reset_dirty(mr, offset, > + &dirty_ram_abs); > if (complete_round && block == last_seen_block && > offset >= last_offset) { > break; > @@ -683,6 +709,11 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage) > > /* if page is unmodified, continue to the next */ > if (bytes_sent > 0) { > + MigrationState *ms = migrate_get_current(); > + if (ms->sentmap) { > + set_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap); > + } > + > last_sent_block = block; > break; > } > @@ -742,12 +773,19 @@ void free_xbzrle_decoded_buf(void) > > static void migration_end(void) > { > + MigrationState *s = migrate_get_current(); > + > if (migration_bitmap) { > memory_global_dirty_log_stop(); > g_free(migration_bitmap); > migration_bitmap = NULL; > } > > + if (s->sentmap) { > + g_free(s->sentmap); > + s->sentmap = NULL; > + } > + > XBZRLE_cache_lock(); > if (XBZRLE.cache) { > 
cache_fini(XBZRLE.cache); > @@ -815,6 +853,232 @@ void ram_debug_dump_bitmap(unsigned long *todump, bool expected) > } > } > > +/* **** functions for postcopy ***** */ > + > +/* > + * A helper to get 32 bits from a bit map; trivial for HOST_LONG_BITS=32 > + * messier for 64; the bitmaps are actually long's that are 32 or 64bit > + */ > +static uint32_t get_32bits_map(unsigned long *map, int64_t start) > +{ > +#if HOST_LONG_BITS == 64 > + uint64_t tmp64; > + > + tmp64 = map[start / 64]; > + return (start & 32) ? (tmp64 >> 32) : (tmp64 & 0xffffffffu); > +#elif HOST_LONG_BITS == 32 > + /* > + * Irrespective of host endianness, sentmap[n] is for pages earlier > + * than sentmap[n+1] so we can't just cast up > + */ > + return map[start / 32]; > +#else > +#error "Host long other than 64/32 not supported" > +#endif > +} > + > +/* > + * A helper to put 32 bits into a bit map; trivial for HOST_LONG_BITS=32 > + * messier for 64; the bitmaps are actually long's that are 32 or 64bit > + */ > +__attribute__ (( unused )) /* Until later in patch series */ > +static void put_32bits_map(unsigned long *map, int64_t start, > + uint32_t v) > +{ > +#if HOST_LONG_BITS == 64 > + uint64_t tmp64 = v; > + uint64_t mask = 0xffffffffu; > + > + if (start & 32) { > + tmp64 = tmp64 << 32; > + mask = mask << 32; > + } > + > + map[start / 64] = (map[start / 64] & ~mask) | tmp64; > +#elif HOST_LONG_BITS == 32 > + /* > + * Irrespective of host endianness, sentmap[n] is for pages earlier > + * than sentmap[n+1] so we can't just cast up > + */ > + map[start / 32] = v; > +#else > +#error "Host long other than 64/32 not supported" > +#endif > +} > + > +/* > + * When working on 32bit chunks of a bitmap where the only valid section > + * is between start..end (inclusive), generate a mask with only those > + * valid bits set for the current 32bit word within that bitmask. 
> + */ > +static int make_32bit_mask(unsigned long start, unsigned long end, > + unsigned long cur32) > +{ > + unsigned long first32, last32; > + uint32_t mask = ~(uint32_t)0; > + first32 = start / 32; > + last32 = end / 32; > + > + if ((cur32 == first32) && (start & 31)) { > + /* e.g. (start & 31) = 3 > + * 1 << . -> 2^3 > + * . - 1 -> 2^3 - 1 i.e. mask 2..0 > + * ~. -> mask 31..3 > + */ > + mask &= ~((((uint32_t)1) << (start & 31)) - 1); > + } > + > + if ((cur32 == last32) && ((end & 31) != 31)) { > + /* e.g. (end & 31) = 3 > + * . +1 -> 4 > + * 1 << . -> 2^4 > + * . -1 -> 2^4 - 1 > + * = mask set 3..0 > + */ > + mask &= (((uint32_t)1) << ((end & 31) + 1)) - 1; > + } > + > + return mask; > +} Urgh.. looks correct, but continues to make me question if this is a sensible wire encoding of the discard map. > + > +/* > + * Callback from ram_postcopy_each_ram_discard for each RAMBlock > + * start,end: Indexes into the bitmap for the first and last bit > + * representing the named block > + */ > +static int pc_send_discard_bm_ram(MigrationState *ms, > + PostcopyDiscardState *pds, > + unsigned long start, unsigned long end) I know it's a long name, but I'd prefer to see this called "postcopy_". I have to keep reminding myself that "pc" means "postcopy" here, not that it's a PC machine type specific function. 
> +{ > + /* > + * There is no guarantee that start, end are on convenient 32bit multiples > + * (We always send 32bit chunks over the wire, irrespective of long size) > + */ > + unsigned long first32, last32, cur32; > + first32 = start / 32; > + last32 = end / 32; > + > + for (cur32 = first32; cur32 <= last32; cur32++) { > + /* Deal with start/end not on alignment */ > + uint32_t mask = make_32bit_mask(start, end, cur32); > + > + uint32_t data = get_32bits_map(ms->sentmap, cur32 * 32); > + data &= mask; > + > + if (data) { > + postcopy_discard_send_chunk(ms, pds, (cur32-first32) * 32, data); > + } > + } > + > + return 0; > +} > +/* > + * Utility for the outgoing postcopy code. > + * Calls postcopy_send_discard_bm_ram for each RAMBlock > + * passing it bitmap indexes and name. > + * Returns: 0 on success > + * (qemu_ram_foreach_block ends up passing unscaled lengths > + * which would mean postcopy code would have to deal with target page) > + */ > +static int pc_each_ram_discard(MigrationState *ms) s/discard/send_discard/ > +{ > + struct RAMBlock *block; > + int ret; > + > + QTAILQ_FOREACH(block, &ram_list.blocks, next) { > + unsigned long first = block->offset >> TARGET_PAGE_BITS; > + unsigned long last = (block->offset + (block->max_length-1)) > + >> TARGET_PAGE_BITS; > + PostcopyDiscardState *pds = postcopy_discard_send_init(ms, > + first & 31, > + block->idstr); > + > + /* > + * Postcopy sends chunks of bitmap over the wire, but it > + * just needs indexes at this point, avoids it having > + * target page specific code. 
> + */ > + ret = pc_send_discard_bm_ram(ms, pds, first, last); > + postcopy_discard_send_finish(ms, pds); > + if (ret) { > + return ret; > + } > + } > + > + return 0; > +} > + > +/* > + * Transmit the set of pages to be discarded after precopy to the target > + * these are pages that have been sent previously but have been dirtied > + * Hopefully this is pretty sparse > + */ > +int ram_postcopy_send_discard_bitmap(MigrationState *ms) > +{ > + /* This should be our last sync, the src is now paused */ > + migration_bitmap_sync(); > + > + /* > + * Update the sentmap to be sentmap&=dirty > + */ > + bitmap_and(ms->sentmap, ms->sentmap, migration_bitmap, > + last_ram_offset() >> TARGET_PAGE_BITS); > + > + > + trace_ram_postcopy_send_discard_bitmap(); > +#ifdef DEBUG_POSTCOPY > + ram_debug_dump_bitmap(ms->sentmap, false); > +#endif > + > + return pc_each_ram_discard(ms); > +} > + > +/* > + * At the start of the postcopy phase of migration, any now-dirty > + * precopied pages are discarded. > + * > + * start..end is an inclusive range of bits indexed in the source > + * VMs bitmap for this RAMBlock, source_target_page_bits tells > + * us what one of those bits represents. > + * > + * start/end are offsets from the start of the bitmap for RAMBlock 'block_name' > + * > + * Returns 0 on success. 
> + */ > +int ram_discard_range(MigrationIncomingState *mis, > + const char *block_name, > + uint64_t start, uint64_t end) > +{ > + assert(end >= start); > + > + RAMBlock *rb = ram_find_block(block_name); > + > + if (!rb) { > + error_report("ram_discard_range: Failed to find block '%s'", > + block_name); > + return -1; > + } > + > + uint64_t index_offset = rb->offset >> TARGET_PAGE_BITS; > + postcopy_pmi_discard_range(mis, start + index_offset, (end - start) + 1); > + > + /* +1 gives the byte after the end of the last page to be discarded */ > + ram_addr_t end_offset = (end+1) << TARGET_PAGE_BITS; > + uint8_t *host_startaddr = rb->host + (start << TARGET_PAGE_BITS); > + uint8_t *host_endaddr; > + > + if (end_offset <= rb->used_length) { > + host_endaddr = rb->host + (end_offset-1); > + return postcopy_ram_discard_range(mis, host_startaddr, host_endaddr); > + } else { > + error_report("ram_discard_range: Overrun block '%s' (%" PRIu64 > + "/%" PRIu64 "/%zu)", > + block_name, start, end, rb->used_length); > + return -1; > + } > +} > + > static int ram_save_setup(QEMUFile *f, void *opaque) > { > RAMBlock *block; > @@ -854,7 +1118,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque) > > acct_clear(); > } > - > qemu_mutex_lock_iothread(); > qemu_mutex_lock_ramlist(); > bytes_transferred = 0; > @@ -864,6 +1127,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque) > migration_bitmap = bitmap_new(ram_bitmap_pages); > bitmap_set(migration_bitmap, 0, ram_bitmap_pages); > > + if (migrate_postcopy_ram()) { > + MigrationState *s = migrate_get_current(); > + s->sentmap = bitmap_new(ram_bitmap_pages); > + bitmap_clear(s->sentmap, 0, ram_bitmap_pages); > + } > + > /* > * Count the total number of pages used by ram blocks not including any > * gaps due to alignment or unplugs. 
> diff --git a/include/migration/migration.h b/include/migration/migration.h > index 86200b9..e749f4c 100644 > --- a/include/migration/migration.h > +++ b/include/migration/migration.h > @@ -125,6 +125,13 @@ struct MigrationState > > /* Flag set once the migration has been asked to enter postcopy */ > bool start_postcopy; > + > + /* bitmap of pages that have been sent at least once > + * only maintained and used in postcopy at the moment > + * where it's used to send the dirtymap at the start > + * of the postcopy phase > + */ > + unsigned long *sentmap; > }; > > void process_incoming_migration(QEMUFile *f); > @@ -194,6 +201,11 @@ double xbzrle_mig_cache_miss_rate(void); > > void ram_handle_compressed(void *host, uint8_t ch, uint64_t size); > void ram_debug_dump_bitmap(unsigned long *todump, bool expected); > +/* For outgoing discard bitmap */ > +int ram_postcopy_send_discard_bitmap(MigrationState *ms); > +/* For incoming postcopy discard */ > +int ram_discard_range(MigrationIncomingState *mis, const char *block_name, > + uint64_t start, uint64_t end); > > /** > * @migrate_add_blocker - prevent migration from proceeding > diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h > index e93ee8a..1fec1c1 100644 > --- a/include/migration/postcopy-ram.h > +++ b/include/migration/postcopy-ram.h > @@ -28,4 +28,38 @@ void postcopy_pmi_destroy(MigrationIncomingState *mis); > void postcopy_pmi_discard_range(MigrationIncomingState *mis, > size_t start, size_t npages); > void postcopy_pmi_dump(MigrationIncomingState *mis); > + > +/* > + * Discard the contents of memory start..end inclusive. 
> + * We can assume that if we've been called postcopy_ram_hosttest returned true > + */ > +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, > + uint8_t *end); > + > + > +/* > + * Called at the start of each RAMBlock by the bitmap code > + * offset is the bit within the first 32bit chunk of mask > + * that represents the first page of the RAM Block > + * Returns a new PDS > + */ > +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms, > + uint8_t offset, > + const char *name); > + > +/* > + * Called by the bitmap code for each chunk to discard > + * May send a discard message, may just leave it queued to > + * be sent later > + */ > +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds, > + unsigned long pos, uint32_t bitmap); > + > +/* > + * Called at the end of each RAMBlock by the bitmap code > + * Sends any outstanding discard messages, frees the PDS > + */ > +void postcopy_discard_send_finish(MigrationState *ms, > + PostcopyDiscardState *pds); > + > #endif > diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h > index 924eeb6..0651275 100644 > --- a/include/qemu/typedefs.h > +++ b/include/qemu/typedefs.h > @@ -61,6 +61,7 @@ typedef struct PCIExpressHost PCIExpressHost; > typedef struct PCIHostState PCIHostState; > typedef struct PCMCIACardState PCMCIACardState; > typedef struct PixelFormat PixelFormat; > +typedef struct PostcopyDiscardState PostcopyDiscardState; > typedef struct PostcopyPMI PostcopyPMI; > typedef struct PropertyInfo PropertyInfo; > typedef struct Property Property; > diff --git a/migration/migration.c b/migration/migration.c > index 6b20b56..850fe1a 100644 > --- a/migration/migration.c > +++ b/migration/migration.c > @@ -22,6 +22,7 @@ > #include "block/block.h" > #include "qemu/sockets.h" > #include "migration/block.h" > +#include "migration/postcopy-ram.h" > #include "qemu/thread.h" > #include "qmp-commands.h" > #include "trace.h" > diff --git 
a/migration/postcopy-ram.c b/migration/postcopy-ram.c > index 4f29055..391e9c6 100644 > --- a/migration/postcopy-ram.c > +++ b/migration/postcopy-ram.c > @@ -28,6 +28,19 @@ > #include "qemu/error-report.h" > #include "trace.h" > > +#define MAX_DISCARDS_PER_COMMAND 12 > + > +struct PostcopyDiscardState { > + const char *name; > + uint16_t cur_entry; > + uint64_t addrlist[MAX_DISCARDS_PER_COMMAND]; > + uint32_t masklist[MAX_DISCARDS_PER_COMMAND]; > + uint8_t offset; /* Offset within 32bit mask at addr0 representing 1st > + page of block */ > + unsigned int nsentwords; > + unsigned int nsentcmds; > +}; > + > /* Postcopy needs to detect accesses to pages that haven't yet been copied > * across, and efficiently map new pages in, the techniques for doing this > * are target OS specific. > @@ -364,6 +377,21 @@ out: > return ret; > } > > +/* > + * Discard the contents of memory start..end inclusive. > + * We can assume that if we've been called postcopy_ram_hosttest returned true > + */ > +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, > + uint8_t *end) > +{ > + if (madvise(start, (end-start)+1, MADV_DONTNEED)) { > + perror("postcopy_ram_discard_range MADV_DONTNEED"); > + return -1; > + } > + > + return 0; > +} > + > #else > /* No target OS support, stubs just fail */ > > @@ -380,5 +408,88 @@ void postcopy_hook_early_receive(MigrationIncomingState *mis, > /* We don't support postcopy so don't care */ > } > > +void postcopy_pmi_destroy(MigrationIncomingState *mis) > +{ > + /* Called in normal cleanup path - so it's OK */ > +} > + > +void postcopy_pmi_discard_range(MigrationIncomingState *mis, > + size_t start, size_t npages) > +{ > + assert(0); > +} > + > +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, > + uint8_t *end) > +{ > + assert(0); > +} > #endif > > +/* ------------------------------------------------------------------------- */ > + > +/* > + * Called at the start of each RAMBlock by the bitmap code > + * 
offset is the bit within the first 64bit chunk of mask > + * that represents the first page of the RAM Block > + * Returns a new PDS > + */ > +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms, > + uint8_t offset, > + const char *name) > +{ > + PostcopyDiscardState *res = g_try_malloc(sizeof(PostcopyDiscardState)); > + > + if (res) { > + res->name = name; > + res->cur_entry = 0; > + res->nsentwords = 0; > + res->nsentcmds = 0; > + res->offset = offset; > + } > + > + return res; > +} > + > +/* > + * Called by the bitmap code for each chunk to discard > + * May send a discard message, may just leave it queued to > + * be sent later > + */ > +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds, > + unsigned long pos, uint32_t bitmap) > +{ > + pds->addrlist[pds->cur_entry] = pos; > + pds->masklist[pds->cur_entry] = bitmap; > + pds->cur_entry++; > + pds->nsentwords++; > + > + if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) { > + /* Full set, ship it! */ > + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name, > + pds->cur_entry, pds->offset, > + pds->addrlist, pds->masklist); > + pds->nsentcmds++; > + pds->cur_entry = 0; > + } > +} > + > +/* > + * Called at the end of each RAMBlock by the bitmap code > + * Sends any outstanding discard messages, frees the PDS > + */ > +void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds) > +{ > + /* Anything unsent? 
*/ > + if (pds->cur_entry) { > + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name, > + pds->cur_entry, pds->offset, > + pds->addrlist, pds->masklist); > + pds->nsentcmds++; > + } > + > + trace_postcopy_discard_send_finish(pds->name, pds->nsentwords, > + pds->nsentcmds); > + > + g_free(pds); > +} > diff --git a/savevm.c b/savevm.c > index 1e8d289..2589b8c 100644 > --- a/savevm.c > +++ b/savevm.c > @@ -1282,15 +1282,12 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, > * we know there must be at least 1 bit set due to the loop entry > * If there is no 0 firstzero will be 32 > */ > - /* TODO - ram_discard_range gets added in a later patch > int ret = ram_discard_range(mis, ramid, > startaddr + firstset - first_bit_offset, > startaddr + (firstzero - 1) - first_bit_offset); > - ret = -1; > if (ret) { > return ret; > } > - */ > > /* mask= .?0000000000 */ > /* ^fz ^fs */ > diff --git a/trace-events b/trace-events > index a555b56..f985117 100644 > --- a/trace-events > +++ b/trace-events > @@ -1217,6 +1217,7 @@ qemu_file_fclose(void) "" > migration_bitmap_sync_start(void) "" > migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64"" > migration_throttle(void) "" > +ram_postcopy_send_discard_bitmap(void) "" > > # hw/display/qxl.c > disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d" > @@ -1478,6 +1479,9 @@ rdma_start_incoming_migration_after_rdma_listen(void) "" > rdma_start_outgoing_migration_after_rdma_connect(void) "" > rdma_start_outgoing_migration_after_rdma_source_init(void) "" > > +# migration/postcopy-ram.c > +postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands" > + > # kvm-all.c > kvm_ioctl(int type, void *arg) "type 0x%x, arg %p" > kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"
* David Gibson (david@gibson.dropbear.id.au) wrote: > On Wed, Feb 25, 2015 at 04:51:50PM +0000, Dr. David Alan Gilbert (git) wrote: > > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com> > > > > Where postcopy is preceeded by a period of precopy, the destination will > > have received pages that may have been dirtied on the source after the > > page was sent. The destination must throw these pages away before > > starting it's CPUs. > > > > Maintain a 'sentmap' of pages that have already been sent. > > Calculate list of sent & dirty pages > > Provide helpers on the destination side to discard these. > > > > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com> > > --- > > arch_init.c | 275 ++++++++++++++++++++++++++++++++++++++- > > include/migration/migration.h | 12 ++ > > include/migration/postcopy-ram.h | 34 +++++ > > include/qemu/typedefs.h | 1 + > > migration/migration.c | 1 + > > migration/postcopy-ram.c | 111 ++++++++++++++++ > > savevm.c | 3 - > > trace-events | 4 + > > 8 files changed, 435 insertions(+), 6 deletions(-) > > > > diff --git a/arch_init.c b/arch_init.c > > index 7bc5fa6..21e7ebe 100644 > > --- a/arch_init.c > > +++ b/arch_init.c > > @@ -40,6 +40,7 @@ > > #include "hw/audio/audio.h" > > #include "sysemu/kvm.h" > > #include "migration/migration.h" > > +#include "migration/postcopy-ram.h" > > #include "hw/i386/smbios.h" > > #include "exec/address-spaces.h" > > #include "hw/audio/pcspk.h" > > @@ -414,9 +415,17 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, > > return bytes_sent; > > } > > > > +/* mr: The region to search for dirty pages in > > + * start: Start address (typically so we can continue from previous page) > > + * ram_addr_abs: Pointer into which to store the address of the dirty page > > + * within the global ram_addr space > > + * > > + * Returns: byte offset within memory region of the start of a dirty page > > + */ > > static inline > > ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, > 
> - ram_addr_t start) > > + ram_addr_t start, > > + ram_addr_t *ram_addr_abs) > > { > > unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS; > > unsigned long nr = base + (start >> TARGET_PAGE_BITS); > > @@ -435,6 +444,7 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, > > clear_bit(next, migration_bitmap); > > migration_dirty_pages--; > > } > > + *ram_addr_abs = next << TARGET_PAGE_BITS; > > return (next - base) << TARGET_PAGE_BITS; > > } > > > > @@ -571,6 +581,19 @@ static void migration_bitmap_sync(void) > > } > > } > > > > +static RAMBlock *ram_find_block(const char *id) > > +{ > > + RAMBlock *block; > > + > > + QTAILQ_FOREACH(block, &ram_list.blocks, next) { > > + if (!strcmp(id, block->idstr)) { > > + return block; > > + } > > + } > > + > > + return NULL; > > +} > > + > > /* > > * ram_save_page: Send the given page to the stream > > * > > @@ -659,13 +682,16 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage) > > bool complete_round = false; > > int bytes_sent = 0; > > MemoryRegion *mr; > > + ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in > > + ram_addr_t space */ > > > > if (!block) > > block = QTAILQ_FIRST(&ram_list.blocks); > > > > while (true) { > > mr = block->mr; > > - offset = migration_bitmap_find_and_reset_dirty(mr, offset); > > + offset = migration_bitmap_find_and_reset_dirty(mr, offset, > > + &dirty_ram_abs); > > if (complete_round && block == last_seen_block && > > offset >= last_offset) { > > break; > > @@ -683,6 +709,11 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage) > > > > /* if page is unmodified, continue to the next */ > > if (bytes_sent > 0) { > > + MigrationState *ms = migrate_get_current(); > > + if (ms->sentmap) { > > + set_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap); > > + } > > + > > last_sent_block = block; > > break; > > } > > @@ -742,12 +773,19 @@ void free_xbzrle_decoded_buf(void) > > > > static void migration_end(void) > > { > > + 
MigrationState *s = migrate_get_current(); > > + > > if (migration_bitmap) { > > memory_global_dirty_log_stop(); > > g_free(migration_bitmap); > > migration_bitmap = NULL; > > } > > > > + if (s->sentmap) { > > + g_free(s->sentmap); > > + s->sentmap = NULL; > > + } > > + > > XBZRLE_cache_lock(); > > if (XBZRLE.cache) { > > cache_fini(XBZRLE.cache); > > @@ -815,6 +853,232 @@ void ram_debug_dump_bitmap(unsigned long *todump, bool expected) > > } > > } > > > > +/* **** functions for postcopy ***** */ > > + > > +/* > > + * A helper to get 32 bits from a bit map; trivial for HOST_LONG_BITS=32 > > + * messier for 64; the bitmaps are actually long's that are 32 or 64bit > > + */ > > +static uint32_t get_32bits_map(unsigned long *map, int64_t start) > > +{ > > +#if HOST_LONG_BITS == 64 > > + uint64_t tmp64; > > + > > + tmp64 = map[start / 64]; > > + return (start & 32) ? (tmp64 >> 32) : (tmp64 & 0xffffffffu); > > +#elif HOST_LONG_BITS == 32 > > + /* > > + * Irrespective of host endianness, sentmap[n] is for pages earlier > > + * than sentmap[n+1] so we can't just cast up > > + */ > > + return map[start / 32]; > > +#else > > +#error "Host long other than 64/32 not supported" > > +#endif > > +} > > + > > +/* > > + * A helper to put 32 bits into a bit map; trivial for HOST_LONG_BITS=32 > > + * messier for 64; the bitmaps are actually long's that are 32 or 64bit > > + */ > > +__attribute__ (( unused )) /* Until later in patch series */ > > +static void put_32bits_map(unsigned long *map, int64_t start, > > + uint32_t v) > > +{ > > +#if HOST_LONG_BITS == 64 > > + uint64_t tmp64 = v; > > + uint64_t mask = 0xffffffffu; > > + > > + if (start & 32) { > > + tmp64 = tmp64 << 32; > > + mask = mask << 32; > > + } > > + > > + map[start / 64] = (map[start / 64] & ~mask) | tmp64; > > +#elif HOST_LONG_BITS == 32 > > + /* > > + * Irrespective of host endianness, sentmap[n] is for pages earlier > > + * than sentmap[n+1] so we can't just cast up > > + */ > > + map[start / 32] = v; > > +#else > 
> +#error "Host long other than 64/32 not supported" > > +#endif > > +} > > + > > +/* > > + * When working on 32bit chunks of a bitmap where the only valid section > > + * is between start..end (inclusive), generate a mask with only those > > + * valid bits set for the current 32bit word within that bitmask. > > + */ > > +static int make_32bit_mask(unsigned long start, unsigned long end, > > + unsigned long cur32) > > +{ > > + unsigned long first32, last32; > > + uint32_t mask = ~(uint32_t)0; > > + first32 = start / 32; > > + last32 = end / 32; > > + > > + if ((cur32 == first32) && (start & 31)) { > > + /* e.g. (start & 31) = 3 > > + * 1 << . -> 2^3 > > + * . - 1 -> 2^3 - 1 i.e. mask 2..0 > > + * ~. -> mask 31..3 > > + */ > > + mask &= ~((((uint32_t)1) << (start & 31)) - 1); > > + } > > + > > + if ((cur32 == last32) && ((end & 31) != 31)) { > > + /* e.g. (end & 31) = 3 > > + * . +1 -> 4 > > + * 1 << . -> 2^4 > > + * . -1 -> 2^4 - 1 > > + * = mask set 3..0 > > + */ > > + mask &= (((uint32_t)1) << ((end & 31) + 1)) - 1; > > + } > > + > > + return mask; > > +} > > Urgh.. looks correct, but continue to makes me question if this is a > sensible wire encoding of the discard map. > > > + > > +/* > > + * Callback from ram_postcopy_each_ram_discard for each RAMBlock > > + * start,end: Indexes into the bitmap for the first and last bit > > + * representing the named block > > + */ > > +static int pc_send_discard_bm_ram(MigrationState *ms, > > + PostcopyDiscardState *pds, > > + unsigned long start, unsigned long end) > > I know it's a long name, but I'd prefer to see this called > "postcopy_". I have to keep reminding myself that "pc" means > "postcopy" here, not that it's a PC machine type specific function. OK, yeh I see that's a clash; done. 
> > > +{ > > + /* > > + * There is no guarantee that start, end are on convenient 32bit multiples > > + * (We always send 32bit chunks over the wire, irrespective of long size) > > + */ > > + unsigned long first32, last32, cur32; > > + first32 = start / 32; > > + last32 = end / 32; > > + > > + for (cur32 = first32; cur32 <= last32; cur32++) { > > + /* Deal with start/end not on alignment */ > > + uint32_t mask = make_32bit_mask(start, end, cur32); > > + > > + uint32_t data = get_32bits_map(ms->sentmap, cur32 * 32); > > + data &= mask; > > + > > + if (data) { > > + postcopy_discard_send_chunk(ms, pds, (cur32-first32) * 32, data); > > + } > > + } > > + > > + return 0; > > +} > > > > +/* > > + * Utility for the outgoing postcopy code. > > + * Calls postcopy_send_discard_bm_ram for each RAMBlock > > + * passing it bitmap indexes and name. > > + * Returns: 0 on success > > + * (qemu_ram_foreach_block ends up passing unscaled lengths > > + * which would mean postcopy code would have to deal with target page) > > + */ > > +static int pc_each_ram_discard(MigrationState *ms) > > s/discard/send_discard/ Done. > > > +{ > > + struct RAMBlock *block; > > + int ret; > > + > > + QTAILQ_FOREACH(block, &ram_list.blocks, next) { > > + unsigned long first = block->offset >> TARGET_PAGE_BITS; > > + unsigned long last = (block->offset + (block->max_length-1)) > > + >> TARGET_PAGE_BITS; > > + PostcopyDiscardState *pds = postcopy_discard_send_init(ms, > > + first & 31, > > + block->idstr); > > + > > + /* > > + * Postcopy sends chunks of bitmap over the wire, but it > > + * just needs indexes at this point, avoids it having > > + * target page specific code. 
> > + */ > > + ret = pc_send_discard_bm_ram(ms, pds, first, last); > > + postcopy_discard_send_finish(ms, pds); > > + if (ret) { > > + return ret; > > + } > > + } > > + > > + return 0; > > +} > > + > > +/* > > + * Transmit the set of pages to be discarded after precopy to the target > > + * these are pages that have been sent previously but have been dirtied > > + * Hopefully this is pretty sparse > > + */ > > +int ram_postcopy_send_discard_bitmap(MigrationState *ms) > > +{ > > + /* This should be our last sync, the src is now paused */ > > + migration_bitmap_sync(); > > + > > + /* > > + * Update the sentmap to be sentmap&=dirty > > + */ > > + bitmap_and(ms->sentmap, ms->sentmap, migration_bitmap, > > + last_ram_offset() >> TARGET_PAGE_BITS); > > + > > + > > + trace_ram_postcopy_send_discard_bitmap(); > > +#ifdef DEBUG_POSTCOPY > > + ram_debug_dump_bitmap(ms->sentmap, false); > > +#endif > > + > > + return pc_each_ram_discard(ms); > > +} > > + > > +/* > > + * At the start of the postcopy phase of migration, any now-dirty > > + * precopied pages are discarded. > > + * > > + * start..end is an inclusive range of bits indexed in the source > > + * VMs bitmap for this RAMBlock, source_target_page_bits tells > > + * us what one of those bits represents. > > + * > > + * start/end are offsets from the start of the bitmap for RAMBlock 'block_name' > > + * > > + * Returns 0 on success. 
> > + */ > > +int ram_discard_range(MigrationIncomingState *mis, > > + const char *block_name, > > + uint64_t start, uint64_t end) > > +{ > > + assert(end >= start); > > + > > + RAMBlock *rb = ram_find_block(block_name); > > + > > + if (!rb) { > > + error_report("ram_discard_range: Failed to find block '%s'", > > + block_name); > > + return -1; > > + } > > + > > + uint64_t index_offset = rb->offset >> TARGET_PAGE_BITS; > > + postcopy_pmi_discard_range(mis, start + index_offset, (end - start) + 1); > > + > > + /* +1 gives the byte after the end of the last page to be discarded */ > > + ram_addr_t end_offset = (end+1) << TARGET_PAGE_BITS; > > + uint8_t *host_startaddr = rb->host + (start << TARGET_PAGE_BITS); > > + uint8_t *host_endaddr; > > + > > + if (end_offset <= rb->used_length) { > > + host_endaddr = rb->host + (end_offset-1); > > + return postcopy_ram_discard_range(mis, host_startaddr, host_endaddr); > > + } else { > > + error_report("ram_discard_range: Overrun block '%s' (%" PRIu64 > > + "/%" PRIu64 "/%zu)", > > + block_name, start, end, rb->used_length); > > + return -1; > > + } > > +} > > + > > static int ram_save_setup(QEMUFile *f, void *opaque) > > { > > RAMBlock *block; > > @@ -854,7 +1118,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque) > > > > acct_clear(); > > } > > - > > qemu_mutex_lock_iothread(); > > qemu_mutex_lock_ramlist(); > > bytes_transferred = 0; > > @@ -864,6 +1127,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque) > > migration_bitmap = bitmap_new(ram_bitmap_pages); > > bitmap_set(migration_bitmap, 0, ram_bitmap_pages); > > > > + if (migrate_postcopy_ram()) { > > + MigrationState *s = migrate_get_current(); > > + s->sentmap = bitmap_new(ram_bitmap_pages); > > + bitmap_clear(s->sentmap, 0, ram_bitmap_pages); > > + } > > + > > /* > > * Count the total number of pages used by ram blocks not including any > > * gaps due to alignment or unplugs. 
> > diff --git a/include/migration/migration.h b/include/migration/migration.h > > index 86200b9..e749f4c 100644 > > --- a/include/migration/migration.h > > +++ b/include/migration/migration.h > > @@ -125,6 +125,13 @@ struct MigrationState > > > > /* Flag set once the migration has been asked to enter postcopy */ > > bool start_postcopy; > > + > > + /* bitmap of pages that have been sent at least once > > + * only maintained and used in postcopy at the moment > > + * where it's used to send the dirtymap at the start > > + * of the postcopy phase > > + */ > > + unsigned long *sentmap; > > }; > > > > void process_incoming_migration(QEMUFile *f); > > @@ -194,6 +201,11 @@ double xbzrle_mig_cache_miss_rate(void); > > > > void ram_handle_compressed(void *host, uint8_t ch, uint64_t size); > > void ram_debug_dump_bitmap(unsigned long *todump, bool expected); > > +/* For outgoing discard bitmap */ > > +int ram_postcopy_send_discard_bitmap(MigrationState *ms); > > +/* For incoming postcopy discard */ > > +int ram_discard_range(MigrationIncomingState *mis, const char *block_name, > > + uint64_t start, uint64_t end); > > > > /** > > * @migrate_add_blocker - prevent migration from proceeding > > diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h > > index e93ee8a..1fec1c1 100644 > > --- a/include/migration/postcopy-ram.h > > +++ b/include/migration/postcopy-ram.h > > @@ -28,4 +28,38 @@ void postcopy_pmi_destroy(MigrationIncomingState *mis); > > void postcopy_pmi_discard_range(MigrationIncomingState *mis, > > size_t start, size_t npages); > > void postcopy_pmi_dump(MigrationIncomingState *mis); > > + > > +/* > > + * Discard the contents of memory start..end inclusive. 
> > + * We can assume that if we've been called postcopy_ram_hosttest returned true > > + */ > > +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, > > + uint8_t *end); > > + > > + > > +/* > > + * Called at the start of each RAMBlock by the bitmap code > > + * offset is the bit within the first 32bit chunk of mask > > + * that represents the first page of the RAM Block > > + * Returns a new PDS > > + */ > > +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms, > > + uint8_t offset, > > + const char *name); > > + > > +/* > > + * Called by the bitmap code for each chunk to discard > > + * May send a discard message, may just leave it queued to > > + * be sent later > > + */ > > +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds, > > + unsigned long pos, uint32_t bitmap); > > + > > +/* > > + * Called at the end of each RAMBlock by the bitmap code > > + * Sends any outstanding discard messages, frees the PDS > > + */ > > +void postcopy_discard_send_finish(MigrationState *ms, > > + PostcopyDiscardState *pds); > > + > > #endif > > diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h > > index 924eeb6..0651275 100644 > > --- a/include/qemu/typedefs.h > > +++ b/include/qemu/typedefs.h > > @@ -61,6 +61,7 @@ typedef struct PCIExpressHost PCIExpressHost; > > typedef struct PCIHostState PCIHostState; > > typedef struct PCMCIACardState PCMCIACardState; > > typedef struct PixelFormat PixelFormat; > > +typedef struct PostcopyDiscardState PostcopyDiscardState; > > typedef struct PostcopyPMI PostcopyPMI; > > typedef struct PropertyInfo PropertyInfo; > > typedef struct Property Property; > > diff --git a/migration/migration.c b/migration/migration.c > > index 6b20b56..850fe1a 100644 > > --- a/migration/migration.c > > +++ b/migration/migration.c > > @@ -22,6 +22,7 @@ > > #include "block/block.h" > > #include "qemu/sockets.h" > > #include "migration/block.h" > > +#include "migration/postcopy-ram.h" > > 
#include "qemu/thread.h" > > #include "qmp-commands.h" > > #include "trace.h" > > diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c > > index 4f29055..391e9c6 100644 > > --- a/migration/postcopy-ram.c > > +++ b/migration/postcopy-ram.c > > @@ -28,6 +28,19 @@ > > #include "qemu/error-report.h" > > #include "trace.h" > > > > +#define MAX_DISCARDS_PER_COMMAND 12 > > + > > +struct PostcopyDiscardState { > > + const char *name; > > + uint16_t cur_entry; > > + uint64_t addrlist[MAX_DISCARDS_PER_COMMAND]; > > + uint32_t masklist[MAX_DISCARDS_PER_COMMAND]; > > + uint8_t offset; /* Offset within 32bit mask at addr0 representing 1st > > + page of block */ > > + unsigned int nsentwords; > > + unsigned int nsentcmds; > > +}; > > + > > /* Postcopy needs to detect accesses to pages that haven't yet been copied > > * across, and efficiently map new pages in, the techniques for doing this > > * are target OS specific. > > @@ -364,6 +377,21 @@ out: > > return ret; > > } > > > > +/* > > + * Discard the contents of memory start..end inclusive. 
> > + * We can assume that if we've been called postcopy_ram_hosttest returned true > > + */ > > +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, > > + uint8_t *end) > > +{ > > + if (madvise(start, (end-start)+1, MADV_DONTNEED)) { > > + perror("postcopy_ram_discard_range MADV_DONTNEED"); > > + return -1; > > + } > > + > > + return 0; > > +} > > + > > #else > > /* No target OS support, stubs just fail */ > > > > @@ -380,5 +408,88 @@ void postcopy_hook_early_receive(MigrationIncomingState *mis, > > /* We don't support postcopy so don't care */ > > } > > > > +void postcopy_pmi_destroy(MigrationIncomingState *mis) > > +{ > > + /* Called in normal cleanup path - so it's OK */ > > +} > > + > > +void postcopy_pmi_discard_range(MigrationIncomingState *mis, > > + size_t start, size_t npages) > > +{ > > + assert(0); > > +} > > + > > +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, > > + uint8_t *end) > > +{ > > + assert(0); > > +} > > #endif > > > > +/* ------------------------------------------------------------------------- */ > > + > > +/* > > + * Called at the start of each RAMBlock by the bitmap code > > + * offset is the bit within the first 64bit chunk of mask > > + * that represents the first page of the RAM Block > > + * Returns a new PDS > > + */ > > +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms, > > + uint8_t offset, > > + const char *name) > > +{ > > + PostcopyDiscardState *res = g_try_malloc(sizeof(PostcopyDiscardState)); > > + > > + if (res) { > > + res->name = name; > > + res->cur_entry = 0; > > + res->nsentwords = 0; > > + res->nsentcmds = 0; > > + res->offset = offset; > > + } > > + > > + return res; > > +} > > + > > +/* > > + * Called by the bitmap code for each chunk to discard > > + * May send a discard message, may just leave it queued to > > + * be sent later > > + */ > > +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds, > > + unsigned long pos, 
uint32_t bitmap) > > +{ > > + pds->addrlist[pds->cur_entry] = pos; > > + pds->masklist[pds->cur_entry] = bitmap; > > + pds->cur_entry++; > > + pds->nsentwords++; > > + > > + if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) { > > + /* Full set, ship it! */ > > + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name, > > + pds->cur_entry, pds->offset, > > + pds->addrlist, pds->masklist); > > + pds->nsentcmds++; > > + pds->cur_entry = 0; > > + } > > +} > > + > > +/* > > + * Called at the end of each RAMBlock by the bitmap code > > + * Sends any outstanding discard messages, frees the PDS > > + */ > > +void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds) > > +{ > > + /* Anything unsent? */ > > + if (pds->cur_entry) { > > + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name, > > + pds->cur_entry, pds->offset, > > + pds->addrlist, pds->masklist); > > + pds->nsentcmds++; > > + } > > + > > + trace_postcopy_discard_send_finish(pds->name, pds->nsentwords, > > + pds->nsentcmds); > > + > > + g_free(pds); > > +} > > diff --git a/savevm.c b/savevm.c > > index 1e8d289..2589b8c 100644 > > --- a/savevm.c > > +++ b/savevm.c > > @@ -1282,15 +1282,12 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, > > * we know there must be at least 1 bit set due to the loop entry > > * If there is no 0 firstzero will be 32 > > */ > > - /* TODO - ram_discard_range gets added in a later patch > > int ret = ram_discard_range(mis, ramid, > > startaddr + firstset - first_bit_offset, > > startaddr + (firstzero - 1) - first_bit_offset); > > - ret = -1; > > if (ret) { > > return ret; > > } > > - */ > > > > /* mask= .?0000000000 */ > > /* ^fz ^fs */ > > diff --git a/trace-events b/trace-events > > index a555b56..f985117 100644 > > --- a/trace-events > > +++ b/trace-events > > @@ -1217,6 +1217,7 @@ qemu_file_fclose(void) "" > > migration_bitmap_sync_start(void) "" > > migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64"" > 
> migration_throttle(void) "" > > +ram_postcopy_send_discard_bitmap(void) "" > > > > # hw/display/qxl.c > > disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d" > > @@ -1478,6 +1479,9 @@ rdma_start_incoming_migration_after_rdma_listen(void) "" > > rdma_start_outgoing_migration_after_rdma_connect(void) "" > > rdma_start_outgoing_migration_after_rdma_source_init(void) "" > > > > +# migration/postcopy-ram.c > > +postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands" > > + > > # kvm-all.c > > kvm_ioctl(int type, void *arg) "type 0x%x, arg %p" > > kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p" > > -- > David Gibson | I'll have my music baroque, and my code > david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ > | _way_ _around_! > http://www.ozlabs.org/~dgibson -- Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff --git a/arch_init.c b/arch_init.c index 7bc5fa6..21e7ebe 100644 --- a/arch_init.c +++ b/arch_init.c @@ -40,6 +40,7 @@ #include "hw/audio/audio.h" #include "sysemu/kvm.h" #include "migration/migration.h" +#include "migration/postcopy-ram.h" #include "hw/i386/smbios.h" #include "exec/address-spaces.h" #include "hw/audio/pcspk.h" @@ -414,9 +415,17 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, return bytes_sent; } +/* mr: The region to search for dirty pages in + * start: Start address (typically so we can continue from previous page) + * ram_addr_abs: Pointer into which to store the address of the dirty page + * within the global ram_addr space + * + * Returns: byte offset within memory region of the start of a dirty page + */ static inline ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, - ram_addr_t start) + ram_addr_t start, + ram_addr_t *ram_addr_abs) { unsigned long base = mr->ram_addr >> TARGET_PAGE_BITS; unsigned long nr = base + (start >> TARGET_PAGE_BITS); @@ -435,6 +444,7 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, clear_bit(next, migration_bitmap); migration_dirty_pages--; } + *ram_addr_abs = next << TARGET_PAGE_BITS; return (next - base) << TARGET_PAGE_BITS; } @@ -571,6 +581,19 @@ static void migration_bitmap_sync(void) } } +static RAMBlock *ram_find_block(const char *id) +{ + RAMBlock *block; + + QTAILQ_FOREACH(block, &ram_list.blocks, next) { + if (!strcmp(id, block->idstr)) { + return block; + } + } + + return NULL; +} + /* * ram_save_page: Send the given page to the stream * @@ -659,13 +682,16 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage) bool complete_round = false; int bytes_sent = 0; MemoryRegion *mr; + ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in + ram_addr_t space */ if (!block) block = QTAILQ_FIRST(&ram_list.blocks); while (true) { mr = block->mr; - offset = migration_bitmap_find_and_reset_dirty(mr, offset); + offset = 
migration_bitmap_find_and_reset_dirty(mr, offset, + &dirty_ram_abs); if (complete_round && block == last_seen_block && offset >= last_offset) { break; @@ -683,6 +709,11 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage) /* if page is unmodified, continue to the next */ if (bytes_sent > 0) { + MigrationState *ms = migrate_get_current(); + if (ms->sentmap) { + set_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap); + } + last_sent_block = block; break; } @@ -742,12 +773,19 @@ void free_xbzrle_decoded_buf(void) static void migration_end(void) { + MigrationState *s = migrate_get_current(); + if (migration_bitmap) { memory_global_dirty_log_stop(); g_free(migration_bitmap); migration_bitmap = NULL; } + if (s->sentmap) { + g_free(s->sentmap); + s->sentmap = NULL; + } + XBZRLE_cache_lock(); if (XBZRLE.cache) { cache_fini(XBZRLE.cache); @@ -815,6 +853,232 @@ void ram_debug_dump_bitmap(unsigned long *todump, bool expected) } } +/* **** functions for postcopy ***** */ + +/* + * A helper to get 32 bits from a bit map; trivial for HOST_LONG_BITS=32 + * messier for 64; the bitmaps are actually long's that are 32 or 64bit + */ +static uint32_t get_32bits_map(unsigned long *map, int64_t start) +{ +#if HOST_LONG_BITS == 64 + uint64_t tmp64; + + tmp64 = map[start / 64]; + return (start & 32) ? 
(tmp64 >> 32) : (tmp64 & 0xffffffffu); +#elif HOST_LONG_BITS == 32 + /* + * Irrespective of host endianness, sentmap[n] is for pages earlier + * than sentmap[n+1] so we can't just cast up + */ + return map[start / 32]; +#else +#error "Host long other than 64/32 not supported" +#endif +} + +/* + * A helper to put 32 bits into a bit map; trivial for HOST_LONG_BITS=32 + * messier for 64; the bitmaps are actually long's that are 32 or 64bit + */ +__attribute__ (( unused )) /* Until later in patch series */ +static void put_32bits_map(unsigned long *map, int64_t start, + uint32_t v) +{ +#if HOST_LONG_BITS == 64 + uint64_t tmp64 = v; + uint64_t mask = 0xffffffffu; + + if (start & 32) { + tmp64 = tmp64 << 32; + mask = mask << 32; + } + + map[start / 64] = (map[start / 64] & ~mask) | tmp64; +#elif HOST_LONG_BITS == 32 + /* + * Irrespective of host endianness, sentmap[n] is for pages earlier + * than sentmap[n+1] so we can't just cast up + */ + map[start / 32] = v; +#else +#error "Host long other than 64/32 not supported" +#endif +} + +/* + * When working on 32bit chunks of a bitmap where the only valid section + * is between start..end (inclusive), generate a mask with only those + * valid bits set for the current 32bit word within that bitmask. + */ +static int make_32bit_mask(unsigned long start, unsigned long end, + unsigned long cur32) +{ + unsigned long first32, last32; + uint32_t mask = ~(uint32_t)0; + first32 = start / 32; + last32 = end / 32; + + if ((cur32 == first32) && (start & 31)) { + /* e.g. (start & 31) = 3 + * 1 << . -> 2^3 + * . - 1 -> 2^3 - 1 i.e. mask 2..0 + * ~. -> mask 31..3 + */ + mask &= ~((((uint32_t)1) << (start & 31)) - 1); + } + + if ((cur32 == last32) && ((end & 31) != 31)) { + /* e.g. (end & 31) = 3 + * . +1 -> 4 + * 1 << . -> 2^4 + * . 
-1 -> 2^4 - 1 + * = mask set 3..0 + */ + mask &= (((uint32_t)1) << ((end & 31) + 1)) - 1; + } + + return mask; +} + +/* + * Callback from ram_postcopy_each_ram_discard for each RAMBlock + * start,end: Indexes into the bitmap for the first and last bit + * representing the named block + */ +static int pc_send_discard_bm_ram(MigrationState *ms, + PostcopyDiscardState *pds, + unsigned long start, unsigned long end) +{ + /* + * There is no guarantee that start, end are on convenient 32bit multiples + * (We always send 32bit chunks over the wire, irrespective of long size) + */ + unsigned long first32, last32, cur32; + first32 = start / 32; + last32 = end / 32; + + for (cur32 = first32; cur32 <= last32; cur32++) { + /* Deal with start/end not on alignment */ + uint32_t mask = make_32bit_mask(start, end, cur32); + + uint32_t data = get_32bits_map(ms->sentmap, cur32 * 32); + data &= mask; + + if (data) { + postcopy_discard_send_chunk(ms, pds, (cur32-first32) * 32, data); + } + } + + return 0; +} + +/* + * Utility for the outgoing postcopy code. + * Calls postcopy_send_discard_bm_ram for each RAMBlock + * passing it bitmap indexes and name. + * Returns: 0 on success + * (qemu_ram_foreach_block ends up passing unscaled lengths + * which would mean postcopy code would have to deal with target page) + */ +static int pc_each_ram_discard(MigrationState *ms) +{ + struct RAMBlock *block; + int ret; + + QTAILQ_FOREACH(block, &ram_list.blocks, next) { + unsigned long first = block->offset >> TARGET_PAGE_BITS; + unsigned long last = (block->offset + (block->max_length-1)) + >> TARGET_PAGE_BITS; + PostcopyDiscardState *pds = postcopy_discard_send_init(ms, + first & 31, + block->idstr); + + /* + * Postcopy sends chunks of bitmap over the wire, but it + * just needs indexes at this point, avoids it having + * target page specific code. 
+ */ + ret = pc_send_discard_bm_ram(ms, pds, first, last); + postcopy_discard_send_finish(ms, pds); + if (ret) { + return ret; + } + } + + return 0; +} + +/* + * Transmit the set of pages to be discarded after precopy to the target + * these are pages that have been sent previously but have been dirtied + * Hopefully this is pretty sparse + */ +int ram_postcopy_send_discard_bitmap(MigrationState *ms) +{ + /* This should be our last sync, the src is now paused */ + migration_bitmap_sync(); + + /* + * Update the sentmap to be sentmap&=dirty + */ + bitmap_and(ms->sentmap, ms->sentmap, migration_bitmap, + last_ram_offset() >> TARGET_PAGE_BITS); + + + trace_ram_postcopy_send_discard_bitmap(); +#ifdef DEBUG_POSTCOPY + ram_debug_dump_bitmap(ms->sentmap, false); +#endif + + return pc_each_ram_discard(ms); +} + +/* + * At the start of the postcopy phase of migration, any now-dirty + * precopied pages are discarded. + * + * start..end is an inclusive range of bits indexed in the source + * VMs bitmap for this RAMBlock, source_target_page_bits tells + * us what one of those bits represents. + * + * start/end are offsets from the start of the bitmap for RAMBlock 'block_name' + * + * Returns 0 on success. 
+ */ +int ram_discard_range(MigrationIncomingState *mis, + const char *block_name, + uint64_t start, uint64_t end) +{ + assert(end >= start); + + RAMBlock *rb = ram_find_block(block_name); + + if (!rb) { + error_report("ram_discard_range: Failed to find block '%s'", + block_name); + return -1; + } + + uint64_t index_offset = rb->offset >> TARGET_PAGE_BITS; + postcopy_pmi_discard_range(mis, start + index_offset, (end - start) + 1); + + /* +1 gives the byte after the end of the last page to be discarded */ + ram_addr_t end_offset = (end+1) << TARGET_PAGE_BITS; + uint8_t *host_startaddr = rb->host + (start << TARGET_PAGE_BITS); + uint8_t *host_endaddr; + + if (end_offset <= rb->used_length) { + host_endaddr = rb->host + (end_offset-1); + return postcopy_ram_discard_range(mis, host_startaddr, host_endaddr); + } else { + error_report("ram_discard_range: Overrun block '%s' (%" PRIu64 + "/%" PRIu64 "/%zu)", + block_name, start, end, rb->used_length); + return -1; + } +} + static int ram_save_setup(QEMUFile *f, void *opaque) { RAMBlock *block; @@ -854,7 +1118,6 @@ static int ram_save_setup(QEMUFile *f, void *opaque) acct_clear(); } - qemu_mutex_lock_iothread(); qemu_mutex_lock_ramlist(); bytes_transferred = 0; @@ -864,6 +1127,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque) migration_bitmap = bitmap_new(ram_bitmap_pages); bitmap_set(migration_bitmap, 0, ram_bitmap_pages); + if (migrate_postcopy_ram()) { + MigrationState *s = migrate_get_current(); + s->sentmap = bitmap_new(ram_bitmap_pages); + bitmap_clear(s->sentmap, 0, ram_bitmap_pages); + } + /* * Count the total number of pages used by ram blocks not including any * gaps due to alignment or unplugs. 
diff --git a/include/migration/migration.h b/include/migration/migration.h index 86200b9..e749f4c 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -125,6 +125,13 @@ struct MigrationState /* Flag set once the migration has been asked to enter postcopy */ bool start_postcopy; + + /* bitmap of pages that have been sent at least once + * only maintained and used in postcopy at the moment + * where it's used to send the dirtymap at the start + * of the postcopy phase + */ + unsigned long *sentmap; }; void process_incoming_migration(QEMUFile *f); @@ -194,6 +201,11 @@ double xbzrle_mig_cache_miss_rate(void); void ram_handle_compressed(void *host, uint8_t ch, uint64_t size); void ram_debug_dump_bitmap(unsigned long *todump, bool expected); +/* For outgoing discard bitmap */ +int ram_postcopy_send_discard_bitmap(MigrationState *ms); +/* For incoming postcopy discard */ +int ram_discard_range(MigrationIncomingState *mis, const char *block_name, + uint64_t start, uint64_t end); /** * @migrate_add_blocker - prevent migration from proceeding diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h index e93ee8a..1fec1c1 100644 --- a/include/migration/postcopy-ram.h +++ b/include/migration/postcopy-ram.h @@ -28,4 +28,38 @@ void postcopy_pmi_destroy(MigrationIncomingState *mis); void postcopy_pmi_discard_range(MigrationIncomingState *mis, size_t start, size_t npages); void postcopy_pmi_dump(MigrationIncomingState *mis); + +/* + * Discard the contents of memory start..end inclusive. 
+ * We can assume that if we've been called postcopy_ram_hosttest returned true + */ +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, + uint8_t *end); + + +/* + * Called at the start of each RAMBlock by the bitmap code + * offset is the bit within the first 32bit chunk of mask + * that represents the first page of the RAM Block + * Returns a new PDS + */ +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms, + uint8_t offset, + const char *name); + +/* + * Called by the bitmap code for each chunk to discard + * May send a discard message, may just leave it queued to + * be sent later + */ +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds, + unsigned long pos, uint32_t bitmap); + +/* + * Called at the end of each RAMBlock by the bitmap code + * Sends any outstanding discard messages, frees the PDS + */ +void postcopy_discard_send_finish(MigrationState *ms, + PostcopyDiscardState *pds); + #endif diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h index 924eeb6..0651275 100644 --- a/include/qemu/typedefs.h +++ b/include/qemu/typedefs.h @@ -61,6 +61,7 @@ typedef struct PCIExpressHost PCIExpressHost; typedef struct PCIHostState PCIHostState; typedef struct PCMCIACardState PCMCIACardState; typedef struct PixelFormat PixelFormat; +typedef struct PostcopyDiscardState PostcopyDiscardState; typedef struct PostcopyPMI PostcopyPMI; typedef struct PropertyInfo PropertyInfo; typedef struct Property Property; diff --git a/migration/migration.c b/migration/migration.c index 6b20b56..850fe1a 100644 --- a/migration/migration.c +++ b/migration/migration.c @@ -22,6 +22,7 @@ #include "block/block.h" #include "qemu/sockets.h" #include "migration/block.h" +#include "migration/postcopy-ram.h" #include "qemu/thread.h" #include "qmp-commands.h" #include "trace.h" diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c index 4f29055..391e9c6 100644 --- a/migration/postcopy-ram.c +++ 
b/migration/postcopy-ram.c @@ -28,6 +28,19 @@ #include "qemu/error-report.h" #include "trace.h" +#define MAX_DISCARDS_PER_COMMAND 12 + +struct PostcopyDiscardState { + const char *name; + uint16_t cur_entry; + uint64_t addrlist[MAX_DISCARDS_PER_COMMAND]; + uint32_t masklist[MAX_DISCARDS_PER_COMMAND]; + uint8_t offset; /* Offset within 32bit mask at addr0 representing 1st + page of block */ + unsigned int nsentwords; + unsigned int nsentcmds; +}; + /* Postcopy needs to detect accesses to pages that haven't yet been copied * across, and efficiently map new pages in, the techniques for doing this * are target OS specific. @@ -364,6 +377,21 @@ out: return ret; } +/* + * Discard the contents of memory start..end inclusive. + * We can assume that if we've been called postcopy_ram_hosttest returned true + */ +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, + uint8_t *end) +{ + if (madvise(start, (end-start)+1, MADV_DONTNEED)) { + perror("postcopy_ram_discard_range MADV_DONTNEED"); + return -1; + } + + return 0; +} + #else /* No target OS support, stubs just fail */ @@ -380,5 +408,88 @@ void postcopy_hook_early_receive(MigrationIncomingState *mis, /* We don't support postcopy so don't care */ } +void postcopy_pmi_destroy(MigrationIncomingState *mis) +{ + /* Called in normal cleanup path - so it's OK */ +} + +void postcopy_pmi_discard_range(MigrationIncomingState *mis, + size_t start, size_t npages) +{ + assert(0); +} + +int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, + uint8_t *end) +{ + assert(0); +} #endif +/* ------------------------------------------------------------------------- */ + +/* + * Called at the start of each RAMBlock by the bitmap code + * offset is the bit within the first 64bit chunk of mask + * that represents the first page of the RAM Block + * Returns a new PDS + */ +PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms, + uint8_t offset, + const char *name) +{ + 
PostcopyDiscardState *res = g_try_malloc(sizeof(PostcopyDiscardState)); + + if (res) { + res->name = name; + res->cur_entry = 0; + res->nsentwords = 0; + res->nsentcmds = 0; + res->offset = offset; + } + + return res; +} + +/* + * Called by the bitmap code for each chunk to discard + * May send a discard message, may just leave it queued to + * be sent later + */ +void postcopy_discard_send_chunk(MigrationState *ms, PostcopyDiscardState *pds, + unsigned long pos, uint32_t bitmap) +{ + pds->addrlist[pds->cur_entry] = pos; + pds->masklist[pds->cur_entry] = bitmap; + pds->cur_entry++; + pds->nsentwords++; + + if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) { + /* Full set, ship it! */ + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name, + pds->cur_entry, pds->offset, + pds->addrlist, pds->masklist); + pds->nsentcmds++; + pds->cur_entry = 0; + } +} + +/* + * Called at the end of each RAMBlock by the bitmap code + * Sends any outstanding discard messages, frees the PDS + */ +void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds) +{ + /* Anything unsent? 
*/ + if (pds->cur_entry) { + qemu_savevm_send_postcopy_ram_discard(ms->file, pds->name, + pds->cur_entry, pds->offset, + pds->addrlist, pds->masklist); + pds->nsentcmds++; + } + + trace_postcopy_discard_send_finish(pds->name, pds->nsentwords, + pds->nsentcmds); + + g_free(pds); +} diff --git a/savevm.c b/savevm.c index 1e8d289..2589b8c 100644 --- a/savevm.c +++ b/savevm.c @@ -1282,15 +1282,12 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, * we know there must be at least 1 bit set due to the loop entry * If there is no 0 firstzero will be 32 */ - /* TODO - ram_discard_range gets added in a later patch int ret = ram_discard_range(mis, ramid, startaddr + firstset - first_bit_offset, startaddr + (firstzero - 1) - first_bit_offset); - ret = -1; if (ret) { return ret; } - */ /* mask= .?0000000000 */ /* ^fz ^fs */ diff --git a/trace-events b/trace-events index a555b56..f985117 100644 --- a/trace-events +++ b/trace-events @@ -1217,6 +1217,7 @@ qemu_file_fclose(void) "" migration_bitmap_sync_start(void) "" migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64"" migration_throttle(void) "" +ram_postcopy_send_discard_bitmap(void) "" # hw/display/qxl.c disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d" @@ -1478,6 +1479,9 @@ rdma_start_incoming_migration_after_rdma_listen(void) "" rdma_start_outgoing_migration_after_rdma_connect(void) "" rdma_start_outgoing_migration_after_rdma_source_init(void) "" +# migration/postcopy-ram.c +postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands" + # kvm-all.c kvm_ioctl(int type, void *arg) "type 0x%x, arg %p" kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"