[v5,34/45] Page request: Consume pages off the post-copy queue

Message ID 1424883128-9841-35-git-send-email-dgilbert@redhat.com
State New

Commit Message

Dr. David Alan Gilbert Feb. 25, 2015, 4:51 p.m. UTC
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

When transmitting RAM pages, consume pages that have been queued by
MIG_RPCOMM_REQPAGE commands and send them ahead of normal page scanning.

Note:
  a) After a queued page the linear walk carries on from just after the
unqueued page; there is a reasonable chance that the destination
was about to ask for other nearby pages anyway.

  b) We have to be careful of any assumptions that the page-walking
code makes; in particular it takes some shortcuts on its first linear
walk that break as soon as we send a queued page.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 arch_init.c  | 154 +++++++++++++++++++++++++++++++++++++++++++++++++----------
 trace-events |   2 +
 2 files changed, 131 insertions(+), 25 deletions(-)
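
To make note (a) concrete, here is a minimal, self-contained C sketch of
the selection policy, with hypothetical stand-ins for the real code: page
numbers stand in for block/offset pairs, and unqueue_page() /
find_and_save_page() model ram_save_unqueue_page() /
ram_find_and_save_block(). Queued requests are drained first, otherwise
the linear scan runs, and the scan cursor resumes just after whatever was
sent.

    #include <stdbool.h>
    #include <stdio.h>

    #define N_PAGES 16

    /* Toy state: a dirty bitmap plus a FIFO of requested page numbers. */
    static bool dirty[N_PAGES];
    static int queue[N_PAGES];
    static int q_head, q_tail;

    /* Pop the next still-dirty queued request, if any. */
    static bool unqueue_page(int *page)
    {
        while (q_head != q_tail) {
            int p = queue[q_head++];
            if (dirty[p]) {          /* skip requests already satisfied */
                *page = p;
                return true;
            }
        }
        return false;
    }

    /* Queued requests win over the linear scan; the scan cursor resumes
     * from just after whatever page was sent last (note (a) above). */
    static int find_and_save_page(int *cursor)
    {
        int page;
        if (!unqueue_page(&page)) {
            int scanned;
            for (scanned = 0; scanned < N_PAGES; scanned++) {
                page = (*cursor + scanned) % N_PAGES;
                if (dirty[page]) {
                    break;
                }
            }
            if (scanned == N_PAGES) {
                return -1;           /* complete round, nothing dirty */
            }
        }
        dirty[page] = false;
        *cursor = (page + 1) % N_PAGES;
        return page;
    }

    int main(void)
    {
        int cursor = 0, page;
        for (page = 0; page < N_PAGES; page++) {
            dirty[page] = true;
        }
        queue[q_tail++] = 9;                /* destination faulted on page 9 */
        while ((page = find_and_save_page(&cursor)) >= 0) {
            printf("sent page %d\n", page); /* 9 first, then 10, 11, ... */
        }
        return 0;
    }

Running it sends page 9 first, then continues the walk from page 10,
wrapping around to finish pages 0..8.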

Comments

David Gibson March 24, 2015, 2:15 a.m. UTC | #1
On Wed, Feb 25, 2015 at 04:51:57PM +0000, Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> 
> When transmitting RAM pages, consume pages that have been queued by
> MIG_RPCOMM_REQPAGE commands and send them ahead of normal page scanning.
> 
> Note:
>   a) After a queued page the linear walk carries on from just after the
> unqueued page; there is a reasonable chance that the destination
> was about to ask for other nearby pages anyway.
> 
>   b) We have to be careful of any assumptions that the page-walking
> code makes; in particular it takes some shortcuts on its first linear
> walk that break as soon as we send a queued page.
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  arch_init.c  | 154 +++++++++++++++++++++++++++++++++++++++++++++++++----------
>  trace-events |   2 +
>  2 files changed, 131 insertions(+), 25 deletions(-)
> 
> diff --git a/arch_init.c b/arch_init.c
> index 9d8fc6b..acf65e1 100644
> --- a/arch_init.c
> +++ b/arch_init.c
> @@ -328,6 +328,7 @@ static RAMBlock *last_seen_block;
>  /* This is the last block from where we have sent data */
>  static RAMBlock *last_sent_block;
>  static ram_addr_t last_offset;
> +static bool last_was_from_queue;
>  static unsigned long *migration_bitmap;
>  static uint64_t migration_dirty_pages;
>  static uint32_t last_version;
> @@ -461,6 +462,19 @@ static inline bool migration_bitmap_set_dirty(ram_addr_t addr)
>      return ret;
>  }
>  
> +static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
> +{
> +    bool ret;
> +    int nr = addr >> TARGET_PAGE_BITS;
> +
> +    ret = test_and_clear_bit(nr, migration_bitmap);
> +
> +    if (ret) {
> +        migration_dirty_pages--;
> +    }
> +    return ret;
> +}
> +
>  static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
>  {
>      ram_addr_t addr;
> @@ -669,6 +683,39 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
>  }
>  
>  /*
> + * Unqueue a page from the queue fed by postcopy page requests
> + *
> + * Returns:      The RAMBlock* to transmit from (or NULL if the queue is empty)
> + *      ms:      MigrationState in
> + *  offset:      the byte offset within the RAMBlock for the start of the page
> + * ram_addr_abs: global offset in the dirty/sent bitmaps
> + */
> +static RAMBlock *ram_save_unqueue_page(MigrationState *ms, ram_addr_t *offset,
> +                                       ram_addr_t *ram_addr_abs)
> +{
> +    RAMBlock *result = NULL;
> +    qemu_mutex_lock(&ms->src_page_req_mutex);
> +    if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
> +        struct MigrationSrcPageRequest *entry =
> +                                    QSIMPLEQ_FIRST(&ms->src_page_requests);
> +        result = entry->rb;
> +        *offset = entry->offset;
> +        *ram_addr_abs = (entry->offset + entry->rb->offset) & TARGET_PAGE_MASK;
> +
> +        if (entry->len > TARGET_PAGE_SIZE) {
> +            entry->len -= TARGET_PAGE_SIZE;
> +            entry->offset += TARGET_PAGE_SIZE;
> +        } else {
> +            QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
> +            g_free(entry);
> +        }
> +    }
> +    qemu_mutex_unlock(&ms->src_page_req_mutex);
> +
> +    return result;
> +}
> +
> +/*
>   * Queue the pages for transmission, e.g. a request from postcopy destination
>   *   ms: MigrationState in which the queue is held
>   *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
> @@ -732,46 +779,102 @@ int ram_save_queue_pages(MigrationState *ms, const char *rbname,
>  
>  static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
>  {
> +    MigrationState *ms = migrate_get_current();
>      RAMBlock *block = last_seen_block;
> +    RAMBlock *tmpblock;
>      ram_addr_t offset = last_offset;
> +    ram_addr_t tmpoffset;
>      bool complete_round = false;
>      int bytes_sent = 0;
> -    MemoryRegion *mr;
>      ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
>                                   ram_addr_t space */
> +    unsigned long hps = sysconf(_SC_PAGESIZE);
>  
> -    if (!block)
> +    if (!block) {
>          block = QTAILQ_FIRST(&ram_list.blocks);
> +        last_was_from_queue = false;
> +    }
>  
> -    while (true) {
> -        mr = block->mr;
> -        offset = migration_bitmap_find_and_reset_dirty(mr, offset,
> -                                                       &dirty_ram_abs);
> -        if (complete_round && block == last_seen_block &&
> -            offset >= last_offset) {
> -            break;
> +    while (true) { /* Until we send a block or run out of stuff to send */
> +        tmpblock = NULL;
> +
> +        /*
> +         * Don't break host-page chunks up with queue items
> +         * so only unqueue if,
> +         *   a) The last item came from the queue anyway
> +         *   b) The last sent item was the last target-page in a host page
> +         */
> +        if (last_was_from_queue || !last_sent_block ||
> +            ((last_offset & (hps - 1)) == (hps - TARGET_PAGE_SIZE))) {
> +            tmpblock = ram_save_unqueue_page(ms, &tmpoffset, &dirty_ram_abs);

This test for whether we've completed a host page is pretty awkward.
I think it would be cleaner to have an inner loop / helper function
that completes sending an entire host page (whether requested or
scanned), before allowing the outer loop to even come back to here to
reconsider the queue.
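
For illustration, a hedged sketch of such a helper, assuming the host
page size hps is a power-of-two multiple of the target page size tps;
save_host_page() and the send_target_page() callback are hypothetical
names, not code from this series:

    static int save_host_page(unsigned long start, unsigned long hps,
                              unsigned long tps,
                              int (*send_target_page)(unsigned long addr))
    {
        unsigned long base = start & ~(hps - 1);  /* align down to host page */
        unsigned long off;
        int total = 0;

        for (off = 0; off < hps; off += tps) {
            int sent = send_target_page(base + off);
            if (sent < 0) {
                return sent;                      /* propagate send errors */
            }
            total += sent;
        }
        /*
         * On return the last page sent is always the final target page of
         * a host page, so the caller no longer needs the explicit
         * (last_offset & (hps - 1)) == (hps - TARGET_PAGE_SIZE) test.
         */
        return total;
    }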

>          }
> -        if (offset >= block->used_length) {
> -            offset = 0;
> -            block = QTAILQ_NEXT(block, next);
> -            if (!block) {
> -                block = QTAILQ_FIRST(&ram_list.blocks);
> -                complete_round = true;
> -                ram_bulk_stage = false;
> +
> +        if (tmpblock) {
> +            /* We've got a block from the postcopy queue */
> +            trace_ram_find_and_save_block_postcopy(tmpblock->idstr,
> +                                                   (uint64_t)tmpoffset,
> +                                                   (uint64_t)dirty_ram_abs);
> +            /*
> +             * We're sending this page, and since it's postcopy nothing else
> +             * will dirty it, and we must make sure it doesn't get sent again.
> +             */
> +            if (!migration_bitmap_clear_dirty(dirty_ram_abs)) {
> +                trace_ram_find_and_save_block_postcopy_not_dirty(
> +                    tmpblock->idstr, (uint64_t)tmpoffset,
> +                    (uint64_t)dirty_ram_abs,
> +                    test_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap));
> +
> +                continue;
>              }
> +            /*
> +             * As soon as we start servicing pages out of order, then we have
> +             * to kill the bulk stage, since the bulk stage assumes
> +             * in (migration_bitmap_find_and_reset_dirty) that every page is
> +             * dirty, that's no longer true.
> +             */
> +            ram_bulk_stage = false;
> +            /*
> +             * We mustn't change block/offset unless it's to a valid one
> +             * otherwise we can go down some of the exit cases in the normal
> +             * path.
> +             */
> +            block = tmpblock;
> +            offset = tmpoffset;
> +            last_was_from_queue = true;

Hrm.  So now block can change during the execution of
ram_save_block(), which really suggests that splitting by block is no
longer a sensible subdivision of the loop surrounding ram_save_block.
I think it would make more sense to replace that entire surrounding
loop, so that the logic is essentially

    while (not finished) {
        if (something is queued) {
            send that
        } else {
            scan for an unsent block and offset
            send that instead
        }
    }
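
Rendered as C in the shape of the surrounding code, the proposal might
look like the sketch below. pick_queued_page() and pick_scanned_page()
are hypothetical helpers (pop a pending postcopy request; advance the
linear dirty scan and flag a completed empty round), and this is only the
suggested control flow, not the actual implementation in any version of
the series:

    static int find_and_save_block_sketch(QEMUFile *f, bool last_stage)
    {
        RAMBlock *block;
        ram_addr_t offset;
        int bytes_sent = 0;

        while (true) {
            bool out_of_pages = false;

            if (pick_queued_page(&block, &offset)) {
                ram_bulk_stage = false;  /* serving pages out of order */
            } else if (!pick_scanned_page(&block, &offset, &out_of_pages)) {
                if (out_of_pages) {
                    break;               /* complete round, nothing dirty */
                }
                continue;                /* scan moved to the next block */
            }

            bytes_sent = ram_save_page(f, block, offset, last_stage);
            if (bytes_sent > 0) {
                break;                   /* sent a page; done for this call */
            }
        }
        return bytes_sent;
    }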


>          } else {
> -            bytes_sent = ram_save_page(f, block, offset, last_stage);
> -
> -            /* if page is unmodified, continue to the next */
> -            if (bytes_sent > 0) {
> -                MigrationState *ms = migrate_get_current();
> -                if (ms->sentmap) {
> -                    set_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap);
> +            MemoryRegion *mr;
> +            /* priority queue empty, so just search for something dirty */
> +            mr = block->mr;
> +            offset = migration_bitmap_find_and_reset_dirty(mr, offset,
> +                                                           &dirty_ram_abs);
> +            if (complete_round && block == last_seen_block &&
> +                offset >= last_offset) {
> +                break;
> +            }
> +            if (offset >= block->used_length) {
> +                offset = 0;
> +                block = QTAILQ_NEXT(block, next);
> +                if (!block) {
> +                    block = QTAILQ_FIRST(&ram_list.blocks);
> +                    complete_round = true;
> +                    ram_bulk_stage = false;
>                  }
> +                continue; /* pick an offset in the new block */
> +            }
> +            last_was_from_queue = false;
> +        }
>  
> -                last_sent_block = block;
> -                break;
> +        /* We have a page to send, so send it */
> +        bytes_sent = ram_save_page(f, block, offset, last_stage);
> +
> +        /* if page is unmodified, continue to the next */
> +        if (bytes_sent > 0) {
> +            if (ms->sentmap) {
> +                set_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap);
>              }
> +
> +            last_sent_block = block;
> +            break;
>          }
>      }
>      last_seen_block = block;
> [...]
Dr. David Alan Gilbert June 16, 2015, 10:48 a.m. UTC | #2
* David Gibson (david@gibson.dropbear.id.au) wrote:
> On Wed, Feb 25, 2015 at 04:51:57PM +0000, Dr. David Alan Gilbert (git) wrote:
> > [...]
> > +        if (last_was_from_queue || !last_sent_block ||
> > +            ((last_offset & (hps - 1)) == (hps - TARGET_PAGE_SIZE))) {
> > +            tmpblock = ram_save_unqueue_page(ms, &tmpoffset, &dirty_ram_abs);
> 
> This test for whether we've completed a host page is pretty awkward.
> I think it would be cleaner to have an inner loop / helper function
> that completes sending an entire host page (whether requested or
> scanned), before allowing the outer loop to even come back to here to
> reconsider the queue.

I've reworked that in the v7 series I've just posted; please see if it's
more to your taste (I've not tested it on a machine with bigger pages
yet though).  (This patch is 32/42 in v7).

Dave

> [...]

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

Patch

diff --git a/arch_init.c b/arch_init.c
index 9d8fc6b..acf65e1 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -328,6 +328,7 @@ static RAMBlock *last_seen_block;
 /* This is the last block from where we have sent data */
 static RAMBlock *last_sent_block;
 static ram_addr_t last_offset;
+static bool last_was_from_queue;
 static unsigned long *migration_bitmap;
 static uint64_t migration_dirty_pages;
 static uint32_t last_version;
@@ -461,6 +462,19 @@ static inline bool migration_bitmap_set_dirty(ram_addr_t addr)
     return ret;
 }
 
+static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
+{
+    bool ret;
+    int nr = addr >> TARGET_PAGE_BITS;
+
+    ret = test_and_clear_bit(nr, migration_bitmap);
+
+    if (ret) {
+        migration_dirty_pages--;
+    }
+    return ret;
+}
+
 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
 {
     ram_addr_t addr;
@@ -669,6 +683,39 @@ static int ram_save_page(QEMUFile *f, RAMBlock* block, ram_addr_t offset,
 }
 
 /*
+ * Unqueue a page from the queue fed by postcopy page requests
+ *
+ * Returns:      The RAMBlock* to transmit from (or NULL if the queue is empty)
+ *      ms:      MigrationState in
+ *  offset:      the byte offset within the RAMBlock for the start of the page
+ * ram_addr_abs: global offset in the dirty/sent bitmaps
+ */
+static RAMBlock *ram_save_unqueue_page(MigrationState *ms, ram_addr_t *offset,
+                                       ram_addr_t *ram_addr_abs)
+{
+    RAMBlock *result = NULL;
+    qemu_mutex_lock(&ms->src_page_req_mutex);
+    if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
+        struct MigrationSrcPageRequest *entry =
+                                    QSIMPLEQ_FIRST(&ms->src_page_requests);
+        result = entry->rb;
+        *offset = entry->offset;
+        *ram_addr_abs = (entry->offset + entry->rb->offset) & TARGET_PAGE_MASK;
+
+        if (entry->len > TARGET_PAGE_SIZE) {
+            entry->len -= TARGET_PAGE_SIZE;
+            entry->offset += TARGET_PAGE_SIZE;
+        } else {
+            QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
+            g_free(entry);
+        }
+    }
+    qemu_mutex_unlock(&ms->src_page_req_mutex);
+
+    return result;
+}
+
+/*
  * Queue the pages for transmission, e.g. a request from postcopy destination
  *   ms: MigrationState in which the queue is held
  *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
@@ -732,46 +779,102 @@ int ram_save_queue_pages(MigrationState *ms, const char *rbname,
 
 static int ram_find_and_save_block(QEMUFile *f, bool last_stage)
 {
+    MigrationState *ms = migrate_get_current();
     RAMBlock *block = last_seen_block;
+    RAMBlock *tmpblock;
     ram_addr_t offset = last_offset;
+    ram_addr_t tmpoffset;
     bool complete_round = false;
     int bytes_sent = 0;
-    MemoryRegion *mr;
     ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
                                  ram_addr_t space */
+    unsigned long hps = sysconf(_SC_PAGESIZE);
 
-    if (!block)
+    if (!block) {
         block = QTAILQ_FIRST(&ram_list.blocks);
+        last_was_from_queue = false;
+    }
 
-    while (true) {
-        mr = block->mr;
-        offset = migration_bitmap_find_and_reset_dirty(mr, offset,
-                                                       &dirty_ram_abs);
-        if (complete_round && block == last_seen_block &&
-            offset >= last_offset) {
-            break;
+    while (true) { /* Until we send a block or run out of stuff to send */
+        tmpblock = NULL;
+
+        /*
+         * Don't break host-page chunks up with queue items
+         * so only unqueue if,
+         *   a) The last item came from the queue anyway
+         *   b) The last sent item was the last target-page in a host page
+         */
+        if (last_was_from_queue || !last_sent_block ||
+            ((last_offset & (hps - 1)) == (hps - TARGET_PAGE_SIZE))) {
+            tmpblock = ram_save_unqueue_page(ms, &tmpoffset, &dirty_ram_abs);
         }
-        if (offset >= block->used_length) {
-            offset = 0;
-            block = QTAILQ_NEXT(block, next);
-            if (!block) {
-                block = QTAILQ_FIRST(&ram_list.blocks);
-                complete_round = true;
-                ram_bulk_stage = false;
+
+        if (tmpblock) {
+            /* We've got a block from the postcopy queue */
+            trace_ram_find_and_save_block_postcopy(tmpblock->idstr,
+                                                   (uint64_t)tmpoffset,
+                                                   (uint64_t)dirty_ram_abs);
+            /*
+             * We're sending this page, and since it's postcopy nothing else
+             * will dirty it, and we must make sure it doesn't get sent again.
+             */
+            if (!migration_bitmap_clear_dirty(dirty_ram_abs)) {
+                trace_ram_find_and_save_block_postcopy_not_dirty(
+                    tmpblock->idstr, (uint64_t)tmpoffset,
+                    (uint64_t)dirty_ram_abs,
+                    test_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap));
+
+                continue;
             }
+            /*
+             * As soon as we start servicing pages out of order, then we have
+             * to kill the bulk stage, since the bulk stage assumes
+             * in (migration_bitmap_find_and_reset_dirty) that every page is
+             * dirty, that's no longer true.
+             */
+            ram_bulk_stage = false;
+            /*
+             * We mustn't change block/offset unless it's to a valid one
+             * otherwise we can go down some of the exit cases in the normal
+             * path.
+             */
+            block = tmpblock;
+            offset = tmpoffset;
+            last_was_from_queue = true;
         } else {
-            bytes_sent = ram_save_page(f, block, offset, last_stage);
-
-            /* if page is unmodified, continue to the next */
-            if (bytes_sent > 0) {
-                MigrationState *ms = migrate_get_current();
-                if (ms->sentmap) {
-                    set_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap);
+            MemoryRegion *mr;
+            /* priority queue empty, so just search for something dirty */
+            mr = block->mr;
+            offset = migration_bitmap_find_and_reset_dirty(mr, offset,
+                                                           &dirty_ram_abs);
+            if (complete_round && block == last_seen_block &&
+                offset >= last_offset) {
+                break;
+            }
+            if (offset >= block->used_length) {
+                offset = 0;
+                block = QTAILQ_NEXT(block, next);
+                if (!block) {
+                    block = QTAILQ_FIRST(&ram_list.blocks);
+                    complete_round = true;
+                    ram_bulk_stage = false;
                 }
+                continue; /* pick an offset in the new block */
+            }
+            last_was_from_queue = false;
+        }
 
-                last_sent_block = block;
-                break;
+        /* We have a page to send, so send it */
+        bytes_sent = ram_save_page(f, block, offset, last_stage);
+
+        /* if page is unmodified, continue to the next */
+        if (bytes_sent > 0) {
+            if (ms->sentmap) {
+                set_bit(dirty_ram_abs >> TARGET_PAGE_BITS, ms->sentmap);
             }
+
+            last_sent_block = block;
+            break;
         }
     }
     last_seen_block = block;
@@ -865,6 +968,7 @@ static void reset_ram_globals(void)
     last_offset = 0;
     last_version = ram_list.version;
     ram_bulk_stage = true;
+    last_was_from_queue = false;
 }
 
 #define MAX_WAIT 50 /* ms, half buffered_file limit */
diff --git a/trace-events b/trace-events
index 8a0d70d..781cf5c 100644
--- a/trace-events
+++ b/trace-events
@@ -1217,6 +1217,8 @@ qemu_file_fclose(void) ""
 migration_bitmap_sync_start(void) ""
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
 migration_throttle(void) ""
+ram_find_and_save_block_postcopy(const char *block_name, uint64_t tmp_offset, uint64_t ram_addr) "%s/%" PRIx64 " ram_addr=%" PRIx64
+ram_find_and_save_block_postcopy_not_dirty(const char *block_name, uint64_t tmp_offset, uint64_t ram_addr, int sent) "%s/%" PRIx64 " ram_addr=%" PRIx64 " (sent=%d)"
 ram_postcopy_send_discard_bitmap(void) ""
 ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: %zx len: %zx"
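
One detail of ram_save_unqueue_page() worth spelling out: a request
longer than one target page stays at the head of the queue and is
consumed one target page per call, its offset advancing and its len
shrinking until the final page, at which point the entry is dequeued and
freed. A standalone model of just that bookkeeping (the request values
are made up):

    #include <stdio.h>

    #define TARGET_PAGE_SIZE 4096

    int main(void)
    {
        /* One queued request covering three target pages. */
        unsigned long offset = 0x10000;
        unsigned long len = 3 * TARGET_PAGE_SIZE;

        while (len > 0) {
            printf("serve target page at offset 0x%lx\n", offset);
            if (len > TARGET_PAGE_SIZE) {
                len -= TARGET_PAGE_SIZE;     /* more pages left: shrink... */
                offset += TARGET_PAGE_SIZE;  /* ...and advance the entry */
            } else {
                len = 0;                     /* entry dequeued and freed */
            }
        }
        return 0;
    }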