
[v7,28/42] Postcopy: Postcopy startup in migration thread

Message ID: 1434450415-11339-29-git-send-email-dgilbert@redhat.com
State:      New

Commit Message

Dr. David Alan Gilbert June 16, 2015, 10:26 a.m. UTC
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

Rework the migration thread to set up and start postcopy.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 include/migration/migration.h |   3 +
 migration/migration.c         | 166 ++++++++++++++++++++++++++++++++++++++++--
 trace-events                  |   4 +
 3 files changed, 167 insertions(+), 6 deletions(-)

Comments

Juan Quintela July 13, 2015, 12:56 p.m. UTC | #1
"Dr. David Alan Gilbert (git)" <dgilbert@redhat.com> wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
>
> Rework the migration thread to set up and start postcopy.
>
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  include/migration/migration.h |   3 +
>  migration/migration.c         | 166 ++++++++++++++++++++++++++++++++++++++++--
>  trace-events                  |   4 +
>  3 files changed, 167 insertions(+), 6 deletions(-)
>
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index e6585c5..68a1731 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -120,6 +120,9 @@ struct MigrationState
>      /* Flag set once the migration has been asked to enter postcopy */
>      bool start_postcopy;
>  
> +    /* Flag set once the migration thread is running (and needs joining) */
> +    bool started_migration_thread;
> +

migration_thread_started?

> +
> +    /*
> +     * send rest of state - note things that are doing postcopy
> +     * will notice we're in POSTCOPY_ACTIVE and not actually
> +     * wrap their state up here
> +     */
> +    qemu_file_set_rate_limit(ms->file, INT64_MAX);

Do we undo this?  Or are we sure that it is ok to maximize network
output?

> +    /* Ping just for debugging, helps line traces up */
> +    qemu_savevm_send_ping(ms->file, 2);

Change the values 1, 2, 3 to constants?

> +     * We need to leave the fd free for page transfers during the
> +     * loading of the device state, so wrap all the remaining
> +     * commands and state into a package that gets sent in one go
> +     */
> +    QEMUFile *fb = qemu_bufopen("w", NULL);
> +    if (!fb) {
> +        error_report("Failed to create buffered file");
> +        goto fail;
> +    }
> +
> +    qemu_savevm_state_complete_precopy(fb);
> +    qemu_savevm_send_ping(fb, 3);
> +
> +    qemu_savevm_send_postcopy_run(fb);
> +
> +    /* <><> end of stuff going into the package */
> +    qsb = qemu_buf_get(fb);
> +
> +    /* Now send that blob */
> +    if (qemu_savevm_send_packaged(ms->file, qsb)) {
> +        goto fail_closefb;
> +    }
> +    qemu_fclose(fb);

Why can't we send this directly without the extra copy?
I guess that there are some missing/extra section starts/end whatever?
Anything specific?

> +    ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;

Now that we are here, is there a counter of the time that the
postcopy stage takes?  Just curious.

> +/*
>   * Master migration thread on the source VM.
>   * It drives the migration and pumps the data down the outgoing channel.
>   */
>  static void *migration_thread(void *opaque)
>  {
>      MigrationState *s = opaque;
> +    /* Used by the bandwidth calcs, updated later */
>      int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>      int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>      int64_t initial_bytes = 0;
>      int64_t max_size = 0;
>      int64_t start_time = initial_time;
>      bool old_vm_running = false;
> +    bool entered_postcopy = false;
> +    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> +    enum MigrationStatus current_active_type = MIGRATION_STATUS_ACTIVE;

current_active_state?
Dr. David Alan Gilbert July 13, 2015, 5:56 p.m. UTC | #2
* Juan Quintela (quintela@redhat.com) wrote:
> "Dr. David Alan Gilbert (git)" <dgilbert@redhat.com> wrote:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> >
> > Rework the migration thread to set up and start postcopy.
> >
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  include/migration/migration.h |   3 +
> >  migration/migration.c         | 166 ++++++++++++++++++++++++++++++++++++++++--
> >  trace-events                  |   4 +
> >  3 files changed, 167 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > index e6585c5..68a1731 100644
> > --- a/include/migration/migration.h
> > +++ b/include/migration/migration.h
> > @@ -120,6 +120,9 @@ struct MigrationState
> >      /* Flag set once the migration has been asked to enter postcopy */
> >      bool start_postcopy;
> >  
> > +    /* Flag set once the migration thread is running (and needs joining) */
> > +    bool started_migration_thread;
> > +
> 
> migration_thread_started?

Changed.

> > +
> > +    /*
> > +     * send rest of state - note things that are doing postcopy
> > +     * will notice we're in POSTCOPY_ACTIVE and not actually
> > +     * wrap their state up here
> > +     */
> > +    qemu_file_set_rate_limit(ms->file, INT64_MAX);
> 
> Do we undo this?  Or are we sure that it is ok to maximize network
> output?

No, we don't undo it; it's a good question what we could do better.
I'm trying to avoid delaying the postcopy-requested pages; ideally
I'd like to separate those out so they get satisfied quickly while
the background transfer still meets the bandwidth limit.
The ideal would be separate fds; something else I've considered
is having incoming postcopy requests wake the outgoing side
up when it's sleeping for the bandwidth limit, although I've
not tried implementing that yet.
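
Very roughly, the wake-up idea might look like this (untested;
'rate_limit_wake' is a hypothetical field on MigrationState, though
qemu_sem_post()/qemu_sem_timedwait() are the existing QemuSemaphore
helpers):

    /* Hypothetical semaphore, initialised during migration setup: */
    qemu_sem_init(&ms->rate_limit_wake, 0);

    /* Return-path thread, when a page request arrives from the
     * destination:
     */
    qemu_sem_post(&ms->rate_limit_wake);

    /* migration_thread(), in place of the blind sleep when
     * qemu_file_rate_limit() says we're over budget: wait up to
     * 100ms, but wake early if the destination asks for a page.
     */
    qemu_sem_timedwait(&ms->rate_limit_wake, 100);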

> > +    /* Ping just for debugging, helps line traces up */
> > +    qemu_savevm_send_ping(ms->file, 2);
> 
> Change the values 1, 2, 3 to constants?

Suggestions for names? They're purely for debugging so you can
match things up on the destination.
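
If we did name them, maybe something like this (placeholder names,
purely illustrative - the values just need to stay distinct and line
up in the source/destination traces):

    /* Debug pings sent at fixed points in the postcopy startup
     * sequence, so source and destination traces can be lined up.
     */
    #define POSTCOPY_PING_AFTER_RP_OPEN   1 /* after opening the return path */
    #define POSTCOPY_PING_BEFORE_PACKAGE  2 /* before the device-state package */
    #define POSTCOPY_PING_IN_PACKAGE      3 /* inside the packaged state */
    #define POSTCOPY_PING_AFTER_PACKAGE   4 /* after the package is sent */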

> > +     * We need to leave the fd free for page transfers during the
> > +     * loading of the device state, so wrap all the remaining
> > +     * commands and state into a package that gets sent in one go
> > +     */
> > +    QEMUFile *fb = qemu_bufopen("w", NULL);
> > +    if (!fb) {
> > +        error_report("Failed to create buffered file");
> > +        goto fail;
> > +    }
> > +
> > +    qemu_savevm_state_complete_precopy(fb);
> > +    qemu_savevm_send_ping(fb, 3);
> > +
> > +    qemu_savevm_send_postcopy_run(fb);
> > +
> > +    /* <><> end of stuff going into the package */
> > +    qsb = qemu_buf_get(fb);
> > +
> > +    /* Now send that blob */
> > +    if (qemu_savevm_send_packaged(ms->file, qsb)) {
> > +        goto fail_closefb;
> > +    }
> > +    qemu_fclose(fb);
> 
> Why can't we send this directly without the extra copy?
> I guess that there are some missing/extra section starts/end whatever?
> Anything specific?

The problem is that the destination has to be able to read the chunk
of migration stream off the fd and leave the fd free for page requests
that may be required while loading the device state.
Since the migration stream is unstructured, there is no way to read
a chunk off without knowing its length, and the only way to know
that length is to write the chunk to a buffer first and see how big
it is.
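
In other words, the package on the wire is just length-then-blob.  A
simplified sketch of the source side (not the exact
qemu_savevm_send_packaged() code, which also goes through the
migration command machinery):

    /* Send the size of the buffered device state, then the blob
     * itself, so the destination knows exactly how much to pull off
     * the fd before the fd goes back to page traffic.
     */
    uint32_t len = qsb_get_length(qsb); /* size of the package */
    qemu_put_be32(ms->file, len);       /* length prefix ... */
    /* ... followed by the len bytes of packaged state */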

> > +    ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
> 
> Now that we are here, is there a counter of the time that the
> postcopy stage takes?  Just curious.

No, not separate.

> > +/*
> >   * Master migration thread on the source VM.
> >   * It drives the migration and pumps the data down the outgoing channel.
> >   */
> >  static void *migration_thread(void *opaque)
> >  {
> >      MigrationState *s = opaque;
> > +    /* Used by the bandwidth calcs, updated later */
> >      int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> >      int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
> >      int64_t initial_bytes = 0;
> >      int64_t max_size = 0;
> >      int64_t start_time = initial_time;
> >      bool old_vm_running = false;
> > +    bool entered_postcopy = false;
> > +    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> > +    enum MigrationStatus current_active_type = MIGRATION_STATUS_ACTIVE;
> 
> current_active_state?

Changed.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Juan Quintela July 13, 2015, 6:09 p.m. UTC | #3
"Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:

>> > +
>> > +    /*
>> > +     * send rest of state - note things that are doing postcopy
>> > +     * will notice we're in POSTCOPY_ACTIVE and not actually
>> > +     * wrap their state up here
>> > +     */
>> > +    qemu_file_set_rate_limit(ms->file, INT64_MAX);
>> 
>> Do we undo this?  Or are we sure that it is ok to maximize network
>> output?
>
> No, we don't undo it; it's a good question what we could do better.
> I'm trying to avoid delaying the postcopy-requested pages; ideally
> I'd like to separate those out so they get satisfied quickly while
> the background transfer still meets the bandwidth limit.
> The ideal would be separate fds; something else I've considered
> is having incoming postcopy requests wake the outgoing side
> up when it's sleeping for the bandwidth limit, although I've
> not tried implementing that yet.

I see.

>
>> > +    /* Ping just for debugging, helps line traces up */
>> > +    qemu_savevm_send_ping(ms->file, 2);
>> 
>> Change the values 1, 2, 3 to constants?
>
> Suggestions for names? They're purely for debugging so you can
> match things up on the destination.
>
>> > +     * We need to leave the fd free for page transfers during the
>> > +     * loading of the device state, so wrap all the remaining
>> > +     * commands and state into a package that gets sent in one go
>> > +     */
>> > +    QEMUFile *fb = qemu_bufopen("w", NULL);
>> > +    if (!fb) {
>> > +        error_report("Failed to create buffered file");
>> > +        goto fail;
>> > +    }
>> > +
>> > +    qemu_savevm_state_complete_precopy(fb);
>> > +    qemu_savevm_send_ping(fb, 3);
>> > +
>> > +    qemu_savevm_send_postcopy_run(fb);
>> > +
>> > +    /* <><> end of stuff going into the package */
>> > +    qsb = qemu_buf_get(fb);
>> > +
>> > +    /* Now send that blob */
>> > +    if (qemu_savevm_send_packaged(ms->file, qsb)) {
>> > +        goto fail_closefb;
>> > +    }
>> > +    qemu_fclose(fb);
>> 
>> Why can't we send this directly without the extra copy?
>> I guess that there are some missing/extra section starts/end whatever?
>> Anything specific?
>
> The problem is that the destination has to be able to read the chunk
> of migration stream off the fd and leave the fd free for page requests
> that may be required while loading the device state.
> Since the migration stream is unstructured, there is no way to read
> a chunk off without knowing its length, and the only way to know
> that length is to write the chunk to a buffer first and see how big
> it is.

Arghhh.  ok.  Comment?

>
>> > +    ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
>> 
>> Now that we are here, is there a counter of the time that the
>> postcopy stage takes?  Just curious.
>
> No, not separate.
>
>> > +/*
>> >   * Master migration thread on the source VM.
>> >   * It drives the migration and pumps the data down the outgoing channel.
>> >   */
>> >  static void *migration_thread(void *opaque)
>> >  {
>> >      MigrationState *s = opaque;
>> > +    /* Used by the bandwidth calcs, updated later */
>> >      int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>> >      int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>> >      int64_t initial_bytes = 0;
>> >      int64_t max_size = 0;
>> >      int64_t start_time = initial_time;
>> >      bool old_vm_running = false;
>> > +    bool entered_postcopy = false;
>> > +    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
>> > +    enum MigrationStatus current_active_type = MIGRATION_STATUS_ACTIVE;
>> 
>> current_active_state?
>
> Changed.
>
> Dave
>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Amit Shah July 23, 2015, 5:53 a.m. UTC | #4
On (Mon) 13 Jul 2015 [18:56:55], Dr. David Alan Gilbert wrote:
> * Juan Quintela (quintela@redhat.com) wrote:

> > > +    /*
> > > +     * send rest of state - note things that are doing postcopy
> > > +     * will notice we're in POSTCOPY_ACTIVE and not actually
> > > +     * wrap their state up here
> > > +     */
> > > +    qemu_file_set_rate_limit(ms->file, INT64_MAX);
> > 
> > Do we undo this?  Or are we sure that it is ok to maximize network
> > output?
>
> No, we don't undo it; it's a good question what we could do better.
> I'm trying to avoid delaying the postcopy-requested pages; ideally
> I'd like to separate those out so they get satisfied quickly while
> the background transfer still meets the bandwidth limit.
> The ideal would be separate fds; something else I've considered
> is having incoming postcopy requests wake the outgoing side
> up when it's sleeping for the bandwidth limit, although I've
> not tried implementing that yet.

There might be a conflict between the knobs we expose (max_bandwidth)
and us then not adhering to them.

I agree we want this to go full-throttle, so maybe document that
postcopy will override that knob?  It's tricky to get everyone to
understand that postcopy will do that.  Plus there'll be other
questions like "what else does postcopy override?" -- not that there's
anything more, but users will wonder.

		Amit
Amit Shah July 23, 2015, 5:55 a.m. UTC | #5
On (Tue) 16 Jun 2015 [11:26:41], Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> 
> Rework the migration thread to set up and start postcopy.
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

Reviewed-by: Amit Shah <amit.shah@redhat.com>

		Amit
Dr. David Alan Gilbert Sept. 23, 2015, 5:56 p.m. UTC | #6
* Juan Quintela (quintela@redhat.com) wrote:
> "Dr. David Alan Gilbert" <dgilbert@redhat.com> wrote:

> >> > +     * We need to leave the fd free for page transfers during the
> >> > +     * loading of the device state, so wrap all the remaining
> >> > +     * commands and state into a package that gets sent in one go
> >> > +     */
> >> > +    QEMUFile *fb = qemu_bufopen("w", NULL);
> >> > +    if (!fb) {
> >> > +        error_report("Failed to create buffered file");
> >> > +        goto fail;
> >> > +    }
> >> > +
> >> > +    qemu_savevm_state_complete_precopy(fb);
> >> > +    qemu_savevm_send_ping(fb, 3);
> >> > +
> >> > +    qemu_savevm_send_postcopy_run(fb);
> >> > +
> >> > +    /* <><> end of stuff going into the package */
> >> > +    qsb = qemu_buf_get(fb);
> >> > +
> >> > +    /* Now send that blob */
> >> > +    if (qemu_savevm_send_packaged(ms->file, qsb)) {
> >> > +        goto fail_closefb;
> >> > +    }
> >> > +    qemu_fclose(fb);
> >> 
> >> Why can't we send this directly without the extra copy?
> >> I guess that there are some missing/extra section starts/end whatever?
> >> Anything specific?
> >
> > The problem is that the destination has to be able to read the chunk
> > of migration stream off the fd and leave the fd free for page requests
> > that may be required while loading the device state.
> > Since the migration stream is unstructured, there is no way to read
> > a chunk off without knowing its length, and the only way to know
> > that length is to write the chunk to a buffer first and see how big
> > it is.
> 
> Arghhh.  ok.  Comment?

I've changed the comment at the start of that section to:

     * While loading the device state we may trigger page transfer
     * requests and the fd must be free to process those, and thus
     * the destination must read the whole device state off the fd before
     * it starts processing it.  Unfortunately the ad-hoc migration format
     * doesn't allow the destination to know the size to read without fully
     * parsing it through each device's load-state code (especially the open
     * coded devices that use get/put).
     * So we wrap the device state up in a package with a length at the start;
     * to do this we use a qemu_buf to hold the whole of the device state.
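
The destination side is then, roughly (again a sketch of the idea
rather than the exact loadvm code, with mis->file as the incoming
migration stream):

    /* Read the length prefix, then exactly that many bytes, leaving
     * the fd free for page requests while the buffered device state
     * is loaded.
     */
    uint32_t length = qemu_get_be32(mis->file);
    uint8_t *buffer = g_malloc(length);
    qemu_get_buffer(mis->file, buffer, length);
    /* ... load the device state from 'buffer', then g_free(buffer) */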

Dave

> 
> >
> >> > +    ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
> >> 
> >> Now that we are here, is there a counter of the time that the
> >> postcopy stage takes?  Just curious.
> >
> > No, not separate.
> >
> >> > +/*
> >> >   * Master migration thread on the source VM.
> >> >   * It drives the migration and pumps the data down the outgoing channel.
> >> >   */
> >> >  static void *migration_thread(void *opaque)
> >> >  {
> >> >      MigrationState *s = opaque;
> >> > +    /* Used by the bandwidth calcs, updated later */
> >> >      int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> >> >      int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
> >> >      int64_t initial_bytes = 0;
> >> >      int64_t max_size = 0;
> >> >      int64_t start_time = initial_time;
> >> >      bool old_vm_running = false;
> >> > +    bool entered_postcopy = false;
> >> > +    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> >> > +    enum MigrationStatus current_active_type = MIGRATION_STATUS_ACTIVE;
> >> 
> >> current_active_state?
> >
> > Changed.
> >
> > Dave
> >
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK

Patch

diff --git a/include/migration/migration.h b/include/migration/migration.h
index e6585c5..68a1731 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -120,6 +120,9 @@  struct MigrationState
     /* Flag set once the migration has been asked to enter postcopy */
     bool start_postcopy;
 
+    /* Flag set once the migration thread is running (and needs joining) */
+    bool started_migration_thread;
+
     /* bitmap of pages that have been sent at least once
      * only maintained and used in postcopy at the moment
      * where it's used to send the dirtymap at the start
diff --git a/migration/migration.c b/migration/migration.c
index 180e8b9..8d15f33 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -557,7 +557,10 @@  static void migrate_fd_cleanup(void *opaque)
     if (s->file) {
         trace_migrate_fd_cleanup();
         qemu_mutex_unlock_iothread();
-        qemu_thread_join(&s->thread);
+        if (s->started_migration_thread) {
+            qemu_thread_join(&s->thread);
+            s->started_migration_thread = false;
+        }
         qemu_mutex_lock_iothread();
 
         migrate_compress_threads_join();
@@ -1021,7 +1024,6 @@  out:
     return NULL;
 }
 
-__attribute__ (( unused )) /* Until later in patch series */
 static int open_return_path_on_source(MigrationState *ms)
 {
 
@@ -1060,23 +1062,141 @@  static int await_return_path_close_on_source(MigrationState *ms)
 }
 
 /*
+ * Switch from normal iteration to postcopy
+ * Returns non-0 on error
+ */
+static int postcopy_start(MigrationState *ms, bool *old_vm_running)
+{
+    int ret;
+    const QEMUSizedBuffer *qsb;
+    int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    migrate_set_state(ms, MIGRATION_STATUS_ACTIVE,
+                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+    trace_postcopy_start();
+    qemu_mutex_lock_iothread();
+    trace_postcopy_start_set_run();
+
+    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+    *old_vm_running = runstate_is_running();
+
+    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /*
+     * in Finish migrate and with the io-lock held everything should
+     * be quiet, but we've potentially still got dirty pages and we
+     * need to tell the destination to throw any pages it's already received
+     * that are dirty
+     */
+    if (ram_postcopy_send_discard_bitmap(ms)) {
+        error_report("postcopy send discard bitmap failed");
+        goto fail;
+    }
+
+    /*
+     * send rest of state - note things that are doing postcopy
+     * will notice we're in POSTCOPY_ACTIVE and not actually
+     * wrap their state up here
+     */
+    qemu_file_set_rate_limit(ms->file, INT64_MAX);
+    /* Ping just for debugging, helps line traces up */
+    qemu_savevm_send_ping(ms->file, 2);
+
+    /*
+     * We need to leave the fd free for page transfers during the
+     * loading of the device state, so wrap all the remaining
+     * commands and state into a package that gets sent in one go
+     */
+    QEMUFile *fb = qemu_bufopen("w", NULL);
+    if (!fb) {
+        error_report("Failed to create buffered file");
+        goto fail;
+    }
+
+    qemu_savevm_state_complete_precopy(fb);
+    qemu_savevm_send_ping(fb, 3);
+
+    qemu_savevm_send_postcopy_run(fb);
+
+    /* <><> end of stuff going into the package */
+    qsb = qemu_buf_get(fb);
+
+    /* Now send that blob */
+    if (qemu_savevm_send_packaged(ms->file, qsb)) {
+        goto fail_closefb;
+    }
+    qemu_fclose(fb);
+    ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
+
+    qemu_mutex_unlock_iothread();
+
+    /*
+     * Although this ping is just for debug, it could potentially be
+     * used for getting a better measurement of downtime at the source.
+     */
+    qemu_savevm_send_ping(ms->file, 4);
+
+    ret = qemu_file_get_error(ms->file);
+    if (ret) {
+        error_report("postcopy_start: Migration stream errored");
+        migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                              MIGRATION_STATUS_FAILED);
+    }
+
+    return ret;
+
+fail_closefb:
+    qemu_fclose(fb);
+fail:
+    migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                          MIGRATION_STATUS_FAILED);
+    qemu_mutex_unlock_iothread();
+    return -1;
+}
+
+/*
  * Master migration thread on the source VM.
  * It drives the migration and pumps the data down the outgoing channel.
  */
 static void *migration_thread(void *opaque)
 {
     MigrationState *s = opaque;
+    /* Used by the bandwidth calcs, updated later */
     int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     int64_t initial_bytes = 0;
     int64_t max_size = 0;
     int64_t start_time = initial_time;
     bool old_vm_running = false;
+    bool entered_postcopy = false;
+    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
+    enum MigrationStatus current_active_type = MIGRATION_STATUS_ACTIVE;
 
     qemu_savevm_state_header(s->file);
+
+    if (migrate_postcopy_ram()) {
+        /* Now tell the dest that it should open its end so it can reply */
+        qemu_savevm_send_open_return_path(s->file);
+
+        /* And do a ping that will make stuff easier to debug */
+        qemu_savevm_send_ping(s->file, 1);
+
+        /*
+         * Tell the destination that we *might* want to do postcopy later;
+         * if the other end can't do postcopy it should fail now, nice and
+         * early.
+         */
+        qemu_savevm_send_postcopy_advise(s->file);
+    }
+
     qemu_savevm_state_begin(s->file, &s->params);
 
     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
+    current_active_type = MIGRATION_STATUS_ACTIVE;
     migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_ACTIVE);
 
     trace_migration_thread_setup_complete();
@@ -1095,6 +1215,22 @@  static void *migration_thread(void *opaque)
             trace_migrate_pending(pending_size, max_size,
                                   pend_post, pend_nonpost);
             if (pending_size && pending_size >= max_size) {
+                /* Still a significant amount to transfer */
+
+                current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+                if (migrate_postcopy_ram() &&
+                    s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE &&
+                    pend_nonpost <= max_size &&
+                    atomic_read(&s->start_postcopy)) {
+
+                    if (!postcopy_start(s, &old_vm_running)) {
+                        current_active_type = MIGRATION_STATUS_POSTCOPY_ACTIVE;
+                        entered_postcopy = true;
+                    }
+
+                    continue;
+                }
+                /* Just another iteration step */
                 qemu_savevm_state_iterate(s->file);
             } else {
                 int ret;
@@ -1126,8 +1262,8 @@  static void *migration_thread(void *opaque)
         }
 
         if (qemu_file_get_error(s->file)) {
-            migrate_set_state(s, MIGRATION_STATUS_ACTIVE,
-                              MIGRATION_STATUS_FAILED);
+            migrate_set_state(s, current_active_type, MIGRATION_STATUS_FAILED);
+            trace_migration_thread_file_err();
             break;
         }
         current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -1158,19 +1294,22 @@  static void *migration_thread(void *opaque)
         }
     }
 
+    trace_migration_thread_after_loop();
     qemu_mutex_lock_iothread();
     if (s->state == MIGRATION_STATUS_COMPLETED) {
         int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
         uint64_t transferred_bytes = qemu_ftell(s->file);
         s->total_time = end_time - s->total_time;
-        s->downtime = end_time - start_time;
+        if (!entered_postcopy) {
+            s->downtime = end_time - start_time;
+        }
         if (s->total_time) {
             s->mbps = (((double) transferred_bytes * 8.0) /
                        ((double) s->total_time)) / 1000;
         }
         runstate_set(RUN_STATE_POSTMIGRATE);
     } else {
-        if (old_vm_running) {
+        if (old_vm_running && !entered_postcopy) {
             vm_start();
         }
     }
@@ -1192,9 +1331,24 @@  void migrate_fd_connect(MigrationState *s)
     /* Notify before starting migration thread */
     notifier_list_notify(&migration_state_notifiers, s);
 
+    /*
+     * Open the return path; currently for postcopy but other things might
+     * also want it.
+     */
+    if (migrate_postcopy_ram()) {
+        if (open_return_path_on_source(s)) {
+            error_report("Unable to open return-path for postcopy");
+            migrate_set_state(s, MIGRATION_STATUS_SETUP,
+                              MIGRATION_STATUS_FAILED);
+            migrate_fd_cleanup(s);
+            return;
+        }
+    }
+
     migrate_compress_threads_create();
     qemu_thread_create(&s->thread, "migration", migration_thread, s,
                        QEMU_THREAD_JOINABLE);
+    s->started_migration_thread = true;
 }
 
 PostcopyState  postcopy_state_get(MigrationIncomingState *mis)
diff --git a/trace-events b/trace-events
index 2ffc1c6..f096877 100644
--- a/trace-events
+++ b/trace-events
@@ -1422,9 +1422,13 @@  migrate_fd_error(void) ""
 migrate_fd_cancel(void) ""
 migrate_pending(uint64_t size, uint64_t max, uint64_t post, uint64_t nonpost) "pending size %" PRIu64 " max %" PRIu64 " (post=%" PRIu64 " nonpost=%" PRIu64 ")"
 migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
+migration_thread_after_loop(void) ""
+migration_thread_file_err(void) ""
 migration_thread_setup_complete(void) ""
 open_return_path_on_source(void) ""
 open_return_path_on_source_continue(void) ""
+postcopy_start(void) ""
+postcopy_start_set_run(void) ""
 source_return_path_thread_bad_end(void) ""
 source_return_path_thread_end(void) ""
 source_return_path_thread_entry(void) ""