diff mbox

[v4,33/47] Postcopy: Postcopy startup in migration thread

Message ID 1412358473-31398-34-git-send-email-dgilbert@redhat.com
State New
Headers show

Commit Message

Dr. David Alan Gilbert Oct. 3, 2014, 5:47 p.m. UTC
From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>

Rework the migration thread to setup and start postcopy.

Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 include/migration/migration.h |   3 +
 migration.c                   | 201 ++++++++++++++++++++++++++++++++++++++----
 2 files changed, 185 insertions(+), 19 deletions(-)

Comments

Paolo Bonzini Oct. 4, 2014, 4:27 p.m. UTC | #1
Il 03/10/2014 19:47, Dr. David Alan Gilbert (git) ha scritto:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> 
> Rework the migration thread to setup and start postcopy.
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  include/migration/migration.h |   3 +
>  migration.c                   | 201 ++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 185 insertions(+), 19 deletions(-)
> 
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index b01cc17..f401775 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -125,6 +125,9 @@ struct MigrationState
>      /* Flag set once the migration has been asked to enter postcopy */
>      volatile bool start_postcopy;
>  
> +    /* Flag set once the migration thread is running (and needs joining) */
> +    volatile bool started_migration_thread;

volatile almost never does what you think it does. :)

In this case, I think only one thread reads/writes the variable so
"volatile" is unnecessary.

Otherwise, you would need to add actual memory barriers, atomic
operations, or synchronization primitives.

For start_postcopy, it is okay because it is just a hint to the compiler
and the processor will eventually see the assignment.  For this case
QEMU has atomic_read/atomic_set (corresponding to __ATOMIC_RELAXED in
C/C++1x), so you could use those as well.

>      /* bitmap of pages that have been sent at least once
>       * only maintained and used in postcopy at the moment
>       * where it's used to send the dirtymap at the start
> diff --git a/migration.c b/migration.c
> index 63d70b6..1731017 100644
> --- a/migration.c
> +++ b/migration.c
> @@ -475,7 +475,10 @@ static void migrate_fd_cleanup(void *opaque)
>      if (s->file) {
>          trace_migrate_fd_cleanup();
>          qemu_mutex_unlock_iothread();
> -        qemu_thread_join(&s->thread);
> +        if (s->started_migration_thread) {
> +            qemu_thread_join(&s->thread);
> +            s->started_migration_thread = false;
> +        }
>          qemu_mutex_lock_iothread();
>  
>          qemu_fclose(s->file);
> @@ -872,7 +875,6 @@ out:
>      return NULL;
>  }
>  
> -__attribute__ (( unused )) /* Until later in patch series */
>  static int open_outgoing_return_path(MigrationState *ms)
>  {
>  
> @@ -890,7 +892,6 @@ static int open_outgoing_return_path(MigrationState *ms)
>      return 0;
>  }
>  
> -__attribute__ (( unused )) /* Until later in patch series */
>  static void await_outgoing_return_path_close(MigrationState *ms)
>  {
>      /*
> @@ -908,6 +909,97 @@ static void await_outgoing_return_path_close(MigrationState *ms)
>      DPRINTF("%s: Exit", __func__);
>  }
>  
> +/* Switch from normal iteration to postcopy
> + * Returns non-0 on error
> + */
> +static int postcopy_start(MigrationState *ms)
> +{
> +    int ret;
> +    const QEMUSizedBuffer *qsb;
> +    migrate_set_state(ms, MIG_STATE_ACTIVE, MIG_STATE_POSTCOPY_ACTIVE);
> +
> +    DPRINTF("postcopy_start\n");
> +    qemu_mutex_lock_iothread();
> +    DPRINTF("postcopy_start: setting run state\n");
> +    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> +
> +    if (ret < 0) {
> +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> +        qemu_mutex_unlock_iothread();
> +        return -1;

Please use "goto" for error returns, like

fail_locked:
    qemu_mutex_unlock_iothread();
fail:
    migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
    return -1;

> +    }
> +
> +    /*
> +     * in Finish migrate and with the io-lock held everything should
> +     * be quiet, but we've potentially still got dirty pages and we
> +     * need to tell the destination to throw any pages it's already received
> +     * that are dirty
> +     */
> +    if (ram_postcopy_send_discard_bitmap(ms)) {
> +        DPRINTF("postcopy send discard bitmap failed\n");
> +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> +        qemu_mutex_unlock_iothread();
> +        return -1;
> +    }
> +
> +    DPRINTF("postcopy_start: sending req 2\n");
> +    qemu_savevm_send_reqack(ms->file, 2);

Perhaps move it below qemu_file_set_rate_limit, and add
trace_qemu_savevm_send_reqack?

Also what is 2/3/4?  Is this just for debugging or is it part of the
protocol?

> +    /*
> +     * send rest of state - note things that are doing postcopy
> +     * will notice we're in MIG_STATE_POSTCOPY_ACTIVE and not actually
> +     * wrap their state up here
> +     */
> +    qemu_file_set_rate_limit(ms->file, INT64_MAX);
> +    DPRINTF("postcopy_start: do state_complete\n");
> +
> +    /*
> +     * We need to leave the fd free for page transfers during the
> +     * loading of the device state, so wrap all the remaining
> +     * commands and state into a package that gets sent in one go
> +     */

The comments in the code are very nice.  Thanks.  This is a huge
improvement from the last version I received.

> +    QEMUFile *fb = qemu_bufopen("w", NULL);
> +    if (!fb) {
> +        error_report("Failed to create buffered file");
> +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> +        qemu_mutex_unlock_iothread();
> +        return -1;
> +    }
> +
> +    qemu_savevm_state_complete(fb);
> +    DPRINTF("postcopy_start: sending req 3\n");
> +    qemu_savevm_send_reqack(fb, 3);
> +
> +    qemu_savevm_send_postcopy_ram_run(fb);
> +
> +    /* <><> end of stuff going into the package */
> +    qsb = qemu_buf_get(fb);
> +
> +    /* Now send that blob */
> +    if (qsb_get_length(qsb) > MAX_VM_CMD_PACKAGED_SIZE) {
> +        DPRINTF("postcopy_start: Unreasonably large packaged state: %lu\n",
> +                (unsigned long)(qsb_get_length(qsb)));
> +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> +        qemu_mutex_unlock_iothread();
> +        qemu_fclose(fb);

Close fb above migrate_set_state, and use goto as above.  Or just have
three labels.

> +        return -1;
> +    }
> +    qemu_savevm_send_packaged(ms->file, qsb);
> +    qemu_fclose(fb);
> +
> +    qemu_mutex_unlock_iothread();
> +
> +    DPRINTF("postcopy_start not finished sending ack\n");
> +    qemu_savevm_send_reqack(ms->file, 4);
> +
> +    ret = qemu_file_get_error(ms->file);
> +    if (ret) {
> +        error_report("postcopy_start: Migration stream errored");

This should have been reported already.

> +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> +    }
> +
> +    return ret;
> +}
> +
>  /*
>   * Master migration thread on the source VM.
>   * It drives the migration and pumps the data down the outgoing channel.
> @@ -915,16 +1007,36 @@ static void await_outgoing_return_path_close(MigrationState *ms)
>  static void *migration_thread(void *opaque)
>  {
>      MigrationState *s = opaque;
> +    /* Used by the bandwidth calcs, updated later */
>      int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>      int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>      int64_t initial_bytes = 0;
>      int64_t max_size = 0;
>      int64_t start_time = initial_time;
> +
>      bool old_vm_running = false;
>  
> +    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> +    enum MigrationPhase current_active_type = MIG_STATE_ACTIVE;
> +
>      qemu_savevm_state_begin(s->file, &s->params);
>  
> +    if (migrate_postcopy_ram()) {
> +        /* Now tell the dest that it should open it's end so it can reply */
> +        qemu_savevm_send_openrp(s->file);
> +
> +        /* And ask it to send an ack that will make stuff easier to debug */
> +        qemu_savevm_send_reqack(s->file, 1);
> +
> +        /* Tell the destination that we *might* want to do postcopy later;
> +         * if the other end can't do postcopy it should fail now, nice and
> +         * early.
> +         */
> +        qemu_savevm_send_postcopy_ram_advise(s->file);
> +    }

Should this be done here or in the save_state_begin function for RAM?
In general, I'm curious if there are parts of postcopy_start that
could/should be changed into new save state functions (with
postcopy_start just iterating on all devices).

>      s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
> +    current_active_type = MIG_STATE_ACTIVE;
>      migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);
>  
>      DPRINTF("setup complete\n");
> @@ -945,37 +1057,74 @@ static void *migration_thread(void *opaque)
>                      " nonpost=%" PRIu64 ")\n",
>                      pending_size, max_size, pend_post, pend_nonpost);
>              if (pending_size && pending_size >= max_size) {
> +                /* Still a significant amount to transfer */
> +
> +                current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +                if (migrate_postcopy_ram() &&
> +                    s->state != MIG_STATE_POSTCOPY_ACTIVE &&
> +                    pend_nonpost == 0 && s->start_postcopy) {
> +
> +                    if (!postcopy_start(s)) {
> +                        current_active_type = MIG_STATE_POSTCOPY_ACTIVE;
> +                    }
> +
> +                    continue;
> +                }
> +                /* Just another iteration step */
>                  qemu_savevm_state_iterate(s->file);
>              } else {
>                  int ret;
>  
> -                DPRINTF("done iterating\n");
> -                qemu_mutex_lock_iothread();
> -                start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> -                qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
> -                old_vm_running = runstate_is_running();
> +                DPRINTF("done iterating pending size %" PRIu64 "\n",
> +                        pending_size);
> +
> +                if (s->state == MIG_STATE_ACTIVE) {
> +                    qemu_mutex_lock_iothread();
> +                    start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +                    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
> +                    old_vm_running = runstate_is_running();
> +
> +                    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> +                    if (ret >= 0) {
> +                        qemu_file_set_rate_limit(s->file, INT64_MAX);
> +                        qemu_savevm_state_complete(s->file);
> +                    }
> +                    qemu_mutex_unlock_iothread();
> +
> +                    if (ret < 0) {
> +                        migrate_set_state(s, current_active_type,
> +                                          MIG_STATE_ERROR);
> +                        break;
> +                    }
> +                } else if (s->state == MIG_STATE_POSTCOPY_ACTIVE) {
> +                    DPRINTF("postcopy end\n");
> +
> +                    qemu_savevm_state_postcopy_complete(s->file);
> +                    DPRINTF("postcopy end after complete\n");
>  
> -                ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> -                if (ret >= 0) {
> -                    qemu_file_set_rate_limit(s->file, INT64_MAX);
> -                    qemu_savevm_state_complete(s->file);
>                  }
> -                qemu_mutex_unlock_iothread();
>  
> -                if (ret < 0) {
> -                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
> -                    break;
> +                /*
> +                 * If rp was opened we must clean up the thread before
> +                 * cleaning everything else up.
> +                 * Postcopy opens rp if enabled (even if it's not avtivated)
> +                 */
> +                if (migrate_postcopy_ram()) {
> +                    DPRINTF("before rp close");
> +                    await_outgoing_return_path_close(s);

Should this be done even if there is an error?  Perhaps move it
altogether out of the big migration thread while() loop?

> +                    DPRINTF("after rp close");
>                  }
> -
>                  if (!qemu_file_get_error(s->file)) {
> -                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COMPLETED);
> +                    migrate_set_state(s, current_active_type,
> +                                      MIG_STATE_COMPLETED);
>                      break;
>                  }

This "else" is huge, can you extract it into its own function?

>              }
>          }
>  
>          if (qemu_file_get_error(s->file)) {
> -            migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
> +            migrate_set_state(s, current_active_type, MIG_STATE_ERROR);
> +            DPRINTF("migration_thread: file is in error state\n");
>              break;
>          }
>          current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> @@ -1006,6 +1155,7 @@ static void *migration_thread(void *opaque)
>          }
>      }
>  
> +    DPRINTF("migration_thread: After loop");
>      qemu_mutex_lock_iothread();
>      if (s->state == MIG_STATE_COMPLETED) {
>          int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> @@ -1043,6 +1193,19 @@ void migrate_fd_connect(MigrationState *s)
>      /* Notify before starting migration thread */
>      notifier_list_notify(&migration_state_notifiers, s);
>  
> +    /* Open the return path; currently for postcopy but other things might
> +     * also want it.
> +     */
> +    if (migrate_postcopy_ram()) {
> +        if (open_outgoing_return_path(s)) {
> +            error_report("Unable to open return-path for postcopy");
> +            migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ERROR);
> +            migrate_fd_cleanup(s);
> +            return;
> +        }
> +    }
> +
>      qemu_thread_create(&s->thread, "migration", migration_thread, s,
>                         QEMU_THREAD_JOINABLE);
> +    s->started_migration_thread = true;
>  }
>
David Gibson Nov. 10, 2014, 6:05 a.m. UTC | #2
On Fri, Oct 03, 2014 at 06:47:39PM +0100, Dr. David Alan Gilbert (git) wrote:
> From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> 
> Rework the migration thread to setup and start postcopy.
> 
> Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> ---
>  include/migration/migration.h |   3 +
>  migration.c                   | 201 ++++++++++++++++++++++++++++++++++++++----
>  2 files changed, 185 insertions(+), 19 deletions(-)
> 
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index b01cc17..f401775 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -125,6 +125,9 @@ struct MigrationState
>      /* Flag set once the migration has been asked to enter postcopy */
>      volatile bool start_postcopy;
>  
> +    /* Flag set once the migration thread is running (and needs joining) */
> +    volatile bool started_migration_thread;
> +
>      /* bitmap of pages that have been sent at least once
>       * only maintained and used in postcopy at the moment
>       * where it's used to send the dirtymap at the start
> diff --git a/migration.c b/migration.c
> index 63d70b6..1731017 100644
> --- a/migration.c
> +++ b/migration.c
> @@ -475,7 +475,10 @@ static void migrate_fd_cleanup(void *opaque)
>      if (s->file) {
>          trace_migrate_fd_cleanup();
>          qemu_mutex_unlock_iothread();
> -        qemu_thread_join(&s->thread);
> +        if (s->started_migration_thread) {
> +            qemu_thread_join(&s->thread);
> +            s->started_migration_thread = false;
> +        }
>          qemu_mutex_lock_iothread();
>  
>          qemu_fclose(s->file);
> @@ -872,7 +875,6 @@ out:
>      return NULL;
>  }
>  
> -__attribute__ (( unused )) /* Until later in patch series */
>  static int open_outgoing_return_path(MigrationState *ms)
>  {
>  
> @@ -890,7 +892,6 @@ static int open_outgoing_return_path(MigrationState *ms)
>      return 0;
>  }
>  
> -__attribute__ (( unused )) /* Until later in patch series */
>  static void await_outgoing_return_path_close(MigrationState *ms)
>  {
>      /*
> @@ -908,6 +909,97 @@ static void await_outgoing_return_path_close(MigrationState *ms)
>      DPRINTF("%s: Exit", __func__);
>  }
>  
> +/* Switch from normal iteration to postcopy
> + * Returns non-0 on error
> + */
> +static int postcopy_start(MigrationState *ms)
> +{
> +    int ret;
> +    const QEMUSizedBuffer *qsb;
> +    migrate_set_state(ms, MIG_STATE_ACTIVE, MIG_STATE_POSTCOPY_ACTIVE);
> +
> +    DPRINTF("postcopy_start\n");
> +    qemu_mutex_lock_iothread();
> +    DPRINTF("postcopy_start: setting run state\n");
> +    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> +
> +    if (ret < 0) {
> +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> +        qemu_mutex_unlock_iothread();
> +        return -1;
> +    }
> +
> +    /*
> +     * in Finish migrate and with the io-lock held everything should
> +     * be quiet, but we've potentially still got dirty pages and we
> +     * need to tell the destination to throw any pages it's already received
> +     * that are dirty
> +     */
> +    if (ram_postcopy_send_discard_bitmap(ms)) {
> +        DPRINTF("postcopy send discard bitmap failed\n");
> +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> +        qemu_mutex_unlock_iothread();
> +        return -1;
> +    }
> +
> +    DPRINTF("postcopy_start: sending req 2\n");
> +    qemu_savevm_send_reqack(ms->file, 2);

Are these reqacks just for debugging, or do they affect the protocol?

> +    /*
> +     * send rest of state - note things that are doing postcopy
> +     * will notice we're in MIG_STATE_POSTCOPY_ACTIVE and not actually
> +     * wrap their state up here
> +     */
> +    qemu_file_set_rate_limit(ms->file, INT64_MAX);
> +    DPRINTF("postcopy_start: do state_complete\n");
> +
> +    /*
> +     * We need to leave the fd free for page transfers during the
> +     * loading of the device state, so wrap all the remaining
> +     * commands and state into a package that gets sent in one go
> +     */
> +    QEMUFile *fb = qemu_bufopen("w", NULL);
> +    if (!fb) {
> +        error_report("Failed to create buffered file");
> +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> +        qemu_mutex_unlock_iothread();
> +        return -1;
> +    }
> +
> +    qemu_savevm_state_complete(fb);
> +    DPRINTF("postcopy_start: sending req 3\n");
> +    qemu_savevm_send_reqack(fb, 3);
> +
> +    qemu_savevm_send_postcopy_ram_run(fb);
> +
> +    /* <><> end of stuff going into the package */
> +    qsb = qemu_buf_get(fb);
> +
> +    /* Now send that blob */
> +    if (qsb_get_length(qsb) > MAX_VM_CMD_PACKAGED_SIZE) {
> +        DPRINTF("postcopy_start: Unreasonably large packaged state: %lu\n",
> +                (unsigned long)(qsb_get_length(qsb)));
> +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> +        qemu_mutex_unlock_iothread();
> +        qemu_fclose(fb);
> +        return -1;
> +    }
> +    qemu_savevm_send_packaged(ms->file, qsb);
> +    qemu_fclose(fb);
> +
> +    qemu_mutex_unlock_iothread();
> +
> +    DPRINTF("postcopy_start not finished sending ack\n");
> +    qemu_savevm_send_reqack(ms->file, 4);
> +
> +    ret = qemu_file_get_error(ms->file);
> +    if (ret) {
> +        error_report("postcopy_start: Migration stream errored");
> +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> +    }
> +
> +    return ret;
> +}
> +
>  /*
>   * Master migration thread on the source VM.
>   * It drives the migration and pumps the data down the outgoing channel.
> @@ -915,16 +1007,36 @@ static void await_outgoing_return_path_close(MigrationState *ms)
>  static void *migration_thread(void *opaque)
>  {
>      MigrationState *s = opaque;
> +    /* Used by the bandwidth calcs, updated later */
>      int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
>      int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
>      int64_t initial_bytes = 0;
>      int64_t max_size = 0;
>      int64_t start_time = initial_time;
> +
>      bool old_vm_running = false;
>  
> +    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> +    enum MigrationPhase current_active_type = MIG_STATE_ACTIVE;
> +
>      qemu_savevm_state_begin(s->file, &s->params);
>  
> +    if (migrate_postcopy_ram()) {
> +        /* Now tell the dest that it should open it's end so it can reply */

s/it's/its/

> +        qemu_savevm_send_openrp(s->file);
> +
> +        /* And ask it to send an ack that will make stuff easier to debug */
> +        qemu_savevm_send_reqack(s->file, 1);
> +
> +        /* Tell the destination that we *might* want to do postcopy later;
> +         * if the other end can't do postcopy it should fail now, nice and
> +         * early.
> +         */
> +        qemu_savevm_send_postcopy_ram_advise(s->file);
> +    }
> +
>      s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
> +    current_active_type = MIG_STATE_ACTIVE;
>      migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);
>  
>      DPRINTF("setup complete\n");
> @@ -945,37 +1057,74 @@ static void *migration_thread(void *opaque)
>                      " nonpost=%" PRIu64 ")\n",
>                      pending_size, max_size, pend_post, pend_nonpost);
>              if (pending_size && pending_size >= max_size) {
> +                /* Still a significant amount to transfer */
> +
> +                current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +                if (migrate_postcopy_ram() &&
> +                    s->state != MIG_STATE_POSTCOPY_ACTIVE &&
> +                    pend_nonpost == 0 && s->start_postcopy) {

Hrm.  This is checking for pend_nonpost == 0, rather than just close
to zero.  IIUC this will only work if all "live sendable" state is
also postcopyable.  But if we have live sendable data that's not
postcopyable - like the power hash page table - we'll need some
threshold here, like we currently have for entering the stopped vm
phase of a precopy migration.

Or am I missing something?

> +
> +                    if (!postcopy_start(s)) {
> +                        current_active_type = MIG_STATE_POSTCOPY_ACTIVE;
> +                    }
> +
> +                    continue;
> +                }
> +                /* Just another iteration step */
>                  qemu_savevm_state_iterate(s->file);
>              } else {
>                  int ret;
>  
> -                DPRINTF("done iterating\n");
> -                qemu_mutex_lock_iothread();
> -                start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> -                qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
> -                old_vm_running = runstate_is_running();
> +                DPRINTF("done iterating pending size %" PRIu64 "\n",
> +                        pending_size);
> +
> +                if (s->state == MIG_STATE_ACTIVE) {
> +                    qemu_mutex_lock_iothread();
> +                    start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +                    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
> +                    old_vm_running = runstate_is_running();
> +
> +                    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> +                    if (ret >= 0) {
> +                        qemu_file_set_rate_limit(s->file, INT64_MAX);
> +                        qemu_savevm_state_complete(s->file);
> +                    }
> +                    qemu_mutex_unlock_iothread();
> +
> +                    if (ret < 0) {
> +                        migrate_set_state(s, current_active_type,
> +                                          MIG_STATE_ERROR);
> +                        break;
> +                    }
> +                } else if (s->state == MIG_STATE_POSTCOPY_ACTIVE) {
> +                    DPRINTF("postcopy end\n");
> +
> +                    qemu_savevm_state_postcopy_complete(s->file);
> +                    DPRINTF("postcopy end after complete\n");
>  
> -                ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> -                if (ret >= 0) {
> -                    qemu_file_set_rate_limit(s->file, INT64_MAX);
> -                    qemu_savevm_state_complete(s->file);
>                  }
> -                qemu_mutex_unlock_iothread();
>  
> -                if (ret < 0) {
> -                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
> -                    break;
> +                /*
> +                 * If rp was opened we must clean up the thread before
> +                 * cleaning everything else up.
> +                 * Postcopy opens rp if enabled (even if it's not avtivated)
> +                 */
> +                if (migrate_postcopy_ram()) {
> +                    DPRINTF("before rp close");
> +                    await_outgoing_return_path_close(s);
> +                    DPRINTF("after rp close");
>                  }
> -
>                  if (!qemu_file_get_error(s->file)) {
> -                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COMPLETED);
> +                    migrate_set_state(s, current_active_type,
> +                                      MIG_STATE_COMPLETED);
>                      break;
>                  }
>              }
>          }
>  
>          if (qemu_file_get_error(s->file)) {
> -            migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
> +            migrate_set_state(s, current_active_type, MIG_STATE_ERROR);
> +            DPRINTF("migration_thread: file is in error state\n");
>              break;
>          }
>          current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> @@ -1006,6 +1155,7 @@ static void *migration_thread(void *opaque)
>          }
>      }
>  
> +    DPRINTF("migration_thread: After loop");
>      qemu_mutex_lock_iothread();
>      if (s->state == MIG_STATE_COMPLETED) {
>          int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> @@ -1043,6 +1193,19 @@ void migrate_fd_connect(MigrationState *s)
>      /* Notify before starting migration thread */
>      notifier_list_notify(&migration_state_notifiers, s);
>  
> +    /* Open the return path; currently for postcopy but other things might
> +     * also want it.
> +     */
> +    if (migrate_postcopy_ram()) {
> +        if (open_outgoing_return_path(s)) {
> +            error_report("Unable to open return-path for postcopy");
> +            migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ERROR);
> +            migrate_fd_cleanup(s);
> +            return;
> +        }
> +    }
> +
>      qemu_thread_create(&s->thread, "migration", migration_thread, s,
>                         QEMU_THREAD_JOINABLE);
> +    s->started_migration_thread = true;
>  }
Dr. David Alan Gilbert Nov. 20, 2014, 11:45 a.m. UTC | #3
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> Il 03/10/2014 19:47, Dr. David Alan Gilbert (git) ha scritto:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > 
> > Rework the migration thread to setup and start postcopy.
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  include/migration/migration.h |   3 +
> >  migration.c                   | 201 ++++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 185 insertions(+), 19 deletions(-)
> > 
> > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > index b01cc17..f401775 100644
> > --- a/include/migration/migration.h
> > +++ b/include/migration/migration.h
> > @@ -125,6 +125,9 @@ struct MigrationState
> >      /* Flag set once the migration has been asked to enter postcopy */
> >      volatile bool start_postcopy;
> >  
> > +    /* Flag set once the migration thread is running (and needs joining) */
> > +    volatile bool started_migration_thread;
> 
> volatile almost never does what you think it does. :)

True.

> In this case, I think only one thread reads/writes the variable so
> "volatile" is unnecessary.

Lets just check that; so it's set by 'migrate_fd_connect' (from the main thread)
when it spawns the thread, and it's cleared by migrate_fd_cleanup that's always run
as a bh, so should always be in the main thread; so yes - always the same thread,
that's nice and simple; volatile evaporated.

> Otherwise, you would need to add actual memory barriers, atomic
> operations, or synchronization primitives.
> 
> For start_postcopy, it is okay because it is just a hint to the compiler
> and the processor will eventually see the assignment.

Yes, in this case my understanding is that it's necessary to stop the
compiler potentially moving the check outside the loop.

> For this case
> QEMU has atomic_read/atomic_set (corresponding to __ATOMIC_RELAXED in
> C/C++1x), so you could use those as well.

Ah, so those look like they just volatile cast anyway.

(I've probably got some other flags I need to think about reading/writing
atomically/safely).

Dave
(I'll take the other issues in this mail separately since there are quite a few).
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert Nov. 20, 2014, 5:12 p.m. UTC | #4
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> Il 03/10/2014 19:47, Dr. David Alan Gilbert (git) ha scritto:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > 
> > Rework the migration thread to setup and start postcopy.
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  include/migration/migration.h |   3 +
> >  migration.c                   | 201 ++++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 185 insertions(+), 19 deletions(-)
> > 
> > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > index b01cc17..f401775 100644
> > --- a/include/migration/migration.h
> > +++ b/include/migration/migration.h

<snip>

> > +/* Switch from normal iteration to postcopy
> > + * Returns non-0 on error
> > + */
> > +static int postcopy_start(MigrationState *ms)
> > +{
> > +    int ret;
> > +    const QEMUSizedBuffer *qsb;
> > +    migrate_set_state(ms, MIG_STATE_ACTIVE, MIG_STATE_POSTCOPY_ACTIVE);
> > +
> > +    DPRINTF("postcopy_start\n");
> > +    qemu_mutex_lock_iothread();
> > +    DPRINTF("postcopy_start: setting run state\n");
> > +    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> > +
> > +    if (ret < 0) {
> > +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> > +        qemu_mutex_unlock_iothread();
> > +        return -1;
> 
> Please use "goto" for error returns, like
> 
> fail_locked:
>     qemu_mutex_unlock_iothread();
> fail:
>     migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
>     return -1;

Done; they all end up unlocking, but I've got another label
for a case that has to close the fb later.

> > +    }
> > +
> > +    /*
> > +     * in Finish migrate and with the io-lock held everything should
> > +     * be quiet, but we've potentially still got dirty pages and we
> > +     * need to tell the destination to throw any pages it's already received
> > +     * that are dirty
> > +     */
> > +    if (ram_postcopy_send_discard_bitmap(ms)) {
> > +        DPRINTF("postcopy send discard bitmap failed\n");
> > +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> > +        qemu_mutex_unlock_iothread();
> > +        return -1;
> > +    }
> > +
> > +    DPRINTF("postcopy_start: sending req 2\n");
> > +    qemu_savevm_send_reqack(ms->file, 2);
> 
> Perhaps move it below qemu_file_set_rate_limit, and add
> trace_qemu_savevm_send_reqack?

Trace added, and also moved as requested - was the request to move
it just to elimintate the other DPRINTF?

> Also what is 2/3/4?  Is this just for debugging or is it part of the
> protocol?

Debug; they're very useful for matching the debug streams up, especially
when the timers on the two hosts are very different.
(I'm up for suggestions on how to mark the 2/3/4 for debug more clearly,
especially if it meant that it didn't make the ping (ne reqack) dedicated
to debug).

> > +    /*
> > +     * send rest of state - note things that are doing postcopy
> > +     * will notice we're in MIG_STATE_POSTCOPY_ACTIVE and not actually
> > +     * wrap their state up here
> > +     */
> > +    qemu_file_set_rate_limit(ms->file, INT64_MAX);
> > +    DPRINTF("postcopy_start: do state_complete\n");
> > +
> > +    /*
> > +     * We need to leave the fd free for page transfers during the
> > +     * loading of the device state, so wrap all the remaining
> > +     * commands and state into a package that gets sent in one go
> > +     */
> 
> The comments in the code are very nice.  Thanks.  This is a huge
> improvement from the last version I received.
> 
> > +    QEMUFile *fb = qemu_bufopen("w", NULL);
> > +    if (!fb) {
> > +        error_report("Failed to create buffered file");
> > +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> > +        qemu_mutex_unlock_iothread();
> > +        return -1;
> > +    }
> > +
> > +    qemu_savevm_state_complete(fb);
> > +    DPRINTF("postcopy_start: sending req 3\n");
> > +    qemu_savevm_send_reqack(fb, 3);
> > +
> > +    qemu_savevm_send_postcopy_ram_run(fb);
> > +
> > +    /* <><> end of stuff going into the package */
> > +    qsb = qemu_buf_get(fb);
> > +
> > +    /* Now send that blob */
> > +    if (qsb_get_length(qsb) > MAX_VM_CMD_PACKAGED_SIZE) {
> > +        DPRINTF("postcopy_start: Unreasonably large packaged state: %lu\n",
> > +                (unsigned long)(qsb_get_length(qsb)));
> > +        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
> > +        qemu_mutex_unlock_iothread();
> > +        qemu_fclose(fb);
> 
> Close fb above migrate_set_state, and use goto as above.  Or just have
> three labels.

Done, it's a separate label.

> 
> > +        return -1;
> > +    }
> > +    qemu_savevm_send_packaged(ms->file, qsb);
> > +    qemu_fclose(fb);
> > +
> > +    qemu_mutex_unlock_iothread();
> > +
> > +    DPRINTF("postcopy_start not finished sending ack\n");
> > +    qemu_savevm_send_reqack(ms->file, 4);
> > +
> > +    ret = qemu_file_get_error(ms->file);
> > +    if (ret) {
> > +        error_report("postcopy_start: Migration stream errored");
> 
> This should have been reported already.

No, sorry - I don't trust qemu_file reporting errors by itself.

Dave
(Again, the rest of the comments on this patch can wait for another mail)

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Paolo Bonzini Nov. 20, 2014, 5:19 p.m. UTC | #5
On 20/11/2014 18:12, Dr. David Alan Gilbert wrote:
> Trace added, and also moved as requested - was the request to move
> it just to elimintate the other DPRINTF?

Yes.

>> > Also what is 2/3/4?  Is this just for debugging or is it part of the
>> > protocol?
> Debug; they're very useful for matching the debug streams up, especially
> when the timers on the two hosts are very different.
> (I'm up for suggestions on how to mark the 2/3/4 for debug more clearly,
> especially if it meant that it didn't make the ping (ne reqack) dedicated
> to debug).
> 

No problem, as long as it's clear to the guy matching the code against
the debug output.

Paolo
Paolo Bonzini Nov. 21, 2014, 12:01 p.m. UTC | #6
On 20/11/2014 12:45, Dr. David Alan Gilbert wrote:
> > For this case QEMU has atomic_read/atomic_set (corresponding to
> > __ATOMIC_RELAXED in C/C++1x), so you could use those as well.
>
> Ah, so those look like they just volatile cast anyway.

Yeah, but it explicitly shows that the assignment is a) for a
multi-threaded operation b) using relaxed semantics.  It attaches the
information to the use instead of the variable; it just happens that
volatile is the pre-C11 way to express those.

Paolo
Dr. David Alan Gilbert Nov. 21, 2014, 12:07 p.m. UTC | #7
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> 
> 
> On 20/11/2014 12:45, Dr. David Alan Gilbert wrote:
> > > For this case QEMU has atomic_read/atomic_set (corresponding to
> > > __ATOMIC_RELAXED in C/C++1x), so you could use those as well.
> >
> > Ah, so those look like they just volatile cast anyway.
> 
> Yeah, but it explicitly shows that the assignment is a) for a
> multi-threaded operation b) using relaxed semantics.  It attaches the
> information to the use instead of the variable; it just happens that
> volatile is the pre-C11 way to express those.

OK, I'll use those anyway; Ideally what I'd have is a way to mark
something so that it'd compile-time-fail if I didn't use an atomic_
on it, because it's the type of thing that I'm bound to forget somewhere.

Dave

> 
> Paolo
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert Nov. 24, 2014, 6:26 p.m. UTC | #8
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> Il 03/10/2014 19:47, Dr. David Alan Gilbert (git) ha scritto:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > 
> > Rework the migration thread to setup and start postcopy.
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  include/migration/migration.h |   3 +
> >  migration.c                   | 201 ++++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 185 insertions(+), 19 deletions(-)
> > 

> > @@ -915,16 +1007,36 @@ static void await_outgoing_return_path_close(MigrationState *ms)
> >  static void *migration_thread(void *opaque)
> >  {
> >      MigrationState *s = opaque;
> > +    /* Used by the bandwidth calcs, updated later */
> >      int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> >      int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
> >      int64_t initial_bytes = 0;
> >      int64_t max_size = 0;
> >      int64_t start_time = initial_time;
> > +
> >      bool old_vm_running = false;
> >  
> > +    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
> > +    enum MigrationPhase current_active_type = MIG_STATE_ACTIVE;
> > +
> >      qemu_savevm_state_begin(s->file, &s->params);
> >  
> > +    if (migrate_postcopy_ram()) {
> > +        /* Now tell the dest that it should open it's end so it can reply */
> > +        qemu_savevm_send_openrp(s->file);
> > +
> > +        /* And ask it to send an ack that will make stuff easier to debug */
> > +        qemu_savevm_send_reqack(s->file, 1);
> > +
> > +        /* Tell the destination that we *might* want to do postcopy later;
> > +         * if the other end can't do postcopy it should fail now, nice and
> > +         * early.
> > +         */
> > +        qemu_savevm_send_postcopy_ram_advise(s->file);
> > +    }
> 
> Should this be done here or in the save_state_begin function for RAM?
> In general, I'm curious if there are parts of postcopy_start that
> could/should be changed into new save state functions (with
> postcopy_start just iterating on all devices).

The contents of this 'if' are generic to whatever is being postcopied,
(and as per one of your other comments the _ram_ has been removed from
the send_postcopy_ram_advise); so I think this is the right place for it.

> >      s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
> > +    current_active_type = MIG_STATE_ACTIVE;
> >      migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);
> >  
> >      DPRINTF("setup complete\n");
> > @@ -945,37 +1057,74 @@ static void *migration_thread(void *opaque)
> >                      " nonpost=%" PRIu64 ")\n",
> >                      pending_size, max_size, pend_post, pend_nonpost);
> >              if (pending_size && pending_size >= max_size) {
> > +                /* Still a significant amount to transfer */
> > +
> > +                current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > +                if (migrate_postcopy_ram() &&
> > +                    s->state != MIG_STATE_POSTCOPY_ACTIVE &&
> > +                    pend_nonpost == 0 && s->start_postcopy) {
> > +
> > +                    if (!postcopy_start(s)) {
> > +                        current_active_type = MIG_STATE_POSTCOPY_ACTIVE;
> > +                    }
> > +
> > +                    continue;
> > +                }
> > +                /* Just another iteration step */
> >                  qemu_savevm_state_iterate(s->file);
> >              } else {
> >                  int ret;
> >  
> > -                DPRINTF("done iterating\n");
> > -                qemu_mutex_lock_iothread();
> > -                start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > -                qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
> > -                old_vm_running = runstate_is_running();
> > +                DPRINTF("done iterating pending size %" PRIu64 "\n",
> > +                        pending_size);
> > +
> > +                if (s->state == MIG_STATE_ACTIVE) {
> > +                    qemu_mutex_lock_iothread();
> > +                    start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > +                    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
> > +                    old_vm_running = runstate_is_running();
> > +
> > +                    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> > +                    if (ret >= 0) {
> > +                        qemu_file_set_rate_limit(s->file, INT64_MAX);
> > +                        qemu_savevm_state_complete(s->file);
> > +                    }
> > +                    qemu_mutex_unlock_iothread();
> > +
> > +                    if (ret < 0) {
> > +                        migrate_set_state(s, current_active_type,
> > +                                          MIG_STATE_ERROR);
> > +                        break;
> > +                    }
> > +                } else if (s->state == MIG_STATE_POSTCOPY_ACTIVE) {
> > +                    DPRINTF("postcopy end\n");
> > +
> > +                    qemu_savevm_state_postcopy_complete(s->file);
> > +                    DPRINTF("postcopy end after complete\n");
> >  
> > -                ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> > -                if (ret >= 0) {
> > -                    qemu_file_set_rate_limit(s->file, INT64_MAX);
> > -                    qemu_savevm_state_complete(s->file);
> >                  }
> > -                qemu_mutex_unlock_iothread();
> >  
> > -                if (ret < 0) {
> > -                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
> > -                    break;
> > +                /*
> > +                 * If rp was opened we must clean up the thread before
> > +                 * cleaning everything else up.
> > +                 * Postcopy opens rp if enabled (even if it's not avtivated)
> > +                 */
> > +                if (migrate_postcopy_ram()) {
> > +                    DPRINTF("before rp close");
> > +                    await_outgoing_return_path_close(s);
> 
> Should this be done even if there is an error?  Perhaps move it
> altogether out of the big migration thread while() loop?

Yes, I've made a note of that; I need to go and look at more error
cases to see where it makes sense (e.g. the one above), however
in the non-error case I do want it to wait here for the 'SHUT' from
the destination to indicate that the destination believes migration
completed correctly (or not), and that should happen before
the state gets set to COMPLETED because we're waiting.

> > +                    DPRINTF("after rp close");
> >                  }
> > -
> >                  if (!qemu_file_get_error(s->file)) {
> > -                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COMPLETED);
> > +                    migrate_set_state(s, current_active_type,
> > +                                      MIG_STATE_COMPLETED);
> >                      break;
> >                  }
> 
> This "else" is huge, can you extract it into its own function?

Done, and all the changes inside this "else" I've moved into a separate
commit that deals with the end rather than the start of postcopy.

Dave

> 
> >              }
> >          }
> >  
> >          if (qemu_file_get_error(s->file)) {
> > -            migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
> > +            migrate_set_state(s, current_active_type, MIG_STATE_ERROR);
> > +            DPRINTF("migration_thread: file is in error state\n");
> >              break;
> >          }
> >          current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > @@ -1006,6 +1155,7 @@ static void *migration_thread(void *opaque)
> >          }
> >      }
> >  
> > +    DPRINTF("migration_thread: After loop");
> >      qemu_mutex_lock_iothread();
> >      if (s->state == MIG_STATE_COMPLETED) {
> >          int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > @@ -1043,6 +1193,19 @@ void migrate_fd_connect(MigrationState *s)
> >      /* Notify before starting migration thread */
> >      notifier_list_notify(&migration_state_notifiers, s);
> >  
> > +    /* Open the return path; currently for postcopy but other things might
> > +     * also want it.
> > +     */
> > +    if (migrate_postcopy_ram()) {
> > +        if (open_outgoing_return_path(s)) {
> > +            error_report("Unable to open return-path for postcopy");
> > +            migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ERROR);
> > +            migrate_fd_cleanup(s);
> > +            return;
> > +        }
> > +    }
> > +
> >      qemu_thread_create(&s->thread, "migration", migration_thread, s,
> >                         QEMU_THREAD_JOINABLE);
> > +    s->started_migration_thread = true;
> >  }
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert Jan. 5, 2015, 4:06 p.m. UTC | #9
* David Gibson (david@gibson.dropbear.id.au) wrote:
> On Fri, Oct 03, 2014 at 06:47:39PM +0100, Dr. David Alan Gilbert (git) wrote:
> > From: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
> > 
> > Rework the migration thread to setup and start postcopy.
> > 
> > Signed-off-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> > ---
> >  include/migration/migration.h |   3 +
> >  migration.c                   | 201 ++++++++++++++++++++++++++++++++++++++----
> >  2 files changed, 185 insertions(+), 19 deletions(-)
> > 
> > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > index b01cc17..f401775 100644
> > --- a/include/migration/migration.h
> > +++ b/include/migration/migration.h

> > +    DPRINTF("postcopy_start: sending req 2\n");
> > +    qemu_savevm_send_reqack(ms->file, 2);
> 
> Are these reqacks just for debugging, or do they affect the protocol?

Just for debugging; comment added.  They just make it easy to line
traces up between the source and destination, and also make it easy to figure
out how far stuff has got if it jams up.

> > +    if (migrate_postcopy_ram()) {
> > +        /* Now tell the dest that it should open it's end so it can reply */
> 
> s/it's/its/

Fixed.

> > +        qemu_savevm_send_openrp(s->file);
> > +
> > +        /* And ask it to send an ack that will make stuff easier to debug */
> > +        qemu_savevm_send_reqack(s->file, 1);
> > +
> > +        /* Tell the destination that we *might* want to do postcopy later;
> > +         * if the other end can't do postcopy it should fail now, nice and
> > +         * early.
> > +         */
> > +        qemu_savevm_send_postcopy_ram_advise(s->file);
> > +    }
> > +
> >      s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
> > +    current_active_type = MIG_STATE_ACTIVE;
> >      migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);
> >  
> >      DPRINTF("setup complete\n");
> > @@ -945,37 +1057,74 @@ static void *migration_thread(void *opaque)
> >                      " nonpost=%" PRIu64 ")\n",
> >                      pending_size, max_size, pend_post, pend_nonpost);
> >              if (pending_size && pending_size >= max_size) {
> > +                /* Still a significant amount to transfer */
> > +
> > +                current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> > +                if (migrate_postcopy_ram() &&
> > +                    s->state != MIG_STATE_POSTCOPY_ACTIVE &&
> > +                    pend_nonpost == 0 && s->start_postcopy) {
> 
> Hrm.  This is checking for pend_nonpost == 0, rather than just close
> to zero.  IIUC this will only work if all "live sendable" state is
> also postcopyable.  But if we have live sendable data that's not
> postcopyable - like the power hash page table - we'll need some
> threshold here, like we currently have for entering the stopped vm
> phase of a precopy migration.
> 
> Or am I missing something?

Hmm, I think you're right; I've changed this to:
	pend_nonpost <= max_size 

so that it's the same cut-off logic as the normal end-of-migrate;
I think that will work; i.e. it gets small enough to be expected
to complete quickly in the _complete phase that's at the start
of postcopy.

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox

Patch

diff --git a/include/migration/migration.h b/include/migration/migration.h
index b01cc17..f401775 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -125,6 +125,9 @@  struct MigrationState
     /* Flag set once the migration has been asked to enter postcopy */
     volatile bool start_postcopy;
 
+    /* Flag set once the migration thread is running (and needs joining) */
+    volatile bool started_migration_thread;
+
     /* bitmap of pages that have been sent at least once
      * only maintained and used in postcopy at the moment
      * where it's used to send the dirtymap at the start
diff --git a/migration.c b/migration.c
index 63d70b6..1731017 100644
--- a/migration.c
+++ b/migration.c
@@ -475,7 +475,10 @@  static void migrate_fd_cleanup(void *opaque)
     if (s->file) {
         trace_migrate_fd_cleanup();
         qemu_mutex_unlock_iothread();
-        qemu_thread_join(&s->thread);
+        if (s->started_migration_thread) {
+            qemu_thread_join(&s->thread);
+            s->started_migration_thread = false;
+        }
         qemu_mutex_lock_iothread();
 
         qemu_fclose(s->file);
@@ -872,7 +875,6 @@  out:
     return NULL;
 }
 
-__attribute__ (( unused )) /* Until later in patch series */
 static int open_outgoing_return_path(MigrationState *ms)
 {
 
@@ -890,7 +892,6 @@  static int open_outgoing_return_path(MigrationState *ms)
     return 0;
 }
 
-__attribute__ (( unused )) /* Until later in patch series */
 static void await_outgoing_return_path_close(MigrationState *ms)
 {
     /*
@@ -908,6 +909,97 @@  static void await_outgoing_return_path_close(MigrationState *ms)
     DPRINTF("%s: Exit", __func__);
 }
 
+/* Switch from normal iteration to postcopy
+ * Returns non-0 on error
+ */
+static int postcopy_start(MigrationState *ms)
+{
+    int ret;
+    const QEMUSizedBuffer *qsb;
+    migrate_set_state(ms, MIG_STATE_ACTIVE, MIG_STATE_POSTCOPY_ACTIVE);
+
+    DPRINTF("postcopy_start\n");
+    qemu_mutex_lock_iothread();
+    DPRINTF("postcopy_start: setting run state\n");
+    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+
+    if (ret < 0) {
+        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
+        qemu_mutex_unlock_iothread();
+        return -1;
+    }
+
+    /*
+     * in Finish migrate and with the io-lock held everything should
+     * be quiet, but we've potentially still got dirty pages and we
+     * need to tell the destination to throw any pages it's already received
+     * that are dirty
+     */
+    if (ram_postcopy_send_discard_bitmap(ms)) {
+        DPRINTF("postcopy send discard bitmap failed\n");
+        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
+        qemu_mutex_unlock_iothread();
+        return -1;
+    }
+
+    DPRINTF("postcopy_start: sending req 2\n");
+    qemu_savevm_send_reqack(ms->file, 2);
+    /*
+     * send rest of state - note things that are doing postcopy
+     * will notice we're in MIG_STATE_POSTCOPY_ACTIVE and not actually
+     * wrap their state up here
+     */
+    qemu_file_set_rate_limit(ms->file, INT64_MAX);
+    DPRINTF("postcopy_start: do state_complete\n");
+
+    /*
+     * We need to leave the fd free for page transfers during the
+     * loading of the device state, so wrap all the remaining
+     * commands and state into a package that gets sent in one go
+     */
+    QEMUFile *fb = qemu_bufopen("w", NULL);
+    if (!fb) {
+        error_report("Failed to create buffered file");
+        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
+        qemu_mutex_unlock_iothread();
+        return -1;
+    }
+
+    qemu_savevm_state_complete(fb);
+    DPRINTF("postcopy_start: sending req 3\n");
+    qemu_savevm_send_reqack(fb, 3);
+
+    qemu_savevm_send_postcopy_ram_run(fb);
+
+    /* <><> end of stuff going into the package */
+    qsb = qemu_buf_get(fb);
+
+    /* Now send that blob */
+    if (qsb_get_length(qsb) > MAX_VM_CMD_PACKAGED_SIZE) {
+        DPRINTF("postcopy_start: Unreasonably large packaged state: %lu\n",
+                (unsigned long)(qsb_get_length(qsb)));
+        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
+        qemu_mutex_unlock_iothread();
+        qemu_fclose(fb);
+        return -1;
+    }
+    qemu_savevm_send_packaged(ms->file, qsb);
+    qemu_fclose(fb);
+
+    qemu_mutex_unlock_iothread();
+
+    DPRINTF("postcopy_start not finished sending ack\n");
+    qemu_savevm_send_reqack(ms->file, 4);
+
+    ret = qemu_file_get_error(ms->file);
+    if (ret) {
+        error_report("postcopy_start: Migration stream errored");
+        migrate_set_state(ms, MIG_STATE_POSTCOPY_ACTIVE, MIG_STATE_ERROR);
+    }
+
+    return ret;
+}
+
 /*
  * Master migration thread on the source VM.
  * It drives the migration and pumps the data down the outgoing channel.
@@ -915,16 +1007,36 @@  static void await_outgoing_return_path_close(MigrationState *ms)
 static void *migration_thread(void *opaque)
 {
     MigrationState *s = opaque;
+    /* Used by the bandwidth calcs, updated later */
     int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     int64_t initial_bytes = 0;
     int64_t max_size = 0;
     int64_t start_time = initial_time;
+
     bool old_vm_running = false;
 
+    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
+    enum MigrationPhase current_active_type = MIG_STATE_ACTIVE;
+
     qemu_savevm_state_begin(s->file, &s->params);
 
+    if (migrate_postcopy_ram()) {
+        /* Now tell the dest that it should open it's end so it can reply */
+        qemu_savevm_send_openrp(s->file);
+
+        /* And ask it to send an ack that will make stuff easier to debug */
+        qemu_savevm_send_reqack(s->file, 1);
+
+        /* Tell the destination that we *might* want to do postcopy later;
+         * if the other end can't do postcopy it should fail now, nice and
+         * early.
+         */
+        qemu_savevm_send_postcopy_ram_advise(s->file);
+    }
+
     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
+    current_active_type = MIG_STATE_ACTIVE;
     migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ACTIVE);
 
     DPRINTF("setup complete\n");
@@ -945,37 +1057,74 @@  static void *migration_thread(void *opaque)
                     " nonpost=%" PRIu64 ")\n",
                     pending_size, max_size, pend_post, pend_nonpost);
             if (pending_size && pending_size >= max_size) {
+                /* Still a significant amount to transfer */
+
+                current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+                if (migrate_postcopy_ram() &&
+                    s->state != MIG_STATE_POSTCOPY_ACTIVE &&
+                    pend_nonpost == 0 && s->start_postcopy) {
+
+                    if (!postcopy_start(s)) {
+                        current_active_type = MIG_STATE_POSTCOPY_ACTIVE;
+                    }
+
+                    continue;
+                }
+                /* Just another iteration step */
                 qemu_savevm_state_iterate(s->file);
             } else {
                 int ret;
 
-                DPRINTF("done iterating\n");
-                qemu_mutex_lock_iothread();
-                start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-                qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
-                old_vm_running = runstate_is_running();
+                DPRINTF("done iterating pending size %" PRIu64 "\n",
+                        pending_size);
+
+                if (s->state == MIG_STATE_ACTIVE) {
+                    qemu_mutex_lock_iothread();
+                    start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+                    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+                    old_vm_running = runstate_is_running();
+
+                    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+                    if (ret >= 0) {
+                        qemu_file_set_rate_limit(s->file, INT64_MAX);
+                        qemu_savevm_state_complete(s->file);
+                    }
+                    qemu_mutex_unlock_iothread();
+
+                    if (ret < 0) {
+                        migrate_set_state(s, current_active_type,
+                                          MIG_STATE_ERROR);
+                        break;
+                    }
+                } else if (s->state == MIG_STATE_POSTCOPY_ACTIVE) {
+                    DPRINTF("postcopy end\n");
+
+                    qemu_savevm_state_postcopy_complete(s->file);
+                    DPRINTF("postcopy end after complete\n");
 
-                ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
-                if (ret >= 0) {
-                    qemu_file_set_rate_limit(s->file, INT64_MAX);
-                    qemu_savevm_state_complete(s->file);
                 }
-                qemu_mutex_unlock_iothread();
 
-                if (ret < 0) {
-                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
-                    break;
+                /*
+                 * If rp was opened we must clean up the thread before
+                 * cleaning everything else up.
+                 * Postcopy opens rp if enabled (even if it's not avtivated)
+                 */
+                if (migrate_postcopy_ram()) {
+                    DPRINTF("before rp close");
+                    await_outgoing_return_path_close(s);
+                    DPRINTF("after rp close");
                 }
-
                 if (!qemu_file_get_error(s->file)) {
-                    migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_COMPLETED);
+                    migrate_set_state(s, current_active_type,
+                                      MIG_STATE_COMPLETED);
                     break;
                 }
             }
         }
 
         if (qemu_file_get_error(s->file)) {
-            migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
+            migrate_set_state(s, current_active_type, MIG_STATE_ERROR);
+            DPRINTF("migration_thread: file is in error state\n");
             break;
         }
         current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -1006,6 +1155,7 @@  static void *migration_thread(void *opaque)
         }
     }
 
+    DPRINTF("migration_thread: After loop");
     qemu_mutex_lock_iothread();
     if (s->state == MIG_STATE_COMPLETED) {
         int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -1043,6 +1193,19 @@  void migrate_fd_connect(MigrationState *s)
     /* Notify before starting migration thread */
     notifier_list_notify(&migration_state_notifiers, s);
 
+    /* Open the return path; currently for postcopy but other things might
+     * also want it.
+     */
+    if (migrate_postcopy_ram()) {
+        if (open_outgoing_return_path(s)) {
+            error_report("Unable to open return-path for postcopy");
+            migrate_set_state(s, MIG_STATE_SETUP, MIG_STATE_ERROR);
+            migrate_fd_cleanup(s);
+            return;
+        }
+    }
+
     qemu_thread_create(&s->thread, "migration", migration_thread, s,
                        QEMU_THREAD_JOINABLE);
+    s->started_migration_thread = true;
 }