
[PULL,15/28] migration: create new section to store global state

Message ID 1436274549-28826-16-git-send-email-quintela@redhat.com
State New

Commit Message

Juan Quintela July 7, 2015, 1:08 p.m. UTC
This adds a new section that for now just stores the current qemu state.

Right now, there is only one way to control the state of the
target after migration:

- If you run the target qemu with -S, it starts stopped.
- If you run the target qemu without -S, it starts running as soon as migration finishes.

The problem is what happens if we start the target without -S and
an error during migration (e.g. -EIO) changes the current state.
Migration would still complete (note that the error happened doing
block IO or network IO, i.e. nothing related to migration itself),
and when it finishes we would just "continue" running on the
destination, probably hanging the guest or corrupting data.
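
In code terms (see the process_incoming_migration_co() hunk below), the
destination used to do:

    if (autostart) {
        vm_start();     /* resumes even a guest that hit an error */
    } else {
        runstate_set(RUN_STATE_PAUSED);
    }

With this section, a transmitted runstate of "running" starts the
guest, an empty runstate falls back to autostart as before, and any
other runstate (e.g. "io-error") is restored by the section's
post_load, so the guest stays stopped.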

Signed-off-by: Juan Quintela <quintela@redhat.com>
Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
---
 include/migration/migration.h |   1 +
 migration/migration.c         | 105 +++++++++++++++++++++++++++++++++++++++---
 trace-events                  |   3 ++
 vl.c                          |   1 +
 4 files changed, 103 insertions(+), 7 deletions(-)

Comments

Christian Borntraeger July 8, 2015, 10:11 a.m. UTC | #1
On 07.07.2015 at 15:08, Juan Quintela wrote:
> This adds a new section that for now just stores the current qemu state.
>
> Right now, there is only one way to control the state of the
> target after migration:
>
> - If you run the target qemu with -S, it starts stopped.
> - If you run the target qemu without -S, it starts running as soon as migration finishes.
>
> The problem is what happens if we start the target without -S and
> an error during migration (e.g. -EIO) changes the current state.
> Migration would still complete (note that the error happened doing
> block IO or network IO, i.e. nothing related to migration itself),
> and when it finishes we would just "continue" running on the
> destination, probably hanging the guest or corrupting data.
> 
> Signed-off-by: Juan Quintela <quintela@redhat.com>
> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

This is bisected to cause a regression on s390.

A guest restarts (booting) after managedsave/start instead of continuing.

Do you have any idea what might be wrong?

> ---
>  include/migration/migration.h |   1 +
>  migration/migration.c         | 105 +++++++++++++++++++++++++++++++++++++++---
>  trace-events                  |   3 ++
>  vl.c                          |   1 +
>  4 files changed, 103 insertions(+), 7 deletions(-)
> 
> diff --git a/include/migration/migration.h b/include/migration/migration.h
> index afba233..86df6cc 100644
> --- a/include/migration/migration.h
> +++ b/include/migration/migration.h
> @@ -197,4 +197,5 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
> 
>  void ram_mig_init(void);
>  void savevm_skip_section_footers(void);
> +void register_global_state(void);
>  #endif
> diff --git a/migration/migration.c b/migration/migration.c
> index c6ac08a..5e436f7 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -26,6 +26,7 @@
>  #include "qemu/thread.h"
>  #include "qmp-commands.h"
>  #include "trace.h"
> +#include "qapi/util.h"
> 
>  #define MAX_THROTTLE  (32 << 20)      /* Migration speed throttling */
> 
> @@ -97,6 +98,83 @@ void migration_incoming_state_destroy(void)
>      mis_current = NULL;
>  }
> 
> +
> +typedef struct {
> +    uint32_t size;
> +    uint8_t runstate[100];
> +} GlobalState;
> +
> +static GlobalState global_state;
> +
> +static int global_state_store(void)
> +{
> +    if (!runstate_store((char *)global_state.runstate,
> +                        sizeof(global_state.runstate))) {
> +        error_report("runstate name too big: %s", global_state.runstate);
> +        trace_migrate_state_too_big();
> +        return -EINVAL;
> +    }
> +    return 0;
> +}
> +
> +static char *global_state_get_runstate(void)
> +{
> +    return (char *)global_state.runstate;
> +}
> +
> +static int global_state_post_load(void *opaque, int version_id)
> +{
> +    GlobalState *s = opaque;
> +    int ret = 0;
> +    char *runstate = (char *)s->runstate;
> +
> +    trace_migrate_global_state_post_load(runstate);
> +
> +    if (strcmp(runstate, "running") != 0) {
> +        Error *local_err = NULL;
> +        int r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE_MAX,
> +                                -1, &local_err);
> +
> +        if (r == -1) {
> +            if (local_err) {
> +                error_report_err(local_err);
> +            }
> +            return -EINVAL;
> +        }
> +        ret = vm_stop_force_state(r);
> +    }
> +
> +    return ret;
> +}
> +
> +static void global_state_pre_save(void *opaque)
> +{
> +    GlobalState *s = opaque;
> +
> +    trace_migrate_global_state_pre_save((char *)s->runstate);
> +    s->size = strlen((char *)s->runstate) + 1;
> +}
> +
> +static const VMStateDescription vmstate_globalstate = {
> +    .name = "globalstate",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .post_load = global_state_post_load,
> +    .pre_save = global_state_pre_save,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_UINT32(size, GlobalState),
> +        VMSTATE_BUFFER(runstate, GlobalState),
> +        VMSTATE_END_OF_LIST()
> +    },
> +};
> +
> +void register_global_state(void)
> +{
> +    /* We will use it whether or not we receive it over the wire */
> +    strcpy((char *)&global_state.runstate, "");
> +    vmstate_register(NULL, 0, &vmstate_globalstate, &global_state);
> +}
> +
>  /*
>   * Called on -incoming with a defer: uri.
>   * The migration can be started later after any parameters have been
> @@ -164,10 +242,20 @@ static void process_incoming_migration_co(void *opaque)
>          exit(EXIT_FAILURE);
>      }
> 
> -    if (autostart) {
> +    /* runstate == "" means that we haven't received it over the
> +     * wire, so we obey autostart.  runstate == "running" means that
> +     * we need to start the guest, and we need to make sure we do it
> +     * after everything else has finished.  Every other state change
> +     * is done in the post_load function */
> +
> +    if (strcmp(global_state_get_runstate(), "running") == 0) {
>          vm_start();
> -    } else {
> -        runstate_set(RUN_STATE_PAUSED);
> +    } else if (strcmp(global_state_get_runstate(), "") == 0) {
> +        if (autostart) {
> +            vm_start();
> +        } else {
> +            runstate_set(RUN_STATE_PAUSED);
> +        }
>      }
>      migrate_decompress_threads_join();
>  }
> @@ -793,10 +881,13 @@ static void *migration_thread(void *opaque)
>                  qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
>                  old_vm_running = runstate_is_running();
> 
> -                ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> -                if (ret >= 0) {
> -                    qemu_file_set_rate_limit(s->file, INT64_MAX);
> -                    qemu_savevm_state_complete(s->file);
> +                ret = global_state_store();
> +                if (!ret) {
> +                    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> +                    if (ret >= 0) {
> +                        qemu_file_set_rate_limit(s->file, INT64_MAX);
> +                        qemu_savevm_state_complete(s->file);
> +                    }
>                  }
>                  qemu_mutex_unlock_iothread();
> 
> diff --git a/trace-events b/trace-events
> index a38dd2e..c0111d0 100644
> --- a/trace-events
> +++ b/trace-events
> @@ -1403,6 +1403,9 @@ migrate_fd_error(void) ""
>  migrate_fd_cancel(void) ""
>  migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64
>  migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
> +migrate_state_too_big(void) ""
> +migrate_global_state_post_load(const char *state) "loaded state: %s"
> +migrate_global_state_pre_save(const char *state) "saved state: %s"
> 
>  # migration/rdma.c
>  qemu_rdma_accept_incoming_migration(void) ""
> diff --git a/vl.c b/vl.c
> index 19a8737..cfa6133 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -4624,6 +4624,7 @@ int main(int argc, char **argv, char **envp)
>          return 0;
>      }
> 
> +    register_global_state();
>      if (incoming) {
>          Error *local_err = NULL;
>          qemu_start_incoming_migration(incoming, &local_err);
>
Dr. David Alan Gilbert July 8, 2015, 10:14 a.m. UTC | #2
* Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> On 07.07.2015 at 15:08, Juan Quintela wrote:
> > This adds a new section that for now just stores the current qemu state.
> >
> > Right now, there is only one way to control the state of the
> > target after migration:
> >
> > - If you run the target qemu with -S, it starts stopped.
> > - If you run the target qemu without -S, it starts running as soon as migration finishes.
> >
> > The problem is what happens if we start the target without -S and
> > an error during migration (e.g. -EIO) changes the current state.
> > Migration would still complete (note that the error happened doing
> > block IO or network IO, i.e. nothing related to migration itself),
> > and when it finishes we would just "continue" running on the
> > destination, probably hanging the guest or corrupting data.
> > 
> > Signed-off-by: Juan Quintela <quintela@redhat.com>
> > Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> 
> This is bisected to cause a regression on s390.
> 
> A guest restarts (booting) after managedsave/start instead of continuing.
> 
> Do you have any idea what might be wrong?

I'd add some debug to the pre_save and post_load to see what state value is
being saved/restored.
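
Something like this, if no trace backend is compiled in (just a sketch
that mirrors the patch's global_state_pre_save(); the
trace_migrate_global_state_* points give the same information when
tracing is enabled):

    static void global_state_pre_save(void *opaque)
    {
        GlobalState *s = opaque;

        fprintf(stderr, "global_state pre_save: '%s'\n",
                (char *)s->runstate);
        s->size = strlen((char *)s->runstate) + 1;
    }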

Also, does that regression happen when doing the save/restore using the same/latest
git, or is it a load from an older version?

Dave

> 
> > ---
> >  include/migration/migration.h |   1 +
> >  migration/migration.c         | 105 +++++++++++++++++++++++++++++++++++++++---
> >  trace-events                  |   3 ++
> >  vl.c                          |   1 +
> >  4 files changed, 103 insertions(+), 7 deletions(-)
> > 
> > diff --git a/include/migration/migration.h b/include/migration/migration.h
> > index afba233..86df6cc 100644
> > --- a/include/migration/migration.h
> > +++ b/include/migration/migration.h
> > @@ -197,4 +197,5 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
> > 
> >  void ram_mig_init(void);
> >  void savevm_skip_section_footers(void);
> > +void register_global_state(void);
> >  #endif
> > diff --git a/migration/migration.c b/migration/migration.c
> > index c6ac08a..5e436f7 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -26,6 +26,7 @@
> >  #include "qemu/thread.h"
> >  #include "qmp-commands.h"
> >  #include "trace.h"
> > +#include "qapi/util.h"
> > 
> >  #define MAX_THROTTLE  (32 << 20)      /* Migration speed throttling */
> > 
> > @@ -97,6 +98,83 @@ void migration_incoming_state_destroy(void)
> >      mis_current = NULL;
> >  }
> > 
> > +
> > +typedef struct {
> > +    uint32_t size;
> > +    uint8_t runstate[100];
> > +} GlobalState;
> > +
> > +static GlobalState global_state;
> > +
> > +static int global_state_store(void)
> > +{
> > +    if (!runstate_store((char *)global_state.runstate,
> > +                        sizeof(global_state.runstate))) {
> > +        error_report("runstate name too big: %s", global_state.runstate);
> > +        trace_migrate_state_too_big();
> > +        return -EINVAL;
> > +    }
> > +    return 0;
> > +}
> > +
> > +static char *global_state_get_runstate(void)
> > +{
> > +    return (char *)global_state.runstate;
> > +}
> > +
> > +static int global_state_post_load(void *opaque, int version_id)
> > +{
> > +    GlobalState *s = opaque;
> > +    int ret = 0;
> > +    char *runstate = (char *)s->runstate;
> > +
> > +    trace_migrate_global_state_post_load(runstate);
> > +
> > +    if (strcmp(runstate, "running") != 0) {
> > +        Error *local_err = NULL;
> > +        int r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE_MAX,
> > +                                -1, &local_err);
> > +
> > +        if (r == -1) {
> > +            if (local_err) {
> > +                error_report_err(local_err);
> > +            }
> > +            return -EINVAL;
> > +        }
> > +        ret = vm_stop_force_state(r);
> > +    }
> > +
> > +    return ret;
> > +}
> > +
> > +static void global_state_pre_save(void *opaque)
> > +{
> > +    GlobalState *s = opaque;
> > +
> > +    trace_migrate_global_state_pre_save((char *)s->runstate);
> > +    s->size = strlen((char *)s->runstate) + 1;
> > +}
> > +
> > +static const VMStateDescription vmstate_globalstate = {
> > +    .name = "globalstate",
> > +    .version_id = 1,
> > +    .minimum_version_id = 1,
> > +    .post_load = global_state_post_load,
> > +    .pre_save = global_state_pre_save,
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_UINT32(size, GlobalState),
> > +        VMSTATE_BUFFER(runstate, GlobalState),
> > +        VMSTATE_END_OF_LIST()
> > +    },
> > +};
> > +
> > +void register_global_state(void)
> > +{
> > +    /* We will use it whether or not we receive it over the wire */
> > +    strcpy((char *)&global_state.runstate, "");
> > +    vmstate_register(NULL, 0, &vmstate_globalstate, &global_state);
> > +}
> > +
> >  /*
> >   * Called on -incoming with a defer: uri.
> >   * The migration can be started later after any parameters have been
> > @@ -164,10 +242,20 @@ static void process_incoming_migration_co(void *opaque)
> >          exit(EXIT_FAILURE);
> >      }
> > 
> > -    if (autostart) {
> > +    /* runstate == "" means that we haven't received it over the
> > +     * wire, so we obey autostart.  runstate == "running" means that
> > +     * we need to start the guest, and we need to make sure we do it
> > +     * after everything else has finished.  Every other state change
> > +     * is done in the post_load function */
> > +
> > +    if (strcmp(global_state_get_runstate(), "running") == 0) {
> >          vm_start();
> > -    } else {
> > -        runstate_set(RUN_STATE_PAUSED);
> > +    } else if (strcmp(global_state_get_runstate(), "") == 0) {
> > +        if (autostart) {
> > +            vm_start();
> > +        } else {
> > +            runstate_set(RUN_STATE_PAUSED);
> > +        }
> >      }
> >      migrate_decompress_threads_join();
> >  }
> > @@ -793,10 +881,13 @@ static void *migration_thread(void *opaque)
> >                  qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
> >                  old_vm_running = runstate_is_running();
> > 
> > -                ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> > -                if (ret >= 0) {
> > -                    qemu_file_set_rate_limit(s->file, INT64_MAX);
> > -                    qemu_savevm_state_complete(s->file);
> > +                ret = global_state_store();
> > +                if (!ret) {
> > +                    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> > +                    if (ret >= 0) {
> > +                        qemu_file_set_rate_limit(s->file, INT64_MAX);
> > +                        qemu_savevm_state_complete(s->file);
> > +                    }
> >                  }
> >                  qemu_mutex_unlock_iothread();
> > 
> > diff --git a/trace-events b/trace-events
> > index a38dd2e..c0111d0 100644
> > --- a/trace-events
> > +++ b/trace-events
> > @@ -1403,6 +1403,9 @@ migrate_fd_error(void) ""
> >  migrate_fd_cancel(void) ""
> >  migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64
> >  migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
> > +migrate_state_too_big(void) ""
> > +migrate_global_state_post_load(const char *state) "loaded state: %s"
> > +migrate_global_state_pre_save(const char *state) "saved state: %s"
> > 
> >  # migration/rdma.c
> >  qemu_rdma_accept_incoming_migration(void) ""
> > diff --git a/vl.c b/vl.c
> > index 19a8737..cfa6133 100644
> > --- a/vl.c
> > +++ b/vl.c
> > @@ -4624,6 +4624,7 @@ int main(int argc, char **argv, char **envp)
> >          return 0;
> >      }
> > 
> > +    register_global_state();
> >      if (incoming) {
> >          Error *local_err = NULL;
> >          qemu_start_incoming_migration(incoming, &local_err);
> > 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Christian Borntraeger July 8, 2015, 10:19 a.m. UTC | #3
On 08.07.2015 at 12:14, Dr. David Alan Gilbert wrote:
> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
>> On 07.07.2015 at 15:08, Juan Quintela wrote:
>>> This adds a new section that for now just stores the current qemu state.
>>>
>>> Right now, there is only one way to control the state of the
>>> target after migration:
>>>
>>> - If you run the target qemu with -S, it starts stopped.
>>> - If you run the target qemu without -S, it starts running as soon as migration finishes.
>>>
>>> The problem is what happens if we start the target without -S and
>>> an error during migration (e.g. -EIO) changes the current state.
>>> Migration would still complete (note that the error happened doing
>>> block IO or network IO, i.e. nothing related to migration itself),
>>> and when it finishes we would just "continue" running on the
>>> destination, probably hanging the guest or corrupting data.
>>>
>>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>
>> This is bisected to cause a regression on s390.
>>
>> A guest restarts (booting) after managedsave/start instead of continuing.
>>
>> Do you have any idea what might be wrong?
> 
> I'd add some debug to the pre_save and post_load to see what state value is
> being saved/restored.

migrate_global_state_pre_save saved state: paused
migrate_global_state_post_load loaded state: paused

 
> Also, does that regression happen when doing the save/restore using the same/latest
> git, or is it a load from an older version?

It's the same git that fails.
Christian Borntraeger July 8, 2015, 10:36 a.m. UTC | #4
On 08.07.2015 at 12:14, Dr. David Alan Gilbert wrote:
> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
>> On 07.07.2015 at 15:08, Juan Quintela wrote:
>>> This adds a new section that for now just stores the current qemu state.
>>>
>>> Right now, there is only one way to control the state of the
>>> target after migration:
>>>
>>> - If you run the target qemu with -S, it starts stopped.
>>> - If you run the target qemu without -S, it starts running as soon as migration finishes.
>>>
>>> The problem is what happens if we start the target without -S and
>>> an error during migration (e.g. -EIO) changes the current state.
>>> Migration would still complete (note that the error happened doing
>>> block IO or network IO, i.e. nothing related to migration itself),
>>> and when it finishes we would just "continue" running on the
>>> destination, probably hanging the guest or corrupting data.
>>>
>>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>
>> This is bisected to cause a regression on s390.
>>
>> A guest restarts (booting) after managedsave/start instead of continuing.
>>
>> Do you have any idea what might be wrong?
> 
> I'd add some debug to the pre_save and post_load to see what state value is
> being saved/restored.
> 
> Also, does that regression happen when doing the save/restore using the same/latest
> git, or is it a load from an older version?

Seems to happen only with some guest definitions, but I can't really pinpoint it yet.
e.g. removing queues='4' from my network card solved it for a reduced xml, but
doing the same on a bigger xml was not enough :-/
Dr. David Alan Gilbert July 8, 2015, 10:43 a.m. UTC | #5
* Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> On 08.07.2015 at 12:14, Dr. David Alan Gilbert wrote:
> > * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> >> On 07.07.2015 at 15:08, Juan Quintela wrote:
> >>> This adds a new section that for now just stores the current qemu state.
> >>>
> >>> Right now, there is only one way to control the state of the
> >>> target after migration:
> >>>
> >>> - If you run the target qemu with -S, it starts stopped.
> >>> - If you run the target qemu without -S, it starts running as soon as migration finishes.
> >>>
> >>> The problem is what happens if we start the target without -S and
> >>> an error during migration (e.g. -EIO) changes the current state.
> >>> Migration would still complete (note that the error happened doing
> >>> block IO or network IO, i.e. nothing related to migration itself),
> >>> and when it finishes we would just "continue" running on the
> >>> destination, probably hanging the guest or corrupting data.
> >>>
> >>> Signed-off-by: Juan Quintela <quintela@redhat.com>
> >>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> >>
> >> This is bisected to cause a regression on s390.
> >>
> >> A guest restarts (booting) after managedsave/start instead of continuing.
> >>
> >> Do you have any idea what might be wrong?
> > 
> > I'd add some debug to the pre_save and post_load to see what state value is
> > being saved/restored.
> > 
> > Also, does that regression happen when doing the save/restore using the same/latest
> > git, or is it a load from an older version?
> 
> Seems to happen only with some guest definitions, but I can't really pinpoint it yet.
> e.g. removing queues='4' from my network card solved it for a reduced xml, but
> doing the same on a bigger xml was not enough :-/

Nasty;  Still the 'paused' value in the pre-save/post-load feels right.
I've read through the patch again and it still feels right to me, so I don't
see anything obvious.

Perhaps it's worth turning on the migration tracing on both sides and seeing what's
different with that 'queues=4' ?

Dave

--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Christian Borntraeger July 8, 2015, 10:54 a.m. UTC | #6
On 08.07.2015 at 12:43, Dr. David Alan Gilbert wrote:
> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
>> On 08.07.2015 at 12:14, Dr. David Alan Gilbert wrote:
>>> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
>>>> On 07.07.2015 at 15:08, Juan Quintela wrote:
>>>>> This adds a new section that for now just stores the current qemu state.
>>>>>
>>>>> Right now, there is only one way to control the state of the
>>>>> target after migration:
>>>>>
>>>>> - If you run the target qemu with -S, it starts stopped.
>>>>> - If you run the target qemu without -S, it starts running as soon as migration finishes.
>>>>>
>>>>> The problem is what happens if we start the target without -S and
>>>>> an error during migration (e.g. -EIO) changes the current state.
>>>>> Migration would still complete (note that the error happened doing
>>>>> block IO or network IO, i.e. nothing related to migration itself),
>>>>> and when it finishes we would just "continue" running on the
>>>>> destination, probably hanging the guest or corrupting data.
>>>>>
>>>>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>>>>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>>>
>>>> This is bisected to cause a regression on s390.
>>>>
>>>> A guest restarts (booting) after managedsave/start instead of continuing.
>>>>
>>>> Do you have any idea what might be wrong?
>>>
>>> I'd add some debug to the pre_save and post_load to see what state value is
>>> being saved/restored.
>>>
>>> Also, does that regression happen when doing the save/restore using the same/latest
>>> git, or is it a load from an older version?
>>
>> Seems to happen only with some guest definitions, but I can't really pinpoint it yet.
>> e.g. removing queues='4' from my network card solved it for a reduced xml, but
>> doing the same on a bigger xml was not enough :-/
> 
> Nasty;  Still the 'paused' value in the pre-save/post-load feels right.
> I've read through the patch again and it still feels right to me, so I don't
> see anything obvious.
> 
> Perhaps it's worth turning on the migration tracing on both sides and seeing what's
> different with that 'queues=4' ?

Reducing the number of virtio disks also seems to help. I am asking myself if
some devices use the runstate somehow and this change triggers a race.
Juan Quintela July 8, 2015, 11:10 a.m. UTC | #7
Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> On 07.07.2015 at 15:08, Juan Quintela wrote:
>> This adds a new section that for now just stores the current qemu state.
>>
>> Right now, there is only one way to control the state of the
>> target after migration:
>>
>> - If you run the target qemu with -S, it starts stopped.
>> - If you run the target qemu without -S, it starts running as soon as migration finishes.
>>
>> The problem is what happens if we start the target without -S and
>> an error during migration (e.g. -EIO) changes the current state.
>> Migration would still complete (note that the error happened doing
>> block IO or network IO, i.e. nothing related to migration itself),
>> and when it finishes we would just "continue" running on the
>> destination, probably hanging the guest or corrupting data.
>> 
>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>
> This is bisected to cause a regression on s390.
>
> A guest restarts (booting) after managedsave/start instead of continuing.
>
> Do you have any idea what might be wrong?


managedsave is basically:

migrate_set_speed INT64_MAX
migrate to fd

So it makes exactly zero sense that it is failing.
That it fails only on s390 is even weirder, because this patch just adds
a new section, nothing more, nothing else.  If you are running the same
binary on both sides, it should just be ok.

Do you have a trace/log of how it fails?

Thanks, Juan.


>
>> ---
>>  include/migration/migration.h |   1 +
>>  migration/migration.c         | 105 +++++++++++++++++++++++++++++++++++++++---
>>  trace-events                  |   3 ++
>>  vl.c                          |   1 +
>>  4 files changed, 103 insertions(+), 7 deletions(-)
>> 
>> diff --git a/include/migration/migration.h b/include/migration/migration.h
>> index afba233..86df6cc 100644
>> --- a/include/migration/migration.h
>> +++ b/include/migration/migration.h
>> @@ -197,4 +197,5 @@ size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
>> 
>>  void ram_mig_init(void);
>>  void savevm_skip_section_footers(void);
>> +void register_global_state(void);
>>  #endif
>> diff --git a/migration/migration.c b/migration/migration.c
>> index c6ac08a..5e436f7 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -26,6 +26,7 @@
>>  #include "qemu/thread.h"
>>  #include "qmp-commands.h"
>>  #include "trace.h"
>> +#include "qapi/util.h"
>> 
>>  #define MAX_THROTTLE  (32 << 20)      /* Migration speed throttling */
>> 
>> @@ -97,6 +98,83 @@ void migration_incoming_state_destroy(void)
>>      mis_current = NULL;
>>  }
>> 
>> +
>> +typedef struct {
>> +    uint32_t size;
>> +    uint8_t runstate[100];
>> +} GlobalState;
>> +
>> +static GlobalState global_state;
>> +
>> +static int global_state_store(void)
>> +{
>> +    if (!runstate_store((char *)global_state.runstate,
>> +                        sizeof(global_state.runstate))) {
>> +        error_report("runstate name too big: %s", global_state.runstate);
>> +        trace_migrate_state_too_big();
>> +        return -EINVAL;
>> +    }
>> +    return 0;
>> +}
>> +
>> +static char *global_state_get_runstate(void)
>> +{
>> +    return (char *)global_state.runstate;
>> +}
>> +
>> +static int global_state_post_load(void *opaque, int version_id)
>> +{
>> +    GlobalState *s = opaque;
>> +    int ret = 0;
>> +    char *runstate = (char *)s->runstate;
>> +
>> +    trace_migrate_global_state_post_load(runstate);
>> +
>> +    if (strcmp(runstate, "running") != 0) {
>> +        Error *local_err = NULL;
>> +        int r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE_MAX,
>> +                                -1, &local_err);
>> +
>> +        if (r == -1) {
>> +            if (local_err) {
>> +                error_report_err(local_err);
>> +            }
>> +            return -EINVAL;
>> +        }
>> +        ret = vm_stop_force_state(r);
>> +    }
>> +
>> +    return ret;
>> +}
>> +
>> +static void global_state_pre_save(void *opaque)
>> +{
>> +    GlobalState *s = opaque;
>> +
>> +    trace_migrate_global_state_pre_save((char *)s->runstate);
>> +    s->size = strlen((char *)s->runstate) + 1;
>> +}
>> +
>> +static const VMStateDescription vmstate_globalstate = {
>> +    .name = "globalstate",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .post_load = global_state_post_load,
>> +    .pre_save = global_state_pre_save,
>> +    .fields = (VMStateField[]) {
>> +        VMSTATE_UINT32(size, GlobalState),
>> +        VMSTATE_BUFFER(runstate, GlobalState),
>> +        VMSTATE_END_OF_LIST()
>> +    },
>> +};
>> +
>> +void register_global_state(void)
>> +{
>> +    /* We will use it whether or not we receive it over the wire */
>> +    strcpy((char *)&global_state.runstate, "");
>> +    vmstate_register(NULL, 0, &vmstate_globalstate, &global_state);
>> +}
>> +
>>  /*
>>   * Called on -incoming with a defer: uri.
>>   * The migration can be started later after any parameters have been
>> @@ -164,10 +242,20 @@ static void process_incoming_migration_co(void *opaque)
>>          exit(EXIT_FAILURE);
>>      }
>> 
>> -    if (autostart) {
>> +    /* runstate == "" means that we haven't received it over the
>> +     * wire, so we obey autostart.  runstate == "running" means that
>> +     * we need to start the guest, and we need to make sure we do it
>> +     * after everything else has finished.  Every other state change
>> +     * is done in the post_load function */
>> +
>> +    if (strcmp(global_state_get_runstate(), "running") == 0) {
>>          vm_start();
>> -    } else {
>> -        runstate_set(RUN_STATE_PAUSED);
>> +    } else if (strcmp(global_state_get_runstate(), "") == 0) {
>> +        if (autostart) {
>> +            vm_start();
>> +        } else {
>> +            runstate_set(RUN_STATE_PAUSED);
>> +        }
>>      }
>>      migrate_decompress_threads_join();
>>  }
>> @@ -793,10 +881,13 @@ static void *migration_thread(void *opaque)
>>                  qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
>>                  old_vm_running = runstate_is_running();
>> 
>> -                ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
>> -                if (ret >= 0) {
>> -                    qemu_file_set_rate_limit(s->file, INT64_MAX);
>> -                    qemu_savevm_state_complete(s->file);
>> +                ret = global_state_store();
>> +                if (!ret) {
>> +                    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
>> +                    if (ret >= 0) {
>> +                        qemu_file_set_rate_limit(s->file, INT64_MAX);
>> +                        qemu_savevm_state_complete(s->file);
>> +                    }
>>                  }
>>                  qemu_mutex_unlock_iothread();
>> 
>> diff --git a/trace-events b/trace-events
>> index a38dd2e..c0111d0 100644
>> --- a/trace-events
>> +++ b/trace-events
>> @@ -1403,6 +1403,9 @@ migrate_fd_error(void) ""
>>  migrate_fd_cancel(void) ""
>>  migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64
>>  migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
>> +migrate_state_too_big(void) ""
>> +migrate_global_state_post_load(const char *state) "loaded state: %s"
>> +migrate_global_state_pre_save(const char *state) "saved state: %s"
>> 
>>  # migration/rdma.c
>>  qemu_rdma_accept_incoming_migration(void) ""
>> diff --git a/vl.c b/vl.c
>> index 19a8737..cfa6133 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -4624,6 +4624,7 @@ int main(int argc, char **argv, char **envp)
>>          return 0;
>>      }
>> 
>> +    register_global_state();
>>      if (incoming) {
>>          Error *local_err = NULL;
>>          qemu_start_incoming_migration(incoming, &local_err);
>>
Dr. David Alan Gilbert July 8, 2015, 11:14 a.m. UTC | #8
* Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> On 08.07.2015 at 12:43, Dr. David Alan Gilbert wrote:
> > * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> >> On 08.07.2015 at 12:14, Dr. David Alan Gilbert wrote:
> >>> * Christian Borntraeger (borntraeger@de.ibm.com) wrote:
> >>>> On 07.07.2015 at 15:08, Juan Quintela wrote:
> >>>>> This adds a new section that for now just stores the current qemu state.
> >>>>>
> >>>>> Right now, there is only one way to control the state of the
> >>>>> target after migration:
> >>>>>
> >>>>> - If you run the target qemu with -S, it starts stopped.
> >>>>> - If you run the target qemu without -S, it starts running as soon as migration finishes.
> >>>>>
> >>>>> The problem is what happens if we start the target without -S and
> >>>>> an error during migration (e.g. -EIO) changes the current state.
> >>>>> Migration would still complete (note that the error happened doing
> >>>>> block IO or network IO, i.e. nothing related to migration itself),
> >>>>> and when it finishes we would just "continue" running on the
> >>>>> destination, probably hanging the guest or corrupting data.
> >>>>>
> >>>>> Signed-off-by: Juan Quintela <quintela@redhat.com>
> >>>>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
> >>>>
> >>>> This is bisected to cause a regression on s390.
> >>>>
> >>>> A guest restarts (booting) after managedsave/start instead of continuing.
> >>>>
> >>>> Do you have any idea what might be wrong?
> >>>
> >>> I'd add some debug to the pre_save and post_load to see what state value is
> >>> being saved/restored.
> >>>
> >>> Also, does that regression happen when doing the save/restore using the same/latest
> >>> git, or is it a load from an older version?
> >>
> >> Seems to happen only with some guest definitions, but I can't really pinpoint it yet.
> >> e.g. removing queues='4' from my network card solved it for a reduced xml, but
> >> doing the same on a bigger xml was not enough :-/
> > 
> > Nasty;  Still the 'paused' value in the pre-save/post-load feels right.
> > I've read through the patch again and it still feels right to me, so I don't
> > see anything obvious.
> > 
> > Perhaps it's worth turning on the migration tracing on both sides and seeing what's
> > different with that 'queues=4' ?
> 
> Reducing the number of virtio disks also seems to help. I am asking myself if
> some devices use the runstate somehow and this change triggers a race.

I'm not sure why it would make a difference, but...
The difference this patch makes is that for a paused guest the runstate is
set to 'paused' before all of the other devices have finished loading their
state.  In the old case it would only be transitioned to paused at the end,
after all the other devices had loaded.
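
Schematically (a sketch, assuming "globalstate" is not the last section
in the stream):

    old:  load all sections -> migration done -> runstate_set(RUN_STATE_PAUSED)
    new:  ... -> load "globalstate" (post_load calls vm_stop_force_state)
             -> remaining sections load with the runstate already "paused"

so any load-time code that consults the runstate can now see a
different answer than before.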

Dave
> 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Juan Quintela July 8, 2015, 12:08 p.m. UTC | #9
Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> On 07.07.2015 at 15:08, Juan Quintela wrote:
>> This adds a new section that for now just stores the current qemu state.
>>
>> Right now, there is only one way to control the state of the
>> target after migration:
>>
>> - If you run the target qemu with -S, it starts stopped.
>> - If you run the target qemu without -S, it starts running as soon as migration finishes.
>>
>> The problem is what happens if we start the target without -S and
>> an error during migration (e.g. -EIO) changes the current state.
>> Migration would still complete (note that the error happened doing
>> block IO or network IO, i.e. nothing related to migration itself),
>> and when it finishes we would just "continue" running on the
>> destination, probably hanging the guest or corrupting data.
>> 
>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>
> This is bisected to cause a regression on s390.
>
> A guest restarts (booting) after managedsave/start instead of continuing.
>
> Do you have any idea what might be wrong?

Can you check the new patch series that I sent?  There is a fix that
*could* help there.  *could* because I don't fully understand why it can
give you problems (and even then only sometimes).  The current guess is
that some of the devices are testing the guest state on LOAD, hence that
patch.
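
For illustration, the kind of pattern being guessed at (a hypothetical
device, not code from the tree):

    static int foo_post_load(void *opaque, int version_id)
    {
        FooState *s = opaque;

        /* With the new section, the runstate may already have been
         * forced to "paused" by global_state_post_load() by the time
         * we get here; before, it still held the incoming default. */
        if (runstate_is_running()) {
            foo_start_processing(s);
        }
        return 0;
    }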

Please, test.

Later, Juan.
Christian Borntraeger July 8, 2015, 12:17 p.m. UTC | #10
On 08.07.2015 at 14:08, Juan Quintela wrote:
> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
>> On 07.07.2015 at 15:08, Juan Quintela wrote:
>>> This adds a new section that for now just stores the current qemu state.
>>>
>>> Right now, there is only one way to control the state of the
>>> target after migration:
>>>
>>> - If you run the target qemu with -S, it starts stopped.
>>> - If you run the target qemu without -S, it starts running as soon as migration finishes.
>>>
>>> The problem is what happens if we start the target without -S and
>>> an error during migration (e.g. -EIO) changes the current state.
>>> Migration would still complete (note that the error happened doing
>>> block IO or network IO, i.e. nothing related to migration itself),
>>> and when it finishes we would just "continue" running on the
>>> destination, probably hanging the guest or corrupting data.
>>>
>>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>
>> This is bisected to cause a regression on s390.
>>
>> A guest restarts (booting) after managedsave/start instead of continuing.
>>
>> Do you have any idea what might be wrong?
> 
> Can you check the new patch series that I sent?  There is a fix that
> *could* help there.  *could* because I don't fully understand why it can
> give you problems (and even then only sometimes).  The current guess is
> that some of the devices are testing the guest state on LOAD, hence that
> patch.
> 
> Please, test.

That patch does indeed fix my problem.
I can see that virtio_init uses the runstate to set vm_running of the vdev. This
is used in virtio-net for several aspects.
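
(From memory, roughly this line in virtio_init, worth double-checking
the exact spot:

    vdev->vm_running = runstate_is_running();

so whatever the runstate is at that point is what virtio-net later sees
in vdev->vm_running.)
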
But I really don't understand why this causes the symptoms.
So I am tempted to add a

Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>

but I have a bad feeling about the "why" :-/



Christian
Juan Quintela July 8, 2015, 12:25 p.m. UTC | #11
Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> On 08.07.2015 at 14:08, Juan Quintela wrote:
>> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
>>> On 07.07.2015 at 15:08, Juan Quintela wrote:
>>>> This adds a new section that for now just stores the current qemu state.
>>>>
>>>> Right now, there is only one way to control the state of the
>>>> target after migration:
>>>>
>>>> - If you run the target qemu with -S, it starts stopped.
>>>> - If you run the target qemu without -S, it starts running as soon as migration finishes.
>>>>
>>>> The problem is what happens if we start the target without -S and
>>>> an error during migration (e.g. -EIO) changes the current state.
>>>> Migration would still complete (note that the error happened doing
>>>> block IO or network IO, i.e. nothing related to migration itself),
>>>> and when it finishes we would just "continue" running on the
>>>> destination, probably hanging the guest or corrupting data.
>>>>
>>>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>>>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>>
>>> This is bisected to cause a regression on s390.
>>>
>>> A guest restarts (booting) after managedsave/start instead of continuing.
>>>
>>> Do you have any idea what might be wrong?
>> 
>> Can you check the new patch series that I sent?  There is a fix that
>> *could* help there.  *could* because I don't fully understand why it can
>> give you problems (and even then only sometimes).  The current guess is
>> that some of the devices are testing the guest state on LOAD, hence that
>> patch.
>> 
>> Please, test.
>
> That patch does indeed fix my problem.
> I can see that virtio_init uses the runstate to set vm_running of the vdev. This
> is used in virtio-net for several aspects.
> But I really don't understand why this causes the symptoms.
> So I am tempted to add a
>
> Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
>
> but I have a bad feeling about the "why" :-/

The other reason for that patch is that it removes the requirement that
global_state be the last migration section.  Could it be that you are
adding sections after you launch qemu?  Or that virtio devices are
created later on s390 for some reason?

Later, Juan.
Christian Borntraeger July 8, 2015, 12:34 p.m. UTC | #12
On 08.07.2015 at 14:25, Juan Quintela wrote:
> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
>> On 08.07.2015 at 14:08, Juan Quintela wrote:
>>> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
>>>> On 07.07.2015 at 15:08, Juan Quintela wrote:
>>>>> This adds a new section that for now just stores the current qemu state.
>>>>>
>>>>> Right now, there is only one way to control the state of the
>>>>> target after migration:
>>>>>
>>>>> - If you run the target qemu with -S, it starts stopped.
>>>>> - If you run the target qemu without -S, it starts running as soon as migration finishes.
>>>>>
>>>>> The problem is what happens if we start the target without -S and
>>>>> an error during migration (e.g. -EIO) changes the current state.
>>>>> Migration would still complete (note that the error happened doing
>>>>> block IO or network IO, i.e. nothing related to migration itself),
>>>>> and when it finishes we would just "continue" running on the
>>>>> destination, probably hanging the guest or corrupting data.
>>>>>
>>>>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>>>>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>>>
>>>> This is bisected to cause a regression on s390.
>>>>
>>>> A guest restarts (booting) after managedsave/start instead of continuing.
>>>>
>>>> Do you have any idea what might be wrong?
>>>
>>> Can you check the new patch series that I sent?  There is a fix that
>>> *could* help there.  *could* because I don't fully understand why it can
>>> give you problems (and even then only sometimes).  The current guess is
>>> that some of the devices are testing the guest state on LOAD, hence that
>>> patch.
>>>
>>> Please, test.
>>
>> That patch does indeed fix my problem.
>> I can see that virtio_init uses the runstate to set vm_running of the vdev. This
>> is used in virtio-net for several aspects.
>> But I really don't understand why this causes the symptoms.
>> So I am tempted to add a
>>
>> Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
>>
>> but I have a bad feeling about the "why" :-/
> 
> The other reason for that patch is that it removes the requirement that
> global_state be the last migration section.

Hmm, we have some register_savevm here and there, but these seem
to be called in device realize/init or machine init.


> Could it be that you are
> adding sections after you launch qemu?  Or that virtio devices are
> created later on s390 for some reason?

Not that I am aware of. There can be hotplug of course, but I don't see
why libvirt should do that during migration.

Still looking.....
Juan Quintela July 8, 2015, 12:51 p.m. UTC | #13
Christian Borntraeger <borntraeger@de.ibm.com> wrote:
> On 08.07.2015 at 14:25, Juan Quintela wrote:
>> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
>>> On 08.07.2015 at 14:08, Juan Quintela wrote:
>>>> Christian Borntraeger <borntraeger@de.ibm.com> wrote:
>>>>> On 07.07.2015 at 15:08, Juan Quintela wrote:
>>>>>> This adds a new section that for now just stores the current qemu state.
>>>>>>
>>>>>> Right now, there is only one way to control the state of the
>>>>>> target after migration:
>>>>>>
>>>>>> - If you run the target qemu with -S, it starts stopped.
>>>>>> - If you run the target qemu without -S, it starts running as soon as migration finishes.
>>>>>>
>>>>>> The problem is what happens if we start the target without -S and
>>>>>> an error during migration (e.g. -EIO) changes the current state.
>>>>>> Migration would still complete (note that the error happened doing
>>>>>> block IO or network IO, i.e. nothing related to migration itself),
>>>>>> and when it finishes we would just "continue" running on the
>>>>>> destination, probably hanging the guest or corrupting data.
>>>>>>
>>>>>> Signed-off-by: Juan Quintela <quintela@redhat.com>
>>>>>> Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>
>>>>>
>>>>> This is bisected to cause a regression on s390.
>>>>>
>>>>> A guest restarts (booting) after managedsave/start instead of continuing.
>>>>>
>>>>> Do you have any idea what might be wrong?
>>>>
>>>> Can you check the new patch series that I sent?  There is a fix that
>>>> *could* help there.  *could* because I don't fully understand why it can
>>>> give you problems (and even then only sometimes).  The current guess is
>>>> that some of the devices are testing the guest state on LOAD, hence that
>>>> patch.
>>>>
>>>> Please, test.
>>>
>>> That patch does indeed fix my problem.
>>> I can see that virtio_init uses the runstate to set vm_running of the vdev. This
>>> is used in virtio-net for several aspects.
>>> But I really don't understand why this causes the symptoms.
>>> So I am tempted to add a
>>>
>>> Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
>>>
>>> but I have a bad feeling about the "why" :-/
>> 
>> The other reason for that patch is that it removes the requirement that
>> global_state be the last migration section.
>
> Hmm, we have some register_savevm here and there, but these seem
> to be called in device realize/init or machine init.
>
>
>> Could it be that you are
>> adding sections after you launch qemu?  Or that virtio devices are
>> created later on s390 for some reason?
>
> Not that I am aware of. There can be hotplug of course, but I don't see
> why libvirt should do that during migration.

It can also happen if libvirt plugs the devices after we have registered
the global save state.  On x86_64 that doesn't happen, but I am not sure
how things are structured on s390.

If it was that, it has been fixed with the patch that I sent.


>
> Still looking.....

Thanks, Juan.

Patch

diff --git a/include/migration/migration.h b/include/migration/migration.h
index afba233..86df6cc 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -197,4 +197,5 @@  size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,

 void ram_mig_init(void);
 void savevm_skip_section_footers(void);
+void register_global_state(void);
 #endif
diff --git a/migration/migration.c b/migration/migration.c
index c6ac08a..5e436f7 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -26,6 +26,7 @@ 
 #include "qemu/thread.h"
 #include "qmp-commands.h"
 #include "trace.h"
+#include "qapi/util.h"

 #define MAX_THROTTLE  (32 << 20)      /* Migration speed throttling */

@@ -97,6 +98,83 @@  void migration_incoming_state_destroy(void)
     mis_current = NULL;
 }

+
+typedef struct {
+    uint32_t size;
+    uint8_t runstate[100];
+} GlobalState;
+
+static GlobalState global_state;
+
+static int global_state_store(void)
+{
+    if (!runstate_store((char *)global_state.runstate,
+                        sizeof(global_state.runstate))) {
+        error_report("runstate name too big: %s", global_state.runstate);
+        trace_migrate_state_too_big();
+        return -EINVAL;
+    }
+    return 0;
+}
+
+static char *global_state_get_runstate(void)
+{
+    return (char *)global_state.runstate;
+}
+
+static int global_state_post_load(void *opaque, int version_id)
+{
+    GlobalState *s = opaque;
+    int ret = 0;
+    char *runstate = (char *)s->runstate;
+
+    trace_migrate_global_state_post_load(runstate);
+
+    if (strcmp(runstate, "running") != 0) {
+        Error *local_err = NULL;
+        int r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE_MAX,
+                                -1, &local_err);
+
+        if (r == -1) {
+            if (local_err) {
+                error_report_err(local_err);
+            }
+            return -EINVAL;
+        }
+        ret = vm_stop_force_state(r);
+    }
+
+    return ret;
+}
+
+static void global_state_pre_save(void *opaque)
+{
+    GlobalState *s = opaque;
+
+    trace_migrate_global_state_pre_save((char *)s->runstate);
+    s->size = strlen((char *)s->runstate) + 1;
+}
+
+static const VMStateDescription vmstate_globalstate = {
+    .name = "globalstate",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .post_load = global_state_post_load,
+    .pre_save = global_state_pre_save,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32(size, GlobalState),
+        VMSTATE_BUFFER(runstate, GlobalState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
+void register_global_state(void)
+{
+    /* We will use it whether or not we receive it over the wire */
+    strcpy((char *)&global_state.runstate, "");
+    vmstate_register(NULL, 0, &vmstate_globalstate, &global_state);
+}
+
 /*
  * Called on -incoming with a defer: uri.
  * The migration can be started later after any parameters have been
@@ -164,10 +242,20 @@  static void process_incoming_migration_co(void *opaque)
         exit(EXIT_FAILURE);
     }

-    if (autostart) {
+    /* runstate == "" means that we haven't received it over the
+     * wire, so we obey autostart.  runstate == "running" means that
+     * we need to start the guest, and we need to make sure we do it
+     * after everything else has finished.  Every other state change
+     * is done in the post_load function */
+
+    if (strcmp(global_state_get_runstate(), "running") == 0) {
         vm_start();
-    } else {
-        runstate_set(RUN_STATE_PAUSED);
+    } else if (strcmp(global_state_get_runstate(), "") == 0) {
+        if (autostart) {
+            vm_start();
+        } else {
+            runstate_set(RUN_STATE_PAUSED);
+        }
     }
     migrate_decompress_threads_join();
 }
@@ -793,10 +881,13 @@  static void *migration_thread(void *opaque)
                 qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
                 old_vm_running = runstate_is_running();

-                ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
-                if (ret >= 0) {
-                    qemu_file_set_rate_limit(s->file, INT64_MAX);
-                    qemu_savevm_state_complete(s->file);
+                ret = global_state_store();
+                if (!ret) {
+                    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+                    if (ret >= 0) {
+                        qemu_file_set_rate_limit(s->file, INT64_MAX);
+                        qemu_savevm_state_complete(s->file);
+                    }
                 }
                 qemu_mutex_unlock_iothread();

diff --git a/trace-events b/trace-events
index a38dd2e..c0111d0 100644
--- a/trace-events
+++ b/trace-events
@@ -1403,6 +1403,9 @@  migrate_fd_error(void) ""
 migrate_fd_cancel(void) ""
 migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64
 migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
+migrate_state_too_big(void) ""
+migrate_global_state_post_load(const char *state) "loaded state: %s"
+migrate_global_state_pre_save(const char *state) "saved state: %s"

 # migration/rdma.c
 qemu_rdma_accept_incoming_migration(void) ""
diff --git a/vl.c b/vl.c
index 19a8737..cfa6133 100644
--- a/vl.c
+++ b/vl.c
@@ -4624,6 +4624,7 @@  int main(int argc, char **argv, char **envp)
         return 0;
     }

+    register_global_state();
     if (incoming) {
         Error *local_err = NULL;
         qemu_start_incoming_migration(incoming, &local_err);