[RFC,v2,10/33] migration: allow dst vm pause on postcopy

Message ID 1504081950-2528-11-git-send-email-peterx@redhat.com
State New
Series Migration: postcopy failure recovery

Commit Message

Peter Xu Aug. 30, 2017, 8:32 a.m. UTC
When there is an IO error on the incoming channel (e.g., network down),
instead of bailing out immediately, we allow the dst vm to switch to the
new POSTCOPY_PAUSE state. Currently it is still simple - it waits on the
new semaphore until someone pokes it for another attempt.

Signed-off-by: Peter Xu <peterx@redhat.com>
---
 migration/migration.c  |  1 +
 migration/migration.h  |  3 +++
 migration/savevm.c     | 60 ++++++++++++++++++++++++++++++++++++++++++++++++--
 migration/trace-events |  2 ++
 4 files changed, 64 insertions(+), 2 deletions(-)

Comments

Dr. David Alan Gilbert Sept. 21, 2017, 7:29 p.m. UTC | #1
* Peter Xu (peterx@redhat.com) wrote:
> When there is an IO error on the incoming channel (e.g., network down),
> instead of bailing out immediately, we allow the dst vm to switch to the
> new POSTCOPY_PAUSE state. Currently it is still simple - it waits on the
> new semaphore until someone pokes it for another attempt.
> 
> Signed-off-by: Peter Xu <peterx@redhat.com>
> ---
>  migration/migration.c  |  1 +
>  migration/migration.h  |  3 +++
>  migration/savevm.c     | 60 ++++++++++++++++++++++++++++++++++++++++++++++++--
>  migration/trace-events |  2 ++
>  4 files changed, 64 insertions(+), 2 deletions(-)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 8d26ea8..80de212 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -146,6 +146,7 @@ MigrationIncomingState *migration_incoming_get_current(void)
>          memset(&mis_current, 0, sizeof(MigrationIncomingState));
>          qemu_mutex_init(&mis_current.rp_mutex);
>          qemu_event_init(&mis_current.main_thread_load_event, false);
> +        qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0);
>          once = true;
>      }
>      return &mis_current;
> diff --git a/migration/migration.h b/migration/migration.h
> index 0c957c9..c423682 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -60,6 +60,9 @@ struct MigrationIncomingState {
>      /* The coroutine we should enter (back) after failover */
>      Coroutine *migration_incoming_co;
>      QemuSemaphore colo_incoming_sem;
> +
> +    /* notify PAUSED postcopy incoming migrations to try to continue */
> +    QemuSemaphore postcopy_pause_sem_dst;
>  };
>  
>  MigrationIncomingState *migration_incoming_get_current(void);
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 7172f14..3777124 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1488,8 +1488,8 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
>   */
>  static void *postcopy_ram_listen_thread(void *opaque)
>  {
> -    QEMUFile *f = opaque;
>      MigrationIncomingState *mis = migration_incoming_get_current();
> +    QEMUFile *f = mis->from_src_file;
>      int load_res;
>  
>      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> @@ -1503,6 +1503,14 @@ static void *postcopy_ram_listen_thread(void *opaque)
>       */
>      qemu_file_set_blocking(f, true);
>      load_res = qemu_loadvm_state_main(f, mis);
> +
> +    /*
> +     * This is tricky, but, mis->from_src_file can change after it
> +     * returns, when postcopy recovery happened. In the future, we may
> +     * want a wrapper for the QEMUFile handle.
> +     */
> +    f = mis->from_src_file;
> +
>      /* And non-blocking again so we don't block in any cleanup */
>      qemu_file_set_blocking(f, false);
>  
> @@ -1581,7 +1589,7 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
>      /* Start up the listening thread and wait for it to signal ready */
>      qemu_sem_init(&mis->listen_thread_sem, 0);
>      qemu_thread_create(&mis->listen_thread, "postcopy/listen",
> -                       postcopy_ram_listen_thread, mis->from_src_file,
> +                       postcopy_ram_listen_thread, NULL,
>                         QEMU_THREAD_DETACHED);
>      qemu_sem_wait(&mis->listen_thread_sem);
>      qemu_sem_destroy(&mis->listen_thread_sem);
> @@ -1966,11 +1974,44 @@ void qemu_loadvm_state_cleanup(void)
>      }
>  }
>  
> +/* Return true if we should continue the migration, or false. */
> +static bool postcopy_pause_incoming(MigrationIncomingState *mis)
> +{
> +    trace_postcopy_pause_incoming();
> +
> +    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> +                      MIGRATION_STATUS_POSTCOPY_PAUSED);
> +
> +    assert(mis->from_src_file);
> +    qemu_file_shutdown(mis->from_src_file);
> +    qemu_fclose(mis->from_src_file);
> +    mis->from_src_file = NULL;
> +
> +    assert(mis->to_src_file);
> +    qemu_mutex_lock(&mis->rp_mutex);
> +    qemu_file_shutdown(mis->to_src_file);

Should you not do the shutdown() before the lock?
For example if the other thread is stuck, with rp_mutex
held, trying to write to to_src_file, then you'll block
waiting for the mutex.  If you call shutdown and then take
the lock, the other thread will error and release the lock.
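
A minimal sketch of that ordering (reusing the names from the patch
above; an illustration of the suggestion, not tested code):

--------------
    /* shutdown() first, before taking rp_mutex: any thread blocked
     * in a send() on to_src_file errors out and releases the lock,
     * so the lock acquisition below cannot get stuck behind it. */
    qemu_file_shutdown(mis->to_src_file);

    qemu_mutex_lock(&mis->rp_mutex);
    qemu_fclose(mis->to_src_file);
    mis->to_src_file = NULL;
    qemu_mutex_unlock(&mis->rp_mutex);
--------------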

I'm not quite sure what will happen if we end up calling this
before the main thread has returned from postcopy and the
device loading is complete.

Also, at this point have we guaranteed no one else is about
to do an op on mis->to_src_file and will seg?

Dave

> +    qemu_fclose(mis->to_src_file);
> +    mis->to_src_file = NULL;
> +    qemu_mutex_unlock(&mis->rp_mutex);
> +
> +    error_report("Detected IO failure for postcopy. "
> +                 "Migration paused.");
> +
> +    while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
> +        qemu_sem_wait(&mis->postcopy_pause_sem_dst);
> +    }
> +
> +    trace_postcopy_pause_incoming_continued();
> +
> +    return true;
> +}
> +
>  static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
>  {
>      uint8_t section_type;
>      int ret = 0;
>  
> +retry:
>      while (true) {
>          section_type = qemu_get_byte(f);
>  
> @@ -2016,6 +2057,21 @@ static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
>  out:
>      if (ret < 0) {
>          qemu_file_set_error(f, ret);
> +
> +        /*
> +         * Detect whether it is:
> +         *
> +         * 1. postcopy running
> +         * 2. network failure (-EIO)
> +         *
> +         * If so, we try to wait for a recovery.
> +         */
> +        if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE &&
> +            ret == -EIO && postcopy_pause_incoming(mis)) {
> +            /* Reset f to point to the newly created channel */
> +            f = mis->from_src_file;
> +            goto retry;
> +        }
>      }
>      return ret;
>  }
> diff --git a/migration/trace-events b/migration/trace-events
> index 907564b..7764c6f 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -99,6 +99,8 @@ open_return_path_on_source(void) ""
>  open_return_path_on_source_continue(void) ""
>  postcopy_start(void) ""
>  postcopy_pause_continued(void) ""
> +postcopy_pause_incoming(void) ""
> +postcopy_pause_incoming_continued(void) ""
>  postcopy_start_set_run(void) ""
>  source_return_path_thread_bad_end(void) ""
>  source_return_path_thread_end(void) ""
> -- 
> 2.7.4
> 
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Xu Sept. 27, 2017, 7:34 a.m. UTC | #2
On Thu, Sep 21, 2017 at 08:29:03PM +0100, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > When there is an IO error on the incoming channel (e.g., network down),
> > instead of bailing out immediately, we allow the dst vm to switch to the
> > new POSTCOPY_PAUSE state. Currently it is still simple - it waits on the
> > new semaphore until someone pokes it for another attempt.
> > 
> > Signed-off-by: Peter Xu <peterx@redhat.com>
> > ---
> >  migration/migration.c  |  1 +
> >  migration/migration.h  |  3 +++
> >  migration/savevm.c     | 60 ++++++++++++++++++++++++++++++++++++++++++++++++--
> >  migration/trace-events |  2 ++
> >  4 files changed, 64 insertions(+), 2 deletions(-)
> > 
> > diff --git a/migration/migration.c b/migration/migration.c
> > index 8d26ea8..80de212 100644
> > --- a/migration/migration.c
> > +++ b/migration/migration.c
> > @@ -146,6 +146,7 @@ MigrationIncomingState *migration_incoming_get_current(void)
> >          memset(&mis_current, 0, sizeof(MigrationIncomingState));
> >          qemu_mutex_init(&mis_current.rp_mutex);
> >          qemu_event_init(&mis_current.main_thread_load_event, false);
> > +        qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0);
> >          once = true;
> >      }
> >      return &mis_current;
> > diff --git a/migration/migration.h b/migration/migration.h
> > index 0c957c9..c423682 100644
> > --- a/migration/migration.h
> > +++ b/migration/migration.h
> > @@ -60,6 +60,9 @@ struct MigrationIncomingState {
> >      /* The coroutine we should enter (back) after failover */
> >      Coroutine *migration_incoming_co;
> >      QemuSemaphore colo_incoming_sem;
> > +
> > +    /* notify PAUSED postcopy incoming migrations to try to continue */
> > +    QemuSemaphore postcopy_pause_sem_dst;
> >  };
> >  
> >  MigrationIncomingState *migration_incoming_get_current(void);
> > diff --git a/migration/savevm.c b/migration/savevm.c
> > index 7172f14..3777124 100644
> > --- a/migration/savevm.c
> > +++ b/migration/savevm.c
> > @@ -1488,8 +1488,8 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
> >   */
> >  static void *postcopy_ram_listen_thread(void *opaque)
> >  {
> > -    QEMUFile *f = opaque;
> >      MigrationIncomingState *mis = migration_incoming_get_current();
> > +    QEMUFile *f = mis->from_src_file;
> >      int load_res;
> >  
> >      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> > @@ -1503,6 +1503,14 @@ static void *postcopy_ram_listen_thread(void *opaque)
> >       */
> >      qemu_file_set_blocking(f, true);
> >      load_res = qemu_loadvm_state_main(f, mis);
> > +
> > +    /*
> > +     * This is tricky, but, mis->from_src_file can change after it
> > +     * returns, when postcopy recovery happened. In the future, we may
> > +     * want a wrapper for the QEMUFile handle.
> > +     */
> > +    f = mis->from_src_file;
> > +
> >      /* And non-blocking again so we don't block in any cleanup */
> >      qemu_file_set_blocking(f, false);
> >  
> > @@ -1581,7 +1589,7 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
> >      /* Start up the listening thread and wait for it to signal ready */
> >      qemu_sem_init(&mis->listen_thread_sem, 0);
> >      qemu_thread_create(&mis->listen_thread, "postcopy/listen",
> > -                       postcopy_ram_listen_thread, mis->from_src_file,
> > +                       postcopy_ram_listen_thread, NULL,
> >                         QEMU_THREAD_DETACHED);
> >      qemu_sem_wait(&mis->listen_thread_sem);
> >      qemu_sem_destroy(&mis->listen_thread_sem);
> > @@ -1966,11 +1974,44 @@ void qemu_loadvm_state_cleanup(void)
> >      }
> >  }
> >  
> > +/* Return true if we should continue the migration, or false. */
> > +static bool postcopy_pause_incoming(MigrationIncomingState *mis)
> > +{
> > +    trace_postcopy_pause_incoming();
> > +
> > +    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> > +                      MIGRATION_STATUS_POSTCOPY_PAUSED);
> > +
> > +    assert(mis->from_src_file);
> > +    qemu_file_shutdown(mis->from_src_file);
> > +    qemu_fclose(mis->from_src_file);
> > +    mis->from_src_file = NULL;
> > +
> > +    assert(mis->to_src_file);
> > +    qemu_mutex_lock(&mis->rp_mutex);
> > +    qemu_file_shutdown(mis->to_src_file);
> 
> Should you not do the shutdown() before the lock?
> For example if the other thread is stuck, with rp_mutex
> held, trying to write to to_src_file, then you'll block
> waiting for the mutex.  If you call shutdown and then take
> the lock, the other thread will error and release the lock.

The problem is that IMHO QEMUFile is not yet thread-safe itself.  So
if we operate on it (even to shut it down) logically we need to have
the lock, right?

Then, IMHO the question would be: when will the send() be stuck in the
other thread?

Normally the only case I can think of is that the source didn't recv()
fast enough, and we even consumed the whole write buffer on the dst
side (I don't really know how the kernel manages the buffers though,
e.g. how the buffer size is defined...).

But when we reach here, the channel (say, from_src_file and to_src_file,
since both of them are using the same channel behind the QEMUFile
interface) should already be broken in some way, so IIUC even if there
is a send() in the other thread, it should return at some point with a
failure as well, just like how we reached here (possibly due to a
read() failure).

> 
> I'm not quite sure what will happen if we end up calling this
> before the main thread has returned from postcopy and the
> device loading is complete.

IIUC you mean the time starts from when we got MIG_CMD_PACKAGED until
main thread finishes handling that package?

Normally I think that should not matter much since during handling the
package it should hardly fail (we were reading from a buffer QIO
channel, no real IOs there)...  But I agree about the reasoning.  How
about one more patch to postpone the "active" to "postcopy-active"
state change after the package is handled correctly?  Like:

--------------
diff --git a/migration/savevm.c b/migration/savevm.c                     
index b5c3214034..8317b2a7e2 100644 
--- a/migration/savevm.c            
+++ b/migration/savevm.c            
@@ -1573,8 +1573,6 @@ static void *postcopy_ram_listen_thread(void *opaque)                                                                       
     QEMUFile *f = mis->from_src_file;                                   
     int load_res;                  
                                    
-    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
-                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
     qemu_sem_post(&mis->listen_thread_sem);                             
     trace_postcopy_ram_listen_thread_start();                           
                                    
@@ -1817,6 +1815,9 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)                                                          
     qemu_fclose(packf);            
     object_unref(OBJECT(bioc));    
                                    
+    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
+                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
+                                   
     return ret;                    
 }                                  
--------------

This function will only be called with "postcopy-active" state.

> 
> Also, at this point have we guaranteed no one else is about
> to do an op on mis->to_src_file and will seg?

I think no?  Since IMHO the main thread is playing with the buffer QIO
channel, rather than the real one?
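
For reference, a rough sketch of why that holds: MIG_CMD_PACKAGED
handling pulls the whole package into memory first, so the main
thread's later reads never touch the real socket.  Simplified from
loadvm_handle_cmd_packaged(), with error handling omitted:

--------------
    /* One bulk read drains the whole package off the real channel. */
    length = qemu_get_be32(mis->from_src_file);
    bioc = qio_channel_buffer_new(length);
    qemu_get_buffer(mis->from_src_file, bioc->data, length);
    bioc->usage = length;

    /* Device state is then loaded from the memory-backed channel;
     * the main thread does no further IO on the socket here. */
    packf = qemu_fopen_channel_input(QIO_CHANNEL(bioc));
    ret = qemu_loadvm_state_main(packf, mis);
--------------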

(btw, could I ask what's "seg"? :)
Dr. David Alan Gilbert Oct. 9, 2017, 6:58 p.m. UTC | #3
* Peter Xu (peterx@redhat.com) wrote:
> On Thu, Sep 21, 2017 at 08:29:03PM +0100, Dr. David Alan Gilbert wrote:
> > * Peter Xu (peterx@redhat.com) wrote:
> > > When there is an IO error on the incoming channel (e.g., network down),
> > > instead of bailing out immediately, we allow the dst vm to switch to the
> > > new POSTCOPY_PAUSE state. Currently it is still simple - it waits on the
> > > new semaphore until someone pokes it for another attempt.
> > > 
> > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > ---
> > >  migration/migration.c  |  1 +
> > >  migration/migration.h  |  3 +++
> > >  migration/savevm.c     | 60 ++++++++++++++++++++++++++++++++++++++++++++++++--
> > >  migration/trace-events |  2 ++
> > >  4 files changed, 64 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/migration/migration.c b/migration/migration.c
> > > index 8d26ea8..80de212 100644
> > > --- a/migration/migration.c
> > > +++ b/migration/migration.c
> > > @@ -146,6 +146,7 @@ MigrationIncomingState *migration_incoming_get_current(void)
> > >          memset(&mis_current, 0, sizeof(MigrationIncomingState));
> > >          qemu_mutex_init(&mis_current.rp_mutex);
> > >          qemu_event_init(&mis_current.main_thread_load_event, false);
> > > +        qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0);
> > >          once = true;
> > >      }
> > >      return &mis_current;
> > > diff --git a/migration/migration.h b/migration/migration.h
> > > index 0c957c9..c423682 100644
> > > --- a/migration/migration.h
> > > +++ b/migration/migration.h
> > > @@ -60,6 +60,9 @@ struct MigrationIncomingState {
> > >      /* The coroutine we should enter (back) after failover */
> > >      Coroutine *migration_incoming_co;
> > >      QemuSemaphore colo_incoming_sem;
> > > +
> > > +    /* notify PAUSED postcopy incoming migrations to try to continue */
> > > +    QemuSemaphore postcopy_pause_sem_dst;
> > >  };
> > >  
> > >  MigrationIncomingState *migration_incoming_get_current(void);
> > > diff --git a/migration/savevm.c b/migration/savevm.c
> > > index 7172f14..3777124 100644
> > > --- a/migration/savevm.c
> > > +++ b/migration/savevm.c
> > > @@ -1488,8 +1488,8 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
> > >   */
> > >  static void *postcopy_ram_listen_thread(void *opaque)
> > >  {
> > > -    QEMUFile *f = opaque;
> > >      MigrationIncomingState *mis = migration_incoming_get_current();
> > > +    QEMUFile *f = mis->from_src_file;
> > >      int load_res;
> > >  
> > >      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> > > @@ -1503,6 +1503,14 @@ static void *postcopy_ram_listen_thread(void *opaque)
> > >       */
> > >      qemu_file_set_blocking(f, true);
> > >      load_res = qemu_loadvm_state_main(f, mis);
> > > +
> > > +    /*
> > > +     * This is tricky, but, mis->from_src_file can change after it
> > > +     * returns, when postcopy recovery happened. In the future, we may
> > > +     * want a wrapper for the QEMUFile handle.
> > > +     */
> > > +    f = mis->from_src_file;
> > > +
> > >      /* And non-blocking again so we don't block in any cleanup */
> > >      qemu_file_set_blocking(f, false);
> > >  
> > > @@ -1581,7 +1589,7 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
> > >      /* Start up the listening thread and wait for it to signal ready */
> > >      qemu_sem_init(&mis->listen_thread_sem, 0);
> > >      qemu_thread_create(&mis->listen_thread, "postcopy/listen",
> > > -                       postcopy_ram_listen_thread, mis->from_src_file,
> > > +                       postcopy_ram_listen_thread, NULL,
> > >                         QEMU_THREAD_DETACHED);
> > >      qemu_sem_wait(&mis->listen_thread_sem);
> > >      qemu_sem_destroy(&mis->listen_thread_sem);
> > > @@ -1966,11 +1974,44 @@ void qemu_loadvm_state_cleanup(void)
> > >      }
> > >  }
> > >  
> > > +/* Return true if we should continue the migration, or false. */
> > > +static bool postcopy_pause_incoming(MigrationIncomingState *mis)
> > > +{
> > > +    trace_postcopy_pause_incoming();
> > > +
> > > +    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> > > +                      MIGRATION_STATUS_POSTCOPY_PAUSED);
> > > +
> > > +    assert(mis->from_src_file);
> > > +    qemu_file_shutdown(mis->from_src_file);
> > > +    qemu_fclose(mis->from_src_file);
> > > +    mis->from_src_file = NULL;
> > > +
> > > +    assert(mis->to_src_file);
> > > +    qemu_mutex_lock(&mis->rp_mutex);
> > > +    qemu_file_shutdown(mis->to_src_file);
> > 
> > Should you not do the shutdown() before the lock?
> > For example if the other thread is stuck, with rp_mutex
> > held, trying to write to to_src_file, then you'll block
> > waiting for the mutex.  If you call shutdown and then take
> > the lock, the other thread will error and release the lock.
> 
> The problem is that IMHO QEMUFile is not yet thread-safe itself.  So
> if we operate on it (even to shut it down) logically we need to have
> the lock, right?

That probably needs fixing for 'shutdown' under the assumption that
   a) No one has or is deleting/freeing the QEMUFile
   b) No one is closing the QEMUFile

The whole point of using shutdown() is it forces any stuck send()'s or
read()'s to fail rather than staying stuck.
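
At the socket level this relies on the usual shutdown(2) semantics
(stating the assumption explicitly):

--------------
    /* shutdown() disables IO on the socket without invalidating the
     * fd, so a thread blocked in send()/recv() on the same fd fails
     * promptly instead of waiting out a TCP timeout. */
    shutdown(fd, SHUT_RDWR);

    /* Only close() frees the fd - that must still wait until every
     * user has seen the error, i.e. assumptions a) and b) above. */
    close(fd);
--------------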

> Then, IMHO the question would be: when will the send() be stuck in the
> other thread?
> 
> Normally the only case I can think of is that the source didn't recv()
> fast enough, and we even consumed the whole write buffer on the dst
> side (I don't really know how the kernel manages the buffers though,
> e.g. how the buffer size is defined...).
> 
> But when we reach here, the channel (say, from_src_file and to_src_file,
> since both of them are using the same channel behind the QEMUFile
> interface) should already be broken in some way, so IIUC even if there
> is a send() in the other thread, it should return at some point with a
> failure as well, just like how we reached here (possibly due to a
> read() failure).

We have to be careful about this; a network can fail in a way it
gets stuck rather than fails - this can get stuck until a full TCP
disconnection; and that takes about 30mins (from memory).
The nice thing about using 'shutdown' is that you can kill the existing
connection if it's hung. (Which then makes an interesting question;
the rules in your migrate-incoming command become different if you
want to declare it's failed!).  Having said that, you're right that at
this point stuff has already failed - so do we need the shutdown?
(You might want to do the shutdown as part of the recovery earlier
or as a separate command to force the failure)

> > I'm not quite sure what will happen if we end up calling this
> > before the main thread has returned from postcopy and the
> > device loading is complete.
> 
> IIUC you mean the time starts from when we got MIG_CMD_PACKAGED until
> main thread finishes handling that package?

Yes.

> Normally I think that should not matter much since during handling the
> package it should hardly fail (we were reading from a buffer QIO
> channel, no real IOs there)... 

Note that while the main thread is reading the package, the listener
thread is receiving pages, so you can legally get a failure at that
point when the fd fails as it's receiving pages at the same time
as reading the devices.
(There's an argument that if it fails before you've received all
your devices then perhaps you can just restart the source)

> But I agree about the reasoning.  How
> about one more patch to postpone the "active" to "postcopy-active"
> state change after the package is handled correctly?  Like:
> 
> --------------
> diff --git a/migration/savevm.c b/migration/savevm.c                     
> index b5c3214034..8317b2a7e2 100644 
> --- a/migration/savevm.c            
> +++ b/migration/savevm.c            
> @@ -1573,8 +1573,6 @@ static void *postcopy_ram_listen_thread(void *opaque)                                                                       
>      QEMUFile *f = mis->from_src_file;                                   
>      int load_res;                  
>                                     
> -    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
> -                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
>      qemu_sem_post(&mis->listen_thread_sem);                             
>      trace_postcopy_ram_listen_thread_start();                           
>                                     
> @@ -1817,6 +1815,9 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)                                                          
>      qemu_fclose(packf);            
>      object_unref(OBJECT(bioc));    
>                                     
> +    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
> +                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
> +                                   
>      return ret;                    
>  }                                  
> --------------
> 
> This function will only be called with "postcopy-active" state.

I *think* that's safe; you've got to be careful, but I can't see
anyone on the destination that cares about the distinction.

> > Also, at this point have we guaranteed no one else is about
> > to do an op on mis->to_src_file and will seg?
> 
> I think no?  Since IMHO the main thread is playing with the buffer QIO
> channel, rather than the real one?

OK.

> (btw, could I ask what's "seg"? :)

just short for segmentation fault; sig 11.

Dave

> -- 
> Peter Xu
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Xu Oct. 10, 2017, 9:38 a.m. UTC | #4
On Mon, Oct 09, 2017 at 07:58:13PM +0100, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > On Thu, Sep 21, 2017 at 08:29:03PM +0100, Dr. David Alan Gilbert wrote:
> > > * Peter Xu (peterx@redhat.com) wrote:
> > > > When there is an IO error on the incoming channel (e.g., network down),
> > > > instead of bailing out immediately, we allow the dst vm to switch to the
> > > > new POSTCOPY_PAUSE state. Currently it is still simple - it waits on the
> > > > new semaphore until someone pokes it for another attempt.
> > > > 
> > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > ---
> > > >  migration/migration.c  |  1 +
> > > >  migration/migration.h  |  3 +++
> > > >  migration/savevm.c     | 60 ++++++++++++++++++++++++++++++++++++++++++++++++--
> > > >  migration/trace-events |  2 ++
> > > >  4 files changed, 64 insertions(+), 2 deletions(-)
> > > > 
> > > > diff --git a/migration/migration.c b/migration/migration.c
> > > > index 8d26ea8..80de212 100644
> > > > --- a/migration/migration.c
> > > > +++ b/migration/migration.c
> > > > @@ -146,6 +146,7 @@ MigrationIncomingState *migration_incoming_get_current(void)
> > > >          memset(&mis_current, 0, sizeof(MigrationIncomingState));
> > > >          qemu_mutex_init(&mis_current.rp_mutex);
> > > >          qemu_event_init(&mis_current.main_thread_load_event, false);
> > > > +        qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0);
> > > >          once = true;
> > > >      }
> > > >      return &mis_current;
> > > > diff --git a/migration/migration.h b/migration/migration.h
> > > > index 0c957c9..c423682 100644
> > > > --- a/migration/migration.h
> > > > +++ b/migration/migration.h
> > > > @@ -60,6 +60,9 @@ struct MigrationIncomingState {
> > > >      /* The coroutine we should enter (back) after failover */
> > > >      Coroutine *migration_incoming_co;
> > > >      QemuSemaphore colo_incoming_sem;
> > > > +
> > > > +    /* notify PAUSED postcopy incoming migrations to try to continue */
> > > > +    QemuSemaphore postcopy_pause_sem_dst;
> > > >  };
> > > >  
> > > >  MigrationIncomingState *migration_incoming_get_current(void);
> > > > diff --git a/migration/savevm.c b/migration/savevm.c
> > > > index 7172f14..3777124 100644
> > > > --- a/migration/savevm.c
> > > > +++ b/migration/savevm.c
> > > > @@ -1488,8 +1488,8 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
> > > >   */
> > > >  static void *postcopy_ram_listen_thread(void *opaque)
> > > >  {
> > > > -    QEMUFile *f = opaque;
> > > >      MigrationIncomingState *mis = migration_incoming_get_current();
> > > > +    QEMUFile *f = mis->from_src_file;
> > > >      int load_res;
> > > >  
> > > >      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> > > > @@ -1503,6 +1503,14 @@ static void *postcopy_ram_listen_thread(void *opaque)
> > > >       */
> > > >      qemu_file_set_blocking(f, true);
> > > >      load_res = qemu_loadvm_state_main(f, mis);
> > > > +
> > > > +    /*
> > > > +     * This is tricky, but, mis->from_src_file can change after it
> > > > +     * returns, when postcopy recovery happened. In the future, we may
> > > > +     * want a wrapper for the QEMUFile handle.
> > > > +     */
> > > > +    f = mis->from_src_file;
> > > > +
> > > >      /* And non-blocking again so we don't block in any cleanup */
> > > >      qemu_file_set_blocking(f, false);
> > > >  
> > > > @@ -1581,7 +1589,7 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
> > > >      /* Start up the listening thread and wait for it to signal ready */
> > > >      qemu_sem_init(&mis->listen_thread_sem, 0);
> > > >      qemu_thread_create(&mis->listen_thread, "postcopy/listen",
> > > > -                       postcopy_ram_listen_thread, mis->from_src_file,
> > > > +                       postcopy_ram_listen_thread, NULL,
> > > >                         QEMU_THREAD_DETACHED);
> > > >      qemu_sem_wait(&mis->listen_thread_sem);
> > > >      qemu_sem_destroy(&mis->listen_thread_sem);
> > > > @@ -1966,11 +1974,44 @@ void qemu_loadvm_state_cleanup(void)
> > > >      }
> > > >  }
> > > >  
> > > > +/* Return true if we should continue the migration, or false. */
> > > > +static bool postcopy_pause_incoming(MigrationIncomingState *mis)
> > > > +{
> > > > +    trace_postcopy_pause_incoming();
> > > > +
> > > > +    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> > > > +                      MIGRATION_STATUS_POSTCOPY_PAUSED);
> > > > +
> > > > +    assert(mis->from_src_file);
> > > > +    qemu_file_shutdown(mis->from_src_file);
> > > > +    qemu_fclose(mis->from_src_file);
> > > > +    mis->from_src_file = NULL;
> > > > +
> > > > +    assert(mis->to_src_file);
> > > > +    qemu_mutex_lock(&mis->rp_mutex);
> > > > +    qemu_file_shutdown(mis->to_src_file);
> > > 
> > > Should you not do the shutdown() before the lock?
> > > For example if the other thread is stuck, with rp_mutex
> > > held, trying to write to to_src_file, then you'll block
> > > waiting for the mutex.  If you call shutdown and then take
> > > the lock, the other thread will error and release the lock.
> > 
> > The problem is that IMHO QEMUFile is not yet thread-safe itself.  So
> > if we operate on it (even to shut it down) logically we need to have
> > the lock, right?
> 
> That probably needs fixing for 'shutdown' under the assumption that
>    a) No one has or is deleting/freeing the QEMUFile
>    b) No one is closing the QEMUFile
> 
> The whole point of using shutdown() is it forces any stuck send()'s or
> read()'s to fail rather than staying stuck.

I see.  I just noticed that actually qemu_file_shutdown() is
thread-safe itself - it boils down to the system shutdown() call (as
long as the above assumption is there).

Let me call qemu_file_shutdown() first before taking the lock to make
sure send()/recv() hang won't happen.

> 
> > Then, IMHO the question would be: when will the send() be stuck in the
> > other thread?
> > 
> > Normally the only case I can think of is that the source didn't recv()
> > fast enough, and we even consumed the whole write buffer on the dst
> > side (I don't really know how the kernel manages the buffers though,
> > e.g. how the buffer size is defined...).
> > 
> > But when we reach here, the channel (say, from_src_file and to_src_file,
> > since both of them are using the same channel behind the QEMUFile
> > interface) should already be broken in some way, so IIUC even if there
> > is a send() in the other thread, it should return at some point with a
> > failure as well, just like how we reached here (possibly due to a
> > read() failure).
> 
> We have to be careful about this; a network can fail in a way it
> gets stuck rather than fails - this can get stuck until a full TCP
> disconnection; and that takes about 30mins (from memory).
> The nice thing about using 'shutdown' is that you can kill the existing
> connection if it's hung. (Which then makes an interesting question;
> the rules in your migrate-incoming command become different if you
> want to declare it's failed!).  Having said that, you're right that at
> this point stuff has already failed - so do we need the shutdown?
> (You might want to do the shutdown as part of the recovery earlier
> or as a separate command to force the failure)

I assume if I call shutdown before the lock then we'll be good then.

> 
> > > I'm not quite sure what will happen if we end up calling this
> > > before the main thread has returned from postcopy and the
> > > device loading is complete.
> > 
> > IIUC you mean the time starts from when we got MIG_CMD_PACKAGED until
> > main thread finishes handling that package?
> 
> Yes.
> 
> > Normally I think that should not matter much since during handling the
> > package it should hardly fail (we were reading from a buffer QIO
> > channel, no real IOs there)... 
> 
> Note that while the main thread is reading the package, the listener
> thread is receiving pages, so you can legally get a failure at that
> point when the fd fails as it's receiving pages at the same time
> as reading the devices.
> (There's an argument that if it fails before you've received all
> your devices then perhaps you can just restart the source)

Yes.

> 
> > But I agree about the reasoning.  How
> > about one more patch to postpone the "active" to "postcopy-active"
> > state change after the package is handled correctly?  Like:
> > 
> > --------------
> > diff --git a/migration/savevm.c b/migration/savevm.c                     
> > index b5c3214034..8317b2a7e2 100644 
> > --- a/migration/savevm.c            
> > +++ b/migration/savevm.c            
> > @@ -1573,8 +1573,6 @@ static void *postcopy_ram_listen_thread(void *opaque)                                                                       
> >      QEMUFile *f = mis->from_src_file;                                   
> >      int load_res;                  
> >                                     
> > -    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
> > -                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
> >      qemu_sem_post(&mis->listen_thread_sem);                             
> >      trace_postcopy_ram_listen_thread_start();                           
> >                                     
> > @@ -1817,6 +1815,9 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)                                                          
> >      qemu_fclose(packf);            
> >      object_unref(OBJECT(bioc));    
> >                                     
> > +    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
> > +                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
> > +                                   
> >      return ret;                    
> >  }                                  
> > --------------
> > 
> > This function will only be called with "postcopy-active" state.
> 
> I *think* that's safe; you've got to be careful, but I can't see
> anyone on the destination that cares about the distinction.

Indeed, but I'd say that's the best thing I can think of (and the
simplest).  I'm even not sure whether it'd be clearer if we set the
postcopy-active state right before starting the VM on the destination,
say, at the beginning of loadvm_postcopy_handle_run_bh().
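
Something like this, as a rough alternative sketch (hand-waving the
details of the existing run_bh body):

--------------
static void loadvm_postcopy_handle_run_bh(void *opaque)
{
    MigrationIncomingState *mis = migration_incoming_get_current();

    /* Flip the state only when the dest VM is really about to run */
    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                      MIGRATION_STATUS_POSTCOPY_ACTIVE);

    /* ... existing run_bh body: announce, start the VM, etc. ... */
}
--------------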

> 
> > > Also, at this point have we guaranteed no one else is about
> > > to do an op on mis->to_src_file and will seg?
> > 
> > I think no?  Since IMHO the main thread is playing with the buffer QIO
> > channel, rather than the real one?
> 
> OK.
> 
> > (btw, could I ask what's "seg"? :)
> 
> just short for segmentation fault; sig 11.

I see.  Thanks!

> 
> Dave
> 
> > -- 
> > Peter Xu
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Xu Oct. 10, 2017, 11:31 a.m. UTC | #5
On Tue, Oct 10, 2017 at 05:38:01PM +0800, Peter Xu wrote:

[...]

> > > But I agree about the reasoning.  How
> > > about one more patch to postpone the "active" to "postcopy-active"
> > > state change after the package is handled correctly?  Like:
> > > 
> > > --------------
> > > diff --git a/migration/savevm.c b/migration/savevm.c                     
> > > index b5c3214034..8317b2a7e2 100644 
> > > --- a/migration/savevm.c            
> > > +++ b/migration/savevm.c            
> > > @@ -1573,8 +1573,6 @@ static void *postcopy_ram_listen_thread(void *opaque)                                                                       
> > >      QEMUFile *f = mis->from_src_file;                                   
> > >      int load_res;                  
> > >                                     
> > > -    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
> > > -                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
> > >      qemu_sem_post(&mis->listen_thread_sem);                             
> > >      trace_postcopy_ram_listen_thread_start();                           
> > >                                     
> > > @@ -1817,6 +1815,9 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)                                                          
> > >      qemu_fclose(packf);            
> > >      object_unref(OBJECT(bioc));    
> > >                                     
> > > +    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
> > > +                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
> > > +                                   
> > >      return ret;                    
> > >  }                                  
> > > --------------
> > > 
> > > This function will only be called with "postcopy-active" state.
> > 
> > I *think* that's safe; you've got to be careful, but I can't see
> > anyone on the destination that cares about the distinction.
> 
> Indeed, but I'd say that's the best thing I can think of (and the
> simplest).  I'm even not sure whether it'd be clearer if we set the
> postcopy-active state right before starting the VM on the destination,
> say, at the beginning of loadvm_postcopy_handle_run_bh().

When thinking about this, I had another question.

How do we handle the case where we fail to send the device states in
postcopy_start()?  There, we call qemu_savevm_send_packaged() and then
assume we are good and return with success.  However
qemu_savevm_send_packaged() only means that the data is queued in the
write buffer of the source host; it does not mean that the destination
has loaded the device states correctly.  It's still possible that the
destination VM failed to receive the whole packaged data while the
source thought it had all been delivered without problem.

Then the source will continue with postcopy-active while the
destination VM fails, which in turn fails the source.  The VM would be
lost then, since it's postcopy rather than precopy.

Meanwhile, this cannot be handled by postcopy recovery, since IIUC
postcopy recovery only works after the states are at least loaded on
the destination VM (I'll avoid going deeper into a more complex
protocol for postcopy recovery here, please see below).

I think the best/simplest thing to do when encountering this error is
to just fail the migration on the source and keep running there, which
would be the same failure handling path as precopy.  But still, it
seems that we don't have a good mechanism to detect the error when
sending the MIG_CMD_PACKAGED message fails in some way (we could add an
ACK from dst->src, however that breaks old VMs).

Before going further, does my worry make any sense?

(I hope this can be a separate problem from the postcopy recovery
 series, if it is indeed a problem.  For postcopy recovery, I hope the
 idea of postponing the switch to POSTCOPY_ACTIVE would suffice)
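
To make the kind of ACK mechanism meant above concrete, a purely
hypothetical sketch in comment form (MIG_RP_MSG_PACKAGED_ACK is made
up, and as said above such an ACK would break old VMs):

--------------
    /* Destination: after loadvm_handle_cmd_packaged() has loaded the
     * device states successfully, report back on the existing return
     * path, e.g.:
     *
     *     migrate_send_rp_message(mis, MIG_RP_MSG_PACKAGED_ACK,
     *                             0, NULL);
     *
     * Source: in postcopy_start(), instead of trusting that
     * qemu_savevm_send_packaged() queuing the data means delivery,
     * wait for this ACK on the return path before declaring the
     * switchover done. */
--------------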
Dr. David Alan Gilbert Oct. 10, 2017, 12:30 p.m. UTC | #6
* Peter Xu (peterx@redhat.com) wrote:
> On Mon, Oct 09, 2017 at 07:58:13PM +0100, Dr. David Alan Gilbert wrote:
> > * Peter Xu (peterx@redhat.com) wrote:
> > > On Thu, Sep 21, 2017 at 08:29:03PM +0100, Dr. David Alan Gilbert wrote:
> > > > * Peter Xu (peterx@redhat.com) wrote:
> > > > > When there is an IO error on the incoming channel (e.g., network down),
> > > > > instead of bailing out immediately, we allow the dst vm to switch to the
> > > > > new POSTCOPY_PAUSE state. Currently it is still simple - it waits on the
> > > > > new semaphore until someone pokes it for another attempt.
> > > > > 
> > > > > Signed-off-by: Peter Xu <peterx@redhat.com>
> > > > > ---
> > > > >  migration/migration.c  |  1 +
> > > > >  migration/migration.h  |  3 +++
> > > > >  migration/savevm.c     | 60 ++++++++++++++++++++++++++++++++++++++++++++++++--
> > > > >  migration/trace-events |  2 ++
> > > > >  4 files changed, 64 insertions(+), 2 deletions(-)
> > > > > 
> > > > > diff --git a/migration/migration.c b/migration/migration.c
> > > > > index 8d26ea8..80de212 100644
> > > > > --- a/migration/migration.c
> > > > > +++ b/migration/migration.c
> > > > > @@ -146,6 +146,7 @@ MigrationIncomingState *migration_incoming_get_current(void)
> > > > >          memset(&mis_current, 0, sizeof(MigrationIncomingState));
> > > > >          qemu_mutex_init(&mis_current.rp_mutex);
> > > > >          qemu_event_init(&mis_current.main_thread_load_event, false);
> > > > > +        qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0);
> > > > >          once = true;
> > > > >      }
> > > > >      return &mis_current;
> > > > > diff --git a/migration/migration.h b/migration/migration.h
> > > > > index 0c957c9..c423682 100644
> > > > > --- a/migration/migration.h
> > > > > +++ b/migration/migration.h
> > > > > @@ -60,6 +60,9 @@ struct MigrationIncomingState {
> > > > >      /* The coroutine we should enter (back) after failover */
> > > > >      Coroutine *migration_incoming_co;
> > > > >      QemuSemaphore colo_incoming_sem;
> > > > > +
> > > > > +    /* notify PAUSED postcopy incoming migrations to try to continue */
> > > > > +    QemuSemaphore postcopy_pause_sem_dst;
> > > > >  };
> > > > >  
> > > > >  MigrationIncomingState *migration_incoming_get_current(void);
> > > > > diff --git a/migration/savevm.c b/migration/savevm.c
> > > > > index 7172f14..3777124 100644
> > > > > --- a/migration/savevm.c
> > > > > +++ b/migration/savevm.c
> > > > > @@ -1488,8 +1488,8 @@ static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
> > > > >   */
> > > > >  static void *postcopy_ram_listen_thread(void *opaque)
> > > > >  {
> > > > > -    QEMUFile *f = opaque;
> > > > >      MigrationIncomingState *mis = migration_incoming_get_current();
> > > > > +    QEMUFile *f = mis->from_src_file;
> > > > >      int load_res;
> > > > >  
> > > > >      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
> > > > > @@ -1503,6 +1503,14 @@ static void *postcopy_ram_listen_thread(void *opaque)
> > > > >       */
> > > > >      qemu_file_set_blocking(f, true);
> > > > >      load_res = qemu_loadvm_state_main(f, mis);
> > > > > +
> > > > > +    /*
> > > > > +     * This is tricky, but, mis->from_src_file can change after it
> > > > > +     * returns, when postcopy recovery happened. In the future, we may
> > > > > +     * want a wrapper for the QEMUFile handle.
> > > > > +     */
> > > > > +    f = mis->from_src_file;
> > > > > +
> > > > >      /* And non-blocking again so we don't block in any cleanup */
> > > > >      qemu_file_set_blocking(f, false);
> > > > >  
> > > > > @@ -1581,7 +1589,7 @@ static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
> > > > >      /* Start up the listening thread and wait for it to signal ready */
> > > > >      qemu_sem_init(&mis->listen_thread_sem, 0);
> > > > >      qemu_thread_create(&mis->listen_thread, "postcopy/listen",
> > > > > -                       postcopy_ram_listen_thread, mis->from_src_file,
> > > > > +                       postcopy_ram_listen_thread, NULL,
> > > > >                         QEMU_THREAD_DETACHED);
> > > > >      qemu_sem_wait(&mis->listen_thread_sem);
> > > > >      qemu_sem_destroy(&mis->listen_thread_sem);
> > > > > @@ -1966,11 +1974,44 @@ void qemu_loadvm_state_cleanup(void)
> > > > >      }
> > > > >  }
> > > > >  
> > > > > +/* Return true if we should continue the migration, or false. */
> > > > > +static bool postcopy_pause_incoming(MigrationIncomingState *mis)
> > > > > +{
> > > > > +    trace_postcopy_pause_incoming();
> > > > > +
> > > > > +    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> > > > > +                      MIGRATION_STATUS_POSTCOPY_PAUSED);
> > > > > +
> > > > > +    assert(mis->from_src_file);
> > > > > +    qemu_file_shutdown(mis->from_src_file);
> > > > > +    qemu_fclose(mis->from_src_file);
> > > > > +    mis->from_src_file = NULL;
> > > > > +
> > > > > +    assert(mis->to_src_file);
> > > > > +    qemu_mutex_lock(&mis->rp_mutex);
> > > > > +    qemu_file_shutdown(mis->to_src_file);
> > > > 
> > > > Should you not do the shutdown() before the lock?
> > > > For example if the other thread is stuck, with rp_mutex
> > > > held, trying to write to to_src_file, then you'll block
> > > > waiting for the mutex.  If you call shutdown and then take
> > > > the lock, the other thread will error and release the lock.
> > > 
> > > The problem is that IMHO QEMUFile is not yet thread-safe itself.  So
> > > if we operate on it (even to shut it down) logically we need to have
> > > the lock, right?
> > 
> > That probably needs fixing for 'shutdown' under the assumption that
> >    a) No one has or is deleting/freeing the QEMUFile
> >    b) No one is closing the QEMUFile
> > 
> > The whole point of using shutdown() is it forces any stuck send()'s or
> > read()'s to fail rather than staying stuck.
> 
> I see.  I just noticed that actually qemu_file_shutdown() is
> thread-safe itself - it boils down to the system shutdown() call (as
> long as the above assumption is there).
> 
> Let me call qemu_file_shutdown() first before taking the lock to make
> sure send()/recv() hang won't happen.
> 
> > 
> > > Then, IMHO the question would be: when will the send() be stuck in the
> > > other thread?
> > > 
> > > Normally the only case I can think of is that the source didn't recv()
> > > fast enough, and we even consumed the whole write buffer on the dst
> > > side (I don't really know how the kernel manages the buffers though,
> > > e.g. how the buffer size is defined...).
> > > 
> > > But when we reach here, the channel (say, from_src_file and to_src_file,
> > > since both of them are using the same channel behind the QEMUFile
> > > interface) should already be broken in some way, so IIUC even if there
> > > is a send() in the other thread, it should return at some point with a
> > > failure as well, just like how we reached here (possibly due to a
> > > read() failure).
> > 
> > We have to be careful about this; a network can fail in a way it
> > gets stuck rather than fails - this can get stuck until a full TCP
> > disconnection; and that takes about 30mins (from memory).
> > The nice thing about using 'shutdown' is that you can kill the existing
> > connection if it's hung. (Which then makes an interesting question;
> > the rules in your migrate-incoming command become different if you
> > want to declare it's failed!).  Having said that, you're right that at
> > this point stuff has already failed - so do we need the shutdown?
> > (You might want to do the shutdown as part of the recovery earlier
> > or as a separate command to force the failure)
> 
> I assume if I call shutdown before the lock then we'll be good then.

The question is what happens if you only allow recovery if we're already
in postcopy-paused state; in the case of a hung socket, since no IO has
actually failed yet, you will still be in postcopy-active.

Dave

> > 
> > > > I'm not quite sure what will happen if we end up calling this
> > > > before the main thread has returned from postcopy and the
> > > > device loading is complete.
> > > 
> > > IIUC you mean the time starts from when we got MIG_CMD_PACKAGED until
> > > main thread finishes handling that package?
> > 
> > Yes.
> > 
> > > Normally I think that should not matter much since during handling the
> > > package it should hardly fail (we were reading from a buffer QIO
> > > channel, no real IOs there)... 
> > 
> > Note that while the main thread is reading the package, the listener
> > thread is receiving pages, so you can legally get a failure at that
> > point when the fd fails as it's receiving pages at the same time
> > as reading the devices.
> > (There's an argument that if it fails before you've received all
> > your devices then perhaps you can just restart the source)
> 
> Yes.
> 
> > 
> > > But I agree about the reasoning.  How
> > > about one more patch to postpone the "active" to "postcopy-active"
> > > state change after the package is handled correctly?  Like:
> > > 
> > > --------------
> > > diff --git a/migration/savevm.c b/migration/savevm.c                     
> > > index b5c3214034..8317b2a7e2 100644 
> > > --- a/migration/savevm.c            
> > > +++ b/migration/savevm.c            
> > > @@ -1573,8 +1573,6 @@ static void *postcopy_ram_listen_thread(void *opaque)                                                                       
> > >      QEMUFile *f = mis->from_src_file;                                   
> > >      int load_res;                  
> > >                                     
> > > -    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
> > > -                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
> > >      qemu_sem_post(&mis->listen_thread_sem);                             
> > >      trace_postcopy_ram_listen_thread_start();                           
> > >                                     
> > > @@ -1817,6 +1815,9 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)                                                          
> > >      qemu_fclose(packf);            
> > >      object_unref(OBJECT(bioc));    
> > >                                     
> > > +    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
> > > +                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
> > > +                                   
> > >      return ret;                    
> > >  }                                  
> > > --------------
> > > 
> > > This function will only be called with "postcopy-active" state.
> > 
> > I *think* that's safe; you've got to be careful, but I can't see
> > anyone on the destination that cares about the distinction.
> 
> Indeed, but I'd say that's the best thing I can think of (and the
> simplest).  I'm even not sure whether it'd be clearer if we set the
> postcopy-active state right before starting the VM on the destination,
> say, at the beginning of loadvm_postcopy_handle_run_bh().
> 
> > 
> > > > Also, at this point have we guaranteed no one else is about
> > > > to do an op on mis->to_src_file and will seg?
> > > 
> > > I think no?  Since IMHO the main thread is playing with the buffer QIO
> > > channel, rather than the real one?
> > 
> > OK.
> > 
> > > (btw, could I ask what's "seg"? :)
> > 
> > just short for segmentation fault; sig 11.
> 
> I see.  Thanks!
> 
> > 
> > Dave
> > 
> > > -- 
> > > Peter Xu
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> 
> -- 
> Peter Xu
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Xu Oct. 11, 2017, 3 a.m. UTC | #7
On Tue, Oct 10, 2017 at 01:30:18PM +0100, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > On Mon, Oct 09, 2017 at 07:58:13PM +0100, Dr. David Alan Gilbert wrote:

[...]

> > > We have to be careful about this; a network can fail in a way it
> > > gets stuck rather than fails - this can get stuck until a full TCP
> > > disconnection; and that takes about 30mins (from memory).
> > > The nice thing about using 'shutdown' is that you can kill the existing
> > > connection if it's hung. (Which then makes an interesting question;
> > > the rules in your migrate-incoming command become different if you
> > > want to declare it's failed!).  Having said that, you're right that at
> > > this point stuff has already failed - so do we need the shutdown?
> > > (You might want to do the shutdown as part of the recovery earlier
> > > or as a separate command to force the failure)
> > 
> > I assume if I call shutdown before the lock then we'll be good then.
> 
> The question is what happens if you only allow recovery if we're already
> in postcopy-paused state; in the case of a hung socket, since no IO has
> actually failed yet, you will still be in postcopy-active.

Hmm, but isn't that a problem of the kernel rather than QEMU?  Since
sockets are after all managed by the kernel.

I don't really know the best way to detect whether a socket is stuck.
Assuming we can observe that (say, we see the migration's transferred
bytes stay static for 30 seconds), IIRC you mentioned iptables tricks
to break an existing (e.g. TCP) connection, so that we can trigger the
-EIO path.

Or do you think we should provide a way to manually trigger the paused
state?  Then it goes back to something we discussed with Dan in the
earlier post - I'd appreciate it if we could postpone the manual
trigger support a bit (to keep this series small, which it already
isn't...).

Thanks,
Dr. David Alan Gilbert Oct. 12, 2017, 12:19 p.m. UTC | #8
* Peter Xu (peterx@redhat.com) wrote:
> On Tue, Oct 10, 2017 at 01:30:18PM +0100, Dr. David Alan Gilbert wrote:
> > * Peter Xu (peterx@redhat.com) wrote:
> > > On Mon, Oct 09, 2017 at 07:58:13PM +0100, Dr. David Alan Gilbert wrote:
> 
> [...]
> 
> > > > We have to be careful about this; a network can fail in a way it
> > > > gets stuck rather than fails - this can get stuck until a full TCP
> > > > disconnection; and that takes about 30mins (from memory).
> > > > The nice thing about using 'shutdown' is that you can kill the existing
> > > > connection if it's hung. (Which then makes an interesting question;
> > > > the rules in your migrate-incoming command become different if you
> > > > want to declare it's failed!).  Having said that, you're right that at
> > > > this point stuff has already failed - so do we need the shutdown?
> > > > (You might want to do the shutdown as part of the recovery earlier
> > > > or as a separate command to force the failure)
> > > 
> > > I assume if I call shutdown before the lock then we'll be good then.
> > 
> > The question is what happens if you only allow recovery if we're already
> > in postcopy-paused state; in the case of a hung socket, since no IO has
> > actually failed yet, you will still be in postcopy-active.
> 
> Hmm, but isn't that a problem of the kernel rather than QEMU?  Since
> sockets are after all managed by the kernel.

Kind of, but it comes down to what the right behaviour of a TCP socket
is, and the kernel is probably doing the right thing.

> I don't really know the best way to detect whether a socket is stuck.
> Assuming we can observe that (say, we see the migration's transferred
> bytes stay static for 30 seconds), IIRC you mentioned iptables tricks
> to break an existing (e.g. TCP) connection, so that we can trigger the
> -EIO path.

From the qemu level I'd prefer to make it a command;  if we start
adding heuristics and timeouts etc then it's very difficult to actually
get them right.

> Or do you think we should provide a way to manually trigger the paused
> state?  Then it goes back to something we discussed with Dan in the
> earlier post - I'd appreciate it if we could postpone the manual
> trigger support a bit (to keep this series small, which it already
> isn't...).

I think that manual trigger is probably necessary; it would just call a
shutdown() on the sockets and let the things fail into the paused state.
It'd be pretty simple.  It would be another OOB command; the tricky
part is just making sure it's thread safe against the migration
finishing when you issue it.

I think it can wait until after this series if you want, but it would
be good if we can figure it out.
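
A minimal sketch of what such a command might boil down to (the
command name and the state check are assumptions, not a worked-out
interface):

--------------
void qmp_migrate_pause(Error **errp)    /* hypothetical OOB command */
{
    MigrationState *ms = migrate_get_current();

    if (ms->state != MIGRATION_STATUS_POSTCOPY_ACTIVE) {
        error_setg(errp, "not in an active postcopy migration");
        return;
    }

    /* shutdown() the channel; the migration thread's next IO fails
     * and drops into the postcopy-pause path added by this series. */
    qemu_file_shutdown(ms->to_dst_file);
}
--------------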

Dave

> Thanks,
> 
> -- 
> Peter Xu
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Peter Xu Oct. 13, 2017, 5:08 a.m. UTC | #9
On Thu, Oct 12, 2017 at 01:19:52PM +0100, Dr. David Alan Gilbert wrote:
> * Peter Xu (peterx@redhat.com) wrote:
> > On Tue, Oct 10, 2017 at 01:30:18PM +0100, Dr. David Alan Gilbert wrote:
> > > * Peter Xu (peterx@redhat.com) wrote:
> > > > On Mon, Oct 09, 2017 at 07:58:13PM +0100, Dr. David Alan Gilbert wrote:
> > 
> > [...]
> > 
> > > > > We have to be careful about this; a network can fail in a way that
> > > > > it gets stuck rather than failing outright - the connection can
> > > > > stay stuck until a full TCP disconnection, and that takes about 30
> > > > > minutes (from memory).
> > > > > The nice thing about using 'shutdown' is that you can kill the
> > > > > existing connection if it's hung. (Which then raises an interesting
> > > > > question: the rules in your migrate-incoming command become
> > > > > different if you want to declare that it has failed!)  Having said
> > > > > that, you're right that at this point stuff has already failed - so
> > > > > do we need the shutdown?  (You might want to do the shutdown as
> > > > > part of the recovery earlier, or as a separate command to force
> > > > > the failure.)
> > > > 
> > > > I assume that if I call shutdown before taking the lock then we'll
> > > > be good.
> > > 
> > > The question is what happens if you only allow recovery when we're
> > > already in the postcopy-paused state; in the case of a hung socket,
> > > since no IO has actually failed yet, you will still be in
> > > postcopy-active.
> > 
> > Hmm, but isn't that a problem of the kernel rather than QEMU?
> > Sockets are, after all, managed by the kernel.
> 
> Kind of, but it comes down to what the right behaviour of a TCP socket
> is, and the kernel is probably doing the right thing.
> 
> > I don't really know the best way to detect whether a socket is stuck.
> > Assuming we can observe that (say, we see the migration's
> > transferred-bytes counter stay static for 30 seconds), IIRC you
> > mentioned iptables tricks to break an existing TCP connection, so
> > that we can trigger the -EIO path.
> 
> From the qemu level I'd prefer to make it a command; if we start
> adding heuristics, timeouts, etc. then it's very difficult to actually
> get them right.
> 
> > Or do you think we should provide a way to manually trigger the paused
> > state?  Then it goes back to something we discussed with Dan in the
> > earlier post - I'd appreciate it if we could postpone the manual
> > trigger support a bit (to keep this series small, which it already
> > isn't...).
> 
> I think that manual trigger is probably necessary; it would just call
> shutdown() on the sockets and let things fail into the paused state.
> It'd be pretty simple.  It would be another OOB command; the tricky
> part is just making sure it's thread-safe against the migration
> finishing when you issue it.
> 
> I think it can wait until after this series if you want, but it would
> be good if we can figure it out.

OK.  Let me try it in my next post.  I hope it won't grow into
something bigger (which does happen sometimes... :).
Dr. David Alan Gilbert Oct. 31, 2017, 6:57 p.m. UTC | #10
* Peter Xu (peterx@redhat.com) wrote:
> On Tue, Oct 10, 2017 at 05:38:01PM +0800, Peter Xu wrote:
> 
> [...]
> 
> > > > But I agree about the reasoning.  How about one more patch to
> > > > postpone the "active" to "postcopy-active" state change until
> > > > after the package is handled correctly?  Like:
> > > > 
> > > > --------------
> > > > diff --git a/migration/savevm.c b/migration/savevm.c                     
> > > > index b5c3214034..8317b2a7e2 100644 
> > > > --- a/migration/savevm.c            
> > > > +++ b/migration/savevm.c            
> > > > @@ -1573,8 +1573,6 @@ static void *postcopy_ram_listen_thread(void *opaque)                                                                       
> > > >      QEMUFile *f = mis->from_src_file;                                   
> > > >      int load_res;                  
> > > >                                     
> > > > -    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
> > > > -                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
> > > >      qemu_sem_post(&mis->listen_thread_sem);                             
> > > >      trace_postcopy_ram_listen_thread_start();                           
> > > >                                     
> > > > @@ -1817,6 +1815,9 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)                                                          
> > > >      qemu_fclose(packf);            
> > > >      object_unref(OBJECT(bioc));    
> > > >                                     
> > > > +    migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,             
> > > > +                                   MIGRATION_STATUS_POSTCOPY_ACTIVE);   
> > > > +                                   
> > > >      return ret;                    
> > > >  }                                  
> > > > --------------
> > > > 
> > > > This function will only be called in the "postcopy-active" state.
> > > 
> > > I *think* that's safe; you've got to be careful, but I can't see
> > > anyone on the destination that cares about the distinction.
> > 
> > Indeed, but I'd say that's the best thing I can think of (and the
> > simplest).  I'm even not sure whether it would be clearer to set the
> > postcopy-active state right before starting the VM on the
> > destination, say, at the beginning of loadvm_postcopy_handle_run_bh().
> 
> When thinking about this, I had another question.
> 
> How do we handle the case where we fail to send the device states in
> postcopy_start()?  There, we call qemu_savevm_send_packaged() and then
> assume we are good and return success.  However, success from
> qemu_savevm_send_packaged() only means that the data is queued in the
> write buffer on the source host; it does not mean that the destination
> has loaded the device states correctly.  It's still possible that the
> destination VM failed to receive the whole packaged data while the
> source thought it had been delivered without problem.
> 
> The source will then continue with postcopy-active, while the
> destination VM will fail instead, and in turn fail the source.  The
> VM would be lost then, since this is postcopy rather than precopy.
> 
> Meanwhile, this cannot be handled by postcopy recovery, since IIUC
> postcopy recovery only works after the device states have at least
> been loaded on the destination VM (I'll avoid going deeper into a
> more complex protocol for postcopy recovery; please see below).
> 
> I think the best/simplest thing to do when encountering this error is
> to fail the migration on the source and keep the VM running there,
> which would be the same failure-handling path as precopy.  But it
> still seems that we don't have a good mechanism to detect the error
> when sending the MIG_CMD_PACKAGED message fails in some way (we could
> add an ACK from dst->src; however, that breaks old VMs).
> 
> Before going further, would my worry make any sense?

Yes, I think it does; it wouldn't be unusual for a device load to fail
due to some problem on the destination host or a problem in device
serialisation.
I also think we should be OK to restart on the source, although we
have to be careful - can we really know what the previous devices (that
loaded successfully) did?  Hopefully they didn't change the state of
the storage/networking, because the destination CPUs haven't started.
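
To make the gap concrete, here is a simplified sketch of the send side
under discussion - not the exact code in postcopy_start(); the helper
name is made up for illustration:

--------------
/*
 * Illustrative only: queue the packaged device state and check for
 * errors the way the source currently can.
 */
static int send_device_state_package(MigrationState *ms,
                                     QIOChannelBuffer *bioc)
{
    int ret;

    /* Queue the packaged device-state blob into the migration stream. */
    ret = qemu_savevm_send_packaged(ms->to_dst_file, bioc->data,
                                    bioc->usage);
    qemu_fflush(ms->to_dst_file);

    /*
     * This only reports *local* write errors.  A clean return here
     * still does not prove that the destination received, let alone
     * loaded, the packaged device state - which is exactly the gap
     * described above.
     */
    if (!ret) {
        ret = qemu_file_get_error(ms->to_dst_file);
    }
    return ret;
}
--------------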

> (I hope this can be a separate problem from postcopy recovery series,
>  if it is indeed a problem.  For postcopy recovery, I hope the idea of
>  postponing setup POSTCOPY_ACTIVE would suffice)

Sure.

Dave

> -- 
> Peter Xu
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
diff mbox series

Patch

diff --git a/migration/migration.c b/migration/migration.c
index 8d26ea8..80de212 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -146,6 +146,7 @@  MigrationIncomingState *migration_incoming_get_current(void)
         memset(&mis_current, 0, sizeof(MigrationIncomingState));
         qemu_mutex_init(&mis_current.rp_mutex);
         qemu_event_init(&mis_current.main_thread_load_event, false);
+        qemu_sem_init(&mis_current.postcopy_pause_sem_dst, 0);
         once = true;
     }
     return &mis_current;
diff --git a/migration/migration.h b/migration/migration.h
index 0c957c9..c423682 100644
--- a/migration/migration.h
+++ b/migration/migration.h
@@ -60,6 +60,9 @@  struct MigrationIncomingState {
     /* The coroutine we should enter (back) after failover */
     Coroutine *migration_incoming_co;
     QemuSemaphore colo_incoming_sem;
+
+    /* notify PAUSED postcopy incoming migrations to try to continue */
+    QemuSemaphore postcopy_pause_sem_dst;
 };
 
 MigrationIncomingState *migration_incoming_get_current(void);
diff --git a/migration/savevm.c b/migration/savevm.c
index 7172f14..3777124 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1488,8 +1488,8 @@  static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
  */
 static void *postcopy_ram_listen_thread(void *opaque)
 {
-    QEMUFile *f = opaque;
     MigrationIncomingState *mis = migration_incoming_get_current();
+    QEMUFile *f = mis->from_src_file;
     int load_res;
 
     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
@@ -1503,6 +1503,14 @@  static void *postcopy_ram_listen_thread(void *opaque)
      */
     qemu_file_set_blocking(f, true);
     load_res = qemu_loadvm_state_main(f, mis);
+
+    /*
+     * This is tricky, but, mis->from_src_file can change after it
+     * returns, when postcopy recovery happened. In the future, we may
+     * want a wrapper for the QEMUFile handle.
+     */
+    f = mis->from_src_file;
+
     /* And non-blocking again so we don't block in any cleanup */
     qemu_file_set_blocking(f, false);
 
@@ -1581,7 +1589,7 @@  static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
     /* Start up the listening thread and wait for it to signal ready */
     qemu_sem_init(&mis->listen_thread_sem, 0);
     qemu_thread_create(&mis->listen_thread, "postcopy/listen",
-                       postcopy_ram_listen_thread, mis->from_src_file,
+                       postcopy_ram_listen_thread, NULL,
                        QEMU_THREAD_DETACHED);
     qemu_sem_wait(&mis->listen_thread_sem);
     qemu_sem_destroy(&mis->listen_thread_sem);
@@ -1966,11 +1974,44 @@  void qemu_loadvm_state_cleanup(void)
     }
 }
 
+/* Return true if we should continue the migration, or false. */
+static bool postcopy_pause_incoming(MigrationIncomingState *mis)
+{
+    trace_postcopy_pause_incoming();
+
+    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                      MIGRATION_STATUS_POSTCOPY_PAUSED);
+
+    assert(mis->from_src_file);
+    qemu_file_shutdown(mis->from_src_file);
+    qemu_fclose(mis->from_src_file);
+    mis->from_src_file = NULL;
+
+    assert(mis->to_src_file);
+    qemu_mutex_lock(&mis->rp_mutex);
+    qemu_file_shutdown(mis->to_src_file);
+    qemu_fclose(mis->to_src_file);
+    mis->to_src_file = NULL;
+    qemu_mutex_unlock(&mis->rp_mutex);
+
+    error_report("Detected IO failure for postcopy. "
+                 "Migration paused.");
+
+    while (mis->state == MIGRATION_STATUS_POSTCOPY_PAUSED) {
+        qemu_sem_wait(&mis->postcopy_pause_sem_dst);
+    }
+
+    trace_postcopy_pause_incoming_continued();
+
+    return true;
+}
+
 static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
 {
     uint8_t section_type;
     int ret = 0;
 
+retry:
     while (true) {
         section_type = qemu_get_byte(f);
 
@@ -2016,6 +2057,21 @@  static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
 out:
     if (ret < 0) {
         qemu_file_set_error(f, ret);
+
+        /*
+         * Detect whether it is:
+         *
+         * 1. postcopy running
+         * 2. network failure (-EIO)
+         *
+         * If so, we try to wait for a recovery.
+         */
+        if (mis->state == MIGRATION_STATUS_POSTCOPY_ACTIVE &&
+            ret == -EIO && postcopy_pause_incoming(mis)) {
+            /* Reset f to point to the newly created channel */
+            f = mis->from_src_file;
+            goto retry;
+        }
     }
     return ret;
 }
diff --git a/migration/trace-events b/migration/trace-events
index 907564b..7764c6f 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -99,6 +99,8 @@  open_return_path_on_source(void) ""
 open_return_path_on_source_continue(void) ""
 postcopy_start(void) ""
 postcopy_pause_continued(void) ""
+postcopy_pause_incoming(void) ""
+postcopy_pause_incoming_continued(void) ""
 postcopy_start_set_run(void) ""
 source_return_path_thread_bad_end(void) ""
 source_return_path_thread_end(void) ""
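
For context, a rough sketch of the counterpart that would wake the
paused incoming side once a new channel has been established; this
belongs to a later patch in the series, and the function name and the
exact target state here are assumptions:

--------------
static void postcopy_incoming_recover(MigrationIncomingState *mis,
                                      QEMUFile *new_from_src)
{
    /* Re-attach the freshly established channel from the source. */
    mis->from_src_file = new_from_src;

    /*
     * Leave PAUSED so that the while loop in postcopy_pause_incoming()
     * terminates, then poke the semaphore it is sleeping on; the load
     * thread picks up mis->from_src_file again and retries.
     */
    migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_PAUSED,
                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
    qemu_sem_post(&mis->postcopy_pause_sem_dst);
}
--------------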