diff mbox series

[v2,11/22] migration/savevm: don't worry if bitmap migration postcopy failed

Message ID 20200217150246.29180-12-vsementsov@virtuozzo.com
State New
Headers show
Series Fix error handling during bitmap postcopy | expand

Commit Message

Vladimir Sementsov-Ogievskiy Feb. 17, 2020, 3:02 p.m. UTC
First, if only bitmaps postcopy enabled (not ram postcopy)
postcopy_pause_incoming crashes on assertion assert(mis->to_src_file).

And anyway, bitmaps postcopy is not prepared to be somehow recovered.
The original idea instead is that if bitmaps postcopy failed, we just
loss some bitmaps, which is not critical. So, on failure we just need
to remove unfinished bitmaps and guest should continue execution on
destination.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 migration/savevm.c | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

Comments

Dr. David Alan Gilbert Feb. 17, 2020, 4:57 p.m. UTC | #1
* Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> First, if only bitmaps postcopy enabled (not ram postcopy)
> postcopy_pause_incoming crashes on assertion assert(mis->to_src_file).
> 
> And anyway, bitmaps postcopy is not prepared to be somehow recovered.
> The original idea instead is that if bitmaps postcopy failed, we just
> loss some bitmaps, which is not critical. So, on failure we just need
> to remove unfinished bitmaps and guest should continue execution on
> destination.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>

Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com>

> ---
>  migration/savevm.c | 37 ++++++++++++++++++++++++++++++++-----
>  1 file changed, 32 insertions(+), 5 deletions(-)
> 
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 1d4220ece8..7e9dd58ccb 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1812,6 +1812,9 @@ static void *postcopy_ram_listen_thread(void *opaque)
>      MigrationIncomingState *mis = migration_incoming_get_current();
>      QEMUFile *f = mis->from_src_file;
>      int load_res;
> +    MigrationState *migr = migrate_get_current();
> +
> +    object_ref(OBJECT(migr));
>  
>      migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>                                     MIGRATION_STATUS_POSTCOPY_ACTIVE);
> @@ -1838,11 +1841,24 @@ static void *postcopy_ram_listen_thread(void *opaque)
>  
>      trace_postcopy_ram_listen_thread_exit();
>      if (load_res < 0) {
> -        error_report("%s: loadvm failed: %d", __func__, load_res);
>          qemu_file_set_error(f, load_res);
> -        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> -                                       MIGRATION_STATUS_FAILED);
> -    } else {
> +        dirty_bitmap_mig_cancel_incoming();
> +        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
> +            !migrate_postcopy_ram() && migrate_dirty_bitmaps())
> +        {
> +            error_report("%s: loadvm failed during postcopy: %d. All state is "
> +                         "migrated except for dirty bitmaps. Some dirty "
> +                         "bitmaps may be lost, and present migrated dirty "
> +                         "bitmaps are correctly migrated and valid.",
> +                         __func__, load_res);
> +            load_res = 0; /* prevent further exit() */
> +        } else {
> +            error_report("%s: loadvm failed: %d", __func__, load_res);
> +            migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> +                                           MIGRATION_STATUS_FAILED);
> +        }
> +    }
> +    if (load_res >= 0) {
>          /*
>           * This looks good, but it's possible that the device loading in the
>           * main thread hasn't finished yet, and so we might not be in 'RUN'
> @@ -1878,6 +1894,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
>      mis->have_listen_thread = false;
>      postcopy_state_set(POSTCOPY_INCOMING_END);
>  
> +    object_unref(OBJECT(migr));
> +
>      return NULL;
>  }
>  
> @@ -2429,6 +2447,8 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis)
>  {
>      trace_postcopy_pause_incoming();
>  
> +    assert(migrate_postcopy_ram());
> +
>      /* Clear the triggered bit to allow one recovery */
>      mis->postcopy_recover_triggered = false;
>  
> @@ -2513,15 +2533,22 @@ out:
>      if (ret < 0) {
>          qemu_file_set_error(f, ret);
>  
> +        /* Cancel bitmaps incoming regardless of recovery */
> +        dirty_bitmap_mig_cancel_incoming();
> +
>          /*
>           * If we are during an active postcopy, then we pause instead
>           * of bail out to at least keep the VM's dirty data.  Note
>           * that POSTCOPY_INCOMING_LISTENING stage is still not enough,
>           * during which we're still receiving device states and we
>           * still haven't yet started the VM on destination.
> +         *
> +         * Only RAM postcopy supports recovery. Still, if RAM postcopy is
> +         * enabled, canceled bitmaps postcopy will not affect RAM postcopy
> +         * recovering.
>           */
>          if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
> -            postcopy_pause_incoming(mis)) {
> +            migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
>              /* Reset f to point to the newly created channel */
>              f = mis->from_src_file;
>              goto retry;
> -- 
> 2.21.0
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Andrey Shinkevich Feb. 18, 2020, 7:44 p.m. UTC | #2
On 17/02/2020 18:02, Vladimir Sementsov-Ogievskiy wrote:
> First, if only bitmaps postcopy enabled (not ram postcopy)
> postcopy_pause_incoming crashes on assertion assert(mis->to_src_file).
> 
> And anyway, bitmaps postcopy is not prepared to be somehow recovered.
> The original idea instead is that if bitmaps postcopy failed, we just
> loss some bitmaps, which is not critical. So, on failure we just need
> to remove unfinished bitmaps and guest should continue execution on
> destination.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>   migration/savevm.c | 37 ++++++++++++++++++++++++++++++++-----
>   1 file changed, 32 insertions(+), 5 deletions(-)
> 
> diff --git a/migration/savevm.c b/migration/savevm.c
> index 1d4220ece8..7e9dd58ccb 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1812,6 +1812,9 @@ static void *postcopy_ram_listen_thread(void *opaque)
>       MigrationIncomingState *mis = migration_incoming_get_current();
>       QEMUFile *f = mis->from_src_file;
>       int load_res;
> +    MigrationState *migr = migrate_get_current();
> +
> +    object_ref(OBJECT(migr));
>   
>       migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
>                                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
> @@ -1838,11 +1841,24 @@ static void *postcopy_ram_listen_thread(void *opaque)
>   
>       trace_postcopy_ram_listen_thread_exit();
>       if (load_res < 0) {
> -        error_report("%s: loadvm failed: %d", __func__, load_res);
>           qemu_file_set_error(f, load_res);
> -        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> -                                       MIGRATION_STATUS_FAILED);
> -    } else {
> +        dirty_bitmap_mig_cancel_incoming();
> +        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
> +            !migrate_postcopy_ram() && migrate_dirty_bitmaps())
> +        {
> +            error_report("%s: loadvm failed during postcopy: %d. All state is "
> +                         "migrated except for dirty bitmaps. Some dirty "

"All states migrated except dirty bitmaps"

> +                         "bitmaps may be lost, and present migrated dirty "
> +                         "bitmaps are correctly migrated and valid.",
> +                         __func__, load_res);
> +            load_res = 0; /* prevent further exit() */
> +        } else {
> +            error_report("%s: loadvm failed: %d", __func__, load_res);
> +            migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
> +                                           MIGRATION_STATUS_FAILED);
> +        }
> +    }
> +    if (load_res >= 0) {
>           /*
>            * This looks good, but it's possible that the device loading in the
>            * main thread hasn't finished yet, and so we might not be in 'RUN'
> @@ -1878,6 +1894,8 @@ static void *postcopy_ram_listen_thread(void *opaque)
>       mis->have_listen_thread = false;
>       postcopy_state_set(POSTCOPY_INCOMING_END);
>   
> +    object_unref(OBJECT(migr));
> +
>       return NULL;
>   }
>   
> @@ -2429,6 +2447,8 @@ static bool postcopy_pause_incoming(MigrationIncomingState *mis)
>   {
>       trace_postcopy_pause_incoming();
>   
> +    assert(migrate_postcopy_ram());
> +
>       /* Clear the triggered bit to allow one recovery */
>       mis->postcopy_recover_triggered = false;
>   
> @@ -2513,15 +2533,22 @@ out:
>       if (ret < 0) {
>           qemu_file_set_error(f, ret);
>   
> +        /* Cancel bitmaps incoming regardless of recovery */
> +        dirty_bitmap_mig_cancel_incoming();
> +
>           /*
>            * If we are during an active postcopy, then we pause instead
>            * of bail out to at least keep the VM's dirty data.  Note
>            * that POSTCOPY_INCOMING_LISTENING stage is still not enough,
>            * during which we're still receiving device states and we
>            * still haven't yet started the VM on destination.
> +         *
> +         * Only RAM postcopy supports recovery. Still, if RAM postcopy is
> +         * enabled, canceled bitmaps postcopy will not affect RAM postcopy
> +         * recovering.
>            */
>           if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
> -            postcopy_pause_incoming(mis)) {
> +            migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
>               /* Reset f to point to the newly created channel */
>               f = mis->from_src_file;
>               goto retry;
> 

Reviewed-by: Andrey Shinkevich <andrey.shinkevich@virtuozzo.com>
diff mbox series

Patch

diff --git a/migration/savevm.c b/migration/savevm.c
index 1d4220ece8..7e9dd58ccb 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1812,6 +1812,9 @@  static void *postcopy_ram_listen_thread(void *opaque)
     MigrationIncomingState *mis = migration_incoming_get_current();
     QEMUFile *f = mis->from_src_file;
     int load_res;
+    MigrationState *migr = migrate_get_current();
+
+    object_ref(OBJECT(migr));
 
     migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE,
                                    MIGRATION_STATUS_POSTCOPY_ACTIVE);
@@ -1838,11 +1841,24 @@  static void *postcopy_ram_listen_thread(void *opaque)
 
     trace_postcopy_ram_listen_thread_exit();
     if (load_res < 0) {
-        error_report("%s: loadvm failed: %d", __func__, load_res);
         qemu_file_set_error(f, load_res);
-        migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
-                                       MIGRATION_STATUS_FAILED);
-    } else {
+        dirty_bitmap_mig_cancel_incoming();
+        if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
+            !migrate_postcopy_ram() && migrate_dirty_bitmaps())
+        {
+            error_report("%s: loadvm failed during postcopy: %d. All state is "
+                         "migrated except for dirty bitmaps. Some dirty "
+                         "bitmaps may be lost, and present migrated dirty "
+                         "bitmaps are correctly migrated and valid.",
+                         __func__, load_res);
+            load_res = 0; /* prevent further exit() */
+        } else {
+            error_report("%s: loadvm failed: %d", __func__, load_res);
+            migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                                           MIGRATION_STATUS_FAILED);
+        }
+    }
+    if (load_res >= 0) {
         /*
          * This looks good, but it's possible that the device loading in the
          * main thread hasn't finished yet, and so we might not be in 'RUN'
@@ -1878,6 +1894,8 @@  static void *postcopy_ram_listen_thread(void *opaque)
     mis->have_listen_thread = false;
     postcopy_state_set(POSTCOPY_INCOMING_END);
 
+    object_unref(OBJECT(migr));
+
     return NULL;
 }
 
@@ -2429,6 +2447,8 @@  static bool postcopy_pause_incoming(MigrationIncomingState *mis)
 {
     trace_postcopy_pause_incoming();
 
+    assert(migrate_postcopy_ram());
+
     /* Clear the triggered bit to allow one recovery */
     mis->postcopy_recover_triggered = false;
 
@@ -2513,15 +2533,22 @@  out:
     if (ret < 0) {
         qemu_file_set_error(f, ret);
 
+        /* Cancel bitmaps incoming regardless of recovery */
+        dirty_bitmap_mig_cancel_incoming();
+
         /*
          * If we are during an active postcopy, then we pause instead
          * of bail out to at least keep the VM's dirty data.  Note
          * that POSTCOPY_INCOMING_LISTENING stage is still not enough,
          * during which we're still receiving device states and we
          * still haven't yet started the VM on destination.
+         *
+         * Only RAM postcopy supports recovery. Still, if RAM postcopy is
+         * enabled, canceled bitmaps postcopy will not affect RAM postcopy
+         * recovering.
          */
         if (postcopy_state_get() == POSTCOPY_INCOMING_RUNNING &&
-            postcopy_pause_incoming(mis)) {
+            migrate_postcopy_ram() && postcopy_pause_incoming(mis)) {
             /* Reset f to point to the newly created channel */
             f = mis->from_src_file;
             goto retry;