diff mbox series

[v10,05/12] migration: introduce postcopy-only pending

Message ID 20180207155837.92351-6-vsementsov@virtuozzo.com
State New
Headers show
Series Dirty bitmaps postcopy migration | expand

Commit Message

Vladimir Sementsov-Ogievskiy Feb. 7, 2018, 3:58 p.m. UTC
There would be savevm states (dirty-bitmap) which can migrate only in
postcopy stage. The corresponding pending is introduced here.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
---
 include/migration/register.h | 17 +++++++++++++++--
 migration/savevm.h           |  5 +++--
 hw/s390x/s390-stattrib.c     |  7 ++++---
 migration/block.c            |  7 ++++---
 migration/migration.c        | 16 +++++++++-------
 migration/ram.c              |  9 +++++----
 migration/savevm.c           | 13 ++++++++-----
 migration/trace-events       |  2 +-
 8 files changed, 49 insertions(+), 27 deletions(-)

Comments

Dr. David Alan Gilbert March 12, 2018, 3:30 p.m. UTC | #1
* Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> There would be savevm states (dirty-bitmap) which can migrate only in
> postcopy stage. The corresponding pending is introduced here.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> ---
>  include/migration/register.h | 17 +++++++++++++++--
>  migration/savevm.h           |  5 +++--
>  hw/s390x/s390-stattrib.c     |  7 ++++---
>  migration/block.c            |  7 ++++---
>  migration/migration.c        | 16 +++++++++-------
>  migration/ram.c              |  9 +++++----
>  migration/savevm.c           | 13 ++++++++-----
>  migration/trace-events       |  2 +-
>  8 files changed, 49 insertions(+), 27 deletions(-)
> 
> diff --git a/include/migration/register.h b/include/migration/register.h
> index f4f7bdc177..9436a87678 100644
> --- a/include/migration/register.h
> +++ b/include/migration/register.h
> @@ -37,8 +37,21 @@ typedef struct SaveVMHandlers {
>      int (*save_setup)(QEMUFile *f, void *opaque);
>      void (*save_live_pending)(QEMUFile *f, void *opaque,
>                                uint64_t threshold_size,
> -                              uint64_t *non_postcopiable_pending,
> -                              uint64_t *postcopiable_pending);
> +                              uint64_t *res_precopy_only,
> +                              uint64_t *res_compatible,
> +                              uint64_t *res_postcopy_only);
> +    /* Note for save_live_pending:
> +     * - res_precopy_only is for data which must be migrated in precopy phase
> +     *     or in stopped state, in other words - before target vm start
> +     * - res_compatible is for data which may be migrated in any phase
> +     * - res_postcopy_only is for data which must be migrated in postcopy phase
> +     *     or in stopped state, in other words - after source vm stop
> +     *
> +     * Sum of res_precopy_only, res_compatible and res_postcopy_only is the
> +     * whole amount of pending data.
> +     */
> +
> +
>      LoadStateHandler *load_state;
>      int (*load_setup)(QEMUFile *f, void *opaque);
>      int (*load_cleanup)(void *opaque);
> diff --git a/migration/savevm.h b/migration/savevm.h
> index 295c4a1f2c..cf4f0d37ca 100644
> --- a/migration/savevm.h
> +++ b/migration/savevm.h
> @@ -38,8 +38,9 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f);
>  int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
>                                         bool inactivate_disks);
>  void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
> -                               uint64_t *res_non_postcopiable,
> -                               uint64_t *res_postcopiable);
> +                               uint64_t *res_precopy_only,
> +                               uint64_t *res_compatible,
> +                               uint64_t *res_postcopy_only);
>  void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
>  void qemu_savevm_send_open_return_path(QEMUFile *f);
>  int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
> diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
> index 2902f54f11..dd3fbfd1eb 100644
> --- a/hw/s390x/s390-stattrib.c
> +++ b/hw/s390x/s390-stattrib.c
> @@ -183,15 +183,16 @@ static int cmma_save_setup(QEMUFile *f, void *opaque)
>  }
>  
>  static void cmma_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
> -                             uint64_t *non_postcopiable_pending,
> -                             uint64_t *postcopiable_pending)
> +                              uint64_t *res_precopy_only,
> +                              uint64_t *res_compatible,
> +                              uint64_t *res_postcopy_only)
>  {
>      S390StAttribState *sas = S390_STATTRIB(opaque);
>      S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
>      long long res = sac->get_dirtycount(sas);
>  
>      if (res >= 0) {
> -        *non_postcopiable_pending += res;
> +        *res_precopy_only += res;
>      }
>  }
>  
> diff --git a/migration/block.c b/migration/block.c
> index 1f03946797..5652ca3337 100644
> --- a/migration/block.c
> +++ b/migration/block.c
> @@ -866,8 +866,9 @@ static int block_save_complete(QEMUFile *f, void *opaque)
>  }
>  
>  static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
> -                               uint64_t *non_postcopiable_pending,
> -                               uint64_t *postcopiable_pending)
> +                               uint64_t *res_precopy_only,
> +                               uint64_t *res_compatible,
> +                               uint64_t *res_postcopy_only)
>  {
>      /* Estimate pending number of bytes to send */
>      uint64_t pending;
> @@ -888,7 +889,7 @@ static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
>  
>      DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
>      /* We don't do postcopy */
> -    *non_postcopiable_pending += pending;
> +    *res_precopy_only += pending;
>  }
>  
>  static int block_load(QEMUFile *f, void *opaque, int version_id)
> diff --git a/migration/migration.c b/migration/migration.c
> index c99a4e62d7..3beedd676e 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -2215,21 +2215,23 @@ typedef enum {
>   */
>  static MigIterateState migration_iteration_run(MigrationState *s)
>  {
> -    uint64_t pending_size, pend_post, pend_nonpost;
> +    uint64_t pending_size, pend_pre, pend_compat, pend_post;
>      bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
>  
> -    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
> -                              &pend_nonpost, &pend_post);
> -    pending_size = pend_nonpost + pend_post;
> +    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
> +                              &pend_compat, &pend_post);
> +    pending_size = pend_pre + pend_compat + pend_post;
>  
>      trace_migrate_pending(pending_size, s->threshold_size,
> -                          pend_post, pend_nonpost);
> +                          pend_pre, pend_compat, pend_post);
>  
>      if (pending_size && pending_size >= s->threshold_size) {
>          /* Still a significant amount to transfer */
>          if (migrate_postcopy() && !in_postcopy &&
> -            pend_nonpost <= s->threshold_size &&
> -            atomic_read(&s->start_postcopy)) {
> +            pend_pre <= s->threshold_size &&
> +            (atomic_read(&s->start_postcopy) ||
> +             (pend_pre + pend_compat <= s->threshold_size)))

This change does something different from the description;
it causes a postcopy_start even if the user never ran the postcopy-start
command; so sorry, we can't do that; because postcopy for RAM is
something that users can enable but only switch into when they've given
up on it completing normally.

However, I guess that leaves you with a problem; which is what happens
to the system when you've run out of pend_pre+pend_compat but can't
complete because pend_post is non-0; so I don't know the answer to that.

Dave

> +        {
>              if (postcopy_start(s)) {
>                  error_report("%s: postcopy failed to start", __func__);
>              }
> diff --git a/migration/ram.c b/migration/ram.c
> index cb1950f3eb..38b1b2486a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -2359,8 +2359,9 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
>  }
>  
>  static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
> -                             uint64_t *non_postcopiable_pending,
> -                             uint64_t *postcopiable_pending)
> +                             uint64_t *res_precopy_only,
> +                             uint64_t *res_compatible,
> +                             uint64_t *res_postcopy_only)
>  {
>      RAMState **temp = opaque;
>      RAMState *rs = *temp;
> @@ -2380,9 +2381,9 @@ static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
>  
>      if (migrate_postcopy_ram()) {
>          /* We can do postcopy, and all the data is postcopiable */
> -        *postcopiable_pending += remaining_size;
> +        *res_compatible += remaining_size;
>      } else {
> -        *non_postcopiable_pending += remaining_size;
> +        *res_precopy_only += remaining_size;
>      }
>  }
>  
> diff --git a/migration/savevm.c b/migration/savevm.c
> index b7908f62be..c3c60a15e3 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1218,13 +1218,15 @@ int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
>   * for units that can't do postcopy.
>   */
>  void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
> -                               uint64_t *res_non_postcopiable,
> -                               uint64_t *res_postcopiable)
> +                               uint64_t *res_precopy_only,
> +                               uint64_t *res_compatible,
> +                               uint64_t *res_postcopy_only)
>  {
>      SaveStateEntry *se;
>  
> -    *res_non_postcopiable = 0;
> -    *res_postcopiable = 0;
> +    *res_precopy_only = 0;
> +    *res_compatible = 0;
> +    *res_postcopy_only = 0;
>  
>  
>      QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
> @@ -1237,7 +1239,8 @@ void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
>              }
>          }
>          se->ops->save_live_pending(f, se->opaque, threshold_size,
> -                                   res_non_postcopiable, res_postcopiable);
> +                                   res_precopy_only, res_compatible,
> +                                   res_postcopy_only);
>      }
>  }
>  
> diff --git a/migration/trace-events b/migration/trace-events
> index 6f29fcc686..a04fffb877 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -86,7 +86,7 @@ migrate_fd_cleanup(void) ""
>  migrate_fd_error(const char *error_desc) "error=%s"
>  migrate_fd_cancel(void) ""
>  migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx"
> -migrate_pending(uint64_t size, uint64_t max, uint64_t post, uint64_t nonpost) "pending size %" PRIu64 " max %" PRIu64 " (post=%" PRIu64 " nonpost=%" PRIu64 ")"
> +migrate_pending(uint64_t size, uint64_t max, uint64_t pre, uint64_t compat, uint64_t post) "pending size %" PRIu64 " max %" PRIu64 " (pre = %" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
>  migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
>  migration_completion_file_err(void) ""
>  migration_completion_postcopy_end(void) ""
> -- 
> 2.11.1
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
John Snow March 13, 2018, 5:38 a.m. UTC | #2
On 03/12/2018 11:30 AM, Dr. David Alan Gilbert wrote:
> * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
>> There would be savevm states (dirty-bitmap) which can migrate only in
>> postcopy stage. The corresponding pending is introduced here.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---
>>  include/migration/register.h | 17 +++++++++++++++--
>>  migration/savevm.h           |  5 +++--
>>  hw/s390x/s390-stattrib.c     |  7 ++++---
>>  migration/block.c            |  7 ++++---
>>  migration/migration.c        | 16 +++++++++-------
>>  migration/ram.c              |  9 +++++----
>>  migration/savevm.c           | 13 ++++++++-----
>>  migration/trace-events       |  2 +-
>>  8 files changed, 49 insertions(+), 27 deletions(-)
>>
>> diff --git a/include/migration/register.h b/include/migration/register.h
>> index f4f7bdc177..9436a87678 100644
>> --- a/include/migration/register.h
>> +++ b/include/migration/register.h
>> @@ -37,8 +37,21 @@ typedef struct SaveVMHandlers {
>>      int (*save_setup)(QEMUFile *f, void *opaque);
>>      void (*save_live_pending)(QEMUFile *f, void *opaque,
>>                                uint64_t threshold_size,
>> -                              uint64_t *non_postcopiable_pending,
>> -                              uint64_t *postcopiable_pending);
>> +                              uint64_t *res_precopy_only,
>> +                              uint64_t *res_compatible,
>> +                              uint64_t *res_postcopy_only);
>> +    /* Note for save_live_pending:
>> +     * - res_precopy_only is for data which must be migrated in precopy phase
>> +     *     or in stopped state, in other words - before target vm start
>> +     * - res_compatible is for data which may be migrated in any phase
>> +     * - res_postcopy_only is for data which must be migrated in postcopy phase
>> +     *     or in stopped state, in other words - after source vm stop
>> +     *
>> +     * Sum of res_precopy_only, res_compatible and res_postcopy_only is the
>> +     * whole amount of pending data.
>> +     */
>> +
>> +
>>      LoadStateHandler *load_state;
>>      int (*load_setup)(QEMUFile *f, void *opaque);
>>      int (*load_cleanup)(void *opaque);
>> diff --git a/migration/savevm.h b/migration/savevm.h
>> index 295c4a1f2c..cf4f0d37ca 100644
>> --- a/migration/savevm.h
>> +++ b/migration/savevm.h
>> @@ -38,8 +38,9 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f);
>>  int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
>>                                         bool inactivate_disks);
>>  void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
>> -                               uint64_t *res_non_postcopiable,
>> -                               uint64_t *res_postcopiable);
>> +                               uint64_t *res_precopy_only,
>> +                               uint64_t *res_compatible,
>> +                               uint64_t *res_postcopy_only);
>>  void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
>>  void qemu_savevm_send_open_return_path(QEMUFile *f);
>>  int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
>> diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
>> index 2902f54f11..dd3fbfd1eb 100644
>> --- a/hw/s390x/s390-stattrib.c
>> +++ b/hw/s390x/s390-stattrib.c
>> @@ -183,15 +183,16 @@ static int cmma_save_setup(QEMUFile *f, void *opaque)
>>  }
>>  
>>  static void cmma_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
>> -                             uint64_t *non_postcopiable_pending,
>> -                             uint64_t *postcopiable_pending)
>> +                              uint64_t *res_precopy_only,
>> +                              uint64_t *res_compatible,
>> +                              uint64_t *res_postcopy_only)
>>  {
>>      S390StAttribState *sas = S390_STATTRIB(opaque);
>>      S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
>>      long long res = sac->get_dirtycount(sas);
>>  
>>      if (res >= 0) {
>> -        *non_postcopiable_pending += res;
>> +        *res_precopy_only += res;
>>      }
>>  }
>>  
>> diff --git a/migration/block.c b/migration/block.c
>> index 1f03946797..5652ca3337 100644
>> --- a/migration/block.c
>> +++ b/migration/block.c
>> @@ -866,8 +866,9 @@ static int block_save_complete(QEMUFile *f, void *opaque)
>>  }
>>  
>>  static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
>> -                               uint64_t *non_postcopiable_pending,
>> -                               uint64_t *postcopiable_pending)
>> +                               uint64_t *res_precopy_only,
>> +                               uint64_t *res_compatible,
>> +                               uint64_t *res_postcopy_only)
>>  {
>>      /* Estimate pending number of bytes to send */
>>      uint64_t pending;
>> @@ -888,7 +889,7 @@ static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
>>  
>>      DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
>>      /* We don't do postcopy */
>> -    *non_postcopiable_pending += pending;
>> +    *res_precopy_only += pending;
>>  }
>>  
>>  static int block_load(QEMUFile *f, void *opaque, int version_id)
>> diff --git a/migration/migration.c b/migration/migration.c
>> index c99a4e62d7..3beedd676e 100644
>> --- a/migration/migration.c
>> +++ b/migration/migration.c
>> @@ -2215,21 +2215,23 @@ typedef enum {
>>   */
>>  static MigIterateState migration_iteration_run(MigrationState *s)
>>  {
>> -    uint64_t pending_size, pend_post, pend_nonpost;
>> +    uint64_t pending_size, pend_pre, pend_compat, pend_post;
>>      bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
>>  
>> -    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
>> -                              &pend_nonpost, &pend_post);
>> -    pending_size = pend_nonpost + pend_post;
>> +    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
>> +                              &pend_compat, &pend_post);
>> +    pending_size = pend_pre + pend_compat + pend_post;
>>  
>>      trace_migrate_pending(pending_size, s->threshold_size,
>> -                          pend_post, pend_nonpost);
>> +                          pend_pre, pend_compat, pend_post);
>>  
>>      if (pending_size && pending_size >= s->threshold_size) {
>>          /* Still a significant amount to transfer */
>>          if (migrate_postcopy() && !in_postcopy &&
>> -            pend_nonpost <= s->threshold_size &&
>> -            atomic_read(&s->start_postcopy)) {
>> +            pend_pre <= s->threshold_size &&
>> +            (atomic_read(&s->start_postcopy) ||
>> +             (pend_pre + pend_compat <= s->threshold_size)))
> 
> This change does something different from the description;
> it causes a postcopy_start even if the user never ran the postcopy-start
> command; so sorry, we can't do that; because postcopy for RAM is
> something that users can enable but only switch into when they've given
> up on it completing normally.
> 
> However, I guess that leaves you with a problem; which is what happens
> to the system when you've run out of pend_pre+pend_compat but can't
> complete because pend_post is non-0; so I don't know the answer to that.
> 
> Dave
> 

I'm willing to pull the bitmap related changes into my tree for 2.12,
but it would be _really_ nice to have this patchset in 2.12 -- is there
anything we can do to get this into the migration queue?

I'm afraid I don't know what the right answer here is either, so I can't
fix this up myself and I imagine Vladimir will need some feedback from
you to change this design.

Are we -hosed- ?
Vladimir Sementsov-Ogievskiy March 13, 2018, 7:47 a.m. UTC | #3
12.03.2018 18:30, Dr. David Alan Gilbert wrote:
> * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
>> There would be savevm states (dirty-bitmap) which can migrate only in
>> postcopy stage. The corresponding pending is introduced here.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>> ---

[...]

>>   static MigIterateState migration_iteration_run(MigrationState *s)
>>   {
>> -    uint64_t pending_size, pend_post, pend_nonpost;
>> +    uint64_t pending_size, pend_pre, pend_compat, pend_post;
>>       bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
>>   
>> -    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
>> -                              &pend_nonpost, &pend_post);
>> -    pending_size = pend_nonpost + pend_post;
>> +    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
>> +                              &pend_compat, &pend_post);
>> +    pending_size = pend_pre + pend_compat + pend_post;
>>   
>>       trace_migrate_pending(pending_size, s->threshold_size,
>> -                          pend_post, pend_nonpost);
>> +                          pend_pre, pend_compat, pend_post);
>>   
>>       if (pending_size && pending_size >= s->threshold_size) {
>>           /* Still a significant amount to transfer */
>>           if (migrate_postcopy() && !in_postcopy &&
>> -            pend_nonpost <= s->threshold_size &&
>> -            atomic_read(&s->start_postcopy)) {
>> +            pend_pre <= s->threshold_size &&
>> +            (atomic_read(&s->start_postcopy) ||
>> +             (pend_pre + pend_compat <= s->threshold_size)))
> This change does something different from the description;
> it causes a postcopy_start even if the user never ran the postcopy-start
> command; so sorry, we can't do that; because postcopy for RAM is
> something that users can enable but only switch into when they've given
> up on it completing normally.
>
> However, I guess that leaves you with a problem; which is what happens
> to the system when you've run out of pend_pre+pend_compat but can't
> complete because pend_post is non-0; so I don't know the answer to that.
>
>

Hmm. Here, we go to postcopy only if "pend_pre + pend_compat <= 
s->threshold_size". Pre-patch, in this case we will go to 
migration_completion(). So, precopy stage is finishing anyway. So, we 
want in this case to finish ram migration like it was finished by 
migration_completion(), and then, run postcopy, which will handle only 
dirty bitmaps, yes?

Hmm2. Looked through migration_completion(), I don't understand, how it 
finishes ram migration without postcopy. It calls 
qemu_savevm_state_complete_precopy(), which skips states with 
has_postcopy=true, which is ram...
Dr. David Alan Gilbert March 13, 2018, 10:30 a.m. UTC | #4
* Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> 12.03.2018 18:30, Dr. David Alan Gilbert wrote:
> > * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> > > There would be savevm states (dirty-bitmap) which can migrate only in
> > > postcopy stage. The corresponding pending is introduced here.
> > > 
> > > Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> > > ---
> 
> [...]
> 
> > >   static MigIterateState migration_iteration_run(MigrationState *s)
> > >   {
> > > -    uint64_t pending_size, pend_post, pend_nonpost;
> > > +    uint64_t pending_size, pend_pre, pend_compat, pend_post;
> > >       bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
> > > -    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
> > > -                              &pend_nonpost, &pend_post);
> > > -    pending_size = pend_nonpost + pend_post;
> > > +    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
> > > +                              &pend_compat, &pend_post);
> > > +    pending_size = pend_pre + pend_compat + pend_post;
> > >       trace_migrate_pending(pending_size, s->threshold_size,
> > > -                          pend_post, pend_nonpost);
> > > +                          pend_pre, pend_compat, pend_post);
> > >       if (pending_size && pending_size >= s->threshold_size) {
> > >           /* Still a significant amount to transfer */
> > >           if (migrate_postcopy() && !in_postcopy &&
> > > -            pend_nonpost <= s->threshold_size &&
> > > -            atomic_read(&s->start_postcopy)) {
> > > +            pend_pre <= s->threshold_size &&
> > > +            (atomic_read(&s->start_postcopy) ||
> > > +             (pend_pre + pend_compat <= s->threshold_size)))
> > This change does something different from the description;
> > it causes a postcopy_start even if the user never ran the postcopy-start
> > command; so sorry, we can't do that; because postcopy for RAM is
> > something that users can enable but only switch into when they've given
> > up on it completing normally.
> > 
> > However, I guess that leaves you with a problem; which is what happens
> > to the system when you've run out of pend_pre+pend_compat but can't
> > complete because pend_post is non-0; so I don't know the answer to that.
> > 
> > 
> 
> Hmm. Here, we go to postcopy only if "pend_pre + pend_compat <=
> s->threshold_size". Pre-patch, in this case we will go to
> migration_completion(). So, precopy stage is finishing anyway.

Right.

> So, we want
> in this case to finish ram migration like it was finished by
> migration_completion(), and then, run postcopy, which will handle only dirty
> bitmaps, yes?

It's a bit tricky; the first important thing is that we can't change the
semantics of the migration without the 'dirty bitmaps'.

So then there's the question of how  a migration with both
postcopy-ram+dirty bitmaps should work;  again I don't think we should
enter the postcopy-ram phase until start-postcopy is issued.

Then there's the 3rd case; dirty-bitmaps but no postcopy-ram; in that
case I worry less about the semantics of how you want to do it.

> Hmm2. Looked through migration_completion(), I don't understand, how it
> finishes ram migration without postcopy. It calls
> qemu_savevm_state_complete_precopy(), which skips states with
> has_postcopy=true, which is ram...

Because savevm_state_complete_precopy only skips has_postcopy=true in
the in_postcopy case:

            (in_postcopy && se->ops->has_postcopy &&
             se->ops->has_postcopy(se->opaque)) ||

so when we call it in migration_completion(), if we've not entered
postcopy yet, then that test doesn't trigger.

(Apologies for not spotting this earlier; but I thought this patch was
a nice easy one just adding the postcopy_only_pending - I didn't realise it changed
existing semantics until I spotted that)

Dave

> -- 
> Best regards,
> Vladimir
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Dr. David Alan Gilbert March 13, 2018, 12:17 p.m. UTC | #5
* John Snow (jsnow@redhat.com) wrote:
> 
> 
> On 03/12/2018 11:30 AM, Dr. David Alan Gilbert wrote:
> > * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> >> There would be savevm states (dirty-bitmap) which can migrate only in
> >> postcopy stage. The corresponding pending is introduced here.
> >>
> >> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> >> ---
> >>  include/migration/register.h | 17 +++++++++++++++--
> >>  migration/savevm.h           |  5 +++--
> >>  hw/s390x/s390-stattrib.c     |  7 ++++---
> >>  migration/block.c            |  7 ++++---
> >>  migration/migration.c        | 16 +++++++++-------
> >>  migration/ram.c              |  9 +++++----
> >>  migration/savevm.c           | 13 ++++++++-----
> >>  migration/trace-events       |  2 +-
> >>  8 files changed, 49 insertions(+), 27 deletions(-)
> >>
> >> diff --git a/include/migration/register.h b/include/migration/register.h
> >> index f4f7bdc177..9436a87678 100644
> >> --- a/include/migration/register.h
> >> +++ b/include/migration/register.h
> >> @@ -37,8 +37,21 @@ typedef struct SaveVMHandlers {
> >>      int (*save_setup)(QEMUFile *f, void *opaque);
> >>      void (*save_live_pending)(QEMUFile *f, void *opaque,
> >>                                uint64_t threshold_size,
> >> -                              uint64_t *non_postcopiable_pending,
> >> -                              uint64_t *postcopiable_pending);
> >> +                              uint64_t *res_precopy_only,
> >> +                              uint64_t *res_compatible,
> >> +                              uint64_t *res_postcopy_only);
> >> +    /* Note for save_live_pending:
> >> +     * - res_precopy_only is for data which must be migrated in precopy phase
> >> +     *     or in stopped state, in other words - before target vm start
> >> +     * - res_compatible is for data which may be migrated in any phase
> >> +     * - res_postcopy_only is for data which must be migrated in postcopy phase
> >> +     *     or in stopped state, in other words - after source vm stop
> >> +     *
> >> +     * Sum of res_precopy_only, res_compatible and res_postcopy_only is the
> >> +     * whole amount of pending data.
> >> +     */
> >> +
> >> +
> >>      LoadStateHandler *load_state;
> >>      int (*load_setup)(QEMUFile *f, void *opaque);
> >>      int (*load_cleanup)(void *opaque);
> >> diff --git a/migration/savevm.h b/migration/savevm.h
> >> index 295c4a1f2c..cf4f0d37ca 100644
> >> --- a/migration/savevm.h
> >> +++ b/migration/savevm.h
> >> @@ -38,8 +38,9 @@ void qemu_savevm_state_complete_postcopy(QEMUFile *f);
> >>  int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
> >>                                         bool inactivate_disks);
> >>  void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
> >> -                               uint64_t *res_non_postcopiable,
> >> -                               uint64_t *res_postcopiable);
> >> +                               uint64_t *res_precopy_only,
> >> +                               uint64_t *res_compatible,
> >> +                               uint64_t *res_postcopy_only);
> >>  void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
> >>  void qemu_savevm_send_open_return_path(QEMUFile *f);
> >>  int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
> >> diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
> >> index 2902f54f11..dd3fbfd1eb 100644
> >> --- a/hw/s390x/s390-stattrib.c
> >> +++ b/hw/s390x/s390-stattrib.c
> >> @@ -183,15 +183,16 @@ static int cmma_save_setup(QEMUFile *f, void *opaque)
> >>  }
> >>  
> >>  static void cmma_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
> >> -                             uint64_t *non_postcopiable_pending,
> >> -                             uint64_t *postcopiable_pending)
> >> +                              uint64_t *res_precopy_only,
> >> +                              uint64_t *res_compatible,
> >> +                              uint64_t *res_postcopy_only)
> >>  {
> >>      S390StAttribState *sas = S390_STATTRIB(opaque);
> >>      S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
> >>      long long res = sac->get_dirtycount(sas);
> >>  
> >>      if (res >= 0) {
> >> -        *non_postcopiable_pending += res;
> >> +        *res_precopy_only += res;
> >>      }
> >>  }
> >>  
> >> diff --git a/migration/block.c b/migration/block.c
> >> index 1f03946797..5652ca3337 100644
> >> --- a/migration/block.c
> >> +++ b/migration/block.c
> >> @@ -866,8 +866,9 @@ static int block_save_complete(QEMUFile *f, void *opaque)
> >>  }
> >>  
> >>  static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
> >> -                               uint64_t *non_postcopiable_pending,
> >> -                               uint64_t *postcopiable_pending)
> >> +                               uint64_t *res_precopy_only,
> >> +                               uint64_t *res_compatible,
> >> +                               uint64_t *res_postcopy_only)
> >>  {
> >>      /* Estimate pending number of bytes to send */
> >>      uint64_t pending;
> >> @@ -888,7 +889,7 @@ static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
> >>  
> >>      DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
> >>      /* We don't do postcopy */
> >> -    *non_postcopiable_pending += pending;
> >> +    *res_precopy_only += pending;
> >>  }
> >>  
> >>  static int block_load(QEMUFile *f, void *opaque, int version_id)
> >> diff --git a/migration/migration.c b/migration/migration.c
> >> index c99a4e62d7..3beedd676e 100644
> >> --- a/migration/migration.c
> >> +++ b/migration/migration.c
> >> @@ -2215,21 +2215,23 @@ typedef enum {
> >>   */
> >>  static MigIterateState migration_iteration_run(MigrationState *s)
> >>  {
> >> -    uint64_t pending_size, pend_post, pend_nonpost;
> >> +    uint64_t pending_size, pend_pre, pend_compat, pend_post;
> >>      bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
> >>  
> >> -    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
> >> -                              &pend_nonpost, &pend_post);
> >> -    pending_size = pend_nonpost + pend_post;
> >> +    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
> >> +                              &pend_compat, &pend_post);
> >> +    pending_size = pend_pre + pend_compat + pend_post;
> >>  
> >>      trace_migrate_pending(pending_size, s->threshold_size,
> >> -                          pend_post, pend_nonpost);
> >> +                          pend_pre, pend_compat, pend_post);
> >>  
> >>      if (pending_size && pending_size >= s->threshold_size) {
> >>          /* Still a significant amount to transfer */
> >>          if (migrate_postcopy() && !in_postcopy &&
> >> -            pend_nonpost <= s->threshold_size &&
> >> -            atomic_read(&s->start_postcopy)) {
> >> +            pend_pre <= s->threshold_size &&
> >> +            (atomic_read(&s->start_postcopy) ||
> >> +             (pend_pre + pend_compat <= s->threshold_size)))
> > 
> > This change does something different from the description;
> > it causes a postcopy_start even if the user never ran the postcopy-start
> > command; so sorry, we can't do that; because postcopy for RAM is
> > something that users can enable but only switch into when they've given
> > up on it completing normally.
> > 
> > However, I guess that leaves you with a problem; which is what happens
> > to the system when you've run out of pend_pre+pend_compat but can't
> > complete because pend_post is non-0; so I don't know the answer to that.
> > 
> > Dave
> > 
> 
> I'm willing to pull the bitmap related changes into my tree for 2.12,
> but it would be _really_ nice to have this patchset in 2.12 -- is there
> anything we can do to get this into the migration queue?
> 
> I'm afraid I don't know what the right answer here is either, so I can't
> fix this up myself and I imagine Vladimir will need some feedback from
> you to change this design.

Well I'm good with everything except the change in semantics for
existing migration;  we can't do that because it can break existing users.


> Are we -hosed- ?

Dave
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Vladimir Sementsov-Ogievskiy March 13, 2018, 1:11 p.m. UTC | #6
13.03.2018 13:30, Dr. David Alan Gilbert wrote:
> * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
>> 12.03.2018 18:30, Dr. David Alan Gilbert wrote:
>>> * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
>>>> There would be savevm states (dirty-bitmap) which can migrate only in
>>>> postcopy stage. The corresponding pending is introduced here.
>>>>
>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>> ---
>> [...]
>>
>>>>    static MigIterateState migration_iteration_run(MigrationState *s)
>>>>    {
>>>> -    uint64_t pending_size, pend_post, pend_nonpost;
>>>> +    uint64_t pending_size, pend_pre, pend_compat, pend_post;
>>>>        bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
>>>> -    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
>>>> -                              &pend_nonpost, &pend_post);
>>>> -    pending_size = pend_nonpost + pend_post;
>>>> +    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
>>>> +                              &pend_compat, &pend_post);
>>>> +    pending_size = pend_pre + pend_compat + pend_post;
>>>>        trace_migrate_pending(pending_size, s->threshold_size,
>>>> -                          pend_post, pend_nonpost);
>>>> +                          pend_pre, pend_compat, pend_post);
>>>>        if (pending_size && pending_size >= s->threshold_size) {
>>>>            /* Still a significant amount to transfer */
>>>>            if (migrate_postcopy() && !in_postcopy &&
>>>> -            pend_nonpost <= s->threshold_size &&
>>>> -            atomic_read(&s->start_postcopy)) {
>>>> +            pend_pre <= s->threshold_size &&
>>>> +            (atomic_read(&s->start_postcopy) ||
>>>> +             (pend_pre + pend_compat <= s->threshold_size)))
>>> This change does something different from the description;
>>> it causes a postcopy_start even if the user never ran the postcopy-start
>>> command; so sorry, we can't do that; because postcopy for RAM is
>>> something that users can enable but only switch into when they've given
>>> up on it completing normally.
>>>
>>> However, I guess that leaves you with a problem; which is what happens
>>> to the system when you've run out of pend_pre+pend_compat but can't
>>> complete because pend_post is non-0; so I don't know the answer to that.
>>>
>>>
>> Hmm. Here, we go to postcopy only if "pend_pre + pend_compat <=
>> s->threshold_size". Pre-patch, in this case we will go to
>> migration_completion(). So, precopy stage is finishing anyway.
> Right.
>
>> So, we want
>> in this case to finish ram migration like it was finished by
>> migration_completion(), and then, run postcopy, which will handle only dirty
>> bitmaps, yes?
> It's a bit tricky; the first important thing is that we can't change the
> semantics of the migration without the 'dirty bitmaps'.
>
> So then there's the question of how  a migration with both
> postcopy-ram+dirty bitmaps should work;  again I don't think we should
> enter the postcopy-ram phase until start-postcopy is issued.
>
> Then there's the 3rd case; dirty-bitmaps but no postcopy-ram; in that
> case I worry less about the semantics of how you want to do it.

I have an idea:

in postcopy_start(), in ram_has_postcopy() (and may be some other 
places?), check atomic_read(&s->start_postcopy) instead of 
migrate_postcopy_ram()

then:

1. behavior without dirty-bitmaps is not changed, as currently we can't 
go into postcopy_start and ram_has_postcopy without s->start_postcopy
2. dirty-bitmaps+ram: if user don't set s->start_postcopy, 
postcopy_start() will operate as if migration capability was not 
enabled, so ram should complete its migration
3. only dirty-bitmaps: again, postcopy_start() will operate as if 
migration capability was not enabled, so ram should complete its migration

>
>> Hmm2. Looked through migration_completion(), I don't understand, how it
>> finishes ram migration without postcopy. It calls
>> qemu_savevm_state_complete_precopy(), which skips states with
>> has_postcopy=true, which is ram...
> Because savevm_state_complete_precopy only skips has_postcopy=true in
> the in_postcopy case:
>
>              (in_postcopy && se->ops->has_postcopy &&
>               se->ops->has_postcopy(se->opaque)) ||
>
> so when we call it in migration_completion(), if we've not entered
> postcopy yet, then that test doesn't trigger.
>
> (Apologies for not spotting this earlier; but I thought this patch was
> a nice easy one just adding the postcopy_only_pending - I didn't realise it changed
> existing semantics until I spotted that)

oh, yes, I was inattentive :(

>
> Dave
>
>> -- 
>> Best regards,
>> Vladimir
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Vladimir Sementsov-Ogievskiy March 13, 2018, 1:32 p.m. UTC | #7
13.03.2018 16:11, Vladimir Sementsov-Ogievskiy wrote:
> 13.03.2018 13:30, Dr. David Alan Gilbert wrote:
>> * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
>>> 12.03.2018 18:30, Dr. David Alan Gilbert wrote:
>>>> * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
>>>>> There would be savevm states (dirty-bitmap) which can migrate only in
>>>>> postcopy stage. The corresponding pending is introduced here.
>>>>>
>>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy<vsementsov@virtuozzo.com>
>>>>> ---
>>> [...]
>>>
>>>>>    static MigIterateState migration_iteration_run(MigrationState *s)
>>>>>    {
>>>>> -    uint64_t pending_size, pend_post, pend_nonpost;
>>>>> +    uint64_t pending_size, pend_pre, pend_compat, pend_post;
>>>>>        bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
>>>>> -    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
>>>>> -                              &pend_nonpost, &pend_post);
>>>>> -    pending_size = pend_nonpost + pend_post;
>>>>> +    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
>>>>> +                              &pend_compat, &pend_post);
>>>>> +    pending_size = pend_pre + pend_compat + pend_post;
>>>>>        trace_migrate_pending(pending_size, s->threshold_size,
>>>>> -                          pend_post, pend_nonpost);
>>>>> +                          pend_pre, pend_compat, pend_post);
>>>>>        if (pending_size && pending_size >= s->threshold_size) {
>>>>>            /* Still a significant amount to transfer */
>>>>>            if (migrate_postcopy() && !in_postcopy &&
>>>>> -            pend_nonpost <= s->threshold_size &&
>>>>> -            atomic_read(&s->start_postcopy)) {
>>>>> +            pend_pre <= s->threshold_size &&
>>>>> +            (atomic_read(&s->start_postcopy) ||
>>>>> +             (pend_pre + pend_compat <= s->threshold_size)))
>>>> This change does something different from the description;
>>>> it causes a postcopy_start even if the user never ran the postcopy-start
>>>> command; so sorry, we can't do that; because postcopy for RAM is
>>>> something that users can enable but only switch into when they've given
>>>> up on it completing normally.
>>>>
>>>> However, I guess that leaves you with a problem; which is what happens
>>>> to the system when you've run out of pend_pre+pend_compat but can't
>>>> complete because pend_post is non-0; so I don't know the answer to that.
>>>>
>>>>
>>> Hmm. Here, we go to postcopy only if "pend_pre + pend_compat <=
>>> s->threshold_size". Pre-patch, in this case we will go to
>>> migration_completion(). So, precopy stage is finishing anyway.
>> Right.
>>
>>> So, we want
>>> in this case to finish ram migration like it was finished by
>>> migration_completion(), and then, run postcopy, which will handle only dirty
>>> bitmaps, yes?
>> It's a bit tricky; the first important thing is that we can't change the
>> semantics of the migration without the 'dirty bitmaps'.
>>
>> So then there's the question of how  a migration with both
>> postcopy-ram+dirty bitmaps should work;  again I don't think we should
>> enter the postcopy-ram phase until start-postcopy is issued.
>>
>> Then there's the 3rd case; dirty-bitmaps but no postcopy-ram; in that
>> case I worry less about the semantics of how you want to do it.
>
> I have an idea:
>
> in postcopy_start(), in ram_has_postcopy() (and may be some other 
> places?), check atomic_read(&s->start_postcopy) instead of 
> migrate_postcopy_ram()
>
> then:
>
> 1. behavior without dirty-bitmaps is not changed, as currently we cant 
> go into postcopy_start and ram_has_postcopy without s->start_postcopy
> 2. dirty-bitmaps+ram: if user don't set s->start_postcopy, 
> postcopy_start() will operate as if migration capability was not 
> enabled, so ram should complete its migration
> 3. only dirty-bitmaps: again, postcopy_start() will operate as if 
> migration capability was not enabled, so ram should complete its migration

I mean s/migration capability/migration capability for ram postcopy/.

What do you think, will that work?

>
>
>>> Hmm2. Looked through migration_completion(), I don't understand, how it
>>> finishes ram migration without postcopy. It calls
>>> qemu_savevm_state_complete_precopy(), which skips states with
>>> has_postcopy=true, which is ram...
>> Because savevm_state_complete_precopy only skips has_postcopy=true in
>> the in_postcopy case:
>>
>>              (in_postcopy && se->ops->has_postcopy &&
>>               se->ops->has_postcopy(se->opaque)) ||
>>
>> so when we call it in migration_completion(), if we've not entered
>> postcopy yet, then that test doesn't trigger.
>>
>> (Apologies for not spotting this earlier; but I thought this patch was
>> a nice easy one just adding the postcopy_only_pending - I didn't realise it changed
>> existing semantics until I spotted that)
>
> oh, yes, I was inattentive :(
>
>> Dave
>>
>>> -- 
>>> Best regards,
>>> Vladimir
>>>
>> --
>> Dr. David Alan Gilbert /dgilbert@redhat.com  / Manchester, UK
>
>
> -- 
> Best regards,
> Vladimir
Dr. David Alan Gilbert March 13, 2018, 3:35 p.m. UTC | #8
* Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> 13.03.2018 13:30, Dr. David Alan Gilbert wrote:
> > * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> > > 12.03.2018 18:30, Dr. David Alan Gilbert wrote:
> > > > * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
> > > > > There would be savevm states (dirty-bitmap) which can migrate only in
> > > > > postcopy stage. The corresponding pending is introduced here.
> > > > > 
> > > > > Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
> > > > > ---
> > > [...]
> > > 
> > > > >    static MigIterateState migration_iteration_run(MigrationState *s)
> > > > >    {
> > > > > -    uint64_t pending_size, pend_post, pend_nonpost;
> > > > > +    uint64_t pending_size, pend_pre, pend_compat, pend_post;
> > > > >        bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
> > > > > -    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
> > > > > -                              &pend_nonpost, &pend_post);
> > > > > -    pending_size = pend_nonpost + pend_post;
> > > > > +    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
> > > > > +                              &pend_compat, &pend_post);
> > > > > +    pending_size = pend_pre + pend_compat + pend_post;
> > > > >        trace_migrate_pending(pending_size, s->threshold_size,
> > > > > -                          pend_post, pend_nonpost);
> > > > > +                          pend_pre, pend_compat, pend_post);
> > > > >        if (pending_size && pending_size >= s->threshold_size) {
> > > > >            /* Still a significant amount to transfer */
> > > > >            if (migrate_postcopy() && !in_postcopy &&
> > > > > -            pend_nonpost <= s->threshold_size &&
> > > > > -            atomic_read(&s->start_postcopy)) {
> > > > > +            pend_pre <= s->threshold_size &&
> > > > > +            (atomic_read(&s->start_postcopy) ||
> > > > > +             (pend_pre + pend_compat <= s->threshold_size)))
> > > > This change does something different from the description;
> > > > it causes a postcopy_start even if the user never ran the postcopy-start
> > > > command; so sorry, we can't do that; because postcopy for RAM is
> > > > something that users can enable but only switch into when they've given
> > > > up on it completing normally.
> > > > 
> > > > However, I guess that leaves you with a problem; which is what happens
> > > > to the system when you've run out of pend_pre+pend_compat but can't
> > > > complete because pend_post is non-0; so I don't know the answer to that.
> > > > 
> > > > 
> > > Hmm. Here, we go to postcopy only if "pend_pre + pend_compat <=
> > > s->threshold_size". Pre-patch, in this case we will go to
> > > migration_completion(). So, precopy stage is finishing anyway.
> > Right.
> > 
> > > So, we want
> > > in this case to finish ram migration like it was finished by
> > > migration_completion(), and then, run postcopy, which will handle only dirty
> > > bitmaps, yes?
> > It's a bit tricky; the first important thing is that we can't change the
> > semantics of the migration without the 'dirty bitmaps'.
> > 
> > So then there's the question of how  a migration with both
> > postcopy-ram+dirty bitmaps should work;  again I don't think we should
> > enter the postcopy-ram phase until start-postcopy is issued.
> > 
> > Then there's the 3rd case; dirty-bitmaps but no postcopy-ram; in that
> > case I worry less about the semantics of how you want to do it.
> 
> I have an idea:
> 
> in postcopy_start(), in ram_has_postcopy() (and may be some other places?),
> check atomic_read(&s->start_postcopy) instead of migrate_postcopy_ram()

We've got to use migrate_postcopy_ram() to decide whether we should do
ram specific things, e.g. send the ram discard data.
I'm wanting to make sure that if we have another full postcopy device
(like RAM, maybe storage say) that we'll just add that in with
migrate_postcopy_whatever().

> then:
> 
> 1. behavior without dirty-bitmaps is not changed, as currently we cant go
> into postcopy_start and ram_has_postcopy without s->start_postcopy
> 2. dirty-bitmaps+ram: if user don't set s->start_postcopy, postcopy_start()
> will operate as if migration capability was not enabled, so ram should
> complete its migration
> 3. only dirty-bitmaps: again, postcopy_start() will operate as if migration
> capability was not enabled, so ram should complete its migration

Why can't we just remove the change to the trigger condition in this
patch?  Then I think everything works as long as the management layer
does eventually call migration-start-postcopy ?
(It might waste some bandwidth at the point where there's otherwise
nothing left to send).

Even with the use of migrate-start-postcopy, you're going to need to be
careful about the higher level story; you need to document when to do it
and what the higher levels should do after a migration failure - at the
moment they know that once postcopy starts migration is irrecoverable if
it fails; I suspect that's not true with your dirty bitmaps.

IMHO this still comes back to my original observation from ~18months ago
that in many ways this isn't very postcopy like; in the sense that all
the semantics are quite different from RAM.

Dave

> > 
> > > Hmm2. Looked through migration_completion(), I don't understand, how it
> > > finishes ram migration without postcopy. It calls
> > > qemu_savevm_state_complete_precopy(), which skips states with
> > > has_postcopy=true, which is ram...
> > Because savevm_state_complete_precopy only skips has_postcopy=true in
> > the in_postcopy case:
> > 
> >              (in_postcopy && se->ops->has_postcopy &&
> >               se->ops->has_postcopy(se->opaque)) ||
> > 
> > so when we call it in migration_completion(), if we've not entered
> > postcopy yet, then that test doesn't trigger.
> > 
> > (Apologies for not spotting this earlier; but I thought this patch was
> > a nice easy one just adding the postcopy_only_pending - I didn't realise it changed
> > existing semantics until I spotted that)
> 
> oh, yes, I was inattentive :(
> 
> > 
> > Dave
> > 
> > > -- 
> > > Best regards,
> > > Vladimir
> > > 
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> 
> 
> -- 
> Best regards,
> Vladimir
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
Vladimir Sementsov-Ogievskiy March 13, 2018, 4:14 p.m. UTC | #9
13.03.2018 18:35, Dr. David Alan Gilbert wrote:
> * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
>> 13.03.2018 13:30, Dr. David Alan Gilbert wrote:
>>> * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
>>>> 12.03.2018 18:30, Dr. David Alan Gilbert wrote:
>>>>> * Vladimir Sementsov-Ogievskiy (vsementsov@virtuozzo.com) wrote:
>>>>>> There would be savevm states (dirty-bitmap) which can migrate only in
>>>>>> postcopy stage. The corresponding pending is introduced here.
>>>>>>
>>>>>> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
>>>>>> ---
>>>> [...]
>>>>
>>>>>>     static MigIterateState migration_iteration_run(MigrationState *s)
>>>>>>     {
>>>>>> -    uint64_t pending_size, pend_post, pend_nonpost;
>>>>>> +    uint64_t pending_size, pend_pre, pend_compat, pend_post;
>>>>>>         bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
>>>>>> -    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
>>>>>> -                              &pend_nonpost, &pend_post);
>>>>>> -    pending_size = pend_nonpost + pend_post;
>>>>>> +    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
>>>>>> +                              &pend_compat, &pend_post);
>>>>>> +    pending_size = pend_pre + pend_compat + pend_post;
>>>>>>         trace_migrate_pending(pending_size, s->threshold_size,
>>>>>> -                          pend_post, pend_nonpost);
>>>>>> +                          pend_pre, pend_compat, pend_post);
>>>>>>         if (pending_size && pending_size >= s->threshold_size) {
>>>>>>             /* Still a significant amount to transfer */
>>>>>>             if (migrate_postcopy() && !in_postcopy &&
>>>>>> -            pend_nonpost <= s->threshold_size &&
>>>>>> -            atomic_read(&s->start_postcopy)) {
>>>>>> +            pend_pre <= s->threshold_size &&
>>>>>> +            (atomic_read(&s->start_postcopy) ||
>>>>>> +             (pend_pre + pend_compat <= s->threshold_size)))
>>>>> This change does something different from the description;
>>>>> it causes a postcopy_start even if the user never ran the postcopy-start
>>>>> command; so sorry, we can't do that; because postcopy for RAM is
>>>>> something that users can enable but only switch into when they've given
>>>>> up on it completing normally.
>>>>>
>>>>> However, I guess that leaves you with a problem; which is what happens
>>>>> to the system when you've run out of pend_pre+pend_compat but can't
>>>>> complete because pend_post is non-0; so I don't know the answer to that.
>>>>>
>>>>>
>>>> Hmm. Here, we go to postcopy only if "pend_pre + pend_compat <=
>>>> s->threshold_size". Pre-patch, in this case we will go to
>>>> migration_completion(). So, precopy stage is finishing anyway.
>>> Right.
>>>
>>>> So, we want
>>>> in this case to finish ram migration like it was finished by
>>>> migration_completion(), and then, run postcopy, which will handle only dirty
>>>> bitmaps, yes?
>>> It's a bit tricky; the first important thing is that we can't change the
>>> semantics of the migration without the 'dirty bitmaps'.
>>>
>>> So then there's the question of how  a migration with both
>>> postcopy-ram+dirty bitmaps should work;  again I don't think we should
>>> enter the postcopy-ram phase until start-postcopy is issued.
>>>
>>> Then there's the 3rd case; dirty-bitmaps but no postcopy-ram; in that
>>> case I worry less about the semantics of how you want to do it.
>> I have an idea:
>>
>> in postcopy_start(), in ram_has_postcopy() (and may be some other places?),
>> check atomic_read(&s->start_postcopy) instead of migrate_postcopy_ram()
> We've got to use migrate_postcopy_ram() to decide whether we should do
> ram specific things, e.g. send the ram discard data.
> I'm wanting to make sure that if we have another full postcopy device
> (like RAM, maybe storage say) that we'll just add that in with
> migrate_postcopy_whatever().
>
>> then:
>>
>> 1. behavior without dirty-bitmaps is not changed, as currently we cant go
>> into postcopy_start and ram_has_postcopy without s->start_postcopy
>> 2. dirty-bitmaps+ram: if user don't set s->start_postcopy, postcopy_start()
>> will operate as if migration capability was not enabled, so ram should
>> complete its migration
>> 3. only dirty-bitmaps: again, postcopy_start() will operate as if migration
>> capability was not enabled, so ram should complete its migration
> Why can't we just remove the change to the trigger condition in this
> patch?  Then I think everything works as long as the management layer
> does eventually call migration-start-postcopy ?
> (It might waste some bandwidth at the point where there's otherwise
> nothing left to send).

Hmm, I agree, it is the simplest thing we can do for now, and I'll 
rethink later,
how (and is it worth doing) to go to postcopy automatically in case of 
only-dirty-bitmaps.
Should I respin?

>
> Even with the use of migrate-start-postcopy, you're going to need to be
> careful about the higher level story; you need to document when to do it
> and what the higher levels should do after a migration failure - at the
> moment they know that once postcopy starts migration is irrecoverable if
> it fails; I suspect that's not true with your dirty bitmaps.
>
> IMHO this still comes back to my original observation from ~18months ago
> that in many ways this isn't very postcopy like; in the sense that all
> the semantics are quite different from RAM.
>
> Dave
>
>>>> Hmm2. Looked through migration_completion(), I don't understand, how it
>>>> finishes ram migration without postcopy. It calls
>>>> qemu_savevm_state_complete_precopy(), which skips states with
>>>> has_postcopy=true, which is ram...
>>> Because savevm_state_complete_precopy only skips has_postcopy=true in
>>> the in_postcopy case:
>>>
>>>               (in_postcopy && se->ops->has_postcopy &&
>>>                se->ops->has_postcopy(se->opaque)) ||
>>>
>>> so when we call it in migration_completion(), if we've not entered
>>> postcopy yet, then that test doesn't trigger.
>>>
>>> (Apologies for not spotting this earlier; but I thought this patch was
>>> a nice easy one just adding the postcopy_only_pending - I didn't realise it changed
>>> existing semantics until I spotted that)
>> oh, yes, I was inattentive :(
>>
>>> Dave
>>>
>>>> -- 
>>>> Best regards,
>>>> Vladimir
>>>>
>>> --
>>> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>>
>> -- 
>> Best regards,
>> Vladimir
>>
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
John Snow March 13, 2018, 4:16 p.m. UTC | #10
On 03/13/2018 12:14 PM, Vladimir Sementsov-Ogievskiy wrote:
> 
> Hmm, I agree, it is the simplest thing we can do for now, and I'll
> rethink later,
> how (and is it worth doing) to go to postcopy automatically in case of
> only-dirty-bitmaps.
> Should I respin?

Please do. I already staged patches 1-4 in my branch, so if you'd like,
you can respin just 5+.

https://github.com/jnsnow/qemu/tree/bitmaps

--js
Vladimir Sementsov-Ogievskiy March 13, 2018, 4:33 p.m. UTC | #11
13.03.2018 19:16, John Snow wrote:
>
> On 03/13/2018 12:14 PM, Vladimir Sementsov-Ogievskiy wrote:
>> Hmm, I agree, it is the simplest thing we can do for now, and I'll
>> rethink later,
>> how (and is it worth doing) to go to postcopy automatically in case of
>> only-dirty-bitmaps.
>> Should I respin?
> Please do. I already staged patches 1-4 in my branch, so if you'd like,
> you can respin just 5+.
>
> https://github.com/jnsnow/qemu/tree/bitmaps
>
> --js

Ok, I'll base on your branch. How should I write Based-on: for patchew 
in this case?
John Snow March 13, 2018, 5:10 p.m. UTC | #12
On 03/13/2018 12:33 PM, Vladimir Sementsov-Ogievskiy wrote:
> 13.03.2018 19:16, John Snow wrote:
>>
>> On 03/13/2018 12:14 PM, Vladimir Sementsov-Ogievskiy wrote:
>>> Hmm, I agree, it is the simplest thing we can do for now, and I'll
>>> rethink later,
>>> how (and is it worth doing) to go to postcopy automatically in case of
>>> only-dirty-bitmaps.
>>> Should I respin?
>> Please do. I already staged patches 1-4 in my branch, so if you'd like,
>> you can respin just 5+.
>>
>> https://github.com/jnsnow/qemu/tree/bitmaps
>>
>> --js
> 
> Ok, I'll base on your branch. How should I write Based-on: for patchew
> in this case?
> 

You know, that's actually a good question... just send out the whole
series for the sake of patchew. I Added an R-B to patches 1-4 and edited
a single comment in #4.

--js
diff mbox series

Patch

diff --git a/include/migration/register.h b/include/migration/register.h
index f4f7bdc177..9436a87678 100644
--- a/include/migration/register.h
+++ b/include/migration/register.h
@@ -37,8 +37,21 @@  typedef struct SaveVMHandlers {
     int (*save_setup)(QEMUFile *f, void *opaque);
     void (*save_live_pending)(QEMUFile *f, void *opaque,
                               uint64_t threshold_size,
-                              uint64_t *non_postcopiable_pending,
-                              uint64_t *postcopiable_pending);
+                              uint64_t *res_precopy_only,
+                              uint64_t *res_compatible,
+                              uint64_t *res_postcopy_only);
+    /* Note for save_live_pending:
+     * - res_precopy_only is for data which must be migrated in precopy phase
+     *     or in stopped state, in other words - before target vm start
+     * - res_compatible is for data which may be migrated in any phase
+     * - res_postcopy_only is for data which must be migrated in postcopy phase
+     *     or in stopped state, in other words - after source vm stop
+     *
+     * Sum of res_precopy_only, res_compatible and res_postcopy_only is the
+     * whole amount of pending data.
+     */
+
+
     LoadStateHandler *load_state;
     int (*load_setup)(QEMUFile *f, void *opaque);
     int (*load_cleanup)(void *opaque);
diff --git a/migration/savevm.h b/migration/savevm.h
index 295c4a1f2c..cf4f0d37ca 100644
--- a/migration/savevm.h
+++ b/migration/savevm.h
@@ -38,8 +38,9 @@  void qemu_savevm_state_complete_postcopy(QEMUFile *f);
 int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
                                        bool inactivate_disks);
 void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
-                               uint64_t *res_non_postcopiable,
-                               uint64_t *res_postcopiable);
+                               uint64_t *res_precopy_only,
+                               uint64_t *res_compatible,
+                               uint64_t *res_postcopy_only);
 void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
 void qemu_savevm_send_open_return_path(QEMUFile *f);
 int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t *buf, size_t len);
diff --git a/hw/s390x/s390-stattrib.c b/hw/s390x/s390-stattrib.c
index 2902f54f11..dd3fbfd1eb 100644
--- a/hw/s390x/s390-stattrib.c
+++ b/hw/s390x/s390-stattrib.c
@@ -183,15 +183,16 @@  static int cmma_save_setup(QEMUFile *f, void *opaque)
 }
 
 static void cmma_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
-                             uint64_t *non_postcopiable_pending,
-                             uint64_t *postcopiable_pending)
+                              uint64_t *res_precopy_only,
+                              uint64_t *res_compatible,
+                              uint64_t *res_postcopy_only)
 {
     S390StAttribState *sas = S390_STATTRIB(opaque);
     S390StAttribClass *sac = S390_STATTRIB_GET_CLASS(sas);
     long long res = sac->get_dirtycount(sas);
 
     if (res >= 0) {
-        *non_postcopiable_pending += res;
+        *res_precopy_only += res;
     }
 }
 
diff --git a/migration/block.c b/migration/block.c
index 1f03946797..5652ca3337 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -866,8 +866,9 @@  static int block_save_complete(QEMUFile *f, void *opaque)
 }
 
 static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
-                               uint64_t *non_postcopiable_pending,
-                               uint64_t *postcopiable_pending)
+                               uint64_t *res_precopy_only,
+                               uint64_t *res_compatible,
+                               uint64_t *res_postcopy_only)
 {
     /* Estimate pending number of bytes to send */
     uint64_t pending;
@@ -888,7 +889,7 @@  static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
 
     DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
     /* We don't do postcopy */
-    *non_postcopiable_pending += pending;
+    *res_precopy_only += pending;
 }
 
 static int block_load(QEMUFile *f, void *opaque, int version_id)
diff --git a/migration/migration.c b/migration/migration.c
index c99a4e62d7..3beedd676e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2215,21 +2215,23 @@  typedef enum {
  */
 static MigIterateState migration_iteration_run(MigrationState *s)
 {
-    uint64_t pending_size, pend_post, pend_nonpost;
+    uint64_t pending_size, pend_pre, pend_compat, pend_post;
     bool in_postcopy = s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE;
 
-    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size,
-                              &pend_nonpost, &pend_post);
-    pending_size = pend_nonpost + pend_post;
+    qemu_savevm_state_pending(s->to_dst_file, s->threshold_size, &pend_pre,
+                              &pend_compat, &pend_post);
+    pending_size = pend_pre + pend_compat + pend_post;
 
     trace_migrate_pending(pending_size, s->threshold_size,
-                          pend_post, pend_nonpost);
+                          pend_pre, pend_compat, pend_post);
 
     if (pending_size && pending_size >= s->threshold_size) {
         /* Still a significant amount to transfer */
         if (migrate_postcopy() && !in_postcopy &&
-            pend_nonpost <= s->threshold_size &&
-            atomic_read(&s->start_postcopy)) {
+            pend_pre <= s->threshold_size &&
+            (atomic_read(&s->start_postcopy) ||
+             (pend_pre + pend_compat <= s->threshold_size)))
+        {
             if (postcopy_start(s)) {
                 error_report("%s: postcopy failed to start", __func__);
             }
diff --git a/migration/ram.c b/migration/ram.c
index cb1950f3eb..38b1b2486a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2359,8 +2359,9 @@  static int ram_save_complete(QEMUFile *f, void *opaque)
 }
 
 static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
-                             uint64_t *non_postcopiable_pending,
-                             uint64_t *postcopiable_pending)
+                             uint64_t *res_precopy_only,
+                             uint64_t *res_compatible,
+                             uint64_t *res_postcopy_only)
 {
     RAMState **temp = opaque;
     RAMState *rs = *temp;
@@ -2380,9 +2381,9 @@  static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
 
     if (migrate_postcopy_ram()) {
         /* We can do postcopy, and all the data is postcopiable */
-        *postcopiable_pending += remaining_size;
+        *res_compatible += remaining_size;
     } else {
-        *non_postcopiable_pending += remaining_size;
+        *res_precopy_only += remaining_size;
     }
 }
 
diff --git a/migration/savevm.c b/migration/savevm.c
index b7908f62be..c3c60a15e3 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -1218,13 +1218,15 @@  int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
  * for units that can't do postcopy.
  */
 void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
-                               uint64_t *res_non_postcopiable,
-                               uint64_t *res_postcopiable)
+                               uint64_t *res_precopy_only,
+                               uint64_t *res_compatible,
+                               uint64_t *res_postcopy_only)
 {
     SaveStateEntry *se;
 
-    *res_non_postcopiable = 0;
-    *res_postcopiable = 0;
+    *res_precopy_only = 0;
+    *res_compatible = 0;
+    *res_postcopy_only = 0;
 
 
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
@@ -1237,7 +1239,8 @@  void qemu_savevm_state_pending(QEMUFile *f, uint64_t threshold_size,
             }
         }
         se->ops->save_live_pending(f, se->opaque, threshold_size,
-                                   res_non_postcopiable, res_postcopiable);
+                                   res_precopy_only, res_compatible,
+                                   res_postcopy_only);
     }
 }
 
diff --git a/migration/trace-events b/migration/trace-events
index 6f29fcc686..a04fffb877 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -86,7 +86,7 @@  migrate_fd_cleanup(void) ""
 migrate_fd_error(const char *error_desc) "error=%s"
 migrate_fd_cancel(void) ""
 migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at 0x%zx len 0x%zx"
-migrate_pending(uint64_t size, uint64_t max, uint64_t post, uint64_t nonpost) "pending size %" PRIu64 " max %" PRIu64 " (post=%" PRIu64 " nonpost=%" PRIu64 ")"
+migrate_pending(uint64_t size, uint64_t max, uint64_t pre, uint64_t compat, uint64_t post) "pending size %" PRIu64 " max %" PRIu64 " (pre=%" PRIu64 " compat=%" PRIu64 " post=%" PRIu64 ")"
 migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
 migration_completion_file_err(void) ""
 migration_completion_postcopy_end(void) ""